1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.14 2002/03/06 17:15:39 kalina Exp $
12   */
13  
14  package gate.creole;
15  
16  import gate.*;
17  import gate.creole.*;
18  import gate.util.*;
19  import gate.event.*;
20  
21  import hepple.postag.*;
22  
23  import java.util.*;
24  import java.io.*;
25  import java.net.URL;
26  import java.text.NumberFormat;
27  /**
28   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
29   */
30  public class POSTagger extends AbstractLanguageAnalyser {
31  
32    public static final String
33      TAG_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
37  
38    public static final String
39      TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
40  
41    public static final String
42      TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
43  
44    public static final String
45      TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
46  
47    public POSTagger() {
48    }
49  
50    public Resource init()throws ResourceInstantiationException{
51      if(lexiconURL == null){
52        throw new ResourceInstantiationException(
53          "NoURL provided for the lexicon!");
54      }
55      if(rulesURL == null){
56        throw new ResourceInstantiationException(
57          "No URL provided for the rules!");
58      }
59      try{
60        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
61      }catch(Exception e){
62        throw new ResourceInstantiationException(e);
63      }
64      return this;
65    }
66  
67  
68    public void execute() throws ExecutionException{
69      try{
70        //check the parameters
71        if(document == null) throw new GateRuntimeException(
72          "No document to process!");
73        if(inputASName != null && inputASName.equals("")) inputASName = null;
74        if(outputASName != null && outputASName.equals("")) outputASName = null;
75        AnnotationSet inputAS = (inputASName == null) ?
76                                document.getAnnotations() :
77                                document.getAnnotations(inputASName);
78        AnnotationSet outputAS = (outputASName == null) ?
79                                 document.getAnnotations() :
80                                 document.getAnnotations(outputASName);
81  
82        fireStatusChanged("POS tagging " + document.getName());
83        fireProgressChanged(0);
84        //prepare the input for HepTag
85        //define a comparator for annotations by start offset
86        Comparator offsetComparator = new OffsetComparator();
87        AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
88        if(as != null && as.size() > 0){
89          List sentences = new ArrayList(as);
90          Collections.sort(sentences, offsetComparator);
91          Iterator sentIter = sentences.iterator();
92          int sentIndex = 0;
93          int sentCnt = sentences.size();
94          long startTime= System.currentTimeMillis();
95          while(sentIter.hasNext()){
96            Annotation sentenceAnn = (Annotation)sentIter.next();
97            AnnotationSet rangeSet = inputAS.get(
98                                      sentenceAnn.getStartNode().getOffset(),
99                                      sentenceAnn.getEndNode().getOffset());
100           if(rangeSet == null) continue;
101           AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
102           if(tokensSet == null) continue;
103           List tokens = new ArrayList(tokensSet);
104           Collections.sort(tokens, offsetComparator);
105 
106 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
107           List sentence = new ArrayList(tokens.size());
108           Iterator tokIter = tokens.iterator();
109           while(tokIter.hasNext()){
110             Annotation token = (Annotation)tokIter.next();
111             String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
112             sentence.add(text);
113           }//while(tokIter.hasNext())
114 
115           //run the POSTagger over this sentence
116           List sentences4tagger = new ArrayList(1);
117           sentences4tagger.add(sentence);
118           List taggerResults = tagger.runTagger(sentences4tagger);
119           //add the results to the output annotation set
120           //we only get one sentence
121           List sentenceFromTagger = (List)taggerResults.get(0);
122           if(sentenceFromTagger.size() != sentence.size()){
123             String taggerResult = "";
124             for(int i = 0; i< sentenceFromTagger.size(); i++){
125               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
126             }
127             throw new GateRuntimeException(
128               "POS Tagger malfunction: the output size (" +
129               sentenceFromTagger.size() +
130               ") is different from the input size (" +
131               sentence.size() + ")!" +
132               "\n Input: " + sentence + "\nOutput: " + taggerResult);
133           }
134           for(int i = 0; i< sentence.size(); i++){
135             String category = ((String[])sentenceFromTagger.get(i))[1];
136             Annotation token = (Annotation)tokens.get(i);
137             token.getFeatures().
138               put(TOKEN_CATEGORY_FEATURE_NAME, category);
139           }//for(i = 0; i<= sentence.size(); i++)
140           fireProgressChanged(sentIndex++ * 100 / sentCnt);
141         }//while(sentIter.hasNext())
142 
143           fireProcessFinished();
144           long endTime = System.currentTimeMillis();
145           fireStatusChanged(document.getName() + " tagged in " +
146                           NumberFormat.getInstance().format(
147                           (double)(endTime - startTime) / 1000) + " seconds!");
148       }else{
149         throw new GateRuntimeException("No sentences to process!\n" +
150                                        "Please run a sentence splitter first!");
151       }//if(as != null && as.size() > 0)
152     }catch(Exception e){
153       throw new ExecutionException(e);
154     }
155   }
156 
157 
158   public void setLexiconURL(java.net.URL newLexiconURL) {
159     lexiconURL = newLexiconURL;
160   }
161   public java.net.URL getLexiconURL() {
162     return lexiconURL;
163   }
164   public void setRulesURL(java.net.URL newRulesURL) {
165     rulesURL = newRulesURL;
166   }
167   public java.net.URL getRulesURL() {
168     return rulesURL;
169   }
170   public void setInputASName(String newInputASName) {
171     inputASName = newInputASName;
172   }
173   public String getInputASName() {
174     return inputASName;
175   }
176   public void setOutputASName(String newOutputASName) {
177     outputASName = newOutputASName;
178   }
179   public String getOutputASName() {
180     return outputASName;
181   }
182 
183   protected hepple.postag.POSTagger tagger;
184   private java.net.URL lexiconURL;
185   private java.net.URL rulesURL;
186   private String inputASName;
187   private String outputASName;
188 }