1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 2000-2001, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   *
15   * $Id: DefaultGazetteer.java,v 1.36 2002/03/13 11:19:36 valyt Exp $
16   */
17  
18  package gate.creole.gazetteer;
19  
20  import java.io.*;
21  import java.util.*;
22  import java.net.*;
23  
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.event.*;
27  import gate.*;
28  
/** This component is responsible for doing lists lookup. The implementation is
 * based on finite state machines.
 * The phrases to be recognised should be listed in a set of files, one for
 * each type of occurrence.
 * The gazetteer is built with the information from a file that contains the set
 * of lists (which are files as well) and the associated type for each list.
 * The file defining the set of lists should have the following syntax:
 * each list definition should be written on its own line and should contain:
 * <ol>
 * <li>the file name (required) </li>
 * <li>the major type (required) </li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional) </li>
 * </ol>
 * The elements of each definition are separated by &quot;:&quot;.
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code>
 * Each list file named in the lists definition file is just a list containing
 * one entry per line.
 * When this gazetteer is run over some input text (a GATE document) it
 * will generate annotations of type Lookup having the attributes specified in
 * the definition file.
 */
52  public class DefaultGazetteer extends AbstractLanguageAnalyser
53               implements ProcessingResource {
54  
  /** Parameter name constant with the value <code>"document"</code>.
   * NOTE(review): presumably names the document property inherited from the
   * superclass; that property is not declared in this file.
   */
  public static final String
    DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";

  /** Parameter name constant; its value matches the property configured
   * through {@link #setAnnotationSetName(String)}.
   */
  public static final String
    DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  /** Parameter name constant; its value matches the property configured
   * through {@link #setListsURL(java.net.URL)}.
   */
  public static final String
    DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";

  /** Parameter name constant; its value matches the property configured
   * through {@link #setEncoding(String)}.
   */
  public static final String
    DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";

  /** Parameter name constant; its value matches the property configured
   * through {@link #setCaseSensitive(Boolean)}.
   */
  public static final String
    DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

  /** Debug flag; switched off.
   */
  private static final boolean DEBUG = false;
73  
  /** Creates an unconfigured gazetteer. Nothing is loaded here: the lists
   * URL has to be set and {@link #init()} has to be called before the
   * gazetteer can be used.
   */
  public DefaultGazetteer(){
  }
79  
80    /** Does the actual loading and parsing of the lists. This method must be
81     * called before the gazetteer can be used
82     */
83    public Resource init()throws ResourceInstantiationException{
84      fsmStates = new HashSet();
85      try{
86        initialState = new FSMState(this);
87        if(listsURL == null){
88          throw new ResourceInstantiationException (
89                "No URL provided for gazetteer creation!");
90        }
91  
92        //find the number of lines
93        Reader reader = new InputStreamReader(listsURL.openStream(), encoding);
94        int linesCnt = 0;
95        BufferedReader bReader = new BufferedReader(reader);
96        String line = bReader.readLine();
97        while (line != null) {
98          linesCnt++;
99          line = bReader.readLine();
100       }
101       bReader.close();
102 
103       //parse the file
104       reader = new InputStreamReader(listsURL.openStream(), encoding);
105       bReader = new BufferedReader(reader);
106       line = bReader.readLine();
107       ///String toParse = "";
108       StringBuffer toParse = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
109 
110       int lineIdx = 0;
111       while (line != null) {
112         if(line.endsWith("\\")) {
113           ///toParse += line.substring(0,line.length()-1);
114           toParse.append(line.substring(0,line.length()-1));
115         } else {
116           ///toParse += line;
117           toParse.append(line);
118           fireStatusChanged("Reading " + toParse.toString());
119           fireProgressChanged(lineIdx * 100 / linesCnt);
120           lineIdx ++;
121           readList(toParse.toString(), true);
122           ///toParse = "";
123           toParse.delete(0,toParse.length());
124         }
125         line = bReader.readLine();
126       }
127       fireProcessFinished();
128     }catch(IOException ioe){
129       throw new ResourceInstantiationException(ioe);
130     }catch(GazetteerException ge){
131       throw new ResourceInstantiationException(ge);
132     }
133     return this;
134   }
135 
136 
137   /** Reads one lists (one file) of phrases
138    *
139    * @param listDesc the line from the definition file
140    * @param add
141    * @add if <b>true</b> will add the phrases found in the list to the ones
142    *     recognised by this gazetter, if <b>false</b> the phrases found in the
143    *     list will be removed from the list of phrases recognised by this
144    *     gazetteer.
145    */
146   void readList(String listDesc, boolean add) throws FileNotFoundException,
147                                         IOException,
148                                         GazetteerException{
149     String listName, majorType, minorType, languages;
150     int firstColon = listDesc.indexOf(':');
151     int secondColon = listDesc.indexOf(':', firstColon + 1);
152     int thirdColon = listDesc.indexOf(':', secondColon + 1);
153     if(firstColon == -1){
154       throw new GazetteerException("Invalid list definition: " + listDesc);
155     }
156     listName = listDesc.substring(0, firstColon);
157 
158     if(secondColon == -1){
159       majorType = listDesc.substring(firstColon + 1);
160       minorType = null;
161       languages = null;
162     } else {
163       majorType = listDesc.substring(firstColon + 1, secondColon);
164       if(thirdColon == -1) {
165         minorType = listDesc.substring(secondColon + 1);
166         languages = null;
167       } else {
168         minorType = listDesc.substring(secondColon + 1, thirdColon);
169         languages = listDesc.substring(thirdColon + 1);
170       }
171     }
172     BufferedReader listReader;
173 
174     listReader = new BufferedReader(new InputStreamReader(
175                             (new URL(listsURL, listName)).openStream(), encoding));
176 
177     Lookup lookup = new Lookup(majorType, minorType, languages);
178     String line = listReader.readLine();
179     while(null != line){
180       if(add)addLookup(line, lookup);
181       else removeLookup(line, lookup);
182       line = listReader.readLine();
183     }
184   } // void readList(String listDesc)
185 
186   /** Adds one phrase to the list of phrases recognised by this gazetteer
187    *
188    * @param text the phrase to be added
189    * @param lookup the description of the annotation to be added when this
190    *     phrase is recognised
191    */
192 // >>> DAM, was
193 /*
194   public void addLookup(String text, Lookup lookup) {
195     Character currentChar;
196     FSMState currentState = initialState;
197     FSMState nextState;
198     Lookup oldLookup;
199     boolean isSpace;
200 
201     for(int i = 0; i< text.length(); i++) {
202       isSpace = Character.isWhitespace(text.charAt(i));
203       if(isSpace) currentChar = new Character(' ');
204       else currentChar = (caseSensitive.booleanValue()) ?
205                           new Character(text.charAt(i)) :
206                           new Character(Character.toUpperCase(text.charAt(i))) ;
207       nextState = currentState.next(currentChar);
208       if(nextState == null){
209         nextState = new FSMState(this);
210         currentState.put(currentChar, nextState);
211         if(isSpace) nextState.put(new Character(' '),nextState);
212       }
213       currentState = nextState;
214     } //for(int i = 0; i< text.length(); i++)
215 
216     currentState.addLookup(lookup);
217     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
218 
219   } // addLookup
220 */
221 // >>> DAM: TransArray optimization
222   public void addLookup(String text, Lookup lookup) {
223     char currentChar;
224     FSMState currentState = initialState;
225     FSMState nextState;
226     Lookup oldLookup;
227     boolean isSpace;
228 
229     for(int i = 0; i< text.length(); i++) {
230         currentChar = text.charAt(i);
231         isSpace = Character.isWhitespace(currentChar);
232         if(isSpace) currentChar = ' ';
233         else currentChar = (caseSensitive.booleanValue()) ?
234                           currentChar :
235                           Character.toUpperCase(currentChar) ;
236       nextState = currentState.next(currentChar);
237       if(nextState == null){
238         nextState = new FSMState(this);
239         currentState.put(currentChar, nextState);
240         if(isSpace) nextState.put(' ',nextState);
241       }
242       currentState = nextState;
243     } //for(int i = 0; i< text.length(); i++)
244 
245     currentState.addLookup(lookup);
246     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
247 
248   } // addLookup
249 // >>> DAM, end
250 
251   /** Removes one phrase to the list of phrases recognised by this gazetteer
252    *
253    * @param text the phrase to be removed
254    * @param lookup the description of the annotation associated to this phrase
255    */
256 // >>> DAM, was
257 /*
258   public void removeLookup(String text, Lookup lookup) {
259     Character currentChar;
260     FSMState currentState = initialState;
261     FSMState nextState;
262     Lookup oldLookup;
263     boolean isSpace;
264 
265     for(int i = 0; i< text.length(); i++) {
266       isSpace = Character.isWhitespace(text.charAt(i));
267       if(isSpace) currentChar = new Character(' ');
268       else currentChar = new Character(text.charAt(i));
269       nextState = currentState.next(currentChar);
270       if(nextState == null) return;//nothing to remove
271       currentState = nextState;
272     } //for(int i = 0; i< text.length(); i++)
273     currentState.removeLookup(lookup);
274   } // removeLookup
275 */
276 // >>> DAM: TransArray optimization
277   public void removeLookup(String text, Lookup lookup) {
278     char currentChar;
279     FSMState currentState = initialState;
280     FSMState nextState;
281     Lookup oldLookup;
282 
283     for(int i = 0; i< text.length(); i++) {
284         currentChar = text.charAt(i);
285         if(Character.isWhitespace(currentChar)) currentChar = ' ';
286         nextState = currentState.next(currentChar);
287         if(nextState == null) return;//nothing to remove
288         currentState = nextState;
289     } //for(int i = 0; i< text.length(); i++)
290     currentState.removeLookup(lookup);
291   } // removeLookup
292 // >>> DAM, end
293 
294   /** Returns a string representation of the deterministic FSM graph using
295    * GML.
296    */
297   public String getFSMgml() {
298     String res = "graph[ \ndirected 1\n";
299     ///String nodes = "", edges = "";
300     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
301                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
302     Iterator fsmStatesIter = fsmStates.iterator();
303     while (fsmStatesIter.hasNext()){
304       FSMState currentState = (FSMState)fsmStatesIter.next();
305       int stateIndex = currentState.getIndex();
306       /*nodes += "node[ id " + stateIndex +
307                " label \"" + stateIndex;
308       */
309       nodes.append("node[ id ");
310       nodes.append(stateIndex);
311       nodes.append(" label \"");
312       nodes.append(stateIndex);
313 
314              if(currentState.isFinal()){
315               ///nodes += ",F\\n" + currentState.getLookupSet();
316               nodes.append(",F\\n");
317               nodes.append(currentState.getLookupSet());
318              }
319              ///nodes +=  "\"  ]\n";
320              nodes.append("\"  ]\n");
321       //edges += currentState.getEdgesGML();
322       edges.append(currentState.getEdgesGML());
323     }
324     res += nodes.toString() + edges.toString() + "]\n";
325     return res;
326   } // getFSMgml
327 
  //no doc required: javadoc will copy it from the interface
  /** Returns the feature map currently held by this gazetteer.
   *
   * @return the value of the <code>features</code> field, which is
   *     <code>null</code> until {@link #setFeatures(FeatureMap)} is called
   */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Replaces the feature map held by this gazetteer.
   *
   * @param features the new feature map; stored as is, not copied
   */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
338 
339 
340 
341   /**
342    * This method runs the gazetteer. It assumes that all the needed parameters
343    * are set. If they are not, an exception will be fired.
344    */
345   public void execute() throws ExecutionException{
346     interrupted = false;
347     AnnotationSet annotationSet;
348     //check the input
349     if(document == null) {
350       throw new ExecutionException(
351         "No document to process!"
352       );
353     }
354 
355     if(annotationSetName == null ||
356        annotationSetName.equals("")) annotationSet = document.getAnnotations();
357     else annotationSet = document.getAnnotations(annotationSetName);
358 
359     fireStatusChanged("Doing lookup in " + document.getName() + "...");
360     String content = document.getContent().toString();
361     int length = content.length();
362 // >>> DAM, was
363 /*
364     Character currentChar;
365 */
366 // >>> DAM: TransArray optimization
367     char currentChar;
368 // >>> DAM, end
369     FSMState currentState = initialState;
370     FSMState nextState;
371     FSMState lastMatchingState = null;
372     int matchedRegionEnd = 0;
373     int matchedRegionStart = 0;
374     int charIdx = 0;
375     int oldCharIdx = 0;
376     FeatureMap fm;
377     Lookup currentLookup;
378 
379 // >>> DAM, was
380 /*
381     while(charIdx < length) {
382       if(Character.isWhitespace(content.charAt(charIdx)))
383         currentChar = new Character(' ');
384       else currentChar = (caseSensitive.booleanValue()) ?
385                          new Character(content.charAt(charIdx)) :
386                          new Character(Character.toUpperCase(
387                                        content.charAt(charIdx)));
388 */
389 // >>> DAM: TransArray optimization
390     while(charIdx < length) {
391       currentChar = content.charAt(charIdx);
392       if(Character.isWhitespace(currentChar)) currentChar = ' ';
393       else currentChar = caseSensitive.booleanValue() ?
394                           currentChar :
395                           Character.toUpperCase(currentChar);
396 // >>> DAM, end
397       nextState = currentState.next(currentChar);
398       if(nextState == null) {
399         //the matching stopped
400 
401         //if we had a successful match then act on it;
402         if(lastMatchingState != null){
403           //let's add the new annotation(s)
404           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
405 
406           while(lookupIter.hasNext()) {
407             currentLookup = (Lookup)lookupIter.next();
408             fm = Factory.newFeatureMap();
409             fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
410                   currentLookup.majorType);
411             if(null != currentLookup.minorType) {
412               fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME,
413                     currentLookup.minorType);
414               if(null != currentLookup.languages)
415                 fm.put("language", currentLookup.languages);
416             }
417             try {
418               annotationSet.add(new Long(matchedRegionStart),
419                               new Long(matchedRegionEnd + 1),
420                               LOOKUP_ANNOTATION_TYPE,
421                               fm);
422             } catch(InvalidOffsetException ioe) {
423               throw new LuckyException(ioe.toString());
424             }
425           }//while(lookupIter.hasNext())
426           lastMatchingState = null;
427         }
428 
429         //reset the FSM
430         charIdx = matchedRegionStart + 1;
431         matchedRegionStart = charIdx;
432         currentState = initialState;
433 
434       } else{//go on with the matching
435         currentState = nextState;
436         //if we have a successful state then store it
437         if(currentState.isFinal() &&
438            (matchedRegionStart == 0 ||
439             !Character.isLetter(content.charAt(matchedRegionStart - 1))) &&
440            (charIdx + 1 >= content.length()   ||
441             !Character.isLetter(content.charAt(charIdx + 1)))
442           ){
443           matchedRegionEnd = charIdx;
444           lastMatchingState = currentState;
445         }
446         charIdx ++;
447         if(charIdx == content.length()){
448           //we can't go on, use the last matching state and restart matching
449           //from the next char
450           if(lastMatchingState != null){
451             //let's add the new annotation(s)
452             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
453 
454             while(lookupIter.hasNext()) {
455               currentLookup = (Lookup)lookupIter.next();
456               fm = Factory.newFeatureMap();
457               fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
458                     currentLookup.majorType);
459               if(null != currentLookup.minorType) {
460                 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME,
461                       currentLookup.minorType);
462                 if(null != currentLookup.languages)
463                   fm.put("language", currentLookup.languages);
464               }
465               try {
466                 annotationSet.add(new Long(matchedRegionStart),
467                                 new Long(matchedRegionEnd + 1),
468                                 LOOKUP_ANNOTATION_TYPE,
469                                 fm);
470               } catch(InvalidOffsetException ioe) {
471                 throw new LuckyException(ioe.toString());
472               }
473             }//while(lookupIter.hasNext())
474             lastMatchingState = null;
475           }
476 
477           //reset the FSM
478           charIdx = matchedRegionStart + 1;
479           matchedRegionStart = charIdx;
480           currentState = initialState;
481         }
482       }
483       if(charIdx - oldCharIdx > 256) {
484         fireProgressChanged((100 * charIdx )/ length );
485         oldCharIdx = charIdx;
486         if(isInterrupted()) throw new ExecutionInterruptedException(
487             "The execution of the " + getName() +
488             " gazetteer has been abruptly interrupted!");
489       }
490     } // while(charIdx < length)
491 
492     if(lastMatchingState != null) {
493       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
494       while(lookupIter.hasNext()) {
495         currentLookup = (Lookup)lookupIter.next();
496         fm = Factory.newFeatureMap();
497         fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
498                currentLookup.majorType);
499         if(null != currentLookup.minorType)
500           fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME,
501                  currentLookup.minorType);
502         try{
503           annotationSet.add(new Long(matchedRegionStart),
504                           new Long(matchedRegionEnd + 1),
505                           LOOKUP_ANNOTATION_TYPE,
506                           fm);
507         } catch(InvalidOffsetException ioe) {
508           throw new GateRuntimeException(ioe.toString());
509         }
510       }//while(lookupIter.hasNext())
511     }
512     fireProcessFinished();
513     fireStatusChanged("Lookup complete!");
514   } // execute
515 
516 
  /**
   * Sets the AnnotationSet that will be used at the next run for the newly
   * produced annotations.
   *
   * @param newAnnotationSetName the name of the annotation set; with
   *     <code>null</code> or an empty string {@link #execute()} uses the
   *     default annotation set of the document
   */
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }
524 
525 
  /** The initial state of the FSM that backs this gazetteer; created anew by
   * each call to init()
   */
  FSMState initialState;

  /** A set containing all the states of the FSM backing the gazetteer;
   * replaced by an empty set at the start of each call to init()
   */
  Set fsmStates;

  /** The feature map exposed through getFeatures() and setFeatures()
   */
  protected FeatureMap features  = null;

  /** Used to store the annotation set currently being used for the newly
   * generated annotations
   */
  protected String annotationSetName;

  /** The character encoding used for reading the definition file and the
   * list files. The default value is UTF-8.
   */
  private String encoding = "UTF-8";

  /**
   * The value of this property is the URL that will be used for reading the
   * lists that define this Gazetteer
   */
  private java.net.URL listsURL;

  /**
   * Should this gazetteer be case sensitive. The default value is true.
   */
  private Boolean caseSensitive = new Boolean(true);
553 
  /** Sets the character encoding used by init() for reading the definition
   * file and the list files.
   */
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }
  /** Returns the character encoding used for reading the lists. */
  public String getEncoding() {
    return encoding;
  }
  /** Sets the URL of the lists definition file. The list files named in the
   * definition file are resolved relative to this URL.
   */
  public void setListsURL(java.net.URL newListsURL) {
    listsURL = newListsURL;
  }
  /** Returns the URL of the lists definition file. */
  public java.net.URL getListsURL() {
    return listsURL;
  }
  /** Sets whether this gazetteer is case sensitive. The value is read both
   * when phrases are added and when a document is processed.
   */
  public void setCaseSensitive(Boolean newCaseSensitive) {
    caseSensitive = newCaseSensitive;
  }
  /** Returns whether this gazetteer is case sensitive. */
  public Boolean getCaseSensitive() {
    return caseSensitive;
  }
  /** Returns the name of the annotation set used for the newly produced
   * annotations.
   */
  public String getAnnotationSetName() {
    return annotationSetName;
  }
575 
576 } // DefaultGazetteer
577 
578 // >>> DAM: TransArray optimization, new charMap implementation
/**
 * A minimal iterator producing primitive <code>char</code> values, so that
 * keys can be walked without wrapping them in Character objects.
 */
interface Iter {

    /** Tells whether a call to next() can deliver another value. */
    boolean hasNext();

    /** Delivers the next <code>char</code> value. */
    char next();

} // Iter
584 
/**
 * A compact map from <code>char</code> keys to objects. The keys are kept in
 * an array sorted in ascending order, with the values in a parallel array, and
 * are located by binary search. Both arrays always have exactly one slot per
 * entry and are reallocated each time a new key is added.
 */
class charMap
{
    /** The keys in ascending order; <code>null</code> until the first put. */
    char[] itemsKeys = null;

    /** The values; the element at position i belongs to the key at i. */
    Object[] itemsObjs = null;

    /**
     * Grows both arrays by one slot, leaving an empty element at position
     * 'index': the entries before 'index' keep their position, the others
     * move one position up.
     */
    void resize(int index)
    {
        final int oldSize = itemsKeys.length;
        char[] grownKeys = new char[oldSize + 1];
        Object[] grownObjs = new Object[oldSize + 1];

        //the part before the gap
        System.arraycopy(itemsKeys, 0, grownKeys, 0, index);
        System.arraycopy(itemsObjs, 0, grownObjs, 0, index);
        //the part after the gap
        System.arraycopy(itemsKeys, index, grownKeys, index + 1,
                         oldSize - index);
        System.arraycopy(itemsObjs, index, grownObjs, index + 1,
                         oldSize - index);

        itemsKeys = grownKeys;
        itemsObjs = grownObjs;
    } // resize

    /**
     * Gets the object stored under the given key, or <code>null</code> if
     * there is none.
     */
    Object get(char key)
    {
        if (itemsKeys != null)
        {
            int pos = Arrays.binarySearch(itemsKeys, key);
            if (pos >= 0) return itemsObjs[pos];
        }
        return null;
    } // get

    /**
     * Stores the object under the given key unless the key is already
     * present; an existing entry is left untouched.
     *
     * @return the object held under the key after the call: 'value' if it
     *     was stored, the previously stored object otherwise
     */
    Object put(char key, Object value)
    {
        if (itemsKeys == null)
        {
            //first entry: allocate one-element arrays
            itemsKeys = new char[] { key };
            itemsObjs = new Object[] { value };
            return value;
        }

        int pos = Arrays.binarySearch(itemsKeys, key);
        if (pos >= 0) return itemsObjs[pos];

        //a negative result encodes the insertion point as -(point) - 1
        int insertAt = -(pos + 1);
        resize(insertAt);
        itemsKeys[insertAt] = key;
        itemsObjs[insertAt] = value;
        return value;
    } // put

} // class charMap
667 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState