1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.42 2002/03/06 17:15:44 kalina Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import gate.*;
20  import gate.util.*;
21  import gate.creole.*;
22  import gate.corpora.*;
23  import gate.annotation.*;
24  import java.util.*;
25  import java.io.*;
26  import java.net.*;
27  import gnu.regexp.*;
28  
29  public class OrthoMatcher extends AbstractLanguageAnalyser
30                            implements ANNIEConstants{
31  
32    public static final String
33      OM_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
37  
38    public static final String
39      OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
40  
41    public static final String
42      OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
43  
44    public static final String
45      OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
46  
47    public static final String
48      OM_PERSON_TYPE_PARAMETER_NAME = "personType";
49  
50    public static final String
51      OM_EXT_LISTS_PARAMETER_NAME = "extLists";
52  
53    protected static final String CDGLISTNAME = "cdg";
54    protected static final String ALIASLISTNAME = "alias";
55    protected static final String ARTLISTNAME = "def_art";
56    protected static final String PREPLISTNAME = "prepos";
57    protected static final String CONNECTORLISTNAME = "connector";
58    protected static final String SPURLISTNAME = "spur_match";
59  
60    protected static final String PUNCTUATION_VALUE = "punctuation";
61    protected static final String THE_VALUE = "The";
62  
63  
64    /**the name of the annotation set*/
65    protected String annotationSetName;
66  
67    /** the types of the annotation */
68    protected List annotationTypes = new ArrayList(10);
69  
70    /** the organization type*/
71    protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
72  
73    /** the person type*/
74    protected String personType = PERSON_ANNOTATION_TYPE;
75  
76    protected String unknownType = "Unknown";
77  
78    /** internal or external list */
79    protected boolean extLists = true;
80  
81    protected boolean matchingUnknowns = true;
82  
83    /** This is an internal variable to indicate whether
84     *  we matched using a rule that requires that
85     *  the newly matched annotation matches all the others
86     *  This is needed, because organizations can share
87     *  first/last tokens like News and be different
88     */
89    private   boolean allMatchingNeeded = false;
90  
91    //** Orthomatching is not case-sensitive by default*/
92    protected boolean caseSensitive = false;
93  
94    protected FeatureMap queryFM = Factory.newFeatureMap();
95  
96  //  protected ExecutionException executionException;
97  
98    // name lookup tables (used for namematch)
99    //gave them bigger default size, coz rehash is expensive
100   protected HashMap alias = new HashMap(100);
101   protected HashSet cdg = new HashSet(50);
102   protected HashMap spur_match = new HashMap(100);
103   protected HashMap def_art = new HashMap(20);
104   protected HashMap connector = new HashMap(20);
105   protected HashMap prepos = new HashMap(30);
106 
107 
108   protected AnnotationSet nameAllAnnots = null;
109   protected HashMap processedAnnots = new HashMap(150);
110   protected HashMap annots2Remove = new HashMap(75);
111   protected List matchesDocFeature = new ArrayList();
112   //maps annotation ids to array lists of tokens
113   protected HashMap tokensMap = new HashMap(150);
114 
115   protected Annotation shortAnnot, longAnnot;
116 
117   protected ArrayList tokensLongAnnot, tokensShortAnnot;
118 
119   /** a feature map to be used when retrieving annotations
120    *  declared here so can be reused for efficiency
121    *  clear() before each use
122    */
123   protected FeatureMap tempMap = Factory.newFeatureMap();
124 
125   /** a buffer in order to read an array of char */
126   private char[] cbuffer = null;
127 
128   /** the size of the buffer */
129   private final static int BUFF_SIZE = 65000;
130 
131   /** @link dependency */
132   /*#OrthoMatcher lnkOrthoMatcher;*/
133 
134   public OrthoMatcher () {
135     annotationTypes.add(organizationType);
136     annotationTypes.add(personType);
137     annotationTypes.add("Location");
138     annotationTypes.add("Date");
139   }
140 
141   /** Initialise this resource, and return it. */
142   public Resource init() throws ResourceInstantiationException {
143     cbuffer = new char[BUFF_SIZE];
144 
145     //initialise the list of annotations which we will match
146     try {
147       createLists();
148     } catch (IOException ioe) {ioe.printStackTrace();}
149     return this;
150   } // init()
151 
152   /**  Run the resource. It doesn't make sense not to override
153     *  this in subclasses so the default implementation signals an
154     *  exception.
155     */
156   public void execute() throws ExecutionException{
157 
158     //check the input
159     if(document == null) {
160       throw new ExecutionException(
161         "No document for namematch!"
162       );
163     }
164 
165     // get the annotations from document
166     if ((annotationSetName == null)|| (annotationSetName.equals("")))
167       nameAllAnnots = document.getAnnotations();
168     else
169       nameAllAnnots = document.getAnnotations(annotationSetName);
170 
171     //if none found, print warning and exit
172     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
173       Out.prln("OrthoMatcher Warning: No annotations found for processing");
174       return;
175     }
176 
177     //check if we've been run on this document before
178     //and clean the doc if needed
179     docCleanup();
180     Map matchesMap = (Map)document.getFeatures().
181                      get(DOCUMENT_COREF_FEATURE_NAME);
182 
183     // creates the cdg list from the document
184     //no need to create otherwise, coz already done in init()
185     if (!extLists)
186       buildTables(nameAllAnnots);
187 
188     //first match all name annotations
189     matchNameAnnotations();
190 
191     //then match the unknown ones to all name ones
192     if (matchingUnknowns)
193       matchUnknown();
194 
195     // set the matches of the document
196 //    determineMatchesDocument();
197     if (! matchesDocFeature.isEmpty()) {
198       if(matchesMap == null){
199         matchesMap = new HashMap();
200       }
201       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
202       //we need to put it even if it was already present in order to triger
203       //the update events
204       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
205 
206       //cannot do clear() as this has already been put on the document
207       //so I need a new one for the next run of matcher
208       matchesDocFeature = new ArrayList();
209     }
210 
211 //    Out.prln("Processed strings" + processedAnnots.values());
212     //clean-up the internal data structures for next run
213     nameAllAnnots = null;
214     processedAnnots.clear();
215     annots2Remove.clear();
216     tokensMap.clear();
217     matchesDocFeature = new ArrayList();
218     longAnnot = null;
219     shortAnnot = null;
220     tokensLongAnnot = null;
221     tokensShortAnnot = null;
222 
223   } // run()
224 
225   protected void matchNameAnnotations() throws ExecutionException{
226     // go through all the annotation types
227     Iterator iterAnnotationTypes = annotationTypes.iterator();
228     while (iterAnnotationTypes.hasNext()) {
229       String annotationType = (String)iterAnnotationTypes.next();
230 
231       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
232 
233       // continue if no such annotations exist
234       if ((nameAnnots == null) || nameAnnots.isEmpty())
235         continue;
236 
237       Iterator iterNames = nameAnnots.iterator();
238       while (iterNames.hasNext()) {
239         Annotation nameAnnot = (Annotation) iterNames.next();
240         Integer id = nameAnnot.getId();
241 
242         // get string and value
243         String annotString = null;
244         try {
245             annotString = document.getContent().getContent(
246             nameAnnot.getStartNode().getOffset(),
247             nameAnnot.getEndNode().getOffset()
248             ).toString();
249           // now do the reg. exp. substitutions
250           annotString = regularExpressions(annotString," ", "\\s+");
251 
252         } catch (InvalidOffsetException ioe) {
253             throw new ExecutionException
254                                    ("Invalid offset of the annotation");
255         }
256         //convert to lower case if we are not doing a case sensitive match
257         if (!caseSensitive)
258           annotString = annotString.toLowerCase();
259 
260         //get the tokens
261         List tokens = new ArrayList((Set)
262                         nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
263                           nameAnnot.getStartNode().getOffset(),
264                           nameAnnot.getEndNode().getOffset()
265                         ));
266         //if no tokens to match, do nothing
267         if (tokens.isEmpty())
268           continue;
269         Collections.sort(tokens, new gate.util.OffsetComparator());
270         //check if these actually do not end after the name
271         //needed coz new tokeniser conflates
272         //strings with dashes. So British Gas-style is two tokens
273         //instead of three. So cannot match properly British Gas
274 //        tokens = checkTokens(tokens);
275         tokensMap.put(nameAnnot.getId(), tokens);
276 
277 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
278 
279         //first check whether we have not matched such a string already
280         //if so, just consider it matched, don't bother calling the rules
281         if (processedAnnots.containsValue(annotString)) {
282 //          Out.prln("Contained string found " + annotString);
283           updateMatches(nameAnnot, annotString);
284           processedAnnots.put(nameAnnot.getId(), annotString);
285           continue;
286         } else if (processedAnnots.isEmpty()) {
287           processedAnnots.put(nameAnnot.getId(), annotString);
288           continue;
289         }
290 
291         //if a person, then remove their title before matching
292         if (nameAnnot.getType().equals(personType))
293           annotString = containTitle(annotString, nameAnnot);
294         else if (nameAnnot.getType().equals(organizationType))
295           annotString = stripCDG(annotString, nameAnnot);
296 
297         if(null == annotString || "".equals(annotString))
298           continue;
299 
300         //otherwise try matching with previous annotations
301         matchWithPrevious(nameAnnot, annotString);
302 
303 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
304         //finally add the current annotations to the processed map
305         processedAnnots.put(nameAnnot.getId(), annotString);
306       }//while through name annotations
307 
308     }//while through annotation types
309 
310   }
311 
312   protected void matchUnknown() throws ExecutionException {
313     //get all Unknown annotations
314     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
315 
316     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
317       return;
318 
319     Iterator iter = unknownAnnots.iterator();
320     //loop through the unknown annots
321     while (iter.hasNext()) {
322       Annotation unknown = (Annotation) iter.next();
323 
324       // get string and value
325       String unknownString = null;
326       try {
327           unknownString = document.getContent().getContent(
328             unknown.getStartNode().getOffset(),
329             unknown.getEndNode().getOffset()
330             ).toString();
331         // now do the reg. exp. substitutions
332         unknownString = regularExpressions(unknownString," ", "\\s+");
333       } catch (InvalidOffsetException ioe) {
334           throw new ExecutionException
335                                  ("Invalid offset of the annotation");
336       }
337       //convert to lower case if we are not doing a case sensitive match
338       if (!caseSensitive)
339         unknownString = unknownString.toLowerCase();
340 
341       //get the tokens
342       List tokens = new ArrayList((Set)
343                       nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
344                         unknown.getStartNode().getOffset(),
345                         unknown.getEndNode().getOffset()
346                       ));
347       if (tokens.isEmpty())
348         continue;
349       Collections.sort(tokens, new gate.util.OffsetComparator());
350       tokensMap.put(unknown.getId(), tokens);
351 
352 
353       //first check whether we have not matched such a string already
354       //if so, just consider it matched, don't bother calling the rules
355       if (processedAnnots.containsValue(unknownString)) {
356         Annotation matchedAnnot = updateMatches(unknown, unknownString);
357 //        Out.prln("Matched " + unknown + "with string " + unknownString);
358 //        Out.prln("That's same as " + matchedAnnot);
359         if (matchedAnnot.getType().equals(unknownType)) {
360           annots2Remove.put(unknown.getId(),
361                             annots2Remove.get(matchedAnnot.getId()));
362         }
363         else
364           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
365         processedAnnots.put(unknown.getId(), unknownString);
366         unknown.getFeatures().put("NMRule", unknownType);
367         continue;
368       }
369 
370       //check if we should do sub-string matching in case it's hyphenated
371       //for example US-led
372       if (tokens.size() == 1
373           && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
374         if (matchHyphenatedUnknowns(unknown, unknownString, iter))
375           continue;
376       }//if
377 
378       matchWithPrevious(unknown, unknownString);
379 
380     } //while though unknowns
381 
382     if (! annots2Remove.isEmpty()) {
383       Iterator unknownIter = annots2Remove.keySet().iterator();
384       while (unknownIter.hasNext()) {
385         Integer unknId = (Integer) unknownIter.next();
386         Annotation unknown = nameAllAnnots.get(unknId);
387         Integer newID = nameAllAnnots.add(
388           unknown.getStartNode(),
389           unknown.getEndNode(),
390           (String) annots2Remove.get(unknId),
391           unknown.getFeatures()
392         );
393         nameAllAnnots.remove(unknown);
394 
395         //change the id in the matches list
396         List mList = (List)unknown.getFeatures().
397                      get(ANNOTATION_COREF_FEATURE_NAME);
398         mList.remove(unknId);
399         mList.add(newID);
400       }//while
401     }//if
402   }
403 
404   private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
405                                        Iterator iter){
406     boolean matched = false;
407 
408     //only take the substring before the hyphen
409     int stringEnd = unknownString.indexOf("-");
410     unknownString = unknownString.substring(0, stringEnd);
411     //check if we've already matched this string
412     //because only exact match of the substring are considered
413     if (processedAnnots.containsValue(unknownString)) {
414       matched = true;
415       Annotation matchedAnnot = updateMatches(unknown, unknownString);
416       //only do the matching if not a person, because we do not match
417       //those on sub-strings
418       iter.remove();
419       String newType;
420       if (matchedAnnot.getType().equals(unknownType))
421         newType = (String)annots2Remove.get(matchedAnnot.getId());
422       else
423         newType = matchedAnnot.getType();
424 
425       Integer newID = new Integer(-1);
426       try {
427         newID = nameAllAnnots.add(
428           unknown.getStartNode().getOffset(),
429           new Long(unknown.getStartNode().getOffset().longValue()
430                   + stringEnd),
431           newType,
432           unknown.getFeatures()
433         );
434       } catch (InvalidOffsetException ex) {
435         throw new GateRuntimeException(ex.getMessage());
436       }
437       nameAllAnnots.remove(unknown);
438 
439       //change the id in the matches list
440       List mList = (List)unknown.getFeatures().
441                    get(ANNOTATION_COREF_FEATURE_NAME);
442       mList.remove(unknown.getId());
443       mList.add(newID);
444 
445     }
446     return matched;
447   }
448 
449   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
450     boolean matchedUnknown = false;
451 
452     Iterator prevIter = processedAnnots.keySet().iterator();
453     while (prevIter.hasNext()) {
454       Integer prevId = (Integer) prevIter.next();
455       Annotation prevAnnot = nameAllAnnots.get(prevId);
456 
457       //check if the two are from the same type or the new one is unknown
458       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
459           && ! nameAnnot.getType().equals(unknownType))
460          )
461         continue;
462       //do not compare two unknown annotations either
463       //they are only matched to those of known types
464       if (  nameAnnot.getType().equals(unknownType)
465             && prevAnnot.getType().equals(unknownType))
466       continue;
467 
468       //check if we have already matched this annotation to the new one
469       if (matchedAlready(nameAnnot, prevAnnot) )
470         continue;
471 
472       //now changed to a rule, here we just match by gender
473       if (prevAnnot.getType().equals(personType)) {
474         String prevGender =
475           (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
476         String nameGender =
477           (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
478         if (   prevGender != null
479             && nameGender != null
480             && ( (nameGender.equalsIgnoreCase("female")
481                   &&
482                   prevGender.equalsIgnoreCase("male")
483                   )
484                ||
485                   (prevGender.equalsIgnoreCase("female")
486                    && nameGender.equalsIgnoreCase("male")
487                   )
488                 )
489             ) //if condition
490           continue; //we don't have a match if the two genders are different
491 
492       }//if
493 
494       //if the two annotations match
495       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
496 //        Out.prln("Matched " + shortName + "and " + longName);
497         updateMatches(nameAnnot, prevAnnot);
498         //if unknown annotation, we need to change to the new type
499         if (nameAnnot.getType().equals(unknownType)) {
500           matchedUnknown = true;
501           if (prevAnnot.getType().equals(unknownType))
502             annots2Remove.put(nameAnnot.getId(),
503                               annots2Remove.get(prevAnnot.getId()));
504           else
505             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
506          //also put an attribute to indicate that
507           nameAnnot.getFeatures().put("NMRule", unknownType);
508         }//if unknown
509         break; //no need to match further
510       }//if annotations matched
511 
512     }//while through previous annotations
513 
514     if (matchedUnknown)
515       processedAnnots.put(nameAnnot.getId(), annotString);
516 
517 
518   }//matchWithPrevious
519 
520   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
521                                      Annotation prevAnnot) {
522     //do not match two annotations that overlap
523     if (newAnnot.overlaps(prevAnnot))
524       return false;
525 
526     // find which annotation string of the two is longer
527     //  this is useful for some of the matching rules
528     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
529 
530     String longName = prevAnnotString;
531     String shortName = annotString;
532     longAnnot = prevAnnot;
533     shortAnnot = newAnnot;
534 
535     if (shortName.length()>=longName.length()) {
536       String temp = longName;
537       longName = shortName;
538       shortName = temp;
539       Annotation tempAnn = longAnnot;
540       longAnnot = shortAnnot;
541       shortAnnot = tempAnn;
542     }//if
543 
544     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
545     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
546 
547     List matchesList = (List) prevAnnot.getFeatures().
548                               get(ANNOTATION_COREF_FEATURE_NAME);
549     if (matchesList == null || matchesList.isEmpty())
550       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
551 
552     //if these two match, then let's see if all the other matching one will too
553     //that's needed, because sometimes names can share a token (e.g., first or
554     //last but not be the same
555     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
556       /**
557        * Check whether we need to ensure that there is a match with the rest
558        * of the matching annotations, because the rule requires that
559        * transtivity is not assummed.
560        */
561       if (allMatchingNeeded) {
562         allMatchingNeeded = false;
563 
564         List toMatchList = new ArrayList(matchesList);
565   //      if (newAnnot.getType().equals(unknownType))
566   //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
567         toMatchList.remove(prevAnnot.getId());
568 
569         return matchOtherAnnots(toMatchList, newAnnot, annotString);
570       } else
571         return true;
572     }
573     return false;
574   }
575 
576   /** This method checkes whether the new annotation matches
577    *  all annotations given in the toMatchList (it contains ids)
578    *  The idea is that the new annotation needs to match all those,
579    *  because assuming transitivity does not always work, when
580    *  two different entities share a common token: e.g., BT Cellnet
581    *  and BT and British Telecom.
582   */
583   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
584                                       String annotString) {
585 
586     //if the list is empty, then we're matching all right :-)
587     if (toMatchList.isEmpty())
588       return true;
589 
590     boolean matchedAll = true;
591     int i = 0;
592 
593     while (matchedAll && i < toMatchList.size()) {
594       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
595 
596       // find which annotation string of the two is longer
597       //  this is useful for some of the matching rules
598       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
599       if (prevAnnotString == null)
600         try {
601           prevAnnotString = document.getContent().getContent(
602             prevAnnot.getStartNode().getOffset(),
603             prevAnnot.getEndNode().getOffset()
604             ).toString();
605         } catch (InvalidOffsetException ioe) {
606           return false;
607         }//try
608 
609 
610       String longName = prevAnnotString;
611       String shortName = annotString;
612       longAnnot = prevAnnot;
613       shortAnnot = newAnnot;
614 
615       if (shortName.length()>=longName.length()) {
616         String temp = longName;
617         longName = shortName;
618         shortName = temp;
619         Annotation tempAnn = longAnnot;
620         longAnnot = shortAnnot;
621         shortAnnot = tempAnn;
622       }//if
623 
624       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
625       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
626 
627       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
628 //      if (newAnnot.getType().equals(unknownType))
629 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
630 
631       i++;
632     }//while
633     return matchedAll;
634   }
635 
636 
637   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
638     //the two annotations are already matched if the matches list of the first
639     //contains the id of the second
640     List matchesList = (List) annot1.getFeatures().
641                        get(ANNOTATION_COREF_FEATURE_NAME);
642     if ((matchesList == null) || matchesList.isEmpty())
643       return false;
644     else if (matchesList.contains(annot2.getId()))
645       return true;
646     return false;
647   }
648 
649   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
650     Annotation matchedAnnot = null;
651     Integer id;
652 
653     //first find a processed annotation with the same string
654     Iterator iter = processedAnnots.keySet().iterator();
655     while (iter.hasNext()) {
656       id = (Integer) iter.next();
657       String oldString = (String) processedAnnots.get(id);
658       if (annotString.equals(oldString)) {
659         matchedAnnot = nameAllAnnots.get(id);
660         break;
661       }//if
662     }//while
663 
664     if (matchedAnnot == null) return null;
665     //if the two matching annotations are of different type which is not
666     //unknown, do not match them
667     if (! matchedAnnot.getType().equals(newAnnot.getType())
668         && !newAnnot.getType().equals(unknownType) )
669       return matchedAnnot;
670 
671     List matchesList = (List) matchedAnnot.getFeatures().
672                        get(ANNOTATION_COREF_FEATURE_NAME);
673     if ((matchesList == null) || matchesList.isEmpty()) {
674       //no previous matches, so need to add
675       if (matchesList == null) {
676         matchesList = new ArrayList();
677         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
678                                        matchesList);
679         matchesDocFeature.add(matchesList);
680       }//if
681       matchesList.add(matchedAnnot.getId());
682       matchesList.add(newAnnot.getId());
683     } else {
684       //just add the new annotation
685       matchesList.add(newAnnot.getId());
686     }//if
687     //add the matches list to the new annotation
688     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
689     return matchedAnnot;
690   }
691 
692   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
693 
694     List matchesList = (List) prevAnnot.getFeatures().
695                               get(ANNOTATION_COREF_FEATURE_NAME);
696     if ((matchesList == null) || matchesList.isEmpty()) {
697       //no previous matches, so need to add
698       if (matchesList == null) {
699         matchesList = new ArrayList();
700         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
701         matchesDocFeature.add(matchesList);
702       }//if
703       matchesList.add(prevAnnot.getId());
704       matchesList.add(newAnnot.getId());
705     } else {
706       //just add the new annotation
707       matchesList.add(newAnnot.getId());
708     }//if
709     //add the matches list to the new annotation
710     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
711     //propagate the gender if two persons are matched
712     if (prevAnnot.getType().equals(personType)) {
713       String prevGender =
714         (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
715       String newGender =
716         (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
717       boolean unknownPrevGender = isUnknownGender(prevGender);
718       boolean unknownNewGender = isUnknownGender(newGender);
719       if (unknownPrevGender && !unknownNewGender)
720         prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
721       else if (unknownNewGender && !unknownPrevGender)
722         newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
723     }//if
724   }
725 
726 
727   protected void docCleanup() {
728     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
729     if (matchesValue != null && (matchesValue instanceof Map))
730       ((Map)matchesValue).remove(nameAllAnnots.getName());
731     else if (matchesValue != null) {
732       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
733     }
734 
735     //get all annotations that have a matches feature
736     HashSet fNames = new HashSet();
737     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
738     AnnotationSet annots =
739                   nameAllAnnots.get(null, fNames);
740 
741 //    Out.prln("Annots to cleanup" + annots);
742 
743     if (annots == null || annots.isEmpty())
744       return;
745 
746     Iterator iter = annots.iterator();
747     while (iter.hasNext()) {
748       while (iter.hasNext())
749         ((Annotation) iter.next()).getFeatures().
750                                    remove(ANNOTATION_COREF_FEATURE_NAME);
751     } //while
752   }//cleanup
753 
754   /** return a person name without title */
755   protected String containTitle (String annotString, Annotation annot)
756                       throws ExecutionException {
757     // get the offsets
758     Long startAnnot = annot.getStartNode().getOffset();
759     Long endAnnot = annot.getEndNode().getOffset();
760 
761     // determine "Lookup" annotation set
762     queryFM.clear();
763     queryFM.put("majorType", "title");
764     AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
765     if (as1 == null || as1.isEmpty())
766       return annotString;
767     AnnotationSet as =
768       as1.get("Lookup", queryFM);
769     if (as !=null && ! as.isEmpty()) {
770       List titles = new ArrayList((Set)as);
771       Collections.sort(titles, new gate.util.OffsetComparator());
772 
773       Iterator iter = titles.iterator();
774       while (iter.hasNext()) {
775         Annotation titleAnn = (Annotation)(iter.next());
776 
777         //we've not found a title at the start offset,
778         //there's no point in looking further
779         //coz titles come first
780         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
781           return annotString;
782 
783         try {
784           // the title from the current annotation
785           String annotTitle =
786             document.getContent().getContent(
787               titleAnn.getStartNode().getOffset(),
788               titleAnn.getEndNode().getOffset()
789             ).toString();
790 
791           // eliminate the title from annotation string and return the result
792           if (annotTitle.length()<annotString.length()) {
793             //remove from the array of tokens, so then we can compare properly
794             //the remaining tokens
795 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
796 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
797 //            Out.prln("Title is" + annotTitle);
798             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
799             return annotString.substring(
800                                  annotTitle.length()+1,annotString.length());
801           }
802         } catch (InvalidOffsetException ioe) {
803             throw new ExecutionException
804                                ("Invalid offset of the annotation");
805         }//try
806       }// while
807     }//if
808     return annotString;
809 
810   }
811 
812   /** return an organization  without a designator and starting The*/
813   protected String stripCDG (String annotString, Annotation annot){
814 
815     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
816 
817     //strip starting The first
818     if ( ((String) ((Annotation) tokens.get(0)
819           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
820           .equalsIgnoreCase(THE_VALUE))
821       tokens.remove(0);
822 
823     //no need to check for cdg if there is only 1 token or less
824     if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
825           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
826       tokens.remove(tokens.size()-1);
827 
828     StringBuffer newString = new StringBuffer(50);
829     for (int i = 0; i < tokens.size(); i++){
830       newString.append((String) ((Annotation) tokens.get(i)
831           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
832       if (i != tokens.size()-1)
833         newString.append(" ");
834     }
835 //    Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
836 
837     if (caseSensitive)
838       return newString.toString();
839 
840     return newString.toString().toLowerCase();
841   }
842 
843 /*
844   public void check() throws ExecutionException {
845     if (executionException != null) {
846       ExecutionException e = executionException;
847       executionException = null;
848       throw e;
849     }
850   } // check()
851 */
852 
853   /** if ( == false) then reads the names of files in order
854     *  to create the lookup tables
855     */
856   protected void createLists() throws IOException {
857     InputStream inputStream = Files.getGateResourceAsStream(
858                                               "creole/namematcher/listsNM.def");
859     InputStreamReader inputStreamReader = new InputStreamReader (
860                                                     inputStream);
861     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
862 
863     String lineRead = null;
864     while ((lineRead = bufferedReader.readLine()) != null){
865       int index = lineRead.indexOf(":");
866       if (index != -1){
867         String nameFile = lineRead.substring(0,index);
868         String nameList = lineRead.substring(index+1,lineRead.length());
869         createAnnotList(nameFile,nameList);
870       }// if
871     }//while
872     bufferedReader.close();
873     inputStreamReader.close();
874     inputStream.close();
875   }// createLists()
876 
877   /** creates the lookup tables */
878   protected void createAnnotList(String nameFile,String nameList)
879                                                           throws IOException{
880     InputStream inputStream = Files.getGateResourceAsStream(
881                                               "creole/namematcher/"+nameFile);
882     InputStreamReader inputStreamReader = new InputStreamReader (
883                                                     inputStream);
884     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
885 
886     String lineRead = null;
887     while ((lineRead = bufferedReader.readLine()) != null){
888       if (nameList.compareTo(CDGLISTNAME)==0){
889         if (caseSensitive)
890           cdg.add(lineRead);
891         else
892           cdg.add(lineRead.toLowerCase());
893       }// if
894       else {
895         int index = lineRead.indexOf("£");
896         if (index != -1){
897           String  expr = lineRead.substring(0,index);
898           //if not case-sensitive, we need to downcase all strings
899           if (!caseSensitive)
900             expr = expr.toLowerCase();
901           String code = lineRead.substring(index+1,lineRead.length());
902           if (nameList.equals(ALIASLISTNAME))
903                             alias.put(expr, code);
904           else
905           if (nameList.equals(ARTLISTNAME))
906                             def_art.put(expr, code);
907           else
908           if (nameList.equals(PREPLISTNAME))
909                             prepos.put(expr, code);
910           else
911           if (nameList.equals(CONNECTORLISTNAME))
912                             connector.put(expr, code);
913           else
914           if (nameList.equals(SPURLISTNAME))
915                             spur_match.put(expr, code);
916 
917         }//if
918       }// else
919 
920     }//while
921   }//createAnnotList
922 
923 
924   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
925   private boolean apply_rules_namematch(String annotationType, String shortName,
926                                         String longName) {
927     // first apply rule for spurius matches i.e. rule0
928     if (matchRule0(longName, shortName))
929       return false;
930     if (
931          (// rules for all annotations
932           //no longer use rule1, coz I do the check for same string via the
933           //hash table
934             matchRule2(longName, shortName)
935          ||
936             matchRule3(longName, shortName)
937          ) // rules for all annotations
938          ||
939          (// rules for organisation annotations
940              ( annotationType.equals(organizationType)
941                //ACE addition
942                || annotationType.equals("Facility"))
943              &&
944              (    matchRule4(longName, shortName)
945                ||
946                   matchRule5(longName, shortName)
947                ||
948                   matchRule6(longName, shortName)
949                ||
950                   matchRule7(longName, shortName)
951                ||
952 //                  matchRule8(longName, shortName)
953 //               ||
954                   matchRule9(longName, shortName)
955                ||
956                   matchRule10(longName, shortName)
957                ||
958                   matchRule11(longName, shortName)
959                ||
960                   matchRule12(longName, shortName)
961                ||
962                   matchRule13(shortName, longName)
963               )
964            )// rules for organisation annotations
965          ||
966          (// rules for person annotations
967              (    annotationType.equals(personType))
968                &&
969              (    matchRule4(longName, shortName)
970                ||
971                   matchRule5(longName, shortName)
972                ||
973                   matchRule14(longName, shortName)
974                || //kalina: added this, so it matches names when contain more
975                   //than one first and one last name
976                   matchRule15(longName, shortName)
977               )
978           )// rules for person annotations
979          ) //if
980       return true;
981     return false;
982   }//apply_rules
983 
984 
985   /** set the extLists flag */
986   public void setExtLists(Boolean newExtLists) {
987     extLists = newExtLists.booleanValue();
988   }//setextLists
989 
990   /** set the caseSensitive flag */
991   public void setCaseSensitive(Boolean newCase) {
992     caseSensitive = newCase.booleanValue();
993   }//setextLists
994 
995   /** set the annotation set name*/
996   public void setAnnotationSetName(String newAnnotationSetName) {
997     annotationSetName = newAnnotationSetName;
998   }//setAnnotationSetName
999 
1000  /** set the types of the annotations*/
1001  public void setAnnotationTypes(List newType) {
1002    annotationTypes = newType;
1003  }//setAnnotationTypes
1004
1005  public void setOrganizationType(String newOrganizationType) {
1006    organizationType = newOrganizationType;
1007  }//setOrganizationType
1008
1009  public void setPersonType(String newPersonType) {
1010    personType = newPersonType;
1011  }//setPersonType
1012
1013  /**get the name of the annotation set*/
1014  public String getAnnotationSetName() {
1015    return annotationSetName;
1016  }//getAnnotationSetName
1017
1018  /** get the types of the annotation*/
1019  public List getAnnotationTypes() {
1020    return annotationTypes;
1021  }//getAnnotationTypes
1022
1023  public String getOrganizationType() {
1024    return organizationType;
1025  }
1026
1027  public String getPersonType() {
1028    return personType;
1029  }
1030
1031  public Boolean getExtLists() {
1032    return new Boolean(extLists);
1033  }
1034
1035  public Boolean getCaseSensitive() {
1036    return new Boolean(caseSensitive);
1037  }
1038
1039/*
1040  public List getMatchesDocument() {
1041    return matchesDocument;
1042  }
1043*/
1044
1045  protected boolean isUnknownGender(String gender) {
1046    if (gender == null)
1047      return true;
1048    if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1049      return false;
1050    return true;
1051
1052  } //isUnknownGender
1053
1054  /** RULE #0: If the two names are listed in table of
1055    * spurius matches then they do NOT match
1056    * Condition(s): -
1057    * Applied to: all name annotations
1058    */
1059  public boolean matchRule0(String s1,
1060           String s2) {
1061    if (spur_match.containsKey(s1)
1062        && spur_match.containsKey(s2) )
1063      return
1064        spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1065
1066    return false;
1067  }//matchRule0
1068
1069  /** RULE #1: If the two names are identical then they are the same
1070    * no longer used, because I do the check for same string via the
1071    * hash table of previous annotations
1072    * Condition(s): depend on case
1073    * Applied to: all name annotations
1074    */
1075  public boolean matchRule1(String s1,
1076           String s2,
1077           boolean matchCase) {
1078//    Out.prln("Rule1: Matching " + s1 + "and " + s2);
1079
1080    boolean matched = false;
1081    if (!matchCase)
1082        matched = s1.equalsIgnoreCase(s2);
1083    else matched =  s1.equals(s2) ;
1084//kalina: do not remove, nice for debug
1085//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
1086//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1087    return matched;
1088  }//matchRule1
1089
1090
1091  /**
1092    * RULE #2: if the two names are listed as equivalent in the
1093    * lookup table (alias) then they match
1094    * Condition(s): -
1095    * Applied to: all name annotations
1096    */
1097  public boolean matchRule2(String s1,
1098           String s2) {
1099
1100    if (alias.containsKey(s1) && alias.containsKey(s2))
1101      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1102
1103    return false;
1104  }//matchRule2
1105
1106  /**
1107    * RULE #3: adding a possessive at the end
1108    * of one name causes a match
1109    * e.g. "Standard and Poor" == "Standard and Poor's"
1110    * and also "Standard and Poor" == "Standard's"
1111    * Condition(s): case-insensitive match
1112    * Applied to: all name annotations
1113    */
1114  public boolean matchRule3(String s1, //long string
1115                             String s2) { //short string
1116
1117    if (s2.endsWith("'s") || s2.endsWith("'")
1118        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1119
1120
1121      String s2_poss = null;
1122
1123      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1124      else s2_poss = s2.concat("'");
1125
1126      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1127
1128      // now check the second case i.e. "Standard and Poor" == "Standard's"
1129      String token = (String)
1130        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1131
1132      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1133      else s2_poss = token.concat("'");
1134
1135      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1136
1137    } // if (s2.endsWith("'s")
1138    return false;
1139  }//matchRule3
1140
1141  /**
1142    * RULE #4: Do all tokens other than the punctuation marks
1143    * , and . match?
1144    * e.g. "Smith, Jones" == "Smith Jones"
1145    * Condition(s): case-insensitive match
1146    * Applied to: organisation and person annotations
1147    */
1148  public boolean matchRule4(String s1,
1149           String s2) {
1150
1151    boolean allTokensMatch = true;
1152
1153    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1154    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1155    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1156      Annotation token = (Annotation) tokensLongAnnotIter.next();
1157      if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1158        continue;
1159//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1160      if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1161             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1162        allTokensMatch = false;
1163        break;
1164      } // if (!tokensLongAnnot.nextToken()
1165    } // while
1166//    if (allTokensMatch)
1167//      Out.prln("rule4 fired. result is: " + allTokensMatch);
1168    return allTokensMatch;
1169  }//matchRule4
1170
1171  /**
1172    * RULE #5: if the 1st token of one name
1173    * matches the second name
1174    * e.g. "Pepsi Cola" == "Pepsi"
1175    * Condition(s): case-insensitive match
1176    * Applied to: all name annotations
1177    */
1178  public boolean matchRule5(String s1,
1179           String s2) {
1180
1181    //do not match numbers by this rule
1182    if (tokensLongAnnot.size()> 1 &&
1183        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1184      return false;
1185
1186//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) {
1187//      Out.prln("Rule 5: " + s1 + "and " + s2);
1188//    }
1189
1190    //require that when matching person names, the shorter one to be of length 1
1191    //for the rule to apply. In other words, avoid matching Peter Smith and
1192    //Peter Kline, because they share a Peter token.
1193    if ( (shortAnnot.getType().equals(personType)
1194         || longAnnot.getType().equals(personType)
1195         )
1196       &&
1197         tokensShortAnnot.size()>1
1198       )
1199       return false;
1200
1201    if (tokensLongAnnot.size()<=1)
1202      return false;
1203    boolean result = matchRule1((String)
1204                      ((Annotation) tokensLongAnnot.get(0)
1205                        ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1206                      s2,
1207                      caseSensitive);
1208
1209//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick"))
1210//      Out.prln("rule 5 result: " + result);
1211    return result;
1212
1213  }//matchRule5
1214
1215  /**
1216    * RULE #6: if one name is the acronym of the other
1217    * e.g. "Imperial Chemical Industries" == "ICI"
1218    * Applied to: organisation annotations only
1219    */
1220  public boolean matchRule6(String s1,
1221           String s2) {
1222
1223    int i = 0;
1224
1225    //check and if the shorted string has a space in it, then it's not
1226    //an acronym
1227    if (s2.indexOf(" ") > 0)
1228      return false;
1229
1230    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1231    StringBuffer acronym_s1 = new StringBuffer("");
1232    StringBuffer acronymDot_s1 = new StringBuffer("");
1233
1234    for ( ;i < tokensLongAnnot.size(); i++ ) {
1235      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1236                         ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1237      acronym_s1.append(toAppend);
1238      acronymDot_s1.append(toAppend);
1239      acronymDot_s1.append(".");
1240    }
1241
1242    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1243    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1244
1245    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1246        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1247      return true;
1248
1249    return false;
1250  }//matchRule6
1251
1252  /**
1253    * RULE #7: if one of the tokens in one of the
1254    * names is in the list of separators eg. "&"
1255    * then check if the token before the separator
1256    * matches the other name
1257    * e.g. "R.H. Macy & Co." == "Macy"
1258    * Condition(s): case-sensitive match
1259    * Applied to: organisation annotations only
1260    */
1261  public boolean matchRule7(String s1,
1262           String s2) {
1263
1264    //don't try it unless the second string is just one token
1265    if (tokensShortAnnot.size() != 1)
1266      return false;
1267
1268    String previous_token = null;
1269
1270    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1271      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1272          ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1273        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1274                                    ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1275
1276        break;
1277      }
1278    }
1279
1280    //now match previous_token with other name
1281    if (previous_token != null) {
1282//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1283//        Out.prln("Rule7");
1284      return matchRule1(previous_token,s2,caseSensitive);
1285
1286    }
1287    return false;
1288  }//matchRule7
1289
1290  /**
1291   * This rule is now obsolete, as The and the trailing CDG
1292   * are stripped before matching.
1293   * DO NOT CALL!!!
1294   *
1295    * RULE #8: if the names match, ignoring The and
1296    * and trailing company designator (which have already been stripped)
1297    * e.g. "The Magic Tricks Co." == "Magic Tricks"
1298    * Condition(s): case-sensitive match
1299    * Applied to: organisation annotations only
1300    */
1301  public boolean matchRule8(String s1,
1302           String s2) {
1303    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1304/*
1305    if (s1.startsWith("The ")) s1 = s1.substring(4);
1306    if (s2.startsWith("The ")) s2 = s2.substring(4);
1307
1308    // check that cdg is not empty
1309    if (!cdg.isEmpty()) {
1310      String stringToTokenize1 = s1;
1311      StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," ");
1312
1313      String stringToTokenize2 = s2;
1314      StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," ");
1315      String token = null;
1316      String cdg1 = null;
1317      String cdg2 = null;
1318
1319      s1 = "";
1320      s2 = "";
1321
1322      //check last token of s1
1323      while (tokensLongAnnot.hasMoreTokens()) {
1324        token = tokensLongAnnot.nextToken();
1325        if (!tokensLongAnnot.hasMoreTokens()
1326            && cdg.contains(token)) cdg1=token;
1327        else s1 = s1+token;
1328      }
1329
1330      // do the same for s2
1331      while (tokensShortAnnot.hasMoreTokens()) {
1332        token = tokensShortAnnot.nextToken();
1333        if (!tokensShortAnnot.hasMoreTokens()
1334          && cdg.contains(token)) cdg2=token;
1335        else s2 = s2+token;
1336      }
1337
1338      // if the company designators are different
1339      // then they are NOT the same organisations
1340      if ((cdg1!=null && cdg2!=null)
1341    && !cdg1.equalsIgnoreCase(cdg2)) return false;
1342    }
1343    if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive);
1344*/
1345    return false;
1346
1347  }//matchRule8
1348
1349  /**
1350    * RULE #9: does one of the names match the token
1351    * just before a trailing company designator
1352    * in the other name?
1353    * The company designator has already been chopped off,
1354    * so the token before it, is in fact the last token
1355    * e.g. "R.H. Macy Co." == "Macy"
1356    * Applied to: organisation annotations only
1357    */
1358  public boolean matchRule9(String s1,
1359           String s2) {
1360
1361//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1362//      Out.prln("Rule 9 " + s1 + " and " + s2);
1363    String s1_short = (String)
1364                      ((Annotation) tokensLongAnnot.get(
1365                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1366//    Out.prln("Converted to " + s1_short);
1367    if (tokensLongAnnot.size()>1) {
1368      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1369      //we need to make sure all names match, instead of assuming transitivity,
1370      //to avoid matching BBC News with News then News with ITV News, which
1371      //by transitivity leads to BBC News matching ITV News which is not what
1372      //we want
1373      if (matched)
1374        allMatchingNeeded = true;
1375      return matched;
1376    } //if
1377
1378    return false;
1379  }//matchRule9
1380
1381  /**
1382    * RULE #10: is one name the reverse of the other
1383    * reversing around prepositions only?
1384    * e.g. "Department of Defence" == "Defence Department"
1385    * Condition(s): case-sensitive match
1386    * Applied to: organisation annotations only
1387    */
1388  public boolean matchRule10(String s1,
1389            String s2) {
1390
1391    String token = null;
1392    String previous_token = null;
1393    String next_token = null;
1394    boolean invoke_rule=false;
1395
1396    if (tokensLongAnnot.size() >= 3
1397        && tokensShortAnnot.size() >= 2) {
1398
1399      // first get the tokens before and after the preposition
1400      int i = 0;
1401      for (; i< tokensLongAnnot.size(); i++) {
1402        token = (String)
1403                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1404        if (prepos.containsKey(token)) {
1405          invoke_rule=true;
1406          break;
1407        }//if
1408        previous_token = token;
1409      }//while
1410
1411      if (! invoke_rule)
1412        return false;
1413
1414      if (i < tokensLongAnnot.size()
1415          && previous_token != null)
1416        next_token= (String)
1417                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1418      else return false;
1419
1420      String s21 = (String)
1421                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1422      String s22 = (String)
1423                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1424      // then compare (in reverse) with the first two tokens of s2
1425      if (matchRule1(next_token,(String) s21,caseSensitive)
1426          && matchRule1(previous_token, s22,caseSensitive))
1427        return true ;
1428    }//if (tokensLongAnnot.countTokens() >= 3
1429    return false;
1430  }//matchRule10
1431
1432  /**
1433    * RULE #11: does one name consist of contractions
1434    * of the first two tokens of the other name?
1435    * e.g. "Communications Satellite" == "ComSat"
1436    * and "Pan American" == "Pan Am"
1437    * Condition(s): case-sensitive match
1438    * Applied to: organisation annotations only
1439    */
1440  public boolean matchRule11(String s1,
1441            String s2) {
1442
1443
1444    // first do the easy case e.g. "Pan American" == "Pan Am"
1445
1446    String token11 = null;
1447    String token12 = null;
1448    String token21 = null;
1449    String token22 = null;
1450
1451    if (tokensLongAnnot.size() < 2)
1452      return false;
1453
1454    // 1st get the first two tokens of s1
1455    token11 = (String)
1456                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1457    token12 = (String)
1458                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1459
1460    // now check for the first case i.e. "Pan American" == "Pan Am"
1461    if (tokensShortAnnot.size() == 2)  {
1462
1463      token21 = (String)
1464                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1465      token22 = (String)
1466                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1467
1468      if (token11.startsWith(token21)
1469          && token12.startsWith(token22))
1470        return true;
1471
1472    } // if (tokensShortAnnot.countTokens() == 2)
1473
1474    // now the second case e.g.  "Communications Satellite" == "ComSat"
1475    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1476
1477      // split the token into possible contractions
1478      // ignore case for matching
1479      for (int i=2;i<s2.length();i++) {
1480        token21=s2.substring(0,i+1);
1481        token22=s2.substring(i+1);
1482
1483        if (token11.startsWith(token21)
1484            && token12.startsWith(token22))
1485          return true;
1486      }// for
1487    } // else if
1488
1489    return false;
1490  }//matchRule11
1491
1492  /**
1493    * RULE #12: do the first and last tokens of one name
1494    * match the first and last tokens of the other?
1495    * Condition(s): case-sensitive match
1496    * Applied to: organisation annotations only
1497    */
1498  public boolean matchRule12(String s1,
1499            String s2) {
1500
1501    // first do the easy case e.g. "Pan American" == "Pan Am"
1502
1503    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1504//     Out.prln("Rule 12");
1505
1506      // get first and last tokens of s1 & s2
1507      String s1_first = (String)
1508                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1509      String s2_first = (String)
1510                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1511
1512      if (!matchRule1(s1_first,s2_first,caseSensitive))
1513        return false;
1514
1515      String s1_last = (String)
1516         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1517      String s2_last = (String)
1518         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1519
1520      return matchRule1(s1_last,s2_last,caseSensitive);
1521    } // if (tokensLongAnnot.countTokens()>1
1522    return false;
1523  }//matchRule12
1524
1525  /**
1526    * RULE #13: do multi-word names match except for
1527    * one token e.g.
1528    * "Second Force Recon Company" == "Force Recon Company"
1529    * Note that this rule has NOT been used in LaSIE's 1.5
1530    * namematcher
1531    * Restrictions: - remove cdg first
1532    *               - shortest name should be 2 words or more
1533    *               - if N is the number of tokens of the longest
1534    *                 name, then N-1 tokens should be matched
1535    * Condition(s): case-sensitive match
1536    * Applied to: organisation or person annotations only
1537    */
1538  public boolean matchRule13(String s1,
1539            String s2) {
1540
1541
1542    String token1 = null;
1543    String token2 = null;
1544
1545    int matched_tokens = 0, mismatches = 0;;
1546
1547    // if names < 2 words then rule is invalid
1548    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1549
1550//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1551//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1552//      Out.prln("with tokens " + tokensShortAnnot);
1553//    }
1554
1555    // now do the matching
1556    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1557
1558//      Out.prln("i = " + i);
1559//      Out.prln("j = " + j);
1560      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1561           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1562        matched_tokens++;
1563        j++;
1564      } else
1565        mismatches++;
1566    } // for
1567
1568    if (matched_tokens >= tokensLongAnnot.size()-1)
1569      return true;
1570
1571    return false;
1572  }//matchRule13
1573
1574  /**
1575    * RULE #14: if the last token of one name
1576    * matches the second name
1577    * e.g. "Hamish Cunningham" == "Cunningham"
1578    * Condition(s): case-insensitive match
1579    * Applied to: all person annotations
1580    */
1581  public boolean matchRule14(String s1,
1582           String s2) {
1583
1584//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1585//      Out.prln("Rule 14 " + s1 + " and " + s2);
1586    String s1_short = (String)
1587                      ((Annotation) tokensLongAnnot.get(
1588                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1589//    Out.prln("Converted to " + s1_short);
1590    if (tokensLongAnnot.size()>1)
1591      return matchRule1(s1_short,
1592                      s2,
1593                      caseSensitive);
1594
1595    return false;
1596
1597  }//matchRule14
1598
1599  /**
1600    * RULE #15: does one token from a Person name appear as the other token
1601    * Note that this rule has NOT been used in LaSIE's 1.5
1602    * namematcher; added for ACE by Di's request
1603    */
1604  public boolean matchRule15(String s1,
1605            String s2) {
1606
1607    int matched_tokens = 0;
1608
1609    // if names < 2 words then rule is invalid
1610
1611//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1612//      Out.prln("Rule 15:" );
1613//      Out.prln("with tokens " + tokensShortAnnot);
1614//    }
1615
1616    // now do the matching
1617    Annotation token1, token2;
1618    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1619      token1 = (Annotation) tokensShortAnnot.get(i);
1620      //first check if not punctuation, because we need to skip it
1621      if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1622        continue;
1623
1624      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1625//      Out.prln("i = " + i);
1626        token2 = (Annotation) tokensLongAnnot.get(j);
1627        if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1628          continue;
1629        if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1630             token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1631          matched_tokens++;
1632      }//for
1633    } // for
1634
1635    //19 February 2002: kalina
1636    //was originally > 0 (i.e., any match is good)
1637    //ensure that we've matched all the tokens in the short annotation
1638    //the reason for that is, because otherwise we match
1639    //Patrick Viera and Patrick Somebody - not good!
1640    if (matched_tokens == tokensShortAnnot.size())
1641      return true;
1642
1643    return false;
1644  }//matchRule15
1645
1646  /** Tables for namematch info
1647    * (used by the namematch rules)
1648    */
1649  private void buildTables(AnnotationSet nameAllAnnots) {
1650
1651    //reset the tables first
1652    cdg.clear();
1653
1654    if (! extLists) {
1655    // i.e. get cdg from Lookup annotations
1656      // get all Lookup annotations
1657      tempMap.clear();
1658      tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1659      //now get all lookup annotations which are cdg
1660      AnnotationSet nameAnnots =
1661        nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1662
1663      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1664        return;
1665
1666      Iterator iter = nameAnnots.iterator();
1667      while (iter.hasNext()) {
1668         Annotation annot = (Annotation)iter.next();
1669         // get the actual string
1670         Long offsetStartAnnot = annot.getStartNode().getOffset();
1671         Long offsetEndAnnot = annot.getEndNode().getOffset();
1672         try {
1673           gate.Document doc = nameAllAnnots.getDocument();
1674           String annotString =
1675                            doc.getContent().getContent(
1676                            offsetStartAnnot,offsetEndAnnot
1677                            ).toString();
1678                cdg.add(annotString);
1679         } catch (InvalidOffsetException ioe) {
1680             ioe.printStackTrace(Err.getPrintWriter());
1681         }
1682      }// while
1683    }//if
1684  }//buildTables
1685
1686  /** substitute all multiple spaces, tabes and newlines
1687    * with a single space
1688    */
1689  public String regularExpressions ( String text, String replacement,
1690                                      String regEx) {
1691    String result = text;
1692    try {
1693      RE re = new RE(regEx);
1694      result = re.substituteAll( text,replacement);
1695    } catch (REException ree) {ree.printStackTrace();}
1696    return result;
1697  }//regularExpressions
1698
1699
1700  private static class Class1 {
1701  }
1702} // public class OrthoMatcher
1703
1704