|
OrthoMatcher |
|
1 /* 2 * OrthoMatcher.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/August/2001 12 * 13 * $Id: OrthoMatcher.java,v 1.42 2002/03/06 17:15:44 kalina Exp $ 14 */ 15 16 17 package gate.creole.orthomatcher; 18 19 import gate.*; 20 import gate.util.*; 21 import gate.creole.*; 22 import gate.corpora.*; 23 import gate.annotation.*; 24 import java.util.*; 25 import java.io.*; 26 import java.net.*; 27 import gnu.regexp.*; 28 29 public class OrthoMatcher extends AbstractLanguageAnalyser 30 implements ANNIEConstants{ 31 32 public static final String 33 OM_DOCUMENT_PARAMETER_NAME = "document"; 34 35 public static final String 36 OM_ANN_SET_PARAMETER_NAME = "annotationSetName"; 37 38 public static final String 39 OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive"; 40 41 public static final String 42 OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes"; 43 44 public static final String 45 OM_ORG_TYPE_PARAMETER_NAME = "organizationType"; 46 47 public static final String 48 OM_PERSON_TYPE_PARAMETER_NAME = "personType"; 49 50 public static final String 51 OM_EXT_LISTS_PARAMETER_NAME = "extLists"; 52 53 protected static final String CDGLISTNAME = "cdg"; 54 protected static final String ALIASLISTNAME = "alias"; 55 protected static final String ARTLISTNAME = "def_art"; 56 protected static final String PREPLISTNAME = "prepos"; 57 protected static final String CONNECTORLISTNAME = "connector"; 58 protected static final String SPURLISTNAME = "spur_match"; 59 60 protected static final String PUNCTUATION_VALUE = "punctuation"; 61 protected static final String THE_VALUE = "The"; 62 63 64 /**the name of the annotation set*/ 65 protected String annotationSetName; 66 67 /** the 
types of the annotation */ 68 protected List annotationTypes = new ArrayList(10); 69 70 /** the organization type*/ 71 protected String organizationType = ORGANIZATION_ANNOTATION_TYPE; 72 73 /** the person type*/ 74 protected String personType = PERSON_ANNOTATION_TYPE; 75 76 protected String unknownType = "Unknown"; 77 78 /** internal or external list */ 79 protected boolean extLists = true; 80 81 protected boolean matchingUnknowns = true; 82 83 /** This is an internal variable to indicate whether 84 * we matched using a rule that requires that 85 * the newly matched annotation matches all the others 86 * This is needed, because organizations can share 87 * first/last tokens like News and be different 88 */ 89 private boolean allMatchingNeeded = false; 90 91 //** Orthomatching is not case-sensitive by default*/ 92 protected boolean caseSensitive = false; 93 94 protected FeatureMap queryFM = Factory.newFeatureMap(); 95 96 // protected ExecutionException executionException; 97 98 // name lookup tables (used for namematch) 99 //gave them bigger default size, coz rehash is expensive 100 protected HashMap alias = new HashMap(100); 101 protected HashSet cdg = new HashSet(50); 102 protected HashMap spur_match = new HashMap(100); 103 protected HashMap def_art = new HashMap(20); 104 protected HashMap connector = new HashMap(20); 105 protected HashMap prepos = new HashMap(30); 106 107 108 protected AnnotationSet nameAllAnnots = null; 109 protected HashMap processedAnnots = new HashMap(150); 110 protected HashMap annots2Remove = new HashMap(75); 111 protected List matchesDocFeature = new ArrayList(); 112 //maps annotation ids to array lists of tokens 113 protected HashMap tokensMap = new HashMap(150); 114 115 protected Annotation shortAnnot, longAnnot; 116 117 protected ArrayList tokensLongAnnot, tokensShortAnnot; 118 119 /** a feature map to be used when retrieving annotations 120 * declared here so can be reused for efficiency 121 * clear() before each use 122 */ 123 
protected FeatureMap tempMap = Factory.newFeatureMap(); 124 125 /** a buffer in order to read an array of char */ 126 private char[] cbuffer = null; 127 128 /** the size of the buffer */ 129 private final static int BUFF_SIZE = 65000; 130 131 /** @link dependency */ 132 /*#OrthoMatcher lnkOrthoMatcher;*/ 133 134 public OrthoMatcher () { 135 annotationTypes.add(organizationType); 136 annotationTypes.add(personType); 137 annotationTypes.add("Location"); 138 annotationTypes.add("Date"); 139 } 140 141 /** Initialise this resource, and return it. */ 142 public Resource init() throws ResourceInstantiationException { 143 cbuffer = new char[BUFF_SIZE]; 144 145 //initialise the list of annotations which we will match 146 try { 147 createLists(); 148 } catch (IOException ioe) {ioe.printStackTrace();} 149 return this; 150 } // init() 151 152 /** Run the resource. It doesn't make sense not to override 153 * this in subclasses so the default implementation signals an 154 * exception. 155 */ 156 public void execute() throws ExecutionException{ 157 158 //check the input 159 if(document == null) { 160 throw new ExecutionException( 161 "No document for namematch!" 162 ); 163 } 164 165 // get the annotations from document 166 if ((annotationSetName == null)|| (annotationSetName.equals(""))) 167 nameAllAnnots = document.getAnnotations(); 168 else 169 nameAllAnnots = document.getAnnotations(annotationSetName); 170 171 //if none found, print warning and exit 172 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) { 173 Out.prln("OrthoMatcher Warning: No annotations found for processing"); 174 return; 175 } 176 177 //check if we've been run on this document before 178 //and clean the doc if needed 179 docCleanup(); 180 Map matchesMap = (Map)document.getFeatures(). 
181 get(DOCUMENT_COREF_FEATURE_NAME); 182 183 // creates the cdg list from the document 184 //no need to create otherwise, coz already done in init() 185 if (!extLists) 186 buildTables(nameAllAnnots); 187 188 //first match all name annotations 189 matchNameAnnotations(); 190 191 //then match the unknown ones to all name ones 192 if (matchingUnknowns) 193 matchUnknown(); 194 195 // set the matches of the document 196 // determineMatchesDocument(); 197 if (! matchesDocFeature.isEmpty()) { 198 if(matchesMap == null){ 199 matchesMap = new HashMap(); 200 } 201 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature); 202 //we need to put it even if it was already present in order to triger 203 //the update events 204 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap); 205 206 //cannot do clear() as this has already been put on the document 207 //so I need a new one for the next run of matcher 208 matchesDocFeature = new ArrayList(); 209 } 210 211 // Out.prln("Processed strings" + processedAnnots.values()); 212 //clean-up the internal data structures for next run 213 nameAllAnnots = null; 214 processedAnnots.clear(); 215 annots2Remove.clear(); 216 tokensMap.clear(); 217 matchesDocFeature = new ArrayList(); 218 longAnnot = null; 219 shortAnnot = null; 220 tokensLongAnnot = null; 221 tokensShortAnnot = null; 222 223 } // run() 224 225 protected void matchNameAnnotations() throws ExecutionException{ 226 // go through all the annotation types 227 Iterator iterAnnotationTypes = annotationTypes.iterator(); 228 while (iterAnnotationTypes.hasNext()) { 229 String annotationType = (String)iterAnnotationTypes.next(); 230 231 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType); 232 233 // continue if no such annotations exist 234 if ((nameAnnots == null) || nameAnnots.isEmpty()) 235 continue; 236 237 Iterator iterNames = nameAnnots.iterator(); 238 while (iterNames.hasNext()) { 239 Annotation nameAnnot = (Annotation) iterNames.next(); 240 Integer id = 
nameAnnot.getId(); 241 242 // get string and value 243 String annotString = null; 244 try { 245 annotString = document.getContent().getContent( 246 nameAnnot.getStartNode().getOffset(), 247 nameAnnot.getEndNode().getOffset() 248 ).toString(); 249 // now do the reg. exp. substitutions 250 annotString = regularExpressions(annotString," ", "\\s+"); 251 252 } catch (InvalidOffsetException ioe) { 253 throw new ExecutionException 254 ("Invalid offset of the annotation"); 255 } 256 //convert to lower case if we are not doing a case sensitive match 257 if (!caseSensitive) 258 annotString = annotString.toLowerCase(); 259 260 //get the tokens 261 List tokens = new ArrayList((Set) 262 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 263 nameAnnot.getStartNode().getOffset(), 264 nameAnnot.getEndNode().getOffset() 265 )); 266 //if no tokens to match, do nothing 267 if (tokens.isEmpty()) 268 continue; 269 Collections.sort(tokens, new gate.util.OffsetComparator()); 270 //check if these actually do not end after the name 271 //needed coz new tokeniser conflates 272 //strings with dashes. So British Gas-style is two tokens 273 //instead of three. 
So cannot match properly British Gas 274 // tokens = checkTokens(tokens); 275 tokensMap.put(nameAnnot.getId(), tokens); 276 277 // Out.prln("Matching annot " + nameAnnot + ": string " + annotString); 278 279 //first check whether we have not matched such a string already 280 //if so, just consider it matched, don't bother calling the rules 281 if (processedAnnots.containsValue(annotString)) { 282 // Out.prln("Contained string found " + annotString); 283 updateMatches(nameAnnot, annotString); 284 processedAnnots.put(nameAnnot.getId(), annotString); 285 continue; 286 } else if (processedAnnots.isEmpty()) { 287 processedAnnots.put(nameAnnot.getId(), annotString); 288 continue; 289 } 290 291 //if a person, then remove their title before matching 292 if (nameAnnot.getType().equals(personType)) 293 annotString = containTitle(annotString, nameAnnot); 294 else if (nameAnnot.getType().equals(organizationType)) 295 annotString = stripCDG(annotString, nameAnnot); 296 297 if(null == annotString || "".equals(annotString)) 298 continue; 299 300 //otherwise try matching with previous annotations 301 matchWithPrevious(nameAnnot, annotString); 302 303 // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString); 304 //finally add the current annotations to the processed map 305 processedAnnots.put(nameAnnot.getId(), annotString); 306 }//while through name annotations 307 308 }//while through annotation types 309 310 } 311 312 protected void matchUnknown() throws ExecutionException { 313 //get all Unknown annotations 314 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType); 315 316 if ((unknownAnnots == null) || unknownAnnots.isEmpty()) 317 return; 318 319 Iterator iter = unknownAnnots.iterator(); 320 //loop through the unknown annots 321 while (iter.hasNext()) { 322 Annotation unknown = (Annotation) iter.next(); 323 324 // get string and value 325 String unknownString = null; 326 try { 327 unknownString = document.getContent().getContent( 328 
unknown.getStartNode().getOffset(), 329 unknown.getEndNode().getOffset() 330 ).toString(); 331 // now do the reg. exp. substitutions 332 unknownString = regularExpressions(unknownString," ", "\\s+"); 333 } catch (InvalidOffsetException ioe) { 334 throw new ExecutionException 335 ("Invalid offset of the annotation"); 336 } 337 //convert to lower case if we are not doing a case sensitive match 338 if (!caseSensitive) 339 unknownString = unknownString.toLowerCase(); 340 341 //get the tokens 342 List tokens = new ArrayList((Set) 343 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 344 unknown.getStartNode().getOffset(), 345 unknown.getEndNode().getOffset() 346 )); 347 if (tokens.isEmpty()) 348 continue; 349 Collections.sort(tokens, new gate.util.OffsetComparator()); 350 tokensMap.put(unknown.getId(), tokens); 351 352 353 //first check whether we have not matched such a string already 354 //if so, just consider it matched, don't bother calling the rules 355 if (processedAnnots.containsValue(unknownString)) { 356 Annotation matchedAnnot = updateMatches(unknown, unknownString); 357 // Out.prln("Matched " + unknown + "with string " + unknownString); 358 // Out.prln("That's same as " + matchedAnnot); 359 if (matchedAnnot.getType().equals(unknownType)) { 360 annots2Remove.put(unknown.getId(), 361 annots2Remove.get(matchedAnnot.getId())); 362 } 363 else 364 annots2Remove.put(unknown.getId(), matchedAnnot.getType()); 365 processedAnnots.put(unknown.getId(), unknownString); 366 unknown.getFeatures().put("NMRule", unknownType); 367 continue; 368 } 369 370 //check if we should do sub-string matching in case it's hyphenated 371 //for example US-led 372 if (tokens.size() == 1 373 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) { 374 if (matchHyphenatedUnknowns(unknown, unknownString, iter)) 375 continue; 376 }//if 377 378 matchWithPrevious(unknown, unknownString); 379 380 } //while though unknowns 381 382 if (! 
annots2Remove.isEmpty()) { 383 Iterator unknownIter = annots2Remove.keySet().iterator(); 384 while (unknownIter.hasNext()) { 385 Integer unknId = (Integer) unknownIter.next(); 386 Annotation unknown = nameAllAnnots.get(unknId); 387 Integer newID = nameAllAnnots.add( 388 unknown.getStartNode(), 389 unknown.getEndNode(), 390 (String) annots2Remove.get(unknId), 391 unknown.getFeatures() 392 ); 393 nameAllAnnots.remove(unknown); 394 395 //change the id in the matches list 396 List mList = (List)unknown.getFeatures(). 397 get(ANNOTATION_COREF_FEATURE_NAME); 398 mList.remove(unknId); 399 mList.add(newID); 400 }//while 401 }//if 402 } 403 404 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, 405 Iterator iter){ 406 boolean matched = false; 407 408 //only take the substring before the hyphen 409 int stringEnd = unknownString.indexOf("-"); 410 unknownString = unknownString.substring(0, stringEnd); 411 //check if we've already matched this string 412 //because only exact match of the substring are considered 413 if (processedAnnots.containsValue(unknownString)) { 414 matched = true; 415 Annotation matchedAnnot = updateMatches(unknown, unknownString); 416 //only do the matching if not a person, because we do not match 417 //those on sub-strings 418 iter.remove(); 419 String newType; 420 if (matchedAnnot.getType().equals(unknownType)) 421 newType = (String)annots2Remove.get(matchedAnnot.getId()); 422 else 423 newType = matchedAnnot.getType(); 424 425 Integer newID = new Integer(-1); 426 try { 427 newID = nameAllAnnots.add( 428 unknown.getStartNode().getOffset(), 429 new Long(unknown.getStartNode().getOffset().longValue() 430 + stringEnd), 431 newType, 432 unknown.getFeatures() 433 ); 434 } catch (InvalidOffsetException ex) { 435 throw new GateRuntimeException(ex.getMessage()); 436 } 437 nameAllAnnots.remove(unknown); 438 439 //change the id in the matches list 440 List mList = (List)unknown.getFeatures(). 
441 get(ANNOTATION_COREF_FEATURE_NAME); 442 mList.remove(unknown.getId()); 443 mList.add(newID); 444 445 } 446 return matched; 447 } 448 449 protected void matchWithPrevious(Annotation nameAnnot, String annotString) { 450 boolean matchedUnknown = false; 451 452 Iterator prevIter = processedAnnots.keySet().iterator(); 453 while (prevIter.hasNext()) { 454 Integer prevId = (Integer) prevIter.next(); 455 Annotation prevAnnot = nameAllAnnots.get(prevId); 456 457 //check if the two are from the same type or the new one is unknown 458 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType()) 459 && ! nameAnnot.getType().equals(unknownType)) 460 ) 461 continue; 462 //do not compare two unknown annotations either 463 //they are only matched to those of known types 464 if ( nameAnnot.getType().equals(unknownType) 465 && prevAnnot.getType().equals(unknownType)) 466 continue; 467 468 //check if we have already matched this annotation to the new one 469 if (matchedAlready(nameAnnot, prevAnnot) ) 470 continue; 471 472 //now changed to a rule, here we just match by gender 473 if (prevAnnot.getType().equals(personType)) { 474 String prevGender = 475 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 476 String nameGender = 477 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 478 if ( prevGender != null 479 && nameGender != null 480 && ( (nameGender.equalsIgnoreCase("female") 481 && 482 prevGender.equalsIgnoreCase("male") 483 ) 484 || 485 (prevGender.equalsIgnoreCase("female") 486 && nameGender.equalsIgnoreCase("male") 487 ) 488 ) 489 ) //if condition 490 continue; //we don't have a match if the two genders are different 491 492 }//if 493 494 //if the two annotations match 495 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) { 496 // Out.prln("Matched " + shortName + "and " + longName); 497 updateMatches(nameAnnot, prevAnnot); 498 //if unknown annotation, we need to change to the new type 499 if 
(nameAnnot.getType().equals(unknownType)) { 500 matchedUnknown = true; 501 if (prevAnnot.getType().equals(unknownType)) 502 annots2Remove.put(nameAnnot.getId(), 503 annots2Remove.get(prevAnnot.getId())); 504 else 505 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType()); 506 //also put an attribute to indicate that 507 nameAnnot.getFeatures().put("NMRule", unknownType); 508 }//if unknown 509 break; //no need to match further 510 }//if annotations matched 511 512 }//while through previous annotations 513 514 if (matchedUnknown) 515 processedAnnots.put(nameAnnot.getId(), annotString); 516 517 518 }//matchWithPrevious 519 520 protected boolean matchAnnotations(Annotation newAnnot, String annotString, 521 Annotation prevAnnot) { 522 //do not match two annotations that overlap 523 if (newAnnot.overlaps(prevAnnot)) 524 return false; 525 526 // find which annotation string of the two is longer 527 // this is useful for some of the matching rules 528 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 529 530 String longName = prevAnnotString; 531 String shortName = annotString; 532 longAnnot = prevAnnot; 533 shortAnnot = newAnnot; 534 535 if (shortName.length()>=longName.length()) { 536 String temp = longName; 537 longName = shortName; 538 shortName = temp; 539 Annotation tempAnn = longAnnot; 540 longAnnot = shortAnnot; 541 shortAnnot = tempAnn; 542 }//if 543 544 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 545 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 546 547 List matchesList = (List) prevAnnot.getFeatures(). 
548 get(ANNOTATION_COREF_FEATURE_NAME); 549 if (matchesList == null || matchesList.isEmpty()) 550 return apply_rules_namematch(prevAnnot.getType(), shortName,longName); 551 552 //if these two match, then let's see if all the other matching one will too 553 //that's needed, because sometimes names can share a token (e.g., first or 554 //last but not be the same 555 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) { 556 /** 557 * Check whether we need to ensure that there is a match with the rest 558 * of the matching annotations, because the rule requires that 559 * transtivity is not assummed. 560 */ 561 if (allMatchingNeeded) { 562 allMatchingNeeded = false; 563 564 List toMatchList = new ArrayList(matchesList); 565 // if (newAnnot.getType().equals(unknownType)) 566 // Out.prln("Matching new " + annotString + " with annots " + toMatchList); 567 toMatchList.remove(prevAnnot.getId()); 568 569 return matchOtherAnnots(toMatchList, newAnnot, annotString); 570 } else 571 return true; 572 } 573 return false; 574 } 575 576 /** This method checkes whether the new annotation matches 577 * all annotations given in the toMatchList (it contains ids) 578 * The idea is that the new annotation needs to match all those, 579 * because assuming transitivity does not always work, when 580 * two different entities share a common token: e.g., BT Cellnet 581 * and BT and British Telecom. 
582 */ 583 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot, 584 String annotString) { 585 586 //if the list is empty, then we're matching all right :-) 587 if (toMatchList.isEmpty()) 588 return true; 589 590 boolean matchedAll = true; 591 int i = 0; 592 593 while (matchedAll && i < toMatchList.size()) { 594 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i)); 595 596 // find which annotation string of the two is longer 597 // this is useful for some of the matching rules 598 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 599 if (prevAnnotString == null) 600 try { 601 prevAnnotString = document.getContent().getContent( 602 prevAnnot.getStartNode().getOffset(), 603 prevAnnot.getEndNode().getOffset() 604 ).toString(); 605 } catch (InvalidOffsetException ioe) { 606 return false; 607 }//try 608 609 610 String longName = prevAnnotString; 611 String shortName = annotString; 612 longAnnot = prevAnnot; 613 shortAnnot = newAnnot; 614 615 if (shortName.length()>=longName.length()) { 616 String temp = longName; 617 longName = shortName; 618 shortName = temp; 619 Annotation tempAnn = longAnnot; 620 longAnnot = shortAnnot; 621 shortAnnot = tempAnn; 622 }//if 623 624 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 625 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 626 627 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName); 628 // if (newAnnot.getType().equals(unknownType)) 629 // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll); 630 631 i++; 632 }//while 633 return matchedAll; 634 } 635 636 637 protected boolean matchedAlready(Annotation annot1, Annotation annot2) { 638 //the two annotations are already matched if the matches list of the first 639 //contains the id of the second 640 List matchesList = (List) annot1.getFeatures(). 
641 get(ANNOTATION_COREF_FEATURE_NAME); 642 if ((matchesList == null) || matchesList.isEmpty()) 643 return false; 644 else if (matchesList.contains(annot2.getId())) 645 return true; 646 return false; 647 } 648 649 protected Annotation updateMatches(Annotation newAnnot, String annotString) { 650 Annotation matchedAnnot = null; 651 Integer id; 652 653 //first find a processed annotation with the same string 654 Iterator iter = processedAnnots.keySet().iterator(); 655 while (iter.hasNext()) { 656 id = (Integer) iter.next(); 657 String oldString = (String) processedAnnots.get(id); 658 if (annotString.equals(oldString)) { 659 matchedAnnot = nameAllAnnots.get(id); 660 break; 661 }//if 662 }//while 663 664 if (matchedAnnot == null) return null; 665 //if the two matching annotations are of different type which is not 666 //unknown, do not match them 667 if (! matchedAnnot.getType().equals(newAnnot.getType()) 668 && !newAnnot.getType().equals(unknownType) ) 669 return matchedAnnot; 670 671 List matchesList = (List) matchedAnnot.getFeatures(). 672 get(ANNOTATION_COREF_FEATURE_NAME); 673 if ((matchesList == null) || matchesList.isEmpty()) { 674 //no previous matches, so need to add 675 if (matchesList == null) { 676 matchesList = new ArrayList(); 677 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, 678 matchesList); 679 matchesDocFeature.add(matchesList); 680 }//if 681 matchesList.add(matchedAnnot.getId()); 682 matchesList.add(newAnnot.getId()); 683 } else { 684 //just add the new annotation 685 matchesList.add(newAnnot.getId()); 686 }//if 687 //add the matches list to the new annotation 688 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 689 return matchedAnnot; 690 } 691 692 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) { 693 694 List matchesList = (List) prevAnnot.getFeatures(). 
695 get(ANNOTATION_COREF_FEATURE_NAME); 696 if ((matchesList == null) || matchesList.isEmpty()) { 697 //no previous matches, so need to add 698 if (matchesList == null) { 699 matchesList = new ArrayList(); 700 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 701 matchesDocFeature.add(matchesList); 702 }//if 703 matchesList.add(prevAnnot.getId()); 704 matchesList.add(newAnnot.getId()); 705 } else { 706 //just add the new annotation 707 matchesList.add(newAnnot.getId()); 708 }//if 709 //add the matches list to the new annotation 710 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 711 //propagate the gender if two persons are matched 712 if (prevAnnot.getType().equals(personType)) { 713 String prevGender = 714 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 715 String newGender = 716 (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 717 boolean unknownPrevGender = isUnknownGender(prevGender); 718 boolean unknownNewGender = isUnknownGender(newGender); 719 if (unknownPrevGender && !unknownNewGender) 720 prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender); 721 else if (unknownNewGender && !unknownPrevGender) 722 newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender); 723 }//if 724 } 725 726 727 protected void docCleanup() { 728 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME); 729 if (matchesValue != null && (matchesValue instanceof Map)) 730 ((Map)matchesValue).remove(nameAllAnnots.getName()); 731 else if (matchesValue != null) { 732 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap()); 733 } 734 735 //get all annotations that have a matches feature 736 HashSet fNames = new HashSet(); 737 fNames.add(ANNOTATION_COREF_FEATURE_NAME); 738 AnnotationSet annots = 739 nameAllAnnots.get(null, fNames); 740 741 // Out.prln("Annots to cleanup" + annots); 742 743 if (annots == null || annots.isEmpty()) 744 return; 745 746 
Iterator iter = annots.iterator(); 747 while (iter.hasNext()) { 748 while (iter.hasNext()) 749 ((Annotation) iter.next()).getFeatures(). 750 remove(ANNOTATION_COREF_FEATURE_NAME); 751 } //while 752 }//cleanup 753 754 /** return a person name without title */ 755 protected String containTitle (String annotString, Annotation annot) 756 throws ExecutionException { 757 // get the offsets 758 Long startAnnot = annot.getStartNode().getOffset(); 759 Long endAnnot = annot.getEndNode().getOffset(); 760 761 // determine "Lookup" annotation set 762 queryFM.clear(); 763 queryFM.put("majorType", "title"); 764 AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot); 765 if (as1 == null || as1.isEmpty()) 766 return annotString; 767 AnnotationSet as = 768 as1.get("Lookup", queryFM); 769 if (as !=null && ! as.isEmpty()) { 770 List titles = new ArrayList((Set)as); 771 Collections.sort(titles, new gate.util.OffsetComparator()); 772 773 Iterator iter = titles.iterator(); 774 while (iter.hasNext()) { 775 Annotation titleAnn = (Annotation)(iter.next()); 776 777 //we've not found a title at the start offset, 778 //there's no point in looking further 779 //coz titles come first 780 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0) 781 return annotString; 782 783 try { 784 // the title from the current annotation 785 String annotTitle = 786 document.getContent().getContent( 787 titleAnn.getStartNode().getOffset(), 788 titleAnn.getEndNode().getOffset() 789 ).toString(); 790 791 // eliminate the title from annotation string and return the result 792 if (annotTitle.length()<annotString.length()) { 793 //remove from the array of tokens, so then we can compare properly 794 //the remaining tokens 795 // Out.prln("Removing title from: " + annot + " with string " + annotString); 796 // Out.prln("Tokens are" + tokensMap.get(annot.getId())); 797 // Out.prln("Title is" + annotTitle); 798 ((ArrayList) tokensMap.get(annot.getId())).remove(0); 799 return annotString.substring( 800 
annotTitle.length()+1,annotString.length()); 801 } 802 } catch (InvalidOffsetException ioe) { 803 throw new ExecutionException 804 ("Invalid offset of the annotation"); 805 }//try 806 }// while 807 }//if 808 return annotString; 809 810 } 811 812 /** return an organization without a designator and starting The*/ 813 protected String stripCDG (String annotString, Annotation annot){ 814 815 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId()); 816 817 //strip starting The first 818 if ( ((String) ((Annotation) tokens.get(0) 819 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) 820 .equalsIgnoreCase(THE_VALUE)) 821 tokens.remove(0); 822 823 //no need to check for cdg if there is only 1 token or less 824 if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1) 825 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) 826 tokens.remove(tokens.size()-1); 827 828 StringBuffer newString = new StringBuffer(50); 829 for (int i = 0; i < tokens.size(); i++){ 830 newString.append((String) ((Annotation) tokens.get(i) 831 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) ); 832 if (i != tokens.size()-1) 833 newString.append(" "); 834 } 835 // Out.prln("Strip CDG returned: " + newString + "for string " + annotString); 836 837 if (caseSensitive) 838 return newString.toString(); 839 840 return newString.toString().toLowerCase(); 841 } 842 843 /* 844 public void check() throws ExecutionException { 845 if (executionException != null) { 846 ExecutionException e = executionException; 847 executionException = null; 848 throw e; 849 } 850 } // check() 851 */ 852 853 /** if ( == false) then reads the names of files in order 854 * to create the lookup tables 855 */ 856 protected void createLists() throws IOException { 857 InputStream inputStream = Files.getGateResourceAsStream( 858 "creole/namematcher/listsNM.def"); 859 InputStreamReader inputStreamReader = new InputStreamReader ( 860 inputStream); 861 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 
862 863 String lineRead = null; 864 while ((lineRead = bufferedReader.readLine()) != null){ 865 int index = lineRead.indexOf(":"); 866 if (index != -1){ 867 String nameFile = lineRead.substring(0,index); 868 String nameList = lineRead.substring(index+1,lineRead.length()); 869 createAnnotList(nameFile,nameList); 870 }// if 871 }//while 872 bufferedReader.close(); 873 inputStreamReader.close(); 874 inputStream.close(); 875 }// createLists() 876 877 /** creates the lookup tables */ 878 protected void createAnnotList(String nameFile,String nameList) 879 throws IOException{ 880 InputStream inputStream = Files.getGateResourceAsStream( 881 "creole/namematcher/"+nameFile); 882 InputStreamReader inputStreamReader = new InputStreamReader ( 883 inputStream); 884 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 885 886 String lineRead = null; 887 while ((lineRead = bufferedReader.readLine()) != null){ 888 if (nameList.compareTo(CDGLISTNAME)==0){ 889 if (caseSensitive) 890 cdg.add(lineRead); 891 else 892 cdg.add(lineRead.toLowerCase()); 893 }// if 894 else { 895 int index = lineRead.indexOf("£"); 896 if (index != -1){ 897 String expr = lineRead.substring(0,index); 898 //if not case-sensitive, we need to downcase all strings 899 if (!caseSensitive) 900 expr = expr.toLowerCase(); 901 String code = lineRead.substring(index+1,lineRead.length()); 902 if (nameList.equals(ALIASLISTNAME)) 903 alias.put(expr, code); 904 else 905 if (nameList.equals(ARTLISTNAME)) 906 def_art.put(expr, code); 907 else 908 if (nameList.equals(PREPLISTNAME)) 909 prepos.put(expr, code); 910 else 911 if (nameList.equals(CONNECTORLISTNAME)) 912 connector.put(expr, code); 913 else 914 if (nameList.equals(SPURLISTNAME)) 915 spur_match.put(expr, code); 916 917 }//if 918 }// else 919 920 }//while 921 }//createAnnotList 922 923 924 /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */ 925 private boolean apply_rules_namematch(String annotationType, String shortName, 926 
String longName) { 927 // first apply rule for spurius matches i.e. rule0 928 if (matchRule0(longName, shortName)) 929 return false; 930 if ( 931 (// rules for all annotations 932 //no longer use rule1, coz I do the check for same string via the 933 //hash table 934 matchRule2(longName, shortName) 935 || 936 matchRule3(longName, shortName) 937 ) // rules for all annotations 938 || 939 (// rules for organisation annotations 940 ( annotationType.equals(organizationType) 941 //ACE addition 942 || annotationType.equals("Facility")) 943 && 944 ( matchRule4(longName, shortName) 945 || 946 matchRule5(longName, shortName) 947 || 948 matchRule6(longName, shortName) 949 || 950 matchRule7(longName, shortName) 951 || 952 // matchRule8(longName, shortName) 953 // || 954 matchRule9(longName, shortName) 955 || 956 matchRule10(longName, shortName) 957 || 958 matchRule11(longName, shortName) 959 || 960 matchRule12(longName, shortName) 961 || 962 matchRule13(shortName, longName) 963 ) 964 )// rules for organisation annotations 965 || 966 (// rules for person annotations 967 ( annotationType.equals(personType)) 968 && 969 ( matchRule4(longName, shortName) 970 || 971 matchRule5(longName, shortName) 972 || 973 matchRule14(longName, shortName) 974 || //kalina: added this, so it matches names when contain more 975 //than one first and one last name 976 matchRule15(longName, shortName) 977 ) 978 )// rules for person annotations 979 ) //if 980 return true; 981 return false; 982 }//apply_rules 983 984 985 /** set the extLists flag */ 986 public void setExtLists(Boolean newExtLists) { 987 extLists = newExtLists.booleanValue(); 988 }//setextLists 989 990 /** set the caseSensitive flag */ 991 public void setCaseSensitive(Boolean newCase) { 992 caseSensitive = newCase.booleanValue(); 993 }//setextLists 994 995 /** set the annotation set name*/ 996 public void setAnnotationSetName(String newAnnotationSetName) { 997 annotationSetName = newAnnotationSetName; 998 }//setAnnotationSetName 999 1000 
/** set the types of the annotations*/
public void setAnnotationTypes(List newType) {
  annotationTypes = newType;
}//setAnnotationTypes

/** set the annotation type for which the organisation rules are applied */
public void setOrganizationType(String newOrganizationType) {
  organizationType = newOrganizationType;
}//setOrganizationType

/** set the annotation type for which the person rules are applied */
public void setPersonType(String newPersonType) {
  personType = newPersonType;
}//setPersonType

/**get the name of the annotation set*/
public String getAnnotationSetName() {
  return annotationSetName;
}//getAnnotationSetName

/** get the types of the annotation*/
public List getAnnotationTypes() {
  return annotationTypes;
}//getAnnotationTypes

/** get the annotation type for which the organisation rules are applied */
public String getOrganizationType() {
  return organizationType;
}

/** get the annotation type for which the person rules are applied */
public String getPersonType() {
  return personType;
}

/** get the extLists flag */
public Boolean getExtLists() {
  return extLists ? Boolean.TRUE : Boolean.FALSE;
}

/** get the caseSensitive flag */
public Boolean getCaseSensitive() {
  return caseSensitive ? Boolean.TRUE : Boolean.FALSE;
}

/*
public List getMatchesDocument() {
  return matchesDocument;
}
*/

/**
 * @param gender a gender value, may be null
 * @return false only if gender is "male" or "female" (ignoring case);
 * true for null and for every other value
 */
protected boolean isUnknownGender(String gender) {
  if (gender == null)
    return true;
  return !(gender.equalsIgnoreCase("male")
           || gender.equalsIgnoreCase("female"));
} //isUnknownGender

/** Returns the value of the given feature of a token annotation. */
private Object tokenFeature(Object token, Object featureName) {
  return ((Annotation) token).getFeatures().get(featureName);
}//tokenFeature

/** Returns the string feature of a token annotation. */
private String tokenText(Object token) {
  return (String) tokenFeature(token, TOKEN_STRING_FEATURE_NAME);
}//tokenText

/** RULE #0: If the two names are listed in table of
  * spurius matches then they do NOT match
  * Condition(s): both names are keys of the spur_match table and are
  * mapped to the same code
  * Applied to: all name annotations
  */
public boolean matchRule0(String s1,
                          String s2) {
  if (spur_match.containsKey(s1)
      && spur_match.containsKey(s2) )
    return
      spur_match.get(s1).toString().equals(spur_match.get(s2).toString());

  return false;
}//matchRule0

/** RULE #1: If the two names are identical then they are the same
  * no longer used as a rule of its own, because I do the check for same
  * string via the hash table of previous annotations; the other rules
  * use it as their string comparison.
  * Condition(s): depend on case
  * Applied to: all name annotations
  *
  * @param matchCase if true the strings must be equal, otherwise they
  * are compared ignoring case
  */
public boolean matchRule1(String s1,
                          String s2,
                          boolean matchCase) {
  boolean matched = matchCase ? s1.equals(s2) : s1.equalsIgnoreCase(s2);
//kalina: do not remove, nice for debug
//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
//      Out.prln("Rule1: Matched " + s1 + "and " + s2);
  return matched;
}//matchRule1


/**
  * RULE #2: if the two names are listed as equivalent in the
  * lookup table (alias) then they match
  * Condition(s): both names are keys of the alias table and are mapped
  * to the same code
  * Applied to: all name annotations
  */
public boolean matchRule2(String s1,
                          String s2) {

  if (alias.containsKey(s1) && alias.containsKey(s2))
    return (alias.get(s1).toString().equals(alias.get(s2).toString()));

  return false;
}//matchRule2

/**
  * RULE #3: adding a possessive at the end
  * of one name causes a match
  * e.g. "Standard and Poor" == "Standard and Poor's"
  * and also "Standard and Poor" == "Standard's"
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: all name annotations
  */
public boolean matchRule3(String s1, //long string
                          String s2) { //short string

  boolean somePossessive = s2.endsWith("'s") || s2.endsWith("'")
                           || s1.endsWith("'s") || s1.endsWith("'");
  if (!somePossessive)
    return false;

  // first case: the long name is the short name plus a possessive
  String s2_poss = s2.endsWith("'s") ? s2.concat("'") : s2.concat("'s");
  if (matchRule1(s1, s2_poss, caseSensitive))
    return true;

  // now check the second case i.e. "Standard and Poor" == "Standard's"
  String token = tokenText(tokensLongAnnot.get(0));
  s2_poss = token.endsWith("'s") ? token.concat("'") : token.concat("'s");
  return matchRule1(s2_poss, s2, caseSensitive);
}//matchRule3

/**
  * RULE #4: Do all tokens other than the punctuation marks
  * , and . match?
  * e.g. "Smith, Jones" == "Smith Jones"
  * Condition(s): token strings are compared with equals()
  * Applied to: organisation and person annotations
  * <p>
  * NOTE(review): punctuation is skipped in the long name only, and the
  * comparison stops as soon as either token list is exhausted, so a short
  * name that is a prefix of the long name also matches - confirm that
  * this is intended.
  */
public boolean matchRule4(String s1,
                          String s2) {

  Iterator longIter = tokensLongAnnot.iterator();
  Iterator shortIter = tokensShortAnnot.iterator();
  while (longIter.hasNext() && shortIter.hasNext()) {
    Annotation longToken = (Annotation) longIter.next();
    String kind = (String) tokenFeature(longToken, TOKEN_KIND_FEATURE_NAME);
    if (kind.equals(PUNCTUATION_VALUE))
      continue;
    if (! tokenFeature(longToken, TOKEN_STRING_FEATURE_NAME).equals(
          tokenFeature(shortIter.next(), TOKEN_STRING_FEATURE_NAME)))
      return false;
  } // while
  return true;
}//matchRule4

/**
  * RULE #5: if the 1st token of one name
  * matches the second name
  * e.g. "Pepsi Cola" == "Pepsi"
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: organisation and person annotations
  */
public boolean matchRule5(String s1,
                          String s2) {

  //do not match numbers by this rule
  if (tokensLongAnnot.size()> 1 &&
      tokenFeature(tokensLongAnnot.get(0), "kind").equals("number"))
    return false;

  //require that when matching person names, the shorter one to be of length 1
  //for the rule to apply. In other words, avoid matching Peter Smith and
  //Peter Kline, because they share a Peter token.
  boolean personInvolved = shortAnnot.getType().equals(personType)
                           || longAnnot.getType().equals(personType);
  if (personInvolved && tokensShortAnnot.size()>1)
    return false;

  if (tokensLongAnnot.size()<=1)
    return false;

  return matchRule1(tokenText(tokensLongAnnot.get(0)), s2, caseSensitive);
}//matchRule5

/**
  * RULE #6: if one name is the acronym of the other
  * e.g. "Imperial Chemical Industries" == "ICI"
  * The acronym is built from the first character of every token of the
  * long name and is tried both plain ("ICI") and dotted ("I.C.I.").
  * Applied to: organisation annotations only
  */
public boolean matchRule6(String s1,
                          String s2) {

  //if the shorter string has a space in it, then it's not an acronym
  if (s2.indexOf(" ") > 0)
    return false;

  StringBuffer acronym_s1 = new StringBuffer("");
  StringBuffer acronymDot_s1 = new StringBuffer("");

  for (int i = 0; i < tokensLongAnnot.size(); i++ ) {
    String toAppend = tokenText(tokensLongAnnot.get(i)).substring(0,1);
    acronym_s1.append(toAppend);
    acronymDot_s1.append(toAppend);
    acronymDot_s1.append(".");
  }

  return matchRule1(acronym_s1.toString(),s2,caseSensitive)
         || matchRule1(acronymDot_s1.toString(),s2,caseSensitive);
}//matchRule6

/**
  * RULE #7: if one of the tokens in one of the
  * names is in the list of separators eg. "&"
  * then check if the token before the separator
  * matches the other name
  * e.g. "R.H. Macy & Co." == "Macy"
  * Only the first separator of the long name is considered; if it is the
  * very first token there is nothing in front of it and the rule does
  * not fire.
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: organisation annotations only
  */
public boolean matchRule7(String s1,
                          String s2) {

  //don't try it unless the second string is just one token
  if (tokensShortAnnot.size() != 1)
    return false;

  String previous_token = null;

  for (int i = 0; i < tokensLongAnnot.size(); i++ ) {
    if (connector.containsKey(
          tokenFeature(tokensLongAnnot.get(i), TOKEN_STRING_FEATURE_NAME))) {
      //a leading separator has no token before it
      if (i > 0)
        previous_token = tokenText(tokensLongAnnot.get(i-1));
      break;
    }
  }

  //now match previous_token with other name
  if (previous_token != null)
    return matchRule1(previous_token,s2,caseSensitive);

  return false;
}//matchRule7

/**
  * This rule is now obsolete, as The and the trailing CDG
  * are stripped before matching.
  * DO NOT CALL!!!
  *
  * RULE #8: if the names match, ignoring The and
  * and trailing company designator (which have already been stripped)
  * e.g. "The Magic Tricks Co." == "Magic Tricks"
  * Applied to: organisation annotations only
  *
  * @return always false, after printing a warning
  */
public boolean matchRule8(String s1,
                          String s2) {
  Out.prln("OrthoMatcher warning: This rule has been discontinued!");
  return false;
}//matchRule8

/**
  * RULE #9: does one of the names match the token
  * just before a trailing company designator
  * in the other name?
  * The company designator has already been chopped off,
  * so the token before it, is in fact the last token
  * e.g. "R.H. Macy Co." == "Macy"
  * Side effect: sets allMatchingNeeded when the rule fires.
  * Applied to: organisation annotations only
  */
public boolean matchRule9(String s1,
                          String s2) {

  //the rule needs a long name of at least two tokens
  if (tokensLongAnnot.size() <= 1)
    return false;

  String s1_short = tokenText(tokensLongAnnot.get(tokensLongAnnot.size()-1));
  boolean matched = matchRule1(s1_short, s2, caseSensitive);
  //we need to make sure all names match, instead of assuming transitivity,
  //to avoid matching BBC News with News then News with ITV News, which
  //by transitivity leads to BBC News matching ITV News which is not what
  //we want
  if (matched)
    allMatchingNeeded = true;
  return matched;
}//matchRule9

/**
  * RULE #10: is one name the reverse of the other
  * reversing around prepositions only?
  * e.g. "Department of Defence" == "Defence Department"
  * The token before and the token after the first preposition of the long
  * name are compared, in reverse order, with the first two tokens of the
  * short name.
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: organisation annotations only
  */
public boolean matchRule10(String s1,
                           String s2) {

  if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2)
    return false;

  // first find the preposition and the token in front of it
  String previous_token = null;
  int prepIndex = -1;
  for (int i = 0; i < tokensLongAnnot.size(); i++) {
    String token = tokenText(tokensLongAnnot.get(i));
    if (prepos.containsKey(token)) {
      prepIndex = i;
      break;
    }//if
    previous_token = token;
  }//for

  // there must be a preposition with a token on either side of it
  if (prepIndex < 0 || previous_token == null
      || prepIndex + 1 >= tokensLongAnnot.size())
    return false;

  // the token AFTER the preposition, not the preposition itself
  String next_token = tokenText(tokensLongAnnot.get(prepIndex + 1));

  String s21 = tokenText(tokensShortAnnot.get(0));
  String s22 = tokenText(tokensShortAnnot.get(1));
  // then compare (in reverse) with the first two tokens of s2
  return matchRule1(next_token, s21, caseSensitive)
         && matchRule1(previous_token, s22, caseSensitive);
}//matchRule10

/**
  * RULE #11: does one name consist of contractions
  * of the first two tokens of the other name?
  * e.g. "Communications Satellite" == "ComSat"
  * and "Pan American" == "Pan Am"
  * Condition(s): case-sensitive prefix match
  * Applied to: organisation annotations only
  */
public boolean matchRule11(String s1,
                           String s2) {

  if (tokensLongAnnot.size() < 2)
    return false;

  // 1st get the first two tokens of s1
  String token11 = tokenText(tokensLongAnnot.get(0));
  String token12 = tokenText(tokensLongAnnot.get(1));

  // first do the easy case e.g. "Pan American" == "Pan Am"
  if (tokensShortAnnot.size() == 2) {
    String token21 = tokenText(tokensShortAnnot.get(0));
    String token22 = tokenText(tokensShortAnnot.get(1));

    return token11.startsWith(token21)
           && token12.startsWith(token22);
  } // if (tokensShortAnnot.size() == 2)

  // now the second case e.g. "Communications Satellite" == "ComSat"
  if (tokensShortAnnot.size()==1 && s2.length()>=3) {
    // split the token into possible contractions; the first part is at
    // least three characters long.
    // NOTE(review): the last split leaves an empty second part, so a short
    // name that is just a prefix of the first long token also matches -
    // confirm that this is intended.
    for (int i=2;i<s2.length();i++) {
      String token21=s2.substring(0,i+1);
      String token22=s2.substring(i+1);

      if (token11.startsWith(token21)
          && token12.startsWith(token22))
        return true;
    }// for
  } // if

  return false;
}//matchRule11

/**
  * RULE #12: do the first and last tokens of one name
  * match the first and last tokens of the other?
  * Both names must have at least two tokens.
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: organisation annotations only
  */
public boolean matchRule12(String s1,
                           String s2) {

  if (tokensLongAnnot.size() <= 1 || tokensShortAnnot.size() <= 1)
    return false;

  // compare the first tokens of s1 & s2
  String s1_first = tokenText(tokensLongAnnot.get(0));
  String s2_first = tokenText(tokensShortAnnot.get(0));
  if (!matchRule1(s1_first,s2_first,caseSensitive))
    return false;

  // compare the last tokens of s1 & s2
  String s1_last = tokenText(tokensLongAnnot.get(tokensLongAnnot.size()-1));
  String s2_last = tokenText(tokensShortAnnot.get(tokensShortAnnot.size()-1));
  return matchRule1(s1_last,s2_last,caseSensitive);
}//matchRule12

/**
  * RULE #13: do multi-word names match except for
  * one token e.g.
  * "Second Force Recon Company" == "Force Recon Company"
  * Note that this rule has NOT been used in LaSIE's 1.5
  * namematcher
  * Restrictions: - remove cdg first
  *               - shortest name should be 2 words or more
  *               - if N is the number of tokens of the longest
  *                 name, then N-1 tokens should be matched
  * Condition(s): case-sensitive match
  * Applied to: organisation annotations only
  * <p>
  * The string parameters are not used; the rule works on the token lists.
  * NOTE(review): on a mismatch the index into the SHORT name is advanced,
  * so the documented example (extra leading token in the long name) does
  * not appear to match - confirm the intended direction.
  */
public boolean matchRule13(String s1,
                           String s2) {

  int matched_tokens = 0, mismatches = 0;

  // if names < 2 words then rule is invalid
  if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;

  // now do the matching; j never runs past the end of the long name
  for (int i=0,j= 0;
       i < tokensShortAnnot.size() && j < tokensLongAnnot.size()
       && mismatches < 2;
       i++) {
    if ( tokenFeature(tokensLongAnnot.get(j), TOKEN_STRING_FEATURE_NAME).equals(
         tokenFeature(tokensShortAnnot.get(i), TOKEN_STRING_FEATURE_NAME)) ) {
      matched_tokens++;
      j++;
    } else
      mismatches++;
  } // for

  return matched_tokens >= tokensLongAnnot.size()-1;
}//matchRule13

/**
  * RULE #14: if the last token of one name
  * matches the second name
  * e.g. "Hamish Cunningham" == "Cunningham"
  * Condition(s): strings are compared according to the caseSensitive flag
  * Applied to: all person annotations
  */
public boolean matchRule14(String s1,
                           String s2) {

  //the rule needs a long name of at least two tokens
  if (tokensLongAnnot.size() <= 1)
    return false;

  String s1_short = tokenText(tokensLongAnnot.get(tokensLongAnnot.size()-1));
  return matchRule1(s1_short, s2, caseSensitive);
}//matchRule14

/**
  * RULE #15: does one token from a Person name appear as the other token
  * Note that this rule has NOT been used in LaSIE's 1.5
  * namematcher; added for ACE by Di's request
  * <p>
  * NOTE(review): the search stops at the first matching token, so at most
  * one match is counted and the rule can only fire when the short name
  * consists of exactly one token - confirm that this is intended.
  */
public boolean matchRule15(String s1,
                           String s2) {

  int matched_tokens = 0;

  // now do the matching
  Annotation token1, token2;
  for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
    token1 = (Annotation) tokensShortAnnot.get(i);
    //first check if not punctuation, because we need to skip it
    if (tokenFeature(token1, TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
      continue;

    for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
      token2 = (Annotation) tokensLongAnnot.get(j);
      if (tokenFeature(token2, TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
        continue;
      if ( tokenFeature(token1, TOKEN_STRING_FEATURE_NAME).equals(
           tokenFeature(token2, TOKEN_STRING_FEATURE_NAME)) )
        matched_tokens++;
    }//for
  } // for

  //19 February 2002: kalina
  //was originally > 0 (i.e., any match is good)
  //ensure that we've matched all the tokens in the short annotation
  //the reason for that is, because otherwise we match
  //Patrick Viera and Patrick Somebody - not good!
  return matched_tokens == tokensShortAnnot.size();
}//matchRule15

/** Tables for namematch info
  * (used by the namematch rules)
  * <p>
  * Clears the cdg table and, unless external lists are in use, refills it
  * with the document text covered by every Lookup annotation whose major
  * type is "cdg".
  * NOTE(review): the strings are added as they appear in the document,
  * without the lower-casing that createAnnotList applies when the matcher
  * is not case sensitive - confirm.
  *
  * @param nameAllAnnots the annotation set searched for Lookup annotations
  */
private void buildTables(AnnotationSet nameAllAnnots) {

  //reset the tables first
  cdg.clear();

  //with external lists the cdg table is not built from the document
  if (extLists)
    return;

  // i.e. get cdg from Lookup annotations
  tempMap.clear();
  tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
  //now get all lookup annotations which are cdg
  AnnotationSet nameAnnots =
    nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);

  if ((nameAnnots ==null) || nameAnnots.isEmpty())
    return;

  Iterator iter = nameAnnots.iterator();
  while (iter.hasNext()) {
    Annotation annot = (Annotation)iter.next();
    // get the actual string
    Long offsetStartAnnot = annot.getStartNode().getOffset();
    Long offsetEndAnnot = annot.getEndNode().getOffset();
    try {
      gate.Document doc = nameAllAnnots.getDocument();
      String annotString =
                      doc.getContent().getContent(
                      offsetStartAnnot,offsetEndAnnot
                      ).toString();
      cdg.add(annotString);
    } catch (InvalidOffsetException ioe) {
      //report the problem and carry on with the next annotation
      ioe.printStackTrace(Err.getPrintWriter());
    }
  }// while
}//buildTables

/** substitute all matches of a regular expression in a text, e.g. all
  * multiple spaces, tabs and newlines with a single space
  *
  * @param text the text to work on
  * @param replacement the string put in place of every match
  * @param regEx the regular expression to look for
  * @return the text after substitution; the unchanged text if the regular
  * expression is rejected with an REException (the stack trace is printed)
  */
public String regularExpressions ( String text, String replacement,
                                    String regEx) {
  String result = text;
  try {
    RE re = new RE(regEx);
    result = re.substituteAll( text,replacement);
  } catch (REException ree) {ree.printStackTrace();}
  return result;
}//regularExpressions


private static class Class1 {
}
} // public class OrthoMatcher
|
OrthoMatcher |
|