|
DefaultGazetteer |
|
/*
 *  DefaultGazeteer.java
 *
 *  Copyright (c) 2000-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June1991.
 *
 *  A copy of this licence is included in the distribution in the file
 *  licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 *  Valentin Tablan, 03/07/2000
 *
 *  $Id: DefaultGazetteer.java,v 1.36 2002/03/13 11:19:36 valyt Exp $
 */

package gate.creole.gazetteer;

import java.io.*;
import java.util.*;
import java.net.*;

import gate.util.*;
import gate.creole.*;
import gate.event.*;
import gate.*;

/** This component is responsible for doing lists lookup. The implementation
 * is based on finite state machines.
 * The phrases to be recognised should be listed in a set of files, one for
 * each type of occurrences.
 * The gazetteer is built with the information from a file that contains the
 * set of lists (which are files as well) and the associated type for each
 * list.
 * The file defining the set of lists should have the following syntax:
 * each list definition should be written on its own line and should contain:
 * <ol>
 * <li>the file name (required) </li>
 * <li>the major type (required) </li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional) </li>
 * </ol>
 * The elements of each definition are separated by &quot;:&quot;.
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code>
 * Each list file named in the lists definition file is just a list containing
 * one entry per line.
 * When this gazetteer is run over some input text (a Gate document) it
 * will generate annotations of type Lookup having the attributes specified in
 * the definition file.
 */
public class DefaultGazetteer extends AbstractLanguageAnalyser
             implements ProcessingResource {

  public static final String
    DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";

  public static final String
    DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";

  public static final String
    DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  /** Builds an empty gazetteer; the lists are only loaded by
   * {@link #init()}.
   */
  public DefaultGazetteer(){
  }

  /** Does the actual loading and parsing of the lists. This method must be
   * called before the gazetteer can be used.
   * The definitions file found at the lists URL is read twice: once to count
   * its lines (used for progress reporting only) and once to load every list
   * it names. A definition line ending in a backslash is joined with the
   * line that follows it.
   *
   * @return this gazetteer
   * @throws ResourceInstantiationException if no lists URL was set, or if
   * reading or parsing the definitions file or any of the lists fails
   */
  public Resource init()throws ResourceInstantiationException{
    fsmStates = new HashSet();
    try{
      initialState = new FSMState(this);
      if(listsURL == null){
        throw new ResourceInstantiationException (
              "No URL provided for gazetteer creation!");
      }

      //find the number of lines
      int linesCnt = 0;
      BufferedReader bReader = openReader(listsURL);
      try{
        while(bReader.readLine() != null) linesCnt++;
      }finally{
        bReader.close();
      }

      //parse the file
      bReader = openReader(listsURL);
      try{
        StringBuffer toParse = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
        int lineIdx = 0;
        String line = bReader.readLine();
        while(line != null){
          if(line.endsWith("\\")){
            //continuation line: drop the backslash and wait for the rest
            toParse.append(line.substring(0, line.length() - 1));
          }else{
            toParse.append(line);
            fireStatusChanged("Reading " + toParse.toString());
            fireProgressChanged(lineIdx * 100 / linesCnt);
            lineIdx ++;
            readList(toParse.toString(), true);
            toParse.delete(0, toParse.length());
          }
          line = bReader.readLine();
        }
      }finally{
        //the original never closed this second reader
        bReader.close();
      }
      fireProcessFinished();
    }catch(IOException ioe){
      throw new ResourceInstantiationException(ioe);
    }catch(GazetteerException ge){
      throw new ResourceInstantiationException(ge);
    }
    return this;
  }

  /** Opens a buffered reader over the given URL using the configured
   * encoding. The underlying stream is closed again if the reader cannot be
   * created (e.g. because the encoding is not supported).
   *
   * @param url the location to read from
   * @return a reader the caller is responsible for closing
   */
  private BufferedReader openReader(URL url) throws IOException {
    InputStream stream = url.openStream();
    boolean wrapped = false;
    try{
      BufferedReader result = new BufferedReader(
                                new InputStreamReader(stream, encoding));
      wrapped = true;
      return result;
    }finally{
      if(!wrapped) stream.close();
    }
  }

  /** Maps a character to the form used on the FSM transitions: any
   * whitespace becomes a plain space and, when the gazetteer is not case
   * sensitive, every other character is upper-cased.
   */
  private char normalise(char c) {
    if(Character.isWhitespace(c)) return ' ';
    return caseSensitive.booleanValue() ? c : Character.toUpperCase(c);
  }

  /** Reads one list (one file) of phrases.
   * The list file name is resolved relative to the lists URL and read with
   * the configured encoding; every line of the file is treated as one phrase.
   *
   * @param listDesc the line from the definition file, in the form
   * <code>file:majorType[:minorType[:languages]]</code>
   * @param add if <b>true</b> will add the phrases found in the list to the
   * ones recognised by this gazetteer, if <b>false</b> the phrases found in
   * the list will be removed from the list of phrases recognised by this
   * gazetteer.
   * @throws GazetteerException if the definition contains no ":" separator
   * @throws IOException if the list file cannot be opened or read
   */
  void readList(String listDesc, boolean add) throws FileNotFoundException,
                                                      IOException,
                                                      GazetteerException{
    int firstColon = listDesc.indexOf(':');
    if(firstColon == -1){
      throw new GazetteerException("Invalid list definition: " + listDesc);
    }
    String listName = listDesc.substring(0, firstColon);
    String majorType;
    String minorType = null;
    String languages = null;

    int secondColon = listDesc.indexOf(':', firstColon + 1);
    if(secondColon == -1){
      majorType = listDesc.substring(firstColon + 1);
    }else{
      majorType = listDesc.substring(firstColon + 1, secondColon);
      int thirdColon = listDesc.indexOf(':', secondColon + 1);
      if(thirdColon == -1){
        minorType = listDesc.substring(secondColon + 1);
      }else{
        minorType = listDesc.substring(secondColon + 1, thirdColon);
        languages = listDesc.substring(thirdColon + 1);
      }
    }

    BufferedReader listReader = openReader(new URL(listsURL, listName));
    try{
      Lookup lookup = new Lookup(majorType, minorType, languages);
      String line = listReader.readLine();
      while(null != line){
        if(add) addLookup(line, lookup);
        else removeLookup(line, lookup);
        line = listReader.readLine();
      }
    }finally{
      //the original leaked this reader
      listReader.close();
    }
  } // void readList(String listDesc)

  /** Adds one phrase to the list of phrases recognised by this gazetteer.
   * The phrase is spelled out as a path through the FSM, creating states
   * where needed. A state reached through a space loops onto itself on
   * space, so a run of whitespace in the input matches a single space in
   * the phrase.
   *
   * @param text the phrase to be added
   * @param lookup the description of the annotation to be added when this
   * phrase is recognised
   */
  public void addLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for(int i = 0; i < text.length(); i++) {
      char currentChar = normalise(text.charAt(i));
      FSMState nextState = currentState.next(currentChar);
      if(nextState == null){
        nextState = new FSMState(this);
        currentState.put(currentChar, nextState);
        //normalise() yields ' ' exactly for whitespace input
        if(currentChar == ' ') nextState.put(' ', nextState);
      }
      currentState = nextState;
    } //for(int i = 0; i< text.length(); i++)

    currentState.addLookup(lookup);
  } // addLookup

  /** Removes one phrase from the list of phrases recognised by this
   * gazetteer. Nothing happens if the phrase is not known.
   * The phrase is normalised in the same way as in
   * {@link #addLookup(String, Lookup)}; without the case folding a phrase
   * added to a case insensitive gazetteer could not be found again.
   *
   * @param text the phrase to be removed
   * @param lookup the description of the annotation associated to this phrase
   */
  public void removeLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for(int i = 0; i < text.length(); i++) {
      FSMState nextState = currentState.next(normalise(text.charAt(i)));
      if(nextState == null) return;//nothing to remove
      currentState = nextState;
    } //for(int i = 0; i< text.length(); i++)
    currentState.removeLookup(lookup);
  } // removeLookup

  /** Returns a string representation of the deterministic FSM graph using
   * GML. All the node entries are written first, followed by the edges of
   * every state.
   */
  public String getFSMgml() {
    StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    StringBuffer edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    Iterator fsmStatesIter = fsmStates.iterator();
    while (fsmStatesIter.hasNext()){
      FSMState currentState = (FSMState)fsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        //final states are labelled with their lookups as well
        nodes.append(",F\\n");
        nodes.append(currentState.getLookupSet());
      }
      nodes.append("\" ]\n");
      edges.append(currentState.getEdgesGML());
    }
    return "graph[ \ndirected 1\n" +
           nodes.toString() + edges.toString() + "]\n";
  } // getFSMgml

  /** Returns the feature map stored on this gazetteer. */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Replaces the feature map stored on this gazetteer. */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures

  /**
   * This method runs the gazetteer. It assumes that all the needed parameters
   * are set. If they are not, an exception will be fired.
   * The document text is walked through the FSM starting at every character
   * position in turn. For each start position the longest recorded match is
   * annotated. A final state is only recorded as a match when the characters
   * just before the start position and just after the current position are
   * not letters.
   *
   * @throws ExecutionException if no document was set;
   * an ExecutionInterruptedException if the run is interrupted
   */
  public void execute() throws ExecutionException{
    interrupted = false;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to process!"
      );
    }

    AnnotationSet annotationSet;
    if(annotationSetName == null || annotationSetName.equals("")){
      annotationSet = document.getAnnotations();
    }else{
      annotationSet = document.getAnnotations(annotationSetName);
    }

    fireStatusChanged("Doing lookup in " + document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();
    FSMState currentState = initialState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    while(charIdx < length) {
      FSMState nextState =
        currentState.next(normalise(content.charAt(charIdx)));
      //true when the walk from matchedRegionStart cannot be continued
      boolean restart;
      if(nextState == null) {
        //the matching stopped
        restart = true;
      } else {
        //go on with the matching
        currentState = nextState;
        //if we have a successful state then store it
        if(currentState.isFinal() &&
           (matchedRegionStart == 0 ||
            !Character.isLetter(content.charAt(matchedRegionStart - 1))) &&
           (charIdx + 1 >= length ||
            !Character.isLetter(content.charAt(charIdx + 1)))
          ){
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx ++;
        //at the end of the text we can't go on either
        restart = (charIdx == length);
      }

      if(restart) {
        //if we had a successful match then act on it
        if(lastMatchingState != null){
          addLookupAnnotations(annotationSet, lastMatchingState,
                               matchedRegionStart, matchedRegionEnd);
          lastMatchingState = null;
        }
        //reset the FSM and restart from the char after the old start
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;
        currentState = initialState;
      }

      if(charIdx - oldCharIdx > 256) {
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException(
            "The execution of the " + getName() +
            " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)

    //Every exit from the loop goes through the restart branch above, which
    //clears lastMatchingState, so no match can be pending at this point.
    fireProcessFinished();
    fireStatusChanged("Lookup complete!");
  } // execute

  /** Adds one annotation for every lookup attached to a matching state.
   * The major type is always stored as a feature; the minor type only when
   * it is set, and the language only when both it and the minor type are
   * set.
   *
   * @param annotationSet the set receiving the new annotations
   * @param matchingState the final state whose lookups are to be used
   * @param regionStart the offset of the first matched character
   * @param regionEnd the offset of the last matched character (inclusive)
   */
  private void addLookupAnnotations(AnnotationSet annotationSet,
                                    FSMState matchingState,
                                    int regionStart, int regionEnd) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while(lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup)lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if(null != currentLookup.minorType) {
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
        if(null != currentLookup.languages)
          fm.put("language", currentLookup.languages);
      }
      try {
        //annotation end offsets are exclusive, hence the + 1
        annotationSet.add(new Long(regionStart),
                          new Long(regionEnd + 1),
                          LOOKUP_ANNOTATION_TYPE,
                          fm);
      } catch(InvalidOffsetException ioe) {
        throw new LuckyException(ioe.toString());
      }
    }//while(lookupIter.hasNext())
  }

  /**
   * Sets the AnnotationSet that will be used at the next run for the newly
   * produced annotations.
   */
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }

  /** The initial state of the FSM that backs this gazetteer
   */
  FSMState initialState;

  /** A set containing all the states of the FSM backing the gazetteer
   */
  Set fsmStates;

  protected FeatureMap features = null;

  /** Used to store the annotation set currently being used for the newly
   * generated annotations
   */
  protected String annotationSetName;

  /** The encoding used for reading the definitions file and the lists. */
  private String encoding = "UTF-8";

  /**
   * The value of this property is the URL that will be used for reading the
   * lists that define this Gazetteer
   */
  private java.net.URL listsURL;

  /**
   * Should this gazetteer be case sensitive. The default value is true.
   */
  private Boolean caseSensitive = Boolean.TRUE;

  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }
  public String getEncoding() {
    return encoding;
  }
  public void setListsURL(java.net.URL newListsURL) {
    listsURL = newListsURL;
  }
  public java.net.URL getListsURL() {
    return listsURL;
  }
  public void setCaseSensitive(Boolean newCaseSensitive) {
    caseSensitive = newCaseSensitive;
  }
  public Boolean getCaseSensitive() {
    return caseSensitive;
  }
  public String getAnnotationSetName() {
    return annotationSetName;
  }

} // DefaultGazetteer

// >>> DAM: TransArray optimization, new charMap implementation
/** An iterator over primitive <code>char</code> values. */
interface Iter
{
    public boolean hasNext();
    public char next();
} // iter class
/**
 * Class implementing a map that uses binary search by char key to retrieve
 * the corresponding object. The keys are held in a sorted array and the
 * values in a parallel array; both arrays grow by one element on every
 * insertion of a new key.
 * According to the note left by its author, this class replaced a Map as the
 * transition function in the FSMState.
 */
class charMap
{
    /** The keys, in ascending order; <code>null</code> until the first put. */
    char[] itemsKeys = null;

    /** The values; the element at index i belongs to the key at index i. */
    Object[] itemsObjs = null;

    /**
     * Resizes the containers by one, leaving an empty element at position
     * 'index'. The elements before 'index' keep their position, the others
     * move up by one.
     */
    void resize(int index)
    {
        int oldsz = itemsKeys.length;
        char[] tempKeys = new char[oldsz + 1];
        Object[] tempObjs = new Object[oldsz + 1];

        // head: [0, index) stays where it was
        System.arraycopy(itemsKeys, 0, tempKeys, 0, index);
        System.arraycopy(itemsObjs, 0, tempObjs, 0, index);
        // tail: [index, oldsz) moves to [index + 1, oldsz + 1)
        System.arraycopy(itemsKeys, index, tempKeys, index + 1, oldsz - index);
        System.arraycopy(itemsObjs, index, tempObjs, index + 1, oldsz - index);

        itemsKeys = tempKeys;
        itemsObjs = tempObjs;
    } // resize

    /**
     * Gets the object from the map using the char key.
     *
     * @return the stored object, or <code>null</code> if the key is unknown
     */
    Object get(char key)
    {
        if (itemsKeys == null) return null;
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index < 0)
            return null;
        return itemsObjs[index];
    }

    /**
     * Puts the object into the char map using the char as the key.
     * A key that is already present keeps its current value: the new value
     * is ignored and the stored one is returned.
     *
     * @return the object stored under the key after the call
     */
    Object put(char key, Object value)
    {
        if (itemsKeys == null)
        {
            itemsKeys = new char[1];
            itemsKeys[0] = key;
            itemsObjs = new Object[1];
            itemsObjs[0] = value;
            return value;
        }// if first time
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index < 0)
        {
            // a negative result encodes the insertion point as its complement
            index = ~index;
            resize(index);
            itemsKeys[index] = key;
            itemsObjs[index] = value;
        }
        return itemsObjs[index];
    } // put

} // class charMap
// >>> DAM, end, new charMap instead MAP for transition function in the FSMState
|
DefaultGazetteer |
|