|
XmlDocumentHandler |
|
/*
 *  XmlDocumentHandler.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU,  9 May 2000
 *
 *  $Id: XmlDocumentHandler.java,v 1.39 2002/01/28 14:22:32 nasso Exp $
 */

package gate.xml;

import java.util.*;

import gate.corpora.*;
import gate.util.*;
import gate.*;
import gate.event.*;


import org.xml.sax.*;
import org.xml.sax.helpers.*;


/**
  * Implements the behaviour of the XML reader.
  * Methods of an object of this class are called by the SAX parser when
  * events appear.
  * The idea is to parse the XML document and construct Gate annotation
  * objects.
  * This class will also replace the content of the Gate document with a
  * new one containing only the text from the XML document.
  */
public class XmlDocumentHandler extends XmlPositionCorrectionHandler {
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Repositioning information filled in while text is read (may be null) */
  private RepositioningInfo reposInfo = null;

  /** Repositioning information for ampersand coding (may be null) */
  private RepositioningInfo ampCodingInfo = null;

  /** Set the repositioning information structure reference. If you set this
    * reference to <B>null</B> no information will be collected.
    */
  public void setRepositioningInfo(RepositioningInfo info) {
    reposInfo = info;
  } // setRepositioningInfo

  /** Return the current RepositioningInfo object */
  public RepositioningInfo getRepositioningInfo() {
    return reposInfo;
  } // getRepositioningInfo

  /** Set the repositioning information structure reference for ampersand
    * coding. If you set this reference to <B>null</B> the information will
    * not be used.
    */
  public void setAmpCodingInfo(RepositioningInfo info) {
    ampCodingInfo = info;
  } // setAmpCodingInfo

  /** Return the current RepositioningInfo object for ampersand coding. */
  public RepositioningInfo getAmpCodingInfo() {
    return ampCodingInfo;
  } // getAmpCodingInfo

  /**
    * Constructs a XmlDocumentHandler object. No annotation set is supplied,
    * so {@link #endDocument()} will fetch one from the gate document.
    * @param aDocument the Gate document that will be processed.
    * @param aMarkupElementsMap this map contains the element names that we
    * want to create.
    * @param anElement2StringMap this map contains the strings that will be
    * added to the text contained by the key element.
    */
  public XmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap,
                            Map anElement2StringMap){
    this(aDocument, aMarkupElementsMap, anElement2StringMap, null);
  } // XmlDocumentHandler

  /**
    * Constructs a XmlDocumentHandler object.
    * @param aDocument the Gate document that will be processed.
    * @param aMarkupElementsMap this map contains the element names that we
    * want to create. If it is null, every element is turned into an
    * annotation.
    * @param anElement2StringMap this map contains the strings that will be
    * added to the text contained by the key element. If it is null no string
    * is added.
    * @param anAnnotationSet is the annotation set that will be filled when
    * the document has been processed. May be null.
    */
  public XmlDocumentHandler(gate.Document       aDocument,
                            Map                 aMarkupElementsMap,
                            Map                 anElement2StringMap,
                            gate.AnnotationSet  anAnnotationSet){
    // init parent
    super();
    // init stack
    stack = new java.util.Stack();

    // this buffer accumulates the plain text (the text without markup);
    // it is pre-sized to the size of the current document content
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector is used later to transform all custom objects into annotation
    // objects
    colector = new LinkedList();

    // the Gate document
    doc = aDocument;

    // see the field comments for the meaning of a null map
    markupElementsMap = aMarkupElementsMap;
    element2StringMap = anElement2StringMap;

    basicAS = anAnnotationSet;
    customObjectsId = 0;
  }// XmlDocumentHandler()

  /**
    * This method is called when the SAX parser encounters the beginning of
    * the XML document.
    */
  public void startDocument() throws org.xml.sax.SAXException {
    // init of variables in the parent
    super.startDocument();
  }

  /**
    * This method is called when the SAX parser encounters the end of the
    * XML document.
    * Here we set the content of the gate Document to be the one generated
    * inside this class (tmpDocContent).
    * After that we use the colector to generate all the annotations referring
    * to this new gate document.
    *
    * @throws org.xml.sax.SAXException (a GateSaxException) if two collected
    * objects share the same id.
    */
  public void endDocument() throws org.xml.sax.SAXException {

    // replace the document content with the one without markups
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // fire the status listener
    fireStatusChangedEvent("Total elements: " + elements);

    // If basicAS is null then get the original-markups AnnotationSet
    // from the gate document.
    if (basicAS == null) {
      basicAS = doc.getAnnotations(
                              GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // sort colector ascending on its id
    Collections.sort(colector);
    Set seenIds = new HashSet();
    // create all the annotations (on this new document) from the colector
    while (!colector.isEmpty()){
      CustomObject obj = (CustomObject) colector.getFirst();
      // Test to see if there are two annotation objects with the same id.
      if (seenIds.contains(obj.getId())){
        throw new GateSaxException("Found two annotations with the same Id("+
        obj.getId()+
        ").The document is inconsistent.");
      }
      seenIds.add(obj.getId());

      // create a new annotation and add it to the annotation set
      try{
        if (markupElementsMap == null) {
          // no filtering: the element name is the annotation type
          basicAS.add(obj.getId(),
                      obj.getStart(),
                      obj.getEnd(),
                      obj.getElemName(),
                      obj.getFM());
        } else {
          // get the type of the annotation from the map; elements that are
          // not present in the map produce no annotation
          String annotationType = (String)
                                markupElementsMap.get(obj.getElemName());
          if (annotationType != null) {
            basicAS.add(obj.getId(),
                        obj.getStart(),
                        obj.getEnd(),
                        annotationType,
                        obj.getFM());
          }
        }// End if
      }catch (gate.util.InvalidOffsetException e){
        Err.prln("InvalidOffsetException for annot :" + obj.getElemName() +
         " with Id =" + obj.getId() + ". Discarded...");
      }// End try
      // obj is the head of the list, so this removes exactly obj
      colector.removeFirst();
    }// End while
  }// endDocument();

  /**
    * This method is called when the SAX parser encounters the beginning of an
    * XML element. A CustomObject whose start and end are both the current
    * length of the extracted text is pushed onto the stack.
    * <p>
    * Attributes in the GATE namespace (Gate.URI) are interpreted instead of
    * being copied: <code>gateId</code> gives the object's id,
    * <code>annotMaxId</code> sets the id counter and <code>matches</code>
    * (a ';' separated list of integers) is stored as a List of Integers.
    * All other attributes are stored as features under their local name.
    */
  public void startElement (String uri, String qName, String elemName,
                                                             Attributes atts){
    // Inform the progress listener to fire only if no of elements processed
    // so far is a multiple of ELEMENTS_RATE
    if ((++elements % ELEMENTS_RATE) == 0) {
      fireStatusChangedEvent("Processed elements : " + elements);
    }

    Integer customObjectId = null;
    // Construct a feature map from the list of attributes
    FeatureMap fm = Factory.newFeatureMap();
    for (int i = 0; i < atts.getLength(); i++) {
      String attName  = atts.getLocalName(i);
      String attValue = atts.getValue(i);
      String attUri   = atts.getURI(i);
      if (attUri != null && Gate.URI.equals(attUri)){
        // The three names are mutually exclusive; any other attribute in
        // the GATE namespace is ignored.
        if ("gateId".equals(attName)){
          customObjectId = new Integer(attValue);
        } else if ("annotMaxId".equals(attName)){
          customObjectsId = Integer.parseInt(attValue);
        } else if ("matches".equals(attName)){
          StringTokenizer strTokenizer = new StringTokenizer(attValue, ";");
          List list = new ArrayList();
          // Take all tokens, create Integers and add them to the list
          while (strTokenizer.hasMoreTokens()){
            list.add(new Integer(strTokenizer.nextToken()));
          }// End while
          fm.put(attName, list);
        }// End if
      }else{
        fm.put(attName, attValue);
      }// End if
    }// End for

    // create the START index of the annotation
    Long startIndex = new Long(tmpDocContent.length());

    // initially the Start index is equal to the End index
    CustomObject obj = new CustomObject(customObjectId, elemName, fm,
                                                    startIndex, startIndex);

    // put this object into the stack
    stack.push(obj);
  }// startElement();

  /**
    * This method is called when the SAX parser encounters the end of an
    * XML element.
    * Here we pop the matching CustomObject from the stack, mark it with the
    * feature <code>isEmptyAndSpan</code> if its start equals its end, add it
    * to the colector and, if the element name is a key of the
    * element2String map, append the mapped string to the document content.
    *
    * @throws SAXException (a GateSaxException) if no element is open, i.e.
    * the stack is empty.
    */
  public void endElement (String uri, String qName, String elemName )
                                                         throws SAXException{
    // An end tag without an open element previously led to a
    // NullPointerException; report the inconsistency explicitly instead.
    if (stack.isEmpty()){
      throw new GateSaxException("End of element \"" + elemName +
                         "\" was reported but no element is currently open.");
    }// End if

    // extract the custom object and delete it from the stack
    CustomObject obj = (CustomObject) stack.pop();

    // Before adding it to the colector, we need to check if it is an
    // emptyAndSpan one: the element had an end tag and its start was equal
    // to its end.
    if (obj.getStart().equals(obj.getEnd())){
      obj.getFM().put("isEmptyAndSpan", "true");
    }// End if

    // Put the object into colector.
    // Later, when the document ends we will use colector to create all the
    // annotations
    colector.add(obj);

    // if the element is found in the element2String map, then add the
    // string to the end of the document content
    if (element2StringMap != null){
      String stringFromMap = (String) element2StringMap.get(elemName);
      if (stringFromMap != null) {
        tmpDocContent.append(stringFromMap);
      }
    }// End if
  }// endElement();

  /**
    * This method is called when the SAX parser encounters text in the XML
    * doc. Here we calculate the end indices for all the elements present
    * inside the stack and update them with the new values. For entities,
    * this method is called separately regardless of the text surrounding the
    * entity.
    */
  public void characters( char[] text,int start,int length)
                                                         throws SAXException{
    // correction of real offset. Doesn't affect other data.
    super.characters(text, start, length);
    // create a string object based on the reported text
    String content = new String(text, start, length);
    StringBuffer contentBuffer = new StringBuffer("");

    // Appending the text directly could glue two tokens into a new word;
    // a single space is inserted when that would happen.
    boolean incrementStartIndex = needsSeparatingSpace(content);
    if (incrementStartIndex) {
      contentBuffer.append(" ");
    }

    // put the repositioning information
    if (reposInfo != null) {
      recordPositionInfo(text, start, length, content,
                         tmpDocContent.length() + contentBuffer.length());
    }// End if

    // update the document content
    contentBuffer.append(content);
    // calculate the End index for all the elements of the stack
    // the expression is : End index = Current doc length + text length
    Long end = new Long(tmpDocContent.length() + contentBuffer.length());

    // Iterate through the stack to modify the End index of the open elements
    java.util.Iterator anIterator = stack.iterator();
    while (anIterator.hasNext()){
      CustomObject obj = (CustomObject) anIterator.next();
      // an element that has no text yet must start after the inserted space
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
        obj.setStart(new Long(obj.getStart().longValue() + 1));
      }// End if
      // sets its End index
      obj.setEnd(end);
    }// End while

    tmpDocContent.append(contentBuffer.toString());
  }// characters();

  /**
    * Decides whether a " " has to be inserted between the text collected so
    * far and <code>content</code>.
    * A space is needed when neither the last collected char nor the first
    * char of content is whitespace, because the concatenation would result
    * in a new, different word and the tokenizer, gazetteer and Jape work on
    * the raw text. The most common internal entities (&amp;, &lt;, &gt;,
    * quote and apostrophe) are excepted from this rule, either as a
    * one-char content or as the last collected char.
    */
  private boolean needsSeparatingSpace(String content){
    int collectedSize = tmpDocContent.length();
    if (collectedSize == 0 || content.length() == 0) return false;

    char firstNew = content.charAt(0);
    char lastCollected = tmpDocContent.charAt(collectedSize - 1);
    if (Character.isWhitespace(firstNew) ||
        Character.isWhitespace(lastCollected)) return false;

    // Testing the length against 1 makes it more likely that an internal
    // entity was reported. characters() gets called for each entity
    // separately.
    if (content.length() == 1 && isEntityChar(firstNew)) return false;
    if (isEntityChar(lastCollected)) return false;

    return true;
  }// needsSeparatingSpace()

  /** Tells if the char is one of: &amp; &lt; &gt; " ' */
  private static boolean isEntityChar(char c){
    return c == '&' || c == '<' || c == '>' || c == '"' || c == '\'';
  }// isEntityChar()

  /**
    * Adds to reposInfo the position information for the text just reported
    * to characters(). Must be called only when reposInfo is not null.
    * @param extractedPosition the position the text will have inside the
    * extracted content (including a possibly inserted space).
    */
  private void recordPositionInfo(char[] text, int start, int length,
                                  String content, int extractedPosition){
    if (!(start == 0 && length == 1 && text.length <= 2)) {
      // normal piece of text
      reposInfo.addPositionInfo(getRealOffset(), content.length(),
                                extractedPosition, content.length());
      return;
    }// End if

    // unicode char or &xxx; coding
    // The offset reported from the parser is 0.
    // The real offset should be found in the ampCodingInfo structure.
    // A null ampCodingInfo means the information is not used (see
    // setAmpCodingInfo); previously this caused a NullPointerException.
    if (ampCodingInfo == null) return;

    long lastPosition = 0;
    RepositioningInfo.PositionInfo pi;

    if (reposInfo.size() > 0) {
      pi = (RepositioningInfo.PositionInfo)
                                          reposInfo.get(reposInfo.size() - 1);
      lastPosition = pi.getOriginalPosition();
    } // if

    // use the first amp coding entry located after the last recorded one
    for (int i = 0; i < ampCodingInfo.size(); ++i) {
      pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
      if (pi.getOriginalPosition() > lastPosition) {
        // found
        reposInfo.addPositionInfo(pi.getOriginalPosition(),
                                  pi.getOriginalLength(),
                                  extractedPosition,
                                  content.length());
        break;
      } // if
    } // for
  }// recordPositionInfo()

  /**
    * This method is called when the SAX parser encounters white spaces.
    * The white space is appended to the document content, except when the
    * content is still empty or when a single "\n" follows a "\n".
    */
  public void ignorableWhitespace(char ch[],int start,int length) throws
                                                                 SAXException{

    // internal String object
    String text = new String(ch, start, length);
    // if the last character in tmpDocContent is \n and the read whitespace
    // is \n then don't add it to tmpDocContent...
    if (tmpDocContent.length() != 0) {
      boolean lastIsNewLine =
                    tmpDocContent.charAt(tmpDocContent.length() - 1) == '\n';
      if (!lastIsNewLine || !text.equals("\n")) {
        tmpDocContent.append(text);
      }
    }// End if
  }

  /**
    * Error method. We deal with this exception inside SimpleErrorHandler
    * class
    */
  public void error(SAXParseException ex) throws SAXException {
    // deal with a SAXParseException
    // see SimpleErrorHandler class
    _seh.error(ex);
  }

  /**
    * FatalError method. Delegated to the SimpleErrorHandler.
    */
  public void fatalError(SAXParseException ex) throws SAXException {
    // deal with a SAXParseException
    // see SimpleErrorHandler class
    _seh.fatalError(ex);
  }

  /**
    * Warning method. Delegated to the SimpleErrorHandler.
    */
  public void warning(SAXParseException ex) throws SAXException {
    // deal with a SAXParseException
    // see SimpleErrorHandler class
    _seh.warning(ex);
  }

  /**
    * This method is called when the SAX parser encounters a comment.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener. Currently it does nothing.
    */
  public void comment(String text) throws SAXException {
    // create a FeatureMap and then add the comment to the annotation set.
    /*
    gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl();
    fm.put ("text_comment",text);
    Long node = new Long (tmpDocContent.length());
    CustomObject anObject = new CustomObject("Comment",fm,node,node);
    colector.add(anObject);
    */
  }

  /**
    * This method is called when the SAX parser encounters the start of a
    * CDATA section.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener. Currently it does nothing.
    */
  public void startCDATA()throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters the end of a CDATA
    * section.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener. Currently it does nothing.
    */
  public void endCDATA() throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters a parsed Entity.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener. Currently it does nothing.
    */
  public void startParsedEntity(String name) throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters a parsed entity
    * and informs the application if that entity was parsed or not.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener. Currently it does nothing.
    */
  public void endParsedEntity(String name, boolean included)
                                                         throws SAXException{
  }

  //StatusReporter Implementation

  /**
    * This method is called when a listener is registered with this class
    */
  public void addStatusListener(StatusListener listener){
    myStatusListeners.add(listener);
  }
  /**
    * This method is called when a listener is removed
    */
  public void removeStatusListener(StatusListener listener){
    myStatusListeners.remove(listener);
  }
  /**
    * This method is called whenever we need to inform the listeners about
    * an event.
    */
  protected void fireStatusChangedEvent(String text){
    Iterator listenersIter = myStatusListeners.iterator();
    while(listenersIter.hasNext())
      ((StatusListener)listenersIter.next()).statusChanged(text);
  }

  /** This method is a workaround of the java 4 non namespace supporting
    * parser. It receives a qualified name and returns its local name.
    * For eg. if it receives gate:gateId it will return gateId
    */
  private String getMyLocalName(String aQName){
    if (aQName == null) return "";
    StringTokenizer strToken = new StringTokenizer(aQName, ":");
    if (strToken.countTokens() <= 1) return aQName;
    // The nr of tokens is >= 2
    // Skip the first token, which is the prefix of the QName
    strToken.nextToken();
    return strToken.nextToken();
  }//getMyLocalName()

  /** Also a workaround for URI identifier. If the prefix of the QName is
    * gate (ignoring case) it will return GATE's URI. Otherwise it will
    * return the empty string.
    */
  private String getMyURI(String aQName){
    if (aQName == null) return "";
    StringTokenizer strToken = new StringTokenizer(aQName, ":");
    if (strToken.countTokens() <= 1) return "";
    // If first token is "gate" then return GATE's URI
    if ("gate".equalsIgnoreCase(strToken.nextToken()))
      return Gate.URI;
    return "";
  }// getMyURI()

  // XmlDocumentHandler member data

  // this constant indicates when to fire the status listener
  // this listener will add an overhead and we don't want a big overhead
  // this listener will be called from ELEMENTS_RATE to ELEMENTS_RATE
  final static int ELEMENTS_RATE = 128;

  // this map contains the element names that we want to create
  // if it's null all the elements from the XML documents will be transformed
  // into Gate annotation objects otherwise only the elements it contains will
  // be transformed
  private Map markupElementsMap = null;

  // this map contains the string that we want to insert inside the document
  // content, when a certain element is found
  // if the map is null then no string is added
  private Map element2StringMap = null;

  /**This object indicates what to do when the parser encounters an error*/
  private SimpleErrorHandler _seh = new SimpleErrorHandler();

  /**The content of the XML document, without any tag for internal use*/
  private StringBuffer tmpDocContent = null;

  /**A stack used to remember elements and to keep the order */
  private java.util.Stack stack = null;

  /**A gate document */
  private gate.Document doc = null;

  /**An annotation set used for creating annotations referring to the doc */
  private gate.AnnotationSet basicAS = null;

  /**Listeners for status report */
  protected List myStatusListeners = new LinkedList();

  /**This reports the number of elements that have been processed so far*/
  private int elements = 0;

  /** We need a collection to retain all the CustomObjects that will be
    * transformed into annotations over the gate document...
    * the transformation will take place inside the endDocument() method
    */
  private LinkedList colector = null;

  /** This is used to generate unique Ids for the CustomObjects read*/
  protected int customObjectsId = 0;

  /** Accessor method for the customObjectsId field*/
  public int getCustomObjectsId(){ return customObjectsId;}

  //////// INNER CLASS
  /**
    * The objects belonging to this class are used inside the stack.
    * This class is for internal needs. It is not static because creating
    * an instance reads and updates the customObjectsId counter of the
    * enclosing handler.
    */
  class  CustomObject implements Comparable {

    /**
      * Creates a custom object.
      * @param anId the id of the object. If it is null the next value of
      * the enclosing handler's customObjectsId counter is used; otherwise
      * the counter is moved past anId when it is not already greater.
      */
    public CustomObject(Integer anId,String anElemName, FeatureMap aFm,
                           Long aStart, Long anEnd) {
      elemName = anElemName;
      fm = aFm;
      start = aStart;
      end = anEnd;
      if (anId == null){
        id = new Integer(customObjectsId ++);
      }else{
        id = anId;
        if (customObjectsId <= anId.intValue())
          customObjectsId = anId.intValue() + 1 ;
      }// End if
    }// End CustomObject()

    // Method implemented as required by Comparable interface:
    // objects are ordered by their id
    public int compareTo(Object o){
      CustomObject obj = (CustomObject) o;
      return this.id.compareTo(obj.getId());
    }// compareTo();

    // accessors
    public String getElemName() {
      return elemName;
    }// getElemName()

    public FeatureMap getFM() {
      return fm;
    }// getFM()

    public Long getStart() {
      return start;
    }// getStart()

    public Long getEnd() {
      return end;
    }// getEnd()

    public Integer getId(){ return id;}

    // mutators
    public void setElemName(String anElemName) {
      elemName = anElemName;
    }// setElemName()

    public void setFM(FeatureMap aFm) {
      fm = aFm;
    }// setFM();

    public void setStart(Long aStart) {
      start = aStart;
    }// setStart();

    public void setEnd(Long anEnd) {
      end = anEnd;
    }// setEnd();

    // data fields
    private String elemName = null;
    private FeatureMap fm = null;
    private Long start = null;
    private Long end  = null;
    private Integer id = null;

  } // End inner class CustomObject

} //XmlDocumentHandler
|
XmlDocumentHandler |
|