|
DocumentImpl |
|
1 /* 2 * DocumentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentImpl.java,v 1.109 2002/02/28 15:08:47 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.gui.*; 27 import gate.event.*; 28 29 /** Represents the commonalities between all sorts of documents. 30 * 31 * <H2>Editing</H2> 32 * 33 * <P> 34 * The DocumentImpl class implements the Document interface. 35 * The DocumentContentImpl class models the textual or audio-visual 36 * materials which are the source and content of Documents. 37 * The AnnotationSetImpl class supplies annotations on Documents. 38 * 39 * <P> 40 * Abbreviations: 41 * 42 * <UL> 43 * <LI> 44 * DC = DocumentContent 45 * <LI> 46 * D = Document 47 * <LI> 48 * AS = AnnotationSet 49 * </UL> 50 * 51 * <P> 52 * We add an edit method to each of these classes; for DC and AS 53 * the methods are package private; D has the public method. 54 * 55 * <PRE> 56 * void edit(Long start, Long end, DocumentContent replacement) 57 * throws InvalidOffsetException; 58 * </PRE> 59 * 60 * <P> 61 * D receives edit requests and forwards them to DC and AS. 62 * On DC, this method makes a change to the content - e.g. replacing 63 * a String range from start to end with replacement. (Deletions 64 * are catered for by having replacement = null.) D then calls 65 * AS.edit on each of its annotation sets. 66 * 67 * <P> 68 * On AS, edit calls replacement.size() (i.e. DC.size()) to 69 * figure out how long the replacement is (0 for null). 
It then 70 * considers annotations that terminate (start or end) in 71 * the altered or deleted range as invalid; annotations that 72 * terminate after the range have their offsets adjusted. 73 * I.e.: 74 * <UL> 75 * <LI> 76 * the nodes that pointed inside the old modified area are invalid now and 77 * will be deleted along with the connected annotations; 78 * <LI> 79 * the nodes that are before the start of the modified area remain 80 * untouched; 81 * <LI> 82 * the nodes that are after the end of the affected area will have the 83 * offset changed according to the formula below. 84 * </UL> 85 * 86 * <P> 87 * A note re. AS and annotations: annotations no longer have 88 * offsets as in the old model, they now have nodes, and nodes 89 * have offsets. 90 * 91 * <P> 92 * To implement AS.edit, we have several indices: 93 * <PRE> 94 * HashMap annotsByStartNode, annotsByEndNode; 95 * </PRE> 96 * which map node ids to annotations; 97 * <PRE> 98 * RBTreeMap nodesByOffset; 99 * </PRE> 100 * which maps offset to Nodes. 101 * 102 * <P> 103 * When we get an edit request, we traverse that part of the 104 * nodesByOffset tree representing the altered or deleted 105 * range of the DC. For each node found, we delete any annotations 106 * that terminate on the node, and then delete the node itself. 107 * We then traverse the rest of the tree, changing the offset 108 * on all remaining nodes by: 109 * <PRE> 110 * newOffset = 111 * oldOffset - 112 * ( 113 * (end - start) - // size of mod 114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl 115 * ); 116 * </PRE> 117 * Note that we use the same convention as e.g. java.lang.String: start 118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd" 119 * range 1-3 = "bc". 
Examples, for a node with offset 4: 120 * <PRE> 121 * edit(1, 3, "BC"); 122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4 123 * 124 * edit(1, 3, null); 125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2 126 * 127 * edit(1, 3, "BBCC"); 128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6 129 * </PRE> 130 */ 131 public class DocumentImpl 132 extends AbstractLanguageResource implements Document, CreoleListener, DatastoreListener { 133 /** Debug flag */ 134 private static final boolean DEBUG = false; 135 136 /** If you set this flag to true the original content of the document will 137 * be kept in the document feature. <br> 138 * Default value is false to avoid the unnecessary waste of memory */ 139 private Boolean preserveOriginalContent = new Boolean(false); 140 141 /** If you set this flag to true the repositioning information for 142 * the document will be kept in the document feature. <br> 143 * Default value is false to avoid the unnecessary waste of time and memory 144 */ 145 private Boolean collectRepositioningInfo = new Boolean(false); 146 147 /** 148 * This is a variable which contains the latest crossed over annotation 149 * found during export with preserving format, i.e., toXml(annotations) 150 * method. 151 */ 152 private Annotation crossedOverAnnotation = null; 153 154 /** Default construction. Content left empty. */ 155 public DocumentImpl() { 156 content = new DocumentContentImpl(); 157 } // default construction 158 159 /** Initialise this resource, and return it. */ 160 public Resource init() throws ResourceInstantiationException { 161 162 // set up the source URL and create the content 163 if(sourceUrl == null) { 164 if(stringContent == null) { 165 throw new ResourceInstantiationException( 166 "The sourceURL and document's content were null." 
167 ); 168 } 169 170 content = new DocumentContentImpl(stringContent); 171 getFeatures().put("gate.SourceURL", "created from String"); 172 } else { 173 try { 174 175 content = new DocumentContentImpl( 176 sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset); 177 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); 178 } catch(IOException e) { 179 throw new ResourceInstantiationException("DocumentImpl.init: " + e); 180 } 181 182 if(preserveOriginalContent.booleanValue() && content != null) { 183 String originalContent = new String( 184 ((DocumentContentImpl) content).getOriginalContent()); 185 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, 186 originalContent); 187 } // if 188 } 189 190 // set up a DocumentFormat if markup unpacking required 191 if(getMarkupAware().booleanValue()) { 192 DocumentFormat docFormat = 193 DocumentFormat.getDocumentFormat(this, sourceUrl); 194 try { 195 if(docFormat != null){ 196 StatusListener sListener = (StatusListener) 197 gate.gui.MainFrame.getListeners(). 
198 get("gate.event.StatusListener"); 199 if(sListener != null) docFormat.addStatusListener(sListener); 200 201 // set the flag if true and if the document format support collecting 202 docFormat.setShouldCollectRepositioning(collectRepositioningInfo); 203 204 if(docFormat.getShouldCollectRepositioning().booleanValue()) { 205 // unpack with collectiong of repositioning information 206 RepositioningInfo info = new RepositioningInfo(); 207 208 String origContent = (String) getFeatures().get( 209 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); 210 211 RepositioningInfo ampCodingInfo = new RepositioningInfo(); 212 if(origContent != null) { 213 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat; 214 collectInformationForAmpCodding(origContent, ampCodingInfo, 215 shouldCorrectCR); 216 if(docFormat instanceof HtmlDocumentFormat) { 217 collectInformationForWS(origContent, ampCodingInfo); 218 } // if 219 } // if 220 221 docFormat.unpackMarkup(this, info, ampCodingInfo); 222 223 if(origContent != null 224 && docFormat instanceof XmlDocumentFormat) { 225 // CRLF correction of RepositioningInfo 226 correctRepositioningForCRLFInXML(origContent, info); 227 } // if 228 229 getFeatures().put( 230 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info); 231 } 232 else { 233 // normal old fashioned unpack 234 docFormat.unpackMarkup(this); 235 } 236 docFormat.removeStatusListener(sListener); 237 } //if format != null 238 } catch(DocumentFormatException e) { 239 throw new ResourceInstantiationException( 240 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() + 241 " " + e 242 ); 243 } 244 } // if markup aware 245 246 return this; 247 } // init() 248 249 /** 250 * Correct repositioning information for substitution of "\r\n" with "\n" 251 */ 252 private void correctRepositioningForCRLFInXML(String content, 253 RepositioningInfo info) { 254 int index = -1; 255 256 do { 257 index = content.indexOf("\r\n", index+1); 258 if(index != -1) { 259 
info.correctInformationOriginalMove(index, 1); 260 } // if 261 } while(index != -1); 262 } // correctRepositioningForCRLF 263 264 /** 265 * Collect information for substitution of "&xxx;" with "y" 266 * 267 * It couldn't be collected a position information about 268 * some unicode and &-coded symbols during parsing. The parser "hide" the 269 * information about the position of such kind of parsed text. 270 * So, there is minimal chance to have &-coded symbol inside the covered by 271 * repositioning records area. The new record should be created for every 272 * coded symbol outside the existing records. 273 * <BR> 274 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction 275 * for CRLF substitution is performed. 276 */ 277 private void collectInformationForAmpCodding(String content, 278 RepositioningInfo info, 279 boolean shouldCorrectCR) { 280 281 if(content == null || info == null) return; 282 283 int ampIndex = -1; 284 int semiIndex; 285 286 do { 287 ampIndex = content.indexOf('&', ampIndex+1); 288 if(ampIndex != -1) { 289 semiIndex = content.indexOf(';', ampIndex+1); 290 // have semicolon and it is near enough for amp codding 291 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) { 292 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1); 293 } 294 else { 295 // no semicolon or it is too far 296 // analyse for amp codding without semicolon 297 int maxEnd = Math.min(ampIndex+8, content.length()); 298 String ampCandidate = content.substring(ampIndex, maxEnd); 299 int ampCodingSize = analyseAmpCodding(ampCandidate); 300 301 if(ampCodingSize != -1) { 302 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1); 303 } // if 304 305 } // if - semicolon found 306 } // if - ampersand found 307 } while (ampIndex != -1); 308 309 // correct the collected information to adjust it's positions 310 // with reported by the parser 311 int index = -1; 312 313 if(shouldCorrectCR) { 314 do { 315 index = content.indexOf("\r\n", index+1); 316 if(index != -1) { 317 
info.correctInformationOriginalMove(index, -1); 318 } // if 319 } while(index != -1); 320 } // if 321 } // collectInformationForAmpCodding 322 323 /** 324 * This function compute size of the ampersand codded sequence when 325 * semicolin is not present. 326 */ 327 private int analyseAmpCodding(String content) { 328 int result = -1; 329 330 try { 331 char ch = content.charAt(1); 332 333 switch(ch) { 334 case 'l' : // < 335 case 'L' : // < 336 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 337 result = 3; 338 } // if 339 break; 340 case 'g' : // > 341 case 'G' : // > 342 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 343 result = 3; 344 } // if 345 break; 346 case 'a' : // & 347 case 'A' : // & 348 if(content.substring(2, 4).equalsIgnoreCase("mp")) { 349 result = 4; 350 } // if 351 break; 352 case 'q' : // " 353 case 'Q' : // " 354 if(content.substring(2, 5).equalsIgnoreCase("uot")) { 355 result = 5; 356 } // if 357 break; 358 case '#' : // #number (example ‘, 䰸) 359 int endIndex = 2; 360 boolean hexCoded = false; 361 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') { 362 // Hex codding 363 ++endIndex; 364 hexCoded = true; 365 } // if 366 367 while (endIndex < 8 368 && isNumber(content.charAt(endIndex), hexCoded) ) { 369 ++endIndex; 370 } // while 371 result = endIndex; 372 break; 373 } // switch 374 } catch (StringIndexOutOfBoundsException ex) { 375 // do nothing 376 } // catch 377 378 return result; 379 } // analyseAmpCodding 380 381 /** Check for numeric range. If hex is true the A..F range is included */ 382 private boolean isNumber(char ch, boolean hex) { 383 if(ch >= '0' && ch <= '9') return true; 384 385 if(hex) { 386 if(ch >= 'A' && ch <= 'F') return true; 387 if(ch >= 'a' && ch <= 'f') return true; 388 } // if 389 390 return false; 391 } // isNumber 392 393 /** HTML parser perform substitution of multiple whitespaces (WS) with 394 * a single WS. 
To create correct repositioning information structure we 395 * should keep the information for such multiple WS. 396 * <BR> 397 * The criteria for WS is <code>(ch <= ' ')</code>. 398 */ 399 private void collectInformationForWS(String content, RepositioningInfo info) { 400 401 if(content == null || info == null) return; 402 403 // analyse the content and correct the repositioning information 404 char ch; 405 int startWS, endWS; 406 407 startWS = endWS = -1; 408 int contentLength = content.length(); 409 410 for(int i=0; i<contentLength; ++i) { 411 ch = content.charAt(i); 412 413 // is whitespace 414 if(ch <= ' ') { 415 if(startWS == -1) { 416 startWS = i; 417 } // if 418 endWS = i; 419 } 420 else { 421 if(endWS - startWS > 0) { 422 // put the repositioning information about the WS substitution 423 info.addPositionInfo( 424 (long)startWS, (long)(endWS - startWS + 1), 0, 1); 425 } // if 426 // clear positions 427 startWS = endWS = -1; 428 }// if 429 } // for 430 } // collectInformationForWS 431 432 /** Clear all the data members of the object. */ 433 public void cleanup() { 434 435 defaultAnnots = null; 436 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty())) 437 namedAnnotSets.clear(); 438 if (DEBUG) Out.prln("Document cleanup called"); 439 if (this.lrPersistentId != null) 440 Gate.getCreoleRegister().removeCreoleListener(this); 441 if(this.getDataStore() != null) 442 this.getDataStore().removeDatastoreListener(this); 443 } // cleanup() 444 445 446 /** Documents are identified by URLs */ 447 public URL getSourceUrl() { return sourceUrl; } 448 449 /** Set method for the document's URL */ 450 public void setSourceUrl(URL sourceUrl) { 451 this.sourceUrl = sourceUrl; 452 } // setSourceUrl 453 454 /** Documents may be packed within files; in this case an optional pair of 455 * offsets refer to the location of the document. 
456 */ 457 public Long[] getSourceUrlOffsets() { 458 Long[] sourceUrlOffsets = new Long[2]; 459 sourceUrlOffsets[0] = sourceUrlStartOffset; 460 sourceUrlOffsets[1] = sourceUrlEndOffset; 461 return sourceUrlOffsets; 462 } // getSourceUrlOffsets 463 464 /** 465 * Allow/disallow preserving of the original document content. 466 * If is <B>true</B> the original content will be retrieved from 467 * the DocumentContent object and preserved as document feature. 468 */ 469 public void setPreserveOriginalContent(Boolean b) { 470 preserveOriginalContent = b; 471 } // setPreserveOriginalContent 472 473 /** Get the preserving of content status of the Document. 474 * 475 * @return whether the Document should preserve it's original content. 476 */ 477 public Boolean getPreserveOriginalContent() { 478 return preserveOriginalContent; 479 } // getPreserveOriginalContent 480 481 /** 482 * Allow/disallow collecting of repositioning information. 483 * If is <B>true</B> information will be retrieved and preserved 484 * as document feature.<BR> 485 * Preserving of repositioning information give the possibilities 486 * for converting of coordinates between the original document content and 487 * extracted from the document text. 488 */ 489 public void setCollectRepositioningInfo(Boolean b) { 490 collectRepositioningInfo = b; 491 } // setCollectRepositioningInfo 492 493 /** Get the collectiong and preserving of repositioning information 494 * for the Document. <BR> 495 * Preserving of repositioning information give the possibilities 496 * for converting of coordinates between the original document content and 497 * extracted from the document text. 498 * 499 * @return whether the Document should collect and preserve information. 
500 */ 501 public Boolean getCollectRepositioningInfo() { 502 return collectRepositioningInfo; 503 } // getCollectRepositioningInfo 504 505 /** Documents may be packed within files; in this case an optional pair of 506 * offsets refer to the location of the document. This method gets the 507 * start offset. 508 */ 509 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; } 510 511 /** Documents may be packed within files; in this case an optional pair of 512 * offsets refer to the location of the document. This method sets the 513 * start offset. 514 */ 515 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { 516 this.sourceUrlStartOffset = sourceUrlStartOffset; 517 } // setSourceUrlStartOffset 518 519 /** Documents may be packed within files; in this case an optional pair of 520 * offsets refer to the location of the document. This method gets the 521 * end offset. 522 */ 523 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; } 524 525 /** Documents may be packed within files; in this case an optional pair of 526 * offsets refer to the location of the document. This method sets the 527 * end offset. 528 */ 529 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { 530 this.sourceUrlEndOffset = sourceUrlEndOffset; 531 } // setSourceUrlStartOffset 532 533 /** The content of the document: a String for text; MPEG for video; etc. */ 534 public DocumentContent getContent() { return content; } 535 536 /** Set method for the document content */ 537 public void setContent(DocumentContent content) { this.content = content; } 538 539 /** Get the encoding of the document content source */ 540 public String getEncoding() { return encoding; } 541 542 /** Set the encoding of the document content source */ 543 public void setEncoding(String encoding) { this.encoding = encoding; } 544 545 /** Get the default set of annotations. The set is created if it 546 * doesn't exist yet. 
547 */ 548 public AnnotationSet getAnnotations() { 549 if(defaultAnnots == null){ 550 defaultAnnots = new AnnotationSetImpl(this); 551 fireAnnotationSetAdded(new DocumentEvent( 552 this, DocumentEvent.ANNOTATION_SET_ADDED, null)); 553 }//if 554 return defaultAnnots; 555 } // getAnnotations() 556 557 /** Get a named set of annotations. Creates a new set if one with this 558 * name doesn't exist yet. 559 * If the provided name is null then it returns the default annotation set. 560 */ 561 public AnnotationSet getAnnotations(String name) { 562 if(name == null) return getAnnotations(); 563 if(namedAnnotSets == null) 564 namedAnnotSets = new HashMap(); 565 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name); 566 567 if(namedSet == null) { 568 namedSet = new AnnotationSetImpl(this, name); 569 namedAnnotSets.put(name, namedSet); 570 571 DocumentEvent evt = new DocumentEvent( 572 this, DocumentEvent.ANNOTATION_SET_ADDED, name 573 ); 574 fireAnnotationSetAdded(evt); 575 } 576 return namedSet; 577 } // getAnnotations(name) 578 579 /** Make the document markup-aware. This will trigger the creation 580 * of a DocumentFormat object at Document initialisation time; the 581 * DocumentFormat object will unpack the markup in the Document and 582 * add it as annotations. Documents are <B>not</B> markup-aware by default. 583 * 584 * @param b markup awareness status. 585 */ 586 public void setMarkupAware(Boolean newMarkupAware) { 587 this.markupAware = newMarkupAware; 588 } 589 590 /** Get the markup awareness status of the Document. 591 * <B>Documents are markup-aware by default.</B> 592 * @return whether the Document is markup aware. 593 */ 594 public Boolean getMarkupAware() { return markupAware; } 595 596 /** Returns an XML document aming to preserve the original markups( 597 * the original markup will be in the same place and format as it was 598 * before processing the document) and include (if possible) 599 * the annotations specified in the aSourceAnnotationSet. 
600 * It is equivalent to toXml(aSourceAnnotationSet, true). 601 */ 602 public String toXml(Set aSourceAnnotationSet){ 603 return toXml(aSourceAnnotationSet, true); 604 } 605 606 /** Returns an XML document aming to preserve the original markups( 607 * the original markup will be in the same place and format as it was 608 * before processing the document) and include (if possible) 609 * the annotations specified in the aSourceAnnotationSet. 610 * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost 611 * if they will cause a crosed over situation. 612 * @param aSourceAnnotationSet is an annotation set containing all the 613 * annotations that will be combined with the original marup set. If the 614 * param is <code>null</code> it will only dump the original markups. 615 * @param includeFeatures is a boolean that controls whether the annotation 616 * features should be included or not. If false, only the annotation type 617 * is included in the tag. 618 * @return a string representing an XML document containing the original 619 * markup + dumped annotations form the aSourceAnnotationSet 620 */ 621 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){ 622 623 if(hasOriginalContentFeatures()) { 624 return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures); 625 } // if 626 627 AnnotationSet originalMarkupsAnnotSet = 628 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 629 630 // Create a dumping annotation set on the document. It will be used for 631 // dumping annotations... 632 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this); 633 634 // This set will be constructed inside this method. If is not empty, the 635 // annotation contained will be lost. 
636 if (!dumpingSet.isEmpty()){ 637 Out.prln("WARNING: The dumping annotation set was not empty."+ 638 "All annotation it contained were lost."); 639 dumpingSet.clear(); 640 }// End if 641 642 StatusListener sListener = (StatusListener) 643 gate.gui.MainFrame.getListeners(). 644 get("gate.event.StatusListener"); 645 // Construct the dumping set in that way that all annotations will verify 646 // the condition that there are not annotations which are crossed. 647 // First add all annotation from the original markups 648 if(sListener != null) 649 sListener.statusChanged("Constructing the dumping annotation set."); 650 dumpingSet.addAll(originalMarkupsAnnotSet); 651 // Then take all the annotations from aSourceAnnotationSet and verify if 652 // they can be inserted safely into the dumpingSet. Where not possible, 653 // report. 654 if (aSourceAnnotationSet != null){ 655 Iterator iter = aSourceAnnotationSet.iterator(); 656 while (iter.hasNext()){ 657 Annotation currentAnnot = (Annotation) iter.next(); 658 if(insertsSafety(dumpingSet,currentAnnot)){ 659 dumpingSet.add(currentAnnot); 660 }else if (crossedOverAnnotation != null){ 661 try { 662 Out.prln("Warning: Annotations were found to violate the " + 663 "crossed over condition: \n" + 664 "1. [" + 665 getContent().getContent( 666 crossedOverAnnotation.getStartNode().getOffset(), 667 crossedOverAnnotation.getEndNode().getOffset()) + 668 " (" + crossedOverAnnotation.getType() + ": " + 669 crossedOverAnnotation.getStartNode().getOffset() + 670 ";" + crossedOverAnnotation.getEndNode().getOffset() + 671 ")]\n" + 672 "2. 
[" + 673 getContent().getContent( 674 currentAnnot.getStartNode().getOffset(), 675 currentAnnot.getEndNode().getOffset()) + 676 " (" + currentAnnot.getType() + ": " + 677 currentAnnot.getStartNode().getOffset() + 678 ";" + currentAnnot.getEndNode().getOffset() + 679 ")]\nThe second one will be discarded.\n" ); 680 } catch (gate.util.InvalidOffsetException ex) { 681 throw new GateRuntimeException(ex.getMessage()); 682 } 683 }// End if 684 }// End while 685 }// End if 686 687 // The dumpingSet is ready to be exported as XML 688 // Here we go. 689 if(sListener != null) sListener.statusChanged("Dumping annotations as XML"); 690 StringBuffer xmlDoc = new StringBuffer( 691 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 692 // Add xml header 693 // xmlDoc.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); 694 695 // If the annotation set contains this "GatePreserveFormat" 696 // type, then this is removed because it will be added in the saving 697 // process. The reason of this removal is that if the loaded document 698 // was previously loaded from a GatePreserveFormat then we 699 // don't want to create lots of annotation for this type. This annotation 700 // type should be always the root element of a XML preserving format 701 // GATE document. 
702 FeatureMap docFeatures = this.getFeatures(); 703 String mimeTypeStr = null; 704 // addGatePreserveFormatTag = false; 705 if ( docFeatures != null && 706 null != (mimeTypeStr=(String)docFeatures.get("MimeType")) && 707 ( 708 "text/html".equalsIgnoreCase(mimeTypeStr) || 709 "text/xml".equalsIgnoreCase(mimeTypeStr) || 710 "text/sgml".equalsIgnoreCase(mimeTypeStr) 711 ) 712 ){ 713 /* don't add the root tag */ 714 }else{ 715 // Add the root start element 716 // xmlDoc.append("<GatePreserveFormat"+ 717 // " xmlns:gate=\"http://www.gate.ac.uk\"" + 718 // " gate:annotMaxId=\"" + 719 // getNextAnnotationId() + 720 // "\">"); 721 // addGatePreserveFormatTag = true; 722 }// End if 723 724 xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures)); 725 726 // if (addGatePreserveFormatTag){ 727 // // Add the root end element 728 // xmlDoc.append("</GatePreserveFormat>"); 729 // }// End if 730 if(sListener != null) sListener.statusChanged("Done."); 731 return xmlDoc.toString(); 732 }//End toXml() 733 734 /** This method verifies if aSourceAnnotation can ve inserted safety into the 735 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over 736 * contition with any annotation from the aTargetAnnotSet. 737 * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation 738 * @param aSourceAnnotation the annotation to be inserted into the 739 * aTargetAnnotSet 740 * @return true if the annotation inserts safety, or false otherwise. 
741 */ 742 private boolean insertsSafety(AnnotationSet aTargetAnnotSet, 743 Annotation aSourceAnnotation){ 744 745 if (aTargetAnnotSet == null || aSourceAnnotation == null) { 746 this.crossedOverAnnotation = null; 747 return false; 748 } 749 if (aSourceAnnotation.getStartNode() == null || 750 aSourceAnnotation.getStartNode().getOffset()== null) { 751 this.crossedOverAnnotation = null; 752 return false; 753 } 754 if (aSourceAnnotation.getEndNode() == null || 755 aSourceAnnotation.getEndNode().getOffset()== null) { 756 this.crossedOverAnnotation = null; 757 return false; 758 } 759 760 // Get the start and end offsets 761 Long start = aSourceAnnotation.getStartNode().getOffset(); 762 Long end = aSourceAnnotation.getEndNode().getOffset(); 763 // Read aSourceAnnotation offsets long 764 long s2 = start.longValue(); 765 long e2 = end.longValue(); 766 767 // Obtain a set with all annotations annotations that overlap 768 // totaly or partially with the interval defined by the two provided offsets 769 AnnotationSet as = aTargetAnnotSet.get(start,end); 770 771 // Investigate all the annotations from as to see if there is one that 772 // comes in conflict with aSourceAnnotation 773 Iterator it = as.iterator(); 774 while(it.hasNext()){ 775 Annotation ann = (Annotation) it.next(); 776 // Read ann offsets 777 long s1 = ann.getStartNode().getOffset().longValue(); 778 long e1 = ann.getEndNode().getOffset().longValue(); 779 780 if (s1<s2 && s2<e1 && e1<e2) { 781 this.crossedOverAnnotation = ann; 782 return false; 783 } 784 if (s2<s1 && s1<e2 && e2<e1) { 785 this.crossedOverAnnotation = ann; 786 return false; 787 } 788 }// End while 789 return true; 790 }// insertsSafety() 791 792 /** This method saves all the annotations from aDumpAnnotSet and combines 793 * them with the document content. 794 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 795 * on the raw text from document content. 
If aDumpAnnotSet is <b>null<b> 796 * then an empty string will be returned. 797 * @param includeFeatures is a boolean, which controls whether the annotation 798 * features and gate ID are included or not. 799 * @return The XML document obtained from raw text + the information from 800 * the dump annotation set. 801 */ 802 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet, 803 boolean includeFeatures){ 804 String content = null; 805 if (this.getContent()== null) 806 content = new String(""); 807 else 808 content = this.getContent().toString(); 809 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 810 if (aDumpAnnotSet == null) return docContStrBuff.toString(); 811 812 TreeMap offsets2CharsMap = new TreeMap(); 813 if (this.getContent().size().longValue() != 0){ 814 // Fill the offsets2CharsMap with all the indices where 815 // special chars appear 816 buildEntityMapFromString(content,offsets2CharsMap); 817 }//End if 818 // The saving alghorithm is as follows: 819 /////////////////////////////////////////// 820 // Construct a set of annot with all IDs in asc order. 821 // All annotations that end at that offset swap their place in descending 822 // order. For each node write all the tags from left to right. 823 824 // Construct the node set 825 TreeSet offsets = new TreeSet(); 826 Iterator iter = aDumpAnnotSet.iterator(); 827 while (iter.hasNext()){ 828 Annotation annot = (Annotation) iter.next(); 829 offsets.add(annot.getStartNode().getOffset()); 830 offsets.add(annot.getEndNode().getOffset()); 831 }// End while 832 isRootTag = false; 833 // ofsets is sorted in ascending order. 834 // Iterate this set in descending order and remove an offset at each 835 // iteration 836 while (!offsets.isEmpty()){ 837 Long offset = (Long)offsets.last(); 838 // Remove the offset from the set 839 offsets.remove(offset); 840 // Now, use it. 841 // Returns a list with annotations that needs to be serialized in that 842 // offset. 
843 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset); 844 // Attention: the annotation are serialized from left to right 845 StringBuffer tmpBuff = new StringBuffer(""); 846 Stack stack = new Stack(); 847 // Iterate through all these annotations and serialize them 848 Iterator it = annotations.iterator(); 849 while(it.hasNext()){ 850 Annotation a = (Annotation) it.next(); 851 it.remove(); 852 // Test if a Ends at offset 853 if ( offset.equals(a.getEndNode().getOffset()) ){ 854 // Test if a Starts at offset 855 if ( offset.equals(a.getStartNode().getOffset()) ){ 856 // Here, the annotation a Starts and Ends at the offset 857 if ( null != a.getFeatures().get("isEmptyAndSpan") && 858 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 859 860 // Assert: annotation a with start == end and isEmptyAndSpan 861 if (offsets.isEmpty() && "".equals(tmpBuff.toString())){ 862 // a is the doc's root tag to be written 863 // The annotations are serialized from left to right. 864 // The first annot in the last offset is the ROOT one 865 isRootTag = true; 866 }// End if 867 tmpBuff.append(writeStartTag(a, includeFeatures)); 868 stack.push(a); 869 }else{ 870 // Assert annotation a with start == end and an empty tag 871 tmpBuff.append(writeEmptyTag(a)); 872 // The annotation is removed from dumped set 873 aDumpAnnotSet.remove(a); 874 }// End if 875 }else{ 876 // Here the annotation a Ends at the offset. 877 // In this case empty the stack and write the end tag 878 if (!stack.isEmpty()){ 879 while(!stack.isEmpty()){ 880 Annotation a1 = (Annotation)stack.pop(); 881 tmpBuff.append(writeEndTag(a1)); 882 }// End while 883 }// End if 884 tmpBuff.append(writeEndTag(a)); 885 }// End if 886 }else{ 887 // The annotation a does NOT end at the offset. Let's see if it starts 888 // at the offset 889 if ( offset.equals(a.getStartNode().getOffset()) ){ 890 // The annotation a starts at the offset. 
891 // In this case empty the stack and write the end tag 892 if (!stack.isEmpty()){ 893 while(!stack.isEmpty()){ 894 Annotation a1 = (Annotation)stack.pop(); 895 tmpBuff.append(writeEndTag(a1)); 896 }// End while 897 }// End if 898 if (offsets.isEmpty() && "".equals(tmpBuff.toString())){ 899 // a is the last tag to be written 900 // The annotations are serialized from left to right. 901 // The first annot in the last offset is the ROOT one. 902 isRootTag = true; 903 }// End if 904 tmpBuff.append(writeStartTag(a, includeFeatures)); 905 // The annotation is removed from dumped set 906 aDumpAnnotSet.remove(a); 907 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 908 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 909 }// End while(it.hasNext()){ 910 911 // In this case empty the stack and write the end tag 912 if (!stack.isEmpty()){ 913 while(!stack.isEmpty()){ 914 Annotation a1 = (Annotation)stack.pop(); 915 tmpBuff.append(writeEndTag(a1)); 916 }// End while 917 }// End if 918 919 // Before inserting tmpBuff into docContStrBuff we need to check 920 // if there are chars to be replaced and if there are, they would be 921 // replaced. 922 if (!offsets2CharsMap.isEmpty()){ 923 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 924 while( !offsets2CharsMap.isEmpty() && 925 offsChar.intValue() >= offset.intValue()){ 926 // Replace the char at offsChar with its corresponding entity form 927 // the entitiesMap. 928 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 929 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 930 // Discard the offsChar after it was used. 
931 offsets2CharsMap.remove(offsChar); 932 // Investigate next offsChar 933 if (!offsets2CharsMap.isEmpty()) 934 offsChar = (Integer) offsets2CharsMap.lastKey(); 935 }// End while 936 }// End if 937 // Insert tmpBuff to the location where it belongs in docContStrBuff 938 docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); 939 }// End while(!offsets.isEmpty()) 940 // Need to replace the entities in the remaining text, if there is any text 941 // So, if there are any more items in offsets2CharsMap they need to be 942 // replaced 943 while (!offsets2CharsMap.isEmpty()){ 944 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 945 // Replace the char with its entity 946 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 947 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 948 // remove the offset from the map 949 offsets2CharsMap.remove(offsChar); 950 }// End while 951 return docContStrBuff.toString(); 952 }// saveAnnotationSetAsXml() 953 954 /** 955 * Return true only if the document has features for original content and 956 * repositioning information. 957 */ 958 private boolean hasOriginalContentFeatures() { 959 FeatureMap features = getFeatures(); 960 boolean result = false; 961 962 result = 963 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null) 964 && 965 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) 966 != null); 967 968 return result; 969 } // hasOriginalContentFeatures 970 971 /** This method saves all the annotations from aDumpAnnotSet and combines 972 * them with the original document content, if preserved as feature. 973 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 974 * on the raw text from document content. If aDumpAnnotSet is <b>null<b> 975 * then an empty string will be returned. 976 * @param includeFeatures is a boolean, which controls whether the annotation 977 * features and gate ID are included or not. 
* @return The XML document obtained from raw text + the information from
* the dump annotation set.
*/
private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
                                            boolean includeFeatures){
  // Purpose: insert start/end/empty tags for the annotations of
  // aSourceAnnotationSet into a copy of the ORIGINAL document content (kept
  // as a document feature), translating each annotation offset back to an
  // original-content position through the RepositioningInfo feature.
  //
  // NOTE(review): this method removes annotations from aSourceAnnotationSet
  // while it works (see the remove() calls below), so the caller's set is
  // modified.
  StringBuffer docContStrBuff;

  String origContent;

  // The original content is read from the document features; a missing
  // feature is treated as empty content.
  origContent =
   (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
  if(origContent == null) {
    origContent = "";
  } // if

  // Used further down as the upper bound for a valid insertion position.
  long originalContentSize = origContent.length();

  // NOTE(review): repositioning is not null-checked before it is used in the
  // serialization loop; presumably callers check
  // hasOriginalContentFeatures() first -- confirm.
  RepositioningInfo repositioning = (RepositioningInfo)
    getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

  docContStrBuff = new StringBuffer(origContent);
  // Without annotations there is nothing to insert: return the content as is.
  if (aSourceAnnotationSet == null) return docContStrBuff.toString();

  StatusListener sListener = (StatusListener)
                             gate.gui.MainFrame.getListeners().
                             get("gate.event.StatusListener");

  AnnotationSet originalMarkupsAnnotSet =
          this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // Create a dumping annotation set on the document.
  // NOTE(review): dumpingSet is filled below but never read again in this
  // method; the serialization loop works on aSourceAnnotationSet directly.
  // The visible effect of this pass is the warning printed for annotations
  // that fail insertsSafety() -- they are NOT actually excluded from the
  // output. Confirm whether that is intended.
  AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
  if(sListener != null)
    sListener.statusChanged("Constructing the dumping annotation set.");
  // Check every source annotation against the original markups and against
  // the annotations accepted so far; report the ones that do not fit.
  // (The null test is redundant: null was already handled above.)
  if (aSourceAnnotationSet != null){
    Iterator iter = aSourceAnnotationSet.iterator();
    Annotation currentAnnot;
    while (iter.hasNext()){
      currentAnnot = (Annotation) iter.next();
      if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
         && insertsSafety(dumpingSet, currentAnnot)){
        dumpingSet.add(currentAnnot);
      }else{
        Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
        ", startOffset=" + currentAnnot.getStartNode().getOffset() +
        ", endOffset=" + currentAnnot.getEndNode().getOffset() +
        ", type=" + currentAnnot.getType()+ " was found to violate the" +
        " crossed over condition. It will be discarded");
      }// End if
    }// End while
  }// End if

  if(sListener != null) sListener.statusChanged("Dumping annotations as XML");

  // Collect every distinct start and end offset of the source annotations.
  // The TreeSet keeps them sorted in ascending order.
  TreeSet offsets = new TreeSet();
  Iterator iter = aSourceAnnotationSet.iterator();
  while (iter.hasNext()){
    Annotation annot = (Annotation) iter.next();
    offsets.add(annot.getStartNode().getOffset());
    offsets.add(annot.getEndNode().getOffset());
  }// End while
  // No root tag is written by this method, so writeStartTag() must never
  // take its root-tag branch here.
  isRootTag = false;

  // Consume the offsets from the largest to the smallest, so that text
  // inserted at one offset does not shift the positions still to be
  // processed.
  while (!offsets.isEmpty()){
    Long offset = (Long)offsets.last();
    // Remove the offset from the set
    offsets.remove(offset);
    // The annotations that start and/or end at this offset, in the order in
    // which their tags have to be written from left to right.
    List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
    // All the tags written at this offset are gathered in tmpBuff.
    StringBuffer tmpBuff = new StringBuffer("");
    // Holds zero-length "isEmptyAndSpan" annotations whose start tag was
    // written and whose end tag is still pending.
    Stack stack = new Stack();
    Iterator it = annotations.iterator();
    // Declared outside the loop: after the loop it still refers to the LAST
    // annotation processed at this offset (used for backPositioning below).
    Annotation a = null;
    while(it.hasNext()) {
      a = (Annotation) it.next();
      it.remove();
      // Test if a Ends at offset
      if ( offset.equals(a.getEndNode().getOffset()) ){
        // Test if a Starts at offset
        if ( offset.equals(a.getStartNode().getOffset()) ){
          // Here, the annotation a Starts and Ends at the offset
          if ( null != a.getFeatures().get("isEmptyAndSpan") &&
               "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){

            // Zero-length annotation flagged isEmptyAndSpan: write a start
            // tag now and remember it so its end tag is written later.
            tmpBuff.append(writeStartTag(a, includeFeatures, false));
            stack.push(a);
          }else{
            // Zero-length annotation without the flag: write an empty tag.
            tmpBuff.append(writeEmptyTag(a, false));
            // The annotation is fully written; drop it from the source set.
            aSourceAnnotationSet.remove(a);
          }// End if
        }else{
          // The annotation only Ends at the offset: first close every
          // pending isEmptyAndSpan tag, then write this end tag.
          while(!stack.isEmpty()){
            Annotation a1 = (Annotation)stack.pop();
            tmpBuff.append(writeEndTag(a1));
          }// End while
          tmpBuff.append(writeEndTag(a));
        }// End if
      }else{
        // The annotation a does NOT end at the offset. Let's see if it starts
        // at the offset
        if ( offset.equals(a.getStartNode().getOffset()) ){
          // The annotation only Starts at the offset: close the pending
          // isEmptyAndSpan tags, then write this start tag.
          while(!stack.isEmpty()){
            Annotation a1 = (Annotation)stack.pop();
            tmpBuff.append(writeEndTag(a1));
          }// End while

          tmpBuff.append(writeStartTag(a, includeFeatures, false));
          // Offsets are processed in descending order, so the end tag of
          // this annotation was already handled; drop it from the source set.
          aSourceAnnotationSet.remove(a);
        }// End if ( offset.equals(a.getStartNode().getOffset()) )
      }// End if ( offset.equals(a.getEndNode().getOffset()) )
    }// End while(it.hasNext()){

    // Close whatever isEmptyAndSpan tags are still pending at this offset.
    while(!stack.isEmpty()){
      Annotation a1 = (Annotation)stack.pop();
      tmpBuff.append(writeEndTag(a1));
    }// End while

    // Translate the offset into a position inside the original content.
    // -1 is treated as "no position found".
    long originalPosition = -1;
    // When the last annotation processed at this offset ends here, first try
    // the two-argument lookup.
    // NOTE(review): the exact meaning of the boolean passed to
    // getOriginalPos() is defined in RepositioningInfo, not visible here.
    boolean backPositioning =
      a != null && offset.equals(a.getEndNode().getOffset());
    if ( backPositioning ) {
      // end of the annotation correction
      originalPosition =
        repositioning.getOriginalPos(offset.intValue(), true);
    } // if

    // Fall back to the plain lookup when the first one was not tried or
    // returned -1.
    if(originalPosition == -1) {
      originalPosition = repositioning.getOriginalPos(offset.intValue());
    } // if

    // Insert tmpBuff at the position where it belongs in docContStrBuff,
    // or report that the offset could not be mapped (the tags gathered for
    // this offset are then lost).
    if(originalPosition != -1 && originalPosition <= originalContentSize ) {
      docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
    }
    else {
      Out.prln("Error in the repositioning. The offset ("+offset.intValue()
      +") could not be positioned in the original document. \n"
      +"Calculated position is: "+originalPosition
      +" placed back: "+backPositioning);
    } // if

  }// End while(!offsets.isEmpty())

  return docContStrBuff.toString();
} // saveAnnotationSetAsXmlInOrig()

/** This method returns a list with annotations ordered that way that
 * they can be serialized from left to right, at the offset. If one of the
 * params is null then an empty list will be returned.
1154 * @param aDumpAnnotSet is a set containing all annotations that will be 1155 * dumped. 1156 * @param offset represent the offset at witch the annotation must start 1157 * AND/OR end. 1158 * @return a list with those annotations that need to be serialized. 1159 */ 1160 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){ 1161 List annotationList = new LinkedList(); 1162 if (aDumpAnnotSet == null || offset == null) return annotationList; 1163 Set annotThatStartAtOffset = new TreeSet( 1164 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC)); 1165 Set annotThatEndAtOffset = new TreeSet( 1166 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC)); 1167 Set annotThatStartAndEndAtOffset = new TreeSet( 1168 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC)); 1169 1170 // Fill these tree lists with annotation tat start, end or start and 1171 // end at the offset. 1172 Iterator iter = aDumpAnnotSet.iterator(); 1173 while(iter.hasNext()){ 1174 Annotation ann = (Annotation) iter.next(); 1175 if (offset.equals(ann.getStartNode().getOffset())){ 1176 if (offset.equals(ann.getEndNode().getOffset())) 1177 annotThatStartAndEndAtOffset.add(ann); 1178 else 1179 annotThatStartAtOffset.add(ann); 1180 }else{ 1181 if (offset.equals(ann.getEndNode().getOffset())) 1182 annotThatEndAtOffset.add(ann); 1183 }// End if 1184 }// End while 1185 annotationList.addAll(annotThatEndAtOffset); 1186 annotThatEndAtOffset = null; 1187 annotationList.addAll(annotThatStartAtOffset); 1188 annotThatStartAtOffset = null; 1189 iter = annotThatStartAndEndAtOffset.iterator(); 1190 while(iter.hasNext()){ 1191 Annotation ann = (Annotation) iter.next(); 1192 Iterator it = annotationList.iterator(); 1193 boolean breaked = false; 1194 while (it.hasNext()){ 1195 Annotation annFromList = (Annotation) it.next(); 1196 if (annFromList.getId().intValue() > ann.getId().intValue()){ 1197 annotationList.add(annotationList.indexOf(annFromList),ann); 1198 breaked = true; 1199 break; 1200 }// End if 1201 }// 
End while 1202 if (!breaked) 1203 annotationList.add(ann); 1204 iter.remove(); 1205 }// End while 1206 return annotationList; 1207 }// getAnnotationsForOffset() 1208 1209 private String writeStartTag(Annotation annot, boolean includeFeatures){ 1210 return writeStartTag(annot, includeFeatures, true); 1211 } // writeStartTag 1212 1213 /** Returns a string representing a start tag based on the input annot*/ 1214 private String writeStartTag(Annotation annot, boolean includeFeatures, 1215 boolean includeNamespace){ 1216 AnnotationSet originalMarkupsAnnotSet = 1217 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1218 1219 StringBuffer strBuff = new StringBuffer(""); 1220 if (annot == null) return strBuff.toString(); 1221// if (!addGatePreserveFormatTag && isRootTag){ 1222 if (isRootTag){ 1223 //the features are included either if desired or if that's an annotation 1224 //from the original markup of the document. We don't want for example to 1225 //spoil all links in an HTML file! 
1226 if (includeFeatures) { 1227 strBuff.append("<"); 1228 strBuff.append(annot.getType()); 1229 strBuff.append(" "); 1230 if(includeNamespace) { 1231 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\""); 1232 strBuff.append(" gate:"); 1233 } 1234 strBuff.append("gateId=\""); 1235 strBuff.append(annot.getId()); 1236 strBuff.append("\""); 1237 strBuff.append(" "); 1238 if(includeNamespace) { 1239 strBuff.append("gate:"); 1240 } 1241 strBuff.append("annotMaxId=\""); 1242 strBuff.append(getNextAnnotationId()); 1243 strBuff.append("\""); 1244 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1245 strBuff.append(">"); 1246 } 1247 else if (originalMarkupsAnnotSet.contains(annot)) { 1248 strBuff.append("<"); 1249 strBuff.append(annot.getType()); 1250 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1251 strBuff.append(">"); 1252 } 1253 else { 1254 strBuff.append("<"); 1255 strBuff.append(annot.getType()); 1256 strBuff.append(">"); 1257 } 1258 // Once the root tag was writen then there will be no other Root tag 1259 isRootTag = false; 1260 }else{ 1261 //the features are included either if desired or if that's an annotation 1262 //from the original markup of the document. We don't want for example to 1263 //spoil all links in an HTML file! 
1264 if (includeFeatures) { 1265 strBuff.append("<"); 1266 strBuff.append(annot.getType()); 1267 strBuff.append(" "); 1268 if(includeNamespace) { 1269 strBuff.append("gate:"); 1270 } // if includeNamespaces 1271 strBuff.append("gateId=\""); 1272 strBuff.append(annot.getId()); 1273 strBuff.append("\""); 1274 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1275 strBuff.append(">"); 1276 } 1277 else if (originalMarkupsAnnotSet.contains(annot)) { 1278 strBuff.append("<"); 1279 strBuff.append(annot.getType()); 1280 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1281 strBuff.append(">"); 1282 } 1283 else { 1284 strBuff.append("<"); 1285 strBuff.append(annot.getType()); 1286 strBuff.append(">"); 1287 } 1288 }// End if 1289 return strBuff.toString(); 1290 }// writeStartTag() 1291 1292 /** This method takes aScanString and searches for those chars from 1293 * entitiesMap that appear in the string. A tree map(offset2Char) is filled 1294 * using as key the offsets where those Chars appear and the Char. 1295 * If one of the params is null the method simply returns. 
1296 */ 1297 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){ 1298 if (aScanString == null || aMapToFill == null) return; 1299 if (entitiesMap == null || entitiesMap.isEmpty()){ 1300 Err.prln("WARNING: Entities map was not initialised !"); 1301 return; 1302 }// End if 1303 // Fill the Map with the offsets of the special chars 1304 Iterator entitiesMapIterator = entitiesMap.keySet().iterator(); 1305 while(entitiesMapIterator.hasNext()){ 1306 Character c = (Character) entitiesMapIterator.next(); 1307 int fromIndex = 0; 1308 while (-1 != fromIndex){ 1309 fromIndex = aScanString.indexOf(c.charValue(),fromIndex); 1310 if (-1 != fromIndex){ 1311 aMapToFill.put(new Integer(fromIndex),c); 1312 fromIndex ++; 1313 }// End if 1314 }// End while 1315 }// End while 1316 }//buildEntityMapFromString(); 1317 1318 private String writeEmptyTag(Annotation annot){ 1319 return writeEmptyTag(annot, true); 1320 } // writeEmptyTag 1321 1322 /** Returns a string representing an empty tag based on the input annot*/ 1323 private String writeEmptyTag(Annotation annot, boolean includeNamespace){ 1324 StringBuffer strBuff = new StringBuffer(""); 1325 if (annot == null) return strBuff.toString(); 1326 1327 strBuff.append("<"); 1328 strBuff.append(annot.getType()); 1329 1330 AnnotationSet originalMarkupsAnnotSet = 1331 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1332 if (! 
originalMarkupsAnnotSet.contains(annot)) { 1333 strBuff.append(" gateId=\""); 1334 strBuff.append(annot.getId()); 1335 strBuff.append("\""); 1336 } 1337 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace)); 1338 strBuff.append("/>"); 1339 1340 return strBuff.toString(); 1341 }// writeEmptyTag() 1342 1343 /** Returns a string representing an end tag based on the input annot*/ 1344 private String writeEndTag(Annotation annot){ 1345 StringBuffer strBuff = new StringBuffer(""); 1346 if (annot == null) return strBuff.toString(); 1347/* 1348 if (annot.getType().indexOf(" ") != -1) 1349 Out.prln("Warning: Truncating end tag to first word for annot type \"" 1350 +annot.getType()+ "\". "); 1351*/ 1352 strBuff.append("</"+annot.getType()+">"); 1353 return strBuff.toString(); 1354 }// writeEndTag() 1355 1356 /** Returns a string representing a FeatureMap serialized as XML attributes*/ 1357 private String writeFeatures(FeatureMap feat, boolean includeNamespace){ 1358 StringBuffer strBuff = new StringBuffer(""); 1359 if (feat == null) return strBuff.toString(); 1360 Iterator it = feat.keySet().iterator(); 1361 while (it.hasNext()){ 1362 Object key = it.next(); 1363 Object value = feat.get(key); 1364 if ( (key != null) && (value != null) ){ 1365 // Eliminate a feature inserted at reading time and which help to 1366 // take some decissions at saving time 1367 if ("isEmptyAndSpan".equals(key.toString())) 1368 continue; 1369 if( !(String.class.isAssignableFrom(key.getClass()) || 1370 Number.class.isAssignableFrom(key.getClass()))){ 1371 1372 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+ 1373 " from String or Number.(feature discarded)"); 1374 continue; 1375 }// End if 1376 if ( !(String.class.isAssignableFrom(value.getClass()) || 1377 Number.class.isAssignableFrom(value.getClass()) || 1378 java.util.Collection.class.isAssignableFrom(value.getClass()))){ 1379 1380 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+ 1381 " from 
String, Number or Collection.(feature discarded)"); 1382 continue; 1383 }// End if 1384 if ("matches".equals(key)) { 1385 strBuff.append(" "); 1386 if(includeNamespace) { 1387 strBuff.append("gate:"); 1388 } 1389 strBuff.append(key); 1390 strBuff.append("=\""); 1391 } 1392 else { 1393 strBuff.append(" "); 1394 strBuff.append(key); 1395 strBuff.append("=\""); 1396 } 1397 if (java.util.Collection.class.isAssignableFrom(value.getClass())){ 1398 Iterator valueIter = ((Collection)value).iterator(); 1399 while(valueIter.hasNext()){ 1400 Object item = valueIter.next(); 1401 if (!(String.class.isAssignableFrom(item.getClass()) || 1402 Number.class.isAssignableFrom(item.getClass()))) 1403 continue; 1404 strBuff.append(item); 1405 strBuff.append(";"); 1406 }// End while 1407 if (strBuff.charAt(strBuff.length()-1) == ';') 1408 strBuff.deleteCharAt(strBuff.length()-1); 1409 }else{ 1410 strBuff.append(value); 1411 }// End if 1412 strBuff.append("\""); 1413 }// End if 1414 }// End while 1415 return strBuff.toString(); 1416 }// writeFeatures() 1417 1418 /** Returns a GateXml document that is a custom XML format for wich there is 1419 * a reader inside GATE called gate.xml.GateFormatXmlHandler. 1420 * What it does is to serialize a GATE document in an XML format. 1421 * @return a string representing a Gate Xml document. If saved in a file,this 1422 * string must be written using the UTF-8 encoding because the first line 1423 * in the generated xml document is <?xml version="1.0" encoding="UTF-8" ?> 1424 */ 1425 public String toXml(){ 1426 // Initialize the xmlContent with 3 time the size of the current document. 1427 // This is because of the tags size. This measure is made to increase the 1428 // performance of StringBuffer. 
1429 StringBuffer xmlContent = new StringBuffer( 1430 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue())); 1431 // Add xml header 1432 xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); 1433 // Add the root element 1434 xmlContent.append("<GateDocument>\n"); 1435 xmlContent.append("<!-- The document's features-->\n\n"); 1436 xmlContent.append("<GateDocumentFeatures>\n"); 1437 1438 xmlContent.append(featuresToXml(this.getFeatures())); 1439 xmlContent.append("</GateDocumentFeatures>\n"); 1440 xmlContent.append("<!-- The document content area with serialized"+ 1441 " nodes -->\n\n"); 1442 // Add plain text element 1443 xmlContent.append("<TextWithNodes>"); 1444 xmlContent.append(textWithNodes(this.getContent().toString())); 1445 xmlContent.append("</TextWithNodes>\n"); 1446 // Serialize as XML all document's annotation sets 1447 // Serialize the default AnnotationSet 1448 StatusListener sListener = (StatusListener) 1449 gate.gui.MainFrame.getListeners(). 1450 get("gate.event.StatusListener"); 1451 if(sListener != null) 1452 sListener.statusChanged("Saving the default annotation set "); 1453 xmlContent.append("<!-- The default annotation set -->\n\n"); 1454 xmlContent.append(annotationSetToXml(this.getAnnotations())); 1455 // Serialize all others AnnotationSets 1456 // namedAnnotSets is a Map containing all other named Annotation Sets. 
1457 if (namedAnnotSets != null){ 1458 Iterator iter = namedAnnotSets.values().iterator(); 1459 while(iter.hasNext()){ 1460 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1461 xmlContent.append("<!-- Named annotation set -->\n\n"); 1462 // Serialize it as XML 1463 if(sListener != null) sListener.statusChanged("Saving " + 1464 annotSet.getName()+ 1465 " annotation set "); 1466 xmlContent.append(annotationSetToXml(annotSet)); 1467 }// End while 1468 }// End if 1469 // Add the end of GateDocument 1470 xmlContent.append("</GateDocument>"); 1471 if(sListener != null) sListener.statusChanged("Done !"); 1472 // return the XmlGateDocument 1473 return xmlContent.toString(); 1474 }// toXml 1475 1476 /** This method filters any non XML char 1477 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets 1478 * All non XML chars will be replaced with 0x20 (space char) This assures 1479 * that the next time the document is loaded there won't be any problems. 1480 * @param aStrBuffer represents the input String that is filtred. If the 1481 * aStrBuffer is null then an empty string will be returend 1482 * @return the "purified" StringBuffer version of the aStrBuffer 1483 */ 1484 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){ 1485 if (aStrBuffer == null) return new StringBuffer(""); 1486 String space = new String(" "); 1487 for (int i=aStrBuffer.length()-1;i>=0; i--){ 1488 if (!isXmlChar(aStrBuffer.charAt(i))) 1489 aStrBuffer.replace(i,i+1,space); 1490 }// End for 1491 return aStrBuffer; 1492 }// filterNonXmlChars() 1493 1494 /** This method decide if a char is a valid XML one or not 1495 * @param ch the char to be tested 1496 * @return true if is a valid XML char and fals if is not. 
1497 */ 1498 public static boolean isXmlChar(char ch){ 1499 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true; 1500 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true; 1501 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true; 1502 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true; 1503 return false; 1504 }// End isXmlChar() 1505 1506 /** This method saves a FeatureMap as XML elements. 1507 * @ param aFeatureMap the feature map that has to be saved as XML. 1508 * @ return a String like this: <Feature><Name>...</Name> 1509 * <Value>...</Value></Feature><Feature>...</Feature> 1510 */ 1511 private String featuresToXml(FeatureMap aFeatureMap){ 1512 StringBuffer str = new StringBuffer(""); 1513 1514 if (aFeatureMap == null) return str.toString(); 1515 1516 Set keySet = aFeatureMap.keySet(); 1517 Iterator keyIterator = keySet.iterator(); 1518 while(keyIterator.hasNext()){ 1519 Object key = keyIterator.next(); 1520 Object value = aFeatureMap.get(key); 1521 if ((key != null) && (value != null)){ 1522 String keyClassName = null; 1523 String keyItemClassName = null; 1524 String valueClassName = null; 1525 String valueItemClassName = null; 1526 String key2String = key.toString(); 1527 String value2String = value.toString(); 1528 1529 Object item = null; 1530 // Test key if it is String, Number or Collection 1531 if (key instanceof java.lang.String || 1532 key instanceof java.lang.Number || 1533 key instanceof java.util.Collection) 1534 keyClassName = key.getClass().getName(); 1535 1536 // Test value if it is String, Number or Collection 1537 if (value instanceof java.lang.String || 1538 value instanceof java.lang.Number || 1539 value instanceof java.util.Collection) 1540 valueClassName = value.getClass().getName(); 1541 1542 // Features and values that are not Strings, Numbers or collections 1543 // will be discarded. 
1544 if (keyClassName == null || valueClassName == null) continue; 1545 1546 // If key is collection serialize the colection in a specific format 1547 if (key instanceof java.util.Collection){ 1548 StringBuffer keyStrBuff = new StringBuffer(""); 1549 Iterator iter = ((Collection) key).iterator(); 1550 if (iter.hasNext()){ 1551 item = iter.next(); 1552 if (item instanceof java.lang.Number) 1553 keyItemClassName = item.getClass().getName(); 1554 else 1555 keyItemClassName = String.class.getName(); 1556 keyStrBuff.append(item.toString()); 1557 }// End if 1558 while (iter.hasNext()){ 1559 item = iter.next(); 1560 keyStrBuff.append(";" + item.toString()); 1561 }// End while 1562 key2String = keyStrBuff.toString(); 1563 }// End if 1564 // If key is collection serialize the colection in a specific format 1565 if (value instanceof java.util.Collection){ 1566 StringBuffer valueStrBuff = new StringBuffer(""); 1567 Iterator iter = ((Collection) value).iterator(); 1568 if (iter.hasNext()){ 1569 item = iter.next(); 1570 if (item instanceof java.lang.Number) 1571 valueItemClassName = item.getClass().getName(); 1572 else 1573 valueItemClassName = String.class.getName(); 1574 valueStrBuff.append(item.toString()); 1575 }// End if 1576 while (iter.hasNext()){ 1577 item = iter.next(); 1578 valueStrBuff.append(";" + item.toString()); 1579 }// End while 1580 value2String = valueStrBuff.toString(); 1581 }// End if 1582 str.append("<Feature>\n <Name"); 1583 if (keyClassName != null) 1584 str.append(" className=\""+keyClassName+"\""); 1585 if (keyItemClassName != null) 1586 str.append(" itemClassName=\""+keyItemClassName+"\""); 1587 str.append(">"); 1588 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String))); 1589 str.append("</Name>\n <Value"); 1590 if (valueClassName != null) 1591 str.append(" className=\"" + valueClassName + "\""); 1592 if (valueItemClassName != null) 1593 str.append(" itemClassName=\"" + valueItemClassName + "\""); 1594 str.append(">"); 1595 
str.append(filterNonXmlChars(replaceCharsWithEntities(value2String))); 1596 str.append("</Value>\n</Feature>\n"); 1597 }// End if 1598 }// end While 1599 return str.toString(); 1600 }//featuresToXml 1601 1602 /** This method replace all chars that appears in the anInputString and also 1603 * that are in the entitiesMap with their corresponding entity 1604 * @param anInputString the string analyzed. If it is null then returns the 1605 * empty string 1606 * @return a string representing the input string with chars replaced with 1607 * entities 1608 */ 1609 private StringBuffer replaceCharsWithEntities(String anInputString){ 1610 if (anInputString == null) return new StringBuffer(""); 1611 StringBuffer strBuff = new StringBuffer(anInputString); 1612 for (int i=strBuff.length()-1; i>=0; i--){ 1613 Character ch = new Character(strBuff.charAt(i)); 1614 if (entitiesMap.keySet().contains(ch)){ 1615 strBuff.replace(i,i+1,(String) entitiesMap.get(ch)); 1616 }// End if 1617 }// End for 1618 return strBuff; 1619 }//replaceCharsWithEntities() 1620 1621 /** This method creates Node XML elements and inserts them at the 1622 * corresponding offset inside the text. Nodes are created from the default 1623 * annotation set, as well as from all existing named annotation sets. 1624 * @param aText The text representing the document's plain text. 1625 * @return The text with empty <Node id="NodeId"/> elements. 
1626 */ 1627 private String textWithNodes(String aText){ 1628 if (aText == null) return new String(""); 1629 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText)); 1630 1631 // Construct a map from offsets to Chars 1632 TreeMap offsets2CharsMap = new TreeMap(); 1633 if (aText.length()!= 0){ 1634 // Fill the offsets2CharsMap with all the indices where special chars appear 1635 buildEntityMapFromString(aText,offsets2CharsMap); 1636 }//End if 1637 // Construct the offsetsSet for all nodes belonging to this document 1638 TreeSet offsetsSet = new TreeSet(); 1639 Iterator annotSetIter = this.getAnnotations().iterator(); 1640 while (annotSetIter.hasNext()){ 1641 Annotation annot = (Annotation) annotSetIter.next(); 1642 offsetsSet.add(annot.getStartNode().getOffset()); 1643 offsetsSet.add(annot.getEndNode().getOffset()); 1644 }// end While 1645 // Get the nodes from all other named annotation sets. 1646 if (namedAnnotSets != null){ 1647 Iterator iter = namedAnnotSets.values().iterator(); 1648 while(iter.hasNext()){ 1649 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1650 Iterator iter2 = annotSet.iterator(); 1651 while(iter2.hasNext()){ 1652 Annotation annotTmp = (Annotation) iter2.next(); 1653 offsetsSet.add(annotTmp.getStartNode().getOffset()); 1654 offsetsSet.add(annotTmp.getEndNode().getOffset()); 1655 }// End while 1656 }// End while 1657 }// End if 1658 // offsetsSet is ordered in ascending order because the structure 1659 // is a TreeSet 1660 1661 if (offsetsSet.isEmpty()){ 1662 return replaceCharsWithEntities(aText).toString(); 1663 }// End if 1664 // Iterate through all nodes from anAnnotSet and transform them to 1665 // XML elements. Then insert those elements at the node's offset into the 1666 // textWithNodes . 
1667 while (!offsetsSet.isEmpty()){ 1668 Long offset = (Long) offsetsSet.last(); 1669 // Eliminate the offset from the list in order to create more memory space 1670 offsetsSet.remove(offset); 1671 // Use offset 1672 int offsetValue = offset.intValue(); 1673 String strNode = "<Node id=\"" + offsetValue + "\"/>"; 1674 // Before inserting this string into the textWithNodes, check to see if 1675 // there are any chars to be replaced with their corresponding entities 1676 if (!offsets2CharsMap.isEmpty()){ 1677 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1678 while( !offsets2CharsMap.isEmpty() && 1679 offsChar.intValue() >= offset.intValue()){ 1680 // Replace the char at offsChar with its corresponding entity form 1681 // the entitiesMap. 1682 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1683 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1684 // Discard the offsChar after it was used because this offset will 1685 // never appear again 1686 offsets2CharsMap.remove(offsChar); 1687 // Investigate next offsChar 1688 if (!offsets2CharsMap.isEmpty()) 1689 offsChar = (Integer) offsets2CharsMap.lastKey(); 1690 }// End while 1691 }// End if 1692 // Now it is safe to insert the node 1693 textWithNodes.insert(offsetValue,strNode); 1694 }// end while 1695 // Need to replace the entities in the remaining text, if there is any text 1696 // So, if there are any more items in offsets2CharsMap they need to be 1697 // replaced 1698 while (!offsets2CharsMap.isEmpty()){ 1699 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1700 // Replace the char with its entity 1701 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1702 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1703 // remove the offset from the map 1704 offsets2CharsMap.remove(offsChar); 1705 }// End while 1706 return textWithNodes.toString(); 1707 }//textWithNodes() 1708 1709 /** This method saves an AnnotationSet as XML. 
1710 * @param anAnnotationSet The annotation set that has to be saved as XML. 1711 * @return a String like this: <AnnotationSet> <Annotation>.... 1712 * </AnnotationSet> 1713 */ 1714 private String annotationSetToXml(AnnotationSet anAnnotationSet){ 1715 StringBuffer str = new StringBuffer(""); 1716 1717 if (anAnnotationSet == null){ 1718 str.append("<AnnotationSet>\n"); 1719 str.append("</AnnotationSet>\n"); 1720 return str.toString(); 1721 }// End if 1722 if (anAnnotationSet.getName() == null) 1723 str.append("<AnnotationSet>\n"); 1724 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+ 1725 "\" >\n"); 1726 // Iterate through AnnotationSet and save each Annotation as XML 1727 Iterator iterator = anAnnotationSet.iterator(); 1728 while (iterator.hasNext()){ 1729 Annotation annot = (Annotation) iterator.next(); 1730 str.append("<Annotation " + "Type=\"" + annot.getType() + 1731 "\" StartNode=\"" + annot.getStartNode().getOffset() + 1732 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n"); 1733 str.append(featuresToXml(annot.getFeatures())); 1734 str.append("</Annotation>\n"); 1735 }// End while 1736 1737 str.append("</AnnotationSet>\n"); 1738 return str.toString(); 1739 }// annotationSetToXml 1740 1741 /** Returns a map with the named annotation sets. It returns <code>null</code> 1742 * if no named annotaton set exists. */ 1743 public Map getNamedAnnotationSets() { 1744 return namedAnnotSets; 1745 } // getNamedAnnotationSets 1746 1747 /** 1748 * Removes one of the named annotation sets. 1749 * Note that the default annotation set cannot be removed. 1750 * @param name the name of the annotation set to be removed 1751 */ 1752 public void removeAnnotationSet(String name){ 1753 Object removed = namedAnnotSets.remove(name); 1754 if(removed != null){ 1755 fireAnnotationSetRemoved( 1756 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name)); 1757 } 1758 } 1759 1760 /** Propagate edit changes to the document content and annotations. 
*/ 1761 public void edit(Long start, Long end, DocumentContent replacement) 1762 throws InvalidOffsetException 1763 { 1764 if(! isValidOffsetRange(start, end)) 1765 throw new InvalidOffsetException(); 1766 1767 if(content != null) 1768 ((DocumentContentImpl) content).edit(start, end, replacement); 1769 1770 if(defaultAnnots != null) 1771 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement); 1772 1773 if(namedAnnotSets != null) { 1774 Iterator iter = namedAnnotSets.values().iterator(); 1775 while(iter.hasNext()) 1776 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement); 1777 } 1778 1779 } // edit(start,end,replacement) 1780 1781 /** Check that an offset is valid, i.e. it is non-null, greater than 1782 * or equal to 0 and less than the size of the document content. 1783 */ 1784 public boolean isValidOffset(Long offset) { 1785 if(offset == null) 1786 return false; 1787 1788 long o = offset.longValue(); 1789 if(o > getContent().size().longValue() || o < 0) 1790 return false; 1791 1792 return true; 1793 } // isValidOffset 1794 1795 /** Check that both start and end are valid offsets and that 1796 * they constitute a valid offset range, i.e. start is greater 1797 * than or equal to long. 
1798 */ 1799 public boolean isValidOffsetRange(Long start, Long end) { 1800 return 1801 isValidOffset(start) && isValidOffset(end) && 1802 start.longValue() <= end.longValue(); 1803 } // isValidOffsetRange(start,end) 1804 1805 /** Sets the nextAnnotationId */ 1806 public void setNextAnnotationId(int aNextAnnotationId){ 1807 nextAnnotationId = aNextAnnotationId; 1808 }// setNextAnnotationId(); 1809 1810 /** Generate and return the next annotation ID */ 1811 public Integer getNextAnnotationId() { 1812 return new Integer(nextAnnotationId++); 1813 } // getNextAnnotationId 1814 1815 /** Generate and return the next node ID */ 1816 public Integer getNextNodeId() { return new Integer(nextNodeId++); } 1817 1818 /** Ordering based on URL.toString() and the URL offsets (if any) */ 1819 public int compareTo(Object o) throws ClassCastException { 1820 DocumentImpl other = (DocumentImpl) o; 1821 return getOrderingString().compareTo(other.getOrderingString()); 1822 } // compareTo 1823 1824 /** Utility method to produce a string for comparison in ordering. 1825 * String is based on the source URL and offsets. 1826 */ 1827 protected String getOrderingString() { 1828 if(sourceUrl == null) return toString(); 1829 1830 StringBuffer orderingString = new StringBuffer(sourceUrl.toString()); 1831 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) { 1832 orderingString.append(sourceUrlStartOffset.toString()); 1833 orderingString.append(sourceUrlEndOffset.toString()); 1834 } 1835 1836 return orderingString.toString(); 1837 } // getOrderingString() 1838 1839 /** The id of the next new annotation */ 1840 protected int nextAnnotationId = 0; 1841 1842 /** The id of the next new node */ 1843 protected int nextNodeId = 0; 1844 /** The source URL */ 1845 protected URL sourceUrl; 1846 1847 /** The document's URL name. 
*/ 1848 1849 /** The content of the document */ 1850 protected DocumentContent content; 1851 1852 /** The encoding of the source of the document content */ 1853 protected String encoding = "UTF-8"; 1854 1855 // Data needed in toXml(AnnotationSet) methos 1856 1857 /** This field indicates whether or not to add the tag 1858 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't 1859 * have this tag added 1860 */ 1861// private boolean addGatePreserveFormatTag = false; 1862 1863 /** This field indicates if an annotation is the doc's root tag. 1864 * It is needed when adding the namespace information 1865 */ 1866 private boolean isRootTag = false; 1867 1868 /** This field is used when creating StringBuffers for toXml() methods. 1869 * The size of the StringBuffer will be docDonctent.size() multiplied by this 1870 * value. It is aimed to improve the performance of StringBuffer 1871 */ 1872 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1; 1873 1874 /** Constant used in the inner class AnnotationComparator to order 1875 * annotations on their start offset 1876 */ 1877 private final int ORDER_ON_START_OFFSET = 0; 1878 /** Constant used in the inner class AnnotationComparator to order 1879 * annotations on their end offset 1880 */ 1881 private final int ORDER_ON_END_OFFSET = 1; 1882 /** Constant used in the inner class AnnotationComparator to order 1883 * annotations on their ID 1884 */ 1885 private final int ORDER_ON_ANNOT_ID = 2; 1886 /** Constant used in the inner class AnnotationComparator to order 1887 * annotations ascending 1888 */ 1889 private final int ASC = 3; 1890 /** Constant used in the inner class AnnotationComparator to order 1891 * annotations descending 1892 */ 1893 private final int DESC = -3; 1894 1895 /** A map initialized in init() containing entities that needs to be 1896 * replaced in strings 1897 */ 1898 private static Map entitiesMap = null; 1899 // Initialize the entities map use when saving as xml 1900 static{ 1901 
entitiesMap = new HashMap(); 1902 entitiesMap.put(new Character('<'),"<"); 1903 entitiesMap.put(new Character('>'),">"); 1904 entitiesMap.put(new Character('&'),"&"); 1905 entitiesMap.put(new Character('\''),"'"); 1906 entitiesMap.put(new Character('"'),"""); 1907 entitiesMap.put(new Character((char)160)," "); 1908 entitiesMap.put(new Character((char)169),"©"); 1909 }//static 1910 1911 /** The range that the content comes from at the source URL 1912 * (or null if none). 1913 */ 1914 //protected Long[] sourceUrlOffsets; 1915 1916 /** The start of the range that the content comes from at the source URL 1917 * (or null if none). 1918 */ 1919 protected Long sourceUrlStartOffset; 1920 1921 /** The end of the range that the content comes from at the source URL 1922 * (or null if none). 1923 */ 1924 protected Long sourceUrlEndOffset; 1925 1926 /** The default annotation set */ 1927 protected AnnotationSet defaultAnnots; 1928 1929 /** Named sets of annotations */ 1930 protected Map namedAnnotSets; 1931 1932 /** 1933 * A property of the document that will be set when the user 1934 * wants to create the document from a string, as opposed to from 1935 * a URL. 1936 */ 1937 private String stringContent; 1938 1939 /** 1940 * The stringContent of a document is 1941 * a property of the document that will be set when the user 1942 * wants to create the document from a string, as opposed to from 1943 * a URL. 1944 * <B>Use the <TT>getContent</TT> method instead to get the actual document 1945 * content.</B> 1946 */ 1947 public String getStringContent() { return stringContent; } 1948 1949 /** 1950 * The stringContent of a document is 1951 * a property of the document that will be set when the user 1952 * wants to create the document from a string, as opposed to from 1953 * a URL. 
1954 * <B>Use the <TT>setContent</TT> method instead to update the actual 1955 * document content.</B> 1956 */ 1957 public void setStringContent(String stringContent) { 1958 this.stringContent = stringContent; 1959 } // set StringContent 1960 1961 /** Is the document markup-aware? */ 1962 protected Boolean markupAware = new Boolean(false); 1963 1964 /** Check: test 2 objects for equality */ 1965 protected boolean check(Object a, Object b) { 1966 if( (a == null || b == null) ) 1967 return a == b; 1968 1969 return a.equals(b); 1970 } // check(a,b) 1971 1972 /** Equals */ 1973 public boolean equals(Object other) { 1974 if(other == null || 1975 !(other instanceof DocumentImpl))return false; 1976 DocumentImpl doc = (DocumentImpl) other; 1977 1978// PENDING EQUALS IMPLS 1979 if(! check(content, doc.content)) return false; 1980 if(! check(defaultAnnots, doc.defaultAnnots)) return false; 1981 if(! check(encoding, doc.encoding)) return false; 1982 if(! check(features, doc.features)) return false; 1983 if(!markupAware.equals(doc.markupAware)) return false; 1984 if(! check(namedAnnotSets, doc.namedAnnotSets)) return false; 1985 if(nextAnnotationId != doc.nextAnnotationId) return false; 1986 if(nextNodeId != doc.nextNodeId) return false; 1987 if(! check(sourceUrl, doc.sourceUrl)) return false; 1988 if(! check(sourceUrlStartOffset, doc.sourceUrlStartOffset)) return false; 1989 if(! check(sourceUrlEndOffset, doc.sourceUrlEndOffset)) return false; 1990 1991 return true; 1992 } // equals 1993 1994 /** Hash code */ 1995 public int hashCode() { 1996 int code = getContent().hashCode(); 1997 int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode(); 1998 code += memberCode; 1999 memberCode = (encoding == null) ? 0 : encoding.hashCode(); 2000 code += memberCode; 2001 memberCode = (features == null) ? 0 : features.hashCode(); 2002 code += memberCode; 2003 code += (markupAware.booleanValue()) ? 0 : 1; 2004 memberCode = (namedAnnotSets == null) ? 
0 : namedAnnotSets.hashCode(); 2005 code += memberCode; 2006 code += nextAnnotationId; 2007 code += nextNodeId; 2008 memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode(); 2009 code += memberCode; 2010 memberCode = 2011 (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode(); 2012 code += memberCode; 2013 memberCode = 2014 (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode(); 2015 code += memberCode; 2016 return code; 2017 } // hashcode 2018 2019 /** String respresentation */ 2020 public String toString() { 2021 String n = Strings.getNl(); 2022 StringBuffer s = new StringBuffer("DocumentImpl: " + n); 2023 s.append(" content:" + content + n); 2024 s.append(" defaultAnnots:" + defaultAnnots + n); 2025 s.append(" encoding:" + encoding + n); 2026 s.append(" features:" + features + n); 2027 s.append(" markupAware:" + markupAware + n); 2028 s.append(" namedAnnotSets:" + namedAnnotSets + n); 2029 s.append(" nextAnnotationId:" + nextAnnotationId + n); 2030 s.append(" nextNodeId:" + nextNodeId + n); 2031 s.append(" sourceUrl:" + sourceUrl + n); 2032 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n); 2033 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n); 2034 s.append(n); 2035 2036 return s.toString(); 2037 } // toString 2038 2039 /** Freeze the serialization UID. 
*/
  static final long serialVersionUID = -8456893608311510260L;

  /** Inner class used to sort annotations on their start offset, end offset
    * or ID, in ascending or descending order.
    */
  class AnnotationComparator implements java.util.Comparator {
    int orderOn = -1;
    int orderType = ASC;

    /** Constructs a comparator according to one of three sorter types:
      * ORDER_ON_ANNOT_ID, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
      * and one of the two order types: ASC, DESC.
      */
    public AnnotationComparator(int anOrderOn, int anOrderType){
      this.orderOn = anOrderOn;
      this.orderType = anOrderType;
    }// AnnotationComparator()

    /** Compares two annotations as required by the Comparator interface.
      * Ties on an offset are broken by the annotation IDs; an unknown
      * sorter type makes all annotations compare as equal.
      */
    public int compare(Object o1, Object o2){
      Annotation first = (Annotation) o1;
      Annotation second = (Annotation) o2;
      boolean ascending = (orderType == ASC);

      // ORDER_ON_START_OFFSET ?
      if (orderOn == ORDER_ON_START_OFFSET){
        int byStart = first.getStartNode().getOffset().compareTo(
                                          second.getStartNode().getOffset());
        // If the offsets are equal then the IDs will decide.
        int outcome = (byStart != 0) ?
                          byStart : first.getId().compareTo(second.getId());
        return ascending ? outcome : -outcome;
      }// End if (orderOn == ORDER_ON_START_OFFSET)

      // ORDER_ON_END_OFFSET ?
      if (orderOn == ORDER_ON_END_OFFSET){
        int byEnd = first.getEndNode().getOffset().compareTo(
                                            second.getEndNode().getOffset());
        // If the offsets are equal then the IDs will decide, in the
        // opposite direction to the requested order.
        int outcome = (byEnd != 0) ?
                          byEnd : - (first.getId().compareTo(second.getId()));
        return ascending ? outcome : -outcome;
      }// End if (orderOn == ORDER_ON_END_OFFSET)

      // ORDER_ON_ANNOT_ID ?
      if (orderOn == ORDER_ON_ANNOT_ID){
        int byId = first.getId().compareTo(second.getId());
        return ascending ? byId : -byId;
      }// End if

      return 0;
    }//compare()
  } // End inner class AnnotationComparator


  private transient Vector documentListeners;
  private transient Vector gateListeners;

  /** Unregisters a document listener. The listeners vector is replaced by
    * a modified copy rather than changed in place.
    */
  public synchronized void removeDocumentListener(DocumentListener l) {
    if (documentListeners == null || !documentListeners.contains(l)) {
      return;
    }
    Vector updated = (Vector) documentListeners.clone();
    updated.removeElement(l);
    documentListeners = updated;
  }

  /** Registers a document listener, unless it is already registered. The
    * listeners vector is replaced by a modified copy rather than changed
    * in place.
    */
  public synchronized void addDocumentListener(DocumentListener l) {
    Vector updated;
    if (documentListeners == null) {
      updated = new Vector(2);
    } else {
      updated = (Vector) documentListeners.clone();
    }
    if (updated.contains(l)) {
      return;
    }
    updated.addElement(l);
    documentListeners = updated;
  }

  /** Passes an "annotation set added" event to every document listener. */
  protected void fireAnnotationSetAdded(DocumentEvent e) {
    if (documentListeners == null) {
      return;
    }
    Vector listeners = documentListeners;
    int count = listeners.size();
    int index = 0;
    while (index < count) {
      DocumentListener listener = (DocumentListener) listeners.elementAt(index);
      listener.annotationSetAdded(e);
      index++;
    }
  }

  /** Passes an "annotation set removed" event to every document listener. */
  protected void fireAnnotationSetRemoved(DocumentEvent e) {
    if (documentListeners == null) {
      return;
    }
    Vector listeners = documentListeners;
    int count = listeners.size();
    int index = 0;
    while (index < count) {
      DocumentListener listener = (DocumentListener) listeners.elementAt(index);
      listener.annotationSetRemoved(e);
      index++;
    }
  }

  /** Ignored. */
  public void resourceLoaded(CreoleEvent e) {
  }

  /** Ignored. */
  public void resourceUnloaded(CreoleEvent e) {
  }

  /** Ignored. */
  public void datastoreOpened(CreoleEvent e) {
  }

  /** Ignored. */
  public void datastoreCreated(CreoleEvent e) {
  }

  /** Ignored. */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }

  /** Deletes this document when the datastore being closed is its own. */
  public void datastoreClosed(CreoleEvent e) {
    //close this lr, since it cannot stay open when the DS it comes from
    //is closed
    if (e.getDatastore().equals(getDataStore())) {
      Factory.deleteResource(this);
    }
  }

  /** Sets the persistence ID and registers this document as a listener of
    * the creole register.
    */
  public void setLRPersistenceId(Object lrID) {
    super.setLRPersistenceId(lrID);
    //make persistent documents listen to the creole register
    //for events about their DS
    Gate.getCreoleRegister().addCreoleListener(this);
  }

  /** Ignored. */
  public void resourceAdopted(DatastoreEvent evt) {
  }

  /** Deletes this document when it is the resource that was deleted from
    * its own datastore.
    */
  public void resourceDeleted(DatastoreEvent evt) {
    //if an open document is deleted from a DS, then
    //it must close itself immediately, as is no longer valid
    if (evt.getSource().equals(getDataStore()) &&
        evt.getResourceID().equals(getLRPersistenceId())) {
      Factory.deleteResource(this);
    }
  }

  /** Ignored. */
  public void resourceWritten(DatastoreEvent evt) {
  }

  /** Sets the datastore and, if the resulting datastore is not null,
    * registers this document as one of its listeners.
    */
  public void setDataStore(DataStore dataStore)
                                throws gate.persist.PersistenceException {
    super.setDataStore(dataStore);
    if (this.dataStore == null) {
      return;
    }
    this.dataStore.addDatastoreListener(this);
  }

} // class DocumentImpl
|
DocumentImpl |
|