|
SerialCorpusImpl |
|
/*
 *  SerialCorpusImpl.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 19/Oct/2001
 *
 *  $Id: SerialCorpusImpl.java,v 1.26 2002/03/07 16:59:42 kalina Exp $
 */

package gate.corpora;

import java.util.*;

import gate.*;
import gate.util.*;
import gate.annotation.*;
import gate.persist.*;
import java.io.*;
import java.net.*;
import gate.event.*;
import gate.creole.*;
import gate.security.SecurityException;

//The initial design was to implement this on the basis of a WeakValueHashMap.
//However this creates problems, because the user might e.g., add a transient
//document to the corpus and then if the Document variable goes out of scope
//before sync() is called, nothing will be saved of the new document. Bad!
//Instead, to cope with the unloading for memory saving use, I implemented
//a documentUnload() method, which sets the in-memory copy to null but can
//always restore the doc, because it has its persistence ID.

/**
 * A corpus that keeps, for every document, a {@link DocumentData} record
 * (name + persistent ID) in <tt>docDataList</tt> and, in parallel, the
 * in-memory document (or <tt>null</tt> when it is not loaded) at the same
 * index in <tt>documents</tt>. Documents that are not in memory are
 * re-created from the datastore on demand by {@link #get(int)}.
 */
public class SerialCorpusImpl extends
          AbstractLanguageResource
                      implements Corpus, CreoleListener, DatastoreListener {

  /** Debug flag */
  private static final boolean DEBUG = false;

  static final long serialVersionUID = 3632609241787241616L;

  /** Registered corpus listeners; replaced (copy-on-write) on every change. */
  protected transient Vector corpusListeners;

  /** One DocumentData per document; defines the document order. */
  protected java.util.List docDataList = null;

  //here I keep the Documents, at the same index as in docDataList
  //(which defines the document order); null means "not loaded"
  protected transient List documents = null;

  public SerialCorpusImpl() {
  }

  /**
   * Constructor to create a SerialCorpus from a transient one.
   * Copies the name, the features, the document names and the documents
   * of the given corpus and registers this corpus as a creole listener.
   *
   * @param tCorpus the transient corpus to copy from
   */
  protected SerialCorpusImpl(Corpus tCorpus){
    copyFromTransientCorpus(tCorpus);
  }

  /**
   * Shared initialisation used by the constructor and by
   * {@link #setTransientSource(Object)}: copies name, features, document
   * names and documents from <tt>tCorpus</tt> and starts listening to the
   * creole register.
   */
  private void copyFromTransientCorpus(Corpus tCorpus) {
    //copy the corpus name and features from the one in memory
    this.setName(tCorpus.getName());
    this.setFeatures(tCorpus.getFeatures());

    docDataList = new ArrayList();
    //now cache the names of all docs for future use; no persistent IDs yet
    Iterator iter = tCorpus.getDocumentNames().iterator();
    while (iter.hasNext())
      docDataList.add(new DocumentData((String) iter.next(), null));

    //copy all the documents from the transient corpus
    documents = new ArrayList();
    documents.addAll(tCorpus);

    //make sure we fire events when docs are added/removed/etc
    Gate.getCreoleRegister().addCreoleListener(this);
  }

  /**
   * Null-safe equality: true when both are the same reference (including
   * both null) or when <tt>a</tt> is non-null and <tt>a.equals(b)</tt>.
   */
  private static boolean safeEquals(Object a, Object b) {
    return a == b || (a != null && a.equals(b));
  }

  /**
   * Gets the names of the documents in this corpus.
   * @return a {@link List} of Strings representing the names of the documents
   * in this corpus; an empty list when there is no document data.
   */
  public List getDocumentNames(){
    List docsNames = new ArrayList();
    if(docDataList == null)
      return docsNames;
    Iterator iter = docDataList.iterator();
    while (iter.hasNext()) {
      DocumentData data = (DocumentData) iter.next();
      docsNames.add(data.getDocumentName());
    }
    return docsNames;
  }

  /**
   * This method should only be used by the Serial Datastore to set
   * the persistent ID of the document at the given index.
   * Does nothing when <tt>index</tt> is past the end of the corpus.
   *
   * @param index the index of the document
   * @param persID the persistent ID to record for that document
   */
  public void setDocumentPersistentID(int index, Object persID){
    if (index >= docDataList.size()) return;
    ((DocumentData)docDataList.get(index)).setPersistentID(persID);
    if (DEBUG) Out.prln("IDs are now: " + docDataList);
  }

  /**
   * Gets the name of a document in this corpus.
   * @param index the index of the document
   * @return a String value representing the name of the document at
   * <tt>index</tt> in this corpus, or the literal "No such document" when
   * <tt>index</tt> is past the end of the corpus.<P>
   */
  public String getDocumentName(int index){
    if (index >= docDataList.size()) return "No such document";

    return ((DocumentData) docDataList.get(index)).getDocumentName();
  }

  /**
   * Unloads the document from memory, but calls sync() first, to store the
   * changes. A document that is not in memory is left alone.
   *
   * @param index the index of the document to unload
   * @throws GateRuntimeException if adopting or syncing the document fails
   */
  public void unloadDocument(int index) {
    //1. if the document is not in memory there is nothing to sync or release
    //(previously a non-loaded, non-persistent entry caused a
    //NullPointerException below)
    if (! isDocumentLoaded(index))
      return;

    //2. sync the document before releasing it from memory, because the
    //creole register garbage collects all LRs which are not used any more
    Document doc = (Document) documents.get(index);
    try {
      //if the document is not already adopted, we need to do that first
      if (doc.getLRPersistenceId() == null) {
        doc = (Document) this.getDataStore().adopt(doc, null);
        this.getDataStore().sync(doc);
        this.setDocumentPersistentID(index, doc.getLRPersistenceId());
      } else //if it is adopted, just sync it
        this.getDataStore().sync(doc);

      //3. remove the document from the memory
      //do this, only if the saving has succeeded
      documents.set(index, null);

    } catch (PersistenceException ex) {
      throw new GateRuntimeException("Error unloading document from corpus "
                      + "because document sync failed: " + ex.getMessage());
    } catch (gate.security.SecurityException ex1) {
      throw new GateRuntimeException("Error unloading document from corpus "
                      + "because of document access error: " + ex1.getMessage());
    }

  }

  /**
   * Unloads a document from memory. Does nothing if the document cannot be
   * found in this corpus.
   *
   * @param doc the document to unload
   */
  public void unloadDocument(Document doc) {
    if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
    //1. determine the index of the document; if not there, do nothing
    int index = findDocument(doc);
    if (index == -1)
      return;
    if (DEBUG) Out.prln("Index of doc: " + index);
    if (DEBUG) Out.prln("Size of corpus: " + documents.size());
    unloadDocument(index);
  }

  /**
   * This method returns true when the document is already loaded in memory
   *
   * @param index the index of the document
   */
  public boolean isDocumentLoaded(int index) {
    if (documents == null || documents.isEmpty()) return false;
    return documents.get(index) != null;
  }

  /**
   * This method returns true when the document is already stored on disk
   * i.e., is not transient (its DocumentData carries a persistent ID).
   *
   * @param index the index of the document
   */
  public boolean isPersistentDocument(int index) {
    if (documents == null || documents.isEmpty() || docDataList == null)
      return false;
    return (((DocumentData)docDataList.get(index)).getPersistentID() != null);
  }

  /**
   * Every LR that is a CreoleListener (and other Listeners too) must
   * override this method and make sure it removes itself from the
   * objects which it has been listening to. Otherwise, the object will
   * not be released from memory (memory leak!).
   */
  public void cleanup() {
    if (DEBUG) Out.prln("serial corpus cleanup called");
    corpusListeners = null;
    if (documents != null)
      documents.clear();
    //docDataList is null on an instance that was never initialised
    if (docDataList != null)
      docDataList.clear();
    Gate.getCreoleRegister().removeCreoleListener(this);
    if (this.dataStore != null) {
      this.dataStore.removeDatastoreListener(this);
    }
  }

  /**
   * Fills this corpus with documents created from files in a directory.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param directory the directory from which the files will be picked. This
   * parameter is an URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * An implementation for this method is provided as a static method at
   * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,String,boolean)}.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively?. If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter otherwise the children directories will be ignored.
   */
  public void populate(URL directory, FileFilter filter, String encoding,
                       boolean recurseDirectories)
              throws IOException, ResourceInstantiationException{
    CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
  }


  /**
   * Unregisters a corpus listener. The listener vector is replaced by a
   * modified copy, so a vector already picked up by a fire method is not
   * changed underneath it.
   */
  public synchronized void removeCorpusListener(CorpusListener l) {
    if (corpusListeners != null && corpusListeners.contains(l)) {
      Vector v = (Vector) corpusListeners.clone();
      v.removeElement(l);
      corpusListeners = v;
    }
  }

  /**
   * Registers a corpus listener (ignored if already registered). The
   * listener vector is replaced by a modified copy.
   */
  public synchronized void addCorpusListener(CorpusListener l) {
    Vector v = corpusListeners == null
               ? new Vector(2) : (Vector) corpusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      corpusListeners = v;
    }
  }

  /** Notifies all registered listeners that a document has been added. */
  protected void fireDocumentAdded(CorpusEvent e) {
    if (corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
      }
    }
  }

  /** Notifies all registered listeners that a document has been removed. */
  protected void fireDocumentRemoved(CorpusEvent e) {
    if (corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
      }
    }
  }

  public void resourceLoaded(CreoleEvent e) {
  }

  public void resourceRenamed(Resource resource, String oldName,
                              String newName){}

  /**
   * When a document is unloaded from the system: if it belongs to a
   * different datastore than this corpus it is removed from the corpus,
   * otherwise its in-memory slot is set to null so that it will be
   * re-loaded on the next {@link #get(int)}.
   */
  public void resourceUnloaded(CreoleEvent e) {
    Resource res = e.getResource();
    if (res instanceof Document) {
      Document doc = (Document) res;
      if (DEBUG)
        Out.prln("resource Unloaded called ");
      //remove from the corpus too, if a transient one
      if (doc.getDataStore() != this.getDataStore()) {
        this.remove(doc);
      } else {
        //unload the occurrence found by indexOf
        int index = indexOf(res);
        if (index < 0)
          return;
        documents.set(index, null);
        if (DEBUG)
          Out.prln("corpus: document "+ index + " unloaded and set to null");
      } //if
    }
  }

  public void datastoreOpened(CreoleEvent e) {
  }

  public void datastoreCreated(CreoleEvent e) {
  }

  /**
   * If the closed datastore is the one of this corpus, stops listening to
   * it and deletes this corpus resource.
   */
  public void datastoreClosed(CreoleEvent e) {
    if (! e.getDatastore().equals(this.getDataStore()))
      return;
    if (this.getDataStore() != null)
      this.getDataStore().removeDatastoreListener(this);
    //close this corpus, since it cannot stay open when the DS it comes from
    //is closed
    Factory.deleteResource(this);
  }

  /**
   * Called by a datastore when a new resource has been adopted
   */
  public void resourceAdopted(DatastoreEvent evt){
  }

  /**
   * Called by a datastore when a resource has been deleted.
   * If the deleted resource is this corpus, the corpus is deleted from the
   * system; otherwise every entry whose persistent ID matches is removed
   * and, if anything was removed, the corpus is synced to its datastore.
   *
   * @throws GateRuntimeException if syncing the corpus fails
   */
  public void resourceDeleted(DatastoreEvent evt){
    DataStore ds = (DataStore)evt.getSource();
    //1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore))
      return;

    Object docID = evt.getResourceID();
    if (docID == null)
      return;

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    //first check if it is this corpus that's been deleted, it must be
    //unloaded immediately
    if (docID.equals(this.getLRPersistenceId())) {
      Factory.deleteResource(this);
      return;
    }//if

    boolean isDirty=false;
    //the problem here is that I only have the doc persistent ID
    //and nothing else, so I need to determine the index of the doc first
    for (int i=0; i< docDataList.size(); i++) {
      DocumentData docData = (DocumentData)docDataList.get(i);
      //we've found the correct document
      //don't break the loop, because it might appear more than once
      if (docID.equals(docData.getPersistentID())) {
        remove(i);
        //the following entries have shifted down by one, so stay on the
        //same index; otherwise the entry right after a removed one is skipped
        i--;
        isDirty = true;
      }//if
    }//for loop through the doc data

    if (isDirty)
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
  }//resourceDeleted

  /**
   * Called by a datastore when a resource has been written into the datastore
   */
  public void resourceWritten(DatastoreEvent evt){
  }



  //List methods
  //java docs will be automatically copied from the List interface.

  public int size() {
    return docDataList.size();
  }

  public boolean isEmpty() {
    return docDataList.isEmpty();
  }

  public boolean contains(Object o){
    //return true if findDocument() can locate the document, i.e. it is
    //in memory or the document data list has an entry with the same name
    //and persistent id

    if(! (o instanceof Document))
      return false;

    return findDocument((Document) o) >= 0;
  }

  /**
   * Iterates over the documents in corpus order, loading each one through
   * {@link #get(int)}. Removal through the iterator is not supported.
   */
  public Iterator iterator(){
    return new Iterator(){
      //used for hasNext() and to inherit the underlying list's
      //end-of-list / concurrent-modification behaviour
      Iterator docDataIter = docDataList.iterator();
      //position of the element the next call to next() will return;
      //tracked directly instead of searching docDataList for each element
      int nextIndex = 0;

      public boolean hasNext() {
        return docDataIter.hasNext();
      }

      public Object next(){
        docDataIter.next();
        int index = nextIndex++;
        return SerialCorpusImpl.this.get(index);
      }

      public void remove() {
        throw new UnsupportedOperationException("SerialCorpusImpl does not " +
                    "support remove in the iterators");
      }
    }; //return

  }//iterator

  public String toString() {
    return "document data " + docDataList + " documents " + documents;
  }

  public Object[] toArray(){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray() is not implemented for SerialCorpusImpl");
  }

  public Object[] toArray(Object[] a){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray(Object[] a) is not implemented for SerialCorpusImpl");
  }

  /**
   * Appends a document to the corpus and fires a DOCUMENT_ADDED event.
   *
   * @param o the document to add
   * @return false if <tt>o</tt> is not a Document or belongs to a datastore
   * other than the one of this corpus; otherwise the result of adding its
   * DocumentData to the document data list
   */
  public boolean add(Object o){
    //instanceof is false for null, so no separate null test is needed
    if (! (o instanceof Document))
      return false;
    Document doc = (Document) o;

    //make it accept only docs from its own datastore
    //(compare from the document's side: this.dataStore may be null)
    if (doc.getDataStore() != null
        && !doc.getDataStore().equals(this.dataStore)) {
      Err.prln("Error: Persistent corpus can only accept documents " +
               "from its own datastore!");
      return false;
    }//if

    //add the document with its index in the docDataList
    //in this case, since it's going to be added to the end
    //the index will be the size of the docDataList before
    //the addition
    DocumentData docData = new DocumentData(doc.getName(),
                                            doc.getLRPersistenceId());
    boolean result = docDataList.add(docData);
    documents.add(doc);
    fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                      doc,
                                      docDataList.size()-1,
                                      CorpusEvent.DOCUMENT_ADDED));

    return result;
  }

  /**
   * Removes a document from the corpus and fires a DOCUMENT_REMOVED event.
   *
   * @param o the document to remove
   * @return false if <tt>o</tt> is not a Document or is not found in this
   * corpus, true otherwise
   */
  public boolean remove(Object o){
    if (DEBUG) Out.prln("SerialCorpus:Remove object called");
    if (! (o instanceof Document))
      return false;
    Document doc = (Document) o;

    //see if we can find it first. If not, then just return
    int index = findDocument(doc);
    if (index == -1)
      return false;

    if(index < docDataList.size()) { //we found it, so remove it
      docDataList.remove(index);
      Document oldDoc =  (Document) documents.remove(index);
      //oldDoc is null when the document was not loaded in memory
      if (DEBUG) Out.prln("documents after remove of "
                          + (oldDoc == null ? null : oldDoc.getName())
                          + " are " + documents);
      fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                          oldDoc,
                                          index,
                                          CorpusEvent.DOCUMENT_REMOVED));
    }

    return true;
  }

  /**
   * Finds the index of a document in this corpus. The in-memory documents
   * are searched first; failing that, the document data list is searched
   * for an entry with the same name and the same (non-null) persistent ID.
   *
   * @param doc the document to look for
   * @return the index of the document, or -1 if it is null or not found
   */
  public int findDocument(Document doc) {
    //a null document must not match the null slots of unloaded documents
    if (doc == null || docDataList == null)
      return -1;

    //first try finding the document in memory
    if (documents != null) {
      int memIndex = documents.indexOf(doc);
      if (memIndex > -1 && memIndex < docDataList.size())
        return memIndex;
    }

    //else try finding a document with the same name and persistent ID
    String docName = doc.getName();
    Object docId = doc.getLRPersistenceId();
    for (int i = 0; i < docDataList.size(); i++) {
      DocumentData docData = (DocumentData) docDataList.get(i);
      Object persId = docData.getPersistentID();
      //entries without a persistent ID (transient docs) can only be
      //matched in memory above; skipping them also avoids dereferencing null
      if (persId != null && persId.equals(docId)
          && safeEquals(docData.getDocumentName(), docName))
        return i;
    }
    return -1;
  }//findDocument

  public boolean containsAll(Collection c){
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! contains(iter.next()))
        return false;
    }
    return true;
  }

  /**
   * Adds every element of <tt>c</tt> via {@link #add(Object)}.
   * @return true only if every element was added
   */
  public boolean addAll(Collection c){
    boolean allAdded = true;
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! add(iter.next()))
        allAdded = false;
    }
    return allAdded;
  }

  public boolean addAll(int index, Collection c){
    throw new UnsupportedOperationException();
  }

  /**
   * Removes every element of <tt>c</tt> via {@link #remove(Object)}.
   * @return true only if every element was removed
   */
  public boolean removeAll(Collection c){
    boolean allRemoved = true;
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! remove(iter.next()))
        allRemoved = false;
    }
    return allRemoved;

  }

  public boolean retainAll(Collection c){
    throw new UnsupportedOperationException();
  }

  /** Empties both parallel lists. No corpus events are fired. */
  public void clear(){
    documents.clear();
    docDataList.clear();
  }

  /**
   * Two serial corpora are equal when their persistent IDs, names,
   * datastores and document data lists are all equal (nulls compare equal
   * to nulls).
   */
  public boolean equals(Object o){
    if (o == this)
      return true;
    if (! (o instanceof SerialCorpusImpl))
      return false;
    SerialCorpusImpl oCorpus = (SerialCorpusImpl) o;
    return safeEquals(this.lrPersistentId, oCorpus.lrPersistentId)
           && safeEquals(oCorpus.name, this.name)
           && safeEquals(oCorpus.dataStore, this.dataStore)
           && safeEquals(oCorpus.docDataList, this.docDataList);
  }

  public int hashCode(){
    //docDataList is part of equals(), so equal corpora get equal hashes
    return docDataList == null ? 0 : docDataList.hashCode();
  }

  /**
   * Returns the document at <tt>index</tt>, loading it from the datastore
   * (and caching it in memory) if it is not currently loaded.
   *
   * @param index the index of the document
   * @return the document, or null when <tt>index</tt> is past the end of
   * the corpus
   * @throws GateRuntimeException if the document cannot be instantiated
   */
  public Object get(int index){
    if (index >= docDataList.size())
      return null;

    Object res = documents.get(index);

    if (DEBUG)
      Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    //if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        features.put(DataStore.LR_ID_FEATURE_NAME,
                    ((DocumentData)docDataList.get(index)).getPersistentID());
        Resource lr = Factory.createResource( "gate.corpora.DocumentImpl",
                                              features);
        if (DEBUG)
          Out.prln("Loaded document :" + lr.getName());
        //change the result to the newly loaded doc
        res = lr;

        //finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex.getMessage());
      }
    }

    return res;
  }

  /**
   * Not implemented. (An implementation would have to fire a
   * DOCUMENT_REMOVED event for the old document and a DOCUMENT_ADDED event
   * for the new one.)
   */
  public Object set(int index, Object element){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * Inserts a document at the given position and fires a DOCUMENT_ADDED
   * event. Silently ignores anything that is not a Document.
   */
  public void add(int index, Object o){
    //instanceof is false for null, so no separate null test is needed
    if (! (o instanceof Document))
      return;
    Document doc = (Document) o;

    //NOTE(review): unlike add(Object), this does not check that the
    //document comes from this corpus' datastore - confirm this is intended
    DocumentData docData = new DocumentData(doc.getName(),
                                            doc.getLRPersistenceId());
    docDataList.add(index, docData);

    documents.add(index, doc);
    fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                      doc,
                                      index,
                                      CorpusEvent.DOCUMENT_ADDED));

  }

  /**
   * Removes the document at <tt>index</tt> and fires a DOCUMENT_REMOVED
   * event.
   *
   * @return the removed in-memory document; null if it was not loaded
   */
  public Object remove(int index){
    if (DEBUG) Out.prln("Remove index called");
    docDataList.remove(index);
    Document res = (Document) documents.remove(index);
    fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                        res,
                                        index,
                                        CorpusEvent.DOCUMENT_REMOVED));
    return res;

  }

  public int indexOf(Object o){
    if (o instanceof Document)
      return findDocument((Document) o);

    return -1;
  }

  public int lastIndexOf(Object o){
    throw new gate.util.MethodNotImplementedException();
  }

  public ListIterator listIterator(){
    throw new gate.util.MethodNotImplementedException();
  }

  public ListIterator listIterator(int index){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * persistent Corpus does not support this method as all
   * the documents might not be in memory
   */
  public List subList(int fromIndex, int toIndex){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * Sets the datastore and, if it is not null, registers this corpus as a
   * listener on it.
   */
  public void setDataStore(DataStore dataStore)
                throws gate.persist.PersistenceException {
    super.setDataStore( dataStore);
    if (this.dataStore != null)
      this.dataStore.addDatastoreListener(this);
  }

  /**
   * Initialises this corpus from a transient one. Ignored when
   * <tt>source</tt> is not a Corpus, or when this corpus already has both
   * a datastore and a persistent ID.
   */
  public void setTransientSource(Object source) {
    if (! (source instanceof Corpus))
      return;

    //the following initialisation is only valid when we're constructing
    //this object from a transient one. If it has already been stored in
    //a datastore, then the initialisation is done in readObject() since
    //this method is the one called by serialisation, when objects
    //are restored.
    if (this.dataStore != null && this.lrPersistentId != null)
      return;

    copyFromTransientCorpus((Corpus) source);
  }

  //we don't keep the transient source, so always return null
  //Still this must be implemented, because of the GUI and Factory
  public Object getTransientSource() {
    return null;
  }


  public Resource init() throws gate.creole.ResourceInstantiationException {
    super.init();

    return this;

  }


  /**
   * readObject - calls the default readObject() and then initialises the
   * transient data: one null (not loaded) slot per document, an empty
   * listener vector, and the creole / datastore listener registrations.
   *
   * @serialData Read serializable fields. No optional data read.
   */
  private void readObject(ObjectInputStream s)
      throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    documents = new ArrayList(docDataList.size());
    for (int i = 0; i < docDataList.size(); i++)
      documents.add(null);
    corpusListeners = new Vector();
    //finally set the creole listeners if the LR is like that
    Gate.getCreoleRegister().addCreoleListener(this);
    if (this.dataStore != null)
      this.dataStore.addDatastoreListener(this);

  }//readObject
}
|
SerialCorpusImpl |
|