1   /*
2    *  SerialCorpusImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 19/Oct/2001
12   *
13   *  $Id: SerialCorpusImpl.java,v 1.26 2002/03/07 16:59:42 kalina Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  
20  import gate.*;
21  import gate.util.*;
22  import gate.annotation.*;
23  import gate.persist.*;
24  import java.io.*;
25  import java.net.*;
26  import gate.event.*;
27  import gate.creole.*;
28  import gate.security.SecurityException;
29  
//The initial design was to implement this on the basis of a WeakValueHashMap.
//However, this creates problems: the user might, e.g., add a transient
//document to the corpus, and if the Document variable then goes out of scope
//before sync() is called, nothing of the new document will be saved. Bad!
//Instead, to allow documents to be unloaded in order to save memory, I
//implemented an unloadDocument() method, which sets the in-memory copy to
//null but can always restore the doc, because it has its persistence ID.
37  
38  public class SerialCorpusImpl extends
39            AbstractLanguageResource
40                        implements Corpus, CreoleListener, DatastoreListener {
41  
  /** Debug flag; guards the diagnostic Out.prln calls in this class. */
  private static final boolean DEBUG = false;

  /** Explicit serialisation version of this class. */
  static final long serialVersionUID = 3632609241787241616L;

  /** The registered corpus listeners; transient, so never serialised. */
  protected transient Vector corpusListeners;

  /**
   * One DocumentData entry (document name and persistent ID) per document.
   * The position of an entry in this list defines the document order.
   */
  protected java.util.List docDataList = null;

  //the in-memory documents, kept at the same index as their entry in
  //docDataList; an element is null while the corresponding document is
  //not loaded in memory
  protected transient List documents = null;
53  
  /**
   * Creates an uninitialised serial corpus: neither docDataList nor
   * documents is set up here. In this class they are initialised by
   * setTransientSource(Object) or by readObject(ObjectInputStream).
   */
  public SerialCorpusImpl() {
  }
56  
57    /**
58     * Constructor to create a SerialCorpus from a transient one.
59     * This is called by adopt() to store the transient corpus
60     * and re-route the methods calls to it, until the corpus is
61     * sync-ed on disk. After that, the transientCorpus will always
62     * be null, so the new functionality will be used instead.
63     */
64    protected SerialCorpusImpl(Corpus tCorpus){
65      //copy the corpus name and features from the one in memory
66      this.setName(tCorpus.getName());
67      this.setFeatures(tCorpus.getFeatures());
68  
69      docDataList = new ArrayList();
70      //now cache the names of all docs for future use
71      Iterator iter = tCorpus.getDocumentNames().iterator();
72      while (iter.hasNext())
73        docDataList.add(new DocumentData((String) iter.next(), null));
74  
75      //copy all the documents from the transient corpus
76      documents = new ArrayList();
77      documents.addAll(tCorpus);
78  
79      //make sure we fire events when docs are added/removed/etc
80      Gate.getCreoleRegister().addCreoleListener(this);
81    }
82  
83    /**
84     * Gets the names of the documents in this corpus.
85     * @return a {@link List} of Strings representing the names of the documents
86     * in this corpus.
87     */
88    public List getDocumentNames(){
89      List docsNames = new ArrayList();
90      if(docDataList == null)
91        return docsNames;
92      Iterator iter = docDataList.iterator();
93      while (iter.hasNext()) {
94        DocumentData data = (DocumentData) iter.next();
95        docsNames.add(data.getDocumentName());
96      }
97      return docsNames;
98    }
99  
100   /**
101    * This method should only be used by the Serial Datastore to set
102    */
103   public void setDocumentPersistentID(int index, Object persID){
104     if (index >= docDataList.size()) return;
105     ((DocumentData)docDataList.get(index)).setPersistentID(persID);
106     if (DEBUG) Out.prln("IDs are now: " + docDataList);
107   }
108 
109   /**
110    * Gets the name of a document in this corpus.
111    * @param index the index of the document
112    * @return a String value representing the name of the document at
113    * <tt>index</tt> in this corpus.<P>
114    */
115   public String getDocumentName(int index){
116     if (index >= docDataList.size()) return "No such document";
117 
118     return ((DocumentData) docDataList.get(index)).getDocumentName();
119   }
120 
121   /**
122    * Unloads the document from memory, but calls sync() first, to store the
123    * changes
124    */
125   public void unloadDocument(int index) {
126     //1. check whether its been loaded and is a persistent one
127     // if a persistent doc is not loaded, there's nothing we need to do
128     if ( (! isDocumentLoaded(index)) && isPersistentDocument(index))
129       return;
130 
131     //2. sync the document before releasing it from memory, because the
132     //creole register garbage collects all LRs which are not used any more
133     Document doc = (Document) documents.get(index);
134     try {
135       //if the document is not already adopted, we need to do that first
136       if (doc.getLRPersistenceId() == null) {
137         doc = (Document) this.getDataStore().adopt(doc, null);
138         this.getDataStore().sync(doc);
139         this.setDocumentPersistentID(index, doc.getLRPersistenceId());
140       } else //if it is adopted, just sync it
141         this.getDataStore().sync(doc);
142 
143       //3. remove the document from the memory
144       //do this, only if the saving has succeeded
145       documents.set(index, null);
146 
147     } catch (PersistenceException ex) {
148         throw new GateRuntimeException("Error unloading document from corpus"
149                       + "because document sync failed: " + ex.getMessage());
150     } catch (gate.security.SecurityException ex1) {
151         throw new GateRuntimeException("Error unloading document from corpus"
152                       + "because of document access error: " + ex1.getMessage());
153     }
154 
155   }
156 
157   /**
158    * Unloads a document from memory
159    */
160   public void unloadDocument(Document doc) {
161     if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
162     //1. determine the index of the document; if not there, do nothing
163     int index = findDocument(doc);
164     if (index == -1)
165       return;
166     if (DEBUG) Out.prln("Index of doc: " + index);
167     if (DEBUG) Out.prln("Size of corpus: " + documents.size());
168     unloadDocument(index);
169 //    documents.remove(new Integer(index));
170   }
171 
172   /**
173    * This method returns true when the document is already loaded in memory
174    */
175   public boolean isDocumentLoaded(int index) {
176     if (documents == null || documents.isEmpty()) return false;
177     return documents.get(index) != null;
178   }
179 
180   /**
181    * This method returns true when the document is already stored on disk
182    * i.e., is not transient
183    */
184   public boolean isPersistentDocument(int index) {
185     if (documents == null || documents.isEmpty()) return false;
186     return (((DocumentData)docDataList.get(index)).getPersistentID() != null);
187   }
188 
189   /**
190    * Every LR that is a CreoleListener (and other Listeners too) must
191    * override this method and make sure it removes itself from the
192    * objects which it has been listening to. Otherwise, the object will
193    * not be released from memory (memory leak!).
194    */
195   public void cleanup() {
196     if (DEBUG) Out.prln("serial corpus cleanup called");
197     if (corpusListeners != null)
198       corpusListeners = null;
199     if (documents != null)
200       documents.clear();
201     docDataList.clear();
202     Gate.getCreoleRegister().removeCreoleListener(this);
203     if (this.dataStore != null) {
204       this.dataStore.removeDatastoreListener(this);
205     }
206   }
207 
  /**
   * Fills this corpus with documents created from files in a directory.
   * The work is delegated to the static <tt>populate</tt> method of
   * {@link gate.corpora.CorpusImpl}, which is given this corpus followed by
   * all the arguments of this call.
   * @param directory the directory from which the files will be picked. This
   * parameter is an URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively?. If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter otherwise the children directories will be ignored.
   * @throws IOException passed on from the delegate (presumably when the
   * files cannot be read - to be confirmed against CorpusImpl)
   * @throws ResourceInstantiationException passed on from the delegate
   * (presumably when a document cannot be created - to be confirmed)
   */
  public void populate(URL directory, FileFilter filter, String encoding,
                       boolean recurseDirectories)
              throws IOException, ResourceInstantiationException{
    CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
  }
228 
229 
230   public synchronized void removeCorpusListener(CorpusListener l) {
231     if (corpusListeners != null && corpusListeners.contains(l)) {
232       Vector v = (Vector) corpusListeners.clone();
233       v.removeElement(l);
234       corpusListeners = v;
235     }
236   }
237   public synchronized void addCorpusListener(CorpusListener l) {
238     Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone();
239     if (!v.contains(l)) {
240       v.addElement(l);
241       corpusListeners = v;
242     }
243   }
244   protected void fireDocumentAdded(CorpusEvent e) {
245     if (corpusListeners != null) {
246       Vector listeners = corpusListeners;
247       int count = listeners.size();
248       for (int i = 0; i < count; i++) {
249         ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
250       }
251     }
252   }
253   protected void fireDocumentRemoved(CorpusEvent e) {
254     if (corpusListeners != null) {
255       Vector listeners = corpusListeners;
256       int count = listeners.size();
257       for (int i = 0; i < count; i++) {
258         ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
259       }
260     }
261   }
  /**
   * Notification that a resource has been loaded. This corpus does not
   * react to it.
   */
  public void resourceLoaded(CreoleEvent e) {
  }
264 
  /**
   * Notification that a resource has been renamed. This corpus does not
   * react to it.
   */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){}
267 
268   public void resourceUnloaded(CreoleEvent e) {
269     Resource res = e.getResource();
270     if (res instanceof Document) {
271       Document doc = (Document) res;
272       if (DEBUG)
273         Out.prln("resource Unloaded called ");
274       //remove from the corpus too, if a transient one
275       if (doc.getDataStore() != this.getDataStore()) {
276         this.remove(doc);
277       } else {
278         //unload all occurences
279         int index = indexOf(res);
280         if (index < 0)
281           return;
282         documents.set(index, null);
283         if (DEBUG)
284           Out.prln("corpus: document "+ index + " unloaded and set to null");
285       } //if
286     }
287   }
  /**
   * Notification that a datastore has been opened. This corpus does not
   * react to it.
   */
  public void datastoreOpened(CreoleEvent e) {
  }
  /**
   * Notification that a datastore has been created. This corpus does not
   * react to it.
   */
  public void datastoreCreated(CreoleEvent e) {
  }
292   public void datastoreClosed(CreoleEvent e) {
293     if (! e.getDatastore().equals(this.getDataStore()))
294       return;
295     if (this.getDataStore() != null)
296       this.getDataStore().removeDatastoreListener(this);
297     //close this corpus, since it cannot stay open when the DS it comes from
298     //is closed
299     Factory.deleteResource(this);
300   }
  /**
   * Called by a datastore when a new resource has been adopted. This
   * corpus does not react to it.
   */
  public void resourceAdopted(DatastoreEvent evt){
  }
306 
307   /**
308    * Called by a datastore when a resource has been deleted
309    */
310   public void resourceDeleted(DatastoreEvent evt){
311     DataStore ds = (DataStore)evt.getSource();
312     //1. check whether this datastore fired the event. If not, return.
313     if (!ds.equals(this.dataStore))
314       return;
315 
316     Object docID = evt.getResourceID();
317     if (docID == null)
318       return;
319 
320     if (DEBUG) Out.prln("Resource deleted called for: " + docID);
321     //first check if it is this corpus that's been deleted, it must be
322     //unloaded immediately
323     if (docID.equals(this.getLRPersistenceId())) {
324       Factory.deleteResource(this);
325       return;
326     }//if
327 
328     boolean isDirty=false;
329     //the problem here is that I only have the doc persistent ID
330     //and nothing else, so I need to determine the index of the doc first
331     for (int i=0; i< docDataList.size(); i++) {
332       DocumentData docData = (DocumentData)docDataList.get(i);
333       //we've found the correct document
334       //don't break the loop, because it might appear more than once
335       if (docID.equals(docData.getPersistentID())) {
336         remove(i);
337         isDirty = true;
338       }//if
339     }//for loop through the doc data
340 
341     if (isDirty)
342       try {
343         this.dataStore.sync(this);
344       } catch (PersistenceException ex) {
345         throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
346       } catch (SecurityException sex) {
347         throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
348       }
349   }//resourceDeleted
350 
  /**
   * Called by a datastore when a resource has been written into the
   * datastore. This corpus does not react to it.
   */
  public void resourceWritten(DatastoreEvent evt){
  }
356 
357 
358 
359   //List methods
360   //java docs will be automatically copied from the List interface.
361 
  /**
   * Returns the number of documents in this corpus, i.e. the number of
   * entries in the document data list, whether loaded in memory or not.
   */
  public int size() {
    return docDataList.size();
  }
365 
  /**
   * Returns true if the document data list of this corpus has no entries.
   */
  public boolean isEmpty() {
    return docDataList.isEmpty();
  }
369 
370   public boolean contains(Object o){
371     //return true if:
372     // - the document data list contains a document with such a name
373     //   and persistent id
374 
375     if(! (o instanceof Document))
376       return false;
377 
378     int index = findDocument((Document) o);
379     if (index < 0)
380       return false;
381     else
382       return true;
383   }
384 
385   public Iterator iterator(){
386     return new Iterator(){
387       Iterator docDataIter = docDataList.iterator();
388 
389       public boolean hasNext() {
390         return docDataIter.hasNext();
391       }
392 
393       public Object next(){
394 
395         //try finding a document with the same name and persistent ID
396         DocumentData docData = (DocumentData) docDataIter.next();
397         int index = docDataList.indexOf(docData);
398         return SerialCorpusImpl.this.get(index);
399       }
400 
401       public void remove() {
402         throw new UnsupportedOperationException("SerialCorpusImpl does not " +
403                     "support remove in the iterators");
404       }
405     }; //return
406 
407   }//iterator
408 
409   public String toString() {
410     return "document data " + docDataList.toString() + " documents " + documents;
411   }
412 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException.
   */
  public Object[] toArray(){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray() is not implemented for SerialCorpusImpl");
  }
418 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException.
   */
  public Object[] toArray(Object[] a){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray(Object[] a) is not implemented for SerialCorpusImpl");
  }
424 
425   public boolean add(Object o){
426     if (! (o instanceof Document) || o == null)
427       return false;
428     Document doc = (Document) o;
429 
430     //make it accept only docs from its own datastore
431     if (doc.getDataStore() != null
432         && !this.dataStore.equals(doc.getDataStore())) {
433       Err.prln("Error: Persistent corpus can only accept documents " +
434                "from its own datastore!");
435       return false;
436     }//if
437 
438     //add the document with its index in the docDataList
439     //in this case, since it's going to be added to the end
440     //the index will be the size of the docDataList before
441     //the addition
442     DocumentData docData = new DocumentData(doc.getName(),
443                                             doc.getLRPersistenceId());
444     boolean result = docDataList.add(docData);
445     documents.add(doc);
446     fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
447                                       doc,
448                                       docDataList.size()-1,
449                                       CorpusEvent.DOCUMENT_ADDED));
450 
451     return result;
452   }
453 
454   public boolean remove(Object o){
455     if (DEBUG) Out.prln("SerialCorpus:Remove object called");
456     if (! (o instanceof Document))
457       return false;
458     Document doc = (Document) o;
459 
460     //see if we can find it first. If not, then judt return
461     int index = findDocument(doc);
462     if (index == -1)
463       return false;
464 
465     if(index < docDataList.size()) { //we found it, so remove it
466       docDataList.remove(index);
467       Document oldDoc =  (Document) documents.remove(index);
468       if (DEBUG) Out.prln("documents after remove of " + oldDoc.getName()
469                           + " are " + documents);
470       fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
471                                           oldDoc,
472                                           index,
473                                           CorpusEvent.DOCUMENT_REMOVED));
474     }
475 
476     return true;
477   }
478 
479   public int findDocument(Document doc) {
480     boolean found = false;
481     DocumentData docData = null;
482 
483     //first try finding the document in memory
484     int index = documents.indexOf(doc);
485     if (index > -1 && index < docDataList.size())
486       return index;
487 
488     //else try finding a document with the same name and persistent ID
489     Iterator iter = docDataList.iterator();
490     for (index = 0;  iter.hasNext(); index++) {
491       docData = (DocumentData) iter.next();
492       if (docData.getDocumentName().equals(doc.getName()) &&
493           docData.getPersistentID().equals(doc.getLRPersistenceId())) {
494         found = true;
495         break;
496       }
497     }
498     if (found && index < docDataList.size())
499       return index;
500     else
501       return -1;
502   }//findDocument
503 
504   public boolean containsAll(Collection c){
505     Iterator iter = c.iterator();
506     while (iter.hasNext()) {
507       if (! contains(iter.next()))
508         return false;
509     }
510     return true;
511   }
512 
513   public boolean addAll(Collection c){
514     boolean allAdded = true;
515     Iterator iter = c.iterator();
516     while (iter.hasNext()) {
517       if (! add(iter.next()))
518         allAdded = false;
519     }
520     return allAdded;
521   }
522 
  /**
   * Not supported by this corpus: always throws an
   * UnsupportedOperationException.
   */
  public boolean addAll(int index, Collection c){
    throw new UnsupportedOperationException();
  }
526 
527   public boolean removeAll(Collection c){
528     boolean allRemoved = true;
529     Iterator iter = c.iterator();
530     while (iter.hasNext()) {
531       if (! remove(iter.next()))
532         allRemoved = false;
533     }
534     return allRemoved;
535 
536   }
537 
  /**
   * Not supported by this corpus: always throws an
   * UnsupportedOperationException.
   */
  public boolean retainAll(Collection c){
    throw new UnsupportedOperationException();
  }
541 
  /**
   * Empties both the list of in-memory documents and the document data
   * list. No corpus events are fired from this method.
   */
  public void clear(){
    documents.clear();
    docDataList.clear();
  }
546 
547   public boolean equals(Object o){
548     if (! (o instanceof SerialCorpusImpl))
549       return false;
550     SerialCorpusImpl oCorpus = (SerialCorpusImpl) o;
551     if ((this == null && oCorpus != null) || (oCorpus == null && this != null))
552       return false;
553     if (oCorpus == this)
554       return true;
555     if ((oCorpus.lrPersistentId == this.lrPersistentId ||
556           ( this.lrPersistentId != null &&
557             this.lrPersistentId.equals(oCorpus.lrPersistentId))
558           )
559         &&
560         oCorpus.name.equals(this.name)
561         &&
562         (oCorpus.dataStore == this.dataStore
563           || oCorpus.dataStore.equals(this.dataStore))
564         &&
565         oCorpus.docDataList.equals(docDataList))
566       return true;
567     return false;
568   }
569 
570   public int hashCode(){
571     return docDataList.hashCode();
572   }
573 
574   public Object get(int index){
575       if (index >= docDataList.size())
576         return null;
577 
578       Object res = documents.get(index);
579 
580       if (DEBUG)
581         Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
582 
583       //if the document is null, then I must get it from the DS
584       if (res == null) {
585         FeatureMap features = Factory.newFeatureMap();
586         features.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
587         try {
588           features.put(DataStore.LR_ID_FEATURE_NAME,
589                       ((DocumentData)docDataList.get(index)).getPersistentID());
590           Resource lr = Factory.createResource( "gate.corpora.DocumentImpl",
591                                                 features);
592           if (DEBUG)
593             Out.prln("Loaded document :" + lr.getName());
594           //change the result to the newly loaded doc
595           res = lr;
596 
597           //finally replace the doc with the instantiated version
598           documents.set(index, lr);
599         } catch (ResourceInstantiationException ex) {
600           Err.prln("Error reading document inside a serialised corpus.");
601           throw new GateRuntimeException(ex.getMessage());
602         }
603       }
604 
605       return res;
606   }
607 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException. The commented-out code below sketches
   * the two events an implementation would have to fire.
   */
  public Object set(int index, Object element){
    throw new gate.util.MethodNotImplementedException();
        //fire the 2 events
/*        fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                            oldDoc,
                                            ((Integer) key).intValue(),
                                            CorpusEvent.DOCUMENT_REMOVED));
        fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                          newDoc,
                                          ((Integer) key).intValue(),
                                          CorpusEvent.DOCUMENT_ADDED));
*/
  }
621 
622   public void add(int index, Object o){
623     if (! (o instanceof Document) || o == null)
624       return;
625     Document doc = (Document) o;
626 
627     DocumentData docData = new DocumentData(doc.getName(),
628                                             doc.getLRPersistenceId());
629     docDataList.add(index, docData);
630 
631     documents.add(index, doc);
632     fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
633                                       doc,
634                                       index,
635                                       CorpusEvent.DOCUMENT_ADDED));
636 
637   }
638 
639   public Object remove(int index){
640     if (DEBUG) Out.prln("Remove index called");
641     docDataList.remove(index);
642     Document res = (Document) documents.remove(index);
643     fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
644                                         res,
645                                         index,
646                                         CorpusEvent.DOCUMENT_REMOVED));
647     return res;
648 
649   }
650 
651   public int indexOf(Object o){
652     if (o instanceof Document)
653       return findDocument((Document) o);
654 
655     return -1;
656   }
657 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException.
   */
  public int lastIndexOf(Object o){
    throw new gate.util.MethodNotImplementedException();
  }
661 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException.
   */
  public ListIterator listIterator(){
    throw new gate.util.MethodNotImplementedException();
  }
665 
  /**
   * Not implemented for this corpus: always throws a
   * MethodNotImplementedException.
   */
  public ListIterator listIterator(int index){
    throw new gate.util.MethodNotImplementedException();
  }
669 
  /**
   * The persistent corpus does not support this method, as all
   * the documents might not be in memory. It always throws a
   * MethodNotImplementedException.
   */
  public List subList(int fromIndex, int toIndex){
    throw new gate.util.MethodNotImplementedException();
  }
677 
678   public void setDataStore(DataStore dataStore)
679                 throws gate.persist.PersistenceException {
680     super.setDataStore( dataStore);
681     if (this.dataStore != null)
682       this.dataStore.addDatastoreListener(this);
683   }
684 
685   public void setTransientSource(Object source) {
686     if (! (source instanceof Corpus))
687       return;
688 
689     //the following initialisation is only valid when we're constructing
690     //this object from a transient one. If it has already been stored in
691     //a datastore, then the initialisation is done in readObject() since
692     //this method is the one called by serialisation, when objects
693     //are restored.
694     if (this.dataStore != null && this.lrPersistentId != null)
695       return;
696 
697     Corpus tCorpus = (Corpus) source;
698 
699     //copy the corpus name and features from the one in memory
700     this.setName(tCorpus.getName());
701     this.setFeatures(tCorpus.getFeatures());
702 
703     docDataList = new ArrayList();
704     //now cache the names of all docs for future use
705     Iterator iter = tCorpus.getDocumentNames().iterator();
706     while (iter.hasNext())
707       docDataList.add(new DocumentData((String) iter.next(), null));
708 
709     //copy all the documents from the transient corpus
710     documents = new ArrayList();
711     documents.addAll(tCorpus);
712 
713     //make sure we fire events when docs are added/removed/etc
714     Gate.getCreoleRegister().addCreoleListener(this);
715 
716   }
717 
  //we don't keep the transient source, so always return null.
  //Still, this must be implemented, because of the GUI and the Factory
  public Object getTransientSource() {
    return null;
  }
723 
724 
  /**
   * Initialises this resource by calling the superclass implementation;
   * nothing specific to the serial corpus is done here.
   *
   * @return this corpus
   * @throws gate.creole.ResourceInstantiationException passed on from the
   * superclass
   */
  public Resource init() throws gate.creole.ResourceInstantiationException {
    super.init();

    return this;

  }
731 
732 
733   /**
734    * readObject - calls the default readObject() and then initialises the
735    * transient data
736    *
737    * @serialData Read serializable fields. No optional data read.
738    */
739   private void readObject(ObjectInputStream s)
740       throws IOException, ClassNotFoundException {
741     s.defaultReadObject();
742     documents = new ArrayList(docDataList.size());
743     for (int i = 0; i < docDataList.size(); i++)
744       documents.add(null);
745     corpusListeners = new Vector();
746     //finally set the creole listeners if the LR is like that
747     Gate.getCreoleRegister().addCreoleListener(this);
748     if (this.dataStore != null)
749       this.dataStore.addDatastoreListener(this);
750 
751   }//readObject
752 }