1   /*
2    *  Batch.java - transducer class
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 10/08/98
12   *
13   *  $Id: Batch.java,v 1.31 2001/09/28 15:45:23 valyt Exp $
14   *
15   *  DEVELOPER NOTES:
16   *
17   *  This is one that got away; the relation between constructors,
18   *  initTransducer and parseTransducer are totally screwy and get worse
19   *  every time I add something (e.g. support for resource loading).
20   *  We should probably junk this whole thing and start again....
21   */
22  
23  package gate.jape;
24  
25  import java.util.*;
26  import java.util.jar.*;
27  import java.io.*;
28  import java.net.*;
29  
30  import gate.annotation.*;
31  import gate.util.*;
32  import gate.*;
33  import gate.event.*;
34  import gate.creole.*;
35  
36  /** Batch processing of JAPE transducers against documents or collections.
37    * Construction will parse or deserialise a transducer as required.
38    */
39  public class Batch implements JapeConstants {
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** The name of the transducer file, a .jape or .ser. */
44  //  private String japeFileName;
45  
46    /** The URL that points to a .jape file */
47    private URL japeURL;
48  
49    /**The encoding used for reading the grammar file(s)*/
50    private String encoding;
51  
52    /** The JAPE transducer. */
53    private Transducer transducer;
54  
55    /** A stream connected to the JAPE file (often null). */
56  //  private InputStream japeStream = null;
57  
58    /** Create non-initialised instance (private, used in main). */
59    private Batch() { }
60  
61    /** Create a fully initialised instance.
62      * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
63      * file. This may be an absolute path, or may a .jar
64      * that lives somewhere on the classpath.
65      */
66    public Batch(URL url, String encoding) throws JapeException {
67      this.japeURL = url;
68      this.encoding =  encoding;
69      parseJape();
70      if(transducer != null){
71        transducer.addStatusListener(new StatusListener(){
72          public void statusChanged(String text){
73            fireStatusChanged(text);
74          }
75        });
76  
77        transducer.addProgressListener(new ProgressListener(){
78          public void progressChanged(int value){
79            fireProgressChanged(value);
80          }
81  
82          public void processFinished(){
83            fireProcessFinished();
84          }
85        });
86      }
87  
88    } // full init constructor
89  
90    public Batch(URL url, String encoding, StatusListener sListener)
91           throws JapeException {
92  
93      this.addStatusListener(sListener);
94      this.japeURL = url;
95      this.encoding =  encoding;
96      parseJape();
97      if(transducer != null){
98        transducer.addStatusListener(new StatusListener(){
99          public void statusChanged(String text){
100           fireStatusChanged(text);
101         }
102       });
103 
104       transducer.addProgressListener(new ProgressListener(){
105         public void progressChanged(int value){
106           fireProgressChanged(value);
107         }
108 
109         public void processFinished(){
110           fireProcessFinished();
111         }
112       });
113     }
114   } // full init constructor
115 
116   /**
117    * Notifies this PR that it should stop its execution as soon as possible.
118    */
119   public synchronized void interrupt(){
120     transducer.interrupt();
121   }
122   /** Create a fully initialised instance.
123     * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
124     * file. This may be an absolute path, or may a .jar
125     * that lives somewhere on the classpath.
126     */
127 /*
128   public Batch(String japeFileName) throws JapeException {
129     this.japeFileName = japeFileName;
130     initTransducer();
131   } // full init constructor
132 */
133 /*
134   public Batch(String japeFileName, StatusListener sListener)
135                                                         throws JapeException {
136     this.japeFileName = japeFileName;
137     this.addStatusListener(sListener);
138     initTransducer();
139   } // full init constructor
140 */
141 
142   /** Create a fully initialised instance from an InputStream connected
143     * to the JAPE file.
144     */
145 /*
146   public Batch(InputStream japeStream) throws JapeException {
147     if(japeStream == null)
148       throw new JapeException(
149         "attempt to create a batch parser with null input stream"
150       );
151     this.japeFileName = "stream";
152     this.japeStream = japeStream;
153     initTransducer();
154   } // full init constructor
155 */
156   /** Create a fully initialised instance from a resource path and resource
157     * name.
158     */
159 /*
160   public Batch(String resPath, String resName) throws JapeException {
161     fromResource = true;
162     this.japeFileName = resName;
163     this.resPath = resPath;
164     initTransducer();
165   } // full init constructor
166 */
167 
168   /** Get the transducer. */
169   public Transducer getTransducer() { return transducer; }
170 
171   /** Instantiate transducer member as necessary. */
172 /*
173   private void initTransducer()
174   throws JapeException {
175     if(fromResource) {
176       parseJape(resPath, japeFileName);
177     } else if(japeFileName.endsWith(".ser") || japeFileName.endsWith(".SER"))
178       deserialiseJape(new File(japeFileName));
179     else if(japeFileName.endsWith(".jape") || japeFileName.endsWith(".JAPE"))
180       parseJape();
181     else if(japeFileName.endsWith(".jar") || japeFileName.endsWith(".JAR"))
182       deserialiseJape();
183     else if(japeFileName.equals("stream"))
184       parseJape(japeStream);
185     else
186       throw new JapeException(
187         "unknown file type (not .jape, .ser or .jar):" + japeFileName
188       );
189     if(transducer != null) transducer.addStatusListener(new StatusListener() {
190       public void statusChanged(String text){
191         fireStatusChangedEvent(text);
192       }
193     });
194   }
195 */
196   /** Parse a jape file from {@link #japeURL} and store the transducer. */
197   private void parseJape() throws JapeException {
198     try {
199       gate.jape.parser.ParseCpsl parser =
200         new gate.jape.parser.ParseCpsl(japeURL, encoding);
201 
202       StatusListener listener = null;
203       listener = new StatusListener(){
204         public void statusChanged(String text){
205           fireStatusChanged(text);
206         }
207       };
208       parser.addStatusListener(listener);
209       transducer = parser.MultiPhaseTransducer();
210       parser.removeStatusListener(listener);
211     } catch (gate.jape.parser.ParseException e) {
212       throw new
213         JapeException("Batch: error parsing transducer: " + e.getMessage());
214     } catch (java.io.IOException e) {
215       throw new
216         JapeException("Batch: couldn't open JAPE file: " + e.getMessage());
217     }
218   } // parseJape
219 
220   /** Parse a jape file from an InputStream and store the transducer. */
221 /*
222   private void parseJape(InputStream japeStream) throws JapeException {
223     try {
224       gate.jape.parser.ParseCpsl parser =
225         new gate.jape.parser.ParseCpsl(japeFileName, japeStream);
226       transducer = parser.MultiPhaseTransducer();
227     } catch (gate.jape.parser.ParseException e) {
228       throw new
229         JapeException("Batch: error parsing transducer: " + e.getMessage());
230     } catch (java.io.IOException e) {
231       throw new
232         JapeException("Batch: couldn't read JAPE stream: " + e.getMessage());
233     }
234   } // parseJape(InputStream)
235 */
236   /** Parse a jape file from a resource and store the transducer. */
237 /*
238   private void parseJape(String resPath, String resName) throws JapeException {
239     try {
240       gate.jape.parser.ParseCpsl parser =
241         new gate.jape.parser.ParseCpsl(resPath, resName);
242       transducer = parser.MultiPhaseTransducer();
243     } catch (gate.jape.parser.ParseException e) {
244       throw new
245         JapeException("Batch: error parsing transducer: " + e.getMessage());
246     } catch (java.io.IOException e) {
247       throw new
248         JapeException("Batch: couldn't read JAPE resource: " + e.getMessage());
249     }
250   } // parseJape(resPath, resName)
251 */
252 
253   /** Deserialise from a .ser file. */
254 /*
255   private void deserialiseJape(File japeFile) throws JapeException {
256 
257     // set up a file input stream
258     FileInputStream japeInputStream = null;
259     try {
260       japeInputStream = new FileInputStream(japeFile.getPath());
261     } catch (IOException e) {
262       throw new JapeException(
263         "Can't read from " + japeFile.getPath() + ": " + e.getMessage()
264       );
265     }
266 
267     // call the input stream deserialise method
268     deserialiseJape(japeInputStream);
269   } // deserialiseJape(File)
270 */
271   /** Deserialise from a JAR file. */
272 /*
273   private void deserialiseJape() throws JapeException {
274     // find the jar from CLASSPATH
275     //SearchPath classPath =
276     //  new SearchPath(System.getProperty("java.class.path"), ".");
277     File jarFile = new File(japeFileName); //classPath.getFile(japeFileName);
278     if(jarFile == null)
279       throw new JapeException("Batch: can't find " + japeFileName);
280 
281     // get a byte array input stream with the .ser in out of the jar file
282     JarFile jar = null;
283     BufferedInputStream japeInputStream = null;
284     try {
285       jar = new JarFile(jarFile.getPath());
286       japeInputStream = new BufferedInputStream(
287         jar.getInputStream(jar.getJarEntry(jarNameToSerName(japeFileName)))
288       );
289     } catch(IOException e) {
290       throw new JapeException("couldn't read jar file " + japeFileName);
291     }
292 
293 
294     // call the input stream deserialise method
295     deserialiseJape(japeInputStream);
296   } // deserialiseJape()
297 */
298   /** Create a transducer from an object input stream (deserialisation). */
299 /*
300   private void deserialiseJape(InputStream japeInputStream)
301   throws JapeException {
302     try {
303       ObjectInputStream ois = new ObjectInputStream(japeInputStream);
304       transducer = (Transducer) ois.readObject();
305       ois.close();
306       japeInputStream.close(); // redundant?
307     } catch (IOException e) {
308       throw new JapeException(
309         "Batch: can't deserialise InputStream (1): " + e.getMessage()
310       );
311     } catch (ClassNotFoundException e) {
312       throw new JapeException(
313         "Batch: can't deserialise InputStream (2): " + e.getMessage()
314       );
315     }
316   } // deserialise(OIS)
317 */
318   /** Create a .ser name from a .jar name. */
319 /*
320   private String jarNameToSerName(String jarName) {
321     return jarName.substring(0, jarName.length() - 4) + ".ser";
322   } // jarNameToSerName
323 */
324 
325   /** Process the given collection. */
326   public void transduce(Corpus coll) throws JapeException, ExecutionException {
327     // for each doc run the transducer
328     Iterator iter = coll.iterator();
329     while(iter.hasNext()) {
330       Document doc = (Document) iter.next();
331       // transducer.transduce(doc);
332       transduce(doc, doc.getAnnotations(), doc.getAnnotations());
333     }
334   } // transduce(coll)
335 
336   /** Process a single document. */
337   public void transduce(Document doc) throws JapeException, ExecutionException {
338     transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations());
339   } // transduce(doc)
340 
341   /** Process a single document. */
342   public void transduce(Document doc, AnnotationSet inputAS,
343                         AnnotationSet outputAS) throws JapeException,
344                                                        ExecutionException {
345     //no need to transduce empty document
346     if (inputAS == null || inputAS.isEmpty())
347       return;
348     transducer.transduce(doc, inputAS, outputAS);
349 
350   } // transduce(doc)
351 
352   /** Process a single text. */
353 /*
354   public Document transduce(String text) throws JapeException {
355     Document doc = null;
356     try {
357       doc = Factory.newDocument(text);
358     } catch (ResourceInstantiationException e) {
359       throw new JapeException(e.toString());
360     }
361     transducer.transduce(doc, doc.getAnnotations());
362     return doc;
363   } // transduce(text)
364 */
365   /** Process a single file. */
366 /*
367   public Document transduce(File textFile) throws JapeException {
368     String text = null;
369     try {
370       text = gate.util.Files.getString(textFile);
371     } catch(IOException e) { throw new JapeException(e.toString()); }
372     return transduce(text);
373   } // transduce(textFile)
374 */
375   /** Process a set of files. */
376 /*
377   public Corpus transduce(String[] textFileNames) throws JapeException {
378     Corpus coll = null;
379     try {
380       coll = Factory.newCorpus("JAPE batch corpus");
381       Document doc = null;
382       for(int i = 0; i < textFileNames.length; i++) {
383           doc = Factory.newDocument(textFileNames[i]);
384           doc.setFeatures(Factory.newFeatureMap());
385           /*coll.createDocument(
386             textFileNames[i],
387             null, // the text - should get read from disk
388             new AnnotationSetImpl(doc),
389             Factory.newFeatureMap(),
390             Document.COPIED
391           );*/
392 /*
393         transducer.transduce(doc, doc.getAnnotations());
394       }
395     } catch(ResourceInstantiationException e) {
396       throw new JapeException(e.toString());
397     }
398     return coll;
399   } // transduce(textFileNames)
400 */
401   /** This is where it all happens. This is <I>the</I> place to be. Take
402     * your summer holidays here. Visit on Saturday nights. Buy a season
403     * ticket from <CODE>www.programmer.gone.insane.com</CODE>.
404     * <P>
405     * Takes a .jape/.jar/.ser
406     *  file name (-j option) which is assumed to hold a pattern
407     * grammar for a multi-phase transducer, and a collection
408     * name (-c option) or a list of files. As needed it then parses and
409     * compiles the transducer, then transduces all the documents in the
410     * collection and saves it to disk.
411     */
412   public static void main(String args[]) {
413 /*
414     // oh great bug in the sky give us this day our daily fuckup
415     //gate.util.Debug.setDebug(true);
416     //gate.util.Debug.setDebug(Rule.class, true);
417     //gate.util.Debug.setDebug(LeftHandSide.class, true);
418     //gate.util.Debug.setDebug(BasicPatternElement.class, true);
419     //gate.util.Debug.setDebug(AnnotationSet.class, true);
420 
421     // The persistent name of the collection.
422     String persCollName = null;;
423 
424     // The collection to process.
425     Corpus collection = null;
426 
427     // create one of us
428     Batch batch = new Batch();
429 
430     // process the options
431     int i = 0;
432     for( ; i<args.length; i++) {
433       if(args[i].equals("-c") && ++i < args.length) // -c = coll name
434         persCollName = args[i];
435       else if(args[i].equals("-j") && ++i < args.length)// -j = transducer name
436         batch.japeFileName = args[i];
437       else if(args[i].equals("-v")) // -v = verbose
438         batch.setVerbose(true);
439       else if(args[i].startsWith("-"))
440         batch.usage("unknown option " + args[i]);
441       else
442         break;
443     } // for each arg
444 
445     // file name list
446     String[] fileNames = null;
447     if(args.length > i) {
448       fileNames = new String[args.length - i];
449       for(int j = 0; i<args.length; j++, i++)
450         fileNames[j] = args[i];
451     }
452 
453     // did they give valid options?
454     if(batch.japeFileName == null)
455       batch.usage("you must supply a transducer name");
456     if(fileNames != null && persCollName != null)
457       batch.usage("can't read a collection AND process a file list");
458 
459     // parse the transducer or bomb
460     batch.message("parsing the transducer");
461     try { batch.initTransducer(); }
462     catch(JapeException e) {
463       batch.usage("oops: " + e.toString());
464     }
465 
466     Corpus coll = null;
467     if(persCollName != null) { // we got a collection name, not a list of files
468 
469       // open the collection or bomb
470       coll = null;
471       batch.message("opening the collection");
472       try {
473         coll = Factory.newCorpus(persCollName);
474       } catch(ResourceInstantiationException e) {
475         batch.usage("oops (x): " + e);
476       }
477 
478       // transduce
479       batch.message("calling transducer");
480       try { batch.transduce(coll); }
481       catch(JapeException e) {
482         batch.usage("oops (1): " + e.toString());
483       }
484 
485       // save to disk
486       batch.message("saving the collection");
487       batch.usage("couldn't sync coll ");
488 
489     // we got a list of files, not a collection
490     } else {
491       batch.message("transducing transient collection");
492       try {
493         coll = batch.transduce(fileNames);
494       } catch(JapeException e) {
495         batch.usage("oops (2): " + e.toString());
496       }
497     }
498 
499     // we won! we won! we can smash up all the computers now!
500     batch.message("done");
501     //System.exit(0);
502 */
503   } // main
504 
505 
506   /** Whether to print progress messages or not. */
507   private boolean verbose = false;
508 
509   /** Set verbosity. */
510   public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }
511 
512   /** You got something wrong, dumbo. */
513   public void usage(String errorMessage) {
514     String usageMessage =
515       "usage: java gate.jape.Batch.main [-v] " +
516         "-j japefile(.ser|.jape|.jar) " +
517         "(-c CollectionName | filenames)";
518 
519     Err.println(errorMessage);
520     Err.println(usageMessage);
521     // System.exit(1);
522 
523   } // usage
524 
525   /** Hello? Anybody there?? */
526   public void message(String mess) {
527     if(verbose) Out.println("Batch: " + mess);
528   } // message
529 
530   public void setFeatures(gate.FeatureMap newFeatures) {
531     features = newFeatures;
532   }
533   public gate.FeatureMap getFeatures() {
534     return features;
535   }
536   public synchronized void removeProgressListener(ProgressListener l) {
537     if (progressListeners != null && progressListeners.contains(l)) {
538       Vector v = (Vector) progressListeners.clone();
539       v.removeElement(l);
540       progressListeners = v;
541     }
542   }
543   public synchronized void addProgressListener(ProgressListener l) {
544     Vector v = progressListeners == null ? new Vector(2) : (Vector) progressListeners.clone();
545     if (!v.contains(l)) {
546       v.addElement(l);
547       progressListeners = v;
548     }
549   }
550 
551   //ProcessProgressReporter implementation ends here
552 
553   /** Are we initialising from a resource? */
554 //  private boolean fromResource = false;
555 
556   /** Path to the resources tree */
557 //  private String resPath = null;
558 
559 
560   private gate.FeatureMap features;
561   private transient Vector progressListeners;
562   private transient Vector statusListeners;
563   protected void fireProgressChanged(int e) {
564     if (progressListeners != null) {
565       Vector listeners = progressListeners;
566       int count = listeners.size();
567       for (int i = 0; i < count; i++) {
568         ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
569       }
570     }
571   }
572   protected void fireProcessFinished() {
573     if (progressListeners != null) {
574       Vector listeners = progressListeners;
575       int count = listeners.size();
576       for (int i = 0; i < count; i++) {
577         ((ProgressListener) listeners.elementAt(i)).processFinished();
578       }
579     }
580   }
581   public synchronized void removeStatusListener(StatusListener l) {
582     if (statusListeners != null && statusListeners.contains(l)) {
583       Vector v = (Vector) statusListeners.clone();
584       v.removeElement(l);
585       statusListeners = v;
586     }
587   }
588   public synchronized void addStatusListener(StatusListener l) {
589     Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
590     if (!v.contains(l)) {
591       v.addElement(l);
592       statusListeners = v;
593     }
594   }
595   protected void fireStatusChanged(String e) {
596     if (statusListeners != null) {
597       Vector listeners = statusListeners;
598       int count = listeners.size();
599       for (int i = 0; i < count; i++) {
600         ((StatusListener) listeners.elementAt(i)).statusChanged(e);
601       }
602     }
603   }
604 
605 
606   /*
607   private void writeObject(ObjectOutputStream oos) throws IOException {
608     Out.prln("writing batch");
609     oos.defaultWriteObject();
610     Out.prln("finished writing batch");
611   } // writeObject
612   */
613 
614 } // class Batch
615 
616