|
Batch |
|
1 /* 2 * Batch.java - transducer class 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 10/08/98 12 * 13 * $Id: Batch.java,v 1.31 2001/09/28 15:45:23 valyt Exp $ 14 * 15 * DEVELOPER NOTES: 16 * 17 * This is one that got away; the relation between constructors, 18 * initTransducer and parseTransducer are totally screwy and get worse 19 * every time I add something (e.g. support for resource loading). 20 * We should probably junk this whole thing and start again.... 21 */ 22 23 package gate.jape; 24 25 import java.util.*; 26 import java.util.jar.*; 27 import java.io.*; 28 import java.net.*; 29 30 import gate.annotation.*; 31 import gate.util.*; 32 import gate.*; 33 import gate.event.*; 34 import gate.creole.*; 35 36 /** Batch processing of JAPE transducers against documents or collections. 37 * Construction will parse or deserialise a transducer as required. 38 */ 39 public class Batch implements JapeConstants { 40 /** Debug flag */ 41 private static final boolean DEBUG = false; 42 43 /** The name of the transducer file, a .jape or .ser. */ 44 // private String japeFileName; 45 46 /** The URL that points to a .jape file */ 47 private URL japeURL; 48 49 /**The encoding used for reading the grammar file(s)*/ 50 private String encoding; 51 52 /** The JAPE transducer. */ 53 private Transducer transducer; 54 55 /** A stream connected to the JAPE file (often null). */ 56 // private InputStream japeStream = null; 57 58 /** Create non-initialised instance (private, used in main). */ 59 private Batch() { } 60 61 /** Create a fully initialised instance. 62 * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer 63 * file. This may be an absolute path, or may a .jar 64 * that lives somewhere on the classpath. 65 */ 66 public Batch(URL url, String encoding) throws JapeException { 67 this.japeURL = url; 68 this.encoding = encoding; 69 parseJape(); 70 if(transducer != null){ 71 transducer.addStatusListener(new StatusListener(){ 72 public void statusChanged(String text){ 73 fireStatusChanged(text); 74 } 75 }); 76 77 transducer.addProgressListener(new ProgressListener(){ 78 public void progressChanged(int value){ 79 fireProgressChanged(value); 80 } 81 82 public void processFinished(){ 83 fireProcessFinished(); 84 } 85 }); 86 } 87 88 } // full init constructor 89 90 public Batch(URL url, String encoding, StatusListener sListener) 91 throws JapeException { 92 93 this.addStatusListener(sListener); 94 this.japeURL = url; 95 this.encoding = encoding; 96 parseJape(); 97 if(transducer != null){ 98 transducer.addStatusListener(new StatusListener(){ 99 public void statusChanged(String text){ 100 fireStatusChanged(text); 101 } 102 }); 103 104 transducer.addProgressListener(new ProgressListener(){ 105 public void progressChanged(int value){ 106 fireProgressChanged(value); 107 } 108 109 public void processFinished(){ 110 fireProcessFinished(); 111 } 112 }); 113 } 114 } // full init constructor 115 116 /** 117 * Notifies this PR that it should stop its execution as soon as possible. 118 */ 119 public synchronized void interrupt(){ 120 transducer.interrupt(); 121 } 122 /** Create a fully initialised instance. 123 * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer 124 * file. This may be an absolute path, or may a .jar 125 * that lives somewhere on the classpath. 126 */ 127 /* 128 public Batch(String japeFileName) throws JapeException { 129 this.japeFileName = japeFileName; 130 initTransducer(); 131 } // full init constructor 132 */ 133 /* 134 public Batch(String japeFileName, StatusListener sListener) 135 throws JapeException { 136 this.japeFileName = japeFileName; 137 this.addStatusListener(sListener); 138 initTransducer(); 139 } // full init constructor 140 */ 141 142 /** Create a fully initialised instance from an InputStream connected 143 * to the JAPE file. 144 */ 145 /* 146 public Batch(InputStream japeStream) throws JapeException { 147 if(japeStream == null) 148 throw new JapeException( 149 "attempt to create a batch parser with null input stream" 150 ); 151 this.japeFileName = "stream"; 152 this.japeStream = japeStream; 153 initTransducer(); 154 } // full init constructor 155 */ 156 /** Create a fully initialised instance from a resource path and resource 157 * name. 158 */ 159 /* 160 public Batch(String resPath, String resName) throws JapeException { 161 fromResource = true; 162 this.japeFileName = resName; 163 this.resPath = resPath; 164 initTransducer(); 165 } // full init constructor 166 */ 167 168 /** Get the transducer. */ 169 public Transducer getTransducer() { return transducer; } 170 171 /** Instantiate transducer member as necessary. */ 172 /* 173 private void initTransducer() 174 throws JapeException { 175 if(fromResource) { 176 parseJape(resPath, japeFileName); 177 } else if(japeFileName.endsWith(".ser") || japeFileName.endsWith(".SER")) 178 deserialiseJape(new File(japeFileName)); 179 else if(japeFileName.endsWith(".jape") || japeFileName.endsWith(".JAPE")) 180 parseJape(); 181 else if(japeFileName.endsWith(".jar") || japeFileName.endsWith(".JAR")) 182 deserialiseJape(); 183 else if(japeFileName.equals("stream")) 184 parseJape(japeStream); 185 else 186 throw new JapeException( 187 "unknown file type (not .jape, .ser or .jar):" + japeFileName 188 ); 189 if(transducer != null) transducer.addStatusListener(new StatusListener() { 190 public void statusChanged(String text){ 191 fireStatusChangedEvent(text); 192 } 193 }); 194 } 195 */ 196 /** Parse a jape file from {@link #japeURL} and store the transducer. */ 197 private void parseJape() throws JapeException { 198 try { 199 gate.jape.parser.ParseCpsl parser = 200 new gate.jape.parser.ParseCpsl(japeURL, encoding); 201 202 StatusListener listener = null; 203 listener = new StatusListener(){ 204 public void statusChanged(String text){ 205 fireStatusChanged(text); 206 } 207 }; 208 parser.addStatusListener(listener); 209 transducer = parser.MultiPhaseTransducer(); 210 parser.removeStatusListener(listener); 211 } catch (gate.jape.parser.ParseException e) { 212 throw new 213 JapeException("Batch: error parsing transducer: " + e.getMessage()); 214 } catch (java.io.IOException e) { 215 throw new 216 JapeException("Batch: couldn't open JAPE file: " + e.getMessage()); 217 } 218 } // parseJape 219 220 /** Parse a jape file from an InputStream and store the transducer. */ 221 /* 222 private void parseJape(InputStream japeStream) throws JapeException { 223 try { 224 gate.jape.parser.ParseCpsl parser = 225 new gate.jape.parser.ParseCpsl(japeFileName, japeStream); 226 transducer = parser.MultiPhaseTransducer(); 227 } catch (gate.jape.parser.ParseException e) { 228 throw new 229 JapeException("Batch: error parsing transducer: " + e.getMessage()); 230 } catch (java.io.IOException e) { 231 throw new 232 JapeException("Batch: couldn't read JAPE stream: " + e.getMessage()); 233 } 234 } // parseJape(InputStream) 235 */ 236 /** Parse a jape file from a resource and store the transducer. */ 237 /* 238 private void parseJape(String resPath, String resName) throws JapeException { 239 try { 240 gate.jape.parser.ParseCpsl parser = 241 new gate.jape.parser.ParseCpsl(resPath, resName); 242 transducer = parser.MultiPhaseTransducer(); 243 } catch (gate.jape.parser.ParseException e) { 244 throw new 245 JapeException("Batch: error parsing transducer: " + e.getMessage()); 246 } catch (java.io.IOException e) { 247 throw new 248 JapeException("Batch: couldn't read JAPE resource: " + e.getMessage()); 249 } 250 } // parseJape(resPath, resName) 251 */ 252 253 /** Deserialise from a .ser file. */ 254 /* 255 private void deserialiseJape(File japeFile) throws JapeException { 256 257 // set up a file input stream 258 FileInputStream japeInputStream = null; 259 try { 260 japeInputStream = new FileInputStream(japeFile.getPath()); 261 } catch (IOException e) { 262 throw new JapeException( 263 "Can't read from " + japeFile.getPath() + ": " + e.getMessage() 264 ); 265 } 266 267 // call the input stream deserialise method 268 deserialiseJape(japeInputStream); 269 } // deserialiseJape(File) 270 */ 271 /** Deserialise from a JAR file. */ 272 /* 273 private void deserialiseJape() throws JapeException { 274 // find the jar from CLASSPATH 275 //SearchPath classPath = 276 // new SearchPath(System.getProperty("java.class.path"), "."); 277 File jarFile = new File(japeFileName); //classPath.getFile(japeFileName); 278 if(jarFile == null) 279 throw new JapeException("Batch: can't find " + japeFileName); 280 281 // get a byte array input stream with the .ser in out of the jar file 282 JarFile jar = null; 283 BufferedInputStream japeInputStream = null; 284 try { 285 jar = new JarFile(jarFile.getPath()); 286 japeInputStream = new BufferedInputStream( 287 jar.getInputStream(jar.getJarEntry(jarNameToSerName(japeFileName))) 288 ); 289 } catch(IOException e) { 290 throw new JapeException("couldn't read jar file " + japeFileName); 291 } 292 293 294 // call the input stream deserialise method 295 deserialiseJape(japeInputStream); 296 } // deserialiseJape() 297 */ 298 /** Create a transducer from an object input stream (deserialisation). */ 299 /* 300 private void deserialiseJape(InputStream japeInputStream) 301 throws JapeException { 302 try { 303 ObjectInputStream ois = new ObjectInputStream(japeInputStream); 304 transducer = (Transducer) ois.readObject(); 305 ois.close(); 306 japeInputStream.close(); // redundant? 307 } catch (IOException e) { 308 throw new JapeException( 309 "Batch: can't deserialise InputStream (1): " + e.getMessage() 310 ); 311 } catch (ClassNotFoundException e) { 312 throw new JapeException( 313 "Batch: can't deserialise InputStream (2): " + e.getMessage() 314 ); 315 } 316 } // deserialise(OIS) 317 */ 318 /** Create a .ser name from a .jar name. */ 319 /* 320 private String jarNameToSerName(String jarName) { 321 return jarName.substring(0, jarName.length() - 4) + ".ser"; 322 } // jarNameToSerName 323 */ 324 325 /** Process the given collection. */ 326 public void transduce(Corpus coll) throws JapeException, ExecutionException { 327 // for each doc run the transducer 328 Iterator iter = coll.iterator(); 329 while(iter.hasNext()) { 330 Document doc = (Document) iter.next(); 331 // transducer.transduce(doc); 332 transduce(doc, doc.getAnnotations(), doc.getAnnotations()); 333 } 334 } // transduce(coll) 335 336 /** Process a single document. */ 337 public void transduce(Document doc) throws JapeException, ExecutionException { 338 transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations()); 339 } // transduce(doc) 340 341 /** Process a single document. */ 342 public void transduce(Document doc, AnnotationSet inputAS, 343 AnnotationSet outputAS) throws JapeException, 344 ExecutionException { 345 //no need to transduce empty document 346 if (inputAS == null || inputAS.isEmpty()) 347 return; 348 transducer.transduce(doc, inputAS, outputAS); 349 350 } // transduce(doc) 351 352 /** Process a single text. */ 353 /* 354 public Document transduce(String text) throws JapeException { 355 Document doc = null; 356 try { 357 doc = Factory.newDocument(text); 358 } catch (ResourceInstantiationException e) { 359 throw new JapeException(e.toString()); 360 } 361 transducer.transduce(doc, doc.getAnnotations()); 362 return doc; 363 } // transduce(text) 364 */ 365 /** Process a single file. */ 366 /* 367 public Document transduce(File textFile) throws JapeException { 368 String text = null; 369 try { 370 text = gate.util.Files.getString(textFile); 371 } catch(IOException e) { throw new JapeException(e.toString()); } 372 return transduce(text); 373 } // transduce(textFile) 374 */ 375 /** Process a set of files. */ 376 /* 377 public Corpus transduce(String[] textFileNames) throws JapeException { 378 Corpus coll = null; 379 try { 380 coll = Factory.newCorpus("JAPE batch corpus"); 381 Document doc = null; 382 for(int i = 0; i < textFileNames.length; i++) { 383 doc = Factory.newDocument(textFileNames[i]); 384 doc.setFeatures(Factory.newFeatureMap()); 385 /*coll.createDocument( 386 textFileNames[i], 387 null, // the text - should get read from disk 388 new AnnotationSetImpl(doc), 389 Factory.newFeatureMap(), 390 Document.COPIED 391 );*/ 392 /* 393 transducer.transduce(doc, doc.getAnnotations()); 394 } 395 } catch(ResourceInstantiationException e) { 396 throw new JapeException(e.toString()); 397 } 398 return coll; 399 } // transduce(textFileNames) 400 */ 401 /** This is where it all happens. This is <I>the</I> place to be. Take 402 * your summer holidays here. Visit on Saturday nights. Buy a season 403 * ticket from <CODE>www.programmer.gone.insane.com</CODE>. 404 * <P> 405 * Takes a .jape/.jar/.ser 406 * file name (-j option) which is assumed to hold a pattern 407 * grammar for a multi-phase transducer, and a collection 408 * name (-c option) or a list of files. As needed it then parses and 409 * compiles the transducer, then transduces all the documents in the 410 * collection and saves it to disk. 411 */ 412 public static void main(String args[]) { 413 /* 414 // oh great bug in the sky give us this day our daily fuckup 415 //gate.util.Debug.setDebug(true); 416 //gate.util.Debug.setDebug(Rule.class, true); 417 //gate.util.Debug.setDebug(LeftHandSide.class, true); 418 //gate.util.Debug.setDebug(BasicPatternElement.class, true); 419 //gate.util.Debug.setDebug(AnnotationSet.class, true); 420 421 // The persistent name of the collection. 422 String persCollName = null;; 423 424 // The collection to process. 425 Corpus collection = null; 426 427 // create one of us 428 Batch batch = new Batch(); 429 430 // process the options 431 int i = 0; 432 for( ; i<args.length; i++) { 433 if(args[i].equals("-c") && ++i < args.length) // -c = coll name 434 persCollName = args[i]; 435 else if(args[i].equals("-j") && ++i < args.length)// -j = transducer name 436 batch.japeFileName = args[i]; 437 else if(args[i].equals("-v")) // -v = verbose 438 batch.setVerbose(true); 439 else if(args[i].startsWith("-")) 440 batch.usage("unknown option " + args[i]); 441 else 442 break; 443 } // for each arg 444 445 // file name list 446 String[] fileNames = null; 447 if(args.length > i) { 448 fileNames = new String[args.length - i]; 449 for(int j = 0; i<args.length; j++, i++) 450 fileNames[j] = args[i]; 451 } 452 453 // did they give valid options? 454 if(batch.japeFileName == null) 455 batch.usage("you must supply a transducer name"); 456 if(fileNames != null && persCollName != null) 457 batch.usage("can't read a collection AND process a file list"); 458 459 // parse the transducer or bomb 460 batch.message("parsing the transducer"); 461 try { batch.initTransducer(); } 462 catch(JapeException e) { 463 batch.usage("oops: " + e.toString()); 464 } 465 466 Corpus coll = null; 467 if(persCollName != null) { // we got a collection name, not a list of files 468 469 // open the collection or bomb 470 coll = null; 471 batch.message("opening the collection"); 472 try { 473 coll = Factory.newCorpus(persCollName); 474 } catch(ResourceInstantiationException e) { 475 batch.usage("oops (x): " + e); 476 } 477 478 // transduce 479 batch.message("calling transducer"); 480 try { batch.transduce(coll); } 481 catch(JapeException e) { 482 batch.usage("oops (1): " + e.toString()); 483 } 484 485 // save to disk 486 batch.message("saving the collection"); 487 batch.usage("couldn't sync coll "); 488 489 // we got a list of files, not a collection 490 } else { 491 batch.message("transducing transient collection"); 492 try { 493 coll = batch.transduce(fileNames); 494 } catch(JapeException e) { 495 batch.usage("oops (2): " + e.toString()); 496 } 497 } 498 499 // we won! we won! we can smash up all the computers now! 500 batch.message("done"); 501 //System.exit(0); 502 */ 503 } // main 504 505 506 /** Whether to print progress messages or not. */ 507 private boolean verbose = false; 508 509 /** Set verbosity. */ 510 public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; } 511 512 /** You got something wrong, dumbo. */ 513 public void usage(String errorMessage) { 514 String usageMessage = 515 "usage: java gate.jape.Batch.main [-v] " + 516 "-j japefile(.ser|.jape|.jar) " + 517 "(-c CollectionName | filenames)"; 518 519 Err.println(errorMessage); 520 Err.println(usageMessage); 521 // System.exit(1); 522 523 } // usage 524 525 /** Hello? Anybody there?? */ 526 public void message(String mess) { 527 if(verbose) Out.println("Batch: " + mess); 528 } // message 529 530 public void setFeatures(gate.FeatureMap newFeatures) { 531 features = newFeatures; 532 } 533 public gate.FeatureMap getFeatures() { 534 return features; 535 } 536 public synchronized void removeProgressListener(ProgressListener l) { 537 if (progressListeners != null && progressListeners.contains(l)) { 538 Vector v = (Vector) progressListeners.clone(); 539 v.removeElement(l); 540 progressListeners = v; 541 } 542 } 543 public synchronized void addProgressListener(ProgressListener l) { 544 Vector v = progressListeners == null ? new Vector(2) : (Vector) progressListeners.clone(); 545 if (!v.contains(l)) { 546 v.addElement(l); 547 progressListeners = v; 548 } 549 } 550 551 //ProcessProgressReporter implementation ends here 552 553 /** Are we initialising from a resource? */ 554 // private boolean fromResource = false; 555 556 /** Path to the resources tree */ 557 // private String resPath = null; 558 559 560 private gate.FeatureMap features; 561 private transient Vector progressListeners; 562 private transient Vector statusListeners; 563 protected void fireProgressChanged(int e) { 564 if (progressListeners != null) { 565 Vector listeners = progressListeners; 566 int count = listeners.size(); 567 for (int i = 0; i < count; i++) { 568 ((ProgressListener) listeners.elementAt(i)).progressChanged(e); 569 } 570 } 571 } 572 protected void fireProcessFinished() { 573 if (progressListeners != null) { 574 Vector listeners = progressListeners; 575 int count = listeners.size(); 576 for (int i = 0; i < count; i++) { 577 ((ProgressListener) listeners.elementAt(i)).processFinished(); 578 } 579 } 580 } 581 public synchronized void removeStatusListener(StatusListener l) { 582 if (statusListeners != null && statusListeners.contains(l)) { 583 Vector v = (Vector) statusListeners.clone(); 584 v.removeElement(l); 585 statusListeners = v; 586 } 587 } 588 public synchronized void addStatusListener(StatusListener l) { 589 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone(); 590 if (!v.contains(l)) { 591 v.addElement(l); 592 statusListeners = v; 593 } 594 } 595 protected void fireStatusChanged(String e) { 596 if (statusListeners != null) { 597 Vector listeners = statusListeners; 598 int count = listeners.size(); 599 for (int i = 0; i < count; i++) { 600 ((StatusListener) listeners.elementAt(i)).statusChanged(e); 601 } 602 } 603 } 604 605 606 /* 607 private void writeObject(ObjectOutputStream oos) throws IOException { 608 Out.prln("writing batch"); 609 oos.defaultWriteObject(); 610 Out.prln("finished writing batch"); 611 } // writeObject 612 */ 613 614 } // class Batch 615 616
|
Batch |
|