|
CookBook |
|
1 /* 2 * CookBook.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 16/Feb/2000 12 * 13 * $Id: CookBook.java,v 1.32 2002/03/06 17:15:37 kalina Exp $ 14 */ 15 16 package gate; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 import junit.framework.*; 22 23 import gate.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.creole.tokeniser.*; 27 import gate.creole.splitter.*; 28 import gate.creole.gazetteer.*; 29 import gate.creole.orthomatcher.*; 30 31 32 /** 33 * <P><B>NOTE: this class has been REPLACED by the GateExamples package; 34 * see 35 * <A HREF=http://gate.ac.uk/GateExamples/doc/>http://gate.ac.uk/GateExamples/doc/</A>.</B> 36 * 37 * <P> 38 * This class provides examples of using the GATE APIs. 39 * Read this documentation along with a copy of the 40 * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source 41 * code</A>. 42 * 43 * <P> 44 * The CookBook is set up as 45 * part of the GATE test suite (using the 46 * <A HREF="http://www.junit.org/>JUnit testing framework</A>), so there's 47 * an easy way to run the examples (viz., 48 * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method, 49 * which will invoke the 50 * JUnit test runner). Also, we can use JUnit's assert methods: e.g. 51 * <TT>assertTrue(corpus.isEmpty());</TT> 52 * tests that a corpus object is empty, and creates a test failure report if 53 * this is not the case. (To add a new test class to the suite, see the 54 * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.) 55 * 56 * <P> 57 * Programming to the GATE Java API involves manipulating the classes and 58 * interfaces in the <A HREF=package-summary.html>gate package</A> 59 * (and to a lesser extent other packages). These are 60 * often interfaces; classes there are often to do with getting 61 * access to objects that implement the interfaces (without exposing those 62 * implementations). In other words, there's a lot of interface-based design 63 * around. 64 * 65 * <P> 66 * For more details and for a conceptual view, see 67 * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing 68 * Components with GATE</A> (for which this class provides some of the 69 * examples). 70 * 71 * <P> 72 * The rest of this documentation refers to methods in the code that 73 * provide examples of using the GATE API. 74 * 75 * <P> 76 * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives 77 * an example of creating a resource via 78 * <A HREF=../gate/Factory.html>gate.Factory</A>. 79 * 80 * <P> 81 * The <A HREF=Corpus.html>Corpus interface</A> represents collections of 82 * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER 83 * <TT>Collection</TT> class). 84 * 85 * <P> 86 * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method 87 * gives an example of how to create a new transient Corpus object. 88 * 89 * <P> 90 * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives 91 * examples of adding documents to corpora. 92 * 93 * <P> 94 * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives 95 * examples of adding annotations to documents. 96 * 97 * 98 * <P> 99 * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives 100 * examples of using features. <A HREF=FeatureMap.html>The FeatureMap 101 * interface</A> is a mechanism for associating arbitrary data with GATE 102 * entities. Corpora, documents and annotations all share this 103 * mechanism. Simple feature maps use Java's Map interface. 104 * 105 * 106 * <H3>Other sources of examples</H3> 107 * 108 * <P> 109 * See also the other test classes, although note that they also use methods 110 * that are not part of the public API. Test classes include: 111 * <A HREF=corpora/TestCreole.html>TestCreole</A>; 112 * <A HREF=corpora/TestCorpus.html>TestCorpus</A>; 113 * <A HREF=corpora/TestDocument.html>TestDocument</A>; 114 * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything 115 * else starting "Test" - about 30 of them at the last count. 116 */ 117 public class CookBook extends TestCase 118 { 119 /** Debug flag */ 120 private static final boolean DEBUG = false; 121 122 /** A corpus */ 123 Corpus corpus = null; 124 125 /** A document */ 126 Document doc1 = null; 127 128 /** Another document */ 129 Document doc2 = null; 130 131 /** Constructing a resource */ 132 public void testResourceCreation() throws GateException { 133 134 // before creating a resource we need a feature map to store 135 // parameter values 136 FeatureMap params = Factory.newFeatureMap(); 137 138 // to create a document we need a sourceUrlName parameter giving 139 // the location of the source for the document content 140 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, 141 Gate.getUrl("tests/doc0.html")); 142 params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, 143 new Boolean(true)); 144 Resource res = Factory.createResource("gate.corpora.DocumentImpl", params); 145 146 // now we have a document 147 assertTrue( 148 "should be document but the class is: " + res.getClass().getName(), 149 res instanceof gate.Document 150 ); 151 Document doc = (Document) res; 152 AnnotationSet markupAnnotations = doc.getAnnotations( 153 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 154 //this is useless as doc.getAnnotations() will never return null! 155 assertNotNull("no markup annotations on doc " + doc, markupAnnotations); 156 int numMarkupAnnotations = markupAnnotations.size(); 157 if(DEBUG) 158 Out.prln("annotations on doc after unpack= " + numMarkupAnnotations); 159 assertTrue( 160 "wrong number annots on doc: " + doc + numMarkupAnnotations, 161 numMarkupAnnotations == 27 162 ); 163 164 } // testResourceCreation 165 166 /** Constructing a corpus */ 167 public void testCorpusConstruction() throws GateException { 168 169 // corpus constructors require a name 170 corpus = Factory.newCorpus("My example corpus"); 171 172 // the corpus interface inherits all the sorted set methods 173 assertTrue(corpus.isEmpty()); 174 175 } // testCorpusConstruction 176 177 /** Adding documents to a corpus */ 178 public void testAddingDocuments() throws GateException { 179 180 corpus = Factory.newCorpus("My example corpus"); 181 182 // add a document or two.... 183 corpus.add(doc1); 184 corpus.add(doc2); 185 186 // iterate the corpus members and do some random tests 187 Iterator iter = corpus.iterator(); 188 while(iter.hasNext()) { 189 Document doc = (Document) iter.next(); 190 assertTrue( 191 "document url not as expected", 192 doc.getSourceUrl().toExternalForm().endsWith("doc0.html") || 193 doc.getSourceUrl().toExternalForm().endsWith("test1.htm") 194 ); 195 } // while 196 197 } // testAddingDocuments 198 199 /** Adding annotations to documents */ 200 public void testAddingAnnotations() { 201 AnnotationSet as = doc1.getAnnotations(); 202 FeatureMap fm = doc1.getFeatures(); 203 Integer id; 204 205 // during creation of annotations offsets are checked and an invalid 206 // offset exception thrown if they are invalid 207 try { 208 id = as.add(new Long(10), new Long(20), "T1", fm); 209 } catch (InvalidOffsetException e) { 210 fail(e.toString()); 211 } 212 } // testAddingAnnotations 213 214 /** Using the FeatureMap interface */ 215 public void testUsingFeatures() { 216 AnnotationSet as = doc1.getAnnotations(); 217 Integer id; // the id of new annotations 218 219 // putting features on documents 220 FeatureMap fm = Factory.newFeatureMap(); 221 doc1.setFeatures(fm); 222 assertTrue(fm.size() == 0); 223 fm.put("author", "segovia"); 224 assertTrue(fm.get("author").equals("segovia")); 225 fm.put("author", "brendl"); // map puts overwrite existing values 226 assertTrue(fm.get("author").equals("brendl")); 227 assertTrue(fm.size() == 1); 228 229 } // testUsingFeatures 230 231 /** String to print when wrong command-line args */ 232 private static String usage = 233 "usage: CookBook [-dir directory-name | file(s)]"; 234 235 /** 236 * Main function: an example of embedding GATE-based 237 * batch processing. The method: 238 * <UL> 239 * <LI> 240 * initialises the GATE library, and creates PRs for 241 * tokenisation, sentence splitting and part of speech tagging 242 * <LI> 243 * takes a directory name as argument (-dir option) or just a list 244 * of files 245 * <LI> 246 * creates a directory called "out" and an index.html file there 247 * <LI> 248 * for each .html file in that directory: 249 * <BR> create a GATE document from the file 250 * <BR> run the PRs on the document 251 * <BR> dump some output for the file to "out/gate__[file name].txt", 252 * and add a line to the index 253 * </UL> 254 */ 255 public static void main(String[] args) throws Exception { 256 // say "hi" 257 Out.prln("CookBook.main"); 258 Out.prln("processing command line arguments"); 259 260 // check we have a directory name or list of files 261 List inputFiles = null; 262 if(args.length < 1) throw new GateException(usage); 263 264 // set up a list of all the files to process 265 if(args[0].equals("-dir")) { // list all the files in the dir 266 if(args.length < 2) throw new GateException(usage); 267 File dir = new File(args[1]); 268 File[] filesArray = dir.listFiles(); 269 if(filesArray == null) 270 throw new GateException( 271 dir.getPath() + " is not a directory; " + usage 272 ); 273 inputFiles = Arrays.asList(filesArray); 274 275 } else { // all args should be file names 276 inputFiles = new ArrayList(); 277 for(int i = 0; i < args.length; i++) 278 inputFiles.add(new File(args[i])); 279 } 280 281 // did we get some file names? 282 if(inputFiles.isEmpty()) { 283 throw new GateException("No files to process!"); 284 } 285 286 // initialise GATE 287 Out.prln("initialising GATE"); 288 Gate.init(); 289 290 // create some processing resources 291 Out.prln("creating PRs"); 292 //create a tokeniser 293 DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource( 294 "gate.creole.tokeniser.DefaultTokeniser"); 295 //create a sentence splitter 296 SentenceSplitter splitter = (SentenceSplitter)Factory.createResource( 297 "gate.creole.splitter.SentenceSplitter"); 298 //create a POS tagger 299 POSTagger tagger = (POSTagger)Factory.createResource( 300 "gate.creole.POSTagger"); 301 302 //create a gazetteer 303 DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource( 304 "gate.creole.gazetteer.DefaultGazetteer"); 305 306 //create a grammar 307 ANNIETransducer transducer = (ANNIETransducer)Factory.createResource( 308 "gate.creole.ANNIETransducer"); 309 310 //create an orthomatcher 311 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource( 312 "gate.creole.orthomatcher.OrthoMatcher"); 313 314 // make the "out" directory that will contain the results. 315 String outDirName = 316 ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out"; 317 if(! new File(outDirName).mkdir()){ 318 throw new GateException("Could not create the output directory"); 319 } 320 321 // construct a name for the output index file; open; dump header 322 String nl = Strings.getNl(); // shorthand for platform's newline 323 String fsep = 324 Strings.getFileSep(); // shorthand for platform's file separator 325 String indexName = 326 ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html"; 327 FileWriter indexWriter = new FileWriter(new File(indexName)); 328 indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>"); 329 indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl); 330 331 // main loop: 332 // for each document 333 // create a gate doc 334 // set as the document for the PRs 335 // run the PRs 336 // dump output from the doc to out/gate__.....txt 337 // delete the doc 338 339 // loop on files list 340 Iterator filesIter = inputFiles.iterator(); 341 Out.prln("looping on input files list"); 342 while(filesIter.hasNext()) { 343 File inFile = (File) filesIter.next(); // the current file 344 Out.prln("processing file " + inFile.getPath()); 345 FeatureMap params = Factory.newFeatureMap(); // params list for new doc 346 347 // set the source URL parameter to a "file:..." URL string 348 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, 349 inFile.toURL().toExternalForm()); 350 351 // use the platform's default encoding rather than GATE's 352 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 353 354 // create the document 355 Document doc = (Document) Factory.createResource( 356 "gate.corpora.DocumentImpl", params 357 ); 358 359 // set the document param on the PRs 360 tokeniser.setDocument(doc); 361 splitter.setDocument(doc); 362 tagger.setDocument(doc); 363 gazetteer.setDocument(doc); 364 transducer.setDocument(doc); 365 orthomatcher.setDocument(doc); 366 367 // run each PR 368 tokeniser.execute(); 369 splitter.execute(); 370 tagger.execute(); 371 gazetteer.execute(); 372 transducer.execute(); 373 orthomatcher.execute(); 374 375 // dump out results 376 377 // construct a name for the output file and open a stream 378 StringBuffer outFileName = new StringBuffer(inFile.getParent()); 379 outFileName.append(fsep); 380 outFileName.append("out"); 381 outFileName.append(fsep); 382 outFileName.append("gate__"); 383 outFileName.append(inFile.getName()); 384 outFileName.append(".txt"); 385 File outFile = new File(outFileName.toString()); 386 FileWriter outFileWriter = new FileWriter(outFile); 387 Out.prln("dumping " + outFile.getPath()); 388 389 // iterate round the token annotations writing to the out file 390 // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens)); 391 AnnotationSet tokens = doc.getAnnotations("nercAS"). 392 get(ANNIEConstants.TOKEN_ANNOTATION_TYPE); 393 Iterator iter = tokens.iterator(); 394 while(iter.hasNext()) { 395 Annotation token = (Annotation) iter.next(); 396 FeatureMap tokFeats = token.getFeatures(); 397 String tokStr = (String) tokFeats. 398 get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME); 399 String tokPos = (String) tokFeats. 400 get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME); 401 outFileWriter.write(tokStr + "\t" + tokPos + nl); 402 } 403 outFileWriter.write(doc.getFeatures().get("entitySet").toString()); 404 405 // close the out file stream; add an index line 406 outFileWriter.close(); 407 indexWriter.write( 408 "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() + 409 "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() + 410 "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n" 411 ); 412 413 // make the doc a candidate for garbage collection 414 Out.prln("deleting gate doc"); 415 416 Factory.deleteResource(doc); 417 } // input files loop 418 419 // finish the index file 420 indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl); 421 indexWriter.close(); 422 423 Out.prln("The End (roll credits)"); 424 } // main 425 426 /** Fixture set up: initialise members before each test method */ 427 public void setUp() throws GateException, IOException { 428 corpus = Factory.newCorpus("My example corpus"); 429 430 doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html")); 431 doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm")); 432 } // setUp 433 434 /** Construction */ 435 public CookBook(String name) { super(name); } 436 437 /** Test suite routine for the test runner */ 438 public static Test suite() { 439 return new TestSuite(CookBook.class); 440 } // suite 441 442 } // class CookBook 443
|
CookBook |
|