1   /*
2    *  CookBook.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 16/Feb/2000
12   *
13   *  $Id: CookBook.java,v 1.32 2002/03/06 17:15:37 kalina Exp $
14   */
15  
16  package gate;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  import junit.framework.*;
22  
23  import gate.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.creole.tokeniser.*;
27  import gate.creole.splitter.*;
28  import gate.creole.gazetteer.*;
29  import gate.creole.orthomatcher.*;
30  
31  
32  /**
33    * <P><B>NOTE: this class has been REPLACED by the GateExamples package;
34    * see
35    * <A HREF=http://gate.ac.uk/GateExamples/doc/>http://gate.ac.uk/GateExamples/doc/</A>.</B>
36    *
37    * <P>
38    * This class provides examples of using the GATE APIs.
39    * Read this documentation along with a copy of the
40    * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source
41    * code</A>.
42    *
43    * <P>
44    * The CookBook is set up as
45    * part of the GATE test suite (using the
46    * <A HREF="http://www.junit.org/">JUnit testing framework</A>), so there's
47    * an easy way to run the examples (viz.,
48    * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method,
49    * which will invoke the
50    * JUnit test runner). Also, we can use JUnit's assert methods: e.g.
51    * <TT>assertTrue(corpus.isEmpty());</TT>
52    * tests that a corpus object is empty, and creates a test failure report if
53    * this is not the case. (To add a new test class to the suite, see the
54    * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.)
55    *
56    * <P>
57    * Programming to the GATE Java API involves manipulating the classes and
58    * interfaces in the <A HREF=package-summary.html>gate package</A>
59    * (and to a lesser extent other packages). These are
60    * often interfaces; classes there are often to do with getting
61    * access to objects that implement the interfaces (without exposing those
62    * implementations). In other words, there's a lot of interface-based design
63    * around.
64    *
65    * <P>
66    * For more details and for a conceptual view, see
67    * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing
68    * Components with GATE</A> (for which this class provides some of the
69    * examples).
70    *
71    * <P>
72    * The rest of this documentation refers to methods in the code that
73    * provide examples of using the GATE API.
74    *
75    * <P>
76    * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives
77    * an example of creating a resource via
78    * <A HREF=../gate/Factory.html>gate.Factory</A>.
79    *
80    * <P>
81    * The <A HREF=Corpus.html>Corpus interface</A> represents collections of
82    * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER
83    * <TT>Collection</TT> class).
84    *
85    * <P>
86    * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method
87    * gives an example of how to create a new transient Corpus object.
88    *
89    * <P>
90    * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives
91    * examples of adding documents to corpora.
92    *
93    * <P>
94    * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives
95    * examples of adding annotations to documents.
96    *
97    *
98    * <P>
99    * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives
100   * examples of using features. <A HREF=FeatureMap.html>The FeatureMap
101   * interface</A> is a mechanism for associating arbitrary data with GATE
102   * entities. Corpora, documents and annotations all share this
103   * mechanism. Simple feature maps use Java's Map interface.
104   *
105   *
106   * <H3>Other sources of examples</H3>
107   *
108   * <P>
109   * See also the other test classes, although note that they also use methods
110   * that are not part of the public API. Test classes include:
111   * <A HREF=corpora/TestCreole.html>TestCreole</A>;
112   * <A HREF=corpora/TestCorpus.html>TestCorpus</A>;
113   * <A HREF=corpora/TestDocument.html>TestDocument</A>;
114   * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything
115   * else starting "Test" - about 30 of them at the last count.
116   */
117 public class CookBook extends TestCase
118 {
119   /** Debug flag */
120   private static final boolean DEBUG = false;
121 
122   /** A corpus */
123   Corpus corpus = null;
124 
125   /** A document */
126   Document doc1 = null;
127 
128   /** Another document */
129   Document doc2 = null;
130 
131   /** Constructing a resource */
132   public void testResourceCreation() throws GateException {
133 
134     // before creating a resource we need a feature map to store
135     // parameter values
136     FeatureMap params = Factory.newFeatureMap();
137 
138     // to create a document we need a sourceUrlName parameter giving
139     // the location of the source for the document content
140     params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
141       Gate.getUrl("tests/doc0.html"));
142     params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME,
143       new Boolean(true));
144     Resource res = Factory.createResource("gate.corpora.DocumentImpl", params);
145 
146     // now we have a document
147     assertTrue(
148       "should be document but the class is: " + res.getClass().getName(),
149       res instanceof gate.Document
150     );
151     Document doc = (Document) res;
152     AnnotationSet markupAnnotations = doc.getAnnotations(
153                         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
154     //this is useless as doc.getAnnotations() will never return null!
155     assertNotNull("no markup annotations on doc " + doc, markupAnnotations);
156     int numMarkupAnnotations = markupAnnotations.size();
157     if(DEBUG)
158       Out.prln("annotations on doc after unpack= " + numMarkupAnnotations);
159     assertTrue(
160       "wrong number annots on doc: " + doc + numMarkupAnnotations,
161       numMarkupAnnotations == 27
162     );
163 
164   } // testResourceCreation
165 
166   /** Constructing a corpus */
167   public void testCorpusConstruction() throws GateException {
168 
169     // corpus constructors require a name
170     corpus = Factory.newCorpus("My example corpus");
171 
172     // the corpus interface inherits all the sorted set methods
173     assertTrue(corpus.isEmpty());
174 
175   } // testCorpusConstruction
176 
177   /** Adding documents to a corpus */
178   public void testAddingDocuments() throws GateException {
179 
180     corpus = Factory.newCorpus("My example corpus");
181 
182     // add a document or two....
183     corpus.add(doc1);
184     corpus.add(doc2);
185 
186     // iterate the corpus members and do some random tests
187     Iterator iter = corpus.iterator();
188     while(iter.hasNext()) {
189       Document doc = (Document) iter.next();
190       assertTrue(
191         "document url not as expected",
192         doc.getSourceUrl().toExternalForm().endsWith("doc0.html") ||
193           doc.getSourceUrl().toExternalForm().endsWith("test1.htm")
194       );
195     } // while
196 
197   } // testAddingDocuments
198 
199   /** Adding annotations to documents */
200   public void testAddingAnnotations() {
201     AnnotationSet as = doc1.getAnnotations();
202     FeatureMap fm = doc1.getFeatures();
203     Integer id;
204 
205     // during creation of annotations offsets are checked and an invalid
206     // offset exception thrown if they are invalid
207     try {
208       id = as.add(new Long(10), new Long(20), "T1", fm);
209     } catch (InvalidOffsetException e) {
210       fail(e.toString());
211     }
212   } // testAddingAnnotations
213 
214   /** Using the FeatureMap interface */
215   public void testUsingFeatures() {
216     AnnotationSet as = doc1.getAnnotations();
217     Integer id; // the id of new annotations
218 
219     // putting features on documents
220     FeatureMap fm = Factory.newFeatureMap();
221     doc1.setFeatures(fm);
222     assertTrue(fm.size() == 0);
223     fm.put("author", "segovia");
224     assertTrue(fm.get("author").equals("segovia"));
225     fm.put("author", "brendl"); // map puts overwrite existing values
226     assertTrue(fm.get("author").equals("brendl"));
227     assertTrue(fm.size() == 1);
228 
229   } // testUsingFeatures
230 
231   /** String to print when wrong command-line args */
232   private static String usage =
233     "usage: CookBook [-dir directory-name | file(s)]";
234 
235   /**
236    * Main function: an example of embedding GATE-based
237    * batch processing. The method:
238    * <UL>
239    * <LI>
240    * initialises the GATE library, and creates PRs for
241    * tokenisation, sentence splitting and part of speech tagging
242    * <LI>
243    * takes a directory name as argument (-dir option) or just a list
244    * of files
245    * <LI>
246    * creates a directory called "out" and an index.html file there
247    * <LI>
248    * for each .html file in that directory:
249    * <BR> create a GATE document from the file
250    * <BR> run the PRs on the document
251    * <BR> dump some output for the file to "out/gate__[file name].txt",
252    * and add a line to the index
253    * </UL>
254    */
255   public static void main(String[] args) throws Exception {
256     // say "hi"
257     Out.prln("CookBook.main");
258     Out.prln("processing command line arguments");
259 
260     // check we have a directory name or list of files
261     List inputFiles = null;
262     if(args.length < 1) throw new GateException(usage);
263 
264     // set up a list of all the files to process
265     if(args[0].equals("-dir")) { // list all the files in the dir
266       if(args.length < 2) throw new GateException(usage);
267       File dir = new File(args[1]);
268       File[] filesArray = dir.listFiles();
269       if(filesArray == null)
270         throw new GateException(
271           dir.getPath() + " is not a directory; " + usage
272         );
273       inputFiles = Arrays.asList(filesArray);
274 
275     } else { // all args should be file names
276       inputFiles = new ArrayList();
277       for(int i = 0; i < args.length; i++)
278         inputFiles.add(new File(args[i]));
279     }
280 
281     // did we get some file names?
282     if(inputFiles.isEmpty()) {
283       throw new GateException("No files to process!");
284     }
285 
286     // initialise GATE
287     Out.prln("initialising GATE");
288     Gate.init();
289 
290     // create some processing resources
291     Out.prln("creating PRs");
292     //create a tokeniser
293     DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource(
294                                       "gate.creole.tokeniser.DefaultTokeniser");
295     //create a sentence splitter
296     SentenceSplitter splitter = (SentenceSplitter)Factory.createResource(
297                                       "gate.creole.splitter.SentenceSplitter");
298     //create a POS tagger
299     POSTagger tagger = (POSTagger)Factory.createResource(
300                                       "gate.creole.POSTagger");
301 
302     //create  a gazetteer
303     DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource(
304                                       "gate.creole.gazetteer.DefaultGazetteer");
305 
306     //create a grammar
307     ANNIETransducer transducer = (ANNIETransducer)Factory.createResource(
308                                       "gate.creole.ANNIETransducer");
309 
310     //create an orthomatcher
311     OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
312                                 "gate.creole.orthomatcher.OrthoMatcher");
313 
314     // make the "out" directory that will contain the results.
315     String outDirName =
316       ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out";
317     if(! new File(outDirName).mkdir()){
318       throw new GateException("Could not create the output directory");
319     }
320 
321     // construct a name for the output index file; open; dump header
322     String nl = Strings.getNl(); // shorthand for platform's newline
323     String fsep =
324       Strings.getFileSep(); // shorthand for platform's file separator
325     String indexName =
326       ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html";
327     FileWriter indexWriter = new FileWriter(new File(indexName));
328     indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>");
329     indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl);
330 
331     // main loop:
332     // for each document
333     //   create a gate doc
334     //   set as the document for the PRs
335     //   run the PRs
336     //   dump output from the doc to out/gate__.....txt
337     //   delete the doc
338 
339     // loop on files list
340     Iterator filesIter = inputFiles.iterator();
341     Out.prln("looping on input files list");
342     while(filesIter.hasNext()) {
343       File inFile = (File) filesIter.next(); // the current file
344       Out.prln("processing file " + inFile.getPath());
345       FeatureMap params = Factory.newFeatureMap(); // params list for new doc
346 
347       // set the source URL parameter to a "file:..." URL string
348       params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
349         inFile.toURL().toExternalForm());
350 
351       // use the platform's default encoding rather than GATE's
352       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
353 
354       // create the document
355       Document doc = (Document) Factory.createResource(
356         "gate.corpora.DocumentImpl", params
357       );
358 
359       // set the document param on the PRs
360        tokeniser.setDocument(doc);
361        splitter.setDocument(doc);
362        tagger.setDocument(doc);
363        gazetteer.setDocument(doc);
364        transducer.setDocument(doc);
365        orthomatcher.setDocument(doc);
366 
367       // run each PR
368       tokeniser.execute();
369       splitter.execute();
370       tagger.execute();
371       gazetteer.execute();
372       transducer.execute();
373       orthomatcher.execute();
374 
375       // dump out results
376 
377       // construct a name for the output file and open a stream
378       StringBuffer outFileName = new StringBuffer(inFile.getParent());
379       outFileName.append(fsep);
380       outFileName.append("out");
381       outFileName.append(fsep);
382       outFileName.append("gate__");
383       outFileName.append(inFile.getName());
384       outFileName.append(".txt");
385       File outFile = new File(outFileName.toString());
386       FileWriter outFileWriter = new FileWriter(outFile);
387       Out.prln("dumping " + outFile.getPath());
388 
389       // iterate round the token annotations writing to the out file
390       // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens));
391       AnnotationSet tokens = doc.getAnnotations("nercAS").
392         get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
393       Iterator iter = tokens.iterator();
394       while(iter.hasNext()) {
395         Annotation token = (Annotation) iter.next();
396         FeatureMap tokFeats = token.getFeatures();
397         String tokStr = (String) tokFeats.
398           get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME);
399         String tokPos = (String) tokFeats.
400           get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
401         outFileWriter.write(tokStr + "\t" + tokPos + nl);
402       }
403       outFileWriter.write(doc.getFeatures().get("entitySet").toString());
404 
405       // close the out file stream; add an index line
406       outFileWriter.close();
407       indexWriter.write(
408         "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() +
409         "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() +
410         "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n"
411       );
412 
413       // make the doc a candidate for garbage collection
414       Out.prln("deleting gate doc");
415 
416       Factory.deleteResource(doc);
417     } // input files loop
418 
419     // finish the index file
420     indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl);
421     indexWriter.close();
422 
423     Out.prln("The End (roll credits)");
424   } // main
425 
426   /** Fixture set up: initialise members before each test method */
427   public void setUp() throws GateException, IOException {
428     corpus = Factory.newCorpus("My example corpus");
429 
430     doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html"));
431     doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm"));
432   } // setUp
433 
434   /** Construction */
435   public CookBook(String name) { super(name); }
436 
437   /** Test suite routine for the test runner */
438   public static Test suite() {
439     return new TestSuite(CookBook.class);
440   } // suite
441 
442 } // class CookBook
443