|
TextualDocumentFormat |
|
1 /* 2 * TextualDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: TextualDocumentFormat.java,v 1.20 2002/01/28 14:25:09 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 21 import gate.util.*; 22 import gate.*; 23 import gate.creole.*; 24 25 import org.w3c.www.mime.*; 26 27 /** The format of Documents. Subclasses of DocumentFormat know about 28 * particular MIME types and how to unpack the information in any 29 * markup or formatting they contain into GATE annotations. Each MIME 30 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 31 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 32 * with a static index residing here when they are constructed. Static 33 * getDocumentFormat methods can then be used to get the appropriate 34 * format class for a particular document. 35 */ 36 public class TextualDocumentFormat extends DocumentFormat 37 { 38 39 /** Debug flag */ 40 private static final boolean DEBUG = false; 41 42 /** Default construction */ 43 public TextualDocumentFormat() { super(); } 44 45 /** Initialise this resource, and return it. */ 46 public Resource init() throws ResourceInstantiationException{ 47 // Register plain text mime type 48 MimeType mime = new MimeType("text","plain"); 49 // Register the class handler for this mime type 50 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 51 this); 52 // Register the mime type with mine string 53 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 54 // Register file sufixes for this mime type 55 suffixes2mimeTypeMap.put("txt",mime); 56 suffixes2mimeTypeMap.put("text",mime); 57 // Set the mimeType for this language resource 58 setMimeType(mime); 59 return this; 60 } // init() 61 62 /** Unpack the markup in the document. This converts markup from the 63 * native format (e.g. XML, RTF) into annotations in GATE format. 64 * Uses the markupElementsMap to determine which elements to convert, and 65 * what annotation type names to use. 66 */ 67 public void unpackMarkup(Document doc) throws DocumentFormatException{ 68 if (doc == null || doc.getContent() == null) return; 69 // Create paragraph annotations in the specified annotation set 70 int endOffset = doc.getContent().toString().length(); 71 int startOffset = 0; 72 annotateParagraphs(doc,startOffset,endOffset, 73 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 74 }//unpackMarkup 75 76 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 77 RepositioningInfo ampCodingInfo) 78 throws DocumentFormatException { 79 unpackMarkup(doc); 80 } // unpackMarkup 81 82 83 /** This method annotates paragraphs in a GATE document. The investigated text 84 * spans beetween start and end offsets and the paragraph annotations are 85 * created in the annotSetName. If annotSetName is null then they are creted 86 * in the default annotation set. 87 * @param aDoc is the gate document on which the paragraph detection would 88 * be performed.If it is null or its content it's null then the method woul 89 * simply return doing nothing. 90 * @param startOffset is the index form the document content from which the 91 * paragraph detection will start 92 * @param endOffset is the offset where the detection will end. 93 * @param annotSetName is the name of the set in which paragraph annotation 94 * would be created.The annotation type created will be "paragraph" 95 */ 96 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset, 97 String annotSetName)throws DocumentFormatException{ 98 // Simply return if the document is null or its content 99 if (aDoc == null || aDoc.getContent() == null) return; 100 // Simply return if the start is > than the end 101 if (startOffset > endOffset) return; 102 // Decide where to put the newly detected annotations 103 AnnotationSet annotSet = null; 104 if (annotSetName == null) 105 annotSet = aDoc.getAnnotations(); 106 else 107 annotSet = aDoc.getAnnotations(annotSetName); 108 // Extract the document content 109 String content = aDoc.getContent().toString(); 110 // This is the offset marking the start of a para 111 int startOffsetPara = startOffset; 112 // This marks the ned of a para 113 int endOffsetPara = endOffset; 114 // The initial sate of the FSA 115 int state = 1; 116 // This field marks that a BR entity was read 117 // A BR entity can be NL or NL CR, depending on the operating system (UNIX 118 // or DOS) 119 boolean readBR = false; 120 int index = startOffset; 121 while (index < endOffset){ 122 // Read the current char 123 char ch = content.charAt(index); 124 // Test if a BR entity was read 125 if (ch =='\n'){ 126 readBR = true; 127 // If \n is followed by a \r then advance the index in order to read a 128 // BR entity 129 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r')) 130 index ++; 131 }// End if 132 switch(state){ 133 // It is the initial and also a final state 134 // Stay in state 1 while it reads whitespaces 135 case 1:{ 136 // If reads a non whitespace char then move to state 2 and record 137 // the beggining of a paragraph 138 if (!Character.isWhitespace(ch)){ 139 state = 2; 140 startOffsetPara = index; 141 }// End if 142 }break; 143 // It can be also a final state. 144 case 2:{ 145 // Stay in state 2 while reading chars != BR entities 146 if (readBR){ 147 // If you find a BR char go to state 3. The possible end of the para 148 // can be index. This will be confirmed by state 3. So, this is why 149 // the end of a para is recorded here. 150 readBR = false; 151 endOffsetPara = index; 152 state = 3; 153 }// End if 154 }break; 155 // It can be also a final state 156 // From state 3 there are only 2 possible ways: (state 2 or state1) 157 // In state 1 it needs to read a BR 158 // For state 2 it nead to read something different then a BR 159 case 3:{ 160 if (readBR){ 161 // A BR was read. Go to state 1 162 readBR = false; 163 state = 1; 164 // Create an annotation type paragraph 165 try{ 166 annotSet.add( new Long(startOffsetPara), 167 new Long(endOffsetPara), 168 "paragraph", 169 Factory.newFeatureMap()); 170 } catch (gate.util.InvalidOffsetException ioe){ 171 throw new DocumentFormatException("Coudn't create a paragraph"+ 172 " annotation",ioe); 173 }// End try 174 }else{ 175 // Go to state 2 an keep reading chars 176 state = 2; 177 }// End if 178 }break; 179 }// End switch 180 // Prepare to read the next char. 181 index ++; 182 }// End while 183 endOffsetPara = index; 184 // Investigate where the finite automata has stoped 185 if ( state==2 || state==3 ){ 186 // Create an annotation type paragraph 187 try{ 188 annotSet.add( new Long(startOffsetPara), 189 // Create the final annotation using the endOffset 190 new Long(endOffsetPara), 191 "paragraph", 192 Factory.newFeatureMap()); 193 } catch (gate.util.InvalidOffsetException ioe){ 194 throw new DocumentFormatException("Coudn't create a paragraph"+ 195 " annotation",ioe); 196 }// End try 197 }// End if 198 }// End annotateParagraphs(); 199 200 public DataStore getDataStore(){ return null;} 201 202 } // class TextualDocumentFormat 203
|
TextualDocumentFormat |
|