|
RtfDocumentFormat |
|
1 /* 2 * RtfDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/July/2000 12 * 13 * $Id: RtfDocumentFormat.java,v 1.15 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 import gate.util.*; 23 import gate.*; 24 import gate.gui.*; 25 import gate.creole.*; 26 27 // rtf tools 28 import javax.swing.text.rtf.*; 29 import javax.swing.text.*; 30 import org.w3c.www.mime.*; 31 32 /** The format of Documents. Subclasses of DocumentFormat know about 33 * particular MIME types and how to unpack the information in any 34 * markup or formatting they contain into GATE annotations. Each MIME 35 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 36 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 37 * with a static index residing here when they are constructed. Static 38 * getDocumentFormat methods can then be used to get the appropriate 39 * format class for a particular document. 40 */ 41 public class RtfDocumentFormat extends TextualDocumentFormat{ 42 43 /** Debug flag */ 44 private static final boolean DEBUG = false; 45 46 /** Default construction */ 47 public RtfDocumentFormat() { super(); } 48 49 /** Unpack the markup in the document. This converts markup from the 50 * native format (e.g.RTF) into annotations in GATE format. 51 * Uses the markupElementsMap to determine which elements to convert, and 52 * what annotation type names to use. 53 * It always tryes to parse te doc's content. It doesn't matter if the 54 * sourceUrl is null or not. 55 * 56 * @param Document doc The gate document you want to parse. 57 * 58 */ 59 public void unpackMarkup(gate.Document doc) throws DocumentFormatException { 60 61 if ( (doc == null) || 62 (doc.getSourceUrl() == null && doc.getContent() == null)){ 63 64 throw new DocumentFormatException( 65 "GATE document is null or no content found. Nothing to parse!"); 66 }// End if 67 68 // create a RTF editor kit 69 RTFEditorKit aRtfEditorkit = new RTFEditorKit(); 70 71 // create a Styled Document 72 // NOTE that RTF Kit works only with Systled Document interface 73 StyledDocument styledDoc = new DefaultStyledDocument(); 74 75 // get an Input stream from the gate document 76 InputStream in = new ByteArrayInputStream( 77 doc.getContent().toString().getBytes() 78 ); 79 80 try { 81 aRtfEditorkit.read(in, styledDoc, 0); 82 // replace the document content with the one without markups 83 doc.setContent(new DocumentContentImpl( 84 styledDoc.getText(0,styledDoc.getLength()) 85 ) 86 ); 87 } catch (BadLocationException e) { 88 throw new DocumentFormatException(e); 89 } catch (IOException e){ 90 throw new DocumentFormatException("I/O exception for " + 91 doc.getSourceUrl().toExternalForm(),e); 92 } 93 } // unpackMarkup(doc) 94 95 /** Initialise this resource, and return it. */ 96 public Resource init() throws ResourceInstantiationException{ 97 // Register RTF mime type 98 MimeType mime = new MimeType("text","rtf"); 99 // Register the class handler for this mime type 100 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 101 this); 102 // Register the mime type with mine string 103 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 104 // Register file sufixes for this mime type 105 suffixes2mimeTypeMap.put("rtf",mime); 106 // Register magic numbers for this mime type 107 magic2mimeTypeMap.put("{\\rtf1",mime); 108 // Set the mimeType for this language resource 109 setMimeType(mime); 110 return this; 111 }// init() 112 }// class RtfDocumentFormat 113
|
RtfDocumentFormat |
|