RtfDocumentFormat.java |
/*
 *  RtfDocumentFormat.java
 *
 *  Copyright (c) 1998-2004, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/July/2000
 *
 *  $Id: RtfDocumentFormat.java,v 1.19 2004/07/21 17:10:03 akshay Exp $
 */

package gate.corpora;

import java.io.*;

import javax.swing.text.*;
import javax.swing.text.rtf.RTFEditorKit;

import gate.Resource;
import gate.creole.ResourceInstantiationException;
import gate.util.DocumentFormatException;
//import org.w3c.www.mime.*;

/** Document format handler for RTF documents (MIME type "text/rtf").
  * {@link #unpackMarkup(gate.Document)} parses the document content with
  * Swing's {@link RTFEditorKit} and replaces it with the extracted plain
  * text. {@link #init()} registers the MIME type, the "rtf" file suffix and
  * the RTF magic number in the lookup maps (declared outside this file) so
  * that this handler can be found for RTF documents.
  */
public class RtfDocumentFormat extends TextualDocumentFormat{

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Default construction */
  public RtfDocumentFormat() { super(); }

  /** Unpack the markup in the document. The document content is parsed as
    * RTF and then replaced with the plain text recovered by the parser.
    * This implementation only replaces the content; it does not create any
    * annotations.
    * It always parses the document's content, whether or not the source URL
    * is set.
    *
    * @param doc The gate document you want to parse.
    * @throws DocumentFormatException if <code>doc</code> is null, if it has
    * no content, or if the RTF parser reports an I/O or location error.
    */
  public void unpackMarkup(gate.Document doc) throws DocumentFormatException {

    // The content is what gets parsed below, so it must be present even when
    // a source URL is available; otherwise we would fail with a
    // NullPointerException instead of a meaningful format exception.
    if (doc == null || doc.getContent() == null){
      throw new DocumentFormatException(
        "GATE document is null or no content found. Nothing to parse!");
    }// End if

    // create a RTF editor kit
    RTFEditorKit aRtfEditorkit = new RTFEditorKit();

    // create a Styled Document
    // NOTE that the RTF kit works only with the StyledDocument interface
    StyledDocument styledDoc = new DefaultStyledDocument();

    // get an Input stream from the gate document
    // NOTE(review): getBytes() encodes with the platform default charset;
    // content outside 7-bit ASCII may therefore be parsed differently on
    // different platforms - confirm the intended encoding.
    InputStream in = new ByteArrayInputStream(
                                    doc.getContent().toString().getBytes()
                                    );

    try {
      aRtfEditorkit.read(in, styledDoc, 0);
      // replace the document content with the one without markups
      doc.setContent(new DocumentContentImpl(
                              styledDoc.getText(0,styledDoc.getLength())
                              )
      );
    } catch (BadLocationException e) {
      throw new DocumentFormatException(e);
    } catch (IOException e){
      // The source URL is optional, so guard against null here; otherwise a
      // NullPointerException would hide the real I/O failure.
      String source = (doc.getSourceUrl() == null)
                        ? "a document with no source URL"
                        : doc.getSourceUrl().toExternalForm();
      throw new DocumentFormatException("I/O exception for " + source, e);
    }
  } // unpackMarkup(doc)

  /** Initialise this resource, and return it.
    * Registers the "text/rtf" MIME type together with this handler, the
    * "rtf" file suffix and the RTF magic number, then sets the MIME type of
    * this resource.
    *
    * @return this instance.
    * @throws ResourceInstantiationException declared by the signature; the
    * code in this method does not throw it directly.
    */
  public Resource init() throws ResourceInstantiationException{
    // Register RTF mime type
    MimeType mime = new MimeType("text","rtf");
    // Key shared by both registrations below: "type/subtype"
    String mimeString = mime.getType() + "/" + mime.getSubtype();
    // Register the class handler for this mime type
    mimeString2ClassHandlerMap.put(mimeString, this);
    // Register the mime type with mime string
    mimeString2mimeTypeMap.put(mimeString, mime);
    // Register file suffixes for this mime type
    suffixes2mimeTypeMap.put("rtf",mime);
    // Register magic numbers for this mime type
    magic2mimeTypeMap.put("{\\rtf1",mime);
    // Set the mimeType for this language resource
    setMimeType(mime);
    return this;
  }// init()
}// class RtfDocumentFormat