1   /*
2    *  DocumentContentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentContentImpl.java,v 1.26 2002/01/03 12:46:44 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  
26  /** Represents the commonalities between all sorts of document contents.
27    */
28  public class DocumentContentImpl implements DocumentContent
29  {
30    /** Debug flag */
31    private static final boolean DEBUG = false;
32  
33    /** Buffer size for reading
34     *  16k is 4 times the block size on most filesystems
35     *  so it should be efficient for most cases
36     *  */
37    private static final int INTERNAL_BUFFER_SIZE  = 16*1024;
38  
39    /** Default construction */
40    public DocumentContentImpl() {
41      content = new String();
42    } // default construction
43  
44    /** Contruction from URL and offsets. */
45    public DocumentContentImpl(URL u, String encoding, Long start, Long end)
46    throws IOException {
47  
48      int readLength = 0;
49      char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
50  
51      BufferedReader uReader = null;
52      StringBuffer buf = new StringBuffer();
53      char c;
54      long s = 0, e = Long.MAX_VALUE, counter = 0;
55      if(start != null && end != null) {
56        s = start.longValue();
57        e = end.longValue();
58      }
59  
60      if(encoding != null && !encoding.equalsIgnoreCase("")) {
61        uReader = new BufferedReader(
62          new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE
63        );
64      } else {
65        uReader = new BufferedReader(
66          new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE
67        );
68      };
69  
70      // 1. skip S characters
71      uReader.skip(s);
72  
73      // 2. how many character shall I read?
74      long toRead = e - s;
75  
76      // 3. read gtom source into buffer
77      while (
78        toRead > 0 &&
79        (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
80      ) {
81        if (toRead <  readLength) {
82          //well, if toRead(long) is less than readLenght(int)
83          //then there can be no overflow, so the cast is safe
84          readLength = (int)toRead;
85        }
86  
87        buf.append(readBuffer, 0, readLength);
88        toRead -= readLength;
89      }
90  
91      // 4.close reader
92      uReader.close();
93  
94      content = new String(buf);
95      originalContent = content;
96    } // Contruction from URL and offsets
97  
98    /** Propagate changes to the document content. */
99    void edit(Long start, Long end, DocumentContent replacement)
100   {
101     int s = start.intValue(), e = end.intValue();
102     String repl = ((DocumentContentImpl) replacement).content;
103     StringBuffer newContent = new StringBuffer(content);
104     newContent.replace(s, e, repl);
105     content = newContent.toString();
106   } // edit(start,end,replacement)
107 
108   /** The contents under a particular span. */
109   public DocumentContent getContent(Long start, Long end)
110     throws InvalidOffsetException
111   {
112     if(! isValidOffsetRange(start, end))
113       throw new InvalidOffsetException();
114 
115     return new DocumentContentImpl(
116       content.substring(start.intValue(), end.intValue())
117     );
118   } // getContent(start, end)
119 
120   /** Returns the String representing the content in case of a textual document.
121     * NOTE: this is a temporary solution until we have a more generic one.
122     */
123   public String toString(){
124     return content;
125   }
126 
127   /** The size of this content (e.g. character length for textual
128     * content).
129     */
130   public Long size() {
131     return new Long(content.length());
132   } // size()
133 
134   /** Check that an offset is valid */
135   boolean isValidOffset(Long offset) {
136     if(offset == null)
137       return false;
138 
139     long o = offset.longValue();
140     long len = content.length();
141     if(o > len || o < 0)
142       return false;
143 
144     return true;
145   } // isValidOffset
146 
147   /** Check that both start and end are valid offsets and that
148     * they constitute a valid offset range
149     */
150   boolean isValidOffsetRange(Long start, Long end) {
151     return
152       isValidOffset(start) && isValidOffset(end) &&
153       start.longValue() <= end.longValue();
154   } // isValidOffsetRange(start,end)
155 
156   /** Two documents are the same if their contents is the same
157    */
158   public boolean equals(Object other) {
159     if (!(other instanceof DocumentContentImpl)) return false;
160 
161     DocumentContentImpl docImpl = (DocumentContentImpl) other;
162     return content.equals(docImpl.toString());
163   } // equals
164 
165   /** Calculate the hash value for the object. */
166   public int hashCode(){ return toString().hashCode(); }
167 
168   /** Just for now - later we have to cater for different types of
169     * content.
170     */
171   String content;
172 
173   /**
174    * For preserving the original content of the document.
175    * The edit command didn't affect on the original content.
176    * If you construct the content by URL the originalContent will keep
177    * whole information retrieved by URL even you set some start and end.
178    */
179   String originalContent;
180 
181   /**
182    * Return the original content of the document received during the loading
183    * phase or on construction from string.
184    */
185   public String getOriginalContent() { return originalContent; }
186 
187   /** For ranges */
188   public DocumentContentImpl(String s)
189     { content = s; originalContent = content; }
190 
191   /** Freeze the serialization UID. */
192   static final long serialVersionUID = -1426940535575467461L;
193 } // class DocumentContentImpl
194