1   /*
2    *  RepositioningInfo.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Angel Kirilov, 04/January/2002
12   *
13   *  $Id: RepositioningInfo.java,v 1.6 2002/01/30 14:49:37 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.*;
19  import java.util.ArrayList;
20  
21  import gate.util.*;
22  
/**
 * RepositioningInfo keeps information about the correspondence of positions
 * between the original and the extracted document content. The list holds
 * {@link PositionInfo} records ordered by their original position
 * (see {@link #addPositionInfo}). With this information the correspondence
 * can be computed in the strict way (return -1 where there is no
 * correspondence) or in the "flow" way (return the nearest computable
 * position).
 */
public class RepositioningInfo extends ArrayList {

  /** Freeze the serialization UID. */
  static final long serialVersionUID = -2895662600168468559L;

  /**
   * Plain information keeper inner class. No significant functionality.
   * One record maps a piece of text in the original content to the
   * corresponding piece of text in the extracted content.
   */
  public class PositionInfo implements Serializable {

    /** Freeze the serialization UID. */
    static final long serialVersionUID = -7747351720249898499L;

    /** Data members for one piece of text information */
    private long m_origPos, m_origLength, m_currPos, m_currLength;

    /**
     * The only constructor. There are no public set methods for the data
     * members.
     *
     * @param orig position of the piece of text in the original content
     * @param origLen length of the piece of text in the original content
     * @param curr position of the piece of text in the extracted content
     * @param currLen length of the piece of text in the extracted content
     */
    public PositionInfo(long orig, long origLen, long curr, long currLen) {
      m_origPos = orig;
      m_origLength = origLen;
      m_currPos = curr;
      m_currLength = currLen;
    } // PositionInfo

    /** Position in the extracted (and probably changed) content */
    public long getCurrentPosition() {
      return m_currPos;
    } // getCurrentPosition

    /** Position in the original content */
    public long getOriginalPosition() {
      return m_origPos;
    } // getOriginalPosition

    /** Length of the piece of text in the original content */
    public long getOriginalLength() {
      return m_origLength;
    } // getOriginalLength

    /** Length of the piece of text in the extracted content */
    public long getCurrentLength() {
      return m_currLength;
    } // getCurrentLength

    /**
     * For debug purposes. The format is
     * <code>(origPos,origLength,currPos,currLength)</code>.
     */
    public String toString() {
      return "("+m_origPos+","+m_origLength+","
                +m_currPos+","+m_currLength+")";
    } // toString
  } // class PositionInfo

  /** Default constructor */
  public RepositioningInfo() {
    super();
  } // RepositioningInfo

  /** Return the element at the given index as a PositionInfo record. */
  private PositionInfo infoAt(int index) {
    return (PositionInfo) get(index);
  } // infoAt

  /**
   * Create a new position information record and insert it so that the list
   * stays ordered by original position. The new record is placed right after
   * the last record whose original position is strictly smaller, or at the
   * beginning of the list if there is no such record.
   *
   * @param origPos position in the original content
   * @param origLength length in the original content
   * @param currPos position in the extracted content
   * @param currLength length in the extracted content
   */
  public void addPositionInfo(long origPos, long origLength,
                              long currPos, long currLength) {
    // sorted add of new position - scan backwards for the insertion point
    int insertPos = 0;

    for(int i = size(); i>0; i--) {
      if(infoAt(i-1).getOriginalPosition() < origPos) {
        insertPos = i;
        break;
      } // if - sort key
    } // for

    add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength));
  } // addPositionInfo

  /**
   * Compute position in extracted content by position in the original content.
   * The end of a record is treated as part of the record (inclusive check).
   *
   * @param absPos position in the original content
   * @return the corresponding position in the extracted content, -1 if there
   * is no correspondence, or <code>absPos</code> itself if the list is empty
   */
  public long getExtractedPos(long absPos) {
    int size = size();

    // no repositioning information at all - the position is kept as it is
    if(size == 0) {
      return absPos;
    } // if

    for(int i=0; i<size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();
      long origLen = currPI.getOriginalLength();

      // the first record ending at or after absPos decides the result
      if(absPos <= origPos+origLen) {
        if(absPos < origPos) {
          // outside the range of information
          return -1;
        } // if
        // current position + offset in this PositionInfo record
        return currPI.getCurrentPosition() + absPos - origPos;
      } // if
    } // for

    // after the last repositioning info
    return -1;
  } // getExtractedPos

  /**
   * Compute position in original content by position in the extracted content.
   * Same as <code>getOriginalPos(relPos, false)</code>.
   *
   * @param relPos position in the extracted content
   * @return the corresponding position in the original content, -1 if there
   * is no correspondence, or <code>relPos</code> itself if the list is empty
   */
  public long getOriginalPos(long relPos) {
    return getOriginalPos(relPos, false);
  } // getOriginalPos

  /**
   * Compute position in original content by position in the extracted content.
   * The end of a record is not part of the record, unless
   * <code>afterChar</code> is set.
   *
   * @param relPos position in the extracted content
   * @param afterChar if true, a position lying exactly at the end of a record
   * is mapped to the end of that record in the original content
   * @return the corresponding position in the original content, -1 if there
   * is no correspondence, or <code>relPos</code> itself if the list is empty
   */
  public long getOriginalPos(long relPos, boolean afterChar) {
    int size = size();

    // no repositioning information at all - the position is kept as it is
    if(size == 0) {
      return relPos;
    } // if

    for(int i=0; i<size; ++i) {
      PositionInfo currPI = infoAt(i);
      long currPos = currPI.getCurrentPosition();
      long currLen = currPI.getCurrentLength();

      if(afterChar && relPos == currPos+currLen) {
        // exactly at the end of this record
        return currPI.getOriginalPosition() + currPI.getOriginalLength();
      } // if

      // the first record ending after relPos decides the result
      if(relPos < currPos+currLen) {
        if(relPos < currPos) {
          // outside the range of information
          return -1;
        } // if
        // original position + offset in this PositionInfo record
        return currPI.getOriginalPosition() + relPos - currPos;
      } // if
    } // for

    // after the last repositioning info
    return -1;
  } // getOriginalPos

  /** Not finished yet - always returns -1. */
  public long getExtractedPosFlow(long absPos) {
    return -1;
  } // getExtractedPosFlow

  /** Not finished yet - always returns -1. */
  public long getOriginalPosFlow(long relPos) {
    return -1;
  } // getOriginalPosFlow

  /**
   * Return the index of the position info containing the given position.
   * The end of a record is treated as part of the record.
   *
   * @param absPos position in the original content
   * @return index of the record, or -1 if there is no such record
   */
  public int getIndexByOriginalPosition(long absPos) {
    int size = size();

    // Find with the linear algorithm. Could be extended to binary search.
    for(int i=0; i<size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();
      long origLen = currPI.getOriginalLength();

      // only the first record ending at or after absPos can contain it
      if(absPos <= origPos+origLen) {
        return (absPos >= origPos) ? i : -1;
      } // if
    } // for

    return -1;
  } // getIndexByOriginalPosition

  /**
   * Return the index of the position info containing the given position
   * or the index of the record before this position.
   * The end of a record is treated as part of the record.
   *
   * @param absPos position in the original content
   * @return index of the record; -1 if the position is before the first
   * record; size() if the position is after the last record (which is also
   * the result for an empty list)
   */
  public int getIndexByOriginalPositionFlow(long absPos) {
    int size = size();

    // Find with the linear algorithm. Could be extended to binary search.
    for(int i=0; i<size; ++i) {
      PositionInfo currPI = infoAt(i);
      long origPos = currPI.getOriginalPosition();
      long origLen = currPI.getOriginalLength();

      if(absPos <= origPos+origLen) {
        // inside of the current record - return it,
        // not inside the current record - return the previous one
        return (absPos >= origPos) ? i : i-1;
      } // if
    } // for

    return size;
  } // getIndexByOriginalPositionFlow

  /**
   *  Correct the RepositioningInfo structure for shrink/expand changes.
   *  <br>
   *
   *  Normally the text pieces have the same sizes in both original text and
   *  extracted text. But in some cases there are nonlinear substitutions.
   *  For example the sequence "&amp;lt;" is converted to "&lt;".
   *  <br>
   *
   *  The correction will split the corresponding PositionInfo structure to
   *  3 new records - before correction, correction record and after correction.
   *  Front and end records are of the same manner as the original record -
   *  m_origLength == m_currLength, while the middle record has different
   *  values because of shrink/expand changes. The end record is added only
   *  if it is not empty. The split is done only when the changed range lies
   *  completely inside one record.
   *  <br>
   *
   *  The m_currPos of the record found for <code>originalPos</code> and of
   *  all records after it is corrected with (origLen - newLen) i.e.
   *  <code> m_currPos -= origLen - newLen; </code>
   *  <br>
   *
   *  If <code>originalPos</code> is after all records (or the list is empty)
   *  there is nothing to correct and the method does nothing.
   *
   *  @param originalPos Position of changed text in the original content.
   *  @param origLen Length of changed piece of text in the original content.
   *  @param newLen Length of new piece of text substituting the original piece.
   */
  public void correctInformation(long originalPos, long origLen, long newLen) {
    PositionInfo currPI;
    PositionInfo frontPI, correctPI, endPI;

    int index = getIndexByOriginalPositionFlow(originalPos);

    // correct the index when the originalPos precede all records
    if(index == -1) {
      index = 0;
    } // if

    // position is after all records in list (or the list is empty) -
    // get(index) below would be out of range and no record needs a change
    if(index == size()) {
      return;
    } // if

    // correction of all other information records
    // NOTE(review): the loop starts at the record found for originalPos, so
    // that record's own m_currPos is shifted too and the pieces created below
    // are based on the shifted value - confirm this is intended.
    for(int i=index; i<size(); ++i) {
      currPI = infoAt(i);
      currPI.m_currPos -= origLen - newLen;
    } // for

    currPI = infoAt(index);
    if(originalPos >= currPI.m_origPos
        && currPI.m_origPos + currPI.m_origLength >= originalPos + origLen) {
      long frontLen = originalPos - currPI.m_origPos;

      frontPI = new PositionInfo(currPI.m_origPos,
                              frontLen,
                              currPI.m_currPos,
                              frontLen);
      correctPI = new PositionInfo(originalPos,
                              origLen,
                              currPI.m_currPos + frontLen,
                              newLen);
      long endLen = currPI.m_origLength - frontLen - origLen;
      endPI = new PositionInfo(originalPos + origLen,
                              endLen,
                              currPI.m_currPos + frontLen + newLen,
                              endLen);

      set(index, frontPI); // substitute old element
      // the end element is inserted first, so that the middle element
      // inserted afterwards at the same index lands in front of it
      if(endPI.m_origLength != 0) {
        add(index+1, endPI); // insert new end element
      } // if
      add(index+1, correctPI); // insert middle new element
    } // if - substitution range check
  } // correctInformation

  /**
   *  Correct the original position information in the records when some text
   *  is shrunk/expanded by the parser. This method is used to correct the
   *  substitution of "\r\n" with "\n".
   *  <br>
   *
   *  All records after the record found for <code>originalPos</code> have
   *  their original position moved by <code>moveLen</code>. If
   *  <code>originalPos</code> lies strictly inside that record, the record is
   *  split in two and only the second part is moved. If
   *  <code>originalPos</code> is at or before the start of that record, the
   *  whole record is moved. If <code>originalPos</code> is after all records
   *  nothing is changed.
   *
   *  @param originalPos Position in the original content where the move
   *  takes place.
   *  @param moveLen Value added to the original positions at or after
   *  <code>originalPos</code>.
   */
  public void correctInformationOriginalMove(long originalPos, long moveLen) {
    PositionInfo currPI;

    int index = getIndexByOriginalPositionFlow(originalPos);

    // correct the index when the originalPos precede all records
    if(index == -1) {
      index = 0;
    } // if

    // position is after all records in list
    if(index == size()) {
      return;
    } // if

    for(int i = index+1; i<size(); ++i) {
      currPI = infoAt(i);
      currPI.m_origPos += moveLen;
    } // for

    currPI = infoAt(index);

    // should we split this record to two new records (inside the record)
    if(originalPos > currPI.m_origPos) {
      if(originalPos < currPI.m_origPos + currPI.m_origLength) {
        PositionInfo frontPI, endPI;
        long frontLen = originalPos - currPI.m_origPos;
        frontPI = new PositionInfo(currPI.m_origPos,
                                frontLen,
                                currPI.m_currPos,
                                frontLen);

        long endLen = currPI.m_origLength - frontLen;
        // the end part started at originalPos before the move, so after
        // the move it starts at originalPos + moveLen
        endPI = new PositionInfo(originalPos + moveLen,
                                endLen,
                                currPI.m_currPos + frontLen,
                                endLen);
        set(index, frontPI); // substitute old element
        if(endPI.m_origLength != 0) {
          add(index+1, endPI); // insert new end element
        } // if - should add this record
      } // if - inside the record
    } // if
    else {
      // correction if the position is at or before the start of the record
      currPI.m_origPos += moveLen;
    }
  } // correctInformationOriginalMove

} // class RepositioningInfo