1   /*
2    *  XmlPositionCorrectionHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Angel Kirilov,  4 January 2002
12   *
13   *  $Id: XmlPositionCorrectionHandler.java,v 1.2 2002/01/24 13:04:14 nasso Exp $
14   */
15  
16  package gate.xml;
17  
18  import org.xml.sax.helpers.*;
19  
20  
/**
 * Content handler that corrects a Xerces parser bug in the position reported
 * during parsing. Xerces cuts the processed file into 16K pieces, so once the
 * parser crosses a 16K border the offset passed to <code>characters()</code>
 * starts again from zero.
 *
 * The bug can be worked around by extending this content handler instead of
 * <code>org.xml.sax.helpers.DefaultHandler</code>.
 *
 * The real content handler should call <code>startDocument()</code> and
 * <code>characters()</code> of this class (e.g. via <code>super</code>) so
 * that the correct position in the file can be computed. The corrected
 * position is available through the protected data member
 * <code>m_realOffset</code> or through <code>getRealOffset()</code>.
 */
public class XmlPositionCorrectionHandler extends DefaultHandler {

  /** Size of one parser piece (16K); the reported offset wraps at this size. */
  private static final int CHUNK_SIZE = 0x4000;

  /**
   * A backward jump of the reported offset larger than this value (8K) is
   * taken as a sign that the parser moved on to the next piece.
   */
  private static final int WRAP_THRESHOLD = 0x2000;

  /** Corrected offset computed by the last <code>characters()</code> call. */
  protected long m_realOffset;

  /** Raw offset reported by the previous <code>characters()</code> call. */
  private int m_lastPosition;

  /** Number of 16K borders detected so far in the current document. */
  private int m_multiplyer;

  /** Constructor for initialization of variables */
  public XmlPositionCorrectionHandler() {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_multiplyer = 0;
  } // XmlPositionCorrectionHandler

  /**
   * Resets the correction state at the start of document parsing.
   *
   * @throws org.xml.sax.SAXException declared by the overridden method;
   *         never thrown by this implementation
   */
  public void startDocument() throws org.xml.sax.SAXException {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_multiplyer = 0;
  } // startDocument

  /**
   * Returns the corrected offset for the last <code>characters()</code> call.
   *
   * @return the corrected offset
   */
  public long getRealOffset() {
    return m_realOffset;
  } // getRealOffset

  /**
   * Here is the correction of the Xerces parser bug: updates the corrected
   * offset from the raw offset reported by the parser.
   *
   * @param text   the character buffer supplied by the parser
   * @param offset the raw start position reported by the parser
   * @param len    the number of characters reported
   * @throws org.xml.sax.SAXException declared by the overridden method;
   *         never thrown by this implementation
   */
  public void characters(char[] text, int offset, int len)
                  throws org.xml.sax.SAXException {
    if(offset == 0 && len == 1 && text.length <= 2) {
        // unicode char or &xxx; coding - delivered in its own tiny buffer,
        // so the reported offset carries no position information
        return;
    } // if

    // There is 16K limit for offset. Here is the correction.
    // Will catch the bug in most cases: a large backward jump of the raw
    // offset is treated as a crossing of a 16K border.
    if(m_lastPosition - offset > WRAP_THRESHOLD) {
        m_multiplyer++;
    }
    m_lastPosition = offset;
    // Widen to long BEFORE multiplying; an int product would overflow for
    // documents of 2G characters or more and yield a negative offset.
    m_realOffset = (long) m_multiplyer * CHUNK_SIZE + offset;
  } // characters

} // XmlPositionCorrectionHandler