1 package gate.creole.tokeniser.chinesetokeniser;
2
3 import java.io.*;
4 import java.util.*;
5
6
/**
 * Dictionary-driven segmenter for Chinese text.
 *
 * <p>The segmenter walks a line one character at a time and greedily grows
 * the current word while the lexicon (or the number / foreign-character
 * sets) allows it. Whenever a word boundary is found that is not already
 * marked by whitespace in the input, a single space is inserted into the
 * output and the offset of that space in the output is recorded (see
 * {@link #getMarks()}).
 *
 * <p>The lexicon maps each entry to a status string: {@code "1"} for a
 * complete word and {@code "2"} for a string that is only known as the
 * leading part of a longer word.
 *
 * <p>Resources are read through {@code gate:} URLs. Loading is best-effort:
 * a resource that cannot be read is reported on {@code System.err} and
 * skipped, leaving the corresponding set or lexicon (partially) empty.
 */
public class Segmenter {
  /** Character form selector: traditional-character resources. */
  public final static int TRAD = 0;
  /** Character form selector: simplified-character resources. */
  public final static int SIMP = 1;
  /** Character form selector: both simplified and traditional resources. */
  public final static int BOTH = 2;

  /** Common location of every resource file read by this class. */
  private static final String RESOURCE_BASE =
      "gate:/creole/tokeniser/chinesetokeniser/";

  /** Encoding of every resource file. */
  private static final String RESOURCE_ENCODING = "UTF-8";

  /** Lexicon status of a complete word. */
  private static final String STATUS_WORD = "1";

  /** Lexicon status of a string known only as the start of a longer word. */
  private static final String STATUS_PREFIX = "2";

  /** Longest entry accepted from a lexicon file. */
  private static final int MAX_LEXICON_FILE_WORD_LENGTH = 4;

  /** Longest word for which leading substrings are registered. */
  private static final int MAX_PREFIXED_WORD_LENGTH = 6;

  /** Leading characters that {@link #stemWord(String)} may strip. */
  private static final String[] STEM_PREFIXES = new String[] {
      "\u7b2c", "\u526f", "\u4e0d"};

  /** Trailing characters that {@link #stemWord(String)} may strip. */
  private static final String[] STEM_SUFFIXES = new String[] {
      "\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
      "\u5230", "\u5185", "\u5916", "\u4eec"};

  /** Middle characters that {@link #stemWord(String)} may strip. */
  private static final String[] STEM_INFIXES = new String[] {
      "\u5f97", "\u4e0d"};

  /** Lexicon: entry -> {@link #STATUS_WORD} or {@link #STATUS_PREFIX}. */
  private TreeMap zhwords;

  /**
   * Single-character sets loaded from the resource files. Only
   * {@code cforeign} and {@code cnumbers} are consulted by this class;
   * {@code csurname} and {@code cnotname} are loaded but not read here.
   */
  private TreeSet csurname, cforeign, cnumbers, cnotname;

  /** Separator offsets recorded by the latest {@link #segmentLine} call. */
  private ArrayList marks;

  /**
   * Creates a segmenter and loads its resources.
   *
   * @param charform one of {@link #TRAD}, {@link #SIMP} or {@link #BOTH};
   *        any other value is handled like {@link #BOTH}
   * @param loadwordfile when {@code false} the lexicon starts empty and can
   *        be filled with {@link #addword(String)}
   */
  public Segmenter(int charform, boolean loadwordfile) {
    csurname = new TreeSet();
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    cnotname = new TreeSet();
    zhwords = new TreeMap();

    if (charform == SIMP) {
      loadCharacterSets("s");
    }
    else if (charform == TRAD) {
      loadCharacterSets("t");
    }
    else {
      loadCharacterSets("s");
      loadCharacterSets("t");
    }

    if (!loadwordfile) {
      return;
    }

    // An unrecognised charform uses the combined lexicon, mirroring the
    // character-set selection above (it used to end in a
    // NullPointerException because no stream was opened at all).
    String lexiconFile;
    if (charform == SIMP) {
      lexiconFile = "simplexu8.txt";
    }
    else if (charform == TRAD) {
      lexiconFile = "tradlexu8.txt";
    }
    else {
      lexiconFile = "bothlexu8.txt";
    }
    loadLexicon(RESOURCE_BASE + lexiconFile);
  }

  /**
   * Loads the four character sets whose file names start with the given
   * variant letter ({@code "s"} or {@code "t"}).
   */
  private void loadCharacterSets(String variant) {
    loadset(cnumbers, RESOURCE_BASE + variant + "numbers_u8.txt");
    loadset(cforeign, RESOURCE_BASE + variant + "foreign_u8.txt");
    loadset(csurname, RESOURCE_BASE + variant + "surname_u8.txt");
    loadset(cnotname, RESOURCE_BASE + variant + "notname_u8.txt");
  }

  /**
   * Reads a lexicon file, one entry per line. Lines containing {@code #},
   * blank lines and entries longer than four characters are skipped.
   */
  private void loadLexicon(String sourcefile) {
    InputStream worddata = null;
    try {
      worddata = new java.net.URL(sourcefile).openStream();
      BufferedReader in = new BufferedReader(
          new InputStreamReader(worddata, RESOURCE_ENCODING));
      String newword;
      while ( (newword = in.readLine()) != null) {
        if (newword.length() == 0
            || newword.length() > MAX_LEXICON_FILE_WORD_LENGTH
            || newword.indexOf('#') != -1) {
          continue;
        }
        registerWord(newword);
      }
    }
    catch (IOException e) {
      // Best effort: keep whatever was read so far, but say so.
      System.err.println("Segmenter: cannot load lexicon " + sourcefile
                         + ": " + e);
    }
    finally {
      closeQuietly(worddata);
    }
  }

  /**
   * Adds every non-empty line of {@code sourcefile} that does not contain
   * {@code #} to {@code targetset}.
   */
  private void loadset(TreeSet targetset, String sourcefile) {
    InputStream setdata = null;
    try {
      setdata = new java.net.URL(sourcefile).openStream();
      BufferedReader in = new BufferedReader(
          new InputStreamReader(setdata, RESOURCE_ENCODING));
      String dataline;
      while ( (dataline = in.readLine()) != null) {
        if (dataline.length() == 0 || dataline.indexOf('#') != -1) {
          continue;
        }
        targetset.add(dataline);
      }
    }
    catch (IOException e) {
      // Best effort: the set simply stays (partially) empty.
      System.err.println("Segmenter: cannot load character set " + sourcefile
                         + ": " + e);
    }
    finally {
      closeQuietly(setdata);
    }
  }

  /** Closes {@code stream} if it was opened; a failing close is ignored. */
  private static void closeQuietly(InputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    }
    catch (IOException ignored) {
      // Nothing useful can be done about a failed close of a read stream.
    }
  }

  /**
   * Stores {@code word} as a complete word and, for words of three to six
   * characters, stores each leading substring of two or more characters as
   * a prefix unless that substring is already in the lexicon.
   */
  private void registerWord(String word) {
    zhwords.put(word, STATUS_WORD);

    int length = word.length();
    if (length > MAX_PREFIXED_WORD_LENGTH) {
      return;
    }
    for (int end = 2; end < length; end++) {
      String lead = word.substring(0, end);
      if (!zhwords.containsKey(lead)) {
        zhwords.put(lead, STATUS_PREFIX);
      }
    }
  }

  /**
   * Tells whether every character of {@code testword} is in the number set.
   *
   * @return {@code true} for the empty string as well
   */
  public boolean isNumber(String testword) {
    for (int i = 0; i < testword.length(); i++) {
      if (!cnumbers.contains(testword.substring(i, i + 1))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether every character of {@code testword} is in the
   * foreign-character set.
   *
   * @return {@code true} for the empty string as well
   */
  public boolean isAllForeign(String testword) {
    for (int i = 0; i < testword.length(); i++) {
      if (!cforeign.contains(testword.substring(i, i + 1))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether {@code testword} contains no character from the
   * CJK Unified Ideographs block.
   */
  public boolean isNotCJK(String testword) {
    for (int i = 0; i < testword.length(); i++) {
      if (Character.UnicodeBlock.of(testword.charAt(i)) ==
          Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        return false;
      }
    }
    return true;
  }

  /**
   * Strips at most one affix from {@code word}. Tried in order:
   * <ol>
   * <li>a leading prefix character, when the word has two characters or
   *     the remainder is in the lexicon;</li>
   * <li>a trailing suffix character, under the same condition;</li>
   * <li>the middle character of a three-character word, when it is an
   *     infix character and the two outer characters together are in the
   *     lexicon.</li>
   * </ol>
   * Any lexicon entry counts, whether it is a word or only a prefix.
   *
   * @param word the word to stem
   * @return the stemmed word, or {@code word} itself when nothing applies
   *         (including when {@code word} is {@code null} or empty)
   */
  public String stemWord(String word) {
    if (word == null || word.length() == 0) {
      return word;
    }
    int length = word.length();

    String head = word.substring(0, 1);
    if (contains(STEM_PREFIXES, head)) {
      String rest = word.substring(1);
      if (length == 2 || zhwords.get(rest) != null) {
        return rest;
      }
    }

    String tail = word.substring(length - 1);
    if (contains(STEM_SUFFIXES, tail)) {
      String rest = word.substring(0, length - 1);
      if (length == 2 || zhwords.get(rest) != null) {
        return rest;
      }
    }

    if (length == 3 && contains(STEM_INFIXES, word.substring(1, 2))) {
      String outer = head + tail;
      if (zhwords.get(outer) != null) {
        return outer;
      }
    }

    return word;
  }

  /** Linear search of a small constant array. */
  private static boolean contains(String[] values, String wanted) {
    for (int i = 0; i < values.length; i++) {
      if (values[i].equals(wanted)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Segments one line of text.
   *
   * <p>Characters from the CJK Unified Ideographs block and characters in
   * the number set are grouped into words; all other characters are copied
   * through unchanged. A space is inserted at each word boundary unless the
   * input already has whitespace on the relevant side of the boundary.
   *
   * <p>NOTE: the {@code separator} argument is ignored and a single space
   * is always used. The recorded marks are computed on the assumption that
   * every inserted separator is exactly one character long.
   *
   * @param cline the text to segment; must not be {@code null}
   * @param separator ignored, see above
   * @return the segmented text
   */
  public String segmentLine(String cline, String separator) {
    separator = " ";

    int clength = cline.length();
    StringBuffer currentword = new StringBuffer();
    StringBuffer outline = new StringBuffer();
    marks = new ArrayList();

    for (int i = 0; i < clength; i++) {
      char currentchar = cline.charAt(i);

      if (!isWordCharacter(currentchar)) {
        // Anything else ends the pending word and is copied verbatim.
        if (currentword.length() > 0) {
          flushWord(outline, currentword, currentchar, i, separator);
        }
        outline.append(currentchar);
        continue;
      }

      if (currentword.length() == 0) {
        // Starting a word directly after non-whitespace text: split there.
        if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
          insertSeparator(outline, i, separator);
        }
        currentword.append(currentchar);
      }
      else if (extendsWord(cline, i, currentword.toString(), currentchar)) {
        currentword.append(currentchar);
      }
      else {
        flushWord(outline, currentword, currentchar, i, separator);
        currentword.append(currentchar);
      }
    }

    outline.append(currentword.toString());
    return outline.toString();
  }

  /** Tells whether {@code c} is a CJK ideograph or in the number set. */
  private boolean isWordCharacter(char c) {
    return Character.UnicodeBlock.of(c) ==
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
        || cnumbers.contains(String.valueOf(c));
  }

  /**
   * Decides whether the character at {@code index} continues {@code word}.
   * The rules are tried in this order:
   * <ol>
   * <li>word + character is a complete lexicon word;</li>
   * <li>word is all foreign characters, the character is foreign too, at
   *     least two more characters follow it, and the character plus its
   *     successor is not a lexicon entry;</li>
   * <li>word is all number characters and so is the character;</li>
   * <li>word + character is a known prefix and adding the next input
   *     character as well yields a lexicon entry.</li>
   * </ol>
   */
  private boolean extendsWord(String cline, int index, String word,
                              char currentchar) {
    int clength = cline.length();
    String single = String.valueOf(currentchar);
    String candidate = word + currentchar;
    Object status = zhwords.get(candidate);

    if (STATUS_WORD.equals(status)) {
      return true;
    }
    if (isAllForeign(word) && cforeign.contains(single)
        && index + 2 < clength
        && !zhwords.containsKey(cline.substring(index, index + 2))) {
      return true;
    }
    if (isNumber(word) && cnumbers.contains(single)) {
      return true;
    }
    return STATUS_PREFIX.equals(status)
        && index + 1 < clength
        && zhwords.containsKey(candidate + cline.charAt(index + 1));
  }

  /**
   * Moves the pending word to the output and empties it. A separator is
   * inserted after the word unless the character that follows it in the
   * input ({@code nextchar}, at {@code index}) is whitespace.
   */
  private void flushWord(StringBuffer outline, StringBuffer currentword,
                         char nextchar, int index, String separator) {
    outline.append(currentword.toString());
    if (!Character.isWhitespace(nextchar)) {
      insertSeparator(outline, index, separator);
    }
    currentword.setLength(0);
  }

  /**
   * Appends the separator and records where it landed: the input index
   * plus the number of separators inserted before it, which is the offset
   * of this separator in the output.
   */
  private void insertSeparator(StringBuffer outline, int index,
                               String separator) {
    marks.add(new Long(index + marks.size()));
    outline.append(separator);
  }

  /**
   * Adds a word to the lexicon. For words of three to six characters the
   * leading substrings of two or more characters are registered as
   * prefixes (unless already present) so that the segmenter can grow
   * towards the full word one character at a time. Longer words are stored
   * without prefixes.
   *
   * @param newword the word to add; must not be {@code null}
   */
  public void addword(String newword) {
    registerWord(newword);
  }

  /**
   * Returns the separator offsets recorded by the latest
   * {@link #segmentLine(String, String)} call.
   *
   * @return a list of {@code Long} values, each the offset in the segmented
   *         string of a separator inserted by the segmenter; {@code null}
   *         if no line has been segmented yet
   */
  public ArrayList getMarks() {
    return marks;
  }

  /**
   * Segments {@code fileContents} with a single space as separator.
   *
   * @param fileContents the text to segment
   * @param encoding not used
   * @return the segmented text, or the empty string when
   *         {@code fileContents} is {@code null}
   */
  public String segmentData(String fileContents, String encoding) {
    if (fileContents == null) {
      return "";
    }
    return segmentLine(fileContents, " ");
  }
}