1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * sub3.c ... ALE enhancement.
28 * Since a typical Asian language has a huge character set, it is not
29 * ideal to index an array by a character code itself, which requires
30 * as large as 2**16 entries per array.
31 * To get arround this problem, we identify a set of characters that
32 * causes the same transition on all states and call it character group.
33 * Every character in a same character group has a unique number called
34 * character group id. A function yycgid(c) maps the character c (in process
35 * code) to the id. This mapping is determined by analyzing all regular
36 * expressions in the lex program.
37 *
38 */
39 #include <stdlib.h>
40 #include <widec.h>
41 #include <search.h>
42 #include "ldefs.h"
43
44 /*
45 * "lchar" stands for linearized character. It is a variant of
46 * process code. AT&T's 16-bit process code has a drawback in which
47 * for three three process code C, D and E where C <= D <= E,
48 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
49 * In other words, four codesets alternates as the magnitude
50 * of character increases.
51 * The lchar representation holds this property:
52 * If three lchar C', D' and E' have the relationship C' < D' < E' and
53 * codeset(C') == codeset(E') then D' is guaranteed to belong to
54 * the same codeset as C' and E'.
55 * lchar is implemented as 32 bit entities and the function linearize()
56 * that maps a wchar_t to lchar is defined below. There is no
57 * reverse function for it though.
58 * The 32-bit process code by AT&T, used only for Taiwanese version at the
59 * time of wrting, has no such problem and we use it as it is.
60 */
61
62 lchar yycgidtbl[MAXNCG] = {
63 0, /* For ease of computation of the id. */
64 '\n', /* Newline is always special because '.' exclude it. */
65 0x000000ff, /* The upper limit of codeset 0. */
66 0x20ffffff, /* The upper limit of codeset 2. */
67 0x40ffffff /* The upper limit of codeset 3. */
68 /* 0x60ffffff The upper limit of codeset 1. */
69 /* Above assumes the number of significant bits of wchar_t is <= 24. */
70 };
71 int ncgidtbl = 5; /* # elements in yycgidtbl. */
72 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
73 /* returns plus 1. */
74
75 static void setsymbol(int i);
76
77 /*
78 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
79 *
80 * wc: axxxxxxbyyyyyyy
81 *
82 * returns: 0ab0000000000000axxxxxxxbyyyyyyy
83 *
84 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
85 * which is flagged with LONG_WCHAR_T macro.
86 * NOTE:
87 * The implementation is highly depends on the process code representation.
88 * This function should be modified when 32-bit process code is used.
89 * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
90 * You can actually omit these and squeeze the xxxxxx part one bit right.
91 * We don't do that here just in sake of speed.
92 */
93 lchar
linearize(wchar_t wc)94 linearize(wchar_t wc)
95 {
96 #ifdef LONG_WCHAR_T
97 return ((lchar)wc); /* Don't do anything. */
98 #else
99
100 lchar prefix;
101 switch (wc&0x8080) {
102 case 0x0000: prefix = 0x00000000; break;
103 case 0x0080: prefix = 0x20000000; break;
104 case 0x8000: prefix = 0x40000000; break;
105 case 0x8080: prefix = 0x60000000; break;
106 }
107 return (prefix|wc);
108 #endif
109 }
110
111 /* compare liniear characters pointed to by pc1 and pc2 */
112 int
cmplc(const void * arg1,const void * arg2)113 cmplc(const void *arg1, const void *arg2)
114 {
115 lchar *pc1 = (lchar *)arg1;
116 lchar *pc2 = (lchar *)arg2;
117
118 if (*pc1 > *pc2)
119 return (1);
120 else if (*pc1 == *pc2)
121 return (0);
122 else
123 return (-1);
124 }
125
126 void
remch(wchar_t c)127 remch(wchar_t c)
128 {
129 lchar lc = linearize(c);
130 size_t local_ncgidtbl;
131
132 /*
133 * User-friendliness consideration:
134 * Make sure no EUC chars are used in reg. exp.
135 */
136 if (!handleeuc) {
137 if (!isascii(c)) {
138 if (iswprint(c))
139 warning(
140 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c);
141 else warning(
142 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
143 }
144 /* In any case, we don't need to construct ncgidtbl[]. */
145 return;
146 }
147
148 /*
149 * lsearch wants ncgidtbl to be size_t, but it is int. Hence,
150 * the use of local_ncgidtbl to satisfy the calling interface.
151 */
152 local_ncgidtbl = ncgidtbl;
153 (void) lsearch(&lc, yycgidtbl,
154 &local_ncgidtbl, sizeof (lchar), cmplc);
155 ncgidtbl = (int)local_ncgidtbl;
156 }
157
158 void
sortcgidtbl(void)159 sortcgidtbl(void)
160 {
161 if (!handleeuc)
162 return;
163 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
164 }
165
166 /*
167 * int yycgid(wchar_t c)
168 * Takes c and returns its character group id, determind by the
169 * following algorithm. The program also uses the binary search
170 * algorithm, generalized from Knuth (6.2.1) Algorithm B.
171 *
172 * This function computes the "character group id" based on
173 * a table yycgidtbl of which each lchar entry is pre-sorted
174 * in ascending sequence The number of valid entries is given
175 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl.
176 * const int YYNCGIDTBL;
177 * lchar yycgidtbl[YYNCGIDTBL];
178 *
179 * yycgidtbl[0] is guaranteed to have zero.
180 *
181 * For given c, yycgid(c) returns:
182 * 2*i iff yycgidtbl[i] == lc
183 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1]
184 * YYNCGIDTBL*2-1
185 * iff yycgidtbl[YYNCGIDTBL-1] < lc
186 * where lc=linearize(c).
187 *
188 * Some interesting properties.:
189 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
190 * 2. yycgid(c) == 0 iff c == 0.
191 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then
192 * yycgid(c) <= yycgid(d).
193 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then
194 * linearize(c) < linearize(d).
195 */
196 #define YYNCGIDTBL ncgidtbl
197
198 int
yycgid(wchar_t c)199 yycgid(wchar_t c)
200 {
201 int first = 0;
202 int last = YYNCGIDTBL - 1;
203 lchar lc;
204
205 /*
206 * In ASCII compat. mode, each character forms a "group" and the
207 * group-id is itself...
208 */
209 if (!handleeuc)
210 return (c);
211
212 lc = linearize(c);
213
214 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
215 if (yycgidtbl[YYNCGIDTBL - 1] < lc)
216 return (YYNCGIDTBL*2 - 1);
217
218 while (last >= 0) {
219 int i = (first+last)/2;
220 if (lc == yycgidtbl[i])
221 return (2*i); /* lc exactly matches an element. */
222 else if (yycgidtbl[i] < lc) {
223 if (lc < yycgidtbl[i+1]) {
224 /* lc is in between two elements */
225 return (2*i+1);
226 }
227 else
228 first = i + 1;
229 } else
230 last = i - 1;
231 }
232 error(
233 "system error in yycgid():binary search failed for c=0x%04x\n", c);
234 return (0);
235 }
236
237 /*
238 * repbycgid --- replaces each character in the parsing tree by its
239 * character group id. This, however, should be called even in
240 * the ASCII compat. mode to process DOT nodes and to call cclinter()
241 * for the DOT and CCL nodes.
242 */
243 void
repbycgid(void)244 repbycgid(void)
245 {
246 int i, c;
247
248 for (i = 0; i < tptr; ++i) {
249 c = name[i];
250 if (!ISOPERATOR(c)) {
251 /* If not an operator, it must be a char. */
252 name[i] = yycgid((wchar_t)c); /* So replace it. */
253 #ifdef DEBUG
254 if (debug) {
255 printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
256 }
257 #endif
258 } else if (c == RSTR) {
259 c = right[i];
260 right[i] = yycgid((wchar_t)c);
261 #ifdef DEBUG
262 if (debug) {
263 printf(
264 "name[%d].right:'%c'->%d;\n",
265 i, c, right[i]);
266 }
267 #endif
268 } else if ((c == RCCL) || (c == RNCCL)) {
269 CHR cc, *s;
270 int j;
271 CHR ccltoken[CCLSIZE];
272 CHR *ccp;
273 int m;
274 /*
275 * This node represetns a character class RE [ccccc]
276 * s points to the string of characters that forms
277 * the class and/or a special prefix notation
278 * <RANGE>XY which corresponds to the RE X-Y,
279 * characters in the range of X and Y. Here,
280 * X <= Y is guranteed.
281 * We transform these characters into a string
282 * of sorted character group ids.
283 *
284 * There is another mechanism of packing tables
285 * that is inherited from the ASCII lex. Call of
286 * cclinter() is required for this packing.
287 * This used to be done as yylex() reads the lex
288 * rules but we have to do this here because the
289 * transition table is made to work on the char-group
290 * ids and the mapping cannot be determined until
291 * the entire file is read.
292 */
293 #ifdef DEBUG
294 if (debug) {
295 printf("name[%d]:R[N]CCL of \"", i);
296 strpt(left[i]);
297 printf(" -> {");
298 }
299 #endif
300 /* Prepare symbol[] for cclinter(). */
301 for (j = 0; j < ncg; ++j)
302 symbol[j] = FALSE;
303
304 s = (CHR *) left[i];
305 while ((cc = *s++) != 0) {
306 if (cc == RANGE) {
307 int low, high, i;
308 /*
309 * Special form: <RANGE>XY
310 * This means the range X-Y.
311 * We mark all symbols[]
312 * elements for yycgid(X) thru
313 * yycgid(Y), inclusively.
314 */
315 low = yycgid(*s++);
316 high = yycgid(*s++);
317 for (i = low; i <= high; ++i)
318 setsymbol(i);
319 } else {
320 setsymbol(yycgid(cc));
321 }
322 }
323
324 /* Now make a transformed string of cgids. */
325 s = ccptr;
326 m = 0;
327 for (j = 0; j < ncg; ++j)
328 if (symbol[j]) {
329 ccltoken[m++] = (CHR)j;
330 #ifdef DEBUG
331 if (debug) printf("%d, ", j);
332 #endif
333 }
334
335 #ifdef DEBUG
336 if (debug) printf("}\n");
337 #endif
338 ccltoken[m] = 0;
339 ccp = ccl;
340 while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
341 ccp++;
342 if (ccp < ccptr) { /* character class found in ccl */
343 left[i] = (int)ccp;
344 } else { /* not in ccl, add it */
345 left[i] = (int)ccptr;
346 scopy(ccltoken, ccptr);
347 ccptr += slength(ccltoken) + 1;
348 if (ccptr > ccl + CCLSIZE)
349 error(
350 "Too many large character classes");
351 }
352 cclinter(c == RCCL);
353 } else if (c == DOT) {
354 if (psave == 0) { /* First DOT node. */
355 int j, nlid;
356 /*
357 * Make symbol[k]=TRUE for all k
358 * except k == yycgid('\n').
359 */
360 nlid = yycgid('\n');
361 psave = ccptr;
362 for (j = 1; j < ncg; ++j) {
363 if (j == nlid) {
364 symbol[j] = FALSE;
365 } else {
366 symbol[j] = TRUE;
367 *ccptr++ = (CHR) j;
368 }
369 }
370 *ccptr++ = 0;
371 if (ccptr > ccl + CCLSIZE)
372 error(
373 "Too many large character classes");
374 }
375 /* Mimic mn1(RCCL,psave)... */
376 name[i] = RCCL;
377 left[i] = (int)psave;
378 cclinter(1);
379 }
380 }
381 #ifdef DEBUG
382 if (debug) {
383 printf("treedump after repbycgid().\n");
384 treedump();
385 }
386 #endif
387 }
388
389 static void
setsymbol(int i)390 setsymbol(int i)
391 {
392 if (i > (int)sizeof (symbol))
393 error("setsymbol: (SYSERR) %d out of range", i);
394 symbol[i] = TRUE;
395 }
396