1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * sub3.c ... ALE enhancement.
30 * Since a typical Asian language has a huge character set, it is not
31 * ideal to index an array by a character code itself, which requires
32 * as large as 2**16 entries per array.
33 * To get arround this problem, we identify a set of characters that
34 * causes the same transition on all states and call it character group.
35 * Every character in a same character group has a unique number called
36 * character group id. A function yycgid(c) maps the character c (in process
37 * code) to the id. This mapping is determined by analyzing all regular
38 * expressions in the lex program.
39 *
40 */
41 #include <stdlib.h>
42 #include <widec.h>
43 #include <search.h>
44 #include "ldefs.h"
45
46 /*
47 * "lchar" stands for linearized character. It is a variant of
48 * process code. AT&T's 16-bit process code has a drawback in which
49 * for three three process code C, D and E where C <= D <= E,
50 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
51 * In other words, four codesets alternates as the magnitude
52 * of character increases.
53 * The lchar representation holds this property:
54 * If three lchar C', D' and E' have the relationship C' < D' < E' and
55 * codeset(C') == codeset(E') then D' is guaranteed to belong to
56 * the same codeset as C' and E'.
57 * lchar is implemented as 32 bit entities and the function linearize()
58 * that maps a wchar_t to lchar is defined below. There is no
59 * reverse function for it though.
60 * The 32-bit process code by AT&T, used only for Taiwanese version at the
61 * time of wrting, has no such problem and we use it as it is.
62 */
63
64 lchar yycgidtbl[MAXNCG] = {
65 0, /* For ease of computation of the id. */
66 '\n', /* Newline is always special because '.' exclude it. */
67 0x000000ff, /* The upper limit of codeset 0. */
68 0x20ffffff, /* The upper limit of codeset 2. */
69 0x40ffffff /* The upper limit of codeset 3. */
70 /* 0x60ffffff The upper limit of codeset 1. */
71 /* Above assumes the number of significant bits of wchar_t is <= 24. */
72 };
73 int ncgidtbl = 5; /* # elements in yycgidtbl. */
74 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
75 /* returns plus 1. */
76
77 static void setsymbol(int i);
78
79 /*
80 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
81 *
82 * wc: axxxxxxbyyyyyyy
83 *
84 * returns: 0ab0000000000000axxxxxxxbyyyyyyy
85 *
86 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
87 * which is flagged with LONG_WCHAR_T macro.
88 * NOTE:
89 * The implementation is highly depends on the process code representation.
90 * This function should be modified when 32-bit process code is used.
91 * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
92 * You can actually omit these and squeeze the xxxxxx part one bit right.
93 * We don't do that here just in sake of speed.
94 */
95 lchar
linearize(wchar_t wc)96 linearize(wchar_t wc)
97 {
98 #ifdef LONG_WCHAR_T
99 return ((lchar)wc); /* Don't do anything. */
100 #else
101
102 lchar prefix;
103 switch (wc&0x8080) {
104 case 0x0000: prefix = 0x00000000; break;
105 case 0x0080: prefix = 0x20000000; break;
106 case 0x8000: prefix = 0x40000000; break;
107 case 0x8080: prefix = 0x60000000; break;
108 }
109 return (prefix|wc);
110 #endif
111 }
112
113 /* compare liniear characters pointed to by pc1 and pc2 */
114 int
cmplc(const void * arg1,const void * arg2)115 cmplc(const void *arg1, const void *arg2)
116 {
117 lchar *pc1 = (lchar *)arg1;
118 lchar *pc2 = (lchar *)arg2;
119
120 if (*pc1 > *pc2)
121 return (1);
122 else if (*pc1 == *pc2)
123 return (0);
124 else
125 return (-1);
126 }
127
128 void
remch(wchar_t c)129 remch(wchar_t c)
130 {
131 lchar lc = linearize(c);
132 size_t local_ncgidtbl;
133
134 /*
135 * User-friendliness consideration:
136 * Make sure no EUC chars are used in reg. exp.
137 */
138 if (!handleeuc) {
139 if (!isascii(c))
140 if (iswprint(c))
141 warning(
142 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c);
143 else warning(
144 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
145 /* In any case, we don't need to construct ncgidtbl[]. */
146 return;
147 }
148
149 /*
150 * lsearch wants ncgidtbl to be size_t, but it is int. Hence,
151 * the use of local_ncgidtbl to satisfy the calling interface.
152 */
153 local_ncgidtbl = ncgidtbl;
154 (void) lsearch(&lc, yycgidtbl,
155 &local_ncgidtbl, sizeof (lchar), cmplc);
156 ncgidtbl = (int)local_ncgidtbl;
157 }
158
159 void
sortcgidtbl(void)160 sortcgidtbl(void)
161 {
162 if (!handleeuc)
163 return;
164 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
165 }
166
167 /*
168 * int yycgid(wchar_t c)
169 * Takes c and returns its character group id, determind by the
170 * following algorithm. The program also uses the binary search
171 * algorithm, generalized from Knuth (6.2.1) Algorithm B.
172 *
173 * This function computes the "character group id" based on
174 * a table yycgidtbl of which each lchar entry is pre-sorted
175 * in ascending sequence The number of valid entries is given
176 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl.
177 * const int YYNCGIDTBL;
178 * lchar yycgidtbl[YYNCGIDTBL];
179 *
180 * yycgidtbl[0] is guaranteed to have zero.
181 *
182 * For given c, yycgid(c) returns:
183 * 2*i iff yycgidtbl[i] == lc
184 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1]
185 * YYNCGIDTBL*2-1
186 * iff yycgidtbl[YYNCGIDTBL-1] < lc
187 * where lc=linearize(c).
188 *
189 * Some interesting properties.:
190 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
191 * 2. yycgid(c) == 0 iff c == 0.
192 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then
193 * yycgid(c) <= yycgid(d).
194 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then
195 * linearize(c) < linearize(d).
196 */
197 #define YYNCGIDTBL ncgidtbl
198
199 int
yycgid(wchar_t c)200 yycgid(wchar_t c)
201 {
202 int first = 0;
203 int last = YYNCGIDTBL - 1;
204 lchar lc;
205
206 /*
207 * In ASCII compat. mode, each character forms a "group" and the
208 * group-id is itself...
209 */
210 if (!handleeuc)
211 return (c);
212
213 lc = linearize(c);
214
215 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
216 if (yycgidtbl[YYNCGIDTBL - 1] < lc)
217 return (YYNCGIDTBL*2 - 1);
218
219 while (last >= 0) {
220 int i = (first+last)/2;
221 if (lc == yycgidtbl[i])
222 return (2*i); /* lc exactly matches an element. */
223 else if (yycgidtbl[i] < lc) {
224 if (lc < yycgidtbl[i+1]) {
225 /* lc is in between two elements */
226 return (2*i+1);
227 }
228 else
229 first = i + 1;
230 } else
231 last = i - 1;
232 }
233 error(
234 "system error in yycgid():binary search failed for c=0x%04x\n", c);
235 return (0);
236 }
237
238 /*
239 * repbycgid --- replaces each character in the parsing tree by its
240 * character group id. This, however, should be called even in
241 * the ASCII compat. mode to process DOT nodes and to call cclinter()
242 * for the DOT and CCL nodes.
243 */
244 void
repbycgid(void)245 repbycgid(void)
246 {
247 int i, c;
248
249 for (i = 0; i < tptr; ++i) {
250 c = name[i];
251 if (!ISOPERATOR(c)) {
252 /* If not an operator, it must be a char. */
253 name[i] = yycgid((wchar_t)c); /* So replace it. */
254 #ifdef DEBUG
255 if (debug) {
256 printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
257 }
258 #endif
259 } else if (c == RSTR) {
260 c = right[i];
261 right[i] = yycgid((wchar_t)c);
262 #ifdef DEBUG
263 if (debug) {
264 printf(
265 "name[%d].right:'%c'->%d;\n",
266 i, c, right[i]);
267 }
268 #endif
269 } else if ((c == RCCL) || (c == RNCCL)) {
270 CHR cc, *s;
271 int j;
272 CHR ccltoken[CCLSIZE];
273 CHR *ccp;
274 int m;
275 /*
276 * This node represetns a character class RE [ccccc]
277 * s points to the string of characters that forms
278 * the class and/or a special prefix notation
279 * <RANGE>XY which corresponds to the RE X-Y,
280 * characters in the range of X and Y. Here,
281 * X <= Y is guranteed.
282 * We transform these characters into a string
283 * of sorted character group ids.
284 *
285 * There is another mechanism of packing tables
286 * that is inherited from the ASCII lex. Call of
287 * cclinter() is required for this packing.
288 * This used to be done as yylex() reads the lex
289 * rules but we have to do this here because the
290 * transition table is made to work on the char-group
291 * ids and the mapping cannot be determined until
292 * the entire file is read.
293 */
294 #ifdef DEBUG
295 if (debug) {
296 printf("name[%d]:R[N]CCL of \"", i);
297 strpt(left[i]);
298 printf(" -> {");
299 }
300 #endif
301 /* Prepare symbol[] for cclinter(). */
302 for (j = 0; j < ncg; ++j)
303 symbol[j] = FALSE;
304
305 s = (CHR *) left[i];
306 while (cc = *s++) {
307 if (cc == RANGE) {
308 int low, high, i;
309 /*
310 * Special form: <RANGE>XY
311 * This means the range X-Y.
312 * We mark all symbols[]
313 * elements for yycgid(X) thru
314 * yycgid(Y), inclusively.
315 */
316 low = yycgid(*s++);
317 high = yycgid(*s++);
318 for (i = low; i <= high; ++i)
319 setsymbol(i);
320 } else {
321 setsymbol(yycgid(cc));
322 }
323 }
324
325 /* Now make a transformed string of cgids. */
326 s = ccptr;
327 m = 0;
328 for (j = 0; j < ncg; ++j)
329 if (symbol[j]) {
330 ccltoken[m++] = (CHR)j;
331 #ifdef DEBUG
332 if (debug) printf("%d, ", j);
333 #endif
334 }
335
336 #ifdef DEBUG
337 if (debug) printf("}\n");
338 #endif
339 ccltoken[m] = 0;
340 ccp = ccl;
341 while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
342 ccp++;
343 if (ccp < ccptr) { /* character class found in ccl */
344 left[i] = (int)ccp;
345 } else { /* not in ccl, add it */
346 left[i] = (int)ccptr;
347 scopy(ccltoken, ccptr);
348 ccptr += slength(ccltoken) + 1;
349 if (ccptr > ccl + CCLSIZE)
350 error(
351 "Too many large character classes");
352 }
353 cclinter(c == RCCL);
354 } else if (c == DOT) {
355 if (psave == 0) { /* First DOT node. */
356 int j, nlid;
357 /*
358 * Make symbol[k]=TRUE for all k
359 * except k == yycgid('\n').
360 */
361 nlid = yycgid('\n');
362 psave = ccptr;
363 for (j = 1; j < ncg; ++j) {
364 if (j == nlid) {
365 symbol[j] = FALSE;
366 } else {
367 symbol[j] = TRUE;
368 *ccptr++ = (CHR) j;
369 }
370 }
371 *ccptr++ = 0;
372 if (ccptr > ccl + CCLSIZE)
373 error(
374 "Too many large character classes");
375 }
376 /* Mimic mn1(RCCL,psave)... */
377 name[i] = RCCL;
378 left[i] = (int)psave;
379 cclinter(1);
380 }
381 }
382 #ifdef DEBUG
383 if (debug) {
384 printf("treedump after repbycgid().\n");
385 treedump();
386 }
387 #endif
388 }
389
390 static void
setsymbol(int i)391 setsymbol(int i)
392 {
393 if (i > sizeof (symbol))
394 error("setsymbol: (SYSERR) %d out of range", i);
395 symbol[i] = TRUE;
396 }
397