1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * sub3.c ... ALE enhancement. 30 * Since a typical Asian language has a huge character set, it is not 31 * ideal to index an array by a character code itself, which requires 32 * as large as 2**16 entries per array. 33 * To get arround this problem, we identify a set of characters that 34 * causes the same transition on all states and call it character group. 35 * Every character in a same character group has a unique number called 36 * character group id. A function yycgid(c) maps the character c (in process 37 * code) to the id. This mapping is determined by analyzing all regular 38 * expressions in the lex program. 39 * 40 */ 41 #include <stdlib.h> 42 #include <widec.h> 43 #include <search.h> 44 #include "ldefs.h" 45 46 /* 47 * "lchar" stands for linearized character. It is a variant of 48 * process code. AT&T's 16-bit process code has a drawback in which 49 * for three three process code C, D and E where C <= D <= E, 50 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 51 * In other words, four codesets alternates as the magnitude 52 * of character increases. 53 * The lchar representation holds this property: 54 * If three lchar C', D' and E' have the relationship C' < D' < E' and 55 * codeset(C') == codeset(E') then D' is guaranteed to belong to 56 * the same codeset as C' and E'. 57 * lchar is implemented as 32 bit entities and the function linearize() 58 * that maps a wchar_t to lchar is defined below. There is no 59 * reverse function for it though. 60 * The 32-bit process code by AT&T, used only for Taiwanese version at the 61 * time of wrting, has no such problem and we use it as it is. 62 */ 63 64 lchar yycgidtbl[MAXNCG] = { 65 0, /* For ease of computation of the id. */ 66 '\n', /* Newline is always special because '.' exclude it. */ 67 0x000000ff, /* The upper limit of codeset 0. */ 68 0x20ffffff, /* The upper limit of codeset 2. */ 69 0x40ffffff /* The upper limit of codeset 3. */ 70 /* 0x60ffffff The upper limit of codeset 1. */ 71 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 72 }; 73 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 74 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 75 /* returns plus 1. */ 76 77 static void setsymbol(int i); 78 79 /* 80 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 81 * 82 * wc: axxxxxxbyyyyyyy 83 * 84 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 85 * 86 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 87 * which is flagged with LONG_WCHAR_T macro. 88 * NOTE: 89 * The implementation is highly depends on the process code representation. 90 * This function should be modified when 32-bit process code is used. 91 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 92 * You can actually omit these and squeeze the xxxxxx part one bit right. 93 * We don't do that here just in sake of speed. 94 */ 95 lchar 96 linearize(wchar_t wc) 97 { 98 #ifdef LONG_WCHAR_T 99 return ((lchar)wc); /* Don't do anything. */ 100 #else 101 102 lchar prefix; 103 switch (wc&0x8080) { 104 case 0x0000: prefix = 0x00000000; break; 105 case 0x0080: prefix = 0x20000000; break; 106 case 0x8000: prefix = 0x40000000; break; 107 case 0x8080: prefix = 0x60000000; break; 108 } 109 return (prefix|wc); 110 #endif 111 } 112 113 /* compare liniear characters pointed to by pc1 and pc2 */ 114 int 115 cmplc(const void *arg1, const void *arg2) 116 { 117 lchar *pc1 = (lchar *)arg1; 118 lchar *pc2 = (lchar *)arg2; 119 120 if (*pc1 > *pc2) 121 return (1); 122 else if (*pc1 == *pc2) 123 return (0); 124 else 125 return (-1); 126 } 127 128 void 129 remch(wchar_t c) 130 { 131 lchar lc = linearize(c); 132 size_t local_ncgidtbl; 133 134 /* 135 * User-friendliness consideration: 136 * Make sure no EUC chars are used in reg. exp. 137 */ 138 if (!handleeuc) { 139 if (!isascii(c)) 140 if (iswprint(c)) 141 warning( 142 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c); 143 else warning( 144 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 145 /* In any case, we don't need to construct ncgidtbl[]. */ 146 return; 147 } 148 149 /* 150 * lsearch wants ncgidtbl to be size_t, but it is int. Hence, 151 * the use of local_ncgidtbl to satisfy the calling interface. 152 */ 153 local_ncgidtbl = ncgidtbl; 154 (void) lsearch(&lc, yycgidtbl, 155 &local_ncgidtbl, sizeof (lchar), cmplc); 156 ncgidtbl = (int)local_ncgidtbl; 157 } 158 159 void 160 sortcgidtbl(void) 161 { 162 if (!handleeuc) 163 return; 164 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 165 } 166 167 /* 168 * int yycgid(wchar_t c) 169 * Takes c and returns its character group id, determind by the 170 * following algorithm. The program also uses the binary search 171 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 172 * 173 * This function computes the "character group id" based on 174 * a table yycgidtbl of which each lchar entry is pre-sorted 175 * in ascending sequence The number of valid entries is given 176 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 177 * const int YYNCGIDTBL; 178 * lchar yycgidtbl[YYNCGIDTBL]; 179 * 180 * yycgidtbl[0] is guaranteed to have zero. 181 * 182 * For given c, yycgid(c) returns: 183 * 2*i iff yycgidtbl[i] == lc 184 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 185 * YYNCGIDTBL*2-1 186 * iff yycgidtbl[YYNCGIDTBL-1] < lc 187 * where lc=linearize(c). 188 * 189 * Some interesting properties.: 190 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 191 * 2. yycgid(c) == 0 iff c == 0. 192 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 193 * yycgid(c) <= yycgid(d). 194 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 195 * linearize(c) < linearize(d). 196 */ 197 #define YYNCGIDTBL ncgidtbl 198 199 int 200 yycgid(wchar_t c) 201 { 202 int first = 0; 203 int last = YYNCGIDTBL - 1; 204 lchar lc; 205 206 /* 207 * In ASCII compat. mode, each character forms a "group" and the 208 * group-id is itself... 209 */ 210 if (!handleeuc) 211 return (c); 212 213 lc = linearize(c); 214 215 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 216 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 217 return (YYNCGIDTBL*2 - 1); 218 219 while (last >= 0) { 220 int i = (first+last)/2; 221 if (lc == yycgidtbl[i]) 222 return (2*i); /* lc exactly matches an element. */ 223 else if (yycgidtbl[i] < lc) { 224 if (lc < yycgidtbl[i+1]) { 225 /* lc is in between two elements */ 226 return (2*i+1); 227 } 228 else 229 first = i + 1; 230 } else 231 last = i - 1; 232 } 233 error( 234 "system error in yycgid():binary search failed for c=0x%04x\n", c); 235 return (0); 236 } 237 238 /* 239 * repbycgid --- replaces each character in the parsing tree by its 240 * character group id. This, however, should be called even in 241 * the ASCII compat. mode to process DOT nodes and to call cclinter() 242 * for the DOT and CCL nodes. 243 */ 244 void 245 repbycgid(void) 246 { 247 int i, c; 248 249 for (i = 0; i < tptr; ++i) { 250 c = name[i]; 251 if (!ISOPERATOR(c)) { 252 /* If not an operator, it must be a char. */ 253 name[i] = yycgid((wchar_t)c); /* So replace it. */ 254 #ifdef DEBUG 255 if (debug) { 256 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 257 } 258 #endif 259 } else if (c == RSTR) { 260 c = right[i]; 261 right[i] = yycgid((wchar_t)c); 262 #ifdef DEBUG 263 if (debug) { 264 printf( 265 "name[%d].right:'%c'->%d;\n", 266 i, c, right[i]); 267 } 268 #endif 269 } else if ((c == RCCL) || (c == RNCCL)) { 270 CHR cc, *s; 271 int j; 272 CHR ccltoken[CCLSIZE]; 273 CHR *ccp; 274 int m; 275 /* 276 * This node represetns a character class RE [ccccc] 277 * s points to the string of characters that forms 278 * the class and/or a special prefix notation 279 * <RANGE>XY which corresponds to the RE X-Y, 280 * characters in the range of X and Y. Here, 281 * X <= Y is guranteed. 282 * We transform these characters into a string 283 * of sorted character group ids. 284 * 285 * There is another mechanism of packing tables 286 * that is inherited from the ASCII lex. Call of 287 * cclinter() is required for this packing. 288 * This used to be done as yylex() reads the lex 289 * rules but we have to do this here because the 290 * transition table is made to work on the char-group 291 * ids and the mapping cannot be determined until 292 * the entire file is read. 293 */ 294 #ifdef DEBUG 295 if (debug) { 296 printf("name[%d]:R[N]CCL of \"", i); 297 strpt(left[i]); 298 printf(" -> {"); 299 } 300 #endif 301 /* Prepare symbol[] for cclinter(). */ 302 for (j = 0; j < ncg; ++j) 303 symbol[j] = FALSE; 304 305 s = (CHR *) left[i]; 306 while (cc = *s++) { 307 if (cc == RANGE) { 308 int low, high, i; 309 /* 310 * Special form: <RANGE>XY 311 * This means the range X-Y. 312 * We mark all symbols[] 313 * elements for yycgid(X) thru 314 * yycgid(Y), inclusively. 315 */ 316 low = yycgid(*s++); 317 high = yycgid(*s++); 318 for (i = low; i <= high; ++i) 319 setsymbol(i); 320 } else { 321 setsymbol(yycgid(cc)); 322 } 323 } 324 325 /* Now make a transformed string of cgids. */ 326 s = ccptr; 327 m = 0; 328 for (j = 0; j < ncg; ++j) 329 if (symbol[j]) { 330 ccltoken[m++] = (CHR)j; 331 #ifdef DEBUG 332 if (debug) printf("%d, ", j); 333 #endif 334 } 335 336 #ifdef DEBUG 337 if (debug) printf("}\n"); 338 #endif 339 ccltoken[m] = 0; 340 ccp = ccl; 341 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 342 ccp++; 343 if (ccp < ccptr) { /* character class found in ccl */ 344 left[i] = (int)ccp; 345 } else { /* not in ccl, add it */ 346 left[i] = (int)ccptr; 347 scopy(ccltoken, ccptr); 348 ccptr += slength(ccltoken) + 1; 349 if (ccptr > ccl + CCLSIZE) 350 error( 351 "Too many large character classes"); 352 } 353 cclinter(c == RCCL); 354 } else if (c == DOT) { 355 if (psave == 0) { /* First DOT node. */ 356 int j, nlid; 357 /* 358 * Make symbol[k]=TRUE for all k 359 * except k == yycgid('\n'). 360 */ 361 nlid = yycgid('\n'); 362 psave = ccptr; 363 for (j = 1; j < ncg; ++j) { 364 if (j == nlid) { 365 symbol[j] = FALSE; 366 } else { 367 symbol[j] = TRUE; 368 *ccptr++ = (CHR) j; 369 } 370 } 371 *ccptr++ = 0; 372 if (ccptr > ccl + CCLSIZE) 373 error( 374 "Too many large character classes"); 375 } 376 /* Mimic mn1(RCCL,psave)... */ 377 name[i] = RCCL; 378 left[i] = (int)psave; 379 cclinter(1); 380 } 381 } 382 #ifdef DEBUG 383 if (debug) { 384 printf("treedump after repbycgid().\n"); 385 treedump(); 386 } 387 #endif 388 } 389 390 static void 391 setsymbol(int i) 392 { 393 if (i > sizeof (symbol)) 394 error("setsymbol: (SYSERR) %d out of range", i); 395 symbol[i] = TRUE; 396 } 397