1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * sub3.c ... ALE enhancement. 30 * Since a typical Asian language has a huge character set, it is not 31 * ideal to index an array by a character code itself, which requires 32 * as large as 2**16 entries per array. 33 * To get arround this problem, we identify a set of characters that 34 * causes the same transition on all states and call it character group. 35 * Every character in a same character group has a unique number called 36 * character group id. A function yycgid(c) maps the character c (in process 37 * code) to the id. This mapping is determined by analyzing all regular 38 * expressions in the lex program. 39 * 40 */ 41 #include <stdlib.h> 42 #include <widec.h> 43 #include <search.h> 44 #include "ldefs.h" 45 46 /* 47 * "lchar" stands for linearized character. It is a variant of 48 * process code. AT&T's 16-bit process code has a drawback in which 49 * for three three process code C, D and E where C <= D <= E, 50 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 51 * In other words, four codesets alternates as the magnitude 52 * of character increases. 53 * The lchar representation holds this property: 54 * If three lchar C', D' and E' have the relationship C' < D' < E' and 55 * codeset(C') == codeset(E') then D' is guaranteed to belong to 56 * the same codeset as C' and E'. 57 * lchar is implemented as 32 bit entities and the function linearize() 58 * that maps a wchar_t to lchar is defined below. There is no 59 * reverse function for it though. 60 * The 32-bit process code by AT&T, used only for Taiwanese version at the 61 * time of wrting, has no such problem and we use it as it is. 62 */ 63 64 lchar yycgidtbl[MAXNCG] = { 65 0, /* For ease of computation of the id. */ 66 '\n', /* Newline is always special because '.' exclude it. */ 67 0x000000ff, /* The upper limit of codeset 0. */ 68 0x20ffffff, /* The upper limit of codeset 2. */ 69 0x40ffffff /* The upper limit of codeset 3. */ 70 /* 0x60ffffff The upper limit of codeset 1. */ 71 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 72 }; 73 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 74 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 75 /* returns plus 1. */ 76 77 static void setsymbol(int i); 78 79 /* 80 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 81 * 82 * wc: axxxxxxbyyyyyyy 83 * 84 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 85 * 86 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 87 * which is flagged with LONG_WCHAR_T macro. 88 * NOTE: 89 * The implementation is highly depends on the process code representation. 90 * This function should be modified when 32-bit process code is used. 91 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 92 * You can actually omit these and squeeze the xxxxxx part one bit right. 93 * We don't do that here just in sake of speed. 94 */ 95 lchar 96 linearize(wchar_t wc) 97 { 98 #ifdef LONG_WCHAR_T 99 return ((lchar)wc); /* Don't do anything. */ 100 #else 101 102 lchar prefix; 103 switch (wc&0x8080) { 104 case 0x0000: prefix = 0x00000000; break; 105 case 0x0080: prefix = 0x20000000; break; 106 case 0x8000: prefix = 0x40000000; break; 107 case 0x8080: prefix = 0x60000000; break; 108 } 109 return (prefix|wc); 110 #endif 111 } 112 113 /* compare liniear characters pointed to by pc1 and pc2 */ 114 int 115 cmplc(const void *arg1, const void *arg2) 116 { 117 lchar *pc1 = (lchar *)arg1; 118 lchar *pc2 = (lchar *)arg2; 119 120 if (*pc1 > *pc2) 121 return (1); 122 else if (*pc1 == *pc2) 123 return (0); 124 else 125 return (-1); 126 } 127 128 void 129 remch(wchar_t c) 130 { 131 lchar lc = linearize(c); 132 133 /* 134 * User-friendliness consideration: 135 * Make sure no EUC chars are used in reg. exp. 136 */ 137 if (!handleeuc) { 138 if (!isascii(c)) 139 if (iswprint(c)) 140 warning( 141 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c); 142 else warning( 143 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 144 /* In any case, we don't need to construct ncgidtbl[]. */ 145 return; 146 } 147 148 lsearch(&lc, yycgidtbl, 149 (size_t *)&ncgidtbl, sizeof (lchar), cmplc); 150 } 151 152 void 153 sortcgidtbl(void) 154 { 155 if (!handleeuc) 156 return; 157 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 158 } 159 160 /* 161 * int yycgid(wchar_t c) 162 * Takes c and returns its character group id, determind by the 163 * following algorithm. The program also uses the binary search 164 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 165 * 166 * This function computes the "character group id" based on 167 * a table yycgidtbl of which each lchar entry is pre-sorted 168 * in ascending sequence The number of valid entries is given 169 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 170 * const int YYNCGIDTBL; 171 * lchar yycgidtbl[YYNCGIDTBL]; 172 * 173 * yycgidtbl[0] is guaranteed to have zero. 174 * 175 * For given c, yycgid(c) returns: 176 * 2*i iff yycgidtbl[i] == lc 177 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 178 * YYNCGIDTBL*2-1 179 * iff yycgidtbl[YYNCGIDTBL-1] < lc 180 * where lc=linearize(c). 181 * 182 * Some interesting properties.: 183 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 184 * 2. yycgid(c) == 0 iff c == 0. 185 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 186 * yycgid(c) <= yycgid(d). 187 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 188 * linearize(c) < linearize(d). 189 */ 190 #define YYNCGIDTBL ncgidtbl 191 192 int 193 yycgid(wchar_t c) 194 { 195 int first = 0; 196 int last = YYNCGIDTBL - 1; 197 lchar lc; 198 199 /* 200 * In ASCII compat. mode, each character forms a "group" and the 201 * group-id is itself... 202 */ 203 if (!handleeuc) 204 return (c); 205 206 lc = linearize(c); 207 208 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 209 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 210 return (YYNCGIDTBL*2 - 1); 211 212 while (last >= 0) { 213 int i = (first+last)/2; 214 if (lc == yycgidtbl[i]) 215 return (2*i); /* lc exactly matches an element. */ 216 else if (yycgidtbl[i] < lc) { 217 if (lc < yycgidtbl[i+1]) { 218 /* lc is in between two elements */ 219 return (2*i+1); 220 } 221 else 222 first = i + 1; 223 } else 224 last = i - 1; 225 } 226 error( 227 "system error in yycgid():binary search failed for c=0x%04x\n", c); 228 return (0); 229 } 230 231 /* 232 * repbycgid --- replaces each character in the parsing tree by its 233 * character group id. This, however, should be called even in 234 * the ASCII compat. mode to process DOT nodes and to call cclinter() 235 * for the DOT and CCL nodes. 236 */ 237 void 238 repbycgid(void) 239 { 240 int i, c; 241 242 for (i = 0; i < tptr; ++i) { 243 c = name[i]; 244 if (!ISOPERATOR(c)) { 245 /* If not an operator, it must be a char. */ 246 name[i] = yycgid((wchar_t)c); /* So replace it. */ 247 #ifdef DEBUG 248 if (debug) { 249 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 250 } 251 #endif 252 } else if (c == RSTR) { 253 c = right[i]; 254 right[i] = yycgid((wchar_t)c); 255 #ifdef DEBUG 256 if (debug) { 257 printf( 258 "name[%d].right:'%c'->%d;\n", 259 i, c, right[i]); 260 } 261 #endif 262 } else if ((c == RCCL) || (c == RNCCL)) { 263 CHR cc, *s; 264 int j; 265 CHR ccltoken[CCLSIZE]; 266 CHR *ccp; 267 int m; 268 /* 269 * This node represetns a character class RE [ccccc] 270 * s points to the string of characters that forms 271 * the class and/or a special prefix notation 272 * <RANGE>XY which corresponds to the RE X-Y, 273 * characters in the range of X and Y. Here, 274 * X <= Y is guranteed. 275 * We transform these characters into a string 276 * of sorted character group ids. 277 * 278 * There is another mechanism of packing tables 279 * that is inherited from the ASCII lex. Call of 280 * cclinter() is required for this packing. 281 * This used to be done as yylex() reads the lex 282 * rules but we have to do this here because the 283 * transition table is made to work on the char-group 284 * ids and the mapping cannot be determined until 285 * the entire file is read. 286 */ 287 #ifdef DEBUG 288 if (debug) { 289 printf("name[%d]:R[N]CCL of \"", i); 290 strpt(left[i]); 291 printf(" -> {"); 292 } 293 #endif 294 /* Prepare symbol[] for cclinter(). */ 295 for (j = 0; j < ncg; ++j) 296 symbol[j] = FALSE; 297 298 s = (CHR *) left[i]; 299 while (cc = *s++) { 300 if (cc == RANGE) { 301 int low, high, i; 302 /* 303 * Special form: <RANGE>XY 304 * This means the range X-Y. 305 * We mark all symbols[] 306 * elements for yycgid(X) thru 307 * yycgid(Y), inclusively. 308 */ 309 low = yycgid(*s++); 310 high = yycgid(*s++); 311 for (i = low; i <= high; ++i) 312 setsymbol(i); 313 } else { 314 setsymbol(yycgid(cc)); 315 } 316 } 317 318 /* Now make a transformed string of cgids. */ 319 s = ccptr; 320 m = 0; 321 for (j = 0; j < ncg; ++j) 322 if (symbol[j]) { 323 ccltoken[m++] = (CHR)j; 324 #ifdef DEBUG 325 if (debug) printf("%d, ", j); 326 #endif 327 } 328 329 #ifdef DEBUG 330 if (debug) printf("}\n"); 331 #endif 332 ccltoken[m] = 0; 333 ccp = ccl; 334 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 335 ccp++; 336 if (ccp < ccptr) { /* character class found in ccl */ 337 left[i] = (int)ccp; 338 } else { /* not in ccl, add it */ 339 left[i] = (int)ccptr; 340 scopy(ccltoken, ccptr); 341 ccptr += slength(ccltoken) + 1; 342 if (ccptr > ccl + CCLSIZE) 343 error( 344 "Too many large character classes"); 345 } 346 cclinter(c == RCCL); 347 } else if (c == DOT) { 348 if (psave == 0) { /* First DOT node. */ 349 int j, nlid; 350 /* 351 * Make symbol[k]=TRUE for all k 352 * except k == yycgid('\n'). 353 */ 354 nlid = yycgid('\n'); 355 psave = ccptr; 356 for (j = 1; j < ncg; ++j) { 357 if (j == nlid) { 358 symbol[j] = FALSE; 359 } else { 360 symbol[j] = TRUE; 361 *ccptr++ = (CHR) j; 362 } 363 } 364 *ccptr++ = 0; 365 if (ccptr > ccl + CCLSIZE) 366 error( 367 "Too many large character classes"); 368 } 369 /* Mimic mn1(RCCL,psave)... */ 370 name[i] = RCCL; 371 left[i] = (int)psave; 372 cclinter(1); 373 } 374 } 375 #ifdef DEBUG 376 if (debug) { 377 printf("treedump after repbycgid().\n"); 378 treedump(); 379 } 380 #endif 381 } 382 383 static void 384 setsymbol(int i) 385 { 386 if (i > sizeof (symbol)) 387 error("setsymbol: (SYSERR) %d out of range", i); 388 symbol[i] = TRUE; 389 } 390