1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. 24 * All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 /* 31 * sub3.c ... ALE enhancement. 32 * Since a typical Asian language has a huge character set, it is not 33 * ideal to index an array by a character code itself, which requires 34 * as large as 2**16 entries per array. 35 * To get arround this problem, we identify a set of characters that 36 * causes the same transition on all states and call it character group. 37 * Every character in a same character group has a unique number called 38 * character group id. A function yycgid(c) maps the character c (in process 39 * code) to the id. This mapping is determined by analyzing all regular 40 * expressions in the lex program. 41 * 42 */ 43 #include <stdlib.h> 44 #include <widec.h> 45 #include <search.h> 46 #include "ldefs.c" 47 48 /* 49 * "lchar" stands for linearized character. It is a variant of 50 * process code. AT&T's 16-bit process code has a drawback in which 51 * for three three process code C, D and E where C <= D <= E, 52 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 53 * In other words, four codesets alternates as the magnitude 54 * of character increases. 55 * The lchar representation holds this property: 56 * If three lchar C', D' and E' have the relationship C' < D' < E' and 57 * codeset(C') == codeset(E') then D' is guaranteed to belong to 58 * the same codeset as C' and E'. 59 * lchar is implemented as 32 bit entities and the function linearize() 60 * that maps a wchar_t to lchar is defined below. There is no 61 * reverse function for it though. 62 * The 32-bit process code by AT&T, used only for Taiwanese version at the 63 * time of wrting, has no such problem and we use it as it is. 64 */ 65 66 lchar yycgidtbl[MAXNCG] = { 67 0, /* For ease of computation of the id. */ 68 '\n', /* Newline is always special because '.' exclude it. */ 69 0x000000ff, /* The upper limit of codeset 0. */ 70 0x20ffffff, /* The upper limit of codeset 2. */ 71 0x40ffffff /* The upper limit of codeset 3. */ 72 /* 0x60ffffff The upper limit of codeset 1. */ 73 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 74 }; 75 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 76 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 77 /* returns plus 1. */ 78 79 static void setsymbol(int i); 80 81 /* 82 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 83 * 84 * wc: axxxxxxbyyyyyyy 85 * 86 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 87 * 88 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 89 * which is flagged with LONG_WCHAR_T macro. 90 * NOTE: 91 * The implementation is highly depends on the process code representation. 92 * This function should be modified when 32-bit process code is used. 93 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 94 * You can actually omit these and squeeze the xxxxxx part one bit right. 95 * We don't do that here just in sake of speed. 96 */ 97 lchar 98 linearize(wchar_t wc) 99 { 100 #ifdef LONG_WCHAR_T 101 return ((lchar)wc); /* Don't do anything. */ 102 #else 103 104 lchar prefix; 105 switch (wc&0x8080) { 106 case 0x0000: prefix = 0x00000000; break; 107 case 0x0080: prefix = 0x20000000; break; 108 case 0x8000: prefix = 0x40000000; break; 109 case 0x8080: prefix = 0x60000000; break; 110 } 111 return (prefix|wc); 112 #endif 113 } 114 115 /* compare liniear characters pointed to by pc1 and pc2 */ 116 int 117 cmplc(const void *arg1, const void *arg2) 118 { 119 lchar *pc1 = (lchar *)arg1; 120 lchar *pc2 = (lchar *)arg2; 121 122 if (*pc1 > *pc2) 123 return (1); 124 else if (*pc1 == *pc2) 125 return (0); 126 else 127 return (-1); 128 } 129 130 void 131 remch(wchar_t c) 132 { 133 lchar lc = linearize(c); 134 135 /* 136 * User-friendliness consideration: 137 * Make sure no EUC chars are used in reg. exp. 138 */ 139 if (!handleeuc) { 140 if (!isascii(c)) 141 if (iswprint(c)) 142 warning( 143 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c); 144 else warning( 145 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 146 /* In any case, we don't need to construct ncgidtbl[]. */ 147 return; 148 } 149 150 lsearch(&lc, yycgidtbl, 151 (size_t *)&ncgidtbl, sizeof (lchar), cmplc); 152 } 153 154 void 155 sortcgidtbl(void) 156 { 157 if (!handleeuc) 158 return; 159 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 160 } 161 162 /* 163 * int yycgid(wchar_t c) 164 * Takes c and returns its character group id, determind by the 165 * following algorithm. The program also uses the binary search 166 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 167 * 168 * This function computes the "character group id" based on 169 * a table yycgidtbl of which each lchar entry is pre-sorted 170 * in ascending sequence The number of valid entries is given 171 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 172 * const int YYNCGIDTBL; 173 * lchar yycgidtbl[YYNCGIDTBL]; 174 * 175 * yycgidtbl[0] is guaranteed to have zero. 176 * 177 * For given c, yycgid(c) returns: 178 * 2*i iff yycgidtbl[i] == lc 179 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 180 * YYNCGIDTBL*2-1 181 * iff yycgidtbl[YYNCGIDTBL-1] < lc 182 * where lc=linearize(c). 183 * 184 * Some interesting properties.: 185 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 186 * 2. yycgid(c) == 0 iff c == 0. 187 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 188 * yycgid(c) <= yycgid(d). 189 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 190 * linearize(c) < linearize(d). 191 */ 192 #define YYNCGIDTBL ncgidtbl 193 194 int 195 yycgid(wchar_t c) 196 { 197 int first = 0; 198 int last = YYNCGIDTBL - 1; 199 lchar lc; 200 201 /* 202 * In ASCII compat. mode, each character forms a "group" and the 203 * group-id is itself... 204 */ 205 if (!handleeuc) 206 return (c); 207 208 lc = linearize(c); 209 210 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 211 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 212 return (YYNCGIDTBL*2 - 1); 213 214 while (last >= 0) { 215 int i = (first+last)/2; 216 if (lc == yycgidtbl[i]) 217 return (2*i); /* lc exactly matches an element. */ 218 else if (yycgidtbl[i] < lc) { 219 if (lc < yycgidtbl[i+1]) 220 return (2*i+1); /* lc is in between two elements. */ 221 else 222 first = i + 1; 223 } else 224 last = i - 1; 225 } 226 error( 227 "system error in yycgid():binary search failed for c=0x%04x\n", c); 228 return (0); 229 } 230 231 /* 232 * repbycgid --- replaces each character in the parsing tree by its 233 * character group id. This, however, should be called even in 234 * the ASCII compat. mode to process DOT nodes and to call cclinter() 235 * for the DOT and CCL nodes. 236 */ 237 void 238 repbycgid(void) 239 { 240 int i, c; 241 242 for (i = 0; i < tptr; ++i) { 243 c = name[i]; 244 if (!ISOPERATOR(c)) { 245 /* If not an operator, it must be a char. */ 246 name[i] = yycgid((wchar_t)c); /* So replace it. */ 247 #ifdef DEBUG 248 if (debug) { 249 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 250 } 251 #endif 252 } else if (c == RSTR) { 253 c = right[i]; 254 right[i] = yycgid((wchar_t)c); 255 #ifdef DEBUG 256 if (debug) { 257 printf( 258 "name[%d].right:'%c'->%d;\n", i, c, right[i]); 259 } 260 #endif 261 } else if ((c == RCCL) || (c == RNCCL)) { 262 CHR cc, *s; 263 int j; 264 CHR ccltoken[CCLSIZE]; 265 CHR *ccp; 266 int m; 267 /* 268 * This node represetns a character class RE [ccccc] 269 * s points to the string of characters that forms 270 * the class and/or a special prefix notation 271 * <RANGE>XY which corresponds to the RE X-Y, 272 * characters in the range of X and Y. Here, 273 * X <= Y is guranteed. 274 * We transform these characters into a string 275 * of sorted character group ids. 276 * 277 * There is another mechanism of packing tables 278 * that is inherited from the ASCII lex. Call of 279 * cclinter() is required for this packing. 280 * This used to be done as yylex() reads the lex 281 * rules but we have to do this here because the 282 * transition table is made to work on the char-group 283 * ids and the mapping cannot be determined until 284 * the entire file is read. 285 */ 286 #ifdef DEBUG 287 if (debug) { 288 printf("name[%d]:R[N]CCL of \"", i); 289 strpt(left[i]); 290 printf(" -> {"); 291 } 292 #endif 293 /* Prepare symbol[] for cclinter(). */ 294 for (j = 0; j < ncg; ++j) 295 symbol[j] = FALSE; 296 297 s = (CHR *) left[i]; 298 while (cc = *s++) { 299 if (cc == RANGE) { 300 int low, high, i; 301 /* 302 * Special form: <RANGE>XY 303 * This means the range X-Y. 304 * We mark all symbols[] 305 * elements for yycgid(X) thru 306 * yycgid(Y), inclusively. 307 */ 308 low = yycgid(*s++); 309 high = yycgid(*s++); 310 for (i = low; i <= high; ++i) 311 setsymbol(i); 312 } else { 313 setsymbol(yycgid(cc)); 314 } 315 } 316 317 /* Now make a transformed string of cgids. */ 318 s = ccptr; 319 m = 0; 320 for (j = 0; j < ncg; ++j) 321 if (symbol[j]) { 322 ccltoken[m++] = (CHR)j; 323 #ifdef DEBUG 324 if (debug) printf("%d, ", j); 325 #endif 326 } 327 328 #ifdef DEBUG 329 if (debug) printf("}\n"); 330 #endif 331 ccltoken[m] = 0; 332 ccp = ccl; 333 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 334 ccp++; 335 if (ccp < ccptr) { /* character class found in ccl */ 336 left[i] = (int)ccp; 337 } else { /* not in ccl, add it */ 338 left[i] = (int)ccptr; 339 scopy(ccltoken, ccptr); 340 ccptr += slength(ccltoken) + 1; 341 if (ccptr > ccl + CCLSIZE) 342 error("Too many large character classes"); 343 } 344 cclinter(c == RCCL); 345 } else if (c == DOT) { 346 if (psave == 0) { /* First DOT node. */ 347 int j, nlid; 348 /* 349 * Make symbol[k]=TRUE for all k 350 * except k == yycgid('\n'). 351 */ 352 nlid = yycgid('\n'); 353 psave = ccptr; 354 for (j = 1; j < ncg; ++j) { 355 if (j == nlid) { 356 symbol[j] = FALSE; 357 } else { 358 symbol[j] = TRUE; 359 *ccptr++ = (CHR) j; 360 } 361 } 362 *ccptr++ = 0; 363 if (ccptr > ccl + CCLSIZE) 364 error("Too many large character classes"); 365 } 366 /* Mimic mn1(RCCL,psave)... */ 367 name[i] = RCCL; 368 left[i] = (int)psave; 369 cclinter(1); 370 } 371 } 372 #ifdef DEBUG 373 if (debug) { 374 printf("treedump after repbycgid().\n"); 375 treedump(); 376 } 377 #endif 378 } 379 380 static void 381 setsymbol(int i) 382 { 383 if (i > sizeof (symbol)) 384 error("setsymbol: (SYSERR) %d out of range", i); 385 symbol[i] = TRUE; 386 } 387