1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * sub3.c ... ALE enhancement. 28 * Since a typical Asian language has a huge character set, it is not 29 * ideal to index an array by a character code itself, which requires 30 * as large as 2**16 entries per array. 31 * To get arround this problem, we identify a set of characters that 32 * causes the same transition on all states and call it character group. 33 * Every character in a same character group has a unique number called 34 * character group id. A function yycgid(c) maps the character c (in process 35 * code) to the id. This mapping is determined by analyzing all regular 36 * expressions in the lex program. 37 * 38 */ 39 #include <stdlib.h> 40 #include <widec.h> 41 #include <search.h> 42 #include "ldefs.h" 43 44 /* 45 * "lchar" stands for linearized character. It is a variant of 46 * process code. AT&T's 16-bit process code has a drawback in which 47 * for three three process code C, D and E where C <= D <= E, 48 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 49 * In other words, four codesets alternates as the magnitude 50 * of character increases. 51 * The lchar representation holds this property: 52 * If three lchar C', D' and E' have the relationship C' < D' < E' and 53 * codeset(C') == codeset(E') then D' is guaranteed to belong to 54 * the same codeset as C' and E'. 55 * lchar is implemented as 32 bit entities and the function linearize() 56 * that maps a wchar_t to lchar is defined below. There is no 57 * reverse function for it though. 58 * The 32-bit process code by AT&T, used only for Taiwanese version at the 59 * time of wrting, has no such problem and we use it as it is. 60 */ 61 62 lchar yycgidtbl[MAXNCG] = { 63 0, /* For ease of computation of the id. */ 64 '\n', /* Newline is always special because '.' exclude it. */ 65 0x000000ff, /* The upper limit of codeset 0. */ 66 0x20ffffff, /* The upper limit of codeset 2. */ 67 0x40ffffff /* The upper limit of codeset 3. */ 68 /* 0x60ffffff The upper limit of codeset 1. */ 69 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 70 }; 71 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 72 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 73 /* returns plus 1. */ 74 75 static void setsymbol(int i); 76 77 /* 78 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 79 * 80 * wc: axxxxxxbyyyyyyy 81 * 82 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 83 * 84 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 85 * which is flagged with LONG_WCHAR_T macro. 86 * NOTE: 87 * The implementation is highly depends on the process code representation. 88 * This function should be modified when 32-bit process code is used. 89 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 90 * You can actually omit these and squeeze the xxxxxx part one bit right. 91 * We don't do that here just in sake of speed. 92 */ 93 lchar 94 linearize(wchar_t wc) 95 { 96 #ifdef LONG_WCHAR_T 97 return ((lchar)wc); /* Don't do anything. */ 98 #else 99 100 lchar prefix; 101 switch (wc&0x8080) { 102 case 0x0000: prefix = 0x00000000; break; 103 case 0x0080: prefix = 0x20000000; break; 104 case 0x8000: prefix = 0x40000000; break; 105 case 0x8080: prefix = 0x60000000; break; 106 } 107 return (prefix|wc); 108 #endif 109 } 110 111 /* compare liniear characters pointed to by pc1 and pc2 */ 112 int 113 cmplc(const void *arg1, const void *arg2) 114 { 115 lchar *pc1 = (lchar *)arg1; 116 lchar *pc2 = (lchar *)arg2; 117 118 if (*pc1 > *pc2) 119 return (1); 120 else if (*pc1 == *pc2) 121 return (0); 122 else 123 return (-1); 124 } 125 126 void 127 remch(wchar_t c) 128 { 129 lchar lc = linearize(c); 130 size_t local_ncgidtbl; 131 132 /* 133 * User-friendliness consideration: 134 * Make sure no EUC chars are used in reg. exp. 135 */ 136 if (!handleeuc) { 137 if (!isascii(c)) { 138 if (iswprint(c)) 139 warning( 140 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c); 141 else warning( 142 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 143 } 144 /* In any case, we don't need to construct ncgidtbl[]. */ 145 return; 146 } 147 148 /* 149 * lsearch wants ncgidtbl to be size_t, but it is int. Hence, 150 * the use of local_ncgidtbl to satisfy the calling interface. 151 */ 152 local_ncgidtbl = ncgidtbl; 153 (void) lsearch(&lc, yycgidtbl, 154 &local_ncgidtbl, sizeof (lchar), cmplc); 155 ncgidtbl = (int)local_ncgidtbl; 156 } 157 158 void 159 sortcgidtbl(void) 160 { 161 if (!handleeuc) 162 return; 163 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 164 } 165 166 /* 167 * int yycgid(wchar_t c) 168 * Takes c and returns its character group id, determind by the 169 * following algorithm. The program also uses the binary search 170 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 171 * 172 * This function computes the "character group id" based on 173 * a table yycgidtbl of which each lchar entry is pre-sorted 174 * in ascending sequence The number of valid entries is given 175 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 176 * const int YYNCGIDTBL; 177 * lchar yycgidtbl[YYNCGIDTBL]; 178 * 179 * yycgidtbl[0] is guaranteed to have zero. 180 * 181 * For given c, yycgid(c) returns: 182 * 2*i iff yycgidtbl[i] == lc 183 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 184 * YYNCGIDTBL*2-1 185 * iff yycgidtbl[YYNCGIDTBL-1] < lc 186 * where lc=linearize(c). 187 * 188 * Some interesting properties.: 189 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 190 * 2. yycgid(c) == 0 iff c == 0. 191 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 192 * yycgid(c) <= yycgid(d). 193 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 194 * linearize(c) < linearize(d). 195 */ 196 #define YYNCGIDTBL ncgidtbl 197 198 int 199 yycgid(wchar_t c) 200 { 201 int first = 0; 202 int last = YYNCGIDTBL - 1; 203 lchar lc; 204 205 /* 206 * In ASCII compat. mode, each character forms a "group" and the 207 * group-id is itself... 208 */ 209 if (!handleeuc) 210 return (c); 211 212 lc = linearize(c); 213 214 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 215 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 216 return (YYNCGIDTBL*2 - 1); 217 218 while (last >= 0) { 219 int i = (first+last)/2; 220 if (lc == yycgidtbl[i]) 221 return (2*i); /* lc exactly matches an element. */ 222 else if (yycgidtbl[i] < lc) { 223 if (lc < yycgidtbl[i+1]) { 224 /* lc is in between two elements */ 225 return (2*i+1); 226 } 227 else 228 first = i + 1; 229 } else 230 last = i - 1; 231 } 232 error( 233 "system error in yycgid():binary search failed for c=0x%04x\n", c); 234 return (0); 235 } 236 237 /* 238 * repbycgid --- replaces each character in the parsing tree by its 239 * character group id. This, however, should be called even in 240 * the ASCII compat. mode to process DOT nodes and to call cclinter() 241 * for the DOT and CCL nodes. 242 */ 243 void 244 repbycgid(void) 245 { 246 int i, c; 247 248 for (i = 0; i < tptr; ++i) { 249 c = name[i]; 250 if (!ISOPERATOR(c)) { 251 /* If not an operator, it must be a char. */ 252 name[i] = yycgid((wchar_t)c); /* So replace it. */ 253 #ifdef DEBUG 254 if (debug) { 255 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 256 } 257 #endif 258 } else if (c == RSTR) { 259 c = right[i]; 260 right[i] = yycgid((wchar_t)c); 261 #ifdef DEBUG 262 if (debug) { 263 printf( 264 "name[%d].right:'%c'->%d;\n", 265 i, c, right[i]); 266 } 267 #endif 268 } else if ((c == RCCL) || (c == RNCCL)) { 269 CHR cc, *s; 270 int j; 271 CHR ccltoken[CCLSIZE]; 272 CHR *ccp; 273 int m; 274 /* 275 * This node represetns a character class RE [ccccc] 276 * s points to the string of characters that forms 277 * the class and/or a special prefix notation 278 * <RANGE>XY which corresponds to the RE X-Y, 279 * characters in the range of X and Y. Here, 280 * X <= Y is guranteed. 281 * We transform these characters into a string 282 * of sorted character group ids. 283 * 284 * There is another mechanism of packing tables 285 * that is inherited from the ASCII lex. Call of 286 * cclinter() is required for this packing. 287 * This used to be done as yylex() reads the lex 288 * rules but we have to do this here because the 289 * transition table is made to work on the char-group 290 * ids and the mapping cannot be determined until 291 * the entire file is read. 292 */ 293 #ifdef DEBUG 294 if (debug) { 295 printf("name[%d]:R[N]CCL of \"", i); 296 strpt(left[i]); 297 printf(" -> {"); 298 } 299 #endif 300 /* Prepare symbol[] for cclinter(). */ 301 for (j = 0; j < ncg; ++j) 302 symbol[j] = FALSE; 303 304 s = (CHR *) left[i]; 305 while ((cc = *s++) != 0) { 306 if (cc == RANGE) { 307 int low, high, i; 308 /* 309 * Special form: <RANGE>XY 310 * This means the range X-Y. 311 * We mark all symbols[] 312 * elements for yycgid(X) thru 313 * yycgid(Y), inclusively. 314 */ 315 low = yycgid(*s++); 316 high = yycgid(*s++); 317 for (i = low; i <= high; ++i) 318 setsymbol(i); 319 } else { 320 setsymbol(yycgid(cc)); 321 } 322 } 323 324 /* Now make a transformed string of cgids. */ 325 s = ccptr; 326 m = 0; 327 for (j = 0; j < ncg; ++j) 328 if (symbol[j]) { 329 ccltoken[m++] = (CHR)j; 330 #ifdef DEBUG 331 if (debug) printf("%d, ", j); 332 #endif 333 } 334 335 #ifdef DEBUG 336 if (debug) printf("}\n"); 337 #endif 338 ccltoken[m] = 0; 339 ccp = ccl; 340 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 341 ccp++; 342 if (ccp < ccptr) { /* character class found in ccl */ 343 left[i] = (int)ccp; 344 } else { /* not in ccl, add it */ 345 left[i] = (int)ccptr; 346 scopy(ccltoken, ccptr); 347 ccptr += slength(ccltoken) + 1; 348 if (ccptr > ccl + CCLSIZE) 349 error( 350 "Too many large character classes"); 351 } 352 cclinter(c == RCCL); 353 } else if (c == DOT) { 354 if (psave == 0) { /* First DOT node. */ 355 int j, nlid; 356 /* 357 * Make symbol[k]=TRUE for all k 358 * except k == yycgid('\n'). 359 */ 360 nlid = yycgid('\n'); 361 psave = ccptr; 362 for (j = 1; j < ncg; ++j) { 363 if (j == nlid) { 364 symbol[j] = FALSE; 365 } else { 366 symbol[j] = TRUE; 367 *ccptr++ = (CHR) j; 368 } 369 } 370 *ccptr++ = 0; 371 if (ccptr > ccl + CCLSIZE) 372 error( 373 "Too many large character classes"); 374 } 375 /* Mimic mn1(RCCL,psave)... */ 376 name[i] = RCCL; 377 left[i] = (int)psave; 378 cclinter(1); 379 } 380 } 381 #ifdef DEBUG 382 if (debug) { 383 printf("treedump after repbycgid().\n"); 384 treedump(); 385 } 386 #endif 387 } 388 389 static void 390 setsymbol(int i) 391 { 392 if (i > (int)sizeof (symbol)) 393 error("setsymbol: (SYSERR) %d out of range", i); 394 symbol[i] = TRUE; 395 } 396