1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved. 14 * Copyright 2012 Garrett D'Amore <garrett@damore.org> 15 * Copyright 2013 DEY Storage Systems, Inc. 16 */ 17 18 /* 19 * LC_CTYPE database generation routines for localedef. 20 */ 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <sys/types.h> 26 #include <sys/avl.h> 27 #include <wchar.h> 28 #include <ctype.h> 29 #include <wctype.h> 30 #include <unistd.h> 31 #include "_ctype.h" 32 #include "localedef.h" 33 #include "parser.tab.h" 34 #include "runefile.h" 35 36 static avl_tree_t ctypes; 37 38 static wchar_t last_ctype; 39 40 typedef struct ctype_node { 41 wchar_t wc; 42 int32_t ctype; 43 int32_t toupper; 44 int32_t tolower; 45 avl_node_t avl; 46 } ctype_node_t; 47 48 typedef struct width_node { 49 wchar_t start; 50 wchar_t end; 51 int8_t width; 52 avl_node_t avl; 53 } width_node_t; 54 55 static int 56 ctype_compare(const void *n1, const void *n2) 57 { 58 const ctype_node_t *c1 = n1; 59 const ctype_node_t *c2 = n2; 60 61 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 62 } 63 64 void 65 init_ctype(void) 66 { 67 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 68 offsetof(ctype_node_t, avl)); 69 } 70 71 72 static void 73 add_ctype_impl(ctype_node_t *ctn) 74 { 75 switch (last_kw) { 76 case T_ISUPPER: 77 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 78 break; 79 case T_ISLOWER: 80 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 81 break; 82 case T_ISALPHA: 83 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 84 break; 85 case T_ISDIGIT: 86 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 87 break; 88 case T_ISSPACE: 89 ctn->ctype |= _ISSPACE; 90 break; 91 case T_ISCNTRL: 92 ctn->ctype |= _ISCNTRL; 93 break; 94 case T_ISGRAPH: 95 ctn->ctype |= (_ISGRAPH | _ISPRINT); 96 break; 97 case T_ISPRINT: 98 ctn->ctype |= _ISPRINT; 99 break; 100 case T_ISPUNCT: 101 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 102 break; 103 case T_ISXDIGIT: 104 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 105 break; 106 case T_ISBLANK: 107 ctn->ctype |= (_ISBLANK | _ISSPACE); 108 break; 109 case T_ISPHONOGRAM: 110 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 111 break; 112 case T_ISIDEOGRAM: 113 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 114 break; 115 case T_ISENGLISH: 116 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 117 break; 118 case T_ISNUMBER: 119 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 120 break; 121 case T_ISSPECIAL: 122 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 123 break; 124 case T_ISALNUM: 125 /* 126 * We can't do anything with this. The character 127 * should already be specified as a digit or alpha. 128 */ 129 break; 130 default: 131 errf(_("not a valid character class")); 132 } 133 } 134 135 static ctype_node_t * 136 get_ctype(wchar_t wc) 137 { 138 ctype_node_t srch; 139 ctype_node_t *ctn; 140 avl_index_t where; 141 142 srch.wc = wc; 143 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 144 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 145 errf(_("out of memory")); 146 return (NULL); 147 } 148 ctn->wc = wc; 149 150 avl_insert(&ctypes, ctn, where); 151 } 152 return (ctn); 153 } 154 155 void 156 add_ctype(int val) 157 { 158 ctype_node_t *ctn; 159 160 if ((ctn = get_ctype(val)) == NULL) { 161 INTERR; 162 return; 163 } 164 add_ctype_impl(ctn); 165 last_ctype = ctn->wc; 166 } 167 168 void 169 add_ctype_range(int end) 170 { 171 ctype_node_t *ctn; 172 wchar_t cur; 173 174 if (end < last_ctype) { 175 errf(_("malformed character range (%u ... %u))"), 176 last_ctype, end); 177 return; 178 } 179 for (cur = last_ctype + 1; cur <= end; cur++) { 180 if ((ctn = get_ctype(cur)) == NULL) { 181 INTERR; 182 return; 183 } 184 add_ctype_impl(ctn); 185 } 186 last_ctype = end; 187 188 } 189 190 /* 191 * A word about widths: if the width mask is specified, then libc 192 * unconditionally honors it. Otherwise, it assumes printable 193 * characters have width 1, and non-printable characters have width 194 * -1 (except for NULL which is special with with 0). Hence, we have 195 * no need to inject defaults here -- the "default" unset value of 0 196 * indicates that libc should use its own logic in wcwidth as described. 197 */ 198 void 199 add_width(int wc, int width) 200 { 201 ctype_node_t *ctn; 202 203 if ((ctn = get_ctype(wc)) == NULL) { 204 INTERR; 205 return; 206 } 207 ctn->ctype &= ~(_CTYPE_SWM); 208 switch (width) { 209 case 0: 210 ctn->ctype |= _CTYPE_SW0; 211 break; 212 case 1: 213 ctn->ctype |= _CTYPE_SW1; 214 break; 215 case 2: 216 ctn->ctype |= _CTYPE_SW2; 217 break; 218 case 3: 219 ctn->ctype |= _CTYPE_SW3; 220 break; 221 } 222 } 223 224 void 225 add_width_range(int start, int end, int width) 226 { 227 for (; start <= end; start++) { 228 add_width(start, width); 229 } 230 } 231 232 void 233 add_caseconv(int val, int wc) 234 { 235 ctype_node_t *ctn; 236 237 ctn = get_ctype(val); 238 if (ctn == NULL) { 239 INTERR; 240 return; 241 } 242 243 switch (last_kw) { 244 case T_TOUPPER: 245 ctn->toupper = wc; 246 break; 247 case T_TOLOWER: 248 ctn->tolower = wc; 249 break; 250 default: 251 INTERR; 252 break; 253 } 254 } 255 256 void 257 dump_ctype(void) 258 { 259 FILE *f; 260 _FileRuneLocale rl; 261 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 262 _FileRuneEntry *ct = NULL; 263 _FileRuneEntry *lo = NULL; 264 _FileRuneEntry *up = NULL; 265 wchar_t wc; 266 267 (void) memset(&rl, 0, sizeof (rl)); 268 last_ct = NULL; 269 last_lo = NULL; 270 last_up = NULL; 271 272 if ((f = open_category()) == NULL) 273 return; 274 275 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 276 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 277 278 /* 279 * Initialize the identity map. 280 */ 281 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 282 rl.maplower[wc] = wc; 283 rl.mapupper[wc] = wc; 284 } 285 286 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 287 int conflict = 0; 288 289 290 wc = ctn->wc; 291 292 /* 293 * POSIX requires certain portable characters have 294 * certain types. Add them if they are missing. 295 */ 296 if ((wc >= 1) && (wc <= 127)) { 297 if ((wc >= 'A') && (wc <= 'Z')) 298 ctn->ctype |= _ISUPPER; 299 if ((wc >= 'a') && (wc <= 'z')) 300 ctn->ctype |= _ISLOWER; 301 if ((wc >= '0') && (wc <= '9')) 302 ctn->ctype |= _ISDIGIT; 303 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 304 ctn->ctype |= _ISSPACE; 305 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 306 ctn->ctype |= _ISXDIGIT; 307 if (strchr(" \t", (char)wc)) 308 ctn->ctype |= _ISBLANK; 309 310 /* 311 * Technically these settings are only 312 * required for the C locale. However, it 313 * turns out that because of the historical 314 * version of isprint(), we need them for all 315 * locales as well. Note that these are not 316 * necessarily valid punctation characters in 317 * the current language, but ispunct() needs 318 * to return TRUE for them. 319 */ 320 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 321 (char)wc)) 322 ctn->ctype |= _ISPUNCT; 323 } 324 325 /* 326 * POSIX also requires that certain types imply 327 * others. Add any inferred types here. 328 */ 329 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 330 ctn->ctype |= _ISALPHA; 331 if (ctn->ctype & _ISDIGIT) 332 ctn->ctype |= _ISXDIGIT; 333 if (ctn->ctype & _ISBLANK) 334 ctn->ctype |= _ISSPACE; 335 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 336 ctn->ctype |= _ISGRAPH; 337 if (ctn->ctype & _ISGRAPH) 338 ctn->ctype |= _ISPRINT; 339 340 /* 341 * Finally, POSIX requires that certain combinations 342 * are invalid. We don't flag this as a fatal error, 343 * but we will warn about. 344 */ 345 if ((ctn->ctype & _ISALPHA) && 346 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 347 conflict++; 348 if ((ctn->ctype & _ISPUNCT) & 349 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 350 conflict++; 351 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 352 conflict++; 353 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 354 conflict++; 355 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 356 conflict++; 357 358 if (conflict) { 359 warn("conflicting classes for character 0x%x (%x)", 360 wc, ctn->ctype); 361 } 362 /* 363 * Handle the lower 256 characters using the simple 364 * optimization. Note that if we have not defined the 365 * upper/lower case, then we identity map it. 366 */ 367 if ((unsigned)wc < _CACHED_RUNES) { 368 rl.runetype[wc] = ctn->ctype; 369 if (ctn->tolower) 370 rl.maplower[wc] = ctn->tolower; 371 if (ctn->toupper) 372 rl.mapupper[wc] = ctn->toupper; 373 continue; 374 } 375 376 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 377 ct[rl.runetype_ext_nranges-1].max = wc; 378 last_ct = ctn; 379 } else { 380 rl.runetype_ext_nranges++; 381 ct = realloc(ct, 382 sizeof (*ct) * rl.runetype_ext_nranges); 383 ct[rl.runetype_ext_nranges - 1].min = wc; 384 ct[rl.runetype_ext_nranges - 1].max = wc; 385 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 386 last_ct = ctn; 387 } 388 if (ctn->tolower == 0) { 389 last_lo = NULL; 390 } else if ((last_lo != NULL) && 391 (last_lo->tolower + 1 == ctn->tolower)) { 392 lo[rl.maplower_ext_nranges-1].max = wc; 393 last_lo = ctn; 394 } else { 395 rl.maplower_ext_nranges++; 396 lo = realloc(lo, 397 sizeof (*lo) * rl.maplower_ext_nranges); 398 lo[rl.maplower_ext_nranges - 1].min = wc; 399 lo[rl.maplower_ext_nranges - 1].max = wc; 400 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 401 last_lo = ctn; 402 } 403 404 if (ctn->toupper == 0) { 405 last_up = NULL; 406 } else if ((last_up != NULL) && 407 (last_up->toupper + 1 == ctn->toupper)) { 408 up[rl.mapupper_ext_nranges-1].max = wc; 409 last_up = ctn; 410 } else { 411 rl.mapupper_ext_nranges++; 412 up = realloc(up, 413 sizeof (*up) * rl.mapupper_ext_nranges); 414 up[rl.mapupper_ext_nranges - 1].min = wc; 415 up[rl.mapupper_ext_nranges - 1].max = wc; 416 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 417 last_up = ctn; 418 } 419 } 420 421 if ((wr_category(&rl, sizeof (rl), f) < 0) || 422 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 423 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 424 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 425 return; 426 } 427 428 close_category(f); 429 } 430