1 /*- 2 * Copyright 2018 Nexenta Systems, Inc. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/tree.h> 39 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <stddef.h> 43 #include <string.h> 44 #include <sys/types.h> 45 #include <wchar.h> 46 #include <ctype.h> 47 #include <wctype.h> 48 #include <unistd.h> 49 #include "localedef.h" 50 #include "parser.h" 51 #include "runefile.h" 52 53 54 /* Needed for bootstrapping, _CTYPE_N */ 55 #ifndef _CTYPE_N 56 #define _CTYPE_N 0x00400000L 57 #endif 58 59 #define _ISUPPER _CTYPE_U 60 #define _ISLOWER _CTYPE_L 61 #define _ISDIGIT _CTYPE_D 62 #define _ISXDIGIT _CTYPE_X 63 #define _ISSPACE _CTYPE_S 64 #define _ISBLANK _CTYPE_B 65 #define _ISALPHA _CTYPE_A 66 #define _ISPUNCT _CTYPE_P 67 #define _ISGRAPH _CTYPE_G 68 #define _ISPRINT _CTYPE_R 69 #define _ISCNTRL _CTYPE_C 70 #define _E1 _CTYPE_Q 71 #define _E2 _CTYPE_I 72 #define _E3 0 73 #define _E4 _CTYPE_N 74 #define _E5 _CTYPE_T 75 76 static wchar_t last_ctype; 77 static int ctype_compare(const void *n1, const void *n2); 78 79 typedef struct ctype_node { 80 wchar_t wc; 81 int32_t ctype; 82 int32_t toupper; 83 int32_t tolower; 84 RB_ENTRY(ctype_node) entry; 85 } ctype_node_t; 86 87 static RB_HEAD(ctypes, ctype_node) ctypes; 88 RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); 89 90 static int 91 ctype_compare(const void *n1, const void *n2) 92 { 93 const ctype_node_t *c1 = n1; 94 const ctype_node_t *c2 = n2; 95 96 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 97 } 98 99 void 100 init_ctype(void) 101 { 102 RB_INIT(&ctypes); 103 } 104 105 106 static void 107 add_ctype_impl(ctype_node_t *ctn) 108 { 109 switch (last_kw) { 110 case T_ISUPPER: 111 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 112 break; 113 case T_ISLOWER: 114 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 115 break; 116 case T_ISALPHA: 117 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 118 break; 119 case T_ISDIGIT: 120 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 121 break; 122 case T_ISSPACE: 123 /* 124 * This can be troublesome as <form-feed>, <newline>, 125 * <carriage-return>, <tab>, and <vertical-tab> are defined both 126 * as space and cntrl, and POSIX doesn't allow cntrl/print 127 * combination. We will take care of this in dump_ctype(). 128 */ 129 ctn->ctype |= (_ISSPACE | _ISPRINT); 130 break; 131 case T_ISCNTRL: 132 ctn->ctype |= _ISCNTRL; 133 break; 134 case T_ISGRAPH: 135 ctn->ctype |= (_ISGRAPH | _ISPRINT); 136 break; 137 case T_ISPRINT: 138 ctn->ctype |= _ISPRINT; 139 break; 140 case T_ISPUNCT: 141 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 142 break; 143 case T_ISXDIGIT: 144 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 145 break; 146 case T_ISBLANK: 147 ctn->ctype |= (_ISBLANK | _ISSPACE); 148 break; 149 case T_ISPHONOGRAM: 150 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 151 break; 152 case T_ISIDEOGRAM: 153 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 154 break; 155 case T_ISENGLISH: 156 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 157 break; 158 case T_ISNUMBER: 159 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 160 break; 161 case T_ISSPECIAL: 162 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 163 break; 164 case T_ISALNUM: 165 /* 166 * We can't do anything with this. The character 167 * should already be specified as a digit or alpha. 168 */ 169 break; 170 default: 171 errf("not a valid character class"); 172 } 173 } 174 175 static ctype_node_t * 176 get_ctype(wchar_t wc) 177 { 178 ctype_node_t srch; 179 ctype_node_t *ctn; 180 181 srch.wc = wc; 182 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 183 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 184 errf("out of memory"); 185 return (NULL); 186 } 187 ctn->wc = wc; 188 189 RB_INSERT(ctypes, &ctypes, ctn); 190 } 191 return (ctn); 192 } 193 194 void 195 add_ctype(int val) 196 { 197 ctype_node_t *ctn; 198 199 if ((ctn = get_ctype(val)) == NULL) { 200 INTERR; 201 return; 202 } 203 add_ctype_impl(ctn); 204 last_ctype = ctn->wc; 205 } 206 207 void 208 add_ctype_range(wchar_t end) 209 { 210 ctype_node_t *ctn; 211 wchar_t cur; 212 213 if (end < last_ctype) { 214 errf("malformed character range (%u ... %u))", 215 last_ctype, end); 216 return; 217 } 218 for (cur = last_ctype + 1; cur <= end; cur++) { 219 if ((ctn = get_ctype(cur)) == NULL) { 220 INTERR; 221 return; 222 } 223 add_ctype_impl(ctn); 224 } 225 last_ctype = end; 226 227 } 228 229 /* 230 * A word about widths: if the width mask is specified, then libc 231 * unconditionally honors it. Otherwise, it assumes printable 232 * characters have width 1, and non-printable characters have width 233 * -1 (except for NULL which is special with width 0). Hence, we have 234 * no need to inject defaults here -- the "default" unset value of 0 235 * indicates that libc should use its own logic in wcwidth as described. 236 */ 237 void 238 add_width(int wc, int width) 239 { 240 ctype_node_t *ctn; 241 242 if ((ctn = get_ctype(wc)) == NULL) { 243 INTERR; 244 return; 245 } 246 ctn->ctype &= ~(_CTYPE_SWM); 247 switch (width) { 248 case 0: 249 ctn->ctype |= _CTYPE_SW0; 250 break; 251 case 1: 252 ctn->ctype |= _CTYPE_SW1; 253 break; 254 case 2: 255 ctn->ctype |= _CTYPE_SW2; 256 break; 257 case 3: 258 ctn->ctype |= _CTYPE_SW3; 259 break; 260 } 261 } 262 263 void 264 add_width_range(int start, int end, int width) 265 { 266 for (; start <= end; start++) { 267 add_width(start, width); 268 } 269 } 270 271 void 272 add_caseconv(int val, int wc) 273 { 274 ctype_node_t *ctn; 275 276 ctn = get_ctype(val); 277 if (ctn == NULL) { 278 INTERR; 279 return; 280 } 281 282 switch (last_kw) { 283 case T_TOUPPER: 284 ctn->toupper = wc; 285 break; 286 case T_TOLOWER: 287 ctn->tolower = wc; 288 break; 289 default: 290 INTERR; 291 break; 292 } 293 } 294 295 void 296 dump_ctype(void) 297 { 298 FILE *f; 299 _FileRuneLocale rl; 300 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 301 _FileRuneEntry *ct = NULL; 302 _FileRuneEntry *lo = NULL; 303 _FileRuneEntry *up = NULL; 304 wchar_t wc; 305 uint32_t runetype_ext_nranges; 306 uint32_t maplower_ext_nranges; 307 uint32_t mapupper_ext_nranges; 308 309 (void) memset(&rl, 0, sizeof (rl)); 310 runetype_ext_nranges = 0; 311 last_ct = NULL; 312 maplower_ext_nranges = 0; 313 last_lo = NULL; 314 mapupper_ext_nranges = 0; 315 last_up = NULL; 316 317 if ((f = open_category()) == NULL) 318 return; 319 320 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 321 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 322 323 /* 324 * Initialize the identity map. 325 */ 326 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 327 rl.maplower[wc] = htote(wc); 328 rl.mapupper[wc] = htote(wc); 329 } 330 331 RB_FOREACH(ctn, ctypes, &ctypes) { 332 int conflict = 0; 333 334 wc = ctn->wc; 335 336 /* 337 * POSIX requires certain portable characters have 338 * certain types. Add them if they are missing. 339 */ 340 if ((wc >= 1) && (wc <= 127)) { 341 if ((wc >= 'A') && (wc <= 'Z')) 342 ctn->ctype |= _ISUPPER; 343 if ((wc >= 'a') && (wc <= 'z')) 344 ctn->ctype |= _ISLOWER; 345 if ((wc >= '0') && (wc <= '9')) 346 ctn->ctype |= _ISDIGIT; 347 if (wc == ' ') 348 ctn->ctype |= _ISPRINT; 349 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 350 ctn->ctype |= _ISSPACE; 351 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 352 ctn->ctype |= _ISXDIGIT; 353 if (strchr(" \t", (char)wc)) 354 ctn->ctype |= _ISBLANK; 355 356 /* 357 * Technically these settings are only 358 * required for the C locale. However, it 359 * turns out that because of the historical 360 * version of isprint(), we need them for all 361 * locales as well. Note that these are not 362 * necessarily valid punctation characters in 363 * the current language, but ispunct() needs 364 * to return TRUE for them. 365 */ 366 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 367 (char)wc)) 368 ctn->ctype |= _ISPUNCT; 369 } 370 371 /* 372 * POSIX also requires that certain types imply 373 * others. Add any inferred types here. 374 */ 375 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 376 ctn->ctype |= _ISALPHA; 377 if (ctn->ctype & _ISDIGIT) 378 ctn->ctype |= _ISXDIGIT; 379 if (ctn->ctype & _ISBLANK) 380 ctn->ctype |= _ISSPACE; 381 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 382 ctn->ctype |= _ISGRAPH; 383 if (ctn->ctype & _ISGRAPH) 384 ctn->ctype |= _ISPRINT; 385 386 /* 387 * POSIX requires that certain combinations are invalid. 388 * Try fixing the cases we know about (see add_ctype_impl()). 389 */ 390 if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL)) 391 ctn->ctype &= ~_ISPRINT; 392 393 /* 394 * Finally, don't flag remaining cases as a fatal error, 395 * and just warn about them. 396 */ 397 if ((ctn->ctype & _ISALPHA) && 398 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 399 conflict++; 400 if ((ctn->ctype & _ISPUNCT) && 401 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 402 conflict++; 403 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 404 conflict++; 405 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT)) 406 conflict++; 407 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 408 conflict++; 409 410 if (conflict) { 411 warn("conflicting classes for character 0x%x (%x)", 412 wc, ctn->ctype); 413 } 414 /* 415 * Handle the lower 256 characters using the simple 416 * optimization. Note that if we have not defined the 417 * upper/lower case, then we identity map it. 418 */ 419 if ((unsigned)wc < _CACHED_RUNES) { 420 rl.runetype[wc] = htote(ctn->ctype); 421 if (ctn->tolower) 422 rl.maplower[wc] = htote(ctn->tolower); 423 if (ctn->toupper) 424 rl.mapupper[wc] = htote(ctn->toupper); 425 continue; 426 } 427 428 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 429 (last_ct->wc + 1 == wc)) { 430 ct[runetype_ext_nranges - 1].max = htote(wc); 431 } else { 432 runetype_ext_nranges++; 433 ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges); 434 ct[runetype_ext_nranges - 1].min = htote(wc); 435 ct[runetype_ext_nranges - 1].max = htote(wc); 436 ct[runetype_ext_nranges - 1].map = 437 htote(ctn->ctype); 438 } 439 last_ct = ctn; 440 if (ctn->tolower == 0) { 441 last_lo = NULL; 442 } else if ((last_lo != NULL) && 443 (last_lo->tolower + 1 == ctn->tolower)) { 444 lo[maplower_ext_nranges - 1].max = htote(wc); 445 last_lo = ctn; 446 } else { 447 maplower_ext_nranges++; 448 lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges); 449 lo[maplower_ext_nranges - 1].min = htote(wc); 450 lo[maplower_ext_nranges - 1].max = htote(wc); 451 lo[maplower_ext_nranges - 1].map = 452 htote(ctn->tolower); 453 last_lo = ctn; 454 } 455 456 if (ctn->toupper == 0) { 457 last_up = NULL; 458 } else if ((last_up != NULL) && 459 (last_up->toupper + 1 == ctn->toupper)) { 460 up[mapupper_ext_nranges-1].max = htote(wc); 461 last_up = ctn; 462 } else { 463 mapupper_ext_nranges++; 464 up = realloc(up, sizeof (*up) * mapupper_ext_nranges); 465 up[mapupper_ext_nranges - 1].min = htote(wc); 466 up[mapupper_ext_nranges - 1].max = htote(wc); 467 up[mapupper_ext_nranges - 1].map = 468 htote(ctn->toupper); 469 last_up = ctn; 470 } 471 } 472 473 rl.runetype_ext_nranges = htote(runetype_ext_nranges); 474 rl.maplower_ext_nranges = htote(maplower_ext_nranges); 475 rl.mapupper_ext_nranges = htote(mapupper_ext_nranges); 476 if ((wr_category(&rl, sizeof (rl), f) < 0) || 477 (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) || 478 (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) || 479 (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) { 480 return; 481 } 482 483 close_category(f); 484 } 485