1 /*- 2 * Copyright 2018 Nexenta Systems, Inc. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/tree.h> 39 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <stddef.h> 43 #include <string.h> 44 #include <sys/types.h> 45 #include <wchar.h> 46 #include <unistd.h> 47 #include "localedef.h" 48 #include "parser.h" 49 50 /* Always include the defines for the target: */ 51 #define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */ 52 #include "_ctype.h" 53 #include "runefile.h" 54 55 56 /* Needed for bootstrapping, _CTYPE_N */ 57 #ifndef _CTYPE_N 58 #define _CTYPE_N 0x00400000L 59 #endif 60 61 #define _ISUPPER _CTYPE_U 62 #define _ISLOWER _CTYPE_L 63 #define _ISDIGIT _CTYPE_D 64 #define _ISXDIGIT _CTYPE_X 65 #define _ISSPACE _CTYPE_S 66 #define _ISBLANK _CTYPE_B 67 #define _ISALPHA _CTYPE_A 68 #define _ISPUNCT _CTYPE_P 69 #define _ISGRAPH _CTYPE_G 70 #define _ISPRINT _CTYPE_R 71 #define _ISCNTRL _CTYPE_C 72 #define _E1 _CTYPE_Q 73 #define _E2 _CTYPE_I 74 #define _E3 0 75 #define _E4 _CTYPE_N 76 #define _E5 _CTYPE_T 77 78 static wchar_t last_ctype; 79 static int ctype_compare(const void *n1, const void *n2); 80 81 typedef struct ctype_node { 82 wchar_t wc; 83 int32_t ctype; 84 int32_t toupper; 85 int32_t tolower; 86 RB_ENTRY(ctype_node) entry; 87 } ctype_node_t; 88 89 static RB_HEAD(ctypes, ctype_node) ctypes; 90 RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); 91 92 static int 93 ctype_compare(const void *n1, const void *n2) 94 { 95 const ctype_node_t *c1 = n1; 96 const ctype_node_t *c2 = n2; 97 98 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 99 } 100 101 void 102 init_ctype(void) 103 { 104 RB_INIT(&ctypes); 105 } 106 107 108 static void 109 add_ctype_impl(ctype_node_t *ctn) 110 { 111 switch (last_kw) { 112 case T_ISUPPER: 113 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 114 break; 115 case T_ISLOWER: 116 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 117 break; 118 case T_ISALPHA: 119 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 120 break; 121 case T_ISDIGIT: 122 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 123 break; 124 case T_ISSPACE: 125 /* 126 * This can be troublesome as <form-feed>, <newline>, 127 * <carriage-return>, <tab>, and <vertical-tab> are defined both 128 * as space and cntrl, and POSIX doesn't allow cntrl/print 129 * combination. We will take care of this in dump_ctype(). 130 */ 131 ctn->ctype |= (_ISSPACE | _ISPRINT); 132 break; 133 case T_ISCNTRL: 134 ctn->ctype |= _ISCNTRL; 135 break; 136 case T_ISGRAPH: 137 ctn->ctype |= (_ISGRAPH | _ISPRINT); 138 break; 139 case T_ISPRINT: 140 ctn->ctype |= _ISPRINT; 141 break; 142 case T_ISPUNCT: 143 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 144 break; 145 case T_ISXDIGIT: 146 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 147 break; 148 case T_ISBLANK: 149 ctn->ctype |= (_ISBLANK | _ISSPACE); 150 break; 151 case T_ISPHONOGRAM: 152 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 153 break; 154 case T_ISIDEOGRAM: 155 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 156 break; 157 case T_ISENGLISH: 158 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 159 break; 160 case T_ISNUMBER: 161 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 162 break; 163 case T_ISSPECIAL: 164 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 165 break; 166 case T_ISALNUM: 167 /* 168 * We can't do anything with this. The character 169 * should already be specified as a digit or alpha. 170 */ 171 break; 172 default: 173 errf("not a valid character class"); 174 } 175 } 176 177 static ctype_node_t * 178 get_ctype(wchar_t wc) 179 { 180 ctype_node_t srch; 181 ctype_node_t *ctn; 182 183 srch.wc = wc; 184 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 185 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 186 errf("out of memory"); 187 return (NULL); 188 } 189 ctn->wc = wc; 190 191 RB_INSERT(ctypes, &ctypes, ctn); 192 } 193 return (ctn); 194 } 195 196 void 197 add_ctype(int val) 198 { 199 ctype_node_t *ctn; 200 201 if ((ctn = get_ctype(val)) == NULL) { 202 INTERR; 203 return; 204 } 205 add_ctype_impl(ctn); 206 last_ctype = ctn->wc; 207 } 208 209 void 210 add_ctype_range(wchar_t end) 211 { 212 ctype_node_t *ctn; 213 wchar_t cur; 214 215 if (end < last_ctype) { 216 errf("malformed character range (%u ... %u))", 217 last_ctype, end); 218 return; 219 } 220 for (cur = last_ctype + 1; cur <= end; cur++) { 221 if ((ctn = get_ctype(cur)) == NULL) { 222 INTERR; 223 return; 224 } 225 add_ctype_impl(ctn); 226 } 227 last_ctype = end; 228 229 } 230 231 /* 232 * A word about widths: if the width mask is specified, then libc 233 * unconditionally honors it. Otherwise, it assumes printable 234 * characters have width 1, and non-printable characters have width 235 * -1 (except for NULL which is special with width 0). Hence, we have 236 * no need to inject defaults here -- the "default" unset value of 0 237 * indicates that libc should use its own logic in wcwidth as described. 238 */ 239 void 240 add_width(int wc, int width) 241 { 242 ctype_node_t *ctn; 243 244 if ((ctn = get_ctype(wc)) == NULL) { 245 INTERR; 246 return; 247 } 248 ctn->ctype &= ~(_CTYPE_SWM); 249 switch (width) { 250 case 0: 251 ctn->ctype |= _CTYPE_SW0; 252 break; 253 case 1: 254 ctn->ctype |= _CTYPE_SW1; 255 break; 256 case 2: 257 ctn->ctype |= _CTYPE_SW2; 258 break; 259 case 3: 260 ctn->ctype |= _CTYPE_SW3; 261 break; 262 } 263 } 264 265 void 266 add_width_range(int start, int end, int width) 267 { 268 for (; start <= end; start++) { 269 add_width(start, width); 270 } 271 } 272 273 void 274 add_caseconv(int val, int wc) 275 { 276 ctype_node_t *ctn; 277 278 ctn = get_ctype(val); 279 if (ctn == NULL) { 280 INTERR; 281 return; 282 } 283 284 switch (last_kw) { 285 case T_TOUPPER: 286 ctn->toupper = wc; 287 break; 288 case T_TOLOWER: 289 ctn->tolower = wc; 290 break; 291 default: 292 INTERR; 293 break; 294 } 295 } 296 297 void 298 dump_ctype(void) 299 { 300 FILE *f; 301 _FileRuneLocale rl; 302 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 303 _FileRuneEntry *ct = NULL; 304 _FileRuneEntry *lo = NULL; 305 _FileRuneEntry *up = NULL; 306 wchar_t wc; 307 uint32_t runetype_ext_nranges; 308 uint32_t maplower_ext_nranges; 309 uint32_t mapupper_ext_nranges; 310 311 (void) memset(&rl, 0, sizeof (rl)); 312 runetype_ext_nranges = 0; 313 last_ct = NULL; 314 maplower_ext_nranges = 0; 315 last_lo = NULL; 316 mapupper_ext_nranges = 0; 317 last_up = NULL; 318 319 if ((f = open_category()) == NULL) 320 return; 321 322 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 323 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 324 325 /* 326 * Initialize the identity map. 327 */ 328 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 329 rl.maplower[wc] = htote(wc); 330 rl.mapupper[wc] = htote(wc); 331 } 332 333 RB_FOREACH(ctn, ctypes, &ctypes) { 334 int conflict = 0; 335 336 wc = ctn->wc; 337 338 /* 339 * POSIX requires certain portable characters have 340 * certain types. Add them if they are missing. 341 */ 342 if ((wc >= 1) && (wc <= 127)) { 343 if ((wc >= 'A') && (wc <= 'Z')) 344 ctn->ctype |= _ISUPPER; 345 if ((wc >= 'a') && (wc <= 'z')) 346 ctn->ctype |= _ISLOWER; 347 if ((wc >= '0') && (wc <= '9')) 348 ctn->ctype |= _ISDIGIT; 349 if (wc == ' ') 350 ctn->ctype |= _ISPRINT; 351 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 352 ctn->ctype |= _ISSPACE; 353 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 354 ctn->ctype |= _ISXDIGIT; 355 if (strchr(" \t", (char)wc)) 356 ctn->ctype |= _ISBLANK; 357 358 /* 359 * Technically these settings are only 360 * required for the C locale. However, it 361 * turns out that because of the historical 362 * version of isprint(), we need them for all 363 * locales as well. Note that these are not 364 * necessarily valid punctation characters in 365 * the current language, but ispunct() needs 366 * to return TRUE for them. 367 */ 368 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 369 (char)wc)) 370 ctn->ctype |= _ISPUNCT; 371 } 372 373 /* 374 * POSIX also requires that certain types imply 375 * others. Add any inferred types here. 376 */ 377 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 378 ctn->ctype |= _ISALPHA; 379 if (ctn->ctype & _ISDIGIT) 380 ctn->ctype |= _ISXDIGIT; 381 if (ctn->ctype & _ISBLANK) 382 ctn->ctype |= _ISSPACE; 383 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 384 ctn->ctype |= _ISGRAPH; 385 if (ctn->ctype & _ISGRAPH) 386 ctn->ctype |= _ISPRINT; 387 388 /* 389 * POSIX requires that certain combinations are invalid. 390 * Try fixing the cases we know about (see add_ctype_impl()). 391 */ 392 if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL)) 393 ctn->ctype &= ~_ISPRINT; 394 395 /* 396 * Finally, don't flag remaining cases as a fatal error, 397 * and just warn about them. 398 */ 399 if ((ctn->ctype & _ISALPHA) && 400 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 401 conflict++; 402 if ((ctn->ctype & _ISPUNCT) && 403 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 404 conflict++; 405 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 406 conflict++; 407 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT)) 408 conflict++; 409 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 410 conflict++; 411 412 if (conflict) { 413 warn("conflicting classes for character 0x%x (%x)", 414 wc, ctn->ctype); 415 } 416 /* 417 * Handle the lower 256 characters using the simple 418 * optimization. Note that if we have not defined the 419 * upper/lower case, then we identity map it. 420 */ 421 if ((unsigned)wc < _CACHED_RUNES) { 422 rl.runetype[wc] = htote(ctn->ctype); 423 if (ctn->tolower) 424 rl.maplower[wc] = htote(ctn->tolower); 425 if (ctn->toupper) 426 rl.mapupper[wc] = htote(ctn->toupper); 427 continue; 428 } 429 430 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 431 (last_ct->wc + 1 == wc)) { 432 ct[runetype_ext_nranges - 1].max = htote(wc); 433 } else { 434 runetype_ext_nranges++; 435 ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges); 436 ct[runetype_ext_nranges - 1].min = htote(wc); 437 ct[runetype_ext_nranges - 1].max = htote(wc); 438 ct[runetype_ext_nranges - 1].map = 439 htote(ctn->ctype); 440 } 441 last_ct = ctn; 442 if (ctn->tolower == 0) { 443 last_lo = NULL; 444 } else if ((last_lo != NULL) && 445 (last_lo->tolower + 1 == ctn->tolower)) { 446 lo[maplower_ext_nranges - 1].max = htote(wc); 447 last_lo = ctn; 448 } else { 449 maplower_ext_nranges++; 450 lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges); 451 lo[maplower_ext_nranges - 1].min = htote(wc); 452 lo[maplower_ext_nranges - 1].max = htote(wc); 453 lo[maplower_ext_nranges - 1].map = 454 htote(ctn->tolower); 455 last_lo = ctn; 456 } 457 458 if (ctn->toupper == 0) { 459 last_up = NULL; 460 } else if ((last_up != NULL) && 461 (last_up->toupper + 1 == ctn->toupper)) { 462 up[mapupper_ext_nranges-1].max = htote(wc); 463 last_up = ctn; 464 } else { 465 mapupper_ext_nranges++; 466 up = realloc(up, sizeof (*up) * mapupper_ext_nranges); 467 up[mapupper_ext_nranges - 1].min = htote(wc); 468 up[mapupper_ext_nranges - 1].max = htote(wc); 469 up[mapupper_ext_nranges - 1].map = 470 htote(ctn->toupper); 471 last_up = ctn; 472 } 473 } 474 475 rl.runetype_ext_nranges = htote(runetype_ext_nranges); 476 rl.maplower_ext_nranges = htote(maplower_ext_nranges); 477 rl.mapupper_ext_nranges = htote(mapupper_ext_nranges); 478 if ((wr_category(&rl, sizeof (rl), f) < 0) || 479 (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) || 480 (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) || 481 (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) { 482 return; 483 } 484 485 close_category(f); 486 } 487