1 /*- 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/tree.h> 39 40 #include <stdio.h> 41 #include <stdlib.h> 42 #include <stddef.h> 43 #include <string.h> 44 #include <sys/types.h> 45 #include <wchar.h> 46 #include <ctype.h> 47 #include <wctype.h> 48 #include <unistd.h> 49 #include "localedef.h" 50 #include "parser.h" 51 #include "runefile.h" 52 53 54 /* Needed for bootstrapping, _CTYPE_N */ 55 #ifndef _CTYPE_N 56 #define _CTYPE_N 0x00400000L 57 #endif 58 59 #define _ISUPPER _CTYPE_U 60 #define _ISLOWER _CTYPE_L 61 #define _ISDIGIT _CTYPE_D 62 #define _ISXDIGIT _CTYPE_X 63 #define _ISSPACE _CTYPE_S 64 #define _ISBLANK _CTYPE_B 65 #define _ISALPHA _CTYPE_A 66 #define _ISPUNCT _CTYPE_P 67 #define _ISGRAPH _CTYPE_G 68 #define _ISPRINT _CTYPE_R 69 #define _ISCNTRL _CTYPE_C 70 #define _E1 _CTYPE_Q 71 #define _E2 _CTYPE_I 72 #define _E3 0 73 #define _E4 _CTYPE_N 74 #define _E5 _CTYPE_T 75 76 static wchar_t last_ctype; 77 static int ctype_compare(const void *n1, const void *n2); 78 79 typedef struct ctype_node { 80 wchar_t wc; 81 int32_t ctype; 82 int32_t toupper; 83 int32_t tolower; 84 RB_ENTRY(ctype_node) entry; 85 } ctype_node_t; 86 87 static RB_HEAD(ctypes, ctype_node) ctypes; 88 RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); 89 90 static int 91 ctype_compare(const void *n1, const void *n2) 92 { 93 const ctype_node_t *c1 = n1; 94 const ctype_node_t *c2 = n2; 95 96 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 97 } 98 99 void 100 init_ctype(void) 101 { 102 RB_INIT(&ctypes); 103 } 104 105 106 static void 107 add_ctype_impl(ctype_node_t *ctn) 108 { 109 switch (last_kw) { 110 case T_ISUPPER: 111 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 112 break; 113 case T_ISLOWER: 114 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 115 break; 116 case T_ISALPHA: 117 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 118 break; 119 case T_ISDIGIT: 120 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 121 break; 122 case T_ISSPACE: 123 ctn->ctype |= _ISSPACE; 124 break; 125 case T_ISCNTRL: 126 ctn->ctype |= _ISCNTRL; 127 break; 128 case T_ISGRAPH: 129 ctn->ctype |= (_ISGRAPH | _ISPRINT); 130 break; 131 case T_ISPRINT: 132 ctn->ctype |= _ISPRINT; 133 break; 134 case T_ISPUNCT: 135 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 136 break; 137 case T_ISXDIGIT: 138 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 139 break; 140 case T_ISBLANK: 141 ctn->ctype |= (_ISBLANK | _ISSPACE); 142 break; 143 case T_ISPHONOGRAM: 144 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 145 break; 146 case T_ISIDEOGRAM: 147 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 148 break; 149 case T_ISENGLISH: 150 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 151 break; 152 case T_ISNUMBER: 153 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 154 break; 155 case T_ISSPECIAL: 156 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 157 break; 158 case T_ISALNUM: 159 /* 160 * We can't do anything with this. The character 161 * should already be specified as a digit or alpha. 162 */ 163 break; 164 default: 165 errf("not a valid character class"); 166 } 167 } 168 169 static ctype_node_t * 170 get_ctype(wchar_t wc) 171 { 172 ctype_node_t srch; 173 ctype_node_t *ctn; 174 175 srch.wc = wc; 176 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 177 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 178 errf("out of memory"); 179 return (NULL); 180 } 181 ctn->wc = wc; 182 183 RB_INSERT(ctypes, &ctypes, ctn); 184 } 185 return (ctn); 186 } 187 188 void 189 add_ctype(int val) 190 { 191 ctype_node_t *ctn; 192 193 if ((ctn = get_ctype(val)) == NULL) { 194 INTERR; 195 return; 196 } 197 add_ctype_impl(ctn); 198 last_ctype = ctn->wc; 199 } 200 201 void 202 add_ctype_range(wchar_t end) 203 { 204 ctype_node_t *ctn; 205 wchar_t cur; 206 207 if (end < last_ctype) { 208 errf("malformed character range (%u ... %u))", 209 last_ctype, end); 210 return; 211 } 212 for (cur = last_ctype + 1; cur <= end; cur++) { 213 if ((ctn = get_ctype(cur)) == NULL) { 214 INTERR; 215 return; 216 } 217 add_ctype_impl(ctn); 218 } 219 last_ctype = end; 220 221 } 222 223 /* 224 * A word about widths: if the width mask is specified, then libc 225 * unconditionally honors it. Otherwise, it assumes printable 226 * characters have width 1, and non-printable characters have width 227 * -1 (except for NULL which is special with width 0). Hence, we have 228 * no need to inject defaults here -- the "default" unset value of 0 229 * indicates that libc should use its own logic in wcwidth as described. 230 */ 231 void 232 add_width(int wc, int width) 233 { 234 ctype_node_t *ctn; 235 236 if ((ctn = get_ctype(wc)) == NULL) { 237 INTERR; 238 return; 239 } 240 ctn->ctype &= ~(_CTYPE_SWM); 241 switch (width) { 242 case 0: 243 ctn->ctype |= _CTYPE_SW0; 244 break; 245 case 1: 246 ctn->ctype |= _CTYPE_SW1; 247 break; 248 case 2: 249 ctn->ctype |= _CTYPE_SW2; 250 break; 251 case 3: 252 ctn->ctype |= _CTYPE_SW3; 253 break; 254 } 255 } 256 257 void 258 add_width_range(int start, int end, int width) 259 { 260 for (; start <= end; start++) { 261 add_width(start, width); 262 } 263 } 264 265 void 266 add_caseconv(int val, int wc) 267 { 268 ctype_node_t *ctn; 269 270 ctn = get_ctype(val); 271 if (ctn == NULL) { 272 INTERR; 273 return; 274 } 275 276 switch (last_kw) { 277 case T_TOUPPER: 278 ctn->toupper = wc; 279 break; 280 case T_TOLOWER: 281 ctn->tolower = wc; 282 break; 283 default: 284 INTERR; 285 break; 286 } 287 } 288 289 void 290 dump_ctype(void) 291 { 292 FILE *f; 293 _FileRuneLocale rl; 294 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 295 _FileRuneEntry *ct = NULL; 296 _FileRuneEntry *lo = NULL; 297 _FileRuneEntry *up = NULL; 298 wchar_t wc; 299 300 (void) memset(&rl, 0, sizeof (rl)); 301 last_ct = NULL; 302 last_lo = NULL; 303 last_up = NULL; 304 305 if ((f = open_category()) == NULL) 306 return; 307 308 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 309 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 310 311 /* 312 * Initialize the identity map. 313 */ 314 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 315 rl.maplower[wc] = wc; 316 rl.mapupper[wc] = wc; 317 } 318 319 RB_FOREACH(ctn, ctypes, &ctypes) { 320 int conflict = 0; 321 322 wc = ctn->wc; 323 324 /* 325 * POSIX requires certain portable characters have 326 * certain types. Add them if they are missing. 327 */ 328 if ((wc >= 1) && (wc <= 127)) { 329 if ((wc >= 'A') && (wc <= 'Z')) 330 ctn->ctype |= _ISUPPER; 331 if ((wc >= 'a') && (wc <= 'z')) 332 ctn->ctype |= _ISLOWER; 333 if ((wc >= '0') && (wc <= '9')) 334 ctn->ctype |= _ISDIGIT; 335 if (wc == ' ') 336 ctn->ctype |= _ISPRINT; 337 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 338 ctn->ctype |= _ISSPACE; 339 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 340 ctn->ctype |= _ISXDIGIT; 341 if (strchr(" \t", (char)wc)) 342 ctn->ctype |= _ISBLANK; 343 344 /* 345 * Technically these settings are only 346 * required for the C locale. However, it 347 * turns out that because of the historical 348 * version of isprint(), we need them for all 349 * locales as well. Note that these are not 350 * necessarily valid punctation characters in 351 * the current language, but ispunct() needs 352 * to return TRUE for them. 353 */ 354 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 355 (char)wc)) 356 ctn->ctype |= _ISPUNCT; 357 } 358 359 /* 360 * POSIX also requires that certain types imply 361 * others. Add any inferred types here. 362 */ 363 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 364 ctn->ctype |= _ISALPHA; 365 if (ctn->ctype & _ISDIGIT) 366 ctn->ctype |= _ISXDIGIT; 367 if (ctn->ctype & _ISBLANK) 368 ctn->ctype |= _ISSPACE; 369 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 370 ctn->ctype |= _ISGRAPH; 371 if (ctn->ctype & _ISGRAPH) 372 ctn->ctype |= _ISPRINT; 373 374 /* 375 * Finally, POSIX requires that certain combinations 376 * are invalid. We don't flag this as a fatal error, 377 * but we will warn about. 378 */ 379 if ((ctn->ctype & _ISALPHA) && 380 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 381 conflict++; 382 if ((ctn->ctype & _ISPUNCT) && 383 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 384 conflict++; 385 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 386 conflict++; 387 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT)) 388 conflict++; 389 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 390 conflict++; 391 392 if (conflict) { 393 warn("conflicting classes for character 0x%x (%x)", 394 wc, ctn->ctype); 395 } 396 /* 397 * Handle the lower 256 characters using the simple 398 * optimization. Note that if we have not defined the 399 * upper/lower case, then we identity map it. 400 */ 401 if ((unsigned)wc < _CACHED_RUNES) { 402 rl.runetype[wc] = ctn->ctype; 403 if (ctn->tolower) 404 rl.maplower[wc] = ctn->tolower; 405 if (ctn->toupper) 406 rl.mapupper[wc] = ctn->toupper; 407 continue; 408 } 409 410 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 411 (last_ct->wc + 1 == wc)) { 412 ct[rl.runetype_ext_nranges-1].max = wc; 413 } else { 414 rl.runetype_ext_nranges++; 415 ct = realloc(ct, 416 sizeof (*ct) * rl.runetype_ext_nranges); 417 ct[rl.runetype_ext_nranges - 1].min = wc; 418 ct[rl.runetype_ext_nranges - 1].max = wc; 419 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 420 } 421 last_ct = ctn; 422 if (ctn->tolower == 0) { 423 last_lo = NULL; 424 } else if ((last_lo != NULL) && 425 (last_lo->tolower + 1 == ctn->tolower)) { 426 lo[rl.maplower_ext_nranges-1].max = wc; 427 last_lo = ctn; 428 } else { 429 rl.maplower_ext_nranges++; 430 lo = realloc(lo, 431 sizeof (*lo) * rl.maplower_ext_nranges); 432 lo[rl.maplower_ext_nranges - 1].min = wc; 433 lo[rl.maplower_ext_nranges - 1].max = wc; 434 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 435 last_lo = ctn; 436 } 437 438 if (ctn->toupper == 0) { 439 last_up = NULL; 440 } else if ((last_up != NULL) && 441 (last_up->toupper + 1 == ctn->toupper)) { 442 up[rl.mapupper_ext_nranges-1].max = wc; 443 last_up = ctn; 444 } else { 445 rl.mapupper_ext_nranges++; 446 up = realloc(up, 447 sizeof (*up) * rl.mapupper_ext_nranges); 448 up[rl.mapupper_ext_nranges - 1].min = wc; 449 up[rl.mapupper_ext_nranges - 1].max = wc; 450 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 451 last_up = ctn; 452 } 453 } 454 455 if ((wr_category(&rl, sizeof (rl), f) < 0) || 456 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 457 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 458 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 459 return; 460 } 461 462 close_category(f); 463 } 464