1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * LC_CTYPE database generation routines for localedef. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <string.h> 23 #include <sys/types.h> 24 #include <sys/avl.h> 25 #include <wchar.h> 26 #include <ctype.h> 27 #include <wctype.h> 28 #include <unistd.h> 29 #include "localedef.h" 30 #include "parser.tab.h" 31 #include "runefile.h" 32 33 static avl_tree_t ctypes; 34 35 static wchar_t last_ctype; 36 37 typedef struct ctype_node { 38 wchar_t wc; 39 int32_t ctype; 40 int32_t toupper; 41 int32_t tolower; 42 avl_node_t avl; 43 } ctype_node_t; 44 45 static int 46 ctype_compare(const void *n1, const void *n2) 47 { 48 const ctype_node_t *c1 = n1; 49 const ctype_node_t *c2 = n2; 50 51 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 52 } 53 54 void 55 init_ctype(void) 56 { 57 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 58 offsetof(ctype_node_t, avl)); 59 } 60 61 62 static void 63 add_ctype_impl(ctype_node_t *ctn) 64 { 65 switch (last_kw) { 66 case T_ISUPPER: 67 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 68 break; 69 case T_ISLOWER: 70 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 71 break; 72 case T_ISALPHA: 73 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 74 break; 75 case T_ISDIGIT: 76 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 77 break; 78 case T_ISSPACE: 79 ctn->ctype |= _ISSPACE; 80 break; 81 case T_ISCNTRL: 82 ctn->ctype |= _ISCNTRL; 83 break; 84 case T_ISGRAPH: 85 ctn->ctype |= (_ISGRAPH | _ISPRINT); 86 break; 87 case T_ISPRINT: 88 ctn->ctype |= _ISPRINT; 89 break; 90 case T_ISPUNCT: 91 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 92 break; 93 case T_ISXDIGIT: 94 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 95 break; 96 case T_ISBLANK: 97 ctn->ctype |= (_ISBLANK | _ISSPACE); 98 break; 99 case T_ISPHONOGRAM: 100 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 101 break; 102 case T_ISIDEOGRAM: 103 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 104 break; 105 case T_ISENGLISH: 106 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 107 break; 108 case T_ISNUMBER: 109 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 110 break; 111 case T_ISSPECIAL: 112 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 113 break; 114 case T_ISALNUM: 115 /* 116 * We can't do anything with this. The character 117 * should already be specified as a digit or alpha. 118 */ 119 break; 120 default: 121 errf(_("not a valid character class")); 122 } 123 } 124 125 static ctype_node_t * 126 get_ctype(wchar_t wc) 127 { 128 ctype_node_t srch; 129 ctype_node_t *ctn; 130 avl_index_t where; 131 132 srch.wc = wc; 133 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 134 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 135 errf(_("out of memory")); 136 return (NULL); 137 } 138 ctn->wc = wc; 139 140 avl_insert(&ctypes, ctn, where); 141 } 142 return (ctn); 143 } 144 145 void 146 add_ctype(int val) 147 { 148 ctype_node_t *ctn; 149 150 if ((ctn = get_ctype(val)) == NULL) { 151 INTERR; 152 return; 153 } 154 add_ctype_impl(ctn); 155 last_ctype = ctn->wc; 156 } 157 158 void 159 add_ctype_range(int end) 160 { 161 ctype_node_t *ctn; 162 wchar_t cur; 163 164 if (end < last_ctype) { 165 errf(_("malformed character range (%u ... %u))"), 166 last_ctype, end); 167 return; 168 } 169 for (cur = last_ctype + 1; cur <= end; cur++) { 170 if ((ctn = get_ctype(cur)) == NULL) { 171 INTERR; 172 return; 173 } 174 add_ctype_impl(ctn); 175 } 176 last_ctype = end; 177 178 } 179 180 void 181 add_caseconv(int val, int wc) 182 { 183 ctype_node_t *ctn; 184 185 ctn = get_ctype(val); 186 if (ctn == NULL) { 187 INTERR; 188 return; 189 } 190 191 switch (last_kw) { 192 case T_TOUPPER: 193 ctn->toupper = wc; 194 break; 195 case T_TOLOWER: 196 ctn->tolower = wc; 197 break; 198 default: 199 INTERR; 200 break; 201 } 202 } 203 204 void 205 dump_ctype(void) 206 { 207 FILE *f; 208 _FileRuneLocale rl; 209 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 210 _FileRuneEntry *ct = NULL; 211 _FileRuneEntry *lo = NULL; 212 _FileRuneEntry *up = NULL; 213 214 (void) memset(&rl, 0, sizeof (rl)); 215 last_ct = NULL; 216 last_lo = NULL; 217 last_up = NULL; 218 219 if ((f = open_category()) == NULL) 220 return; 221 222 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 223 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 224 225 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 226 227 wchar_t wc = ctn->wc; 228 int conflict = 0; 229 230 /* 231 * POSIX requires certain portable characters have 232 * certain types. Add them if they are missing. 233 */ 234 if ((wc >= 1) && (wc <= 127)) { 235 if ((wc >= 'A') && (wc <= 'Z')) 236 ctn->ctype |= _ISUPPER; 237 if ((wc >= 'a') && (wc <= 'z')) 238 ctn->ctype |= _ISLOWER; 239 if ((wc >= '0') && (wc <= '9')) 240 ctn->ctype |= _ISDIGIT; 241 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 242 ctn->ctype |= _ISSPACE; 243 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 244 ctn->ctype |= _ISXDIGIT; 245 if (strchr(" \t", (char)wc)) 246 ctn->ctype |= _ISBLANK; 247 248 /* 249 * Technically these settings are only 250 * required for the C locale. However, it 251 * turns out that because of the historical 252 * version of isprint(), we need them for all 253 * locales as well. Note that these are not 254 * necessarily valid punctation characters in 255 * the current language, but ispunct() needs 256 * to return TRUE for them. 257 */ 258 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 259 (char)wc)) 260 ctn->ctype |= _ISPUNCT; 261 } 262 263 /* 264 * POSIX also requires that certain types imply 265 * others. Add any inferred types here. 266 */ 267 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 268 ctn->ctype |= _ISALPHA; 269 if (ctn->ctype & _ISDIGIT) 270 ctn->ctype |= _ISXDIGIT; 271 if (ctn->ctype & _ISBLANK) 272 ctn->ctype |= _ISSPACE; 273 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 274 ctn->ctype |= _ISGRAPH; 275 if (ctn->ctype & _ISGRAPH) 276 ctn->ctype |= _ISPRINT; 277 278 /* 279 * Finally, POSIX requires that certain combinations 280 * are invalid. We don't flag this as a fatal error, 281 * but we will warn about. 282 */ 283 if ((ctn->ctype & _ISALPHA) && 284 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 285 conflict++; 286 if ((ctn->ctype & _ISPUNCT) & 287 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 288 conflict++; 289 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 290 conflict++; 291 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 292 conflict++; 293 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 294 conflict++; 295 296 if (conflict) { 297 warn("conflicting classes for character 0x%x (%x)", 298 wc, ctn->ctype); 299 } 300 /* 301 * Handle the lower 256 characters using the simple 302 * optimization. Note that if we have not defined the 303 * upper/lower case, then we identity map it. 304 */ 305 if (wc < _CACHED_RUNES) { 306 rl.runetype[wc] = ctn->ctype; 307 rl.maplower[wc] = ctn->tolower ? ctn->tolower : wc; 308 rl.mapupper[wc] = ctn->toupper ? ctn->toupper : wc; 309 continue; 310 } 311 312 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 313 ct[rl.runetype_ext_nranges-1].max = wc; 314 last_ct = ctn; 315 } else { 316 rl.runetype_ext_nranges++; 317 ct = realloc(ct, 318 sizeof (*ct) * rl.runetype_ext_nranges); 319 ct[rl.runetype_ext_nranges - 1].min = wc; 320 ct[rl.runetype_ext_nranges - 1].max = wc; 321 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 322 last_ct = ctn; 323 } 324 if (ctn->tolower == 0) { 325 last_lo = NULL; 326 } else if ((last_lo != NULL) && 327 (last_lo->tolower + 1 == ctn->tolower)) { 328 lo[rl.maplower_ext_nranges-1].max = wc; 329 last_lo = ctn; 330 } else { 331 rl.maplower_ext_nranges++; 332 lo = realloc(lo, 333 sizeof (*lo) * rl.maplower_ext_nranges); 334 lo[rl.maplower_ext_nranges - 1].min = wc; 335 lo[rl.maplower_ext_nranges - 1].max = wc; 336 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 337 last_lo = ctn; 338 } 339 340 if (ctn->toupper == 0) { 341 last_up = NULL; 342 } else if ((last_up != NULL) && 343 (last_up->toupper + 1 == ctn->toupper)) { 344 up[rl.mapupper_ext_nranges-1].max = wc; 345 last_up = ctn; 346 } else { 347 rl.mapupper_ext_nranges++; 348 up = realloc(up, 349 sizeof (*up) * rl.mapupper_ext_nranges); 350 up[rl.mapupper_ext_nranges - 1].min = wc; 351 up[rl.mapupper_ext_nranges - 1].max = wc; 352 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 353 last_up = ctn; 354 } 355 } 356 357 if ((wr_category(&rl, sizeof (rl), f) < 0) || 358 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 359 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 360 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 361 return; 362 } 363 364 close_category(f); 365 } 366