1 /*-
2 * Copyright 2018 Nexenta Systems, Inc.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
5 *
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * LC_CTYPE database generation routines for localedef.
34 */
35 #include <sys/cdefs.h>
36 #include <sys/tree.h>
37
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <stddef.h>
41 #include <string.h>
42 #include <sys/types.h>
43 #include <wchar.h>
44 #include <unistd.h>
45 #include "localedef.h"
46 #include "parser.h"
47
48 /* Always include the defines for the target: */
49 #define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */
50 #include "_ctype.h"
51 #include "runefile.h"
52
53
/* Needed for bootstrapping, _CTYPE_N */
/* Fallback value, used only when the included headers did not define it. */
#ifndef _CTYPE_N
#define _CTYPE_N 0x00400000L
#endif

/*
 * Short local aliases for the _CTYPE_* class bits (presumably supplied by
 * "_ctype.h" -- confirm).  These are the names used by add_ctype_impl()
 * and dump_ctype() below.
 */
#define _ISUPPER _CTYPE_U
#define _ISLOWER _CTYPE_L
#define _ISDIGIT _CTYPE_D
#define _ISXDIGIT _CTYPE_X
#define _ISSPACE _CTYPE_S
#define _ISBLANK _CTYPE_B
#define _ISALPHA _CTYPE_A
#define _ISPUNCT _CTYPE_P
#define _ISGRAPH _CTYPE_G
#define _ISPRINT _CTYPE_R
#define _ISCNTRL _CTYPE_C
/*
 * Extension classes: _E1 phonogram, _E2 ideogram, _E3 english,
 * _E4 number, _E5 special (see the T_IS* cases in add_ctype_impl()).
 * _E3 is 0, so the "english" class contributes no bit of its own.
 */
#define _E1 _CTYPE_Q
#define _E2 _CTYPE_I
#define _E3 0
#define _E4 _CTYPE_N
#define _E5 _CTYPE_T
75
/*
 * Code point most recently recorded by add_ctype() or add_ctype_range();
 * add_ctype_range() uses it as the (exclusive) start of the next range.
 */
static wchar_t last_ctype;
static int ctype_compare(const void *n1, const void *n2);

/*
 * Per-character record, kept in a red-black tree keyed on wc.
 * Nodes are created zero-filled by get_ctype().
 */
typedef struct ctype_node {
	wchar_t wc;			/* tree key: the character code */
	int32_t ctype;			/* accumulated class and width bits */
	int32_t toupper;		/* upper-case mapping; 0 means none set */
	int32_t tolower;		/* lower-case mapping; 0 means none set */
	RB_ENTRY(ctype_node) entry;	/* tree linkage */
} ctype_node_t;

/* The tree of all characters seen so far, ordered by ctype_compare(). */
static RB_HEAD(ctypes, ctype_node) ctypes;
RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
89
90 static int
ctype_compare(const void * n1,const void * n2)91 ctype_compare(const void *n1, const void *n2)
92 {
93 const ctype_node_t *c1 = n1;
94 const ctype_node_t *c2 = n2;
95
96 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
97 }
98
99 void
init_ctype(void)100 init_ctype(void)
101 {
102 RB_INIT(&ctypes);
103 }
104
105
106 static void
add_ctype_impl(ctype_node_t * ctn)107 add_ctype_impl(ctype_node_t *ctn)
108 {
109 switch (last_kw) {
110 case T_ISUPPER:
111 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
112 break;
113 case T_ISLOWER:
114 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
115 break;
116 case T_ISALPHA:
117 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
118 break;
119 case T_ISDIGIT:
120 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
121 break;
122 case T_ISSPACE:
123 /*
124 * This can be troublesome as <form-feed>, <newline>,
125 * <carriage-return>, <tab>, and <vertical-tab> are defined both
126 * as space and cntrl, and POSIX doesn't allow cntrl/print
127 * combination. We will take care of this in dump_ctype().
128 */
129 ctn->ctype |= (_ISSPACE | _ISPRINT);
130 break;
131 case T_ISCNTRL:
132 ctn->ctype |= _ISCNTRL;
133 break;
134 case T_ISGRAPH:
135 ctn->ctype |= (_ISGRAPH | _ISPRINT);
136 break;
137 case T_ISPRINT:
138 ctn->ctype |= _ISPRINT;
139 break;
140 case T_ISPUNCT:
141 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
142 break;
143 case T_ISXDIGIT:
144 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
145 break;
146 case T_ISBLANK:
147 ctn->ctype |= (_ISBLANK | _ISSPACE);
148 break;
149 case T_ISPHONOGRAM:
150 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
151 break;
152 case T_ISIDEOGRAM:
153 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
154 break;
155 case T_ISENGLISH:
156 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
157 break;
158 case T_ISNUMBER:
159 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
160 break;
161 case T_ISSPECIAL:
162 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
163 break;
164 case T_ISALNUM:
165 /*
166 * We can't do anything with this. The character
167 * should already be specified as a digit or alpha.
168 */
169 break;
170 default:
171 errf("not a valid character class");
172 }
173 }
174
175 static ctype_node_t *
get_ctype(wchar_t wc)176 get_ctype(wchar_t wc)
177 {
178 ctype_node_t srch;
179 ctype_node_t *ctn;
180
181 srch.wc = wc;
182 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
183 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
184 errf("out of memory");
185 return (NULL);
186 }
187 ctn->wc = wc;
188
189 RB_INSERT(ctypes, &ctypes, ctn);
190 }
191 return (ctn);
192 }
193
194 void
add_ctype(int val)195 add_ctype(int val)
196 {
197 ctype_node_t *ctn;
198
199 if ((ctn = get_ctype(val)) == NULL) {
200 INTERR;
201 return;
202 }
203 add_ctype_impl(ctn);
204 last_ctype = ctn->wc;
205 }
206
207 void
add_ctype_range(wchar_t end)208 add_ctype_range(wchar_t end)
209 {
210 ctype_node_t *ctn;
211 wchar_t cur;
212
213 if (end < last_ctype) {
214 errf("malformed character range (%u ... %u))",
215 last_ctype, end);
216 return;
217 }
218 for (cur = last_ctype + 1; cur <= end; cur++) {
219 if ((ctn = get_ctype(cur)) == NULL) {
220 INTERR;
221 return;
222 }
223 add_ctype_impl(ctn);
224 }
225 last_ctype = end;
226
227 }
228
229 /*
230 * A word about widths: if the width mask is specified, then libc
231 * unconditionally honors it. Otherwise, it assumes printable
232 * characters have width 1, and non-printable characters have width
233 * -1 (except for NULL which is special with width 0). Hence, we have
234 * no need to inject defaults here -- the "default" unset value of 0
235 * indicates that libc should use its own logic in wcwidth as described.
236 */
237 void
add_width(int wc,int width)238 add_width(int wc, int width)
239 {
240 ctype_node_t *ctn;
241
242 if ((ctn = get_ctype(wc)) == NULL) {
243 INTERR;
244 return;
245 }
246 ctn->ctype &= ~(_CTYPE_SWM);
247 switch (width) {
248 case 0:
249 ctn->ctype |= _CTYPE_SW0;
250 break;
251 case 1:
252 ctn->ctype |= _CTYPE_SW1;
253 break;
254 case 2:
255 ctn->ctype |= _CTYPE_SW2;
256 break;
257 case 3:
258 ctn->ctype |= _CTYPE_SW3;
259 break;
260 }
261 }
262
/*
 * Record the same display width for every character from start through
 * end inclusive.  Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
270
271 void
add_caseconv(int val,int wc)272 add_caseconv(int val, int wc)
273 {
274 ctype_node_t *ctn;
275
276 ctn = get_ctype(val);
277 if (ctn == NULL) {
278 INTERR;
279 return;
280 }
281
282 switch (last_kw) {
283 case T_TOUPPER:
284 ctn->toupper = wc;
285 break;
286 case T_TOLOWER:
287 ctn->tolower = wc;
288 break;
289 default:
290 INTERR;
291 break;
292 }
293 }
294
295 void
dump_ctype(void)296 dump_ctype(void)
297 {
298 FILE *f;
299 _FileRuneLocale rl;
300 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
301 _FileRuneEntry *ct = NULL;
302 _FileRuneEntry *lo = NULL;
303 _FileRuneEntry *up = NULL;
304 wchar_t wc;
305 uint32_t runetype_ext_nranges;
306 uint32_t maplower_ext_nranges;
307 uint32_t mapupper_ext_nranges;
308
309 (void) memset(&rl, 0, sizeof (rl));
310 runetype_ext_nranges = 0;
311 last_ct = NULL;
312 maplower_ext_nranges = 0;
313 last_lo = NULL;
314 mapupper_ext_nranges = 0;
315 last_up = NULL;
316
317 if ((f = open_category()) == NULL)
318 return;
319
320 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
321 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
322
323 /*
324 * Initialize the identity map.
325 */
326 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
327 rl.maplower[wc] = htote(wc);
328 rl.mapupper[wc] = htote(wc);
329 }
330
331 RB_FOREACH(ctn, ctypes, &ctypes) {
332 int conflict = 0;
333
334 wc = ctn->wc;
335
336 /*
337 * POSIX requires certain portable characters have
338 * certain types. Add them if they are missing.
339 */
340 if ((wc >= 1) && (wc <= 127)) {
341 if ((wc >= 'A') && (wc <= 'Z'))
342 ctn->ctype |= _ISUPPER;
343 if ((wc >= 'a') && (wc <= 'z'))
344 ctn->ctype |= _ISLOWER;
345 if ((wc >= '0') && (wc <= '9'))
346 ctn->ctype |= _ISDIGIT;
347 if (wc == ' ')
348 ctn->ctype |= _ISPRINT;
349 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
350 ctn->ctype |= _ISSPACE;
351 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
352 ctn->ctype |= _ISXDIGIT;
353 if (strchr(" \t", (char)wc))
354 ctn->ctype |= _ISBLANK;
355
356 /*
357 * Technically these settings are only
358 * required for the C locale. However, it
359 * turns out that because of the historical
360 * version of isprint(), we need them for all
361 * locales as well. Note that these are not
362 * necessarily valid punctation characters in
363 * the current language, but ispunct() needs
364 * to return TRUE for them.
365 */
366 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
367 (char)wc))
368 ctn->ctype |= _ISPUNCT;
369 }
370
371 /*
372 * POSIX also requires that certain types imply
373 * others. Add any inferred types here.
374 */
375 if (ctn->ctype & (_ISUPPER |_ISLOWER))
376 ctn->ctype |= _ISALPHA;
377 if (ctn->ctype & _ISDIGIT)
378 ctn->ctype |= _ISXDIGIT;
379 if (ctn->ctype & _ISBLANK)
380 ctn->ctype |= _ISSPACE;
381 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
382 ctn->ctype |= _ISGRAPH;
383 if (ctn->ctype & _ISGRAPH)
384 ctn->ctype |= _ISPRINT;
385
386 /*
387 * POSIX requires that certain combinations are invalid.
388 * Try fixing the cases we know about (see add_ctype_impl()).
389 */
390 if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL))
391 ctn->ctype &= ~_ISPRINT;
392
393 /*
394 * Finally, don't flag remaining cases as a fatal error,
395 * and just warn about them.
396 */
397 if ((ctn->ctype & _ISALPHA) &&
398 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
399 conflict++;
400 if ((ctn->ctype & _ISPUNCT) &&
401 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
402 conflict++;
403 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
404 conflict++;
405 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
406 conflict++;
407 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
408 conflict++;
409
410 if (conflict) {
411 warn("conflicting classes for character 0x%x (%x)",
412 wc, ctn->ctype);
413 }
414 /*
415 * Handle the lower 256 characters using the simple
416 * optimization. Note that if we have not defined the
417 * upper/lower case, then we identity map it.
418 */
419 if ((unsigned)wc < _CACHED_RUNES) {
420 rl.runetype[wc] = htote(ctn->ctype);
421 if (ctn->tolower)
422 rl.maplower[wc] = htote(ctn->tolower);
423 if (ctn->toupper)
424 rl.mapupper[wc] = htote(ctn->toupper);
425 continue;
426 }
427
428 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
429 (last_ct->wc + 1 == wc)) {
430 ct[runetype_ext_nranges - 1].max = htote(wc);
431 } else {
432 runetype_ext_nranges++;
433 ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges);
434 ct[runetype_ext_nranges - 1].min = htote(wc);
435 ct[runetype_ext_nranges - 1].max = htote(wc);
436 ct[runetype_ext_nranges - 1].map =
437 htote(ctn->ctype);
438 }
439 last_ct = ctn;
440 if (ctn->tolower == 0) {
441 last_lo = NULL;
442 } else if ((last_lo != NULL) &&
443 (last_lo->tolower + 1 == ctn->tolower)) {
444 lo[maplower_ext_nranges - 1].max = htote(wc);
445 last_lo = ctn;
446 } else {
447 maplower_ext_nranges++;
448 lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges);
449 lo[maplower_ext_nranges - 1].min = htote(wc);
450 lo[maplower_ext_nranges - 1].max = htote(wc);
451 lo[maplower_ext_nranges - 1].map =
452 htote(ctn->tolower);
453 last_lo = ctn;
454 }
455
456 if (ctn->toupper == 0) {
457 last_up = NULL;
458 } else if ((last_up != NULL) &&
459 (last_up->toupper + 1 == ctn->toupper)) {
460 up[mapupper_ext_nranges-1].max = htote(wc);
461 last_up = ctn;
462 } else {
463 mapupper_ext_nranges++;
464 up = realloc(up, sizeof (*up) * mapupper_ext_nranges);
465 up[mapupper_ext_nranges - 1].min = htote(wc);
466 up[mapupper_ext_nranges - 1].max = htote(wc);
467 up[mapupper_ext_nranges - 1].map =
468 htote(ctn->toupper);
469 last_up = ctn;
470 }
471 }
472
473 rl.runetype_ext_nranges = htote(runetype_ext_nranges);
474 rl.maplower_ext_nranges = htote(maplower_ext_nranges);
475 rl.mapupper_ext_nranges = htote(mapupper_ext_nranges);
476 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
477 (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) ||
478 (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) ||
479 (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) {
480 return;
481 }
482
483 close_category(f);
484 }
485