1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010,2011 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2012 Garrett D'Amore <garrett@damore.org>
15 * Copyright 2013 DEY Storage Systems, Inc.
16 */
17
18 /*
19 * LC_CTYPE database generation routines for localedef.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/types.h>
26 #include <sys/avl.h>
27 #include <wchar.h>
28 #include <ctype.h>
29 #include <wctype.h>
30 #include <unistd.h>
31 #include "_ctype.h"
32 #include "localedef.h"
33 #include "parser.tab.h"
34 #include "runefile.h"
35
36 static avl_tree_t ctypes;
37
38 static wchar_t last_ctype;
39
40 typedef struct ctype_node {
41 wchar_t wc;
42 int32_t ctype;
43 int32_t toupper;
44 int32_t tolower;
45 avl_node_t avl;
46 } ctype_node_t;
47
48 typedef struct width_node {
49 wchar_t start;
50 wchar_t end;
51 int8_t width;
52 avl_node_t avl;
53 } width_node_t;
54
55 static int
ctype_compare(const void * n1,const void * n2)56 ctype_compare(const void *n1, const void *n2)
57 {
58 const ctype_node_t *c1 = n1;
59 const ctype_node_t *c2 = n2;
60
61 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
62 }
63
64 void
init_ctype(void)65 init_ctype(void)
66 {
67 avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
68 offsetof(ctype_node_t, avl));
69 }
70
71
72 static void
add_ctype_impl(ctype_node_t * ctn)73 add_ctype_impl(ctype_node_t *ctn)
74 {
75 switch (last_kw) {
76 case T_ISUPPER:
77 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
78 break;
79 case T_ISLOWER:
80 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
81 break;
82 case T_ISALPHA:
83 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
84 break;
85 case T_ISDIGIT:
86 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
87 break;
88 case T_ISSPACE:
89 ctn->ctype |= _ISSPACE;
90 break;
91 case T_ISCNTRL:
92 ctn->ctype |= _ISCNTRL;
93 break;
94 case T_ISGRAPH:
95 ctn->ctype |= (_ISGRAPH | _ISPRINT);
96 break;
97 case T_ISPRINT:
98 ctn->ctype |= _ISPRINT;
99 break;
100 case T_ISPUNCT:
101 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
102 break;
103 case T_ISXDIGIT:
104 ctn->ctype |= (_ISXDIGIT | _ISPRINT);
105 break;
106 case T_ISBLANK:
107 ctn->ctype |= (_ISBLANK | _ISSPACE);
108 break;
109 case T_ISPHONOGRAM:
110 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
111 break;
112 case T_ISIDEOGRAM:
113 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
114 break;
115 case T_ISENGLISH:
116 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
117 break;
118 case T_ISNUMBER:
119 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
120 break;
121 case T_ISSPECIAL:
122 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
123 break;
124 case T_ISALNUM:
125 /*
126 * We can't do anything with this. The character
127 * should already be specified as a digit or alpha.
128 */
129 break;
130 default:
131 errf(_("not a valid character class"));
132 }
133 }
134
135 static ctype_node_t *
get_ctype(wchar_t wc)136 get_ctype(wchar_t wc)
137 {
138 ctype_node_t srch;
139 ctype_node_t *ctn;
140 avl_index_t where;
141
142 srch.wc = wc;
143 if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
144 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
145 errf(_("out of memory"));
146 return (NULL);
147 }
148 ctn->wc = wc;
149
150 avl_insert(&ctypes, ctn, where);
151 }
152 return (ctn);
153 }
154
155 void
add_ctype(int val)156 add_ctype(int val)
157 {
158 ctype_node_t *ctn;
159
160 if ((ctn = get_ctype(val)) == NULL) {
161 INTERR;
162 return;
163 }
164 add_ctype_impl(ctn);
165 last_ctype = ctn->wc;
166 }
167
168 void
add_ctype_range(int end)169 add_ctype_range(int end)
170 {
171 ctype_node_t *ctn;
172 wchar_t cur;
173
174 if (end < last_ctype) {
175 errf(_("malformed character range (%u ... %u))"),
176 last_ctype, end);
177 return;
178 }
179 for (cur = last_ctype + 1; cur <= end; cur++) {
180 if ((ctn = get_ctype(cur)) == NULL) {
181 INTERR;
182 return;
183 }
184 add_ctype_impl(ctn);
185 }
186 last_ctype = end;
187
188 }
189
190 /*
191 * A word about widths: if the width mask is specified, then libc
192 * unconditionally honors it. Otherwise, it assumes printable
193 * characters have width 1, and non-printable characters have width
194 * -1 (except for NULL which is special with with 0). Hence, we have
195 * no need to inject defaults here -- the "default" unset value of 0
196 * indicates that libc should use its own logic in wcwidth as described.
197 */
198 void
add_width(int wc,int width)199 add_width(int wc, int width)
200 {
201 ctype_node_t *ctn;
202
203 if ((ctn = get_ctype(wc)) == NULL) {
204 INTERR;
205 return;
206 }
207 ctn->ctype &= ~(_CTYPE_SWM);
208 switch (width) {
209 case 0:
210 ctn->ctype |= _CTYPE_SW0;
211 break;
212 case 1:
213 ctn->ctype |= _CTYPE_SW1;
214 break;
215 case 2:
216 ctn->ctype |= _CTYPE_SW2;
217 break;
218 case 3:
219 ctn->ctype |= _CTYPE_SW3;
220 break;
221 }
222 }
223
/*
 * Record the same width for every character from start through end,
 * inclusive.  Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
231
232 void
add_caseconv(int val,int wc)233 add_caseconv(int val, int wc)
234 {
235 ctype_node_t *ctn;
236
237 ctn = get_ctype(val);
238 if (ctn == NULL) {
239 INTERR;
240 return;
241 }
242
243 switch (last_kw) {
244 case T_TOUPPER:
245 ctn->toupper = wc;
246 break;
247 case T_TOLOWER:
248 ctn->tolower = wc;
249 break;
250 default:
251 INTERR;
252 break;
253 }
254 }
255
256 void
dump_ctype(void)257 dump_ctype(void)
258 {
259 FILE *f;
260 _FileRuneLocale rl;
261 ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
262 _FileRuneEntry *ct = NULL;
263 _FileRuneEntry *lo = NULL;
264 _FileRuneEntry *up = NULL;
265 wchar_t wc;
266
267 (void) memset(&rl, 0, sizeof (rl));
268 last_ct = NULL;
269 last_lo = NULL;
270 last_up = NULL;
271
272 if ((f = open_category()) == NULL)
273 return;
274
275 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
276 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
277
278 /*
279 * Initialize the identity map.
280 */
281 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
282 rl.maplower[wc] = wc;
283 rl.mapupper[wc] = wc;
284 }
285
286 for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
287 int conflict = 0;
288
289
290 wc = ctn->wc;
291
292 /*
293 * POSIX requires certain portable characters have
294 * certain types. Add them if they are missing.
295 */
296 if ((wc >= 1) && (wc <= 127)) {
297 if ((wc >= 'A') && (wc <= 'Z'))
298 ctn->ctype |= _ISUPPER;
299 if ((wc >= 'a') && (wc <= 'z'))
300 ctn->ctype |= _ISLOWER;
301 if ((wc >= '0') && (wc <= '9'))
302 ctn->ctype |= _ISDIGIT;
303 if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
304 ctn->ctype |= _ISSPACE;
305 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
306 ctn->ctype |= _ISXDIGIT;
307 if (strchr(" \t", (char)wc))
308 ctn->ctype |= _ISBLANK;
309
310 /*
311 * Technically these settings are only
312 * required for the C locale. However, it
313 * turns out that because of the historical
314 * version of isprint(), we need them for all
315 * locales as well. Note that these are not
316 * necessarily valid punctation characters in
317 * the current language, but ispunct() needs
318 * to return TRUE for them.
319 */
320 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
321 (char)wc))
322 ctn->ctype |= _ISPUNCT;
323 }
324
325 /*
326 * POSIX also requires that certain types imply
327 * others. Add any inferred types here.
328 */
329 if (ctn->ctype & (_ISUPPER |_ISLOWER))
330 ctn->ctype |= _ISALPHA;
331 if (ctn->ctype & _ISDIGIT)
332 ctn->ctype |= _ISXDIGIT;
333 if (ctn->ctype & _ISBLANK)
334 ctn->ctype |= _ISSPACE;
335 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
336 ctn->ctype |= _ISGRAPH;
337 if (ctn->ctype & _ISGRAPH)
338 ctn->ctype |= _ISPRINT;
339
340 /*
341 * Finally, POSIX requires that certain combinations
342 * are invalid. We don't flag this as a fatal error,
343 * but we will warn about.
344 */
345 if ((ctn->ctype & _ISALPHA) &&
346 (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
347 conflict++;
348 if ((ctn->ctype & _ISPUNCT) &
349 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
350 conflict++;
351 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
352 conflict++;
353 if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
354 conflict++;
355 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
356 conflict++;
357
358 if (conflict) {
359 warn("conflicting classes for character 0x%x (%x)",
360 wc, ctn->ctype);
361 }
362 /*
363 * Handle the lower 256 characters using the simple
364 * optimization. Note that if we have not defined the
365 * upper/lower case, then we identity map it.
366 */
367 if ((unsigned)wc < _CACHED_RUNES) {
368 rl.runetype[wc] = ctn->ctype;
369 if (ctn->tolower)
370 rl.maplower[wc] = ctn->tolower;
371 if (ctn->toupper)
372 rl.mapupper[wc] = ctn->toupper;
373 continue;
374 }
375
376 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
377 ct[rl.runetype_ext_nranges-1].max = wc;
378 last_ct = ctn;
379 } else {
380 rl.runetype_ext_nranges++;
381 ct = realloc(ct,
382 sizeof (*ct) * rl.runetype_ext_nranges);
383 ct[rl.runetype_ext_nranges - 1].min = wc;
384 ct[rl.runetype_ext_nranges - 1].max = wc;
385 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
386 last_ct = ctn;
387 }
388 if (ctn->tolower == 0) {
389 last_lo = NULL;
390 } else if ((last_lo != NULL) &&
391 (last_lo->tolower + 1 == ctn->tolower)) {
392 lo[rl.maplower_ext_nranges-1].max = wc;
393 last_lo = ctn;
394 } else {
395 rl.maplower_ext_nranges++;
396 lo = realloc(lo,
397 sizeof (*lo) * rl.maplower_ext_nranges);
398 lo[rl.maplower_ext_nranges - 1].min = wc;
399 lo[rl.maplower_ext_nranges - 1].max = wc;
400 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
401 last_lo = ctn;
402 }
403
404 if (ctn->toupper == 0) {
405 last_up = NULL;
406 } else if ((last_up != NULL) &&
407 (last_up->toupper + 1 == ctn->toupper)) {
408 up[rl.mapupper_ext_nranges-1].max = wc;
409 last_up = ctn;
410 } else {
411 rl.mapupper_ext_nranges++;
412 up = realloc(up,
413 sizeof (*up) * rl.mapupper_ext_nranges);
414 up[rl.mapupper_ext_nranges - 1].min = wc;
415 up[rl.mapupper_ext_nranges - 1].max = wc;
416 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
417 last_up = ctn;
418 }
419 }
420
421 if ((wr_category(&rl, sizeof (rl), f) < 0) ||
422 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
423 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
424 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
425 return;
426 }
427
428 close_category(f);
429 }
430