xref: /titanic_44/usr/src/cmd/localedef/ctype.c (revision 2da1cd3a39e2d3da7f9d15071ea9462919c011ac)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
14  * Copyright 2012 Garrett D'Amore <garrett@damore.org>
15  * Copyright 2013 DEY Storage Systems, Inc.
16  */
17 
18 /*
19  * LC_CTYPE database generation routines for localedef.
20  */
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/types.h>
26 #include <sys/avl.h>
27 #include <wchar.h>
28 #include <ctype.h>
29 #include <wctype.h>
30 #include <unistd.h>
31 #include "_ctype.h"
32 #include "localedef.h"
33 #include "parser.tab.h"
34 #include "runefile.h"
35 
36 static avl_tree_t	ctypes;
37 
38 static wchar_t		last_ctype;
39 
40 typedef struct ctype_node {
41 	wchar_t wc;
42 	int32_t	ctype;
43 	int32_t	toupper;
44 	int32_t	tolower;
45 	avl_node_t avl;
46 } ctype_node_t;
47 
48 typedef struct width_node {
49 	wchar_t start;
50 	wchar_t end;
51 	int8_t width;
52 	avl_node_t avl;
53 } width_node_t;
54 
55 static int
ctype_compare(const void * n1,const void * n2)56 ctype_compare(const void *n1, const void *n2)
57 {
58 	const ctype_node_t *c1 = n1;
59 	const ctype_node_t *c2 = n2;
60 
61 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
62 }
63 
64 void
init_ctype(void)65 init_ctype(void)
66 {
67 	avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
68 	    offsetof(ctype_node_t, avl));
69 }
70 
71 
72 static void
add_ctype_impl(ctype_node_t * ctn)73 add_ctype_impl(ctype_node_t *ctn)
74 {
75 	switch (last_kw) {
76 	case T_ISUPPER:
77 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
78 		break;
79 	case T_ISLOWER:
80 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
81 		break;
82 	case T_ISALPHA:
83 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
84 		break;
85 	case T_ISDIGIT:
86 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
87 		break;
88 	case T_ISSPACE:
89 		ctn->ctype |= _ISSPACE;
90 		break;
91 	case T_ISCNTRL:
92 		ctn->ctype |= _ISCNTRL;
93 		break;
94 	case T_ISGRAPH:
95 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
96 		break;
97 	case T_ISPRINT:
98 		ctn->ctype |= _ISPRINT;
99 		break;
100 	case T_ISPUNCT:
101 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
102 		break;
103 	case T_ISXDIGIT:
104 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
105 		break;
106 	case T_ISBLANK:
107 		ctn->ctype |= (_ISBLANK | _ISSPACE);
108 		break;
109 	case T_ISPHONOGRAM:
110 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
111 		break;
112 	case T_ISIDEOGRAM:
113 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
114 		break;
115 	case T_ISENGLISH:
116 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
117 		break;
118 	case T_ISNUMBER:
119 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
120 		break;
121 	case T_ISSPECIAL:
122 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
123 		break;
124 	case T_ISALNUM:
125 		/*
126 		 * We can't do anything with this.  The character
127 		 * should already be specified as a digit or alpha.
128 		 */
129 		break;
130 	default:
131 		errf(_("not a valid character class"));
132 	}
133 }
134 
135 static ctype_node_t *
get_ctype(wchar_t wc)136 get_ctype(wchar_t wc)
137 {
138 	ctype_node_t	srch;
139 	ctype_node_t	*ctn;
140 	avl_index_t	where;
141 
142 	srch.wc = wc;
143 	if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
144 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
145 			errf(_("out of memory"));
146 			return (NULL);
147 		}
148 		ctn->wc = wc;
149 
150 		avl_insert(&ctypes, ctn, where);
151 	}
152 	return (ctn);
153 }
154 
155 void
add_ctype(int val)156 add_ctype(int val)
157 {
158 	ctype_node_t	*ctn;
159 
160 	if ((ctn = get_ctype(val)) == NULL) {
161 		INTERR;
162 		return;
163 	}
164 	add_ctype_impl(ctn);
165 	last_ctype = ctn->wc;
166 }
167 
168 void
add_ctype_range(int end)169 add_ctype_range(int end)
170 {
171 	ctype_node_t	*ctn;
172 	wchar_t		cur;
173 
174 	if (end < last_ctype) {
175 		errf(_("malformed character range (%u ... %u))"),
176 		    last_ctype, end);
177 		return;
178 	}
179 	for (cur = last_ctype + 1; cur <= end; cur++) {
180 		if ((ctn = get_ctype(cur)) == NULL) {
181 			INTERR;
182 			return;
183 		}
184 		add_ctype_impl(ctn);
185 	}
186 	last_ctype = end;
187 
188 }
189 
190 /*
191  * A word about widths: if the width mask is specified, then libc
192  * unconditionally honors it.  Otherwise, it assumes printable
193  * characters have width 1, and non-printable characters have width
194  * -1 (except for NULL which is special with with 0).  Hence, we have
195  * no need to inject defaults here -- the "default" unset value of 0
196  * indicates that libc should use its own logic in wcwidth as described.
197  */
198 void
add_width(int wc,int width)199 add_width(int wc, int width)
200 {
201 	ctype_node_t	*ctn;
202 
203 	if ((ctn = get_ctype(wc)) == NULL) {
204 		INTERR;
205 		return;
206 	}
207 	ctn->ctype &= ~(_CTYPE_SWM);
208 	switch (width) {
209 	case 0:
210 		ctn->ctype |= _CTYPE_SW0;
211 		break;
212 	case 1:
213 		ctn->ctype |= _CTYPE_SW1;
214 		break;
215 	case 2:
216 		ctn->ctype |= _CTYPE_SW2;
217 		break;
218 	case 3:
219 		ctn->ctype |= _CTYPE_SW3;
220 		break;
221 	}
222 }
223 
/*
 * Record the same width for every character in [start, end].
 * Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
231 
232 void
add_caseconv(int val,int wc)233 add_caseconv(int val, int wc)
234 {
235 	ctype_node_t	*ctn;
236 
237 	ctn = get_ctype(val);
238 	if (ctn == NULL) {
239 		INTERR;
240 		return;
241 	}
242 
243 	switch (last_kw) {
244 	case T_TOUPPER:
245 		ctn->toupper = wc;
246 		break;
247 	case T_TOLOWER:
248 		ctn->tolower = wc;
249 		break;
250 	default:
251 		INTERR;
252 		break;
253 	}
254 }
255 
256 void
dump_ctype(void)257 dump_ctype(void)
258 {
259 	FILE		*f;
260 	_FileRuneLocale	rl;
261 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
262 	_FileRuneEntry	*ct = NULL;
263 	_FileRuneEntry	*lo = NULL;
264 	_FileRuneEntry	*up = NULL;
265 	wchar_t		wc;
266 
267 	(void) memset(&rl, 0, sizeof (rl));
268 	last_ct = NULL;
269 	last_lo = NULL;
270 	last_up = NULL;
271 
272 	if ((f = open_category()) == NULL)
273 		return;
274 
275 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
276 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
277 
278 	/*
279 	 * Initialize the identity map.
280 	 */
281 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
282 		rl.maplower[wc] = wc;
283 		rl.mapupper[wc] = wc;
284 	}
285 
286 	for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
287 		int conflict = 0;
288 
289 
290 		wc = ctn->wc;
291 
292 		/*
293 		 * POSIX requires certain portable characters have
294 		 * certain types.  Add them if they are missing.
295 		 */
296 		if ((wc >= 1) && (wc <= 127)) {
297 			if ((wc >= 'A') && (wc <= 'Z'))
298 				ctn->ctype |= _ISUPPER;
299 			if ((wc >= 'a') && (wc <= 'z'))
300 				ctn->ctype |= _ISLOWER;
301 			if ((wc >= '0') && (wc <= '9'))
302 				ctn->ctype |= _ISDIGIT;
303 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
304 				ctn->ctype |= _ISSPACE;
305 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
306 				ctn->ctype |= _ISXDIGIT;
307 			if (strchr(" \t", (char)wc))
308 				ctn->ctype |= _ISBLANK;
309 
310 			/*
311 			 * Technically these settings are only
312 			 * required for the C locale.  However, it
313 			 * turns out that because of the historical
314 			 * version of isprint(), we need them for all
315 			 * locales as well.  Note that these are not
316 			 * necessarily valid punctation characters in
317 			 * the current language, but ispunct() needs
318 			 * to return TRUE for them.
319 			 */
320 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
321 			    (char)wc))
322 				ctn->ctype |= _ISPUNCT;
323 		}
324 
325 		/*
326 		 * POSIX also requires that certain types imply
327 		 * others.  Add any inferred types here.
328 		 */
329 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
330 			ctn->ctype |= _ISALPHA;
331 		if (ctn->ctype & _ISDIGIT)
332 			ctn->ctype |= _ISXDIGIT;
333 		if (ctn->ctype & _ISBLANK)
334 			ctn->ctype |= _ISSPACE;
335 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
336 			ctn->ctype |= _ISGRAPH;
337 		if (ctn->ctype & _ISGRAPH)
338 			ctn->ctype |= _ISPRINT;
339 
340 		/*
341 		 * Finally, POSIX requires that certain combinations
342 		 * are invalid.  We don't flag this as a fatal error,
343 		 * but we will warn about.
344 		 */
345 		if ((ctn->ctype & _ISALPHA) &&
346 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
347 			conflict++;
348 		if ((ctn->ctype & _ISPUNCT) &
349 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
350 			conflict++;
351 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
352 			conflict++;
353 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
354 			conflict++;
355 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
356 			conflict++;
357 
358 		if (conflict) {
359 			warn("conflicting classes for character 0x%x (%x)",
360 			    wc, ctn->ctype);
361 		}
362 		/*
363 		 * Handle the lower 256 characters using the simple
364 		 * optimization.  Note that if we have not defined the
365 		 * upper/lower case, then we identity map it.
366 		 */
367 		if ((unsigned)wc < _CACHED_RUNES) {
368 			rl.runetype[wc] = ctn->ctype;
369 			if (ctn->tolower)
370 				rl.maplower[wc] = ctn->tolower;
371 			if (ctn->toupper)
372 				rl.mapupper[wc] = ctn->toupper;
373 			continue;
374 		}
375 
376 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
377 			ct[rl.runetype_ext_nranges-1].max = wc;
378 			last_ct = ctn;
379 		} else {
380 			rl.runetype_ext_nranges++;
381 			ct = realloc(ct,
382 			    sizeof (*ct) * rl.runetype_ext_nranges);
383 			ct[rl.runetype_ext_nranges - 1].min = wc;
384 			ct[rl.runetype_ext_nranges - 1].max = wc;
385 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
386 			last_ct = ctn;
387 		}
388 		if (ctn->tolower == 0) {
389 			last_lo = NULL;
390 		} else if ((last_lo != NULL) &&
391 		    (last_lo->tolower + 1 == ctn->tolower)) {
392 			lo[rl.maplower_ext_nranges-1].max = wc;
393 			last_lo = ctn;
394 		} else {
395 			rl.maplower_ext_nranges++;
396 			lo = realloc(lo,
397 			    sizeof (*lo) * rl.maplower_ext_nranges);
398 			lo[rl.maplower_ext_nranges - 1].min = wc;
399 			lo[rl.maplower_ext_nranges - 1].max = wc;
400 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
401 			last_lo = ctn;
402 		}
403 
404 		if (ctn->toupper == 0) {
405 			last_up = NULL;
406 		} else if ((last_up != NULL) &&
407 		    (last_up->toupper + 1 == ctn->toupper)) {
408 			up[rl.mapupper_ext_nranges-1].max = wc;
409 			last_up = ctn;
410 		} else {
411 			rl.mapupper_ext_nranges++;
412 			up = realloc(up,
413 			    sizeof (*up) * rl.mapupper_ext_nranges);
414 			up[rl.mapupper_ext_nranges - 1].min = wc;
415 			up[rl.mapupper_ext_nranges - 1].max = wc;
416 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
417 			last_up = ctn;
418 		}
419 	}
420 
421 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
422 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
423 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
424 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
425 		return;
426 	}
427 
428 	close_category(f);
429 }
430