xref: /freebsd/usr.bin/localedef/ctype.c (revision 5dae51da3da0cc94d17bd67b308fad304ebec7e0)
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/tree.h>
39 
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <stddef.h>
43 #include <string.h>
44 #include <sys/types.h>
45 #include <wchar.h>
46 #include <ctype.h>
47 #include <wctype.h>
48 #include <unistd.h>
49 #include "localedef.h"
50 #include "parser.h"
51 #include "runefile.h"
52 
53 
/*
 * Needed for bootstrapping: supply _CTYPE_N when the headers included
 * above have not already defined it.
 */
#ifndef _CTYPE_N
#define _CTYPE_N       0x00400000L
#endif

/*
 * Short aliases for the _CTYPE_* class bits used by the routines in this
 * file.
 */
#define _ISUPPER	_CTYPE_U
#define _ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C
/*
 * Extra classes, applied by add_ctype_impl() for the T_ISPHONOGRAM (_E1),
 * T_ISIDEOGRAM (_E2), T_ISENGLISH (_E3), T_ISNUMBER (_E4) and
 * T_ISSPECIAL (_E5) keywords.  _E3 is 0, so T_ISENGLISH contributes no
 * class bit of its own.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
75 
/*
 * Character most recently handled by add_ctype() or add_ctype_range();
 * add_ctype_range() starts its range just after this value.
 */
static wchar_t		last_ctype;
/* Forward declaration: the tree code generated below refers to it. */
static int ctype_compare(const void *n1, const void *n2);
78 
/*
 * Per-character record stored in the ctypes tree.  Nodes are created
 * zero-filled by get_ctype(), so every field other than wc starts at 0.
 */
typedef struct ctype_node {
	wchar_t wc;		/* character code; tree key (ctype_compare) */
	int32_t	ctype;		/* accumulated _CTYPE_* class/width bits */
	int32_t	toupper;	/* upper-case target; 0 means not set */
	int32_t	tolower;	/* lower-case target; 0 means not set */
	RB_ENTRY(ctype_node) entry;	/* tree linkage */
} ctype_node_t;
86 
/*
 * File-scope tree of ctype_node records, ordered by ctype_compare(); the
 * RB_GENERATE_STATIC line emits the static tree functions used by the
 * RB_FIND/RB_INSERT/RB_FOREACH calls in this file.
 */
static RB_HEAD(ctypes, ctype_node) ctypes;
RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
89 
90 static int
91 ctype_compare(const void *n1, const void *n2)
92 {
93 	const ctype_node_t *c1 = n1;
94 	const ctype_node_t *c2 = n2;
95 
96 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
97 }
98 
/*
 * Initialize the file-scope ctypes tree head via RB_INIT.
 */
void
init_ctype(void)
{
	RB_INIT(&ctypes);
}
104 
105 
106 static void
107 add_ctype_impl(ctype_node_t *ctn)
108 {
109 	switch (last_kw) {
110 	case T_ISUPPER:
111 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
112 		break;
113 	case T_ISLOWER:
114 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
115 		break;
116 	case T_ISALPHA:
117 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
118 		break;
119 	case T_ISDIGIT:
120 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
121 		break;
122 	case T_ISSPACE:
123 		ctn->ctype |= _ISSPACE;
124 		break;
125 	case T_ISCNTRL:
126 		ctn->ctype |= _ISCNTRL;
127 		break;
128 	case T_ISGRAPH:
129 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
130 		break;
131 	case T_ISPRINT:
132 		ctn->ctype |= _ISPRINT;
133 		break;
134 	case T_ISPUNCT:
135 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
136 		break;
137 	case T_ISXDIGIT:
138 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
139 		break;
140 	case T_ISBLANK:
141 		ctn->ctype |= (_ISBLANK | _ISSPACE);
142 		break;
143 	case T_ISPHONOGRAM:
144 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
145 		break;
146 	case T_ISIDEOGRAM:
147 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
148 		break;
149 	case T_ISENGLISH:
150 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
151 		break;
152 	case T_ISNUMBER:
153 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
154 		break;
155 	case T_ISSPECIAL:
156 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
157 		break;
158 	case T_ISALNUM:
159 		/*
160 		 * We can't do anything with this.  The character
161 		 * should already be specified as a digit or alpha.
162 		 */
163 		break;
164 	default:
165 		errf("not a valid character class");
166 	}
167 }
168 
169 static ctype_node_t *
170 get_ctype(wchar_t wc)
171 {
172 	ctype_node_t	srch;
173 	ctype_node_t	*ctn;
174 
175 	srch.wc = wc;
176 	if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
177 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
178 			errf("out of memory");
179 			return (NULL);
180 		}
181 		ctn->wc = wc;
182 
183 		RB_INSERT(ctypes, &ctypes, ctn);
184 	}
185 	return (ctn);
186 }
187 
188 void
189 add_ctype(int val)
190 {
191 	ctype_node_t	*ctn;
192 
193 	if ((ctn = get_ctype(val)) == NULL) {
194 		INTERR;
195 		return;
196 	}
197 	add_ctype_impl(ctn);
198 	last_ctype = ctn->wc;
199 }
200 
201 void
202 add_ctype_range(wchar_t end)
203 {
204 	ctype_node_t	*ctn;
205 	wchar_t		cur;
206 
207 	if (end < last_ctype) {
208 		errf("malformed character range (%u ... %u))",
209 		    last_ctype, end);
210 		return;
211 	}
212 	for (cur = last_ctype + 1; cur <= end; cur++) {
213 		if ((ctn = get_ctype(cur)) == NULL) {
214 			INTERR;
215 			return;
216 		}
217 		add_ctype_impl(ctn);
218 	}
219 	last_ctype = end;
220 
221 }
222 
223 /*
224  * A word about widths: if the width mask is specified, then libc
225  * unconditionally honors it.  Otherwise, it assumes printable
226  * characters have width 1, and non-printable characters have width
227  * -1 (except for NULL which is special with with 0).  Hence, we have
228  * no need to inject defaults here -- the "default" unset value of 0
229  * indicates that libc should use its own logic in wcwidth as described.
230  */
231 void
232 add_width(int wc, int width)
233 {
234 	ctype_node_t	*ctn;
235 
236 	if ((ctn = get_ctype(wc)) == NULL) {
237 		INTERR;
238 		return;
239 	}
240 	ctn->ctype &= ~(_CTYPE_SWM);
241 	switch (width) {
242 	case 0:
243 		ctn->ctype |= _CTYPE_SW0;
244 		break;
245 	case 1:
246 		ctn->ctype |= _CTYPE_SW1;
247 		break;
248 	case 2:
249 		ctn->ctype |= _CTYPE_SW2;
250 		break;
251 	case 3:
252 		ctn->ctype |= _CTYPE_SW3;
253 		break;
254 	}
255 }
256 
/*
 * Record the same display width for every character from start through
 * end inclusive by calling add_width() on each.  Does nothing when
 * start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
264 
265 void
266 add_caseconv(int val, int wc)
267 {
268 	ctype_node_t	*ctn;
269 
270 	ctn = get_ctype(val);
271 	if (ctn == NULL) {
272 		INTERR;
273 		return;
274 	}
275 
276 	switch (last_kw) {
277 	case T_TOUPPER:
278 		ctn->toupper = wc;
279 		break;
280 	case T_TOLOWER:
281 		ctn->tolower = wc;
282 		break;
283 	default:
284 		INTERR;
285 		break;
286 	}
287 }
288 
289 void
290 dump_ctype(void)
291 {
292 	FILE		*f;
293 	_FileRuneLocale	rl;
294 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
295 	_FileRuneEntry	*ct = NULL;
296 	_FileRuneEntry	*lo = NULL;
297 	_FileRuneEntry	*up = NULL;
298 	wchar_t		wc;
299 
300 	(void) memset(&rl, 0, sizeof (rl));
301 	last_ct = NULL;
302 	last_lo = NULL;
303 	last_up = NULL;
304 
305 	if ((f = open_category()) == NULL)
306 		return;
307 
308 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
309 	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
310 
311 	/*
312 	 * Initialize the identity map.
313 	 */
314 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
315 		rl.maplower[wc] = wc;
316 		rl.mapupper[wc] = wc;
317 	}
318 
319 	RB_FOREACH(ctn, ctypes, &ctypes) {
320 		int conflict = 0;
321 
322 		wc = ctn->wc;
323 
324 		/*
325 		 * POSIX requires certain portable characters have
326 		 * certain types.  Add them if they are missing.
327 		 */
328 		if ((wc >= 1) && (wc <= 127)) {
329 			if ((wc >= 'A') && (wc <= 'Z'))
330 				ctn->ctype |= _ISUPPER;
331 			if ((wc >= 'a') && (wc <= 'z'))
332 				ctn->ctype |= _ISLOWER;
333 			if ((wc >= '0') && (wc <= '9'))
334 				ctn->ctype |= _ISDIGIT;
335 			if (wc == ' ')
336 				ctn->ctype |= _ISPRINT;
337 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
338 				ctn->ctype |= _ISSPACE;
339 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
340 				ctn->ctype |= _ISXDIGIT;
341 			if (strchr(" \t", (char)wc))
342 				ctn->ctype |= _ISBLANK;
343 
344 			/*
345 			 * Technically these settings are only
346 			 * required for the C locale.  However, it
347 			 * turns out that because of the historical
348 			 * version of isprint(), we need them for all
349 			 * locales as well.  Note that these are not
350 			 * necessarily valid punctation characters in
351 			 * the current language, but ispunct() needs
352 			 * to return TRUE for them.
353 			 */
354 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
355 			    (char)wc))
356 				ctn->ctype |= _ISPUNCT;
357 		}
358 
359 		/*
360 		 * POSIX also requires that certain types imply
361 		 * others.  Add any inferred types here.
362 		 */
363 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
364 			ctn->ctype |= _ISALPHA;
365 		if (ctn->ctype & _ISDIGIT)
366 			ctn->ctype |= _ISXDIGIT;
367 		if (ctn->ctype & _ISBLANK)
368 			ctn->ctype |= _ISSPACE;
369 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
370 			ctn->ctype |= _ISGRAPH;
371 		if (ctn->ctype & _ISGRAPH)
372 			ctn->ctype |= _ISPRINT;
373 
374 		/*
375 		 * Finally, POSIX requires that certain combinations
376 		 * are invalid.  We don't flag this as a fatal error,
377 		 * but we will warn about.
378 		 */
379 		if ((ctn->ctype & _ISALPHA) &&
380 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
381 			conflict++;
382 		if ((ctn->ctype & _ISPUNCT) &
383 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
384 			conflict++;
385 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
386 			conflict++;
387 		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
388 			conflict++;
389 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
390 			conflict++;
391 
392 		if (conflict) {
393 			warn("conflicting classes for character 0x%x (%x)",
394 			    wc, ctn->ctype);
395 		}
396 		/*
397 		 * Handle the lower 256 characters using the simple
398 		 * optimization.  Note that if we have not defined the
399 		 * upper/lower case, then we identity map it.
400 		 */
401 		if ((unsigned)wc < _CACHED_RUNES) {
402 			rl.runetype[wc] = ctn->ctype;
403 			if (ctn->tolower)
404 				rl.maplower[wc] = ctn->tolower;
405 			if (ctn->toupper)
406 				rl.mapupper[wc] = ctn->toupper;
407 			continue;
408 		}
409 
410 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
411 		    (last_ct->wc + 1 == wc)) {
412 			ct[rl.runetype_ext_nranges-1].max = wc;
413 		} else {
414 			rl.runetype_ext_nranges++;
415 			ct = realloc(ct,
416 			    sizeof (*ct) * rl.runetype_ext_nranges);
417 			ct[rl.runetype_ext_nranges - 1].min = wc;
418 			ct[rl.runetype_ext_nranges - 1].max = wc;
419 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
420 		}
421 		last_ct = ctn;
422 		if (ctn->tolower == 0) {
423 			last_lo = NULL;
424 		} else if ((last_lo != NULL) &&
425 		    (last_lo->tolower + 1 == ctn->tolower)) {
426 			lo[rl.maplower_ext_nranges-1].max = wc;
427 			last_lo = ctn;
428 		} else {
429 			rl.maplower_ext_nranges++;
430 			lo = realloc(lo,
431 			    sizeof (*lo) * rl.maplower_ext_nranges);
432 			lo[rl.maplower_ext_nranges - 1].min = wc;
433 			lo[rl.maplower_ext_nranges - 1].max = wc;
434 			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
435 			last_lo = ctn;
436 		}
437 
438 		if (ctn->toupper == 0) {
439 			last_up = NULL;
440 		} else if ((last_up != NULL) &&
441 		    (last_up->toupper + 1 == ctn->toupper)) {
442 			up[rl.mapupper_ext_nranges-1].max = wc;
443 			last_up = ctn;
444 		} else {
445 			rl.mapupper_ext_nranges++;
446 			up = realloc(up,
447 			    sizeof (*up) * rl.mapupper_ext_nranges);
448 			up[rl.mapupper_ext_nranges - 1].min = wc;
449 			up[rl.mapupper_ext_nranges - 1].max = wc;
450 			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
451 			last_up = ctn;
452 		}
453 	}
454 
455 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
456 	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
457 	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
458 	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
459 		return;
460 	}
461 
462 	close_category(f);
463 }
464