xref: /freebsd/usr.bin/localedef/ctype.c (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 /*-
2  * Copyright 2018 Nexenta Systems, Inc.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * LC_CTYPE database generation routines for localedef.
34  */
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/tree.h>
39 
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <stddef.h>
43 #include <string.h>
44 #include <sys/types.h>
45 #include <wchar.h>
46 #include <unistd.h>
47 #include "localedef.h"
48 #include "parser.h"
49 
50 /* Always include the defines for the target: */
51 #define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */
52 #include "_ctype.h"
53 #include "runefile.h"
54 
55 
/*
 * Needed for bootstrapping: provide _CTYPE_N ourselves if the headers
 * included above did not define it.
 */
#ifndef _CTYPE_N
#define	_CTYPE_N	0x00400000L
#endif

/*
 * Short local names for the _CTYPE_* class bits used throughout this file.
 */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C

/*
 * Extension classes, applied by add_ctype_impl() for the phonogram,
 * ideogram, english, number and special keywords respectively.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0		/* contributes no bit of its own */
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
77 
78 static wchar_t		last_ctype;
79 static int ctype_compare(const void *n1, const void *n2);
80 
81 typedef struct ctype_node {
82 	wchar_t wc;
83 	int32_t	ctype;
84 	int32_t	toupper;
85 	int32_t	tolower;
86 	RB_ENTRY(ctype_node) entry;
87 } ctype_node_t;
88 
89 static RB_HEAD(ctypes, ctype_node) ctypes;
90 RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
91 
92 static int
93 ctype_compare(const void *n1, const void *n2)
94 {
95 	const ctype_node_t *c1 = n1;
96 	const ctype_node_t *c2 = n2;
97 
98 	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
99 }
100 
101 void
102 init_ctype(void)
103 {
104 	RB_INIT(&ctypes);
105 }
106 
107 
108 static void
109 add_ctype_impl(ctype_node_t *ctn)
110 {
111 	switch (last_kw) {
112 	case T_ISUPPER:
113 		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
114 		break;
115 	case T_ISLOWER:
116 		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
117 		break;
118 	case T_ISALPHA:
119 		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
120 		break;
121 	case T_ISDIGIT:
122 		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
123 		break;
124 	case T_ISSPACE:
125 		/*
126 		 * This can be troublesome as <form-feed>, <newline>,
127 		 * <carriage-return>, <tab>, and <vertical-tab> are defined both
128 		 * as space and cntrl, and POSIX doesn't allow cntrl/print
129 		 * combination.  We will take care of this in dump_ctype().
130 		 */
131 		ctn->ctype |= (_ISSPACE | _ISPRINT);
132 		break;
133 	case T_ISCNTRL:
134 		ctn->ctype |= _ISCNTRL;
135 		break;
136 	case T_ISGRAPH:
137 		ctn->ctype |= (_ISGRAPH | _ISPRINT);
138 		break;
139 	case T_ISPRINT:
140 		ctn->ctype |= _ISPRINT;
141 		break;
142 	case T_ISPUNCT:
143 		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
144 		break;
145 	case T_ISXDIGIT:
146 		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
147 		break;
148 	case T_ISBLANK:
149 		ctn->ctype |= (_ISBLANK | _ISSPACE);
150 		break;
151 	case T_ISPHONOGRAM:
152 		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
153 		break;
154 	case T_ISIDEOGRAM:
155 		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
156 		break;
157 	case T_ISENGLISH:
158 		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
159 		break;
160 	case T_ISNUMBER:
161 		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
162 		break;
163 	case T_ISSPECIAL:
164 		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
165 		break;
166 	case T_ISALNUM:
167 		/*
168 		 * We can't do anything with this.  The character
169 		 * should already be specified as a digit or alpha.
170 		 */
171 		break;
172 	default:
173 		errf("not a valid character class");
174 	}
175 }
176 
177 static ctype_node_t *
178 get_ctype(wchar_t wc)
179 {
180 	ctype_node_t	srch;
181 	ctype_node_t	*ctn;
182 
183 	srch.wc = wc;
184 	if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
185 		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
186 			errf("out of memory");
187 			return (NULL);
188 		}
189 		ctn->wc = wc;
190 
191 		RB_INSERT(ctypes, &ctypes, ctn);
192 	}
193 	return (ctn);
194 }
195 
196 void
197 add_ctype(int val)
198 {
199 	ctype_node_t	*ctn;
200 
201 	if ((ctn = get_ctype(val)) == NULL) {
202 		INTERR;
203 		return;
204 	}
205 	add_ctype_impl(ctn);
206 	last_ctype = ctn->wc;
207 }
208 
209 void
210 add_ctype_range(wchar_t end)
211 {
212 	ctype_node_t	*ctn;
213 	wchar_t		cur;
214 
215 	if (end < last_ctype) {
216 		errf("malformed character range (%u ... %u))",
217 		    last_ctype, end);
218 		return;
219 	}
220 	for (cur = last_ctype + 1; cur <= end; cur++) {
221 		if ((ctn = get_ctype(cur)) == NULL) {
222 			INTERR;
223 			return;
224 		}
225 		add_ctype_impl(ctn);
226 	}
227 	last_ctype = end;
228 
229 }
230 
231 /*
232  * A word about widths: if the width mask is specified, then libc
233  * unconditionally honors it.  Otherwise, it assumes printable
234  * characters have width 1, and non-printable characters have width
235  * -1 (except for NULL which is special with width 0).  Hence, we have
236  * no need to inject defaults here -- the "default" unset value of 0
237  * indicates that libc should use its own logic in wcwidth as described.
238  */
239 void
240 add_width(int wc, int width)
241 {
242 	ctype_node_t	*ctn;
243 
244 	if ((ctn = get_ctype(wc)) == NULL) {
245 		INTERR;
246 		return;
247 	}
248 	ctn->ctype &= ~(_CTYPE_SWM);
249 	switch (width) {
250 	case 0:
251 		ctn->ctype |= _CTYPE_SW0;
252 		break;
253 	case 1:
254 		ctn->ctype |= _CTYPE_SW1;
255 		break;
256 	case 2:
257 		ctn->ctype |= _CTYPE_SW2;
258 		break;
259 	case 3:
260 		ctn->ctype |= _CTYPE_SW3;
261 		break;
262 	}
263 }
264 
/*
 * Record the same width for every character in [start, end].  An empty
 * range (start > end) does nothing.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc;

	for (wc = start; wc <= end; wc++)
		add_width(wc, width);
}
272 
273 void
274 add_caseconv(int val, int wc)
275 {
276 	ctype_node_t	*ctn;
277 
278 	ctn = get_ctype(val);
279 	if (ctn == NULL) {
280 		INTERR;
281 		return;
282 	}
283 
284 	switch (last_kw) {
285 	case T_TOUPPER:
286 		ctn->toupper = wc;
287 		break;
288 	case T_TOLOWER:
289 		ctn->tolower = wc;
290 		break;
291 	default:
292 		INTERR;
293 		break;
294 	}
295 }
296 
297 void
298 dump_ctype(void)
299 {
300 	FILE		*f;
301 	_FileRuneLocale	rl;
302 	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
303 	_FileRuneEntry	*ct = NULL;
304 	_FileRuneEntry	*lo = NULL;
305 	_FileRuneEntry	*up = NULL;
306 	wchar_t		wc;
307 	uint32_t	runetype_ext_nranges;
308 	uint32_t	maplower_ext_nranges;
309 	uint32_t	mapupper_ext_nranges;
310 
311 	(void) memset(&rl, 0, sizeof (rl));
312 	runetype_ext_nranges = 0;
313 	last_ct = NULL;
314 	maplower_ext_nranges = 0;
315 	last_lo = NULL;
316 	mapupper_ext_nranges = 0;
317 	last_up = NULL;
318 
319 	if ((f = open_category()) == NULL)
320 		return;
321 
322 	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
323 	(void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
324 
325 	/*
326 	 * Initialize the identity map.
327 	 */
328 	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
329 		rl.maplower[wc] = htote(wc);
330 		rl.mapupper[wc] = htote(wc);
331 	}
332 
333 	RB_FOREACH(ctn, ctypes, &ctypes) {
334 		int conflict = 0;
335 
336 		wc = ctn->wc;
337 
338 		/*
339 		 * POSIX requires certain portable characters have
340 		 * certain types.  Add them if they are missing.
341 		 */
342 		if ((wc >= 1) && (wc <= 127)) {
343 			if ((wc >= 'A') && (wc <= 'Z'))
344 				ctn->ctype |= _ISUPPER;
345 			if ((wc >= 'a') && (wc <= 'z'))
346 				ctn->ctype |= _ISLOWER;
347 			if ((wc >= '0') && (wc <= '9'))
348 				ctn->ctype |= _ISDIGIT;
349 			if (wc == ' ')
350 				ctn->ctype |= _ISPRINT;
351 			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
352 				ctn->ctype |= _ISSPACE;
353 			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
354 				ctn->ctype |= _ISXDIGIT;
355 			if (strchr(" \t", (char)wc))
356 				ctn->ctype |= _ISBLANK;
357 
358 			/*
359 			 * Technically these settings are only
360 			 * required for the C locale.  However, it
361 			 * turns out that because of the historical
362 			 * version of isprint(), we need them for all
363 			 * locales as well.  Note that these are not
364 			 * necessarily valid punctation characters in
365 			 * the current language, but ispunct() needs
366 			 * to return TRUE for them.
367 			 */
368 			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
369 			    (char)wc))
370 				ctn->ctype |= _ISPUNCT;
371 		}
372 
373 		/*
374 		 * POSIX also requires that certain types imply
375 		 * others.  Add any inferred types here.
376 		 */
377 		if (ctn->ctype & (_ISUPPER |_ISLOWER))
378 			ctn->ctype |= _ISALPHA;
379 		if (ctn->ctype & _ISDIGIT)
380 			ctn->ctype |= _ISXDIGIT;
381 		if (ctn->ctype & _ISBLANK)
382 			ctn->ctype |= _ISSPACE;
383 		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
384 			ctn->ctype |= _ISGRAPH;
385 		if (ctn->ctype & _ISGRAPH)
386 			ctn->ctype |= _ISPRINT;
387 
388 		/*
389 		 * POSIX requires that certain combinations are invalid.
390 		 * Try fixing the cases we know about (see add_ctype_impl()).
391 		 */
392 		if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL))
393 			ctn->ctype &= ~_ISPRINT;
394 
395 		/*
396 		 * Finally, don't flag remaining cases as a fatal error,
397 		 * and just warn about them.
398 		 */
399 		if ((ctn->ctype & _ISALPHA) &&
400 		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
401 			conflict++;
402 		if ((ctn->ctype & _ISPUNCT) &&
403 		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
404 			conflict++;
405 		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
406 			conflict++;
407 		if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
408 			conflict++;
409 		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
410 			conflict++;
411 
412 		if (conflict) {
413 			warn("conflicting classes for character 0x%x (%x)",
414 			    wc, ctn->ctype);
415 		}
416 		/*
417 		 * Handle the lower 256 characters using the simple
418 		 * optimization.  Note that if we have not defined the
419 		 * upper/lower case, then we identity map it.
420 		 */
421 		if ((unsigned)wc < _CACHED_RUNES) {
422 			rl.runetype[wc] = htote(ctn->ctype);
423 			if (ctn->tolower)
424 				rl.maplower[wc] = htote(ctn->tolower);
425 			if (ctn->toupper)
426 				rl.mapupper[wc] = htote(ctn->toupper);
427 			continue;
428 		}
429 
430 		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
431 		    (last_ct->wc + 1 == wc)) {
432 			ct[runetype_ext_nranges - 1].max = htote(wc);
433 		} else {
434 			runetype_ext_nranges++;
435 			ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges);
436 			ct[runetype_ext_nranges - 1].min = htote(wc);
437 			ct[runetype_ext_nranges - 1].max = htote(wc);
438 			ct[runetype_ext_nranges - 1].map =
439 			    htote(ctn->ctype);
440 		}
441 		last_ct = ctn;
442 		if (ctn->tolower == 0) {
443 			last_lo = NULL;
444 		} else if ((last_lo != NULL) &&
445 		    (last_lo->tolower + 1 == ctn->tolower)) {
446 			lo[maplower_ext_nranges - 1].max = htote(wc);
447 			last_lo = ctn;
448 		} else {
449 			maplower_ext_nranges++;
450 			lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges);
451 			lo[maplower_ext_nranges - 1].min = htote(wc);
452 			lo[maplower_ext_nranges - 1].max = htote(wc);
453 			lo[maplower_ext_nranges - 1].map =
454 			    htote(ctn->tolower);
455 			last_lo = ctn;
456 		}
457 
458 		if (ctn->toupper == 0) {
459 			last_up = NULL;
460 		} else if ((last_up != NULL) &&
461 		    (last_up->toupper + 1 == ctn->toupper)) {
462 			up[mapupper_ext_nranges-1].max = htote(wc);
463 			last_up = ctn;
464 		} else {
465 			mapupper_ext_nranges++;
466 			up = realloc(up, sizeof (*up) * mapupper_ext_nranges);
467 			up[mapupper_ext_nranges - 1].min = htote(wc);
468 			up[mapupper_ext_nranges - 1].max = htote(wc);
469 			up[mapupper_ext_nranges - 1].map =
470 			    htote(ctn->toupper);
471 			last_up = ctn;
472 		}
473 	}
474 
475 	rl.runetype_ext_nranges = htote(runetype_ext_nranges);
476 	rl.maplower_ext_nranges = htote(maplower_ext_nranges);
477 	rl.mapupper_ext_nranges = htote(mapupper_ext_nranges);
478 	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
479 	    (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) ||
480 	    (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) ||
481 	    (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) {
482 		return;
483 	}
484 
485 	close_category(f);
486 }
487