1e12a957fSPedro F. Giffuni /*-
24644f9beSYuri Pankov * Copyright 2018 Nexenta Systems, Inc.
3057ca2d4SBaptiste Daroussin * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4057ca2d4SBaptiste Daroussin * Copyright 2015 John Marino <draco@marino.st>
5057ca2d4SBaptiste Daroussin *
6057ca2d4SBaptiste Daroussin * This source code is derived from the illumos localedef command, and
7057ca2d4SBaptiste Daroussin * provided under BSD-style license terms by Nexenta Systems, Inc.
8057ca2d4SBaptiste Daroussin *
9057ca2d4SBaptiste Daroussin * Redistribution and use in source and binary forms, with or without
10057ca2d4SBaptiste Daroussin * modification, are permitted provided that the following conditions
11057ca2d4SBaptiste Daroussin * are met:
12057ca2d4SBaptiste Daroussin *
13057ca2d4SBaptiste Daroussin * 1. Redistributions of source code must retain the above copyright
14057ca2d4SBaptiste Daroussin * notice, this list of conditions and the following disclaimer.
15057ca2d4SBaptiste Daroussin * 2. Redistributions in binary form must reproduce the above copyright
16057ca2d4SBaptiste Daroussin * notice, this list of conditions and the following disclaimer in the
17057ca2d4SBaptiste Daroussin * documentation and/or other materials provided with the distribution.
18057ca2d4SBaptiste Daroussin *
19057ca2d4SBaptiste Daroussin * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20057ca2d4SBaptiste Daroussin * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21057ca2d4SBaptiste Daroussin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22057ca2d4SBaptiste Daroussin * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23057ca2d4SBaptiste Daroussin * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24057ca2d4SBaptiste Daroussin * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25057ca2d4SBaptiste Daroussin * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26057ca2d4SBaptiste Daroussin * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27057ca2d4SBaptiste Daroussin * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28057ca2d4SBaptiste Daroussin * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29057ca2d4SBaptiste Daroussin * POSSIBILITY OF SUCH DAMAGE.
30057ca2d4SBaptiste Daroussin */
31057ca2d4SBaptiste Daroussin
32057ca2d4SBaptiste Daroussin /*
33057ca2d4SBaptiste Daroussin * LC_CTYPE database generation routines for localedef.
34057ca2d4SBaptiste Daroussin */
35057ca2d4SBaptiste Daroussin #include <sys/cdefs.h>
366131dc6aSBaptiste Daroussin #include <sys/tree.h>
37057ca2d4SBaptiste Daroussin
38057ca2d4SBaptiste Daroussin #include <stdio.h>
39057ca2d4SBaptiste Daroussin #include <stdlib.h>
40057ca2d4SBaptiste Daroussin #include <stddef.h>
41057ca2d4SBaptiste Daroussin #include <string.h>
42057ca2d4SBaptiste Daroussin #include <sys/types.h>
43057ca2d4SBaptiste Daroussin #include <wchar.h>
44057ca2d4SBaptiste Daroussin #include <unistd.h>
45057ca2d4SBaptiste Daroussin #include "localedef.h"
46057ca2d4SBaptiste Daroussin #include "parser.h"
4700c61a3bSAlex Richardson
4800c61a3bSAlex Richardson /* Always include the defines for the target: */
4900c61a3bSAlex Richardson #define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */
5000c61a3bSAlex Richardson #include "_ctype.h"
51057ca2d4SBaptiste Daroussin #include "runefile.h"
52057ca2d4SBaptiste Daroussin
53057ca2d4SBaptiste Daroussin
/*
 * Needed for bootstrapping: supply _CTYPE_N locally when the included
 * _ctype.h did not already define it.
 */
#ifndef _CTYPE_N
#define	_CTYPE_N	0x00400000L
#endif

/* Short local aliases for the _CTYPE_* character class bits. */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C

/*
 * Extension classes.  add_ctype_impl() uses them for, in order, the
 * phonogram, ideogram, english, number and special keywords.  _E3 is 0,
 * so the english class contributes no bit of its own.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		_CTYPE_N
#define	_E5		_CTYPE_T
75057ca2d4SBaptiste Daroussin
76057ca2d4SBaptiste Daroussin static wchar_t last_ctype;
776131dc6aSBaptiste Daroussin static int ctype_compare(const void *n1, const void *n2);
78057ca2d4SBaptiste Daroussin
79057ca2d4SBaptiste Daroussin typedef struct ctype_node {
80057ca2d4SBaptiste Daroussin wchar_t wc;
81057ca2d4SBaptiste Daroussin int32_t ctype;
82057ca2d4SBaptiste Daroussin int32_t toupper;
83057ca2d4SBaptiste Daroussin int32_t tolower;
846131dc6aSBaptiste Daroussin RB_ENTRY(ctype_node) entry;
85057ca2d4SBaptiste Daroussin } ctype_node_t;
86057ca2d4SBaptiste Daroussin
873a444dbdSBaptiste Daroussin static RB_HEAD(ctypes, ctype_node) ctypes;
88e30c75b1SBaptiste Daroussin RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
89057ca2d4SBaptiste Daroussin
90057ca2d4SBaptiste Daroussin static int
ctype_compare(const void * n1,const void * n2)91057ca2d4SBaptiste Daroussin ctype_compare(const void *n1, const void *n2)
92057ca2d4SBaptiste Daroussin {
93057ca2d4SBaptiste Daroussin const ctype_node_t *c1 = n1;
94057ca2d4SBaptiste Daroussin const ctype_node_t *c2 = n2;
95057ca2d4SBaptiste Daroussin
96*770fba24SMark Johnston return (wchar_cmp(c1->wc, c2->wc));
97057ca2d4SBaptiste Daroussin }
98057ca2d4SBaptiste Daroussin
99057ca2d4SBaptiste Daroussin void
init_ctype(void)100057ca2d4SBaptiste Daroussin init_ctype(void)
101057ca2d4SBaptiste Daroussin {
1026131dc6aSBaptiste Daroussin RB_INIT(&ctypes);
103057ca2d4SBaptiste Daroussin }
104057ca2d4SBaptiste Daroussin
105057ca2d4SBaptiste Daroussin
106057ca2d4SBaptiste Daroussin static void
add_ctype_impl(ctype_node_t * ctn)107057ca2d4SBaptiste Daroussin add_ctype_impl(ctype_node_t *ctn)
108057ca2d4SBaptiste Daroussin {
109057ca2d4SBaptiste Daroussin switch (last_kw) {
110057ca2d4SBaptiste Daroussin case T_ISUPPER:
111057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
112057ca2d4SBaptiste Daroussin break;
113057ca2d4SBaptiste Daroussin case T_ISLOWER:
114057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
115057ca2d4SBaptiste Daroussin break;
116057ca2d4SBaptiste Daroussin case T_ISALPHA:
117057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
118057ca2d4SBaptiste Daroussin break;
119057ca2d4SBaptiste Daroussin case T_ISDIGIT:
120227d35daSBaptiste Daroussin ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4);
121057ca2d4SBaptiste Daroussin break;
122057ca2d4SBaptiste Daroussin case T_ISSPACE:
1232d1cfed1SYuri Pankov /*
1242d1cfed1SYuri Pankov * This can be troublesome as <form-feed>, <newline>,
1252d1cfed1SYuri Pankov * <carriage-return>, <tab>, and <vertical-tab> are defined both
1262d1cfed1SYuri Pankov * as space and cntrl, and POSIX doesn't allow cntrl/print
1272d1cfed1SYuri Pankov * combination. We will take care of this in dump_ctype().
1282d1cfed1SYuri Pankov */
1292d1cfed1SYuri Pankov ctn->ctype |= (_ISSPACE | _ISPRINT);
130057ca2d4SBaptiste Daroussin break;
131057ca2d4SBaptiste Daroussin case T_ISCNTRL:
132057ca2d4SBaptiste Daroussin ctn->ctype |= _ISCNTRL;
133057ca2d4SBaptiste Daroussin break;
134057ca2d4SBaptiste Daroussin case T_ISGRAPH:
135057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISGRAPH | _ISPRINT);
136057ca2d4SBaptiste Daroussin break;
137057ca2d4SBaptiste Daroussin case T_ISPRINT:
138057ca2d4SBaptiste Daroussin ctn->ctype |= _ISPRINT;
139057ca2d4SBaptiste Daroussin break;
140057ca2d4SBaptiste Daroussin case T_ISPUNCT:
141057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
142057ca2d4SBaptiste Daroussin break;
143057ca2d4SBaptiste Daroussin case T_ISXDIGIT:
14471e8badeSBaptiste Daroussin ctn->ctype |= (_ISXDIGIT | _ISPRINT);
145057ca2d4SBaptiste Daroussin break;
146057ca2d4SBaptiste Daroussin case T_ISBLANK:
147057ca2d4SBaptiste Daroussin ctn->ctype |= (_ISBLANK | _ISSPACE);
148057ca2d4SBaptiste Daroussin break;
149057ca2d4SBaptiste Daroussin case T_ISPHONOGRAM:
150057ca2d4SBaptiste Daroussin ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
151057ca2d4SBaptiste Daroussin break;
152057ca2d4SBaptiste Daroussin case T_ISIDEOGRAM:
153057ca2d4SBaptiste Daroussin ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
154057ca2d4SBaptiste Daroussin break;
155057ca2d4SBaptiste Daroussin case T_ISENGLISH:
156057ca2d4SBaptiste Daroussin ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
157057ca2d4SBaptiste Daroussin break;
158057ca2d4SBaptiste Daroussin case T_ISNUMBER:
159057ca2d4SBaptiste Daroussin ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
160057ca2d4SBaptiste Daroussin break;
161057ca2d4SBaptiste Daroussin case T_ISSPECIAL:
162057ca2d4SBaptiste Daroussin ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
163057ca2d4SBaptiste Daroussin break;
164057ca2d4SBaptiste Daroussin case T_ISALNUM:
165057ca2d4SBaptiste Daroussin /*
166057ca2d4SBaptiste Daroussin * We can't do anything with this. The character
167057ca2d4SBaptiste Daroussin * should already be specified as a digit or alpha.
168057ca2d4SBaptiste Daroussin */
169057ca2d4SBaptiste Daroussin break;
170057ca2d4SBaptiste Daroussin default:
171057ca2d4SBaptiste Daroussin errf("not a valid character class");
172057ca2d4SBaptiste Daroussin }
173057ca2d4SBaptiste Daroussin }
174057ca2d4SBaptiste Daroussin
175057ca2d4SBaptiste Daroussin static ctype_node_t *
get_ctype(wchar_t wc)176057ca2d4SBaptiste Daroussin get_ctype(wchar_t wc)
177057ca2d4SBaptiste Daroussin {
178057ca2d4SBaptiste Daroussin ctype_node_t srch;
179057ca2d4SBaptiste Daroussin ctype_node_t *ctn;
180057ca2d4SBaptiste Daroussin
181057ca2d4SBaptiste Daroussin srch.wc = wc;
1826131dc6aSBaptiste Daroussin if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
183057ca2d4SBaptiste Daroussin if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
184057ca2d4SBaptiste Daroussin errf("out of memory");
185057ca2d4SBaptiste Daroussin return (NULL);
186057ca2d4SBaptiste Daroussin }
187057ca2d4SBaptiste Daroussin ctn->wc = wc;
188057ca2d4SBaptiste Daroussin
1896131dc6aSBaptiste Daroussin RB_INSERT(ctypes, &ctypes, ctn);
190057ca2d4SBaptiste Daroussin }
191057ca2d4SBaptiste Daroussin return (ctn);
192057ca2d4SBaptiste Daroussin }
193057ca2d4SBaptiste Daroussin
194057ca2d4SBaptiste Daroussin void
add_ctype(int val)195057ca2d4SBaptiste Daroussin add_ctype(int val)
196057ca2d4SBaptiste Daroussin {
197057ca2d4SBaptiste Daroussin ctype_node_t *ctn;
198057ca2d4SBaptiste Daroussin
199057ca2d4SBaptiste Daroussin if ((ctn = get_ctype(val)) == NULL) {
200057ca2d4SBaptiste Daroussin INTERR;
201057ca2d4SBaptiste Daroussin return;
202057ca2d4SBaptiste Daroussin }
203057ca2d4SBaptiste Daroussin add_ctype_impl(ctn);
204057ca2d4SBaptiste Daroussin last_ctype = ctn->wc;
205057ca2d4SBaptiste Daroussin }
206057ca2d4SBaptiste Daroussin
207057ca2d4SBaptiste Daroussin void
add_ctype_range(wchar_t end)2088c859b07SBaptiste Daroussin add_ctype_range(wchar_t end)
209057ca2d4SBaptiste Daroussin {
210057ca2d4SBaptiste Daroussin ctype_node_t *ctn;
211057ca2d4SBaptiste Daroussin wchar_t cur;
212057ca2d4SBaptiste Daroussin
213057ca2d4SBaptiste Daroussin if (end < last_ctype) {
214057ca2d4SBaptiste Daroussin errf("malformed character range (%u ... %u))",
215057ca2d4SBaptiste Daroussin last_ctype, end);
216057ca2d4SBaptiste Daroussin return;
217057ca2d4SBaptiste Daroussin }
218057ca2d4SBaptiste Daroussin for (cur = last_ctype + 1; cur <= end; cur++) {
219057ca2d4SBaptiste Daroussin if ((ctn = get_ctype(cur)) == NULL) {
220057ca2d4SBaptiste Daroussin INTERR;
221057ca2d4SBaptiste Daroussin return;
222057ca2d4SBaptiste Daroussin }
223057ca2d4SBaptiste Daroussin add_ctype_impl(ctn);
224057ca2d4SBaptiste Daroussin }
225057ca2d4SBaptiste Daroussin last_ctype = end;
226057ca2d4SBaptiste Daroussin
227057ca2d4SBaptiste Daroussin }
228057ca2d4SBaptiste Daroussin
229057ca2d4SBaptiste Daroussin /*
230057ca2d4SBaptiste Daroussin * A word about widths: if the width mask is specified, then libc
231057ca2d4SBaptiste Daroussin * unconditionally honors it. Otherwise, it assumes printable
232057ca2d4SBaptiste Daroussin * characters have width 1, and non-printable characters have width
233dae3a64fSEitan Adler * -1 (except for NULL which is special with width 0). Hence, we have
234057ca2d4SBaptiste Daroussin * no need to inject defaults here -- the "default" unset value of 0
235057ca2d4SBaptiste Daroussin * indicates that libc should use its own logic in wcwidth as described.
236057ca2d4SBaptiste Daroussin */
237057ca2d4SBaptiste Daroussin void
add_width(int wc,int width)238057ca2d4SBaptiste Daroussin add_width(int wc, int width)
239057ca2d4SBaptiste Daroussin {
240057ca2d4SBaptiste Daroussin ctype_node_t *ctn;
241057ca2d4SBaptiste Daroussin
242057ca2d4SBaptiste Daroussin if ((ctn = get_ctype(wc)) == NULL) {
243057ca2d4SBaptiste Daroussin INTERR;
244057ca2d4SBaptiste Daroussin return;
245057ca2d4SBaptiste Daroussin }
246057ca2d4SBaptiste Daroussin ctn->ctype &= ~(_CTYPE_SWM);
247057ca2d4SBaptiste Daroussin switch (width) {
248057ca2d4SBaptiste Daroussin case 0:
249057ca2d4SBaptiste Daroussin ctn->ctype |= _CTYPE_SW0;
250057ca2d4SBaptiste Daroussin break;
251057ca2d4SBaptiste Daroussin case 1:
252057ca2d4SBaptiste Daroussin ctn->ctype |= _CTYPE_SW1;
253057ca2d4SBaptiste Daroussin break;
254057ca2d4SBaptiste Daroussin case 2:
255057ca2d4SBaptiste Daroussin ctn->ctype |= _CTYPE_SW2;
256057ca2d4SBaptiste Daroussin break;
257057ca2d4SBaptiste Daroussin case 3:
258057ca2d4SBaptiste Daroussin ctn->ctype |= _CTYPE_SW3;
259057ca2d4SBaptiste Daroussin break;
260057ca2d4SBaptiste Daroussin }
261057ca2d4SBaptiste Daroussin }
262057ca2d4SBaptiste Daroussin
/*
 * Record the same width for every character from start through end
 * inclusive.  Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
270057ca2d4SBaptiste Daroussin
271057ca2d4SBaptiste Daroussin void
add_caseconv(int val,int wc)272057ca2d4SBaptiste Daroussin add_caseconv(int val, int wc)
273057ca2d4SBaptiste Daroussin {
274057ca2d4SBaptiste Daroussin ctype_node_t *ctn;
275057ca2d4SBaptiste Daroussin
276057ca2d4SBaptiste Daroussin ctn = get_ctype(val);
277057ca2d4SBaptiste Daroussin if (ctn == NULL) {
278057ca2d4SBaptiste Daroussin INTERR;
279057ca2d4SBaptiste Daroussin return;
280057ca2d4SBaptiste Daroussin }
281057ca2d4SBaptiste Daroussin
282057ca2d4SBaptiste Daroussin switch (last_kw) {
283057ca2d4SBaptiste Daroussin case T_TOUPPER:
284057ca2d4SBaptiste Daroussin ctn->toupper = wc;
285057ca2d4SBaptiste Daroussin break;
286057ca2d4SBaptiste Daroussin case T_TOLOWER:
287057ca2d4SBaptiste Daroussin ctn->tolower = wc;
288057ca2d4SBaptiste Daroussin break;
289057ca2d4SBaptiste Daroussin default:
290057ca2d4SBaptiste Daroussin INTERR;
291057ca2d4SBaptiste Daroussin break;
292057ca2d4SBaptiste Daroussin }
293057ca2d4SBaptiste Daroussin }
294057ca2d4SBaptiste Daroussin
295057ca2d4SBaptiste Daroussin void
dump_ctype(void)296057ca2d4SBaptiste Daroussin dump_ctype(void)
297057ca2d4SBaptiste Daroussin {
298057ca2d4SBaptiste Daroussin FILE *f;
299057ca2d4SBaptiste Daroussin _FileRuneLocale rl;
300057ca2d4SBaptiste Daroussin ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
301057ca2d4SBaptiste Daroussin _FileRuneEntry *ct = NULL;
302057ca2d4SBaptiste Daroussin _FileRuneEntry *lo = NULL;
303057ca2d4SBaptiste Daroussin _FileRuneEntry *up = NULL;
304057ca2d4SBaptiste Daroussin wchar_t wc;
3054644f9beSYuri Pankov uint32_t runetype_ext_nranges;
3064644f9beSYuri Pankov uint32_t maplower_ext_nranges;
3074644f9beSYuri Pankov uint32_t mapupper_ext_nranges;
308057ca2d4SBaptiste Daroussin
309057ca2d4SBaptiste Daroussin (void) memset(&rl, 0, sizeof (rl));
3104644f9beSYuri Pankov runetype_ext_nranges = 0;
311057ca2d4SBaptiste Daroussin last_ct = NULL;
3124644f9beSYuri Pankov maplower_ext_nranges = 0;
313057ca2d4SBaptiste Daroussin last_lo = NULL;
3144644f9beSYuri Pankov mapupper_ext_nranges = 0;
315057ca2d4SBaptiste Daroussin last_up = NULL;
316057ca2d4SBaptiste Daroussin
317057ca2d4SBaptiste Daroussin if ((f = open_category()) == NULL)
318057ca2d4SBaptiste Daroussin return;
319057ca2d4SBaptiste Daroussin
320057ca2d4SBaptiste Daroussin (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
3211bb0ddf9SPedro F. Giffuni (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
322057ca2d4SBaptiste Daroussin
323057ca2d4SBaptiste Daroussin /*
324057ca2d4SBaptiste Daroussin * Initialize the identity map.
325057ca2d4SBaptiste Daroussin */
326057ca2d4SBaptiste Daroussin for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
3274644f9beSYuri Pankov rl.maplower[wc] = htote(wc);
3284644f9beSYuri Pankov rl.mapupper[wc] = htote(wc);
329057ca2d4SBaptiste Daroussin }
330057ca2d4SBaptiste Daroussin
3316131dc6aSBaptiste Daroussin RB_FOREACH(ctn, ctypes, &ctypes) {
332057ca2d4SBaptiste Daroussin int conflict = 0;
333057ca2d4SBaptiste Daroussin
334057ca2d4SBaptiste Daroussin wc = ctn->wc;
335057ca2d4SBaptiste Daroussin
336057ca2d4SBaptiste Daroussin /*
337057ca2d4SBaptiste Daroussin * POSIX requires certain portable characters have
338057ca2d4SBaptiste Daroussin * certain types. Add them if they are missing.
339057ca2d4SBaptiste Daroussin */
340057ca2d4SBaptiste Daroussin if ((wc >= 1) && (wc <= 127)) {
341057ca2d4SBaptiste Daroussin if ((wc >= 'A') && (wc <= 'Z'))
342057ca2d4SBaptiste Daroussin ctn->ctype |= _ISUPPER;
343057ca2d4SBaptiste Daroussin if ((wc >= 'a') && (wc <= 'z'))
344057ca2d4SBaptiste Daroussin ctn->ctype |= _ISLOWER;
345057ca2d4SBaptiste Daroussin if ((wc >= '0') && (wc <= '9'))
346057ca2d4SBaptiste Daroussin ctn->ctype |= _ISDIGIT;
347046c3cdaSPedro F. Giffuni if (wc == ' ')
348046c3cdaSPedro F. Giffuni ctn->ctype |= _ISPRINT;
349057ca2d4SBaptiste Daroussin if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
350057ca2d4SBaptiste Daroussin ctn->ctype |= _ISSPACE;
351057ca2d4SBaptiste Daroussin if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
352057ca2d4SBaptiste Daroussin ctn->ctype |= _ISXDIGIT;
353057ca2d4SBaptiste Daroussin if (strchr(" \t", (char)wc))
354057ca2d4SBaptiste Daroussin ctn->ctype |= _ISBLANK;
355057ca2d4SBaptiste Daroussin
356057ca2d4SBaptiste Daroussin /*
357057ca2d4SBaptiste Daroussin * Technically these settings are only
358057ca2d4SBaptiste Daroussin * required for the C locale. However, it
359057ca2d4SBaptiste Daroussin * turns out that because of the historical
360057ca2d4SBaptiste Daroussin * version of isprint(), we need them for all
361057ca2d4SBaptiste Daroussin * locales as well. Note that these are not
362057ca2d4SBaptiste Daroussin * necessarily valid punctation characters in
363057ca2d4SBaptiste Daroussin * the current language, but ispunct() needs
364057ca2d4SBaptiste Daroussin * to return TRUE for them.
365057ca2d4SBaptiste Daroussin */
366057ca2d4SBaptiste Daroussin if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
367057ca2d4SBaptiste Daroussin (char)wc))
368057ca2d4SBaptiste Daroussin ctn->ctype |= _ISPUNCT;
369057ca2d4SBaptiste Daroussin }
370057ca2d4SBaptiste Daroussin
371057ca2d4SBaptiste Daroussin /*
372057ca2d4SBaptiste Daroussin * POSIX also requires that certain types imply
373057ca2d4SBaptiste Daroussin * others. Add any inferred types here.
374057ca2d4SBaptiste Daroussin */
375057ca2d4SBaptiste Daroussin if (ctn->ctype & (_ISUPPER |_ISLOWER))
376057ca2d4SBaptiste Daroussin ctn->ctype |= _ISALPHA;
377057ca2d4SBaptiste Daroussin if (ctn->ctype & _ISDIGIT)
378057ca2d4SBaptiste Daroussin ctn->ctype |= _ISXDIGIT;
379057ca2d4SBaptiste Daroussin if (ctn->ctype & _ISBLANK)
380057ca2d4SBaptiste Daroussin ctn->ctype |= _ISSPACE;
381057ca2d4SBaptiste Daroussin if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
382057ca2d4SBaptiste Daroussin ctn->ctype |= _ISGRAPH;
383057ca2d4SBaptiste Daroussin if (ctn->ctype & _ISGRAPH)
384057ca2d4SBaptiste Daroussin ctn->ctype |= _ISPRINT;
385057ca2d4SBaptiste Daroussin
386057ca2d4SBaptiste Daroussin /*
3872d1cfed1SYuri Pankov * POSIX requires that certain combinations are invalid.
3882d1cfed1SYuri Pankov * Try fixing the cases we know about (see add_ctype_impl()).
3892d1cfed1SYuri Pankov */
3902d1cfed1SYuri Pankov if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL))
3912d1cfed1SYuri Pankov ctn->ctype &= ~_ISPRINT;
3922d1cfed1SYuri Pankov
3932d1cfed1SYuri Pankov /*
3942d1cfed1SYuri Pankov * Finally, don't flag remaining cases as a fatal error,
3952d1cfed1SYuri Pankov * and just warn about them.
396057ca2d4SBaptiste Daroussin */
397057ca2d4SBaptiste Daroussin if ((ctn->ctype & _ISALPHA) &&
398057ca2d4SBaptiste Daroussin (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
399057ca2d4SBaptiste Daroussin conflict++;
4001bb0ddf9SPedro F. Giffuni if ((ctn->ctype & _ISPUNCT) &&
401057ca2d4SBaptiste Daroussin (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
402057ca2d4SBaptiste Daroussin conflict++;
403057ca2d4SBaptiste Daroussin if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
404057ca2d4SBaptiste Daroussin conflict++;
40556b1edd6SPedro F. Giffuni if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT))
406057ca2d4SBaptiste Daroussin conflict++;
407057ca2d4SBaptiste Daroussin if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
408057ca2d4SBaptiste Daroussin conflict++;
409057ca2d4SBaptiste Daroussin
410057ca2d4SBaptiste Daroussin if (conflict) {
411057ca2d4SBaptiste Daroussin warn("conflicting classes for character 0x%x (%x)",
412057ca2d4SBaptiste Daroussin wc, ctn->ctype);
413057ca2d4SBaptiste Daroussin }
414057ca2d4SBaptiste Daroussin /*
415057ca2d4SBaptiste Daroussin * Handle the lower 256 characters using the simple
416057ca2d4SBaptiste Daroussin * optimization. Note that if we have not defined the
417057ca2d4SBaptiste Daroussin * upper/lower case, then we identity map it.
418057ca2d4SBaptiste Daroussin */
419057ca2d4SBaptiste Daroussin if ((unsigned)wc < _CACHED_RUNES) {
4204644f9beSYuri Pankov rl.runetype[wc] = htote(ctn->ctype);
421057ca2d4SBaptiste Daroussin if (ctn->tolower)
4224644f9beSYuri Pankov rl.maplower[wc] = htote(ctn->tolower);
423057ca2d4SBaptiste Daroussin if (ctn->toupper)
4244644f9beSYuri Pankov rl.mapupper[wc] = htote(ctn->toupper);
425057ca2d4SBaptiste Daroussin continue;
426057ca2d4SBaptiste Daroussin }
427057ca2d4SBaptiste Daroussin
428c7edf4fdSBaptiste Daroussin if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) &&
429c7edf4fdSBaptiste Daroussin (last_ct->wc + 1 == wc)) {
4304644f9beSYuri Pankov ct[runetype_ext_nranges - 1].max = htote(wc);
431057ca2d4SBaptiste Daroussin } else {
4324644f9beSYuri Pankov runetype_ext_nranges++;
4334644f9beSYuri Pankov ct = realloc(ct, sizeof (*ct) * runetype_ext_nranges);
4344644f9beSYuri Pankov ct[runetype_ext_nranges - 1].min = htote(wc);
4354644f9beSYuri Pankov ct[runetype_ext_nranges - 1].max = htote(wc);
4364644f9beSYuri Pankov ct[runetype_ext_nranges - 1].map =
4374644f9beSYuri Pankov htote(ctn->ctype);
438057ca2d4SBaptiste Daroussin }
439c7edf4fdSBaptiste Daroussin last_ct = ctn;
440057ca2d4SBaptiste Daroussin if (ctn->tolower == 0) {
441057ca2d4SBaptiste Daroussin last_lo = NULL;
442057ca2d4SBaptiste Daroussin } else if ((last_lo != NULL) &&
443057ca2d4SBaptiste Daroussin (last_lo->tolower + 1 == ctn->tolower)) {
4444644f9beSYuri Pankov lo[maplower_ext_nranges - 1].max = htote(wc);
445057ca2d4SBaptiste Daroussin last_lo = ctn;
446057ca2d4SBaptiste Daroussin } else {
4474644f9beSYuri Pankov maplower_ext_nranges++;
4484644f9beSYuri Pankov lo = realloc(lo, sizeof (*lo) * maplower_ext_nranges);
4494644f9beSYuri Pankov lo[maplower_ext_nranges - 1].min = htote(wc);
4504644f9beSYuri Pankov lo[maplower_ext_nranges - 1].max = htote(wc);
4514644f9beSYuri Pankov lo[maplower_ext_nranges - 1].map =
4524644f9beSYuri Pankov htote(ctn->tolower);
453057ca2d4SBaptiste Daroussin last_lo = ctn;
454057ca2d4SBaptiste Daroussin }
455057ca2d4SBaptiste Daroussin
456057ca2d4SBaptiste Daroussin if (ctn->toupper == 0) {
457057ca2d4SBaptiste Daroussin last_up = NULL;
458057ca2d4SBaptiste Daroussin } else if ((last_up != NULL) &&
459057ca2d4SBaptiste Daroussin (last_up->toupper + 1 == ctn->toupper)) {
4604644f9beSYuri Pankov up[mapupper_ext_nranges-1].max = htote(wc);
461057ca2d4SBaptiste Daroussin last_up = ctn;
462057ca2d4SBaptiste Daroussin } else {
4634644f9beSYuri Pankov mapupper_ext_nranges++;
4644644f9beSYuri Pankov up = realloc(up, sizeof (*up) * mapupper_ext_nranges);
4654644f9beSYuri Pankov up[mapupper_ext_nranges - 1].min = htote(wc);
4664644f9beSYuri Pankov up[mapupper_ext_nranges - 1].max = htote(wc);
4674644f9beSYuri Pankov up[mapupper_ext_nranges - 1].map =
4684644f9beSYuri Pankov htote(ctn->toupper);
469057ca2d4SBaptiste Daroussin last_up = ctn;
470057ca2d4SBaptiste Daroussin }
471057ca2d4SBaptiste Daroussin }
472057ca2d4SBaptiste Daroussin
4734644f9beSYuri Pankov rl.runetype_ext_nranges = htote(runetype_ext_nranges);
4744644f9beSYuri Pankov rl.maplower_ext_nranges = htote(maplower_ext_nranges);
4754644f9beSYuri Pankov rl.mapupper_ext_nranges = htote(mapupper_ext_nranges);
476c48c87b7SPedro F. Giffuni if ((wr_category(&rl, sizeof (rl), f) < 0) ||
4774644f9beSYuri Pankov (wr_category(ct, sizeof (*ct) * runetype_ext_nranges, f) < 0) ||
4784644f9beSYuri Pankov (wr_category(lo, sizeof (*lo) * maplower_ext_nranges, f) < 0) ||
4794644f9beSYuri Pankov (wr_category(up, sizeof (*up) * mapupper_ext_nranges, f) < 0)) {
480c48c87b7SPedro F. Giffuni return;
481057ca2d4SBaptiste Daroussin }
482057ca2d4SBaptiste Daroussin
483c48c87b7SPedro F. Giffuni close_category(f);
484c48c87b7SPedro F. Giffuni }
485