xref: /freebsd/sys/contrib/openzfs/module/zfs/u8_textprep.c (revision 8a62a2a5659d1839d8799b4274c04469d7f17c78)
1*8a62a2a5SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2*8a62a2a5SMartin Matuska /*
3*8a62a2a5SMartin Matuska  * CDDL HEADER START
4*8a62a2a5SMartin Matuska  *
5*8a62a2a5SMartin Matuska  * The contents of this file are subject to the terms of the
6*8a62a2a5SMartin Matuska  * Common Development and Distribution License (the "License").
7*8a62a2a5SMartin Matuska  * You may not use this file except in compliance with the License.
8*8a62a2a5SMartin Matuska  *
9*8a62a2a5SMartin Matuska  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*8a62a2a5SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
11*8a62a2a5SMartin Matuska  * See the License for the specific language governing permissions
12*8a62a2a5SMartin Matuska  * and limitations under the License.
13*8a62a2a5SMartin Matuska  *
14*8a62a2a5SMartin Matuska  * When distributing Covered Code, include this CDDL HEADER in each
15*8a62a2a5SMartin Matuska  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*8a62a2a5SMartin Matuska  * If applicable, add the following below this CDDL HEADER, with the
17*8a62a2a5SMartin Matuska  * fields enclosed by brackets "[]" replaced with your own identifying
18*8a62a2a5SMartin Matuska  * information: Portions Copyright [yyyy] [name of copyright owner]
19*8a62a2a5SMartin Matuska  *
20*8a62a2a5SMartin Matuska  * CDDL HEADER END
21*8a62a2a5SMartin Matuska  */
22*8a62a2a5SMartin Matuska /*
23*8a62a2a5SMartin Matuska  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24*8a62a2a5SMartin Matuska  * Use is subject to license terms.
25*8a62a2a5SMartin Matuska  */
26*8a62a2a5SMartin Matuska 
27*8a62a2a5SMartin Matuska /*
28*8a62a2a5SMartin Matuska  * Copyright 2022 MNX Cloud, Inc.
29*8a62a2a5SMartin Matuska  */
30*8a62a2a5SMartin Matuska 
31*8a62a2a5SMartin Matuska 
32*8a62a2a5SMartin Matuska 
33*8a62a2a5SMartin Matuska /*
34*8a62a2a5SMartin Matuska  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
35*8a62a2a5SMartin Matuska  *
36*8a62a2a5SMartin Matuska  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
37*8a62a2a5SMartin Matuska  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
38*8a62a2a5SMartin Matuska  * the section 3C man pages.
39*8a62a2a5SMartin Matuska  * Interface stability: Committed.
40*8a62a2a5SMartin Matuska  */
41*8a62a2a5SMartin Matuska 
42*8a62a2a5SMartin Matuska #include <sys/types.h>
43*8a62a2a5SMartin Matuska #include <sys/string.h>
44*8a62a2a5SMartin Matuska #include <sys/param.h>
45*8a62a2a5SMartin Matuska #include <sys/sysmacros.h>
46*8a62a2a5SMartin Matuska #include <sys/debug.h>
47*8a62a2a5SMartin Matuska #include <sys/kmem.h>
48*8a62a2a5SMartin Matuska #include <sys/sunddi.h>
49*8a62a2a5SMartin Matuska #include <sys/u8_textprep.h>
50*8a62a2a5SMartin Matuska #include <sys/byteorder.h>
51*8a62a2a5SMartin Matuska #include <sys/errno.h>
52*8a62a2a5SMartin Matuska #include <sys/u8_textprep_data.h>
53*8a62a2a5SMartin Matuska #include <sys/mod.h>
54*8a62a2a5SMartin Matuska 
/*
 * The maximum possible number of bytes in a UTF-8 character. do_case_conv()
 * also uses it as the upper bound on the length of a case-mapped character.
 */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 * u8_validate() rejects longer characters with ERANGE when
 * U8_VALIDATE_UCS2_RANGE is requested.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound (last usable index) of such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
76*8a62a2a5SMartin Matuska 
/*
 * Hangul related constants.
 *
 * These are the first and the last Unicode scalar values of the Hangul
 * syllables and of the Hangul Jamo leading consonants (L), vowels (V), and
 * optional trailing consonants (T).
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below and not the
 * actual first trailing consonant, U+11A8. Because the trailing consonant
 * is optional, the value is pre-decremented by one; this is also why
 * U8_HANGUL_JAMO_T() tests with a strict '>' against it.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no-trailing-consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used as a quick
 * check on whether a character may be a Hangul Jamo, but a match does not
 * guarantee that it is one; it only means that it is likely.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

/* Number of vowels, of V x T combinations per leading consonant, and of T. */
#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

/* First UTF-8 byte shared by the Hangul Jamo (and by other characters). */
#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
111*8a62a2a5SMartin Matuska 
/*
 * Store the low 16 bits of the scalar value 'b' as a three-byte UTF-8
 * sequence into s[i], s[j], and s[k] (lead byte 0xE0 | top 4 bits, then two
 * 0x80 | 6-bit trail bytes).
 *
 * The body is wrapped in do { } while (0) and has no trailing semicolon so
 * that the macro acts as exactly one statement; otherwise only the first
 * store would be controlled by an unbraced if/else or loop. Invoke it with
 * a terminating semicolon. Note that 's' and 'b' are evaluated three times.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); \
	} while (0)
116*8a62a2a5SMartin Matuska 
/* True if 'u' lies within the Hangul Jamo leading consonant range. */
#define	U8_HANGUL_JAMO_L(u) \
	(U8_HANGUL_JAMO_L_FIRST <= (u) && U8_HANGUL_JAMO_L_LAST >= (u))

/* True if 'u' lies within the Hangul Jamo vowel range. */
#define	U8_HANGUL_JAMO_V(u) \
	(U8_HANGUL_JAMO_V_FIRST <= (u) && U8_HANGUL_JAMO_V_LAST >= (u))

/*
 * True if 'u' lies within the Hangul Jamo trailing consonant range. The
 * lower bound is exclusive because U8_HANGUL_JAMO_T_FIRST is one less than
 * the first real trailing consonant.
 */
#define	U8_HANGUL_JAMO_T(u) \
	(U8_HANGUL_JAMO_T_FIRST < (u) && U8_HANGUL_JAMO_T_LAST >= (u))

/* True if 'u' lies anywhere between the first L and the last T. */
#define	U8_HANGUL_JAMO(u) \
	(U8_HANGUL_JAMO_L_FIRST <= (u) && U8_HANGUL_JAMO_T_LAST >= (u))

/* True if 'u' is a precomposed Hangul syllable. */
#define	U8_HANGUL_SYLLABLE(u) \
	(U8_HANGUL_SYL_FIRST <= (u) && U8_HANGUL_SYL_LAST >= (u))

/* True if the state 's' is "L seen" and 'u' is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	(U8_STATE_HANGUL_L == (s) && U8_HANGUL_JAMO_V((u)))

/* True if the state 's' is "LV seen" and 'u' is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	(U8_STATE_HANGUL_LV == (s) && U8_HANGUL_JAMO_T((u)))
137*8a62a2a5SMartin Matuska 
/*
 * The types of decomposition mappings.
 * NOTE(review): presumably marker values stored in the decomposition data
 * tables; their consumers are outside this part of the file -- confirm.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/*
 * The indicator for 16-bit table.
 * NOTE(review): looks like a flag bit (0x8000) set on a table value to
 * select a 16-bit table variant -- confirm against the table users.
 */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
144*8a62a2a5SMartin Matuska 
145*8a62a2a5SMartin Matuska /* The following are some convenience macros. */
/*
 * Assemble the scalar value of a three-byte UTF-8 character from its bytes
 * b1, b2, and b3 (4 + 6 + 6 payload bits) and assign it to the lvalue 'u'.
 *
 * The expansion is one parenthesized assignment expression without a
 * trailing semicolon, so the macro can be used safely as the sole statement
 * of an unbraced if/else. Invoke it with a terminating semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
		(((uint32_t)(b2) & 0x3F) << 6)  | \
		((uint32_t)(b3) & 0x3F)))
150*8a62a2a5SMartin Matuska 
/*
 * Exchange the values of 'a' and 'b' using 't' as scratch storage.
 *
 * Wrapped in do { } while (0) with no trailing semicolon so that the three
 * assignments always execute (or are skipped) together, even when the macro
 * is the body of an unbraced if/else or loop. Invoke it with a terminating
 * semicolon. After the swap 't' holds the original value of 'a'.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)
155*8a62a2a5SMartin Matuska 
/* Map 'a' - 'z' to 'A' - 'Z'; every other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) < 'a' || (c) > 'z') ? (c) : (c) - 'a' + 'A')

/* Map 'A' - 'Z' to 'a' - 'z'; every other value is returned unchanged. */
#define	U8_ASCII_TOLOWER(c) \
	(((c) < 'A' || (c) > 'Z') ? (c) : (c) - 'A' + 'a')

/* True if the byte 'c' has its most significant bit clear (0x00 - 0x7F). */
#define	U8_ISASCII(c)			((((uchar_t)(c)) & 0x80U) == 0)
/*
 * Swap two adjacent characters, index 'a' followed by index 'b', in place
 * inside u8s and keep the bookkeeping arrays in step.
 *
 * The macro assumes that the two characters are adjacent to each other and
 * that 'a' comes before 'b'; if not, the result is wrong. It also relies on
 * the following names being in scope at the point of use: u8s (the byte
 * buffer), u8t (scratch bytes), start[] (byte offset of each character),
 * disp[] (byte length of each character), comb_class[], k (loop index), and
 * tc (swap scratch).
 *
 * Steps: save a's bytes, slide b's bytes down to a's offset, recompute where
 * the second character now starts, copy a's bytes there, then exchange the
 * combining classes and the lengths of the two slots.
 *
 * Wrapped in do { } while (0) with no trailing semicolon so the whole
 * sequence is one statement; without it an unbraced if would guard only the
 * first for loop. Invoke it with a terminating semicolon.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
179*8a62a2a5SMartin Matuska 
/*
 * The possible states during normalization. The Hangul states are named
 * after the Jamo kinds involved: L (leading consonant), V (vowel), and
 * T (trailing consonant). The enumerators take the values 0 through 6 in
 * the order listed.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L,
	U8_STATE_HANGUL_LV,
	U8_STATE_HANGUL_LVT,
	U8_STATE_HANGUL_V,
	U8_STATE_HANGUL_T,
	U8_STATE_COMBINING_MARK
} u8_normalization_states_t;
190*8a62a2a5SMartin Matuska 
/*
 * The three vectors below are used to check that the bytes of a given UTF-8
 * character are valid and do not contain any malformed byte values.
 *
 * We used to have a quite relaxed UTF-8 binary representation, but then there
 * were some security-related issues, so the Unicode Consortium defined
 * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
 * one more time at Unicode 3.2. The following three tables are based on
 * that.
 */
201*8a62a2a5SMartin Matuska 
/* True unless 'c' is within 0x80 - 0xBF, the range allowed for a next byte. */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	(!((c) >= 0x80 && (c) <= 0xBF))
203*8a62a2a5SMartin Matuska 
204*8a62a2a5SMartin Matuska #define	I_				U8_ILLEGAL_CHAR
205*8a62a2a5SMartin Matuska #define	O_				U8_OUT_OF_RANGE_CHAR
206*8a62a2a5SMartin Matuska 
207*8a62a2a5SMartin Matuska static const int8_t u8_number_of_bytes[0x100] = {
208*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
209*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
210*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
211*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
212*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
213*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
214*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
215*8a62a2a5SMartin Matuska 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
216*8a62a2a5SMartin Matuska 
217*8a62a2a5SMartin Matuska /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
218*8a62a2a5SMartin Matuska 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
219*8a62a2a5SMartin Matuska 
220*8a62a2a5SMartin Matuska /*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
221*8a62a2a5SMartin Matuska 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
222*8a62a2a5SMartin Matuska 
223*8a62a2a5SMartin Matuska /*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
224*8a62a2a5SMartin Matuska 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
225*8a62a2a5SMartin Matuska 
226*8a62a2a5SMartin Matuska /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
227*8a62a2a5SMartin Matuska 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
228*8a62a2a5SMartin Matuska 
229*8a62a2a5SMartin Matuska /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
230*8a62a2a5SMartin Matuska 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
231*8a62a2a5SMartin Matuska 
232*8a62a2a5SMartin Matuska /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
233*8a62a2a5SMartin Matuska 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
234*8a62a2a5SMartin Matuska 
235*8a62a2a5SMartin Matuska /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
236*8a62a2a5SMartin Matuska 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
237*8a62a2a5SMartin Matuska 
238*8a62a2a5SMartin Matuska /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
239*8a62a2a5SMartin Matuska 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
240*8a62a2a5SMartin Matuska };
241*8a62a2a5SMartin Matuska 
242*8a62a2a5SMartin Matuska #undef	I_
243*8a62a2a5SMartin Matuska #undef	O_
244*8a62a2a5SMartin Matuska 
/*
 * Indexed by the first byte of a multi-byte UTF-8 character; gives the
 * smallest value accepted for its second byte. Only the lead bytes
 * C2 - F4 have an entry; every index not listed is implicitly zero.
 */
static const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* C2 - DF */
	[0xC2] = 0x80, [0xC3] = 0x80, [0xC4] = 0x80, [0xC5] = 0x80,
	[0xC6] = 0x80, [0xC7] = 0x80, [0xC8] = 0x80, [0xC9] = 0x80,
	[0xCA] = 0x80, [0xCB] = 0x80, [0xCC] = 0x80, [0xCD] = 0x80,
	[0xCE] = 0x80, [0xCF] = 0x80, [0xD0] = 0x80, [0xD1] = 0x80,
	[0xD2] = 0x80, [0xD3] = 0x80, [0xD4] = 0x80, [0xD5] = 0x80,
	[0xD6] = 0x80, [0xD7] = 0x80, [0xD8] = 0x80, [0xD9] = 0x80,
	[0xDA] = 0x80, [0xDB] = 0x80, [0xDC] = 0x80, [0xDD] = 0x80,
	[0xDE] = 0x80, [0xDF] = 0x80,

	/* E0 - EF: E0 has a raised minimum. */
	[0xE0] = 0xa0,
	[0xE1] = 0x80, [0xE2] = 0x80, [0xE3] = 0x80, [0xE4] = 0x80,
	[0xE5] = 0x80, [0xE6] = 0x80, [0xE7] = 0x80, [0xE8] = 0x80,
	[0xE9] = 0x80, [0xEA] = 0x80, [0xEB] = 0x80, [0xEC] = 0x80,
	[0xED] = 0x80, [0xEE] = 0x80, [0xEF] = 0x80,

	/* F0 - F4: F0 has a raised minimum. */
	[0xF0] = 0x90,
	[0xF1] = 0x80, [0xF2] = 0x80, [0xF3] = 0x80, [0xF4] = 0x80,
};
286*8a62a2a5SMartin Matuska 
/*
 * Indexed by the first byte of a multi-byte UTF-8 character; gives the
 * largest value accepted for its second byte. Only the lead bytes
 * C2 - F4 have an entry; every index not listed is implicitly zero.
 */
static const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* C2 - DF */
	[0xC2] = 0xbf, [0xC3] = 0xbf, [0xC4] = 0xbf, [0xC5] = 0xbf,
	[0xC6] = 0xbf, [0xC7] = 0xbf, [0xC8] = 0xbf, [0xC9] = 0xbf,
	[0xCA] = 0xbf, [0xCB] = 0xbf, [0xCC] = 0xbf, [0xCD] = 0xbf,
	[0xCE] = 0xbf, [0xCF] = 0xbf, [0xD0] = 0xbf, [0xD1] = 0xbf,
	[0xD2] = 0xbf, [0xD3] = 0xbf, [0xD4] = 0xbf, [0xD5] = 0xbf,
	[0xD6] = 0xbf, [0xD7] = 0xbf, [0xD8] = 0xbf, [0xD9] = 0xbf,
	[0xDA] = 0xbf, [0xDB] = 0xbf, [0xDC] = 0xbf, [0xDD] = 0xbf,
	[0xDE] = 0xbf, [0xDF] = 0xbf,

	/* E0 - EF: ED has a lowered maximum. */
	[0xE0] = 0xbf, [0xE1] = 0xbf, [0xE2] = 0xbf, [0xE3] = 0xbf,
	[0xE4] = 0xbf, [0xE5] = 0xbf, [0xE6] = 0xbf, [0xE7] = 0xbf,
	[0xE8] = 0xbf, [0xE9] = 0xbf, [0xEA] = 0xbf, [0xEB] = 0xbf,
	[0xEC] = 0xbf,
	[0xED] = 0x9f,
	[0xEE] = 0xbf, [0xEF] = 0xbf,

	/* F0 - F4: F4 has a lowered maximum. */
	[0xF0] = 0xbf, [0xF1] = 0xbf, [0xF2] = 0xbf, [0xF3] = 0xbf,
	[0xF4] = 0x8f,
};
328*8a62a2a5SMartin Matuska 
329*8a62a2a5SMartin Matuska 
330*8a62a2a5SMartin Matuska /*
331*8a62a2a5SMartin Matuska  * The u8_validate() validates on the given UTF-8 character string and
332*8a62a2a5SMartin Matuska  * calculate the byte length. It is quite similar to mblen(3C) except that
333*8a62a2a5SMartin Matuska  * this will validate against the list of characters if required and
334*8a62a2a5SMartin Matuska  * specific to UTF-8 and Unicode.
335*8a62a2a5SMartin Matuska  */
336*8a62a2a5SMartin Matuska int
u8_validate(const char * u8str,size_t n,char ** list,int flag,int * errnum)337*8a62a2a5SMartin Matuska u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
338*8a62a2a5SMartin Matuska {
339*8a62a2a5SMartin Matuska 	uchar_t *ib;
340*8a62a2a5SMartin Matuska 	uchar_t *ibtail;
341*8a62a2a5SMartin Matuska 	uchar_t **p;
342*8a62a2a5SMartin Matuska 	uchar_t *s1;
343*8a62a2a5SMartin Matuska 	uchar_t *s2;
344*8a62a2a5SMartin Matuska 	uchar_t f;
345*8a62a2a5SMartin Matuska 	int sz;
346*8a62a2a5SMartin Matuska 	size_t i;
347*8a62a2a5SMartin Matuska 	int ret_val;
348*8a62a2a5SMartin Matuska 	boolean_t second;
349*8a62a2a5SMartin Matuska 	boolean_t no_need_to_validate_entire;
350*8a62a2a5SMartin Matuska 	boolean_t check_additional;
351*8a62a2a5SMartin Matuska 	boolean_t validate_ucs2_range_only;
352*8a62a2a5SMartin Matuska 
353*8a62a2a5SMartin Matuska 	if (! u8str)
354*8a62a2a5SMartin Matuska 		return (0);
355*8a62a2a5SMartin Matuska 
356*8a62a2a5SMartin Matuska 	ib = (uchar_t *)u8str;
357*8a62a2a5SMartin Matuska 	ibtail = ib + n;
358*8a62a2a5SMartin Matuska 
359*8a62a2a5SMartin Matuska 	ret_val = 0;
360*8a62a2a5SMartin Matuska 
361*8a62a2a5SMartin Matuska 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
362*8a62a2a5SMartin Matuska 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
363*8a62a2a5SMartin Matuska 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
364*8a62a2a5SMartin Matuska 
365*8a62a2a5SMartin Matuska 	while (ib < ibtail) {
366*8a62a2a5SMartin Matuska 		/*
367*8a62a2a5SMartin Matuska 		 * The first byte of a UTF-8 character tells how many
368*8a62a2a5SMartin Matuska 		 * bytes will follow for the character. If the first byte
369*8a62a2a5SMartin Matuska 		 * is an illegal byte value or out of range value, we just
370*8a62a2a5SMartin Matuska 		 * return -1 with an appropriate error number.
371*8a62a2a5SMartin Matuska 		 */
372*8a62a2a5SMartin Matuska 		sz = u8_number_of_bytes[*ib];
373*8a62a2a5SMartin Matuska 		if (sz == U8_ILLEGAL_CHAR) {
374*8a62a2a5SMartin Matuska 			*errnum = EILSEQ;
375*8a62a2a5SMartin Matuska 			return (-1);
376*8a62a2a5SMartin Matuska 		}
377*8a62a2a5SMartin Matuska 
378*8a62a2a5SMartin Matuska 		if (sz == U8_OUT_OF_RANGE_CHAR ||
379*8a62a2a5SMartin Matuska 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*8a62a2a5SMartin Matuska 			*errnum = ERANGE;
381*8a62a2a5SMartin Matuska 			return (-1);
382*8a62a2a5SMartin Matuska 		}
383*8a62a2a5SMartin Matuska 
384*8a62a2a5SMartin Matuska 		/*
385*8a62a2a5SMartin Matuska 		 * If we don't have enough bytes to check on, that's also
386*8a62a2a5SMartin Matuska 		 * an error. As you can see, we give illegal byte sequence
387*8a62a2a5SMartin Matuska 		 * checking higher priority then EINVAL cases.
388*8a62a2a5SMartin Matuska 		 */
389*8a62a2a5SMartin Matuska 		if ((ibtail - ib) < sz) {
390*8a62a2a5SMartin Matuska 			*errnum = EINVAL;
391*8a62a2a5SMartin Matuska 			return (-1);
392*8a62a2a5SMartin Matuska 		}
393*8a62a2a5SMartin Matuska 
394*8a62a2a5SMartin Matuska 		if (sz == 1) {
395*8a62a2a5SMartin Matuska 			ib++;
396*8a62a2a5SMartin Matuska 			ret_val++;
397*8a62a2a5SMartin Matuska 		} else {
398*8a62a2a5SMartin Matuska 			/*
399*8a62a2a5SMartin Matuska 			 * Check on the multi-byte UTF-8 character. For more
400*8a62a2a5SMartin Matuska 			 * details on this, see comment added for the used
401*8a62a2a5SMartin Matuska 			 * data structures at the beginning of the file.
402*8a62a2a5SMartin Matuska 			 */
403*8a62a2a5SMartin Matuska 			f = *ib++;
404*8a62a2a5SMartin Matuska 			ret_val++;
405*8a62a2a5SMartin Matuska 			second = B_TRUE;
406*8a62a2a5SMartin Matuska 			for (i = 1; i < sz; i++) {
407*8a62a2a5SMartin Matuska 				if (second) {
408*8a62a2a5SMartin Matuska 					if (*ib < u8_valid_min_2nd_byte[f] ||
409*8a62a2a5SMartin Matuska 					    *ib > u8_valid_max_2nd_byte[f]) {
410*8a62a2a5SMartin Matuska 						*errnum = EILSEQ;
411*8a62a2a5SMartin Matuska 						return (-1);
412*8a62a2a5SMartin Matuska 					}
413*8a62a2a5SMartin Matuska 					second = B_FALSE;
414*8a62a2a5SMartin Matuska 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*8a62a2a5SMartin Matuska 					*errnum = EILSEQ;
416*8a62a2a5SMartin Matuska 					return (-1);
417*8a62a2a5SMartin Matuska 				}
418*8a62a2a5SMartin Matuska 				ib++;
419*8a62a2a5SMartin Matuska 				ret_val++;
420*8a62a2a5SMartin Matuska 			}
421*8a62a2a5SMartin Matuska 		}
422*8a62a2a5SMartin Matuska 
423*8a62a2a5SMartin Matuska 		if (check_additional) {
424*8a62a2a5SMartin Matuska 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
425*8a62a2a5SMartin Matuska 				s1 = ib - sz;
426*8a62a2a5SMartin Matuska 				s2 = p[i];
427*8a62a2a5SMartin Matuska 				while (s1 < ib) {
428*8a62a2a5SMartin Matuska 					if (*s1 != *s2 || *s2 == '\0')
429*8a62a2a5SMartin Matuska 						break;
430*8a62a2a5SMartin Matuska 					s1++;
431*8a62a2a5SMartin Matuska 					s2++;
432*8a62a2a5SMartin Matuska 				}
433*8a62a2a5SMartin Matuska 
434*8a62a2a5SMartin Matuska 				if (s1 >= ib && *s2 == '\0') {
435*8a62a2a5SMartin Matuska 					*errnum = EBADF;
436*8a62a2a5SMartin Matuska 					return (-1);
437*8a62a2a5SMartin Matuska 				}
438*8a62a2a5SMartin Matuska 			}
439*8a62a2a5SMartin Matuska 		}
440*8a62a2a5SMartin Matuska 
441*8a62a2a5SMartin Matuska 		if (no_need_to_validate_entire)
442*8a62a2a5SMartin Matuska 			break;
443*8a62a2a5SMartin Matuska 	}
444*8a62a2a5SMartin Matuska 
445*8a62a2a5SMartin Matuska 	return (ret_val);
446*8a62a2a5SMartin Matuska }
447*8a62a2a5SMartin Matuska 
448*8a62a2a5SMartin Matuska /*
449*8a62a2a5SMartin Matuska  * The do_case_conv() looks at the mapping tables and returns found
450*8a62a2a5SMartin Matuska  * bytes if any. If not found, the input bytes are returned. The function
451*8a62a2a5SMartin Matuska  * always terminate the return bytes with a null character assuming that
452*8a62a2a5SMartin Matuska  * there are plenty of room to do so.
453*8a62a2a5SMartin Matuska  *
454*8a62a2a5SMartin Matuska  * The case conversions are simple case conversions mapping a character to
455*8a62a2a5SMartin Matuska  * another character as specified in the Unicode data. The byte size of
456*8a62a2a5SMartin Matuska  * the mapped character could be different from that of the input character.
457*8a62a2a5SMartin Matuska  *
458*8a62a2a5SMartin Matuska  * The return value is the byte length of the returned character excluding
459*8a62a2a5SMartin Matuska  * the terminating null byte.
460*8a62a2a5SMartin Matuska  */
461*8a62a2a5SMartin Matuska static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)462*8a62a2a5SMartin Matuska do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
463*8a62a2a5SMartin Matuska {
464*8a62a2a5SMartin Matuska 	size_t i;
465*8a62a2a5SMartin Matuska 	uint16_t b1 = 0;
466*8a62a2a5SMartin Matuska 	uint16_t b2 = 0;
467*8a62a2a5SMartin Matuska 	uint16_t b3 = 0;
468*8a62a2a5SMartin Matuska 	uint16_t b3_tbl;
469*8a62a2a5SMartin Matuska 	uint16_t b3_base;
470*8a62a2a5SMartin Matuska 	uint16_t b4 = 0;
471*8a62a2a5SMartin Matuska 	size_t start_id;
472*8a62a2a5SMartin Matuska 	size_t end_id;
473*8a62a2a5SMartin Matuska 
474*8a62a2a5SMartin Matuska 	/*
475*8a62a2a5SMartin Matuska 	 * At this point, the only possible values for sz are 2, 3, and 4.
476*8a62a2a5SMartin Matuska 	 * The u8s should point to a vector that is well beyond the size of
477*8a62a2a5SMartin Matuska 	 * 5 bytes.
478*8a62a2a5SMartin Matuska 	 */
479*8a62a2a5SMartin Matuska 	if (sz == 2) {
480*8a62a2a5SMartin Matuska 		b3 = u8s[0] = s[0];
481*8a62a2a5SMartin Matuska 		b4 = u8s[1] = s[1];
482*8a62a2a5SMartin Matuska 	} else if (sz == 3) {
483*8a62a2a5SMartin Matuska 		b2 = u8s[0] = s[0];
484*8a62a2a5SMartin Matuska 		b3 = u8s[1] = s[1];
485*8a62a2a5SMartin Matuska 		b4 = u8s[2] = s[2];
486*8a62a2a5SMartin Matuska 	} else if (sz == 4) {
487*8a62a2a5SMartin Matuska 		b1 = u8s[0] = s[0];
488*8a62a2a5SMartin Matuska 		b2 = u8s[1] = s[1];
489*8a62a2a5SMartin Matuska 		b3 = u8s[2] = s[2];
490*8a62a2a5SMartin Matuska 		b4 = u8s[3] = s[3];
491*8a62a2a5SMartin Matuska 	} else {
492*8a62a2a5SMartin Matuska 		/* This is not possible but just in case as a fallback. */
493*8a62a2a5SMartin Matuska 		if (is_it_toupper)
494*8a62a2a5SMartin Matuska 			*u8s = U8_ASCII_TOUPPER(*s);
495*8a62a2a5SMartin Matuska 		else
496*8a62a2a5SMartin Matuska 			*u8s = U8_ASCII_TOLOWER(*s);
497*8a62a2a5SMartin Matuska 		u8s[1] = '\0';
498*8a62a2a5SMartin Matuska 
499*8a62a2a5SMartin Matuska 		return (1);
500*8a62a2a5SMartin Matuska 	}
501*8a62a2a5SMartin Matuska 	u8s[sz] = '\0';
502*8a62a2a5SMartin Matuska 
503*8a62a2a5SMartin Matuska 	/*
504*8a62a2a5SMartin Matuska 	 * Let's find out if we have a corresponding character.
505*8a62a2a5SMartin Matuska 	 */
506*8a62a2a5SMartin Matuska 	b1 = u8_common_b1_tbl[uv][b1];
507*8a62a2a5SMartin Matuska 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
508*8a62a2a5SMartin Matuska 		return ((size_t)sz);
509*8a62a2a5SMartin Matuska 
510*8a62a2a5SMartin Matuska 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
511*8a62a2a5SMartin Matuska 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
512*8a62a2a5SMartin Matuska 		return ((size_t)sz);
513*8a62a2a5SMartin Matuska 
514*8a62a2a5SMartin Matuska 	if (is_it_toupper) {
515*8a62a2a5SMartin Matuska 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
516*8a62a2a5SMartin Matuska 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
517*8a62a2a5SMartin Matuska 			return ((size_t)sz);
518*8a62a2a5SMartin Matuska 
519*8a62a2a5SMartin Matuska 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
520*8a62a2a5SMartin Matuska 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
521*8a62a2a5SMartin Matuska 
522*8a62a2a5SMartin Matuska 		/* Either there is no match or an error at the table. */
523*8a62a2a5SMartin Matuska 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
524*8a62a2a5SMartin Matuska 			return ((size_t)sz);
525*8a62a2a5SMartin Matuska 
526*8a62a2a5SMartin Matuska 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
527*8a62a2a5SMartin Matuska 
528*8a62a2a5SMartin Matuska 		for (i = 0; start_id < end_id; start_id++)
529*8a62a2a5SMartin Matuska 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
530*8a62a2a5SMartin Matuska 	} else {
531*8a62a2a5SMartin Matuska #ifdef U8_STRCMP_CI_LOWER
532*8a62a2a5SMartin Matuska 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
533*8a62a2a5SMartin Matuska 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
534*8a62a2a5SMartin Matuska 			return ((size_t)sz);
535*8a62a2a5SMartin Matuska 
536*8a62a2a5SMartin Matuska 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
537*8a62a2a5SMartin Matuska 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
538*8a62a2a5SMartin Matuska 
539*8a62a2a5SMartin Matuska 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
540*8a62a2a5SMartin Matuska 			return ((size_t)sz);
541*8a62a2a5SMartin Matuska 
542*8a62a2a5SMartin Matuska 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
543*8a62a2a5SMartin Matuska 
544*8a62a2a5SMartin Matuska 		for (i = 0; start_id < end_id; start_id++)
545*8a62a2a5SMartin Matuska 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
546*8a62a2a5SMartin Matuska #else
547*8a62a2a5SMartin Matuska 		__builtin_unreachable();
548*8a62a2a5SMartin Matuska #endif
549*8a62a2a5SMartin Matuska 	}
550*8a62a2a5SMartin Matuska 
551*8a62a2a5SMartin Matuska 	/*
552*8a62a2a5SMartin Matuska 	 * If i is still zero, that means there is no corresponding character.
553*8a62a2a5SMartin Matuska 	 */
554*8a62a2a5SMartin Matuska 	if (i == 0)
555*8a62a2a5SMartin Matuska 		return ((size_t)sz);
556*8a62a2a5SMartin Matuska 
557*8a62a2a5SMartin Matuska 	u8s[i] = '\0';
558*8a62a2a5SMartin Matuska 
559*8a62a2a5SMartin Matuska 	return (i);
560*8a62a2a5SMartin Matuska }
561*8a62a2a5SMartin Matuska 
562*8a62a2a5SMartin Matuska /*
563*8a62a2a5SMartin Matuska  * The do_case_compare() function compares the two input strings, s1 and s2,
564*8a62a2a5SMartin Matuska  * one character at a time doing case conversions if applicable and return
565*8a62a2a5SMartin Matuska  * the comparison result as like strcmp().
566*8a62a2a5SMartin Matuska  *
567*8a62a2a5SMartin Matuska  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
568*8a62a2a5SMartin Matuska  * we treat the 7-bit ASCII characters as a special case trying to yield
569*8a62a2a5SMartin Matuska  * faster processing time.
570*8a62a2a5SMartin Matuska  */
571*8a62a2a5SMartin Matuska static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)572*8a62a2a5SMartin Matuska do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
573*8a62a2a5SMartin Matuska     size_t n2, boolean_t is_it_toupper, int *errnum)
574*8a62a2a5SMartin Matuska {
575*8a62a2a5SMartin Matuska 	int f;
576*8a62a2a5SMartin Matuska 	int sz1;
577*8a62a2a5SMartin Matuska 	int sz2;
578*8a62a2a5SMartin Matuska 	size_t j;
579*8a62a2a5SMartin Matuska 	size_t i1;
580*8a62a2a5SMartin Matuska 	size_t i2;
581*8a62a2a5SMartin Matuska 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
582*8a62a2a5SMartin Matuska 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
583*8a62a2a5SMartin Matuska 
584*8a62a2a5SMartin Matuska 	i1 = i2 = 0;
585*8a62a2a5SMartin Matuska 	while (i1 < n1 && i2 < n2) {
586*8a62a2a5SMartin Matuska 		/*
587*8a62a2a5SMartin Matuska 		 * Find out what would be the byte length for this UTF-8
588*8a62a2a5SMartin Matuska 		 * character at string s1 and also find out if this is
589*8a62a2a5SMartin Matuska 		 * an illegal start byte or not and if so, issue a proper
590*8a62a2a5SMartin Matuska 		 * error number and yet treat this byte as a character.
591*8a62a2a5SMartin Matuska 		 */
592*8a62a2a5SMartin Matuska 		sz1 = u8_number_of_bytes[*s1];
593*8a62a2a5SMartin Matuska 		if (sz1 < 0) {
594*8a62a2a5SMartin Matuska 			*errnum = EILSEQ;
595*8a62a2a5SMartin Matuska 			sz1 = 1;
596*8a62a2a5SMartin Matuska 		}
597*8a62a2a5SMartin Matuska 
598*8a62a2a5SMartin Matuska 		/*
599*8a62a2a5SMartin Matuska 		 * For 7-bit ASCII characters mainly, we do a quick case
600*8a62a2a5SMartin Matuska 		 * conversion right at here.
601*8a62a2a5SMartin Matuska 		 *
602*8a62a2a5SMartin Matuska 		 * If we don't have enough bytes for this character, issue
603*8a62a2a5SMartin Matuska 		 * an EINVAL error and use what are available.
604*8a62a2a5SMartin Matuska 		 *
605*8a62a2a5SMartin Matuska 		 * If we have enough bytes, find out if there is
606*8a62a2a5SMartin Matuska 		 * a corresponding uppercase character and if so, copy over
607*8a62a2a5SMartin Matuska 		 * the bytes for a comparison later. If there is no
608*8a62a2a5SMartin Matuska 		 * corresponding uppercase character, then, use what we have
609*8a62a2a5SMartin Matuska 		 * for the comparison.
610*8a62a2a5SMartin Matuska 		 */
611*8a62a2a5SMartin Matuska 		if (sz1 == 1) {
612*8a62a2a5SMartin Matuska 			if (is_it_toupper)
613*8a62a2a5SMartin Matuska 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
614*8a62a2a5SMartin Matuska 			else
615*8a62a2a5SMartin Matuska 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
616*8a62a2a5SMartin Matuska 			s1++;
617*8a62a2a5SMartin Matuska 			u8s1[1] = '\0';
618*8a62a2a5SMartin Matuska 		} else if ((i1 + sz1) > n1) {
619*8a62a2a5SMartin Matuska 			*errnum = EINVAL;
620*8a62a2a5SMartin Matuska 			for (j = 0; (i1 + j) < n1; )
621*8a62a2a5SMartin Matuska 				u8s1[j++] = *s1++;
622*8a62a2a5SMartin Matuska 			u8s1[j] = '\0';
623*8a62a2a5SMartin Matuska 		} else {
624*8a62a2a5SMartin Matuska 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
625*8a62a2a5SMartin Matuska 			s1 += sz1;
626*8a62a2a5SMartin Matuska 		}
627*8a62a2a5SMartin Matuska 
628*8a62a2a5SMartin Matuska 		/* Do the same for the string s2. */
629*8a62a2a5SMartin Matuska 		sz2 = u8_number_of_bytes[*s2];
630*8a62a2a5SMartin Matuska 		if (sz2 < 0) {
631*8a62a2a5SMartin Matuska 			*errnum = EILSEQ;
632*8a62a2a5SMartin Matuska 			sz2 = 1;
633*8a62a2a5SMartin Matuska 		}
634*8a62a2a5SMartin Matuska 
635*8a62a2a5SMartin Matuska 		if (sz2 == 1) {
636*8a62a2a5SMartin Matuska 			if (is_it_toupper)
637*8a62a2a5SMartin Matuska 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
638*8a62a2a5SMartin Matuska 			else
639*8a62a2a5SMartin Matuska 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
640*8a62a2a5SMartin Matuska 			s2++;
641*8a62a2a5SMartin Matuska 			u8s2[1] = '\0';
642*8a62a2a5SMartin Matuska 		} else if ((i2 + sz2) > n2) {
643*8a62a2a5SMartin Matuska 			*errnum = EINVAL;
644*8a62a2a5SMartin Matuska 			for (j = 0; (i2 + j) < n2; )
645*8a62a2a5SMartin Matuska 				u8s2[j++] = *s2++;
646*8a62a2a5SMartin Matuska 			u8s2[j] = '\0';
647*8a62a2a5SMartin Matuska 		} else {
648*8a62a2a5SMartin Matuska 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
649*8a62a2a5SMartin Matuska 			s2 += sz2;
650*8a62a2a5SMartin Matuska 		}
651*8a62a2a5SMartin Matuska 
652*8a62a2a5SMartin Matuska 		/* Now compare the two characters. */
653*8a62a2a5SMartin Matuska 		if (sz1 == 1 && sz2 == 1) {
654*8a62a2a5SMartin Matuska 			if (*u8s1 > *u8s2)
655*8a62a2a5SMartin Matuska 				return (1);
656*8a62a2a5SMartin Matuska 			if (*u8s1 < *u8s2)
657*8a62a2a5SMartin Matuska 				return (-1);
658*8a62a2a5SMartin Matuska 		} else {
659*8a62a2a5SMartin Matuska 			f = strcmp((const char *)u8s1, (const char *)u8s2);
660*8a62a2a5SMartin Matuska 			if (f != 0)
661*8a62a2a5SMartin Matuska 				return (f);
662*8a62a2a5SMartin Matuska 		}
663*8a62a2a5SMartin Matuska 
664*8a62a2a5SMartin Matuska 		/*
665*8a62a2a5SMartin Matuska 		 * They were the same. Let's move on to the next
666*8a62a2a5SMartin Matuska 		 * characters then.
667*8a62a2a5SMartin Matuska 		 */
668*8a62a2a5SMartin Matuska 		i1 += sz1;
669*8a62a2a5SMartin Matuska 		i2 += sz2;
670*8a62a2a5SMartin Matuska 	}
671*8a62a2a5SMartin Matuska 
672*8a62a2a5SMartin Matuska 	/*
673*8a62a2a5SMartin Matuska 	 * We compared until the end of either or both strings.
674*8a62a2a5SMartin Matuska 	 *
675*8a62a2a5SMartin Matuska 	 * If we reached to or went over the ends for the both, that means
676*8a62a2a5SMartin Matuska 	 * they are the same.
677*8a62a2a5SMartin Matuska 	 *
678*8a62a2a5SMartin Matuska 	 * If we reached only one of the two ends, that means the other string
679*8a62a2a5SMartin Matuska 	 * has something which then the fact can be used to determine
680*8a62a2a5SMartin Matuska 	 * the return value.
681*8a62a2a5SMartin Matuska 	 */
682*8a62a2a5SMartin Matuska 	if (i1 >= n1) {
683*8a62a2a5SMartin Matuska 		if (i2 >= n2)
684*8a62a2a5SMartin Matuska 			return (0);
685*8a62a2a5SMartin Matuska 		return (-1);
686*8a62a2a5SMartin Matuska 	}
687*8a62a2a5SMartin Matuska 	return (1);
688*8a62a2a5SMartin Matuska }
689*8a62a2a5SMartin Matuska 
690*8a62a2a5SMartin Matuska /*
691*8a62a2a5SMartin Matuska  * The combining_class() function checks on the given bytes and find out
692*8a62a2a5SMartin Matuska  * the corresponding Unicode combining class value. The return value 0 means
693*8a62a2a5SMartin Matuska  * it is a Starter. Any illegal UTF-8 character will also be treated as
694*8a62a2a5SMartin Matuska  * a Starter.
695*8a62a2a5SMartin Matuska  */
696*8a62a2a5SMartin Matuska static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)697*8a62a2a5SMartin Matuska combining_class(size_t uv, uchar_t *s, size_t sz)
698*8a62a2a5SMartin Matuska {
699*8a62a2a5SMartin Matuska 	uint16_t b1 = 0;
700*8a62a2a5SMartin Matuska 	uint16_t b2 = 0;
701*8a62a2a5SMartin Matuska 	uint16_t b3 = 0;
702*8a62a2a5SMartin Matuska 	uint16_t b4 = 0;
703*8a62a2a5SMartin Matuska 
704*8a62a2a5SMartin Matuska 	if (sz == 1 || sz > 4)
705*8a62a2a5SMartin Matuska 		return (0);
706*8a62a2a5SMartin Matuska 
707*8a62a2a5SMartin Matuska 	if (sz == 2) {
708*8a62a2a5SMartin Matuska 		b3 = s[0];
709*8a62a2a5SMartin Matuska 		b4 = s[1];
710*8a62a2a5SMartin Matuska 	} else if (sz == 3) {
711*8a62a2a5SMartin Matuska 		b2 = s[0];
712*8a62a2a5SMartin Matuska 		b3 = s[1];
713*8a62a2a5SMartin Matuska 		b4 = s[2];
714*8a62a2a5SMartin Matuska 	} else if (sz == 4) {
715*8a62a2a5SMartin Matuska 		b1 = s[0];
716*8a62a2a5SMartin Matuska 		b2 = s[1];
717*8a62a2a5SMartin Matuska 		b3 = s[2];
718*8a62a2a5SMartin Matuska 		b4 = s[3];
719*8a62a2a5SMartin Matuska 	}
720*8a62a2a5SMartin Matuska 
721*8a62a2a5SMartin Matuska 	b1 = u8_common_b1_tbl[uv][b1];
722*8a62a2a5SMartin Matuska 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
723*8a62a2a5SMartin Matuska 		return (0);
724*8a62a2a5SMartin Matuska 
725*8a62a2a5SMartin Matuska 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
726*8a62a2a5SMartin Matuska 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
727*8a62a2a5SMartin Matuska 		return (0);
728*8a62a2a5SMartin Matuska 
729*8a62a2a5SMartin Matuska 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
730*8a62a2a5SMartin Matuska 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
731*8a62a2a5SMartin Matuska 		return (0);
732*8a62a2a5SMartin Matuska 
733*8a62a2a5SMartin Matuska 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
734*8a62a2a5SMartin Matuska }
735*8a62a2a5SMartin Matuska 
736*8a62a2a5SMartin Matuska /*
737*8a62a2a5SMartin Matuska  * The do_decomp() function finds out a matching decomposition if any
738*8a62a2a5SMartin Matuska  * and return. If there is no match, the input bytes are copied and returned.
739*8a62a2a5SMartin Matuska  * The function also checks if there is a Hangul, decomposes it if necessary
740*8a62a2a5SMartin Matuska  * and returns.
741*8a62a2a5SMartin Matuska  *
742*8a62a2a5SMartin Matuska  * To save time, a single byte 7-bit ASCII character should be handled by
743*8a62a2a5SMartin Matuska  * the caller.
744*8a62a2a5SMartin Matuska  *
745*8a62a2a5SMartin Matuska  * The function returns the number of bytes returned sans always terminating
746*8a62a2a5SMartin Matuska  * the null byte. It will also return a state that will tell if there was
747*8a62a2a5SMartin Matuska  * a Hangul character decomposed which then will be used by the caller.
748*8a62a2a5SMartin Matuska  */
749*8a62a2a5SMartin Matuska static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)750*8a62a2a5SMartin Matuska do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
751*8a62a2a5SMartin Matuska     boolean_t canonical_decomposition, u8_normalization_states_t *state)
752*8a62a2a5SMartin Matuska {
753*8a62a2a5SMartin Matuska 	uint16_t b1 = 0;
754*8a62a2a5SMartin Matuska 	uint16_t b2 = 0;
755*8a62a2a5SMartin Matuska 	uint16_t b3 = 0;
756*8a62a2a5SMartin Matuska 	uint16_t b3_tbl;
757*8a62a2a5SMartin Matuska 	uint16_t b3_base;
758*8a62a2a5SMartin Matuska 	uint16_t b4 = 0;
759*8a62a2a5SMartin Matuska 	size_t start_id;
760*8a62a2a5SMartin Matuska 	size_t end_id;
761*8a62a2a5SMartin Matuska 	size_t i;
762*8a62a2a5SMartin Matuska 	uint32_t u1;
763*8a62a2a5SMartin Matuska 
764*8a62a2a5SMartin Matuska 	if (sz == 2) {
765*8a62a2a5SMartin Matuska 		b3 = u8s[0] = s[0];
766*8a62a2a5SMartin Matuska 		b4 = u8s[1] = s[1];
767*8a62a2a5SMartin Matuska 		u8s[2] = '\0';
768*8a62a2a5SMartin Matuska 	} else if (sz == 3) {
769*8a62a2a5SMartin Matuska 		/* Convert it to a Unicode scalar value. */
770*8a62a2a5SMartin Matuska 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
771*8a62a2a5SMartin Matuska 
772*8a62a2a5SMartin Matuska 		/*
773*8a62a2a5SMartin Matuska 		 * If this is a Hangul syllable, we decompose it into
774*8a62a2a5SMartin Matuska 		 * a leading consonant, a vowel, and an optional trailing
775*8a62a2a5SMartin Matuska 		 * consonant and then return.
776*8a62a2a5SMartin Matuska 		 */
777*8a62a2a5SMartin Matuska 		if (U8_HANGUL_SYLLABLE(u1)) {
778*8a62a2a5SMartin Matuska 			u1 -= U8_HANGUL_SYL_FIRST;
779*8a62a2a5SMartin Matuska 
780*8a62a2a5SMartin Matuska 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
781*8a62a2a5SMartin Matuska 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
782*8a62a2a5SMartin Matuska 			    / U8_HANGUL_T_COUNT;
783*8a62a2a5SMartin Matuska 			b3 = u1 % U8_HANGUL_T_COUNT;
784*8a62a2a5SMartin Matuska 
785*8a62a2a5SMartin Matuska 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
786*8a62a2a5SMartin Matuska 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
787*8a62a2a5SMartin Matuska 			if (b3) {
788*8a62a2a5SMartin Matuska 				b3 += U8_HANGUL_JAMO_T_FIRST;
789*8a62a2a5SMartin Matuska 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
790*8a62a2a5SMartin Matuska 
791*8a62a2a5SMartin Matuska 				u8s[9] = '\0';
792*8a62a2a5SMartin Matuska 				*state = U8_STATE_HANGUL_LVT;
793*8a62a2a5SMartin Matuska 				return (9);
794*8a62a2a5SMartin Matuska 			}
795*8a62a2a5SMartin Matuska 
796*8a62a2a5SMartin Matuska 			u8s[6] = '\0';
797*8a62a2a5SMartin Matuska 			*state = U8_STATE_HANGUL_LV;
798*8a62a2a5SMartin Matuska 			return (6);
799*8a62a2a5SMartin Matuska 		}
800*8a62a2a5SMartin Matuska 
801*8a62a2a5SMartin Matuska 		b2 = u8s[0] = s[0];
802*8a62a2a5SMartin Matuska 		b3 = u8s[1] = s[1];
803*8a62a2a5SMartin Matuska 		b4 = u8s[2] = s[2];
804*8a62a2a5SMartin Matuska 		u8s[3] = '\0';
805*8a62a2a5SMartin Matuska 
806*8a62a2a5SMartin Matuska 		/*
807*8a62a2a5SMartin Matuska 		 * If this is a Hangul Jamo, we know there is nothing
808*8a62a2a5SMartin Matuska 		 * further that we can decompose.
809*8a62a2a5SMartin Matuska 		 */
810*8a62a2a5SMartin Matuska 		if (U8_HANGUL_JAMO_L(u1)) {
811*8a62a2a5SMartin Matuska 			*state = U8_STATE_HANGUL_L;
812*8a62a2a5SMartin Matuska 			return (3);
813*8a62a2a5SMartin Matuska 		}
814*8a62a2a5SMartin Matuska 
815*8a62a2a5SMartin Matuska 		if (U8_HANGUL_JAMO_V(u1)) {
816*8a62a2a5SMartin Matuska 			if (*state == U8_STATE_HANGUL_L)
817*8a62a2a5SMartin Matuska 				*state = U8_STATE_HANGUL_LV;
818*8a62a2a5SMartin Matuska 			else
819*8a62a2a5SMartin Matuska 				*state = U8_STATE_HANGUL_V;
820*8a62a2a5SMartin Matuska 			return (3);
821*8a62a2a5SMartin Matuska 		}
822*8a62a2a5SMartin Matuska 
823*8a62a2a5SMartin Matuska 		if (U8_HANGUL_JAMO_T(u1)) {
824*8a62a2a5SMartin Matuska 			if (*state == U8_STATE_HANGUL_LV)
825*8a62a2a5SMartin Matuska 				*state = U8_STATE_HANGUL_LVT;
826*8a62a2a5SMartin Matuska 			else
827*8a62a2a5SMartin Matuska 				*state = U8_STATE_HANGUL_T;
828*8a62a2a5SMartin Matuska 			return (3);
829*8a62a2a5SMartin Matuska 		}
830*8a62a2a5SMartin Matuska 	} else if (sz == 4) {
831*8a62a2a5SMartin Matuska 		b1 = u8s[0] = s[0];
832*8a62a2a5SMartin Matuska 		b2 = u8s[1] = s[1];
833*8a62a2a5SMartin Matuska 		b3 = u8s[2] = s[2];
834*8a62a2a5SMartin Matuska 		b4 = u8s[3] = s[3];
835*8a62a2a5SMartin Matuska 		u8s[4] = '\0';
836*8a62a2a5SMartin Matuska 	} else {
837*8a62a2a5SMartin Matuska 		/*
838*8a62a2a5SMartin Matuska 		 * This is a fallback and should not happen if the function
839*8a62a2a5SMartin Matuska 		 * was called properly.
840*8a62a2a5SMartin Matuska 		 */
841*8a62a2a5SMartin Matuska 		u8s[0] = s[0];
842*8a62a2a5SMartin Matuska 		u8s[1] = '\0';
843*8a62a2a5SMartin Matuska 		*state = U8_STATE_START;
844*8a62a2a5SMartin Matuska 		return (1);
845*8a62a2a5SMartin Matuska 	}
846*8a62a2a5SMartin Matuska 
847*8a62a2a5SMartin Matuska 	/*
848*8a62a2a5SMartin Matuska 	 * At this point, this routine does not know what it would get.
849*8a62a2a5SMartin Matuska 	 * The caller should sort it out if the state isn't a Hangul one.
850*8a62a2a5SMartin Matuska 	 */
851*8a62a2a5SMartin Matuska 	*state = U8_STATE_START;
852*8a62a2a5SMartin Matuska 
853*8a62a2a5SMartin Matuska 	/* Try to find matching decomposition mapping byte sequence. */
854*8a62a2a5SMartin Matuska 	b1 = u8_common_b1_tbl[uv][b1];
855*8a62a2a5SMartin Matuska 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
856*8a62a2a5SMartin Matuska 		return ((size_t)sz);
857*8a62a2a5SMartin Matuska 
858*8a62a2a5SMartin Matuska 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
859*8a62a2a5SMartin Matuska 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
860*8a62a2a5SMartin Matuska 		return ((size_t)sz);
861*8a62a2a5SMartin Matuska 
862*8a62a2a5SMartin Matuska 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
863*8a62a2a5SMartin Matuska 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
864*8a62a2a5SMartin Matuska 		return ((size_t)sz);
865*8a62a2a5SMartin Matuska 
866*8a62a2a5SMartin Matuska 	/*
867*8a62a2a5SMartin Matuska 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
868*8a62a2a5SMartin Matuska 	 * which is 0x8000, this means we couldn't fit the mappings into
869*8a62a2a5SMartin Matuska 	 * the cardinality of a unsigned byte.
870*8a62a2a5SMartin Matuska 	 */
871*8a62a2a5SMartin Matuska 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
872*8a62a2a5SMartin Matuska 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
873*8a62a2a5SMartin Matuska 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
874*8a62a2a5SMartin Matuska 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
875*8a62a2a5SMartin Matuska 	} else {
876*8a62a2a5SMartin Matuska 		// cppcheck-suppress arrayIndexOutOfBoundsCond
877*8a62a2a5SMartin Matuska 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
878*8a62a2a5SMartin Matuska 		// cppcheck-suppress arrayIndexOutOfBoundsCond
879*8a62a2a5SMartin Matuska 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
880*8a62a2a5SMartin Matuska 	}
881*8a62a2a5SMartin Matuska 
882*8a62a2a5SMartin Matuska 	/* This also means there wasn't any matching decomposition. */
883*8a62a2a5SMartin Matuska 	if (start_id >= end_id)
884*8a62a2a5SMartin Matuska 		return ((size_t)sz);
885*8a62a2a5SMartin Matuska 
886*8a62a2a5SMartin Matuska 	/*
887*8a62a2a5SMartin Matuska 	 * The final table for decomposition mappings has three types of
888*8a62a2a5SMartin Matuska 	 * byte sequences depending on whether a mapping is for compatibility
889*8a62a2a5SMartin Matuska 	 * decomposition, canonical decomposition, or both like the following:
890*8a62a2a5SMartin Matuska 	 *
891*8a62a2a5SMartin Matuska 	 * (1) Compatibility decomposition mappings:
892*8a62a2a5SMartin Matuska 	 *
893*8a62a2a5SMartin Matuska 	 *	+---+---+-...-+---+
894*8a62a2a5SMartin Matuska 	 *	| B0| B1| ... | Bm|
895*8a62a2a5SMartin Matuska 	 *	+---+---+-...-+---+
896*8a62a2a5SMartin Matuska 	 *
897*8a62a2a5SMartin Matuska 	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
898*8a62a2a5SMartin Matuska 	 *
899*8a62a2a5SMartin Matuska 	 * (2) Canonical decomposition mappings:
900*8a62a2a5SMartin Matuska 	 *
901*8a62a2a5SMartin Matuska 	 *	+---+---+---+-...-+---+
902*8a62a2a5SMartin Matuska 	 *	| T | b0| b1| ... | bn|
903*8a62a2a5SMartin Matuska 	 *	+---+---+---+-...-+---+
904*8a62a2a5SMartin Matuska 	 *
905*8a62a2a5SMartin Matuska 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
906*8a62a2a5SMartin Matuska 	 *
907*8a62a2a5SMartin Matuska 	 * (3) Both mappings:
908*8a62a2a5SMartin Matuska 	 *
909*8a62a2a5SMartin Matuska 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
910*8a62a2a5SMartin Matuska 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
911*8a62a2a5SMartin Matuska 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
912*8a62a2a5SMartin Matuska 	 *
913*8a62a2a5SMartin Matuska 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
914*8a62a2a5SMartin Matuska 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
915*8a62a2a5SMartin Matuska 	 *	compatibility mapping bytes.
916*8a62a2a5SMartin Matuska 	 *
917*8a62a2a5SMartin Matuska 	 * Note that compatibility decomposition means doing recursive
918*8a62a2a5SMartin Matuska 	 * decompositions using both compatibility decomposition mappings and
919*8a62a2a5SMartin Matuska 	 * canonical decomposition mappings. On the other hand, canonical
920*8a62a2a5SMartin Matuska 	 * decomposition means doing recursive decompositions using only
921*8a62a2a5SMartin Matuska 	 * canonical decomposition mappings. Since the table we have has gone
922*8a62a2a5SMartin Matuska 	 * through the recursions already, we do not need to do so during
923*8a62a2a5SMartin Matuska 	 * runtime, i.e., the table has been completely flattened out
924*8a62a2a5SMartin Matuska 	 * already.
925*8a62a2a5SMartin Matuska 	 */
926*8a62a2a5SMartin Matuska 
927*8a62a2a5SMartin Matuska 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
928*8a62a2a5SMartin Matuska 
929*8a62a2a5SMartin Matuska 	/* Get the type, T, of the byte sequence. */
930*8a62a2a5SMartin Matuska 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
931*8a62a2a5SMartin Matuska 
932*8a62a2a5SMartin Matuska 	/*
933*8a62a2a5SMartin Matuska 	 * If necessary, adjust start_id, end_id, or both. Note that if
934*8a62a2a5SMartin Matuska 	 * this is compatibility decomposition mapping, there is no
935*8a62a2a5SMartin Matuska 	 * adjustment.
936*8a62a2a5SMartin Matuska 	 */
937*8a62a2a5SMartin Matuska 	if (canonical_decomposition) {
938*8a62a2a5SMartin Matuska 		/* Is the mapping only for compatibility decomposition? */
939*8a62a2a5SMartin Matuska 		if (b1 < U8_DECOMP_BOTH)
940*8a62a2a5SMartin Matuska 			return ((size_t)sz);
941*8a62a2a5SMartin Matuska 
942*8a62a2a5SMartin Matuska 		start_id++;
943*8a62a2a5SMartin Matuska 
944*8a62a2a5SMartin Matuska 		if (b1 == U8_DECOMP_BOTH) {
945*8a62a2a5SMartin Matuska 			end_id = start_id +
946*8a62a2a5SMartin Matuska 			    u8_decomp_final_tbl[uv][b3_base + start_id];
947*8a62a2a5SMartin Matuska 			start_id++;
948*8a62a2a5SMartin Matuska 		}
949*8a62a2a5SMartin Matuska 	} else {
950*8a62a2a5SMartin Matuska 		/*
951*8a62a2a5SMartin Matuska 		 * Unless this is a compatibility decomposition mapping,
952*8a62a2a5SMartin Matuska 		 * we adjust the start_id.
953*8a62a2a5SMartin Matuska 		 */
954*8a62a2a5SMartin Matuska 		if (b1 == U8_DECOMP_BOTH) {
955*8a62a2a5SMartin Matuska 			start_id++;
956*8a62a2a5SMartin Matuska 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
957*8a62a2a5SMartin Matuska 		} else if (b1 == U8_DECOMP_CANONICAL) {
958*8a62a2a5SMartin Matuska 			start_id++;
959*8a62a2a5SMartin Matuska 		}
960*8a62a2a5SMartin Matuska 	}
961*8a62a2a5SMartin Matuska 
962*8a62a2a5SMartin Matuska 	for (i = 0; start_id < end_id; start_id++)
963*8a62a2a5SMartin Matuska 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
964*8a62a2a5SMartin Matuska 	u8s[i] = '\0';
965*8a62a2a5SMartin Matuska 
966*8a62a2a5SMartin Matuska 	return (i);
967*8a62a2a5SMartin Matuska }
968*8a62a2a5SMartin Matuska 
969*8a62a2a5SMartin Matuska /*
970*8a62a2a5SMartin Matuska  * The find_composition_start() function uses the character bytes given and
971*8a62a2a5SMartin Matuska  * find out the matching composition mappings if any and return the address
972*8a62a2a5SMartin Matuska  * to the composition mappings as explained in the do_composition().
973*8a62a2a5SMartin Matuska  */
974*8a62a2a5SMartin Matuska static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)975*8a62a2a5SMartin Matuska find_composition_start(size_t uv, uchar_t *s, size_t sz)
976*8a62a2a5SMartin Matuska {
977*8a62a2a5SMartin Matuska 	uint16_t b1 = 0;
978*8a62a2a5SMartin Matuska 	uint16_t b2 = 0;
979*8a62a2a5SMartin Matuska 	uint16_t b3 = 0;
980*8a62a2a5SMartin Matuska 	uint16_t b3_tbl;
981*8a62a2a5SMartin Matuska 	uint16_t b3_base;
982*8a62a2a5SMartin Matuska 	uint16_t b4 = 0;
983*8a62a2a5SMartin Matuska 	size_t start_id;
984*8a62a2a5SMartin Matuska 	size_t end_id;
985*8a62a2a5SMartin Matuska 
986*8a62a2a5SMartin Matuska 	if (sz == 1) {
987*8a62a2a5SMartin Matuska 		b4 = s[0];
988*8a62a2a5SMartin Matuska 	} else if (sz == 2) {
989*8a62a2a5SMartin Matuska 		b3 = s[0];
990*8a62a2a5SMartin Matuska 		b4 = s[1];
991*8a62a2a5SMartin Matuska 	} else if (sz == 3) {
992*8a62a2a5SMartin Matuska 		b2 = s[0];
993*8a62a2a5SMartin Matuska 		b3 = s[1];
994*8a62a2a5SMartin Matuska 		b4 = s[2];
995*8a62a2a5SMartin Matuska 	} else if (sz == 4) {
996*8a62a2a5SMartin Matuska 		b1 = s[0];
997*8a62a2a5SMartin Matuska 		b2 = s[1];
998*8a62a2a5SMartin Matuska 		b3 = s[2];
999*8a62a2a5SMartin Matuska 		b4 = s[3];
1000*8a62a2a5SMartin Matuska 	} else {
1001*8a62a2a5SMartin Matuska 		/*
1002*8a62a2a5SMartin Matuska 		 * This is a fallback and should not happen if the function
1003*8a62a2a5SMartin Matuska 		 * was called properly.
1004*8a62a2a5SMartin Matuska 		 */
1005*8a62a2a5SMartin Matuska 		return (NULL);
1006*8a62a2a5SMartin Matuska 	}
1007*8a62a2a5SMartin Matuska 
1008*8a62a2a5SMartin Matuska 	b1 = u8_composition_b1_tbl[uv][b1];
1009*8a62a2a5SMartin Matuska 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1010*8a62a2a5SMartin Matuska 		return (NULL);
1011*8a62a2a5SMartin Matuska 
1012*8a62a2a5SMartin Matuska 	b2 = u8_composition_b2_tbl[uv][b1][b2];
1013*8a62a2a5SMartin Matuska 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1014*8a62a2a5SMartin Matuska 		return (NULL);
1015*8a62a2a5SMartin Matuska 
1016*8a62a2a5SMartin Matuska 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1017*8a62a2a5SMartin Matuska 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1018*8a62a2a5SMartin Matuska 		return (NULL);
1019*8a62a2a5SMartin Matuska 
1020*8a62a2a5SMartin Matuska 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1021*8a62a2a5SMartin Matuska 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1022*8a62a2a5SMartin Matuska 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1023*8a62a2a5SMartin Matuska 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1024*8a62a2a5SMartin Matuska 	} else {
1025*8a62a2a5SMartin Matuska 		// cppcheck-suppress arrayIndexOutOfBoundsCond
1026*8a62a2a5SMartin Matuska 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1027*8a62a2a5SMartin Matuska 		// cppcheck-suppress arrayIndexOutOfBoundsCond
1028*8a62a2a5SMartin Matuska 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1029*8a62a2a5SMartin Matuska 	}
1030*8a62a2a5SMartin Matuska 
1031*8a62a2a5SMartin Matuska 	if (start_id >= end_id)
1032*8a62a2a5SMartin Matuska 		return (NULL);
1033*8a62a2a5SMartin Matuska 
1034*8a62a2a5SMartin Matuska 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1035*8a62a2a5SMartin Matuska 
1036*8a62a2a5SMartin Matuska 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1037*8a62a2a5SMartin Matuska }
1038*8a62a2a5SMartin Matuska 
1039*8a62a2a5SMartin Matuska /*
1040*8a62a2a5SMartin Matuska  * The blocked() function checks on the combining class values of previous
1041*8a62a2a5SMartin Matuska  * characters in this sequence and return whether it is blocked or not.
1042*8a62a2a5SMartin Matuska  */
1043*8a62a2a5SMartin Matuska static boolean_t
blocked(uchar_t * comb_class,size_t last)1044*8a62a2a5SMartin Matuska blocked(uchar_t *comb_class, size_t last)
1045*8a62a2a5SMartin Matuska {
1046*8a62a2a5SMartin Matuska 	uchar_t my_comb_class;
1047*8a62a2a5SMartin Matuska 	size_t i;
1048*8a62a2a5SMartin Matuska 
1049*8a62a2a5SMartin Matuska 	my_comb_class = comb_class[last];
1050*8a62a2a5SMartin Matuska 	for (i = 1; i < last; i++)
1051*8a62a2a5SMartin Matuska 		if (comb_class[i] >= my_comb_class ||
1052*8a62a2a5SMartin Matuska 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1053*8a62a2a5SMartin Matuska 			return (B_TRUE);
1054*8a62a2a5SMartin Matuska 
1055*8a62a2a5SMartin Matuska 	return (B_FALSE);
1056*8a62a2a5SMartin Matuska }
1057*8a62a2a5SMartin Matuska 
1058*8a62a2a5SMartin Matuska /*
1059*8a62a2a5SMartin Matuska  * The do_composition() reads the character string pointed by 's' and
1060*8a62a2a5SMartin Matuska  * do necessary canonical composition and then copy over the result back to
1061*8a62a2a5SMartin Matuska  * the 's'.
1062*8a62a2a5SMartin Matuska  *
1063*8a62a2a5SMartin Matuska  * The input argument 's' cannot contain more than 32 characters.
1064*8a62a2a5SMartin Matuska  */
1065*8a62a2a5SMartin Matuska static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)1066*8a62a2a5SMartin Matuska do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1067*8a62a2a5SMartin Matuska     uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
1068*8a62a2a5SMartin Matuska {
1069*8a62a2a5SMartin Matuska 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
1070*8a62a2a5SMartin Matuska 	uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
1071*8a62a2a5SMartin Matuska 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
1072*8a62a2a5SMartin Matuska 	size_t saved_marks_count;
1073*8a62a2a5SMartin Matuska 	uchar_t *p;
1074*8a62a2a5SMartin Matuska 	uchar_t *saved_p;
1075*8a62a2a5SMartin Matuska 	uchar_t *q;
1076*8a62a2a5SMartin Matuska 	size_t i;
1077*8a62a2a5SMartin Matuska 	size_t saved_i;
1078*8a62a2a5SMartin Matuska 	size_t j;
1079*8a62a2a5SMartin Matuska 	size_t k;
1080*8a62a2a5SMartin Matuska 	size_t l;
1081*8a62a2a5SMartin Matuska 	size_t C;
1082*8a62a2a5SMartin Matuska 	size_t saved_l;
1083*8a62a2a5SMartin Matuska 	size_t size;
1084*8a62a2a5SMartin Matuska 	uint32_t u1;
1085*8a62a2a5SMartin Matuska 	uint32_t u2;
1086*8a62a2a5SMartin Matuska 	boolean_t match_not_found = B_TRUE;
1087*8a62a2a5SMartin Matuska 
1088*8a62a2a5SMartin Matuska 	/*
1089*8a62a2a5SMartin Matuska 	 * This should never happen unless the callers are doing some strange
1090*8a62a2a5SMartin Matuska 	 * and unexpected things.
1091*8a62a2a5SMartin Matuska 	 *
1092*8a62a2a5SMartin Matuska 	 * The "last" is the index pointing to the last character not last + 1.
1093*8a62a2a5SMartin Matuska 	 */
1094*8a62a2a5SMartin Matuska 	if (last >= U8_MAX_CHARS_A_SEQ)
1095*8a62a2a5SMartin Matuska 		last = U8_UPPER_LIMIT_IN_A_SEQ;
1096*8a62a2a5SMartin Matuska 
1097*8a62a2a5SMartin Matuska 	for (i = l = 0; i <= last; i++) {
1098*8a62a2a5SMartin Matuska 		/*
1099*8a62a2a5SMartin Matuska 		 * The last or any non-Starters at the beginning, we don't
1100*8a62a2a5SMartin Matuska 		 * have any chance to do composition and so we just copy them
1101*8a62a2a5SMartin Matuska 		 * to the temporary buffer.
1102*8a62a2a5SMartin Matuska 		 */
1103*8a62a2a5SMartin Matuska 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
1104*8a62a2a5SMartin Matuska SAVE_THE_CHAR:
1105*8a62a2a5SMartin Matuska 			p = s + start[i];
1106*8a62a2a5SMartin Matuska 			size = disp[i];
1107*8a62a2a5SMartin Matuska 			for (k = 0; k < size; k++)
1108*8a62a2a5SMartin Matuska 				t[l++] = *p++;
1109*8a62a2a5SMartin Matuska 			continue;
1110*8a62a2a5SMartin Matuska 		}
1111*8a62a2a5SMartin Matuska 
1112*8a62a2a5SMartin Matuska 		/*
1113*8a62a2a5SMartin Matuska 		 * If this could be a start of Hangul Jamos, then, we try to
1114*8a62a2a5SMartin Matuska 		 * conjoin them.
1115*8a62a2a5SMartin Matuska 		 */
1116*8a62a2a5SMartin Matuska 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
1117*8a62a2a5SMartin Matuska 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
1118*8a62a2a5SMartin Matuska 			    s[start[i] + 1], s[start[i] + 2]);
1119*8a62a2a5SMartin Matuska 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
1120*8a62a2a5SMartin Matuska 			    s[start[i] + 4], s[start[i] + 5]);
1121*8a62a2a5SMartin Matuska 
1122*8a62a2a5SMartin Matuska 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
1123*8a62a2a5SMartin Matuska 				u1 -= U8_HANGUL_JAMO_L_FIRST;
1124*8a62a2a5SMartin Matuska 				u2 -= U8_HANGUL_JAMO_V_FIRST;
1125*8a62a2a5SMartin Matuska 				u1 = U8_HANGUL_SYL_FIRST +
1126*8a62a2a5SMartin Matuska 				    (u1 * U8_HANGUL_V_COUNT + u2) *
1127*8a62a2a5SMartin Matuska 				    U8_HANGUL_T_COUNT;
1128*8a62a2a5SMartin Matuska 
1129*8a62a2a5SMartin Matuska 				i += 2;
1130*8a62a2a5SMartin Matuska 				if (i <= last) {
1131*8a62a2a5SMartin Matuska 					U8_PUT_3BYTES_INTO_UTF32(u2,
1132*8a62a2a5SMartin Matuska 					    s[start[i]], s[start[i] + 1],
1133*8a62a2a5SMartin Matuska 					    s[start[i] + 2]);
1134*8a62a2a5SMartin Matuska 
1135*8a62a2a5SMartin Matuska 					if (U8_HANGUL_JAMO_T(u2)) {
1136*8a62a2a5SMartin Matuska 						u1 += u2 -
1137*8a62a2a5SMartin Matuska 						    U8_HANGUL_JAMO_T_FIRST;
1138*8a62a2a5SMartin Matuska 						i++;
1139*8a62a2a5SMartin Matuska 					}
1140*8a62a2a5SMartin Matuska 				}
1141*8a62a2a5SMartin Matuska 
1142*8a62a2a5SMartin Matuska 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
1143*8a62a2a5SMartin Matuska 				i--;
1144*8a62a2a5SMartin Matuska 				l += 3;
1145*8a62a2a5SMartin Matuska 				continue;
1146*8a62a2a5SMartin Matuska 			}
1147*8a62a2a5SMartin Matuska 		}
1148*8a62a2a5SMartin Matuska 
1149*8a62a2a5SMartin Matuska 		/*
1150*8a62a2a5SMartin Matuska 		 * Let's then find out if this Starter has composition
1151*8a62a2a5SMartin Matuska 		 * mapping.
1152*8a62a2a5SMartin Matuska 		 */
1153*8a62a2a5SMartin Matuska 		p = find_composition_start(uv, s + start[i], disp[i]);
1154*8a62a2a5SMartin Matuska 		if (p == NULL)
1155*8a62a2a5SMartin Matuska 			goto SAVE_THE_CHAR;
1156*8a62a2a5SMartin Matuska 
1157*8a62a2a5SMartin Matuska 		/*
1158*8a62a2a5SMartin Matuska 		 * We have a Starter with composition mapping and the next
1159*8a62a2a5SMartin Matuska 		 * character is a non-Starter. Let's try to find out if
1160*8a62a2a5SMartin Matuska 		 * we can do composition.
1161*8a62a2a5SMartin Matuska 		 */
1162*8a62a2a5SMartin Matuska 
1163*8a62a2a5SMartin Matuska 		saved_p = p;
1164*8a62a2a5SMartin Matuska 		saved_i = i;
1165*8a62a2a5SMartin Matuska 		saved_l = l;
1166*8a62a2a5SMartin Matuska 		saved_marks_count = 0;
1167*8a62a2a5SMartin Matuska 
1168*8a62a2a5SMartin Matuska TRY_THE_NEXT_MARK:
1169*8a62a2a5SMartin Matuska 		q = s + start[++i];
1170*8a62a2a5SMartin Matuska 		size = disp[i];
1171*8a62a2a5SMartin Matuska 
1172*8a62a2a5SMartin Matuska 		/*
1173*8a62a2a5SMartin Matuska 		 * The next for() loop compares the non-Starter pointed by
1174*8a62a2a5SMartin Matuska 		 * 'q' with the possible (joinable) characters pointed by 'p'.
1175*8a62a2a5SMartin Matuska 		 *
1176*8a62a2a5SMartin Matuska 		 * The composition final table entry pointed by the 'p'
1177*8a62a2a5SMartin Matuska 		 * looks like the following:
1178*8a62a2a5SMartin Matuska 		 *
1179*8a62a2a5SMartin Matuska 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1180*8a62a2a5SMartin Matuska 		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
1181*8a62a2a5SMartin Matuska 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1182*8a62a2a5SMartin Matuska 		 *
1183*8a62a2a5SMartin Matuska 		 * where C is the count byte indicating the number of
1184*8a62a2a5SMartin Matuska 		 * mapping pairs where each pair would be look like
1185*8a62a2a5SMartin Matuska 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
1186*8a62a2a5SMartin Matuska 		 * character of a canonical decomposition and the B0-Bm are
1187*8a62a2a5SMartin Matuska 		 * the bytes of a matching composite character. The F is
1188*8a62a2a5SMartin Matuska 		 * a filler byte after each character as the separator.
1189*8a62a2a5SMartin Matuska 		 */
1190*8a62a2a5SMartin Matuska 
1191*8a62a2a5SMartin Matuska 		match_not_found = B_TRUE;
1192*8a62a2a5SMartin Matuska 
1193*8a62a2a5SMartin Matuska 		for (C = *p++; C > 0; C--) {
1194*8a62a2a5SMartin Matuska 			for (k = 0; k < size; p++, k++)
1195*8a62a2a5SMartin Matuska 				if (*p != q[k])
1196*8a62a2a5SMartin Matuska 					break;
1197*8a62a2a5SMartin Matuska 
1198*8a62a2a5SMartin Matuska 			/* Have we found it? */
1199*8a62a2a5SMartin Matuska 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
1200*8a62a2a5SMartin Matuska 				match_not_found = B_FALSE;
1201*8a62a2a5SMartin Matuska 
1202*8a62a2a5SMartin Matuska 				l = saved_l;
1203*8a62a2a5SMartin Matuska 
1204*8a62a2a5SMartin Matuska 				while (*++p != U8_TBL_ELEMENT_FILLER)
1205*8a62a2a5SMartin Matuska 					t[l++] = *p;
1206*8a62a2a5SMartin Matuska 
1207*8a62a2a5SMartin Matuska 				break;
1208*8a62a2a5SMartin Matuska 			}
1209*8a62a2a5SMartin Matuska 
1210*8a62a2a5SMartin Matuska 			/* We didn't find; skip to the next pair. */
1211*8a62a2a5SMartin Matuska 			if (*p != U8_TBL_ELEMENT_FILLER)
1212*8a62a2a5SMartin Matuska 				while (*++p != U8_TBL_ELEMENT_FILLER)
1213*8a62a2a5SMartin Matuska 					;
1214*8a62a2a5SMartin Matuska 			while (*++p != U8_TBL_ELEMENT_FILLER)
1215*8a62a2a5SMartin Matuska 				;
1216*8a62a2a5SMartin Matuska 			p++;
1217*8a62a2a5SMartin Matuska 		}
1218*8a62a2a5SMartin Matuska 
1219*8a62a2a5SMartin Matuska 		/*
1220*8a62a2a5SMartin Matuska 		 * If there was no match, we will need to save the combining
1221*8a62a2a5SMartin Matuska 		 * mark for later appending. After that, if the next one
1222*8a62a2a5SMartin Matuska 		 * is a non-Starter and not blocked, then, we try once
1223*8a62a2a5SMartin Matuska 		 * again to do composition with the next non-Starter.
1224*8a62a2a5SMartin Matuska 		 *
1225*8a62a2a5SMartin Matuska 		 * If there was no match and this was a Starter, then,
1226*8a62a2a5SMartin Matuska 		 * this is a new start.
1227*8a62a2a5SMartin Matuska 		 *
1228*8a62a2a5SMartin Matuska 		 * If there was a match and a composition done and we have
1229*8a62a2a5SMartin Matuska 		 * more to check on, then, we retrieve a new composition final
1230*8a62a2a5SMartin Matuska 		 * table entry for the composite and then try to do the
1231*8a62a2a5SMartin Matuska 		 * composition again.
1232*8a62a2a5SMartin Matuska 		 */
1233*8a62a2a5SMartin Matuska 
1234*8a62a2a5SMartin Matuska 		if (match_not_found) {
1235*8a62a2a5SMartin Matuska 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
1236*8a62a2a5SMartin Matuska 				i--;
1237*8a62a2a5SMartin Matuska 				goto SAVE_THE_CHAR;
1238*8a62a2a5SMartin Matuska 			}
1239*8a62a2a5SMartin Matuska 
1240*8a62a2a5SMartin Matuska 			saved_marks[saved_marks_count++] = i;
1241*8a62a2a5SMartin Matuska 		}
1242*8a62a2a5SMartin Matuska 
1243*8a62a2a5SMartin Matuska 		if (saved_l == l) {
1244*8a62a2a5SMartin Matuska 			while (i < last) {
1245*8a62a2a5SMartin Matuska 				if (blocked(comb_class, i + 1))
1246*8a62a2a5SMartin Matuska 					saved_marks[saved_marks_count++] = ++i;
1247*8a62a2a5SMartin Matuska 				else
1248*8a62a2a5SMartin Matuska 					break;
1249*8a62a2a5SMartin Matuska 			}
1250*8a62a2a5SMartin Matuska 			if (i < last) {
1251*8a62a2a5SMartin Matuska 				p = saved_p;
1252*8a62a2a5SMartin Matuska 				goto TRY_THE_NEXT_MARK;
1253*8a62a2a5SMartin Matuska 			}
1254*8a62a2a5SMartin Matuska 		} else if (i < last) {
1255*8a62a2a5SMartin Matuska 			p = find_composition_start(uv, t + saved_l,
1256*8a62a2a5SMartin Matuska 			    l - saved_l);
1257*8a62a2a5SMartin Matuska 			if (p != NULL) {
1258*8a62a2a5SMartin Matuska 				saved_p = p;
1259*8a62a2a5SMartin Matuska 				goto TRY_THE_NEXT_MARK;
1260*8a62a2a5SMartin Matuska 			}
1261*8a62a2a5SMartin Matuska 		}
1262*8a62a2a5SMartin Matuska 
1263*8a62a2a5SMartin Matuska 		/*
1264*8a62a2a5SMartin Matuska 		 * There is no more composition possible.
1265*8a62a2a5SMartin Matuska 		 *
1266*8a62a2a5SMartin Matuska 		 * If there was no composition what so ever then we copy
1267*8a62a2a5SMartin Matuska 		 * over the original Starter and then append any non-Starters
1268*8a62a2a5SMartin Matuska 		 * remaining at the target string sequentially after that.
1269*8a62a2a5SMartin Matuska 		 */
1270*8a62a2a5SMartin Matuska 
1271*8a62a2a5SMartin Matuska 		if (saved_l == l) {
1272*8a62a2a5SMartin Matuska 			p = s + start[saved_i];
1273*8a62a2a5SMartin Matuska 			size = disp[saved_i];
1274*8a62a2a5SMartin Matuska 			for (j = 0; j < size; j++)
1275*8a62a2a5SMartin Matuska 				t[l++] = *p++;
1276*8a62a2a5SMartin Matuska 		}
1277*8a62a2a5SMartin Matuska 
1278*8a62a2a5SMartin Matuska 		for (k = 0; k < saved_marks_count; k++) {
1279*8a62a2a5SMartin Matuska 			p = s + start[saved_marks[k]];
1280*8a62a2a5SMartin Matuska 			size = disp[saved_marks[k]];
1281*8a62a2a5SMartin Matuska 			for (j = 0; j < size; j++)
1282*8a62a2a5SMartin Matuska 				t[l++] = *p++;
1283*8a62a2a5SMartin Matuska 		}
1284*8a62a2a5SMartin Matuska 	}
1285*8a62a2a5SMartin Matuska 
1286*8a62a2a5SMartin Matuska 	/*
1287*8a62a2a5SMartin Matuska 	 * If the last character is a Starter and if we have a character
1288*8a62a2a5SMartin Matuska 	 * (possibly another Starter) that can be turned into a composite,
1289*8a62a2a5SMartin Matuska 	 * we do so and we do so until there is no more of composition
1290*8a62a2a5SMartin Matuska 	 * possible.
1291*8a62a2a5SMartin Matuska 	 */
1292*8a62a2a5SMartin Matuska 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
1293*8a62a2a5SMartin Matuska 		p = *os;
1294*8a62a2a5SMartin Matuska 		saved_l = l - disp[last];
1295*8a62a2a5SMartin Matuska 
1296*8a62a2a5SMartin Matuska 		while (p < oslast) {
1297*8a62a2a5SMartin Matuska 			int8_t number_of_bytes = u8_number_of_bytes[*p];
1298*8a62a2a5SMartin Matuska 
1299*8a62a2a5SMartin Matuska 			if (number_of_bytes <= 1)
1300*8a62a2a5SMartin Matuska 				break;
1301*8a62a2a5SMartin Matuska 			size = number_of_bytes;
1302*8a62a2a5SMartin Matuska 			if ((p + size) > oslast)
1303*8a62a2a5SMartin Matuska 				break;
1304*8a62a2a5SMartin Matuska 
1305*8a62a2a5SMartin Matuska 			saved_p = p;
1306*8a62a2a5SMartin Matuska 
1307*8a62a2a5SMartin Matuska 			for (i = 0; i < size; i++)
1308*8a62a2a5SMartin Matuska 				tc[i] = *p++;
1309*8a62a2a5SMartin Matuska 
1310*8a62a2a5SMartin Matuska 			q = find_composition_start(uv, t + saved_l,
1311*8a62a2a5SMartin Matuska 			    l - saved_l);
1312*8a62a2a5SMartin Matuska 			if (q == NULL) {
1313*8a62a2a5SMartin Matuska 				p = saved_p;
1314*8a62a2a5SMartin Matuska 				break;
1315*8a62a2a5SMartin Matuska 			}
1316*8a62a2a5SMartin Matuska 
1317*8a62a2a5SMartin Matuska 			match_not_found = B_TRUE;
1318*8a62a2a5SMartin Matuska 
1319*8a62a2a5SMartin Matuska 			for (C = *q++; C > 0; C--) {
1320*8a62a2a5SMartin Matuska 				for (k = 0; k < size; q++, k++)
1321*8a62a2a5SMartin Matuska 					if (*q != tc[k])
1322*8a62a2a5SMartin Matuska 						break;
1323*8a62a2a5SMartin Matuska 
1324*8a62a2a5SMartin Matuska 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
1325*8a62a2a5SMartin Matuska 					match_not_found = B_FALSE;
1326*8a62a2a5SMartin Matuska 
1327*8a62a2a5SMartin Matuska 					l = saved_l;
1328*8a62a2a5SMartin Matuska 
1329*8a62a2a5SMartin Matuska 					while (*++q != U8_TBL_ELEMENT_FILLER) {
1330*8a62a2a5SMartin Matuska 						/*
1331*8a62a2a5SMartin Matuska 						 * This is practically
1332*8a62a2a5SMartin Matuska 						 * impossible but we don't
1333*8a62a2a5SMartin Matuska 						 * want to take any chances.
1334*8a62a2a5SMartin Matuska 						 */
1335*8a62a2a5SMartin Matuska 						if (l >=
1336*8a62a2a5SMartin Matuska 						    U8_STREAM_SAFE_TEXT_MAX) {
1337*8a62a2a5SMartin Matuska 							p = saved_p;
1338*8a62a2a5SMartin Matuska 							goto SAFE_RETURN;
1339*8a62a2a5SMartin Matuska 						}
1340*8a62a2a5SMartin Matuska 						t[l++] = *q;
1341*8a62a2a5SMartin Matuska 					}
1342*8a62a2a5SMartin Matuska 
1343*8a62a2a5SMartin Matuska 					break;
1344*8a62a2a5SMartin Matuska 				}
1345*8a62a2a5SMartin Matuska 
1346*8a62a2a5SMartin Matuska 				if (*q != U8_TBL_ELEMENT_FILLER)
1347*8a62a2a5SMartin Matuska 					while (*++q != U8_TBL_ELEMENT_FILLER)
1348*8a62a2a5SMartin Matuska 						;
1349*8a62a2a5SMartin Matuska 				while (*++q != U8_TBL_ELEMENT_FILLER)
1350*8a62a2a5SMartin Matuska 					;
1351*8a62a2a5SMartin Matuska 				q++;
1352*8a62a2a5SMartin Matuska 			}
1353*8a62a2a5SMartin Matuska 
1354*8a62a2a5SMartin Matuska 			if (match_not_found) {
1355*8a62a2a5SMartin Matuska 				p = saved_p;
1356*8a62a2a5SMartin Matuska 				break;
1357*8a62a2a5SMartin Matuska 			}
1358*8a62a2a5SMartin Matuska 		}
1359*8a62a2a5SMartin Matuska SAFE_RETURN:
1360*8a62a2a5SMartin Matuska 		*os = p;
1361*8a62a2a5SMartin Matuska 	}
1362*8a62a2a5SMartin Matuska 
1363*8a62a2a5SMartin Matuska 	/*
1364*8a62a2a5SMartin Matuska 	 * Now we copy over the temporary string to the target string.
1365*8a62a2a5SMartin Matuska 	 * Since composition always reduces the number of characters or
1366*8a62a2a5SMartin Matuska 	 * the number of characters stay, we don't need to worry about
1367*8a62a2a5SMartin Matuska 	 * the buffer overflow here.
1368*8a62a2a5SMartin Matuska 	 */
1369*8a62a2a5SMartin Matuska 	for (i = 0; i < l; i++)
1370*8a62a2a5SMartin Matuska 		s[i] = t[i];
1371*8a62a2a5SMartin Matuska 	s[l] = '\0';
1372*8a62a2a5SMartin Matuska 
1373*8a62a2a5SMartin Matuska 	return (l);
1374*8a62a2a5SMartin Matuska }
1375*8a62a2a5SMartin Matuska 
1376*8a62a2a5SMartin Matuska /*
1377*8a62a2a5SMartin Matuska  * The collect_a_seq() function checks on the given string s, collect
1378*8a62a2a5SMartin Matuska  * a sequence of characters at u8s, and return the sequence. While it collects
1379*8a62a2a5SMartin Matuska  * a sequence, it also applies case conversion, canonical or compatibility
1380*8a62a2a5SMartin Matuska  * decomposition, canonical composition, or some or all of them and
1381*8a62a2a5SMartin Matuska  * in that order.
1382*8a62a2a5SMartin Matuska  *
1383*8a62a2a5SMartin Matuska  * The collected sequence cannot be bigger than 32 characters since if
1384*8a62a2a5SMartin Matuska  * it is having more than 31 characters, the sequence will be terminated
1385*8a62a2a5SMartin Matuska  * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
1386*8a62a2a5SMartin Matuska  * a Stream-Safe Text. The collected sequence is always terminated with
1387*8a62a2a5SMartin Matuska  * a null byte and the return value is the byte length of the sequence
1388*8a62a2a5SMartin Matuska  * (possibly 0). The return value does not include the terminating
1389*8a62a2a5SMartin Matuska  * null byte.
 *
 * Arguments:
 *   uv      passed unchanged to do_case_conv(), do_decomp(),
 *           combining_class() and do_composition(); presumably the
 *           Unicode version selector for their tables (confirm with
 *           callers).
 *   u8s     output buffer for the collected sequence.
 *           NOTE(review): looks like U8_STREAM_SAFE_TEXT_MAX + 1 bytes
 *           are expected -- confirm against callers.
 *   source  in/out read position; on return it points just past the
 *           input bytes that were consumed.
 *   slast   one past the last readable byte of the input.
 *   is_it_toupper, is_it_tolower
 *           requested case conversion. For single-byte characters
 *           toupper takes precedence when both are set.
 *   canonical_decomposition, compatibility_decomposition
 *           when either is set, the first character is decomposed and
 *           following combining marks or conjoinable Hangul Jamos are
 *           collected with it.
 *   canonical_composition
 *           compose the collected sequence; has no effect unless one of
 *           the decomposition flags is set.
 *   errnum  set to EILSEQ for an illegal byte and to EINVAL for an
 *           incomplete character at the end of the input; this function
 *           does not write it otherwise.
 *   state   normalization state; read and updated while collecting when
 *           a decomposition is requested.
1390*8a62a2a5SMartin Matuska  */
1391*8a62a2a5SMartin Matuska static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)1392*8a62a2a5SMartin Matuska collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1393*8a62a2a5SMartin Matuska     boolean_t is_it_toupper,
1394*8a62a2a5SMartin Matuska     boolean_t is_it_tolower,
1395*8a62a2a5SMartin Matuska     boolean_t canonical_decomposition,
1396*8a62a2a5SMartin Matuska     boolean_t compatibility_decomposition,
1397*8a62a2a5SMartin Matuska     boolean_t canonical_composition,
1398*8a62a2a5SMartin Matuska     int *errnum, u8_normalization_states_t *state)
1399*8a62a2a5SMartin Matuska {
1400*8a62a2a5SMartin Matuska 	uchar_t *s;
1401*8a62a2a5SMartin Matuska 	int sz;
1402*8a62a2a5SMartin Matuska 	int saved_sz;
1403*8a62a2a5SMartin Matuska 	size_t i;
1404*8a62a2a5SMartin Matuska 	size_t j;
1405*8a62a2a5SMartin Matuska 	size_t k;
1406*8a62a2a5SMartin Matuska 	size_t l;
	/*
	 * Per-character bookkeeping for the sequence being built in u8s[]:
	 * the combining class, the byte offset into u8s[] and the byte
	 * length of each collected character, indexed by character position.
	 */
1407*8a62a2a5SMartin Matuska 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
1408*8a62a2a5SMartin Matuska 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
1409*8a62a2a5SMartin Matuska 	uchar_t start[U8_MAX_CHARS_A_SEQ];
	/*
	 * NOTE(review): u8t and tc are never referenced by name in the code
	 * of this function; presumably U8_SWAP_COMB_MARKS() uses them (and k)
	 * as scratch -- confirm against the macro before renaming or removing
	 * any local. uts receives the decomposition of one combining mark.
	 */
1410*8a62a2a5SMartin Matuska 	uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };
1411*8a62a2a5SMartin Matuska 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
1412*8a62a2a5SMartin Matuska 	uchar_t tc;
1413*8a62a2a5SMartin Matuska 	size_t last;
1414*8a62a2a5SMartin Matuska 	size_t saved_last;
1415*8a62a2a5SMartin Matuska 	uint32_t u1;
1416*8a62a2a5SMartin Matuska 
1417*8a62a2a5SMartin Matuska 	/*
1418*8a62a2a5SMartin Matuska 	 * Save the source string pointer which we will return a changed
1419*8a62a2a5SMartin Matuska 	 * pointer if we do processing.
1420*8a62a2a5SMartin Matuska 	 */
1421*8a62a2a5SMartin Matuska 	s = *source;
1422*8a62a2a5SMartin Matuska 
1423*8a62a2a5SMartin Matuska 	/*
1424*8a62a2a5SMartin Matuska 	 * The following is a fallback for just in case callers are not
1425*8a62a2a5SMartin Matuska 	 * checking the string boundaries before the calling.
1426*8a62a2a5SMartin Matuska 	 */
1427*8a62a2a5SMartin Matuska 	if (s >= slast) {
1428*8a62a2a5SMartin Matuska 		u8s[0] = '\0';
1429*8a62a2a5SMartin Matuska 
1430*8a62a2a5SMartin Matuska 		return (0);
1431*8a62a2a5SMartin Matuska 	}
1432*8a62a2a5SMartin Matuska 
1433*8a62a2a5SMartin Matuska 	/*
1434*8a62a2a5SMartin Matuska 	 * As the first thing, let's collect a character and do case
1435*8a62a2a5SMartin Matuska 	 * conversion if necessary.
1436*8a62a2a5SMartin Matuska 	 */
1437*8a62a2a5SMartin Matuska 
1438*8a62a2a5SMartin Matuska 	sz = u8_number_of_bytes[*s];
1439*8a62a2a5SMartin Matuska 
	/*
	 * A negative table entry is reported as an illegal byte: set EILSEQ,
	 * pass the byte through unchanged and consume only that one byte.
	 */
1440*8a62a2a5SMartin Matuska 	if (sz < 0) {
1441*8a62a2a5SMartin Matuska 		*errnum = EILSEQ;
1442*8a62a2a5SMartin Matuska 
1443*8a62a2a5SMartin Matuska 		u8s[0] = *s++;
1444*8a62a2a5SMartin Matuska 		u8s[1] = '\0';
1445*8a62a2a5SMartin Matuska 
1446*8a62a2a5SMartin Matuska 		*source = s;
1447*8a62a2a5SMartin Matuska 
1448*8a62a2a5SMartin Matuska 		return (1);
1449*8a62a2a5SMartin Matuska 	}
1450*8a62a2a5SMartin Matuska 
	/*
	 * Three cases follow. A single-byte character is case-converted
	 * inline (toupper wins if both flags are set). A multi-byte character
	 * that would run past slast is reported with EINVAL and its leftover
	 * bytes are returned as they are. Any other multi-byte character is
	 * either case-converted by do_case_conv() or copied verbatim.
	 */
1451*8a62a2a5SMartin Matuska 	if (sz == 1) {
1452*8a62a2a5SMartin Matuska 		if (is_it_toupper)
1453*8a62a2a5SMartin Matuska 			u8s[0] = U8_ASCII_TOUPPER(*s);
1454*8a62a2a5SMartin Matuska 		else if (is_it_tolower)
1455*8a62a2a5SMartin Matuska 			u8s[0] = U8_ASCII_TOLOWER(*s);
1456*8a62a2a5SMartin Matuska 		else
1457*8a62a2a5SMartin Matuska 			u8s[0] = *s;
1458*8a62a2a5SMartin Matuska 		s++;
1459*8a62a2a5SMartin Matuska 		u8s[1] = '\0';
1460*8a62a2a5SMartin Matuska 	} else if ((s + sz) > slast) {
1461*8a62a2a5SMartin Matuska 		*errnum = EINVAL;
1462*8a62a2a5SMartin Matuska 
1463*8a62a2a5SMartin Matuska 		for (i = 0; s < slast; )
1464*8a62a2a5SMartin Matuska 			u8s[i++] = *s++;
1465*8a62a2a5SMartin Matuska 		u8s[i] = '\0';
1466*8a62a2a5SMartin Matuska 
1467*8a62a2a5SMartin Matuska 		*source = s;
1468*8a62a2a5SMartin Matuska 
1469*8a62a2a5SMartin Matuska 		return (i);
1470*8a62a2a5SMartin Matuska 	} else {
1471*8a62a2a5SMartin Matuska 		if (is_it_toupper || is_it_tolower) {
			/*
			 * The source advances by the original length; sz then
			 * tracks the length do_case_conv() reported for u8s.
			 */
1472*8a62a2a5SMartin Matuska 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
1473*8a62a2a5SMartin Matuska 			s += sz;
1474*8a62a2a5SMartin Matuska 			sz = i;
1475*8a62a2a5SMartin Matuska 		} else {
1476*8a62a2a5SMartin Matuska 			for (i = 0; i < sz; )
1477*8a62a2a5SMartin Matuska 				u8s[i++] = *s++;
1478*8a62a2a5SMartin Matuska 			u8s[i] = '\0';
1479*8a62a2a5SMartin Matuska 		}
1480*8a62a2a5SMartin Matuska 	}
1481*8a62a2a5SMartin Matuska 
1482*8a62a2a5SMartin Matuska 	/*
1483*8a62a2a5SMartin Matuska 	 * And then canonical/compatibility decomposition followed by
1484*8a62a2a5SMartin Matuska 	 * an optional canonical composition. Please be noted that
1485*8a62a2a5SMartin Matuska 	 * canonical composition is done only when a decomposition is
1486*8a62a2a5SMartin Matuska 	 * done.
1487*8a62a2a5SMartin Matuska 	 */
1488*8a62a2a5SMartin Matuska 	if (canonical_decomposition || compatibility_decomposition) {
1489*8a62a2a5SMartin Matuska 		if (sz == 1) {
			/*
			 * A one-byte result is recorded as a single character
			 * with combining class 0 at u8s[0]. Until the
			 * "last--" further below, last holds the number of
			 * characters collected so far, not an index.
			 */
1490*8a62a2a5SMartin Matuska 			*state = U8_STATE_START;
1491*8a62a2a5SMartin Matuska 
1492*8a62a2a5SMartin Matuska 			saved_sz = 1;
1493*8a62a2a5SMartin Matuska 
1494*8a62a2a5SMartin Matuska 			comb_class[0] = 0;
1495*8a62a2a5SMartin Matuska 			start[0] = 0;
1496*8a62a2a5SMartin Matuska 			disp[0] = 1;
1497*8a62a2a5SMartin Matuska 
1498*8a62a2a5SMartin Matuska 			last = 1;
1499*8a62a2a5SMartin Matuska 		} else {
			/*
			 * Decompose in place, then walk the result one
			 * character at a time to fill in the bookkeeping
			 * arrays. saved_sz is the running byte length of u8s.
			 */
1500*8a62a2a5SMartin Matuska 			saved_sz = do_decomp(uv, u8s, u8s, sz,
1501*8a62a2a5SMartin Matuska 			    canonical_decomposition, state);
1502*8a62a2a5SMartin Matuska 
1503*8a62a2a5SMartin Matuska 			last = 0;
1504*8a62a2a5SMartin Matuska 
1505*8a62a2a5SMartin Matuska 			for (i = 0; i < saved_sz; ) {
1506*8a62a2a5SMartin Matuska 				sz = u8_number_of_bytes[u8s[i]];
1507*8a62a2a5SMartin Matuska 
1508*8a62a2a5SMartin Matuska 				comb_class[last] = combining_class(uv,
1509*8a62a2a5SMartin Matuska 				    u8s + i, sz);
1510*8a62a2a5SMartin Matuska 				start[last] = i;
1511*8a62a2a5SMartin Matuska 				disp[last] = sz;
1512*8a62a2a5SMartin Matuska 
1513*8a62a2a5SMartin Matuska 				last++;
1514*8a62a2a5SMartin Matuska 				i += sz;
1515*8a62a2a5SMartin Matuska 			}
1516*8a62a2a5SMartin Matuska 
1517*8a62a2a5SMartin Matuska 			/*
1518*8a62a2a5SMartin Matuska 			 * Decomposition yields various Hangul related
1519*8a62a2a5SMartin Matuska 			 * states but not on combining marks. We need to
1520*8a62a2a5SMartin Matuska 			 * find out at here by checking on the last
1521*8a62a2a5SMartin Matuska 			 * character.
1522*8a62a2a5SMartin Matuska 			 */
1523*8a62a2a5SMartin Matuska 			if (*state == U8_STATE_START) {
1524*8a62a2a5SMartin Matuska 				if (comb_class[last - 1])
1525*8a62a2a5SMartin Matuska 					*state = U8_STATE_COMBINING_MARK;
1526*8a62a2a5SMartin Matuska 			}
1527*8a62a2a5SMartin Matuska 		}
1528*8a62a2a5SMartin Matuska 
		/*
		 * Remember how many characters the first source character
		 * produced; the reordering step after the loop runs only if
		 * the loop collected at least one more.
		 */
1529*8a62a2a5SMartin Matuska 		saved_last = last;
1530*8a62a2a5SMartin Matuska 
1531*8a62a2a5SMartin Matuska 		while (s < slast) {
1532*8a62a2a5SMartin Matuska 			sz = u8_number_of_bytes[*s];
1533*8a62a2a5SMartin Matuska 
1534*8a62a2a5SMartin Matuska 			/*
1535*8a62a2a5SMartin Matuska 			 * If this is an illegal character, an incomplete
1536*8a62a2a5SMartin Matuska 			 * character, or an 7-bit ASCII Starter character,
1537*8a62a2a5SMartin Matuska 			 * then we have collected a sequence; break and let
1538*8a62a2a5SMartin Matuska 			 * the next call deal with the two cases.
1539*8a62a2a5SMartin Matuska 			 *
1540*8a62a2a5SMartin Matuska 			 * Note that this is okay only if you are using this
1541*8a62a2a5SMartin Matuska 			 * function with a fixed length string, not on
1542*8a62a2a5SMartin Matuska 			 * a buffer with multiple calls of one chunk at a time.
1543*8a62a2a5SMartin Matuska 			 */
1544*8a62a2a5SMartin Matuska 			if (sz <= 1) {
1545*8a62a2a5SMartin Matuska 				break;
1546*8a62a2a5SMartin Matuska 			} else if ((s + sz) > slast) {
1547*8a62a2a5SMartin Matuska 				break;
1548*8a62a2a5SMartin Matuska 			} else {
1549*8a62a2a5SMartin Matuska 				/*
1550*8a62a2a5SMartin Matuska 				 * If the previous character was a Hangul Jamo
1551*8a62a2a5SMartin Matuska 				 * and this character is a Hangul Jamo that
1552*8a62a2a5SMartin Matuska 				 * can be conjoined, we collect the Jamo.
1553*8a62a2a5SMartin Matuska 				 */
1554*8a62a2a5SMartin Matuska 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
1555*8a62a2a5SMartin Matuska 					U8_PUT_3BYTES_INTO_UTF32(u1,
1556*8a62a2a5SMartin Matuska 					    *s, *(s + 1), *(s + 2));
1557*8a62a2a5SMartin Matuska 
					/*
					 * In both branches i is set to 0
					 * because it is what gets stored as
					 * this Jamo's combining class after
					 * the jump to COLLECT_A_HANGUL.
					 */
1558*8a62a2a5SMartin Matuska 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
1559*8a62a2a5SMartin Matuska 					    u1)) {
1560*8a62a2a5SMartin Matuska 						i = 0;
1561*8a62a2a5SMartin Matuska 						*state = U8_STATE_HANGUL_LV;
1562*8a62a2a5SMartin Matuska 						goto COLLECT_A_HANGUL;
1563*8a62a2a5SMartin Matuska 					}
1564*8a62a2a5SMartin Matuska 
1565*8a62a2a5SMartin Matuska 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
1566*8a62a2a5SMartin Matuska 					    u1)) {
1567*8a62a2a5SMartin Matuska 						i = 0;
1568*8a62a2a5SMartin Matuska 						*state = U8_STATE_HANGUL_LVT;
1569*8a62a2a5SMartin Matuska 						goto COLLECT_A_HANGUL;
1570*8a62a2a5SMartin Matuska 					}
1571*8a62a2a5SMartin Matuska 				}
1572*8a62a2a5SMartin Matuska 
1573*8a62a2a5SMartin Matuska 				/*
1574*8a62a2a5SMartin Matuska 				 * Regardless of whatever it was, if this is
1575*8a62a2a5SMartin Matuska 				 * a Starter, we don't collect the character
1576*8a62a2a5SMartin Matuska 				 * since that's a new start and we will deal
1577*8a62a2a5SMartin Matuska 				 * with it at the next time.
1578*8a62a2a5SMartin Matuska 				 */
1579*8a62a2a5SMartin Matuska 				i = combining_class(uv, s, sz);
1580*8a62a2a5SMartin Matuska 				if (i == U8_COMBINING_CLASS_STARTER)
1581*8a62a2a5SMartin Matuska 					break;
1582*8a62a2a5SMartin Matuska 
1583*8a62a2a5SMartin Matuska 				/*
1584*8a62a2a5SMartin Matuska 				 * We know the current character is a combining
1585*8a62a2a5SMartin Matuska 				 * mark. If the previous character wasn't
1586*8a62a2a5SMartin Matuska 				 * a Starter (not Hangul) or a combining mark,
1587*8a62a2a5SMartin Matuska 				 * then, we don't collect this combining mark.
1588*8a62a2a5SMartin Matuska 				 */
1589*8a62a2a5SMartin Matuska 				if (*state != U8_STATE_START &&
1590*8a62a2a5SMartin Matuska 				    *state != U8_STATE_COMBINING_MARK)
1591*8a62a2a5SMartin Matuska 					break;
1592*8a62a2a5SMartin Matuska 
1593*8a62a2a5SMartin Matuska 				*state = U8_STATE_COMBINING_MARK;
1594*8a62a2a5SMartin Matuska COLLECT_A_HANGUL:
1595*8a62a2a5SMartin Matuska 				/*
1596*8a62a2a5SMartin Matuska 				 * If we collected a Starter and combining
1597*8a62a2a5SMartin Matuska 				 * marks up to 30, i.e., total 31 characters,
1598*8a62a2a5SMartin Matuska 				 * then, we terminate this degenerately long
1599*8a62a2a5SMartin Matuska 				 * combining sequence with a U+034F COMBINING
1600*8a62a2a5SMartin Matuska 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
1601*8a62a2a5SMartin Matuska 				 * UTF-8 and turn this into a Stream-Safe
1602*8a62a2a5SMartin Matuska 				 * Text. This will be extremely rare but
1603*8a62a2a5SMartin Matuska 				 * possible.
1604*8a62a2a5SMartin Matuska 				 *
1605*8a62a2a5SMartin Matuska 				 * The following will also guarantee that
1606*8a62a2a5SMartin Matuska 				 * we are not writing more than 32 characters
1607*8a62a2a5SMartin Matuska 				 * plus a NULL at u8s[].
1608*8a62a2a5SMartin Matuska 				 */
1609*8a62a2a5SMartin Matuska 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
1610*8a62a2a5SMartin Matuska TURN_STREAM_SAFE:
					/*
					 * Append CGJ as a 2-byte character
					 * with combining class 0 and stop.
					 * s has not been advanced on any path
					 * that gets here, so the character
					 * that triggered this is left for the
					 * next call.
					 */
1611*8a62a2a5SMartin Matuska 					*state = U8_STATE_START;
1612*8a62a2a5SMartin Matuska 					comb_class[last] = 0;
1613*8a62a2a5SMartin Matuska 					start[last] = saved_sz;
1614*8a62a2a5SMartin Matuska 					disp[last] = 2;
1615*8a62a2a5SMartin Matuska 					last++;
1616*8a62a2a5SMartin Matuska 
1617*8a62a2a5SMartin Matuska 					u8s[saved_sz++] = 0xCD;
1618*8a62a2a5SMartin Matuska 					u8s[saved_sz++] = 0x8F;
1619*8a62a2a5SMartin Matuska 
1620*8a62a2a5SMartin Matuska 					break;
1621*8a62a2a5SMartin Matuska 				}
1622*8a62a2a5SMartin Matuska 
1623*8a62a2a5SMartin Matuska 				/*
1624*8a62a2a5SMartin Matuska 				 * Some combining marks also do decompose into
1625*8a62a2a5SMartin Matuska 				 * another combining mark or marks.
1626*8a62a2a5SMartin Matuska 				 */
1627*8a62a2a5SMartin Matuska 				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * k remembers last so that a partly
					 * recorded decomposition can be rolled
					 * back; l keeps the source byte length
					 * because sz is reused in the loop.
					 * The decomposed bytes stay in uts and
					 * are copied to u8s only after the
					 * whole decomposition fits.
					 */
1628*8a62a2a5SMartin Matuska 					k = last;
1629*8a62a2a5SMartin Matuska 					l = sz;
1630*8a62a2a5SMartin Matuska 					i = do_decomp(uv, uts, s, sz,
1631*8a62a2a5SMartin Matuska 					    canonical_decomposition, state);
1632*8a62a2a5SMartin Matuska 					for (j = 0; j < i; ) {
1633*8a62a2a5SMartin Matuska 						sz = u8_number_of_bytes[uts[j]];
1634*8a62a2a5SMartin Matuska 
1635*8a62a2a5SMartin Matuska 						comb_class[last] =
1636*8a62a2a5SMartin Matuska 						    combining_class(uv,
1637*8a62a2a5SMartin Matuska 						    uts + j, sz);
1638*8a62a2a5SMartin Matuska 						start[last] = saved_sz + j;
1639*8a62a2a5SMartin Matuska 						disp[last] = sz;
1640*8a62a2a5SMartin Matuska 
1641*8a62a2a5SMartin Matuska 						last++;
1642*8a62a2a5SMartin Matuska 						if (last >=
1643*8a62a2a5SMartin Matuska 						    U8_UPPER_LIMIT_IN_A_SEQ) {
1644*8a62a2a5SMartin Matuska 							last = k;
1645*8a62a2a5SMartin Matuska 							goto TURN_STREAM_SAFE;
1646*8a62a2a5SMartin Matuska 						}
1647*8a62a2a5SMartin Matuska 						j += sz;
1648*8a62a2a5SMartin Matuska 					}
1649*8a62a2a5SMartin Matuska 
					/*
					 * do_decomp() was handed state, so it
					 * is set again here (presumably it can
					 * be changed there -- confirm). sz now
					 * becomes the decomposed byte length.
					 */
1650*8a62a2a5SMartin Matuska 					*state = U8_STATE_COMBINING_MARK;
1651*8a62a2a5SMartin Matuska 					sz = i;
1652*8a62a2a5SMartin Matuska 					s += l;
1653*8a62a2a5SMartin Matuska 
1654*8a62a2a5SMartin Matuska 					for (i = 0; i < sz; i++)
1655*8a62a2a5SMartin Matuska 						u8s[saved_sz++] = uts[i];
1656*8a62a2a5SMartin Matuska 				} else {
					/*
					 * Hangul Jamo path: record the
					 * character with class i and copy its
					 * bytes straight from the source.
					 */
1657*8a62a2a5SMartin Matuska 					comb_class[last] = i;
1658*8a62a2a5SMartin Matuska 					start[last] = saved_sz;
1659*8a62a2a5SMartin Matuska 					disp[last] = sz;
1660*8a62a2a5SMartin Matuska 					last++;
1661*8a62a2a5SMartin Matuska 
1662*8a62a2a5SMartin Matuska 					for (i = 0; i < sz; i++)
1663*8a62a2a5SMartin Matuska 						u8s[saved_sz++] = *s++;
1664*8a62a2a5SMartin Matuska 				}
1665*8a62a2a5SMartin Matuska 
1666*8a62a2a5SMartin Matuska 				/*
1667*8a62a2a5SMartin Matuska 				 * If this is U+0345 COMBINING GREEK
1668*8a62a2a5SMartin Matuska 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
1669*8a62a2a5SMartin Matuska 				 * iota subscript, and need to be converted to
1670*8a62a2a5SMartin Matuska 				 * uppercase letter, convert it to U+0399 GREEK
1671*8a62a2a5SMartin Matuska 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
1672*8a62a2a5SMartin Matuska 				 * i.e., convert to capital adscript form as
1673*8a62a2a5SMartin Matuska 				 * specified in the Unicode standard.
1674*8a62a2a5SMartin Matuska 				 *
1675*8a62a2a5SMartin Matuska 				 * This is the only special case of (ambiguous)
1676*8a62a2a5SMartin Matuska 				 * case conversion at combining marks and
1677*8a62a2a5SMartin Matuska 				 * probably the standard will never have
1678*8a62a2a5SMartin Matuska 				 * anything similar like this in future.
1679*8a62a2a5SMartin Matuska 				 */
1680*8a62a2a5SMartin Matuska 				if (is_it_toupper && sz >= 2 &&
1681*8a62a2a5SMartin Matuska 				    u8s[saved_sz - 2] == 0xCD &&
1682*8a62a2a5SMartin Matuska 				    u8s[saved_sz - 1] == 0x85) {
1683*8a62a2a5SMartin Matuska 					u8s[saved_sz - 2] = 0xCE;
1684*8a62a2a5SMartin Matuska 					u8s[saved_sz - 1] = 0x99;
1685*8a62a2a5SMartin Matuska 				}
1686*8a62a2a5SMartin Matuska 			}
1687*8a62a2a5SMartin Matuska 		}
1688*8a62a2a5SMartin Matuska 
1689*8a62a2a5SMartin Matuska 		/*
1690*8a62a2a5SMartin Matuska 		 * Let's try to ensure a canonical ordering for the collected
1691*8a62a2a5SMartin Matuska 		 * combining marks. We do this only if we have collected
1692*8a62a2a5SMartin Matuska 		 * at least one more non-Starter. (The decomposition mapping
1693*8a62a2a5SMartin Matuska 		 * data tables have fully (and recursively) expanded and
1694*8a62a2a5SMartin Matuska 		 * canonically ordered decompositions.)
1695*8a62a2a5SMartin Matuska 		 *
1696*8a62a2a5SMartin Matuska 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
1697*8a62a2a5SMartin Matuska 		 * assumptions and we are meeting the assumptions.
1698*8a62a2a5SMartin Matuska 		 */
		/*
		 * From here on last is the index of the final character
		 * rather than a count. Each pass swaps adjacent characters
		 * whose combining classes are out of order; a character with
		 * class 0 at position j is never moved ahead of its
		 * predecessor.
		 */
1699*8a62a2a5SMartin Matuska 		last--;
1700*8a62a2a5SMartin Matuska 		if (last >= saved_last) {
1701*8a62a2a5SMartin Matuska 			for (i = 0; i < last; i++)
1702*8a62a2a5SMartin Matuska 				for (j = last; j > i; j--)
1703*8a62a2a5SMartin Matuska 					if (comb_class[j] &&
1704*8a62a2a5SMartin Matuska 					    comb_class[j - 1] > comb_class[j]) {
1705*8a62a2a5SMartin Matuska 						U8_SWAP_COMB_MARKS(j - 1, j);
1706*8a62a2a5SMartin Matuska 					}
1707*8a62a2a5SMartin Matuska 		}
1708*8a62a2a5SMartin Matuska 
		/* Publish the consumed position before the early return. */
1709*8a62a2a5SMartin Matuska 		*source = s;
1710*8a62a2a5SMartin Matuska 
1711*8a62a2a5SMartin Matuska 		if (! canonical_composition) {
1712*8a62a2a5SMartin Matuska 			u8s[saved_sz] = '\0';
1713*8a62a2a5SMartin Matuska 			return (saved_sz);
1714*8a62a2a5SMartin Matuska 		}
1715*8a62a2a5SMartin Matuska 
1716*8a62a2a5SMartin Matuska 		/*
1717*8a62a2a5SMartin Matuska 		 * Now do the canonical composition. Note that we do this
1718*8a62a2a5SMartin Matuska 		 * only after a canonical or compatibility decomposition to
1719*8a62a2a5SMartin Matuska 		 * finish up NFC or NFKC.
1720*8a62a2a5SMartin Matuska 		 */
		/*
		 * s is passed by address so that do_composition() can consume
		 * further source bytes (up to slast) while composing; the
		 * updated position is stored to the caller just below.
		 */
1721*8a62a2a5SMartin Matuska 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
1722*8a62a2a5SMartin Matuska 		    &s, slast);
1723*8a62a2a5SMartin Matuska 	}
1724*8a62a2a5SMartin Matuska 
	/*
	 * Reached when no decomposition was requested (sz is the byte length
	 * produced by the first step) and after composition (sz is the value
	 * returned by do_composition()).
	 */
1725*8a62a2a5SMartin Matuska 	*source = s;
1726*8a62a2a5SMartin Matuska 
1727*8a62a2a5SMartin Matuska 	return ((size_t)sz);
1728*8a62a2a5SMartin Matuska }
1729*8a62a2a5SMartin Matuska 
1730*8a62a2a5SMartin Matuska /*
1731*8a62a2a5SMartin Matuska  * The do_norm_compare() function does string comparison based on Unicode
1732*8a62a2a5SMartin Matuska  * simple case mappings and Unicode Normalization definitions.
1733*8a62a2a5SMartin Matuska  *
1734*8a62a2a5SMartin Matuska  * It does so by collecting a sequence of character at a time and comparing
1735*8a62a2a5SMartin Matuska  * the collected sequences from the strings.
1736*8a62a2a5SMartin Matuska  *
1737*8a62a2a5SMartin Matuska  * The meanings on the return values are the same as the usual strcmp().
1738*8a62a2a5SMartin Matuska  */
1739*8a62a2a5SMartin Matuska static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)1740*8a62a2a5SMartin Matuska do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1741*8a62a2a5SMartin Matuska     int flag, int *errnum)
1742*8a62a2a5SMartin Matuska {
1743*8a62a2a5SMartin Matuska 	int result;
1744*8a62a2a5SMartin Matuska 	size_t sz1;
1745*8a62a2a5SMartin Matuska 	size_t sz2;
1746*8a62a2a5SMartin Matuska 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1747*8a62a2a5SMartin Matuska 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1748*8a62a2a5SMartin Matuska 	uchar_t *s1last;
1749*8a62a2a5SMartin Matuska 	uchar_t *s2last;
1750*8a62a2a5SMartin Matuska 	boolean_t is_it_toupper;
1751*8a62a2a5SMartin Matuska 	boolean_t is_it_tolower;
1752*8a62a2a5SMartin Matuska 	boolean_t canonical_decomposition;
1753*8a62a2a5SMartin Matuska 	boolean_t compatibility_decomposition;
1754*8a62a2a5SMartin Matuska 	boolean_t canonical_composition;
1755*8a62a2a5SMartin Matuska 	u8_normalization_states_t state;
1756*8a62a2a5SMartin Matuska 
1757*8a62a2a5SMartin Matuska 	s1last = s1 + n1;
1758*8a62a2a5SMartin Matuska 	s2last = s2 + n2;
1759*8a62a2a5SMartin Matuska 
1760*8a62a2a5SMartin Matuska 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1761*8a62a2a5SMartin Matuska #ifdef U8_STRCMP_CI_LOWER
1762*8a62a2a5SMartin Matuska 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1763*8a62a2a5SMartin Matuska #else
1764*8a62a2a5SMartin Matuska 	is_it_tolower = 0;
1765*8a62a2a5SMartin Matuska #endif
1766*8a62a2a5SMartin Matuska 	canonical_decomposition = flag & U8_CANON_DECOMP;
1767*8a62a2a5SMartin Matuska 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1768*8a62a2a5SMartin Matuska 	canonical_composition = flag & U8_CANON_COMP;
1769*8a62a2a5SMartin Matuska 
1770*8a62a2a5SMartin Matuska 	while (s1 < s1last && s2 < s2last) {
1771*8a62a2a5SMartin Matuska 		/*
1772*8a62a2a5SMartin Matuska 		 * If the current character is a 7-bit ASCII and the last
1773*8a62a2a5SMartin Matuska 		 * character, or, if the current character and the next
1774*8a62a2a5SMartin Matuska 		 * character are both some 7-bit ASCII characters then
1775*8a62a2a5SMartin Matuska 		 * we treat the current character as a sequence.
1776*8a62a2a5SMartin Matuska 		 *
1777*8a62a2a5SMartin Matuska 		 * In any other cases, we need to call collect_a_seq().
1778*8a62a2a5SMartin Matuska 		 */
1779*8a62a2a5SMartin Matuska 
1780*8a62a2a5SMartin Matuska 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1781*8a62a2a5SMartin Matuska 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1782*8a62a2a5SMartin Matuska 			if (is_it_toupper)
1783*8a62a2a5SMartin Matuska 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1784*8a62a2a5SMartin Matuska 			else if (is_it_tolower)
1785*8a62a2a5SMartin Matuska 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1786*8a62a2a5SMartin Matuska 			else
1787*8a62a2a5SMartin Matuska 				u8s1[0] = *s1;
1788*8a62a2a5SMartin Matuska 			u8s1[1] = '\0';
1789*8a62a2a5SMartin Matuska 			sz1 = 1;
1790*8a62a2a5SMartin Matuska 			s1++;
1791*8a62a2a5SMartin Matuska 		} else {
1792*8a62a2a5SMartin Matuska 			state = U8_STATE_START;
1793*8a62a2a5SMartin Matuska 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1794*8a62a2a5SMartin Matuska 			    is_it_toupper, is_it_tolower,
1795*8a62a2a5SMartin Matuska 			    canonical_decomposition,
1796*8a62a2a5SMartin Matuska 			    compatibility_decomposition,
1797*8a62a2a5SMartin Matuska 			    canonical_composition, errnum, &state);
1798*8a62a2a5SMartin Matuska 		}
1799*8a62a2a5SMartin Matuska 
1800*8a62a2a5SMartin Matuska 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1801*8a62a2a5SMartin Matuska 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1802*8a62a2a5SMartin Matuska 			if (is_it_toupper)
1803*8a62a2a5SMartin Matuska 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1804*8a62a2a5SMartin Matuska 			else if (is_it_tolower)
1805*8a62a2a5SMartin Matuska 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1806*8a62a2a5SMartin Matuska 			else
1807*8a62a2a5SMartin Matuska 				u8s2[0] = *s2;
1808*8a62a2a5SMartin Matuska 			u8s2[1] = '\0';
1809*8a62a2a5SMartin Matuska 			sz2 = 1;
1810*8a62a2a5SMartin Matuska 			s2++;
1811*8a62a2a5SMartin Matuska 		} else {
1812*8a62a2a5SMartin Matuska 			state = U8_STATE_START;
1813*8a62a2a5SMartin Matuska 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1814*8a62a2a5SMartin Matuska 			    is_it_toupper, is_it_tolower,
1815*8a62a2a5SMartin Matuska 			    canonical_decomposition,
1816*8a62a2a5SMartin Matuska 			    compatibility_decomposition,
1817*8a62a2a5SMartin Matuska 			    canonical_composition, errnum, &state);
1818*8a62a2a5SMartin Matuska 		}
1819*8a62a2a5SMartin Matuska 
1820*8a62a2a5SMartin Matuska 		/*
1821*8a62a2a5SMartin Matuska 		 * Now compare the two characters. If they are the same,
1822*8a62a2a5SMartin Matuska 		 * we move on to the next character sequences.
1823*8a62a2a5SMartin Matuska 		 */
1824*8a62a2a5SMartin Matuska 		if (sz1 == 1 && sz2 == 1) {
1825*8a62a2a5SMartin Matuska 			if (*u8s1 > *u8s2)
1826*8a62a2a5SMartin Matuska 				return (1);
1827*8a62a2a5SMartin Matuska 			if (*u8s1 < *u8s2)
1828*8a62a2a5SMartin Matuska 				return (-1);
1829*8a62a2a5SMartin Matuska 		} else {
1830*8a62a2a5SMartin Matuska 			result = strcmp((const char *)u8s1, (const char *)u8s2);
1831*8a62a2a5SMartin Matuska 			if (result != 0)
1832*8a62a2a5SMartin Matuska 				return (result);
1833*8a62a2a5SMartin Matuska 		}
1834*8a62a2a5SMartin Matuska 	}
1835*8a62a2a5SMartin Matuska 
1836*8a62a2a5SMartin Matuska 	/*
1837*8a62a2a5SMartin Matuska 	 * We compared until the end of either or both strings.
1838*8a62a2a5SMartin Matuska 	 *
1839*8a62a2a5SMartin Matuska 	 * If we reached to or went over the ends for the both, that means
1840*8a62a2a5SMartin Matuska 	 * they are the same.
1841*8a62a2a5SMartin Matuska 	 *
1842*8a62a2a5SMartin Matuska 	 * If we reached only one end, that means the other string has
1843*8a62a2a5SMartin Matuska 	 * something which then can be used to determine the return value.
1844*8a62a2a5SMartin Matuska 	 */
1845*8a62a2a5SMartin Matuska 	if (s1 >= s1last) {
1846*8a62a2a5SMartin Matuska 		if (s2 >= s2last)
1847*8a62a2a5SMartin Matuska 			return (0);
1848*8a62a2a5SMartin Matuska 		return (-1);
1849*8a62a2a5SMartin Matuska 	}
1850*8a62a2a5SMartin Matuska 	return (1);
1851*8a62a2a5SMartin Matuska }
1852*8a62a2a5SMartin Matuska 
1853*8a62a2a5SMartin Matuska /*
1854*8a62a2a5SMartin Matuska  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1855*8a62a2a5SMartin Matuska  * the strcmp(). For the comparison, however, Unicode Normalization specific
1856*8a62a2a5SMartin Matuska  * equivalency and Unicode simple case conversion mappings based equivalency
1857*8a62a2a5SMartin Matuska  * can be requested and checked against.
1858*8a62a2a5SMartin Matuska  */
1859*8a62a2a5SMartin Matuska int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)1860*8a62a2a5SMartin Matuska u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1861*8a62a2a5SMartin Matuska     int *errnum)
1862*8a62a2a5SMartin Matuska {
1863*8a62a2a5SMartin Matuska 	int f;
1864*8a62a2a5SMartin Matuska 	size_t n1;
1865*8a62a2a5SMartin Matuska 	size_t n2;
1866*8a62a2a5SMartin Matuska 
1867*8a62a2a5SMartin Matuska 	*errnum = 0;
1868*8a62a2a5SMartin Matuska 
1869*8a62a2a5SMartin Matuska 	/*
1870*8a62a2a5SMartin Matuska 	 * Check on the requested Unicode version, case conversion, and
1871*8a62a2a5SMartin Matuska 	 * normalization flag values.
1872*8a62a2a5SMartin Matuska 	 */
1873*8a62a2a5SMartin Matuska 
1874*8a62a2a5SMartin Matuska 	if (uv > U8_UNICODE_LATEST) {
1875*8a62a2a5SMartin Matuska 		*errnum = ERANGE;
1876*8a62a2a5SMartin Matuska 		uv = U8_UNICODE_LATEST;
1877*8a62a2a5SMartin Matuska 	}
1878*8a62a2a5SMartin Matuska 
1879*8a62a2a5SMartin Matuska 	if (flag == 0) {
1880*8a62a2a5SMartin Matuska 		flag = U8_STRCMP_CS;
1881*8a62a2a5SMartin Matuska 	} else {
1882*8a62a2a5SMartin Matuska #ifdef U8_STRCMP_CI_LOWER
1883*8a62a2a5SMartin Matuska 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER
1884*8a62a2a5SMartin Matuska 		    | U8_STRCMP_CI_LOWER);
1885*8a62a2a5SMartin Matuska #else
1886*8a62a2a5SMartin Matuska 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER);
1887*8a62a2a5SMartin Matuska #endif
1888*8a62a2a5SMartin Matuska 		if (f == 0) {
1889*8a62a2a5SMartin Matuska 			flag |= U8_STRCMP_CS;
1890*8a62a2a5SMartin Matuska 		}
1891*8a62a2a5SMartin Matuska #ifdef U8_STRCMP_CI_LOWER
1892*8a62a2a5SMartin Matuska 		else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1893*8a62a2a5SMartin Matuska 		    f != U8_STRCMP_CI_LOWER)
1894*8a62a2a5SMartin Matuska #else
1895*8a62a2a5SMartin Matuska 		else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER)
1896*8a62a2a5SMartin Matuska #endif
1897*8a62a2a5SMartin Matuska 		{
1898*8a62a2a5SMartin Matuska 			*errnum = EBADF;
1899*8a62a2a5SMartin Matuska 			flag = U8_STRCMP_CS;
1900*8a62a2a5SMartin Matuska 		}
1901*8a62a2a5SMartin Matuska 
1902*8a62a2a5SMartin Matuska 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1903*8a62a2a5SMartin Matuska 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1904*8a62a2a5SMartin Matuska 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1905*8a62a2a5SMartin Matuska 			*errnum = EBADF;
1906*8a62a2a5SMartin Matuska 			flag = U8_STRCMP_CS;
1907*8a62a2a5SMartin Matuska 		}
1908*8a62a2a5SMartin Matuska 	}
1909*8a62a2a5SMartin Matuska 
1910*8a62a2a5SMartin Matuska 	if (flag == U8_STRCMP_CS) {
1911*8a62a2a5SMartin Matuska 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1912*8a62a2a5SMartin Matuska 	}
1913*8a62a2a5SMartin Matuska 
1914*8a62a2a5SMartin Matuska 	n1 = strlen(s1);
1915*8a62a2a5SMartin Matuska 	n2 = strlen(s2);
1916*8a62a2a5SMartin Matuska 	if (n != 0) {
1917*8a62a2a5SMartin Matuska 		if (n < n1)
1918*8a62a2a5SMartin Matuska 			n1 = n;
1919*8a62a2a5SMartin Matuska 		if (n < n2)
1920*8a62a2a5SMartin Matuska 			n2 = n;
1921*8a62a2a5SMartin Matuska 	}
1922*8a62a2a5SMartin Matuska 
1923*8a62a2a5SMartin Matuska 	/*
1924*8a62a2a5SMartin Matuska 	 * Simple case conversion can be done much faster and so we do
1925*8a62a2a5SMartin Matuska 	 * them separately here.
1926*8a62a2a5SMartin Matuska 	 */
1927*8a62a2a5SMartin Matuska 	if (flag == U8_STRCMP_CI_UPPER) {
1928*8a62a2a5SMartin Matuska 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1929*8a62a2a5SMartin Matuska 		    n1, n2, B_TRUE, errnum));
1930*8a62a2a5SMartin Matuska 	}
1931*8a62a2a5SMartin Matuska #ifdef U8_STRCMP_CI_LOWER
1932*8a62a2a5SMartin Matuska 	else if (flag == U8_STRCMP_CI_LOWER) {
1933*8a62a2a5SMartin Matuska 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1934*8a62a2a5SMartin Matuska 		    n1, n2, B_FALSE, errnum));
1935*8a62a2a5SMartin Matuska 	}
1936*8a62a2a5SMartin Matuska #endif
1937*8a62a2a5SMartin Matuska 
1938*8a62a2a5SMartin Matuska 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1939*8a62a2a5SMartin Matuska 	    flag, errnum));
1940*8a62a2a5SMartin Matuska }
1941*8a62a2a5SMartin Matuska 
1942*8a62a2a5SMartin Matuska size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)1943*8a62a2a5SMartin Matuska u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1944*8a62a2a5SMartin Matuska     int flag, size_t unicode_version, int *errnum)
1945*8a62a2a5SMartin Matuska {
1946*8a62a2a5SMartin Matuska 	int f;
1947*8a62a2a5SMartin Matuska 	int sz;
1948*8a62a2a5SMartin Matuska 	uchar_t *ib;
1949*8a62a2a5SMartin Matuska 	uchar_t *ibtail;
1950*8a62a2a5SMartin Matuska 	uchar_t *ob;
1951*8a62a2a5SMartin Matuska 	uchar_t *obtail;
1952*8a62a2a5SMartin Matuska 	boolean_t do_not_ignore_null;
1953*8a62a2a5SMartin Matuska 	boolean_t do_not_ignore_invalid;
1954*8a62a2a5SMartin Matuska 	boolean_t is_it_toupper;
1955*8a62a2a5SMartin Matuska 	boolean_t is_it_tolower;
1956*8a62a2a5SMartin Matuska 	boolean_t canonical_decomposition;
1957*8a62a2a5SMartin Matuska 	boolean_t compatibility_decomposition;
1958*8a62a2a5SMartin Matuska 	boolean_t canonical_composition;
1959*8a62a2a5SMartin Matuska 	size_t ret_val;
1960*8a62a2a5SMartin Matuska 	size_t i;
1961*8a62a2a5SMartin Matuska 	size_t j;
1962*8a62a2a5SMartin Matuska 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1963*8a62a2a5SMartin Matuska 	u8_normalization_states_t state;
1964*8a62a2a5SMartin Matuska 
1965*8a62a2a5SMartin Matuska 	if (unicode_version > U8_UNICODE_LATEST) {
1966*8a62a2a5SMartin Matuska 		*errnum = ERANGE;
1967*8a62a2a5SMartin Matuska 		return ((size_t)-1);
1968*8a62a2a5SMartin Matuska 	}
1969*8a62a2a5SMartin Matuska 
1970*8a62a2a5SMartin Matuska #ifdef U8_TEXTPREP_TOLOWER
1971*8a62a2a5SMartin Matuska 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1972*8a62a2a5SMartin Matuska 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1973*8a62a2a5SMartin Matuska 		*errnum = EBADF;
1974*8a62a2a5SMartin Matuska 		return ((size_t)-1);
1975*8a62a2a5SMartin Matuska 	}
1976*8a62a2a5SMartin Matuska #endif
1977*8a62a2a5SMartin Matuska 
1978*8a62a2a5SMartin Matuska 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1979*8a62a2a5SMartin Matuska 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1980*8a62a2a5SMartin Matuska 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1981*8a62a2a5SMartin Matuska 		*errnum = EBADF;
1982*8a62a2a5SMartin Matuska 		return ((size_t)-1);
1983*8a62a2a5SMartin Matuska 	}
1984*8a62a2a5SMartin Matuska 
1985*8a62a2a5SMartin Matuska 	if (inarray == NULL || *inlen == 0)
1986*8a62a2a5SMartin Matuska 		return (0);
1987*8a62a2a5SMartin Matuska 
1988*8a62a2a5SMartin Matuska 	if (outarray == NULL) {
1989*8a62a2a5SMartin Matuska 		*errnum = E2BIG;
1990*8a62a2a5SMartin Matuska 		return ((size_t)-1);
1991*8a62a2a5SMartin Matuska 	}
1992*8a62a2a5SMartin Matuska 
1993*8a62a2a5SMartin Matuska 	ib = (uchar_t *)inarray;
1994*8a62a2a5SMartin Matuska 	ob = (uchar_t *)outarray;
1995*8a62a2a5SMartin Matuska 	ibtail = ib + *inlen;
1996*8a62a2a5SMartin Matuska 	obtail = ob + *outlen;
1997*8a62a2a5SMartin Matuska 
1998*8a62a2a5SMartin Matuska 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1999*8a62a2a5SMartin Matuska 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
2000*8a62a2a5SMartin Matuska 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
2001*8a62a2a5SMartin Matuska #ifdef U8_TEXTPREP_TOLOWER
2002*8a62a2a5SMartin Matuska 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
2003*8a62a2a5SMartin Matuska #else
2004*8a62a2a5SMartin Matuska 	is_it_tolower = 0;
2005*8a62a2a5SMartin Matuska #endif
2006*8a62a2a5SMartin Matuska 
2007*8a62a2a5SMartin Matuska 	ret_val = 0;
2008*8a62a2a5SMartin Matuska 
2009*8a62a2a5SMartin Matuska 	/*
2010*8a62a2a5SMartin Matuska 	 * If we don't have a normalization flag set, we do the simple case
2011*8a62a2a5SMartin Matuska 	 * conversion based text preparation separately below. Text
2012*8a62a2a5SMartin Matuska 	 * preparation involving Normalization will be done in the false task
2013*8a62a2a5SMartin Matuska 	 * block, again, separately since it will take much more time and
2014*8a62a2a5SMartin Matuska 	 * resource than doing simple case conversions.
2015*8a62a2a5SMartin Matuska 	 */
2016*8a62a2a5SMartin Matuska 	if (f == 0) {
2017*8a62a2a5SMartin Matuska 		while (ib < ibtail) {
2018*8a62a2a5SMartin Matuska 			if (*ib == '\0' && do_not_ignore_null)
2019*8a62a2a5SMartin Matuska 				break;
2020*8a62a2a5SMartin Matuska 
2021*8a62a2a5SMartin Matuska 			sz = u8_number_of_bytes[*ib];
2022*8a62a2a5SMartin Matuska 
2023*8a62a2a5SMartin Matuska 			if (sz < 0) {
2024*8a62a2a5SMartin Matuska 				if (do_not_ignore_invalid) {
2025*8a62a2a5SMartin Matuska 					*errnum = EILSEQ;
2026*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2027*8a62a2a5SMartin Matuska 					break;
2028*8a62a2a5SMartin Matuska 				}
2029*8a62a2a5SMartin Matuska 
2030*8a62a2a5SMartin Matuska 				sz = 1;
2031*8a62a2a5SMartin Matuska 				ret_val++;
2032*8a62a2a5SMartin Matuska 			}
2033*8a62a2a5SMartin Matuska 
2034*8a62a2a5SMartin Matuska 			if (sz == 1) {
2035*8a62a2a5SMartin Matuska 				if (ob >= obtail) {
2036*8a62a2a5SMartin Matuska 					*errnum = E2BIG;
2037*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2038*8a62a2a5SMartin Matuska 					break;
2039*8a62a2a5SMartin Matuska 				}
2040*8a62a2a5SMartin Matuska 
2041*8a62a2a5SMartin Matuska 				if (is_it_toupper)
2042*8a62a2a5SMartin Matuska 					*ob = U8_ASCII_TOUPPER(*ib);
2043*8a62a2a5SMartin Matuska 				else if (is_it_tolower)
2044*8a62a2a5SMartin Matuska 					*ob = U8_ASCII_TOLOWER(*ib);
2045*8a62a2a5SMartin Matuska 				else
2046*8a62a2a5SMartin Matuska 					*ob = *ib;
2047*8a62a2a5SMartin Matuska 				ib++;
2048*8a62a2a5SMartin Matuska 				ob++;
2049*8a62a2a5SMartin Matuska 			} else if ((ib + sz) > ibtail) {
2050*8a62a2a5SMartin Matuska 				if (do_not_ignore_invalid) {
2051*8a62a2a5SMartin Matuska 					*errnum = EINVAL;
2052*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2053*8a62a2a5SMartin Matuska 					break;
2054*8a62a2a5SMartin Matuska 				}
2055*8a62a2a5SMartin Matuska 
2056*8a62a2a5SMartin Matuska 				if ((obtail - ob) < (ibtail - ib)) {
2057*8a62a2a5SMartin Matuska 					*errnum = E2BIG;
2058*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2059*8a62a2a5SMartin Matuska 					break;
2060*8a62a2a5SMartin Matuska 				}
2061*8a62a2a5SMartin Matuska 
2062*8a62a2a5SMartin Matuska 				/*
2063*8a62a2a5SMartin Matuska 				 * We treat the remaining incomplete character
2064*8a62a2a5SMartin Matuska 				 * bytes as a character.
2065*8a62a2a5SMartin Matuska 				 */
2066*8a62a2a5SMartin Matuska 				ret_val++;
2067*8a62a2a5SMartin Matuska 
2068*8a62a2a5SMartin Matuska 				while (ib < ibtail)
2069*8a62a2a5SMartin Matuska 					*ob++ = *ib++;
2070*8a62a2a5SMartin Matuska 			} else {
2071*8a62a2a5SMartin Matuska 				if (is_it_toupper || is_it_tolower) {
2072*8a62a2a5SMartin Matuska 					i = do_case_conv(unicode_version, u8s,
2073*8a62a2a5SMartin Matuska 					    ib, sz, is_it_toupper);
2074*8a62a2a5SMartin Matuska 
2075*8a62a2a5SMartin Matuska 					if ((obtail - ob) < i) {
2076*8a62a2a5SMartin Matuska 						*errnum = E2BIG;
2077*8a62a2a5SMartin Matuska 						ret_val = (size_t)-1;
2078*8a62a2a5SMartin Matuska 						break;
2079*8a62a2a5SMartin Matuska 					}
2080*8a62a2a5SMartin Matuska 
2081*8a62a2a5SMartin Matuska 					ib += sz;
2082*8a62a2a5SMartin Matuska 
2083*8a62a2a5SMartin Matuska 					for (sz = 0; sz < i; sz++)
2084*8a62a2a5SMartin Matuska 						*ob++ = u8s[sz];
2085*8a62a2a5SMartin Matuska 				} else {
2086*8a62a2a5SMartin Matuska 					if ((obtail - ob) < sz) {
2087*8a62a2a5SMartin Matuska 						*errnum = E2BIG;
2088*8a62a2a5SMartin Matuska 						ret_val = (size_t)-1;
2089*8a62a2a5SMartin Matuska 						break;
2090*8a62a2a5SMartin Matuska 					}
2091*8a62a2a5SMartin Matuska 
2092*8a62a2a5SMartin Matuska 					for (i = 0; i < sz; i++)
2093*8a62a2a5SMartin Matuska 						*ob++ = *ib++;
2094*8a62a2a5SMartin Matuska 				}
2095*8a62a2a5SMartin Matuska 			}
2096*8a62a2a5SMartin Matuska 		}
2097*8a62a2a5SMartin Matuska 	} else {
2098*8a62a2a5SMartin Matuska 		canonical_decomposition = flag & U8_CANON_DECOMP;
2099*8a62a2a5SMartin Matuska 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2100*8a62a2a5SMartin Matuska 		canonical_composition = flag & U8_CANON_COMP;
2101*8a62a2a5SMartin Matuska 
2102*8a62a2a5SMartin Matuska 		while (ib < ibtail) {
2103*8a62a2a5SMartin Matuska 			if (*ib == '\0' && do_not_ignore_null)
2104*8a62a2a5SMartin Matuska 				break;
2105*8a62a2a5SMartin Matuska 
2106*8a62a2a5SMartin Matuska 			/*
2107*8a62a2a5SMartin Matuska 			 * If the current character is a 7-bit ASCII
2108*8a62a2a5SMartin Matuska 			 * character and it is the last character, or,
2109*8a62a2a5SMartin Matuska 			 * if the current character is a 7-bit ASCII
2110*8a62a2a5SMartin Matuska 			 * character and the next character is also a 7-bit
2111*8a62a2a5SMartin Matuska 			 * ASCII character, then, we copy over this
2112*8a62a2a5SMartin Matuska 			 * character without going through collect_a_seq().
2113*8a62a2a5SMartin Matuska 			 *
2114*8a62a2a5SMartin Matuska 			 * In any other cases, we need to look further with
2115*8a62a2a5SMartin Matuska 			 * the collect_a_seq() function.
2116*8a62a2a5SMartin Matuska 			 */
2117*8a62a2a5SMartin Matuska 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2118*8a62a2a5SMartin Matuska 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2119*8a62a2a5SMartin Matuska 				if (ob >= obtail) {
2120*8a62a2a5SMartin Matuska 					*errnum = E2BIG;
2121*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2122*8a62a2a5SMartin Matuska 					break;
2123*8a62a2a5SMartin Matuska 				}
2124*8a62a2a5SMartin Matuska 
2125*8a62a2a5SMartin Matuska 				if (is_it_toupper)
2126*8a62a2a5SMartin Matuska 					*ob = U8_ASCII_TOUPPER(*ib);
2127*8a62a2a5SMartin Matuska 				else if (is_it_tolower)
2128*8a62a2a5SMartin Matuska 					*ob = U8_ASCII_TOLOWER(*ib);
2129*8a62a2a5SMartin Matuska 				else
2130*8a62a2a5SMartin Matuska 					*ob = *ib;
2131*8a62a2a5SMartin Matuska 				ib++;
2132*8a62a2a5SMartin Matuska 				ob++;
2133*8a62a2a5SMartin Matuska 			} else {
2134*8a62a2a5SMartin Matuska 				*errnum = 0;
2135*8a62a2a5SMartin Matuska 				state = U8_STATE_START;
2136*8a62a2a5SMartin Matuska 
2137*8a62a2a5SMartin Matuska 				j = collect_a_seq(unicode_version, u8s,
2138*8a62a2a5SMartin Matuska 				    &ib, ibtail,
2139*8a62a2a5SMartin Matuska 				    is_it_toupper,
2140*8a62a2a5SMartin Matuska 				    is_it_tolower,
2141*8a62a2a5SMartin Matuska 				    canonical_decomposition,
2142*8a62a2a5SMartin Matuska 				    compatibility_decomposition,
2143*8a62a2a5SMartin Matuska 				    canonical_composition,
2144*8a62a2a5SMartin Matuska 				    errnum, &state);
2145*8a62a2a5SMartin Matuska 
2146*8a62a2a5SMartin Matuska 				if (*errnum && do_not_ignore_invalid) {
2147*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2148*8a62a2a5SMartin Matuska 					break;
2149*8a62a2a5SMartin Matuska 				}
2150*8a62a2a5SMartin Matuska 
2151*8a62a2a5SMartin Matuska 				if ((obtail - ob) < j) {
2152*8a62a2a5SMartin Matuska 					*errnum = E2BIG;
2153*8a62a2a5SMartin Matuska 					ret_val = (size_t)-1;
2154*8a62a2a5SMartin Matuska 					break;
2155*8a62a2a5SMartin Matuska 				}
2156*8a62a2a5SMartin Matuska 
2157*8a62a2a5SMartin Matuska 				for (i = 0; i < j; i++)
2158*8a62a2a5SMartin Matuska 					*ob++ = u8s[i];
2159*8a62a2a5SMartin Matuska 			}
2160*8a62a2a5SMartin Matuska 		}
2161*8a62a2a5SMartin Matuska 	}
2162*8a62a2a5SMartin Matuska 
2163*8a62a2a5SMartin Matuska 	*inlen = ibtail - ib;
2164*8a62a2a5SMartin Matuska 	*outlen = obtail - ob;
2165*8a62a2a5SMartin Matuska 
2166*8a62a2a5SMartin Matuska 	return (ret_val);
2167*8a62a2a5SMartin Matuska }
2168*8a62a2a5SMartin Matuska 
/*
 * Hand the public entry points of this file to EXPORT_SYMBOL().
 * NOTE(review): presumably this is what makes them callable from other
 * kernel modules -- confirm against the platform's EXPORT_SYMBOL definition.
 */
2169*8a62a2a5SMartin Matuska EXPORT_SYMBOL(u8_validate);
2170*8a62a2a5SMartin Matuska EXPORT_SYMBOL(u8_strcmp);
2171*8a62a2a5SMartin Matuska EXPORT_SYMBOL(u8_textprep_str);
2172