xref: /freebsd/sys/contrib/openzfs/module/unicode/u8_textprep.c (revision 5c65a0a9163cc00389d8527ee12c4e69df07ea42)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2022 MNX Cloud, Inc.
28  */
29 
30 
31 
32 /*
33  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
34  *
35  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
36  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
37  * the section 3C man pages.
38  * Interface stability: Committed.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/string.h>
43 #include <sys/param.h>
44 #include <sys/sysmacros.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/sunddi.h>
48 #include <sys/u8_textprep.h>
49 #include <sys/byteorder.h>
50 #include <sys/errno.h>
51 #include <sys/u8_textprep_data.h>
52 #include <sys/mod.h>
53 
54 /*
 * The maximum possible number of bytes in a UTF-8 character. Also used in
 * this file to size per-character scratch buffers and to bound the length
 * of a single case mapping.
 */
55 #define	U8_MB_CUR_MAX			(4)
56 
57 /*
58  * The maximum number of bytes needed for a UTF-8 character to cover
59  * U+0000 - U+FFFF, i.e., the coding space of the now-deprecated UCS-2.
60  */
61 #define	U8_MAX_BYTES_UCS2		(3)
62 
63 /* The maximum possible number of bytes in a Stream-Safe Text. */
64 #define	U8_STREAM_SAFE_TEXT_MAX		(128)
65 
66 /*
67  * The maximum number of characters in a combining/conjoining sequence and
68  * the actual upper bound limit of a combining/conjoining sequence.
69  */
70 #define	U8_MAX_CHARS_A_SEQ		(32)
71 #define	U8_UPPER_LIMIT_IN_A_SEQ		(31)
72 
73 /* The combining class value for a Starter. */
74 #define	U8_COMBINING_CLASS_STARTER	(0)
75 
76 /*
77  * Hangul-related macros follow.
78  *
79  * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
80  * Vowels, and optional Trailing consonants in Unicode scalar values.
81  *
82  * Note that U8_HANGUL_JAMO_T_FIRST is defined below as 0x11A7 rather than
83  * the actual U+11A8. Because the trailing consonant is optional, one is
84  * subtracted in advance so that a trailing index of zero means "none".
85  *
86  * Each of 19 modern leading consonants has total 588 possible syllables since
87  * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
88  * no trailing consonant case, i.e., 21 x 28 = 588.
89  *
90  * There are also a number of Hangul-related helper macros below. Bear in mind
91  * that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a character may
92  * be a Hangul Jamo, but a match does not guarantee that it is one; it only
93  * means that it most likely is.
94  */
95 #define	U8_HANGUL_SYL_FIRST		(0xAC00U)
96 #define	U8_HANGUL_SYL_LAST		(0xD7A3U)
97 
98 #define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
99 #define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
100 #define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
101 #define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
102 #define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
103 #define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)
104 
105 #define	U8_HANGUL_V_COUNT		(21)
106 #define	U8_HANGUL_VT_COUNT		(588)
107 #define	U8_HANGUL_T_COUNT		(28)
108 
109 #define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
110 
/*
 * Store the three-byte UTF-8 encoding of the scalar value (b) into
 * (s)[(i)], (s)[(j)], and (s)[(k)]. The macro expands to three separate
 * statements, so it must only be used where a statement list is valid
 * (never as the unbraced body of an if, else, or loop).
 */
111 #define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
112 	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
113 	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
114 	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
115 
/* Range tests on a Unicode scalar value (u); (u) is evaluated twice. */
116 #define	U8_HANGUL_JAMO_L(u) \
117 	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
118 
119 #define	U8_HANGUL_JAMO_V(u) \
120 	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
121 
/*
 * The lower bound is exclusive here because U8_HANGUL_JAMO_T_FIRST is one
 * less than the first real trailing consonant.
 */
122 #define	U8_HANGUL_JAMO_T(u) \
123 	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
124 
125 #define	U8_HANGUL_JAMO(u) \
126 	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
127 
128 #define	U8_HANGUL_SYLLABLE(u) \
129 	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
130 
/*
 * True when the normalization state (s) and the next scalar value (u)
 * form an L+V or an LV+T pair, respectively.
 */
131 #define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
132 	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
133 
134 #define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
135 	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
136 
137 /*
 * The types of decomposition mappings. These values appear as the leading
 * marker byte of an entry in the final decomposition table.
 */
138 #define	U8_DECOMP_BOTH			(0xF5U)
139 #define	U8_DECOMP_CANONICAL		(0xF6U)
140 
141 /*
 * The indicator for a 16-bit table: a table id at or above this value
 * selects the 16-bit variant of the fourth-byte table.
 */
142 #define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
143 
144 /*
 * The following are some convenience macros. Several of them expand to
 * more than one statement (or carry their own trailing semicolon), so they
 * must only be used where a statement list is valid.
 */
145 #define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3)  \
146 	(u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
147 		(((uint32_t)(b2) & 0x3F) << 6)  | \
148 		((uint32_t)(b3) & 0x3F));
149 
150 #define	U8_SIMPLE_SWAP(a, b, t) \
151 	(t) = (a); \
152 	(a) = (b); \
153 	(b) = (t);
154 
155 #define	U8_ASCII_TOUPPER(c) \
156 	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
157 
158 #define	U8_ASCII_TOLOWER(c) \
159 	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
160 
161 #define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
162 /*
163  * The following macro assumes that the two characters that are to be
164  * swapped are adjacent to each other and 'a' comes before 'b'.
165  *
166  * If the assumptions are not met, then, the macro will fail.
 *
 * The macro is not self-contained: it reads and writes the caller's local
 * variables k, u8t, u8s, start, disp, comb_class, and tc, all of which
 * must be in scope at the point of use.
167  */
168 #define	U8_SWAP_COMB_MARKS(a, b) \
169 	for (k = 0; k < disp[(a)]; k++) \
170 		u8t[k] = u8s[start[(a)] + k]; \
171 	for (k = 0; k < disp[(b)]; k++) \
172 		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
173 	start[(b)] = start[(a)] + disp[(b)]; \
174 	for (k = 0; k < disp[(a)]; k++) \
175 		u8s[start[(b)] + k] = u8t[k]; \
176 	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
177 	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
178 
/*
 * The possible states during normalization. The enumerators are numbered
 * consecutively from zero, in the order listed.
 */
typedef enum {
	U8_STATE_START,		/* 0: nothing Hangul-related pending */
	U8_STATE_HANGUL_L,	/* 1: a leading consonant Jamo was seen */
	U8_STATE_HANGUL_LV,	/* 2: leading consonant plus vowel */
	U8_STATE_HANGUL_LVT,	/* 3: leading, vowel, and trailing */
	U8_STATE_HANGUL_V,	/* 4: a vowel Jamo not preceded by L */
	U8_STATE_HANGUL_T,	/* 5: a trailing Jamo not preceded by LV */
	U8_STATE_COMBINING_MARK	/* 6: presumably a combining mark was seen */
} u8_normalization_states_t;
189 
190 /*
191  * The three vectors below are used to check that the bytes of a given
192  * UTF-8 character are valid and contain no malformed byte values.
193  *
194  * UTF-8 used to have a quite relaxed binary representation, but that
195  * led to security-related issues, so the Unicode Consortium defined
196  * and announced the UTF-8 Corrigendum in Unicode 3.1 and then refined
197  * it once more in Unicode 3.2. The following three tables are based
198  * on that.
199  */
200 
201 #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
202 
203 #define	I_				U8_ILLEGAL_CHAR
204 #define	O_				U8_OUT_OF_RANGE_CHAR
205 
206 static const int8_t u8_number_of_bytes[0x100] = {
207 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
208 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
209 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
210 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
211 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
212 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
213 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
214 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
215 
216 /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
217 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
218 
219 /*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
220 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
221 
222 /*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
223 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
224 
225 /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
226 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
227 
228 /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
229 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
230 
231 /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
232 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
233 
234 /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
235 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
236 
237 /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
238 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
239 };
240 
241 #undef	I_
242 #undef	O_
243 
/*
 * Smallest legal value of the second byte, indexed by the first byte of a
 * multi-byte UTF-8 character. Only the entries for C2..F4 are non-zero;
 * every entry not listed below is implicitly zero.
 */
static const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* C2..CF */
	[0xC2] = 0x80, [0xC3] = 0x80, [0xC4] = 0x80, [0xC5] = 0x80,
	[0xC6] = 0x80, [0xC7] = 0x80, [0xC8] = 0x80, [0xC9] = 0x80,
	[0xCA] = 0x80, [0xCB] = 0x80, [0xCC] = 0x80, [0xCD] = 0x80,
	[0xCE] = 0x80, [0xCF] = 0x80,

	/* D0..DF */
	[0xD0] = 0x80, [0xD1] = 0x80, [0xD2] = 0x80, [0xD3] = 0x80,
	[0xD4] = 0x80, [0xD5] = 0x80, [0xD6] = 0x80, [0xD7] = 0x80,
	[0xD8] = 0x80, [0xD9] = 0x80, [0xDA] = 0x80, [0xDB] = 0x80,
	[0xDC] = 0x80, [0xDD] = 0x80, [0xDE] = 0x80, [0xDF] = 0x80,

	/* E0..EF; E0 has a raised lower bound. */
	[0xE0] = 0xa0, [0xE1] = 0x80, [0xE2] = 0x80, [0xE3] = 0x80,
	[0xE4] = 0x80, [0xE5] = 0x80, [0xE6] = 0x80, [0xE7] = 0x80,
	[0xE8] = 0x80, [0xE9] = 0x80, [0xEA] = 0x80, [0xEB] = 0x80,
	[0xEC] = 0x80, [0xED] = 0x80, [0xEE] = 0x80, [0xEF] = 0x80,

	/* F0..F4; F0 has a raised lower bound. */
	[0xF0] = 0x90, [0xF1] = 0x80, [0xF2] = 0x80, [0xF3] = 0x80,
	[0xF4] = 0x80,
};
285 
/*
 * Largest legal value of the second byte, indexed by the first byte of a
 * multi-byte UTF-8 character. Only the entries for C2..F4 are non-zero;
 * every entry not listed below is implicitly zero.
 */
static const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* C2..CF */
	[0xC2] = 0xbf, [0xC3] = 0xbf, [0xC4] = 0xbf, [0xC5] = 0xbf,
	[0xC6] = 0xbf, [0xC7] = 0xbf, [0xC8] = 0xbf, [0xC9] = 0xbf,
	[0xCA] = 0xbf, [0xCB] = 0xbf, [0xCC] = 0xbf, [0xCD] = 0xbf,
	[0xCE] = 0xbf, [0xCF] = 0xbf,

	/* D0..DF */
	[0xD0] = 0xbf, [0xD1] = 0xbf, [0xD2] = 0xbf, [0xD3] = 0xbf,
	[0xD4] = 0xbf, [0xD5] = 0xbf, [0xD6] = 0xbf, [0xD7] = 0xbf,
	[0xD8] = 0xbf, [0xD9] = 0xbf, [0xDA] = 0xbf, [0xDB] = 0xbf,
	[0xDC] = 0xbf, [0xDD] = 0xbf, [0xDE] = 0xbf, [0xDF] = 0xbf,

	/* E0..EF; ED has a lowered upper bound. */
	[0xE0] = 0xbf, [0xE1] = 0xbf, [0xE2] = 0xbf, [0xE3] = 0xbf,
	[0xE4] = 0xbf, [0xE5] = 0xbf, [0xE6] = 0xbf, [0xE7] = 0xbf,
	[0xE8] = 0xbf, [0xE9] = 0xbf, [0xEA] = 0xbf, [0xEB] = 0xbf,
	[0xEC] = 0xbf, [0xED] = 0x9f, [0xEE] = 0xbf, [0xEF] = 0xbf,

	/* F0..F4; F4 has a lowered upper bound. */
	[0xF0] = 0xbf, [0xF1] = 0xbf, [0xF2] = 0xbf, [0xF3] = 0xbf,
	[0xF4] = 0x8f,
};
327 
328 
329 /*
330  * The u8_validate() validates on the given UTF-8 character string and
331  * calculate the byte length. It is quite similar to mblen(3C) except that
332  * this will validate against the list of characters if required and
333  * specific to UTF-8 and Unicode.
334  */
335 int
u8_validate(const char * u8str,size_t n,char ** list,int flag,int * errnum)336 u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
337 {
338 	uchar_t *ib;
339 	uchar_t *ibtail;
340 	uchar_t **p;
341 	uchar_t *s1;
342 	uchar_t *s2;
343 	uchar_t f;
344 	int sz;
345 	size_t i;
346 	int ret_val;
347 	boolean_t second;
348 	boolean_t no_need_to_validate_entire;
349 	boolean_t check_additional;
350 	boolean_t validate_ucs2_range_only;
351 
352 	if (! u8str)
353 		return (0);
354 
355 	ib = (uchar_t *)u8str;
356 	ibtail = ib + n;
357 
358 	ret_val = 0;
359 
360 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
361 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
362 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
363 
364 	while (ib < ibtail) {
365 		/*
366 		 * The first byte of a UTF-8 character tells how many
367 		 * bytes will follow for the character. If the first byte
368 		 * is an illegal byte value or out of range value, we just
369 		 * return -1 with an appropriate error number.
370 		 */
371 		sz = u8_number_of_bytes[*ib];
372 		if (sz == U8_ILLEGAL_CHAR) {
373 			*errnum = EILSEQ;
374 			return (-1);
375 		}
376 
377 		if (sz == U8_OUT_OF_RANGE_CHAR ||
378 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
379 			*errnum = ERANGE;
380 			return (-1);
381 		}
382 
383 		/*
384 		 * If we don't have enough bytes to check on, that's also
385 		 * an error. As you can see, we give illegal byte sequence
386 		 * checking higher priority then EINVAL cases.
387 		 */
388 		if ((ibtail - ib) < sz) {
389 			*errnum = EINVAL;
390 			return (-1);
391 		}
392 
393 		if (sz == 1) {
394 			ib++;
395 			ret_val++;
396 		} else {
397 			/*
398 			 * Check on the multi-byte UTF-8 character. For more
399 			 * details on this, see comment added for the used
400 			 * data structures at the beginning of the file.
401 			 */
402 			f = *ib++;
403 			ret_val++;
404 			second = B_TRUE;
405 			for (i = 1; i < sz; i++) {
406 				if (second) {
407 					if (*ib < u8_valid_min_2nd_byte[f] ||
408 					    *ib > u8_valid_max_2nd_byte[f]) {
409 						*errnum = EILSEQ;
410 						return (-1);
411 					}
412 					second = B_FALSE;
413 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
414 					*errnum = EILSEQ;
415 					return (-1);
416 				}
417 				ib++;
418 				ret_val++;
419 			}
420 		}
421 
422 		if (check_additional) {
423 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
424 				s1 = ib - sz;
425 				s2 = p[i];
426 				while (s1 < ib) {
427 					if (*s1 != *s2 || *s2 == '\0')
428 						break;
429 					s1++;
430 					s2++;
431 				}
432 
433 				if (s1 >= ib && *s2 == '\0') {
434 					*errnum = EBADF;
435 					return (-1);
436 				}
437 			}
438 		}
439 
440 		if (no_need_to_validate_entire)
441 			break;
442 	}
443 
444 	return (ret_val);
445 }
446 
447 /*
448  * The do_case_conv() looks at the mapping tables and returns found
449  * bytes if any. If not found, the input bytes are returned. The function
450  * always terminate the return bytes with a null character assuming that
451  * there are plenty of room to do so.
452  *
453  * The case conversions are simple case conversions mapping a character to
454  * another character as specified in the Unicode data. The byte size of
455  * the mapped character could be different from that of the input character.
456  *
457  * The return value is the byte length of the returned character excluding
458  * the terminating null byte.
459  */
460 static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)461 do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
462 {
463 	size_t i;
464 	uint16_t b1 = 0;
465 	uint16_t b2 = 0;
466 	uint16_t b3 = 0;
467 	uint16_t b3_tbl;
468 	uint16_t b3_base;
469 	uint16_t b4 = 0;
470 	size_t start_id;
471 	size_t end_id;
472 
473 	/*
474 	 * At this point, the only possible values for sz are 2, 3, and 4.
475 	 * The u8s should point to a vector that is well beyond the size of
476 	 * 5 bytes.
477 	 */
478 	if (sz == 2) {
479 		b3 = u8s[0] = s[0];
480 		b4 = u8s[1] = s[1];
481 	} else if (sz == 3) {
482 		b2 = u8s[0] = s[0];
483 		b3 = u8s[1] = s[1];
484 		b4 = u8s[2] = s[2];
485 	} else if (sz == 4) {
486 		b1 = u8s[0] = s[0];
487 		b2 = u8s[1] = s[1];
488 		b3 = u8s[2] = s[2];
489 		b4 = u8s[3] = s[3];
490 	} else {
491 		/* This is not possible but just in case as a fallback. */
492 		if (is_it_toupper)
493 			*u8s = U8_ASCII_TOUPPER(*s);
494 		else
495 			*u8s = U8_ASCII_TOLOWER(*s);
496 		u8s[1] = '\0';
497 
498 		return (1);
499 	}
500 	u8s[sz] = '\0';
501 
502 	/*
503 	 * Let's find out if we have a corresponding character.
504 	 */
505 	b1 = u8_common_b1_tbl[uv][b1];
506 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
507 		return ((size_t)sz);
508 
509 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
510 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
511 		return ((size_t)sz);
512 
513 	if (is_it_toupper) {
514 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
515 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
516 			return ((size_t)sz);
517 
518 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
519 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
520 
521 		/* Either there is no match or an error at the table. */
522 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
523 			return ((size_t)sz);
524 
525 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
526 
527 		for (i = 0; start_id < end_id; start_id++)
528 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
529 	} else {
530 #ifdef U8_STRCMP_CI_LOWER
531 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
532 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
533 			return ((size_t)sz);
534 
535 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
536 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
537 
538 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
539 			return ((size_t)sz);
540 
541 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
542 
543 		for (i = 0; start_id < end_id; start_id++)
544 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
545 #else
546 		__builtin_unreachable();
547 #endif
548 	}
549 
550 	/*
551 	 * If i is still zero, that means there is no corresponding character.
552 	 */
553 	if (i == 0)
554 		return ((size_t)sz);
555 
556 	u8s[i] = '\0';
557 
558 	return (i);
559 }
560 
561 /*
562  * The do_case_compare() function compares the two input strings, s1 and s2,
563  * one character at a time doing case conversions if applicable and return
564  * the comparison result as like strcmp().
565  *
566  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
567  * we treat the 7-bit ASCII characters as a special case trying to yield
568  * faster processing time.
569  */
570 static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)571 do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
572     size_t n2, boolean_t is_it_toupper, int *errnum)
573 {
574 	int f;
575 	int sz1;
576 	int sz2;
577 	size_t j;
578 	size_t i1;
579 	size_t i2;
580 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
581 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
582 
583 	i1 = i2 = 0;
584 	while (i1 < n1 && i2 < n2) {
585 		/*
586 		 * Find out what would be the byte length for this UTF-8
587 		 * character at string s1 and also find out if this is
588 		 * an illegal start byte or not and if so, issue a proper
589 		 * error number and yet treat this byte as a character.
590 		 */
591 		sz1 = u8_number_of_bytes[*s1];
592 		if (sz1 < 0) {
593 			*errnum = EILSEQ;
594 			sz1 = 1;
595 		}
596 
597 		/*
598 		 * For 7-bit ASCII characters mainly, we do a quick case
599 		 * conversion right at here.
600 		 *
601 		 * If we don't have enough bytes for this character, issue
602 		 * an EINVAL error and use what are available.
603 		 *
604 		 * If we have enough bytes, find out if there is
605 		 * a corresponding uppercase character and if so, copy over
606 		 * the bytes for a comparison later. If there is no
607 		 * corresponding uppercase character, then, use what we have
608 		 * for the comparison.
609 		 */
610 		if (sz1 == 1) {
611 			if (is_it_toupper)
612 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
613 			else
614 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
615 			s1++;
616 			u8s1[1] = '\0';
617 		} else if ((i1 + sz1) > n1) {
618 			*errnum = EINVAL;
619 			for (j = 0; (i1 + j) < n1; )
620 				u8s1[j++] = *s1++;
621 			u8s1[j] = '\0';
622 		} else {
623 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
624 			s1 += sz1;
625 		}
626 
627 		/* Do the same for the string s2. */
628 		sz2 = u8_number_of_bytes[*s2];
629 		if (sz2 < 0) {
630 			*errnum = EILSEQ;
631 			sz2 = 1;
632 		}
633 
634 		if (sz2 == 1) {
635 			if (is_it_toupper)
636 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
637 			else
638 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
639 			s2++;
640 			u8s2[1] = '\0';
641 		} else if ((i2 + sz2) > n2) {
642 			*errnum = EINVAL;
643 			for (j = 0; (i2 + j) < n2; )
644 				u8s2[j++] = *s2++;
645 			u8s2[j] = '\0';
646 		} else {
647 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
648 			s2 += sz2;
649 		}
650 
651 		/* Now compare the two characters. */
652 		if (sz1 == 1 && sz2 == 1) {
653 			if (*u8s1 > *u8s2)
654 				return (1);
655 			if (*u8s1 < *u8s2)
656 				return (-1);
657 		} else {
658 			f = strcmp((const char *)u8s1, (const char *)u8s2);
659 			if (f != 0)
660 				return (f);
661 		}
662 
663 		/*
664 		 * They were the same. Let's move on to the next
665 		 * characters then.
666 		 */
667 		i1 += sz1;
668 		i2 += sz2;
669 	}
670 
671 	/*
672 	 * We compared until the end of either or both strings.
673 	 *
674 	 * If we reached to or went over the ends for the both, that means
675 	 * they are the same.
676 	 *
677 	 * If we reached only one of the two ends, that means the other string
678 	 * has something which then the fact can be used to determine
679 	 * the return value.
680 	 */
681 	if (i1 >= n1) {
682 		if (i2 >= n2)
683 			return (0);
684 		return (-1);
685 	}
686 	return (1);
687 }
688 
689 /*
690  * The combining_class() function checks on the given bytes and find out
691  * the corresponding Unicode combining class value. The return value 0 means
692  * it is a Starter. Any illegal UTF-8 character will also be treated as
693  * a Starter.
694  */
695 static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)696 combining_class(size_t uv, uchar_t *s, size_t sz)
697 {
698 	uint16_t b1 = 0;
699 	uint16_t b2 = 0;
700 	uint16_t b3 = 0;
701 	uint16_t b4 = 0;
702 
703 	if (sz == 1 || sz > 4)
704 		return (0);
705 
706 	if (sz == 2) {
707 		b3 = s[0];
708 		b4 = s[1];
709 	} else if (sz == 3) {
710 		b2 = s[0];
711 		b3 = s[1];
712 		b4 = s[2];
713 	} else if (sz == 4) {
714 		b1 = s[0];
715 		b2 = s[1];
716 		b3 = s[2];
717 		b4 = s[3];
718 	}
719 
720 	b1 = u8_common_b1_tbl[uv][b1];
721 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
722 		return (0);
723 
724 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
725 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
726 		return (0);
727 
728 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
729 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
730 		return (0);
731 
732 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
733 }
734 
735 /*
736  * The do_decomp() function finds out a matching decomposition if any
737  * and return. If there is no match, the input bytes are copied and returned.
738  * The function also checks if there is a Hangul, decomposes it if necessary
739  * and returns.
740  *
741  * To save time, a single byte 7-bit ASCII character should be handled by
742  * the caller.
743  *
744  * The function returns the number of bytes returned sans always terminating
745  * the null byte. It will also return a state that will tell if there was
746  * a Hangul character decomposed which then will be used by the caller.
747  */
748 static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)749 do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
750     boolean_t canonical_decomposition, u8_normalization_states_t *state)
751 {
752 	uint16_t b1 = 0;
753 	uint16_t b2 = 0;
754 	uint16_t b3 = 0;
755 	uint16_t b3_tbl;
756 	uint16_t b3_base;
757 	uint16_t b4 = 0;
758 	size_t start_id;
759 	size_t end_id;
760 	size_t i;
761 	uint32_t u1;
762 
763 	if (sz == 2) {
764 		b3 = u8s[0] = s[0];
765 		b4 = u8s[1] = s[1];
766 		u8s[2] = '\0';
767 	} else if (sz == 3) {
768 		/* Convert it to a Unicode scalar value. */
769 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
770 
771 		/*
772 		 * If this is a Hangul syllable, we decompose it into
773 		 * a leading consonant, a vowel, and an optional trailing
774 		 * consonant and then return.
775 		 */
776 		if (U8_HANGUL_SYLLABLE(u1)) {
777 			u1 -= U8_HANGUL_SYL_FIRST;
778 
779 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
780 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
781 			    / U8_HANGUL_T_COUNT;
782 			b3 = u1 % U8_HANGUL_T_COUNT;
783 
784 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
785 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
786 			if (b3) {
787 				b3 += U8_HANGUL_JAMO_T_FIRST;
788 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
789 
790 				u8s[9] = '\0';
791 				*state = U8_STATE_HANGUL_LVT;
792 				return (9);
793 			}
794 
795 			u8s[6] = '\0';
796 			*state = U8_STATE_HANGUL_LV;
797 			return (6);
798 		}
799 
800 		b2 = u8s[0] = s[0];
801 		b3 = u8s[1] = s[1];
802 		b4 = u8s[2] = s[2];
803 		u8s[3] = '\0';
804 
805 		/*
806 		 * If this is a Hangul Jamo, we know there is nothing
807 		 * further that we can decompose.
808 		 */
809 		if (U8_HANGUL_JAMO_L(u1)) {
810 			*state = U8_STATE_HANGUL_L;
811 			return (3);
812 		}
813 
814 		if (U8_HANGUL_JAMO_V(u1)) {
815 			if (*state == U8_STATE_HANGUL_L)
816 				*state = U8_STATE_HANGUL_LV;
817 			else
818 				*state = U8_STATE_HANGUL_V;
819 			return (3);
820 		}
821 
822 		if (U8_HANGUL_JAMO_T(u1)) {
823 			if (*state == U8_STATE_HANGUL_LV)
824 				*state = U8_STATE_HANGUL_LVT;
825 			else
826 				*state = U8_STATE_HANGUL_T;
827 			return (3);
828 		}
829 	} else if (sz == 4) {
830 		b1 = u8s[0] = s[0];
831 		b2 = u8s[1] = s[1];
832 		b3 = u8s[2] = s[2];
833 		b4 = u8s[3] = s[3];
834 		u8s[4] = '\0';
835 	} else {
836 		/*
837 		 * This is a fallback and should not happen if the function
838 		 * was called properly.
839 		 */
840 		u8s[0] = s[0];
841 		u8s[1] = '\0';
842 		*state = U8_STATE_START;
843 		return (1);
844 	}
845 
846 	/*
847 	 * At this point, this routine does not know what it would get.
848 	 * The caller should sort it out if the state isn't a Hangul one.
849 	 */
850 	*state = U8_STATE_START;
851 
852 	/* Try to find matching decomposition mapping byte sequence. */
853 	b1 = u8_common_b1_tbl[uv][b1];
854 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
855 		return ((size_t)sz);
856 
857 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
858 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
859 		return ((size_t)sz);
860 
861 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
862 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
863 		return ((size_t)sz);
864 
865 	/*
866 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
867 	 * which is 0x8000, this means we couldn't fit the mappings into
868 	 * the cardinality of a unsigned byte.
869 	 */
870 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
871 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
872 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
873 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
874 	} else {
875 		// cppcheck-suppress arrayIndexOutOfBoundsCond
876 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
877 		// cppcheck-suppress arrayIndexOutOfBoundsCond
878 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
879 	}
880 
881 	/* This also means there wasn't any matching decomposition. */
882 	if (start_id >= end_id)
883 		return ((size_t)sz);
884 
885 	/*
886 	 * The final table for decomposition mappings has three types of
887 	 * byte sequences depending on whether a mapping is for compatibility
888 	 * decomposition, canonical decomposition, or both like the following:
889 	 *
890 	 * (1) Compatibility decomposition mappings:
891 	 *
892 	 *	+---+---+-...-+---+
893 	 *	| B0| B1| ... | Bm|
894 	 *	+---+---+-...-+---+
895 	 *
896 	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
897 	 *
898 	 * (2) Canonical decomposition mappings:
899 	 *
900 	 *	+---+---+---+-...-+---+
901 	 *	| T | b0| b1| ... | bn|
902 	 *	+---+---+---+-...-+---+
903 	 *
904 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
905 	 *
906 	 * (3) Both mappings:
907 	 *
908 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
909 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
910 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
911 	 *
912 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
913 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
914 	 *	compatibility mapping bytes.
915 	 *
916 	 * Note that compatibility decomposition means doing recursive
917 	 * decompositions using both compatibility decomposition mappings and
918 	 * canonical decomposition mappings. On the other hand, canonical
919 	 * decomposition means doing recursive decompositions using only
920 	 * canonical decomposition mappings. Since the table we have has gone
921 	 * through the recursions already, we do not need to do so during
922 	 * runtime, i.e., the table has been completely flattened out
923 	 * already.
924 	 */
925 
926 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
927 
928 	/* Get the type, T, of the byte sequence. */
929 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
930 
931 	/*
932 	 * If necessary, adjust start_id, end_id, or both. Note that if
933 	 * this is compatibility decomposition mapping, there is no
934 	 * adjustment.
935 	 */
936 	if (canonical_decomposition) {
937 		/* Is the mapping only for compatibility decomposition? */
938 		if (b1 < U8_DECOMP_BOTH)
939 			return ((size_t)sz);
940 
941 		start_id++;
942 
943 		if (b1 == U8_DECOMP_BOTH) {
944 			end_id = start_id +
945 			    u8_decomp_final_tbl[uv][b3_base + start_id];
946 			start_id++;
947 		}
948 	} else {
949 		/*
950 		 * Unless this is a compatibility decomposition mapping,
951 		 * we adjust the start_id.
952 		 */
953 		if (b1 == U8_DECOMP_BOTH) {
954 			start_id++;
955 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
956 		} else if (b1 == U8_DECOMP_CANONICAL) {
957 			start_id++;
958 		}
959 	}
960 
961 	for (i = 0; start_id < end_id; start_id++)
962 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
963 	u8s[i] = '\0';
964 
965 	return (i);
966 }
967 
968 /*
969  * The find_composition_start() function uses the character bytes given and
970  * find out the matching composition mappings if any and return the address
971  * to the composition mappings as explained in the do_composition().
972  */
973 static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)974 find_composition_start(size_t uv, uchar_t *s, size_t sz)
975 {
976 	uint16_t b1 = 0;
977 	uint16_t b2 = 0;
978 	uint16_t b3 = 0;
979 	uint16_t b3_tbl;
980 	uint16_t b3_base;
981 	uint16_t b4 = 0;
982 	size_t start_id;
983 	size_t end_id;
984 
985 	if (sz == 1) {
986 		b4 = s[0];
987 	} else if (sz == 2) {
988 		b3 = s[0];
989 		b4 = s[1];
990 	} else if (sz == 3) {
991 		b2 = s[0];
992 		b3 = s[1];
993 		b4 = s[2];
994 	} else if (sz == 4) {
995 		b1 = s[0];
996 		b2 = s[1];
997 		b3 = s[2];
998 		b4 = s[3];
999 	} else {
1000 		/*
1001 		 * This is a fallback and should not happen if the function
1002 		 * was called properly.
1003 		 */
1004 		return (NULL);
1005 	}
1006 
1007 	b1 = u8_composition_b1_tbl[uv][b1];
1008 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1009 		return (NULL);
1010 
1011 	b2 = u8_composition_b2_tbl[uv][b1][b2];
1012 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1013 		return (NULL);
1014 
1015 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1016 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1017 		return (NULL);
1018 
1019 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1020 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1021 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1022 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1023 	} else {
1024 		// cppcheck-suppress arrayIndexOutOfBoundsCond
1025 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1026 		// cppcheck-suppress arrayIndexOutOfBoundsCond
1027 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1028 	}
1029 
1030 	if (start_id >= end_id)
1031 		return (NULL);
1032 
1033 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1034 
1035 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1036 }
1037 
1038 /*
1039  * The blocked() function checks on the combining class values of previous
1040  * characters in this sequence and return whether it is blocked or not.
1041  */
1042 static boolean_t
blocked(uchar_t * comb_class,size_t last)1043 blocked(uchar_t *comb_class, size_t last)
1044 {
1045 	uchar_t my_comb_class;
1046 	size_t i;
1047 
1048 	my_comb_class = comb_class[last];
1049 	for (i = 1; i < last; i++)
1050 		if (comb_class[i] >= my_comb_class ||
1051 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1052 			return (B_TRUE);
1053 
1054 	return (B_FALSE);
1055 }
1056 
/*
 * The do_composition() function reads the character sequence pointed to by
 * 's', does the necessary canonical composition, and then copies the result
 * back over 's'.
 *
 * The input argument 's' cannot contain more than 32 characters.
 *
 * Arguments:
 *	uv		Unicode version index handed to
 *			find_composition_start() for the table lookups.
 *	s		The bytes of the sequence. It is overwritten with
 *			the composed result and terminated with a null byte.
 *	comb_class	Combining class of each character of the sequence.
 *	start		Byte offset into 's' of each character.
 *	disp		Byte length of each character.
 *	last		Index of the last character (not the count).
 *	os, oslast	*os is the current position in the caller's source
 *			string and oslast is its end. If the last character
 *			of the sequence is a Starter, characters at *os that
 *			compose with it are consumed and *os is advanced
 *			past them.
 *
 * Returns the byte length of the result stored at 's', not counting the
 * terminating null byte.
 */
static size_t
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
    uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
{
	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
	size_t saved_marks_count;
	uchar_t *p;
	uchar_t *saved_p;
	uchar_t *q;
	size_t i;
	size_t saved_i;
	size_t j;
	size_t k;
	size_t l;
	size_t C;
	size_t saved_l;
	size_t size;
	uint32_t u1;
	uint32_t u2;
	boolean_t match_not_found = B_TRUE;

	/*
	 * This should never happen unless the callers are doing some strange
	 * and unexpected things.
	 *
	 * The "last" is the index pointing to the last character not last + 1.
	 */
	if (last >= U8_MAX_CHARS_A_SEQ)
		last = U8_UPPER_LIMIT_IN_A_SEQ;

	/*
	 * 'i' indexes the characters of 's' and 'l' is the number of bytes
	 * written to the temporary buffer 't' so far.
	 */
	for (i = l = 0; i <= last; i++) {
		/*
		 * The last or any non-Starters at the beginning, we don't
		 * have any chance to do composition and so we just copy them
		 * to the temporary buffer.
		 */
		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
SAVE_THE_CHAR:
			/*
			 * Also reached by goto from below, to copy
			 * character 'i' unchanged.
			 */
			p = s + start[i];
			size = disp[i];
			for (k = 0; k < size; k++)
				t[l++] = *p++;
			continue;
		}

		/*
		 * If this could be a start of Hangul Jamos, then, we try to
		 * conjoin them.
		 */
		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
			/*
			 * u1 is built from the three bytes of this character
			 * and u2 from the three bytes that follow it in 's'.
			 */
			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
			    s[start[i] + 1], s[start[i] + 2]);
			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
			    s[start[i] + 4], s[start[i] + 5]);

			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
				u1 -= U8_HANGUL_JAMO_L_FIRST;
				u2 -= U8_HANGUL_JAMO_V_FIRST;
				u1 = U8_HANGUL_SYL_FIRST +
				    (u1 * U8_HANGUL_V_COUNT + u2) *
				    U8_HANGUL_T_COUNT;

				/*
				 * Step over the L and V and see whether
				 * a trailing T Jamo follows.
				 */
				i += 2;
				if (i <= last) {
					U8_PUT_3BYTES_INTO_UTF32(u2,
					    s[start[i]], s[start[i] + 1],
					    s[start[i] + 2]);

					if (U8_HANGUL_JAMO_T(u2)) {
						u1 += u2 -
						    U8_HANGUL_JAMO_T_FIRST;
						i++;
					}
				}

				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
				/*
				 * Back up by one since the for() statement
				 * will increment 'i' again.
				 */
				i--;
				l += 3;
				continue;
			}
		}

		/*
		 * Let's then find out if this Starter has composition
		 * mapping.
		 */
		p = find_composition_start(uv, s + start[i], disp[i]);
		if (p == NULL)
			goto SAVE_THE_CHAR;

		/*
		 * We have a Starter with composition mapping and the next
		 * character is a non-Starter. Let's try to find out if
		 * we can do composition.
		 */

		/*
		 * Remember where this Starter is: its mapping entry, its
		 * index in 's', and where its output begins in 't'.
		 */
		saved_p = p;
		saved_i = i;
		saved_l = l;
		saved_marks_count = 0;

TRY_THE_NEXT_MARK:
		q = s + start[++i];
		size = disp[i];

		/*
		 * The next for() loop compares the non-Starter pointed by
		 * 'q' with the possible (joinable) characters pointed by 'p'.
		 *
		 * The composition final table entry pointed by the 'p'
		 * looks like the following:
		 *
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 *
		 * where C is the count byte indicating the number of
		 * mapping pairs where each pair would be look like
		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
		 * character of a canonical decomposition and the B0-Bm are
		 * the bytes of a matching composite character. The F is
		 * a filler byte after each character as the separator.
		 */

		match_not_found = B_TRUE;

		for (C = *p++; C > 0; C--) {
			for (k = 0; k < size; p++, k++)
				if (*p != q[k])
					break;

			/* Have we found it? */
			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
				match_not_found = B_FALSE;

				/*
				 * Replace whatever was produced for this
				 * Starter so far with the composite bytes.
				 */
				l = saved_l;

				while (*++p != U8_TBL_ELEMENT_FILLER)
					t[l++] = *p;

				break;
			}

			/* We didn't find; skip to the next pair. */
			/*
			 * First finish the remaining bytes of the second
			 * character, then skip the composite character,
			 * and finally step over its filler byte.
			 */
			if (*p != U8_TBL_ELEMENT_FILLER)
				while (*++p != U8_TBL_ELEMENT_FILLER)
					;
			while (*++p != U8_TBL_ELEMENT_FILLER)
				;
			p++;
		}

		/*
		 * If there was no match, we will need to save the combining
		 * mark for later appending. After that, if the next one
		 * is a non-Starter and not blocked, then, we try once
		 * again to do composition with the next non-Starter.
		 *
		 * If there was no match and this was a Starter, then,
		 * this is a new start.
		 *
		 * If there was a match and a composition done and we have
		 * more to check on, then, we retrieve a new composition final
		 * table entry for the composite and then try to do the
		 * composition again.
		 */

		if (match_not_found) {
			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
				i--;
				goto SAVE_THE_CHAR;
			}

			saved_marks[saved_marks_count++] = i;
		}

		/*
		 * saved_l == l means nothing has been written for this
		 * Starter yet, i.e., no composition has succeeded so far.
		 */
		if (saved_l == l) {
			/* Set aside any following marks that are blocked. */
			while (i < last) {
				if (blocked(comb_class, i + 1))
					saved_marks[saved_marks_count++] = ++i;
				else
					break;
			}
			if (i < last) {
				/* Retry with the Starter's own mappings. */
				p = saved_p;
				goto TRY_THE_NEXT_MARK;
			}
		} else if (i < last) {
			/* Look up the mappings of the composite in 't'. */
			p = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (p != NULL) {
				saved_p = p;
				goto TRY_THE_NEXT_MARK;
			}
		}

		/*
		 * There is no more composition possible.
		 *
		 * If there was no composition whatsoever then we copy
		 * over the original Starter and then append any non-Starters
		 * remaining at the target string sequentially after that.
		 */

		if (saved_l == l) {
			p = s + start[saved_i];
			size = disp[saved_i];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}

		/* Append the marks that could not be composed, in order. */
		for (k = 0; k < saved_marks_count; k++) {
			p = s + start[saved_marks[k]];
			size = disp[saved_marks[k]];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}
	}

	/*
	 * If the last character is a Starter and if we have a character
	 * (possibly another Starter) that can be turned into a composite,
	 * we do so and we do so until there is no more of composition
	 * possible.
	 */
	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
		p = *os;
		/* Offset in 't' at which the last character begins. */
		saved_l = l - disp[last];

		while (p < oslast) {
			int8_t number_of_bytes = u8_number_of_bytes[*p];

			/*
			 * Stop at a single byte or illegal character, or
			 * at a character that runs past the source end.
			 */
			if (number_of_bytes <= 1)
				break;
			size = number_of_bytes;
			if ((p + size) > oslast)
				break;

			saved_p = p;

			/* Take a copy of the candidate character. */
			for (i = 0; i < size; i++)
				tc[i] = *p++;

			q = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (q == NULL) {
				p = saved_p;
				break;
			}

			match_not_found = B_TRUE;

			/*
			 * Same mapping pair search as in the loop above,
			 * now comparing against the candidate in 'tc'.
			 */
			for (C = *q++; C > 0; C--) {
				for (k = 0; k < size; q++, k++)
					if (*q != tc[k])
						break;

				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
					match_not_found = B_FALSE;

					l = saved_l;

					while (*++q != U8_TBL_ELEMENT_FILLER) {
						/*
						 * This is practically
						 * impossible but we don't
						 * want to take any chances.
						 */
						if (l >=
						    U8_STREAM_SAFE_TEXT_MAX) {
							p = saved_p;
							goto SAFE_RETURN;
						}
						t[l++] = *q;
					}

					break;
				}

				if (*q != U8_TBL_ELEMENT_FILLER)
					while (*++q != U8_TBL_ELEMENT_FILLER)
						;
				while (*++q != U8_TBL_ELEMENT_FILLER)
					;
				q++;
			}

			/* Leave the unmatched character to the caller. */
			if (match_not_found) {
				p = saved_p;
				break;
			}
		}
SAFE_RETURN:
		/* Report back how far the source string was consumed. */
		*os = p;
	}

	/*
	 * Now we copy over the temporary string to the target string.
	 * Since composition always reduces the number of characters or
	 * the number of characters stay, we don't need to worry about
	 * the buffer overflow here.
	 */
	for (i = 0; i < l; i++)
		s[i] = t[i];
	s[l] = '\0';

	return (l);
}
1374 
/*
 * The collect_a_seq() function checks on the given string at *source,
 * collects a sequence of characters at u8s, and returns the sequence. While
 * it collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them and in that order.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte. The return value is the byte length of the sequence, which
 * can be 0, and does not include the terminating null byte.
 *
 * Arguments:
 *	uv		Unicode version index handed to the helper functions.
 *	u8s		Output buffer for the collected sequence.
 *	source		In/out: *source is where to start reading; on return
 *			it points to the first byte that was not consumed.
 *	slast		End of the source string; bytes at or beyond slast
 *			are not consumed.
 *	is_it_toupper, is_it_tolower
 *			Requested case conversion, if any.
 *	canonical_decomposition, compatibility_decomposition,
 *	canonical_composition
 *			Requested normalization steps. Composition is only
 *			attempted when one of the decompositions is set.
 *	errnum		Set to EILSEQ if the first byte is an illegal
 *			character byte and to EINVAL if the first character
 *			is cut short by slast.
 *	state		In/out normalization state used to decide whether
 *			the following characters belong to this sequence.
 *
 * NOTE(review): the locals 'tc' and 'u8t' are not referenced by name in
 * this function; presumably the U8_SWAP_COMB_MARKS() macro uses them along
 * with other locals, so confirm against the macro before renaming or
 * removing any local variable here.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
    boolean_t is_it_toupper,
    boolean_t is_it_tolower,
    boolean_t canonical_decomposition,
    boolean_t compatibility_decomposition,
    boolean_t canonical_composition,
    int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/* An illegal byte is returned as a one byte sequence of its own. */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/*
		 * The character is incomplete; return whatever bytes are
		 * left as the sequence.
		 */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			/* The converted character can differ in length. */
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please be noted that
	 * canonical composition is done only when a decomposition is
	 * done.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		/*
		 * From here on, 'saved_sz' is the number of bytes collected
		 * at u8s and 'last' is the number of characters collected;
		 * comb_class[], start[], and disp[] hold the combining
		 * class, the byte offset in u8s, and the byte length of
		 * each collected character.
		 */
		if (sz == 1) {
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			/* Decompose the first character in place. */
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/* Character count before any following ones are collected. */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					/*
					 * 'i' is stored as the combining
					 * class at COLLECT_A_HANGUL; it is
					 * 0 for a Jamo.
					 */
					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					/*
					 * The current character is left
					 * unconsumed; 's' is not advanced.
					 */
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Keep the current character count
					 * in 'k' and the source length of
					 * this mark in 'l'; the decomposed
					 * bytes go to 'uts' first.
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						/*
						 * Too many characters; drop
						 * what was added for this
						 * mark and end the sequence.
						 */
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					/* A Hangul Jamo is collected as is. */
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 */
		/* Turn 'last' from a count into the index of the last one. */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	/* do_composition() may have advanced 's' further. */
	*source = s;

	return ((size_t)sz);
}
1728 
1729 /*
1730  * The do_norm_compare() function does string comparison based on Unicode
1731  * simple case mappings and Unicode Normalization definitions.
1732  *
1733  * It does so by collecting a sequence of character at a time and comparing
1734  * the collected sequences from the strings.
1735  *
1736  * The meanings on the return values are the same as the usual strcmp().
1737  */
1738 static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)1739 do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1740     int flag, int *errnum)
1741 {
1742 	int result;
1743 	size_t sz1;
1744 	size_t sz2;
1745 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1746 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1747 	uchar_t *s1last;
1748 	uchar_t *s2last;
1749 	boolean_t is_it_toupper;
1750 	boolean_t is_it_tolower;
1751 	boolean_t canonical_decomposition;
1752 	boolean_t compatibility_decomposition;
1753 	boolean_t canonical_composition;
1754 	u8_normalization_states_t state;
1755 
1756 	s1last = s1 + n1;
1757 	s2last = s2 + n2;
1758 
1759 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1760 #ifdef U8_STRCMP_CI_LOWER
1761 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1762 #else
1763 	is_it_tolower = 0;
1764 #endif
1765 	canonical_decomposition = flag & U8_CANON_DECOMP;
1766 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1767 	canonical_composition = flag & U8_CANON_COMP;
1768 
1769 	while (s1 < s1last && s2 < s2last) {
1770 		/*
1771 		 * If the current character is a 7-bit ASCII and the last
1772 		 * character, or, if the current character and the next
1773 		 * character are both some 7-bit ASCII characters then
1774 		 * we treat the current character as a sequence.
1775 		 *
1776 		 * In any other cases, we need to call collect_a_seq().
1777 		 */
1778 
1779 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1780 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1781 			if (is_it_toupper)
1782 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1783 			else if (is_it_tolower)
1784 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1785 			else
1786 				u8s1[0] = *s1;
1787 			u8s1[1] = '\0';
1788 			sz1 = 1;
1789 			s1++;
1790 		} else {
1791 			state = U8_STATE_START;
1792 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1793 			    is_it_toupper, is_it_tolower,
1794 			    canonical_decomposition,
1795 			    compatibility_decomposition,
1796 			    canonical_composition, errnum, &state);
1797 		}
1798 
1799 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1800 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1801 			if (is_it_toupper)
1802 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1803 			else if (is_it_tolower)
1804 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1805 			else
1806 				u8s2[0] = *s2;
1807 			u8s2[1] = '\0';
1808 			sz2 = 1;
1809 			s2++;
1810 		} else {
1811 			state = U8_STATE_START;
1812 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1813 			    is_it_toupper, is_it_tolower,
1814 			    canonical_decomposition,
1815 			    compatibility_decomposition,
1816 			    canonical_composition, errnum, &state);
1817 		}
1818 
1819 		/*
1820 		 * Now compare the two characters. If they are the same,
1821 		 * we move on to the next character sequences.
1822 		 */
1823 		if (sz1 == 1 && sz2 == 1) {
1824 			if (*u8s1 > *u8s2)
1825 				return (1);
1826 			if (*u8s1 < *u8s2)
1827 				return (-1);
1828 		} else {
1829 			result = strcmp((const char *)u8s1, (const char *)u8s2);
1830 			if (result != 0)
1831 				return (result);
1832 		}
1833 	}
1834 
1835 	/*
1836 	 * We compared until the end of either or both strings.
1837 	 *
1838 	 * If we reached to or went over the ends for the both, that means
1839 	 * they are the same.
1840 	 *
1841 	 * If we reached only one end, that means the other string has
1842 	 * something which then can be used to determine the return value.
1843 	 */
1844 	if (s1 >= s1last) {
1845 		if (s2 >= s2last)
1846 			return (0);
1847 		return (-1);
1848 	}
1849 	return (1);
1850 }
1851 
1852 /*
1853  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1854  * the strcmp(). For the comparison, however, Unicode Normalization specific
1855  * equivalency and Unicode simple case conversion mappings based equivalency
1856  * can be requested and checked against.
1857  */
1858 int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)1859 u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1860     int *errnum)
1861 {
1862 	int f;
1863 	size_t n1;
1864 	size_t n2;
1865 
1866 	*errnum = 0;
1867 
1868 	/*
1869 	 * Check on the requested Unicode version, case conversion, and
1870 	 * normalization flag values.
1871 	 */
1872 
1873 	if (uv > U8_UNICODE_LATEST) {
1874 		*errnum = ERANGE;
1875 		uv = U8_UNICODE_LATEST;
1876 	}
1877 
1878 	if (flag == 0) {
1879 		flag = U8_STRCMP_CS;
1880 	} else {
1881 #ifdef U8_STRCMP_CI_LOWER
1882 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER
1883 		    | U8_STRCMP_CI_LOWER);
1884 #else
1885 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER);
1886 #endif
1887 		if (f == 0) {
1888 			flag |= U8_STRCMP_CS;
1889 		}
1890 #ifdef U8_STRCMP_CI_LOWER
1891 		else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1892 		    f != U8_STRCMP_CI_LOWER)
1893 #else
1894 		else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER)
1895 #endif
1896 		{
1897 			*errnum = EBADF;
1898 			flag = U8_STRCMP_CS;
1899 		}
1900 
1901 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1902 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1903 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1904 			*errnum = EBADF;
1905 			flag = U8_STRCMP_CS;
1906 		}
1907 	}
1908 
1909 	if (flag == U8_STRCMP_CS) {
1910 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1911 	}
1912 
1913 	n1 = strlen(s1);
1914 	n2 = strlen(s2);
1915 	if (n != 0) {
1916 		if (n < n1)
1917 			n1 = n;
1918 		if (n < n2)
1919 			n2 = n;
1920 	}
1921 
1922 	/*
1923 	 * Simple case conversion can be done much faster and so we do
1924 	 * them separately here.
1925 	 */
1926 	if (flag == U8_STRCMP_CI_UPPER) {
1927 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1928 		    n1, n2, B_TRUE, errnum));
1929 	}
1930 #ifdef U8_STRCMP_CI_LOWER
1931 	else if (flag == U8_STRCMP_CI_LOWER) {
1932 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1933 		    n1, n2, B_FALSE, errnum));
1934 	}
1935 #endif
1936 
1937 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1938 	    flag, errnum));
1939 }
1940 
1941 size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)1942 u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1943     int flag, size_t unicode_version, int *errnum)
1944 {
1945 	int f;
1946 	int sz;
1947 	uchar_t *ib;
1948 	uchar_t *ibtail;
1949 	uchar_t *ob;
1950 	uchar_t *obtail;
1951 	boolean_t do_not_ignore_null;
1952 	boolean_t do_not_ignore_invalid;
1953 	boolean_t is_it_toupper;
1954 	boolean_t is_it_tolower;
1955 	boolean_t canonical_decomposition;
1956 	boolean_t compatibility_decomposition;
1957 	boolean_t canonical_composition;
1958 	size_t ret_val;
1959 	size_t i;
1960 	size_t j;
1961 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1962 	u8_normalization_states_t state;
1963 
1964 	if (unicode_version > U8_UNICODE_LATEST) {
1965 		*errnum = ERANGE;
1966 		return ((size_t)-1);
1967 	}
1968 
1969 #ifdef U8_TEXTPREP_TOLOWER
1970 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1971 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1972 		*errnum = EBADF;
1973 		return ((size_t)-1);
1974 	}
1975 #endif
1976 
1977 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1978 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1979 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1980 		*errnum = EBADF;
1981 		return ((size_t)-1);
1982 	}
1983 
1984 	if (inarray == NULL || *inlen == 0)
1985 		return (0);
1986 
1987 	if (outarray == NULL) {
1988 		*errnum = E2BIG;
1989 		return ((size_t)-1);
1990 	}
1991 
1992 	ib = (uchar_t *)inarray;
1993 	ob = (uchar_t *)outarray;
1994 	ibtail = ib + *inlen;
1995 	obtail = ob + *outlen;
1996 
1997 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1998 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1999 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
2000 #ifdef U8_TEXTPREP_TOLOWER
2001 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
2002 #else
2003 	is_it_tolower = 0;
2004 #endif
2005 
2006 	ret_val = 0;
2007 
2008 	/*
2009 	 * If we don't have a normalization flag set, we do the simple case
2010 	 * conversion based text preparation separately below. Text
2011 	 * preparation involving Normalization will be done in the false task
2012 	 * block, again, separately since it will take much more time and
2013 	 * resource than doing simple case conversions.
2014 	 */
2015 	if (f == 0) {
2016 		while (ib < ibtail) {
2017 			if (*ib == '\0' && do_not_ignore_null)
2018 				break;
2019 
2020 			sz = u8_number_of_bytes[*ib];
2021 
2022 			if (sz < 0) {
2023 				if (do_not_ignore_invalid) {
2024 					*errnum = EILSEQ;
2025 					ret_val = (size_t)-1;
2026 					break;
2027 				}
2028 
2029 				sz = 1;
2030 				ret_val++;
2031 			}
2032 
2033 			if (sz == 1) {
2034 				if (ob >= obtail) {
2035 					*errnum = E2BIG;
2036 					ret_val = (size_t)-1;
2037 					break;
2038 				}
2039 
2040 				if (is_it_toupper)
2041 					*ob = U8_ASCII_TOUPPER(*ib);
2042 				else if (is_it_tolower)
2043 					*ob = U8_ASCII_TOLOWER(*ib);
2044 				else
2045 					*ob = *ib;
2046 				ib++;
2047 				ob++;
2048 			} else if ((ib + sz) > ibtail) {
2049 				if (do_not_ignore_invalid) {
2050 					*errnum = EINVAL;
2051 					ret_val = (size_t)-1;
2052 					break;
2053 				}
2054 
2055 				if ((obtail - ob) < (ibtail - ib)) {
2056 					*errnum = E2BIG;
2057 					ret_val = (size_t)-1;
2058 					break;
2059 				}
2060 
2061 				/*
2062 				 * We treat the remaining incomplete character
2063 				 * bytes as a character.
2064 				 */
2065 				ret_val++;
2066 
2067 				while (ib < ibtail)
2068 					*ob++ = *ib++;
2069 			} else {
2070 				if (is_it_toupper || is_it_tolower) {
2071 					i = do_case_conv(unicode_version, u8s,
2072 					    ib, sz, is_it_toupper);
2073 
2074 					if ((obtail - ob) < i) {
2075 						*errnum = E2BIG;
2076 						ret_val = (size_t)-1;
2077 						break;
2078 					}
2079 
2080 					ib += sz;
2081 
2082 					for (sz = 0; sz < i; sz++)
2083 						*ob++ = u8s[sz];
2084 				} else {
2085 					if ((obtail - ob) < sz) {
2086 						*errnum = E2BIG;
2087 						ret_val = (size_t)-1;
2088 						break;
2089 					}
2090 
2091 					for (i = 0; i < sz; i++)
2092 						*ob++ = *ib++;
2093 				}
2094 			}
2095 		}
2096 	} else {
2097 		canonical_decomposition = flag & U8_CANON_DECOMP;
2098 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2099 		canonical_composition = flag & U8_CANON_COMP;
2100 
2101 		while (ib < ibtail) {
2102 			if (*ib == '\0' && do_not_ignore_null)
2103 				break;
2104 
2105 			/*
2106 			 * If the current character is a 7-bit ASCII
2107 			 * character and it is the last character, or,
2108 			 * if the current character is a 7-bit ASCII
2109 			 * character and the next character is also a 7-bit
2110 			 * ASCII character, then, we copy over this
2111 			 * character without going through collect_a_seq().
2112 			 *
2113 			 * In any other cases, we need to look further with
2114 			 * the collect_a_seq() function.
2115 			 */
2116 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2117 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2118 				if (ob >= obtail) {
2119 					*errnum = E2BIG;
2120 					ret_val = (size_t)-1;
2121 					break;
2122 				}
2123 
2124 				if (is_it_toupper)
2125 					*ob = U8_ASCII_TOUPPER(*ib);
2126 				else if (is_it_tolower)
2127 					*ob = U8_ASCII_TOLOWER(*ib);
2128 				else
2129 					*ob = *ib;
2130 				ib++;
2131 				ob++;
2132 			} else {
2133 				*errnum = 0;
2134 				state = U8_STATE_START;
2135 
2136 				j = collect_a_seq(unicode_version, u8s,
2137 				    &ib, ibtail,
2138 				    is_it_toupper,
2139 				    is_it_tolower,
2140 				    canonical_decomposition,
2141 				    compatibility_decomposition,
2142 				    canonical_composition,
2143 				    errnum, &state);
2144 
2145 				if (*errnum && do_not_ignore_invalid) {
2146 					ret_val = (size_t)-1;
2147 					break;
2148 				}
2149 
2150 				if ((obtail - ob) < j) {
2151 					*errnum = E2BIG;
2152 					ret_val = (size_t)-1;
2153 					break;
2154 				}
2155 
2156 				for (i = 0; i < j; i++)
2157 					*ob++ = u8s[i];
2158 			}
2159 		}
2160 	}
2161 
2162 	*inlen = ibtail - ib;
2163 	*outlen = obtail - ob;
2164 
2165 	return (ret_val);
2166 }
2167 
/*
 * Export three of the interfaces named in this file's header comment
 * (u8_validate, u8_strcmp and u8_textprep_str).
 *
 * NOTE(review): EXPORT_SYMBOL is not defined in this file; presumably it
 * comes from one of the included headers (e.g. <sys/mod.h>) and makes the
 * symbol visible to other modules on platforms that need an explicit
 * export -- confirm against the platform's definition.
 */
2168 EXPORT_SYMBOL(u8_validate);
2169 EXPORT_SYMBOL(u8_strcmp);
2170 EXPORT_SYMBOL(u8_textprep_str);
2171