xref: /illumos-gate/usr/src/common/unicode/u8_textprep.c (revision f137b22e734e85642da3e56e8b94da3f5f027c73)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2022 MNX Cloud, Inc.
28  */
29 
30 
31 /*
32  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
33  *
34  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
35  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
36  * the section 3C man pages.
37  * Interface stability: Committed.
38  */
39 
40 #include <sys/types.h>
41 #ifdef	_KERNEL
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #else
50 #include <sys/u8_textprep.h>
51 #include <strings.h>
52 #endif	/* _KERNEL */
53 #include <sys/byteorder.h>
54 #include <sys/errno.h>
55 #include <sys/u8_textprep_data.h>
56 
57 
58 /* The maximum possible number of bytes in a UTF-8 character. */
/*
 * In this file it also sizes the per-character scratch buffers in
 * do_case_compare() and bounds the length of a single case mapping that
 * do_case_conv() is willing to copy out of the mapping tables.
 */
59 #define	U8_MB_CUR_MAX			(4)
60 
61 /*
62  * The maximum number of bytes needed for a UTF-8 character to cover
63  * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
 *
 * u8_validate() fails with ERANGE on any character longer than this when
 * the caller passes U8_VALIDATE_UCS2_RANGE.
64  */
65 #define	U8_MAX_BYTES_UCS2		(3)
66 
67 /* The maximum possible number of bytes in a Stream-Safe Text. */
68 #define	U8_STREAM_SAFE_TEXT_MAX		(128)
69 
70 /*
71  * The maximum number of characters in a combining/conjoining sequence and
72  * the actual upperbound limit of a combining/conjoining sequence.
73  */
74 #define	U8_MAX_CHARS_A_SEQ		(32)
75 #define	U8_UPPER_LIMIT_IN_A_SEQ		(31)
76 
77 /* The combining class value for Starter. */
/*
 * combining_class() returns the same value, 0, for starters and for
 * anything it cannot find in the combining class tables.
 */
78 #define	U8_COMBINING_CLASS_STARTER	(0)
79 
/*
 * Hangul related macros.
 *
 * The *_FIRST and *_LAST values are the Unicode scalar values of the first
 * and the last Hangul syllable, Hangul Jamo leading consonant (L), vowel (V),
 * and optional trailing consonant (T).
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 and not the actual first
 * trailing consonant, U+11A8. The trailing consonant is optional, so index 0
 * is reserved for "no trailing consonant" and the subtraction of one is
 * pre-calculated into the constant. U8_HANGUL_JAMO_T() therefore uses
 * a strict '>' on its lower bound.
 *
 * Each of the 19 modern leading consonants has 588 possible syllables since
 * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
 * the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick first-byte check for
 * a Hangul Jamo, but a match does not guarantee that the character is
 * a Hangul Jamo; it only means that it is likely to be one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value b as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k]. Only the low 16 bits of b are used.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and stays correct under an unbraced if/else or
 * loop. Callers must terminate the invocation with a semicolon.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    ((uint32_t)(b) & 0xF000U) >> 12); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x0FC0U) >> 6); \
		(s)[(k)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x003FU)); \
	} while (0)

/* Is u a modern Hangul Jamo leading consonant? */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* Is u a modern Hangul Jamo vowel? */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Is u a modern Hangul Jamo trailing consonant? Note the strict '>'. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is u anywhere between the first L and the last T, inclusive? */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Is u a precomposed Hangul syllable? */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* Normalization state s is "seen an L" and u is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* Normalization state s is "seen an LV" and u is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
140 
141 /* The types of decomposition mappings. */
/*
 * do_decomp() reads the first byte of a mapping in u8_decomp_final_tbl and
 * compares it against these two values; a first byte smaller than
 * U8_DECOMP_BOTH is treated as the start of a compatibility-only mapping.
 */
142 #define	U8_DECOMP_BOTH			(0xF5U)
143 #define	U8_DECOMP_CANONICAL		(0xF6U)
144 
145 /* The indicator for 16-bit table. */
/*
 * A third-byte table id at or above this value selects the 16-bit variant
 * of the fourth-byte table; the indicator is subtracted from the id before
 * it is used as an index.
 */
146 #define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
147 
/* The following are some convenience macros. */

/*
 * Assemble the scalar value encoded by the three UTF-8 bytes b1, b2, and b3
 * and assign it to u. This is a single, fully parenthesized assignment
 * expression; the caller supplies the terminating semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	((u) = (((uint32_t)(b1) & 0x0F) << 12) | \
	    (((uint32_t)(b2) & 0x3F) << 6) | \
	    ((uint32_t)(b3) & 0x3F))

/*
 * Exchange a and b using t as the temporary. Wrapped in do { } while (0)
 * so that it behaves as one statement; the caller supplies the terminating
 * semicolon.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

/* ASCII-only case conversions; every other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * Swap the bytes and the bookkeeping of two combining marks, a and b.
 *
 * The macro assumes that the two characters to be swapped are adjacent to
 * each other and that 'a' comes before 'b'. If these assumptions are not
 * met, the macro will fail.
 *
 * It is not self-contained: it uses the caller's variables k, u8t, u8s,
 * start, disp, comb_class, and tc. Wrapped in do { } while (0) so that it
 * behaves as one statement; the caller supplies the terminating semicolon.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
181 
/*
 * The possible states during normalization. The numeric values run from
 * 0 (U8_STATE_START) to 6 (U8_STATE_COMBINING_MARK) in declaration order.
 */
typedef enum {
	U8_STATE_START = 0,		/* nothing notable seen */
	U8_STATE_HANGUL_L,		/* Hangul leading consonant */
	U8_STATE_HANGUL_LV,		/* leading consonant + vowel */
	U8_STATE_HANGUL_LVT,		/* L + V + trailing consonant */
	U8_STATE_HANGUL_V,		/* vowel without a preceding L */
	U8_STATE_HANGUL_T,		/* trailing consonant without LV */
	U8_STATE_COMBINING_MARK
} u8_normalization_states_t;
192 
193 /*
194  * The three vectors at below are used to check bytes of a given UTF-8
195  * character are valid and not containing any malformed byte values.
196  *
197  * We used to have a quite relaxed UTF-8 binary representation but then there
198  * was some security related issues and so the Unicode Consortium defined
199  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
200  * one more time at the Unicode 3.2. The following three tables are based on
201  * that.
202  */
203 
204 #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
205 
206 #define	I_				U8_ILLEGAL_CHAR
207 #define	O_				U8_OUT_OF_RANGE_CHAR
208 
209 const int8_t u8_number_of_bytes[0x100] = {
210 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
211 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
212 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
213 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
214 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
215 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
216 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
217 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
218 
219 /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
220 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
221 
222 /*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
223 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
224 
225 /*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
226 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
227 
228 /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
229 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
230 
231 /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
232 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
233 
234 /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
235 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
236 
237 /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
238 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
239 
240 /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
241 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
242 };
243 
244 #undef	I_
245 #undef	O_
246 
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * that is accepted as its second byte. Entries for bytes that do not start
 * a multi-byte character are 0.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 0x00 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xC0 - 0xC7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xC8 - 0xCF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xD0 - 0xD7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xD8 - 0xDF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xE0 - 0xE7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xE8 - 0xEF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xF0 - 0xF7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* 0xF8 - 0xFF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
288 
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * that is accepted as its second byte. Entries for bytes that do not start
 * a multi-byte character are 0.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 0x00 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xC0 - 0xC7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* 0xC8 - 0xCF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* 0xD0 - 0xD7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* 0xD8 - 0xDF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* 0xE0 - 0xE7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* 0xE8 - 0xEF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* 0xF0 - 0xF7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* 0xF8 - 0xFF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
330 
331 
332 /*
333  * The u8_validate() validates on the given UTF-8 character string and
334  * calculate the byte length. It is quite similar to mblen(3C) except that
335  * this will validate against the list of characters if required and
336  * specific to UTF-8 and Unicode.
337  */
338 int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)339 u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
340 {
341 	uchar_t *ib;
342 	uchar_t *ibtail;
343 	uchar_t **p;
344 	uchar_t *s1;
345 	uchar_t *s2;
346 	uchar_t f;
347 	int sz;
348 	size_t i;
349 	int ret_val;
350 	boolean_t second;
351 	boolean_t no_need_to_validate_entire;
352 	boolean_t check_additional;
353 	boolean_t validate_ucs2_range_only;
354 
355 	if (! u8str)
356 		return (0);
357 
358 	ib = (uchar_t *)u8str;
359 	ibtail = ib + n;
360 
361 	ret_val = 0;
362 
363 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
364 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
365 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
366 
367 	while (ib < ibtail) {
368 		/*
369 		 * The first byte of a UTF-8 character tells how many
370 		 * bytes will follow for the character. If the first byte
371 		 * is an illegal byte value or out of range value, we just
372 		 * return -1 with an appropriate error number.
373 		 */
374 		sz = u8_number_of_bytes[*ib];
375 		if (sz == U8_ILLEGAL_CHAR) {
376 			*errnum = EILSEQ;
377 			return (-1);
378 		}
379 
380 		if (sz == U8_OUT_OF_RANGE_CHAR ||
381 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
382 			*errnum = ERANGE;
383 			return (-1);
384 		}
385 
386 		/*
387 		 * If we don't have enough bytes to check on, that's also
388 		 * an error. As you can see, we give illegal byte sequence
389 		 * checking higher priority then EINVAL cases.
390 		 */
391 		if ((ibtail - ib) < sz) {
392 			*errnum = EINVAL;
393 			return (-1);
394 		}
395 
396 		if (sz == 1) {
397 			ib++;
398 			ret_val++;
399 		} else {
400 			/*
401 			 * Check on the multi-byte UTF-8 character. For more
402 			 * details on this, see comment added for the used
403 			 * data structures at the beginning of the file.
404 			 */
405 			f = *ib++;
406 			ret_val++;
407 			second = B_TRUE;
408 			for (i = 1; i < sz; i++) {
409 				if (second) {
410 					if (*ib < u8_valid_min_2nd_byte[f] ||
411 					    *ib > u8_valid_max_2nd_byte[f]) {
412 						*errnum = EILSEQ;
413 						return (-1);
414 					}
415 					second = B_FALSE;
416 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
417 					*errnum = EILSEQ;
418 					return (-1);
419 				}
420 				ib++;
421 				ret_val++;
422 			}
423 		}
424 
425 		if (check_additional) {
426 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
427 				s1 = ib - sz;
428 				s2 = p[i];
429 				while (s1 < ib) {
430 					if (*s1 != *s2 || *s2 == '\0')
431 						break;
432 					s1++;
433 					s2++;
434 				}
435 
436 				if (s1 >= ib && *s2 == '\0') {
437 					*errnum = EBADF;
438 					return (-1);
439 				}
440 			}
441 		}
442 
443 		if (no_need_to_validate_entire)
444 			break;
445 	}
446 
447 	return (ret_val);
448 }
449 
450 /*
451  * The do_case_conv() looks at the mapping tables and returns found
452  * bytes if any. If not found, the input bytes are returned. The function
453  * always terminate the return bytes with a null character assuming that
454  * there are plenty of room to do so.
455  *
456  * The case conversions are simple case conversions mapping a character to
457  * another character as specified in the Unicode data. The byte size of
458  * the mapped character could be different from that of the input character.
459  *
460  * The return value is the byte length of the returned character excluding
461  * the terminating null byte.
462  */
463 static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)464 do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
465 {
466 	size_t i;
467 	uint16_t b1 = 0;
468 	uint16_t b2 = 0;
469 	uint16_t b3 = 0;
470 	uint16_t b3_tbl;
471 	uint16_t b3_base;
472 	uint16_t b4 = 0;
473 	size_t start_id;
474 	size_t end_id;
475 
476 	/*
477 	 * At this point, the only possible values for sz are 2, 3, and 4.
478 	 * The u8s should point to a vector that is well beyond the size of
479 	 * 5 bytes.
480 	 */
481 	if (sz == 2) {
482 		b3 = u8s[0] = s[0];
483 		b4 = u8s[1] = s[1];
484 	} else if (sz == 3) {
485 		b2 = u8s[0] = s[0];
486 		b3 = u8s[1] = s[1];
487 		b4 = u8s[2] = s[2];
488 	} else if (sz == 4) {
489 		b1 = u8s[0] = s[0];
490 		b2 = u8s[1] = s[1];
491 		b3 = u8s[2] = s[2];
492 		b4 = u8s[3] = s[3];
493 	} else {
494 		/* This is not possible but just in case as a fallback. */
495 		if (is_it_toupper)
496 			*u8s = U8_ASCII_TOUPPER(*s);
497 		else
498 			*u8s = U8_ASCII_TOLOWER(*s);
499 		u8s[1] = '\0';
500 
501 		return (1);
502 	}
503 	u8s[sz] = '\0';
504 
505 	/*
506 	 * Let's find out if we have a corresponding character.
507 	 */
508 	b1 = u8_common_b1_tbl[uv][b1];
509 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
510 		return ((size_t)sz);
511 
512 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
513 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
514 		return ((size_t)sz);
515 
516 	if (is_it_toupper) {
517 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
518 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
519 			return ((size_t)sz);
520 
521 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
522 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
523 
524 		/* Either there is no match or an error at the table. */
525 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
526 			return ((size_t)sz);
527 
528 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
529 
530 		for (i = 0; start_id < end_id; start_id++)
531 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
532 	} else {
533 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
534 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
535 			return ((size_t)sz);
536 
537 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
538 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
539 
540 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
541 			return ((size_t)sz);
542 
543 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
544 
545 		for (i = 0; start_id < end_id; start_id++)
546 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
547 	}
548 
549 	/*
550 	 * If i is still zero, that means there is no corresponding character.
551 	 */
552 	if (i == 0)
553 		return ((size_t)sz);
554 
555 	u8s[i] = '\0';
556 
557 	return (i);
558 }
559 
560 /*
561  * The do_case_compare() function compares the two input strings, s1 and s2,
562  * one character at a time doing case conversions if applicable and return
563  * the comparison result as like strcmp().
564  *
565  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
566  * we treat the 7-bit ASCII characters as a special case trying to yield
567  * faster processing time.
568  */
569 static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)570 do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
571     size_t n2, boolean_t is_it_toupper, int *errnum)
572 {
573 	int f;
574 	int sz1;
575 	int sz2;
576 	size_t j;
577 	size_t i1;
578 	size_t i2;
579 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
580 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
581 
582 	i1 = i2 = 0;
583 	while (i1 < n1 && i2 < n2) {
584 		/*
585 		 * Find out what would be the byte length for this UTF-8
586 		 * character at string s1 and also find out if this is
587 		 * an illegal start byte or not and if so, issue a proper
588 		 * error number and yet treat this byte as a character.
589 		 */
590 		sz1 = u8_number_of_bytes[*s1];
591 		if (sz1 < 0) {
592 			*errnum = EILSEQ;
593 			sz1 = 1;
594 		}
595 
596 		/*
597 		 * For 7-bit ASCII characters mainly, we do a quick case
598 		 * conversion right at here.
599 		 *
600 		 * If we don't have enough bytes for this character, issue
601 		 * an EINVAL error and use what are available.
602 		 *
603 		 * If we have enough bytes, find out if there is
604 		 * a corresponding uppercase character and if so, copy over
605 		 * the bytes for a comparison later. If there is no
606 		 * corresponding uppercase character, then, use what we have
607 		 * for the comparison.
608 		 */
609 		if (sz1 == 1) {
610 			if (is_it_toupper)
611 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
612 			else
613 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
614 			s1++;
615 			u8s1[1] = '\0';
616 		} else if ((i1 + sz1) > n1) {
617 			*errnum = EINVAL;
618 			for (j = 0; (i1 + j) < n1; )
619 				u8s1[j++] = *s1++;
620 			u8s1[j] = '\0';
621 		} else {
622 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
623 			s1 += sz1;
624 		}
625 
626 		/* Do the same for the string s2. */
627 		sz2 = u8_number_of_bytes[*s2];
628 		if (sz2 < 0) {
629 			*errnum = EILSEQ;
630 			sz2 = 1;
631 		}
632 
633 		if (sz2 == 1) {
634 			if (is_it_toupper)
635 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
636 			else
637 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
638 			s2++;
639 			u8s2[1] = '\0';
640 		} else if ((i2 + sz2) > n2) {
641 			*errnum = EINVAL;
642 			for (j = 0; (i2 + j) < n2; )
643 				u8s2[j++] = *s2++;
644 			u8s2[j] = '\0';
645 		} else {
646 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
647 			s2 += sz2;
648 		}
649 
650 		/* Now compare the two characters. */
651 		if (sz1 == 1 && sz2 == 1) {
652 			if (*u8s1 > *u8s2)
653 				return (1);
654 			if (*u8s1 < *u8s2)
655 				return (-1);
656 		} else {
657 			f = strcmp((const char *)u8s1, (const char *)u8s2);
658 			if (f != 0)
659 				return (f);
660 		}
661 
662 		/*
663 		 * They were the same. Let's move on to the next
664 		 * characters then.
665 		 */
666 		i1 += sz1;
667 		i2 += sz2;
668 	}
669 
670 	/*
671 	 * We compared until the end of either or both strings.
672 	 *
673 	 * If we reached to or went over the ends for the both, that means
674 	 * they are the same.
675 	 *
676 	 * If we reached only one of the two ends, that means the other string
677 	 * has something which then the fact can be used to determine
678 	 * the return value.
679 	 */
680 	if (i1 >= n1) {
681 		if (i2 >= n2)
682 			return (0);
683 		return (-1);
684 	}
685 	return (1);
686 }
687 
688 /*
689  * The combining_class() function checks on the given bytes and find out
690  * the corresponding Unicode combining class value. The return value 0 means
691  * it is a Starter. Any illegal UTF-8 character will also be treated as
692  * a Starter.
693  */
694 static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)695 combining_class(size_t uv, uchar_t *s, size_t sz)
696 {
697 	uint16_t b1 = 0;
698 	uint16_t b2 = 0;
699 	uint16_t b3 = 0;
700 	uint16_t b4 = 0;
701 
702 	if (sz == 1 || sz > 4)
703 		return (0);
704 
705 	if (sz == 2) {
706 		b3 = s[0];
707 		b4 = s[1];
708 	} else if (sz == 3) {
709 		b2 = s[0];
710 		b3 = s[1];
711 		b4 = s[2];
712 	} else if (sz == 4) {
713 		b1 = s[0];
714 		b2 = s[1];
715 		b3 = s[2];
716 		b4 = s[3];
717 	}
718 
719 	b1 = u8_common_b1_tbl[uv][b1];
720 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
721 		return (0);
722 
723 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
724 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
725 		return (0);
726 
727 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
728 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
729 		return (0);
730 
731 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
732 }
733 
734 /*
735  * The do_decomp() function finds out a matching decomposition if any
736  * and return. If there is no match, the input bytes are copied and returned.
737  * The function also checks if there is a Hangul, decomposes it if necessary
738  * and returns.
739  *
740  * To save time, a single byte 7-bit ASCII character should be handled by
741  * the caller.
742  *
743  * The function returns the number of bytes returned sans always terminating
744  * the null byte. It will also return a state that will tell if there was
745  * a Hangul character decomposed which then will be used by the caller.
746  */
747 static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)748 do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
749     boolean_t canonical_decomposition, u8_normalization_states_t *state)
750 {
751 	uint16_t b1 = 0;
752 	uint16_t b2 = 0;
753 	uint16_t b3 = 0;
754 	uint16_t b3_tbl;
755 	uint16_t b3_base;
756 	uint16_t b4 = 0;
757 	size_t start_id;
758 	size_t end_id;
759 	size_t i;
760 	uint32_t u1;
761 
762 	if (sz == 2) {
763 		b3 = u8s[0] = s[0];
764 		b4 = u8s[1] = s[1];
765 		u8s[2] = '\0';
766 	} else if (sz == 3) {
767 		/* Convert it to a Unicode scalar value. */
768 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
769 
770 		/*
771 		 * If this is a Hangul syllable, we decompose it into
772 		 * a leading consonant, a vowel, and an optional trailing
773 		 * consonant and then return.
774 		 */
775 		if (U8_HANGUL_SYLLABLE(u1)) {
776 			u1 -= U8_HANGUL_SYL_FIRST;
777 
778 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
779 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
780 			    / U8_HANGUL_T_COUNT;
781 			b3 = u1 % U8_HANGUL_T_COUNT;
782 
783 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
784 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
785 			if (b3) {
786 				b3 += U8_HANGUL_JAMO_T_FIRST;
787 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
788 
789 				u8s[9] = '\0';
790 				*state = U8_STATE_HANGUL_LVT;
791 				return (9);
792 			}
793 
794 			u8s[6] = '\0';
795 			*state = U8_STATE_HANGUL_LV;
796 			return (6);
797 		}
798 
799 		b2 = u8s[0] = s[0];
800 		b3 = u8s[1] = s[1];
801 		b4 = u8s[2] = s[2];
802 		u8s[3] = '\0';
803 
804 		/*
805 		 * If this is a Hangul Jamo, we know there is nothing
806 		 * further that we can decompose.
807 		 */
808 		if (U8_HANGUL_JAMO_L(u1)) {
809 			*state = U8_STATE_HANGUL_L;
810 			return (3);
811 		}
812 
813 		if (U8_HANGUL_JAMO_V(u1)) {
814 			if (*state == U8_STATE_HANGUL_L)
815 				*state = U8_STATE_HANGUL_LV;
816 			else
817 				*state = U8_STATE_HANGUL_V;
818 			return (3);
819 		}
820 
821 		if (U8_HANGUL_JAMO_T(u1)) {
822 			if (*state == U8_STATE_HANGUL_LV)
823 				*state = U8_STATE_HANGUL_LVT;
824 			else
825 				*state = U8_STATE_HANGUL_T;
826 			return (3);
827 		}
828 	} else if (sz == 4) {
829 		b1 = u8s[0] = s[0];
830 		b2 = u8s[1] = s[1];
831 		b3 = u8s[2] = s[2];
832 		b4 = u8s[3] = s[3];
833 		u8s[4] = '\0';
834 	} else {
835 		/*
836 		 * This is a fallback and should not happen if the function
837 		 * was called properly.
838 		 */
839 		u8s[0] = s[0];
840 		u8s[1] = '\0';
841 		*state = U8_STATE_START;
842 		return (1);
843 	}
844 
845 	/*
846 	 * At this point, this rountine does not know what it would get.
847 	 * The caller should sort it out if the state isn't a Hangul one.
848 	 */
849 	*state = U8_STATE_START;
850 
851 	/* Try to find matching decomposition mapping byte sequence. */
852 	b1 = u8_common_b1_tbl[uv][b1];
853 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
854 		return ((size_t)sz);
855 
856 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
857 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
858 		return ((size_t)sz);
859 
860 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
861 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
862 		return ((size_t)sz);
863 
864 	/*
865 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
866 	 * which is 0x8000, this means we couldn't fit the mappings into
867 	 * the cardinality of a unsigned byte.
868 	 */
869 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
870 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
871 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
872 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
873 	} else {
874 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
875 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
876 	}
877 
878 	/* This also means there wasn't any matching decomposition. */
879 	if (start_id >= end_id)
880 		return ((size_t)sz);
881 
882 	/*
883 	 * The final table for decomposition mappings has three types of
884 	 * byte sequences depending on whether a mapping is for compatibility
885 	 * decomposition, canonical decomposition, or both like the following:
886 	 *
887 	 * (1) Compatibility decomposition mappings:
888 	 *
889 	 *	+---+---+-...-+---+
890 	 *	| B0| B1| ... | Bm|
891 	 *	+---+---+-...-+---+
892 	 *
893 	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
894 	 *
895 	 * (2) Canonical decomposition mappings:
896 	 *
897 	 *	+---+---+---+-...-+---+
898 	 *	| T | b0| b1| ... | bn|
899 	 *	+---+---+---+-...-+---+
900 	 *
901 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
902 	 *
903 	 * (3) Both mappings:
904 	 *
905 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
906 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
907 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
908 	 *
909 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
910 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
911 	 *	compatibility mapping bytes.
912 	 *
913 	 * Note that compatibility decomposition means doing recursive
914 	 * decompositions using both compatibility decomposition mappings and
915 	 * canonical decomposition mappings. On the other hand, canonical
916 	 * decomposition means doing recursive decompositions using only
917 	 * canonical decomposition mappings. Since the table we have has gone
918 	 * through the recursions already, we do not need to do so during
919 	 * runtime, i.e., the table has been completely flattened out
920 	 * already.
921 	 */
922 
923 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
924 
925 	/* Get the type, T, of the byte sequence. */
926 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
927 
928 	/*
929 	 * If necessary, adjust start_id, end_id, or both. Note that if
930 	 * this is compatibility decomposition mapping, there is no
931 	 * adjustment.
932 	 */
933 	if (canonical_decomposition) {
934 		/* Is the mapping only for compatibility decomposition? */
935 		if (b1 < U8_DECOMP_BOTH)
936 			return ((size_t)sz);
937 
938 		start_id++;
939 
940 		if (b1 == U8_DECOMP_BOTH) {
941 			end_id = start_id +
942 			    u8_decomp_final_tbl[uv][b3_base + start_id];
943 			start_id++;
944 		}
945 	} else {
946 		/*
947 		 * Unless this is a compatibility decomposition mapping,
948 		 * we adjust the start_id.
949 		 */
950 		if (b1 == U8_DECOMP_BOTH) {
951 			start_id++;
952 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
953 		} else if (b1 == U8_DECOMP_CANONICAL) {
954 			start_id++;
955 		}
956 	}
957 
958 	for (i = 0; start_id < end_id; start_id++)
959 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
960 	u8s[i] = '\0';
961 
962 	return (i);
963 }
964 
965 /*
966  * The find_composition_start() function uses the character bytes given and
967  * find out the matching composition mappings if any and return the address
968  * to the composition mappings as explained in the do_composition().
969  */
970 static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)971 find_composition_start(size_t uv, uchar_t *s, size_t sz)
972 {
973 	uint16_t b1 = 0;
974 	uint16_t b2 = 0;
975 	uint16_t b3 = 0;
976 	uint16_t b3_tbl;
977 	uint16_t b3_base;
978 	uint16_t b4 = 0;
979 	size_t start_id;
980 	size_t end_id;
981 
982 	if (sz == 1) {
983 		b4 = s[0];
984 	} else if (sz == 2) {
985 		b3 = s[0];
986 		b4 = s[1];
987 	} else if (sz == 3) {
988 		b2 = s[0];
989 		b3 = s[1];
990 		b4 = s[2];
991 	} else if (sz == 4) {
992 		b1 = s[0];
993 		b2 = s[1];
994 		b3 = s[2];
995 		b4 = s[3];
996 	} else {
997 		/*
998 		 * This is a fallback and should not happen if the function
999 		 * was called properly.
1000 		 */
1001 		return (NULL);
1002 	}
1003 
1004 	b1 = u8_composition_b1_tbl[uv][b1];
1005 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1006 		return (NULL);
1007 
1008 	b2 = u8_composition_b2_tbl[uv][b1][b2];
1009 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1010 		return (NULL);
1011 
1012 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1013 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1014 		return (NULL);
1015 
1016 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1017 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1018 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1019 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1020 	} else {
1021 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1022 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1023 	}
1024 
1025 	if (start_id >= end_id)
1026 		return (NULL);
1027 
1028 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1029 
1030 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1031 }
1032 
1033 /*
1034  * The blocked() function checks on the combining class values of previous
1035  * characters in this sequence and return whether it is blocked or not.
1036  */
1037 static boolean_t
blocked(uchar_t * comb_class,size_t last)1038 blocked(uchar_t *comb_class, size_t last)
1039 {
1040 	uchar_t my_comb_class;
1041 	size_t i;
1042 
1043 	my_comb_class = comb_class[last];
1044 	for (i = 1; i < last; i++)
1045 		if (comb_class[i] >= my_comb_class ||
1046 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1047 			return (B_TRUE);
1048 
1049 	return (B_FALSE);
1050 }
1051 
/*
 * The do_composition() function reads the character string pointed to by
 * 's', does the necessary canonical composition, and then copies the result
 * back to 's'.
 *
 * Arguments:
 *
 *	uv		Selects the Unicode version specific tables; it is
 *			passed on to find_composition_start().
 *	s		The sequence to compose. It is read through the
 *			start[] and disp[] arrays and is overwritten with
 *			the null-terminated result.
 *	comb_class	comb_class[i] is the combining class of the i-th
 *			character of 's'.
 *	start		start[i] is the byte offset of the i-th character
 *			within 's'.
 *	disp		disp[i] is the byte length of the i-th character.
 *	last		The index of the last character (not the count).
 *	os, oslast	'*os' points at input that follows the sequence and
 *			'oslast' is its end. If the last character of the
 *			sequence is a Starter, characters at '*os' that
 *			compose with it are consumed and '*os' is advanced
 *			past them.
 *
 * The return value is the byte length of the result, not counting the
 * terminating null byte.
 *
 * The input argument 's' cannot contain more than 32 characters.
 */
static size_t
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
    uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
{
	/* 't' is the scratch buffer the result is built in; 'l' indexes it. */
	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
	/* 'tc' holds one candidate character taken from '*os'. */
	uchar_t tc[U8_MB_CUR_MAX];
	/* Indices of marks that did not compose and must be appended later. */
	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
	size_t saved_marks_count;
	uchar_t *p;
	uchar_t *saved_p;
	uchar_t *q;
	size_t i;
	size_t saved_i;
	size_t j;
	size_t k;
	size_t l;
	size_t C;
	size_t saved_l;
	size_t size;
	uint32_t u1;
	uint32_t u2;
	boolean_t match_not_found = B_TRUE;

	/*
	 * This should never happen unless the callers are doing some strange
	 * and unexpected things.
	 *
	 * The "last" is the index pointing to the last character not last + 1.
	 */
	if (last >= U8_MAX_CHARS_A_SEQ)
		last = U8_UPPER_LIMIT_IN_A_SEQ;

	for (i = l = 0; i <= last; i++) {
		/*
		 * The last or any non-Starters at the beginning, we don't
		 * have any chance to do composition and so we just copy them
		 * to the temporary buffer.
		 */
		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
			/*
			 * This label is also the target of jumps from
			 * further down in the loop body; 'i' must index the
			 * character to be copied when it is reached.
			 */
SAVE_THE_CHAR:
			p = s + start[i];
			size = disp[i];
			for (k = 0; k < size; k++)
				t[l++] = *p++;
			continue;
		}

		/*
		 * If this could be a start of Hangul Jamos, then, we try to
		 * conjoin them.
		 *
		 * Note that the second read below takes the three bytes that
		 * directly follow this character in 's', i.e., it assumes
		 * the next character is stored immediately after this one.
		 */
		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
			    s[start[i] + 1], s[start[i] + 2]);
			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
			    s[start[i] + 4], s[start[i] + 5]);

			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
				u1 -= U8_HANGUL_JAMO_L_FIRST;
				u2 -= U8_HANGUL_JAMO_V_FIRST;
				u1 = U8_HANGUL_SYL_FIRST +
				    (u1 * U8_HANGUL_V_COUNT + u2) *
				    U8_HANGUL_T_COUNT;

				/*
				 * Step over the L and the V; 'i' now indexes
				 * a possible trailing consonant (T).
				 */
				i += 2;
				if (i <= last) {
					U8_PUT_3BYTES_INTO_UTF32(u2,
					    s[start[i]], s[start[i] + 1],
					    s[start[i] + 2]);

					if (U8_HANGUL_JAMO_T(u2)) {
						u1 += u2 -
						    U8_HANGUL_JAMO_T_FIRST;
						i++;
					}
				}

				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
				/*
				 * Back up by one since the for() loop will
				 * increment 'i' again on continue.
				 */
				i--;
				l += 3;
				continue;
			}
		}

		/*
		 * Let's then find out if this Starter has composition
		 * mapping.
		 */
		p = find_composition_start(uv, s + start[i], disp[i]);
		if (p == NULL)
			goto SAVE_THE_CHAR;

		/*
		 * We have a Starter with composition mapping and the next
		 * character is a non-Starter. Let's try to find out if
		 * we can do composition.
		 */

		/*
		 * Remember where this Starter is, where its mapping entry
		 * is, and where its output begins in 't', so that we can
		 * retry with further marks and detect whether anything was
		 * composed ('l' differs from 'saved_l' only after a match).
		 */
		saved_p = p;
		saved_i = i;
		saved_l = l;
		saved_marks_count = 0;

TRY_THE_NEXT_MARK:
		q = s + start[++i];
		size = disp[i];

		/*
		 * The next for() loop compares the non-Starter pointed by
		 * 'q' with the possible (joinable) characters pointed by 'p'.
		 *
		 * The composition final table entry pointed by the 'p'
		 * looks like the following:
		 *
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 *
		 * where C is the count byte indicating the number of
		 * mapping pairs where each pair would be look like
		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
		 * character of a canonical decomposition and the B0-Bm are
		 * the bytes of a matching composite character. The F is
		 * a filler byte after each character as the separator.
		 */

		match_not_found = B_TRUE;

		for (C = *p++; C > 0; C--) {
			for (k = 0; k < size; p++, k++)
				if (*p != q[k])
					break;

			/* Have we found it? */
			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
				match_not_found = B_FALSE;

				/*
				 * Overwrite whatever was produced for this
				 * Starter so far with the composite.
				 */
				l = saved_l;

				while (*++p != U8_TBL_ELEMENT_FILLER)
					t[l++] = *p;

				break;
			}

			/*
			 * We didn't find; skip to the next pair. The first
			 * scan finishes the current b0-bn part if we stopped
			 * in the middle of it and the second scan skips the
			 * B0-Bm part.
			 */
			if (*p != U8_TBL_ELEMENT_FILLER)
				while (*++p != U8_TBL_ELEMENT_FILLER)
					;
			while (*++p != U8_TBL_ELEMENT_FILLER)
				;
			p++;
		}

		/*
		 * If there was no match, we will need to save the combining
		 * mark for later appending. After that, if the next one
		 * is a non-Starter and not blocked, then, we try once
		 * again to do composition with the next non-Starter.
		 *
		 * If there was no match and this was a Starter, then,
		 * this is a new start.
		 *
		 * If there was a match and a composition done and we have
		 * more to check on, then, we retrieve a new composition final
		 * table entry for the composite and then try to do the
		 * composition again.
		 */

		if (match_not_found) {
			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
				/*
				 * Step back to the previous character and
				 * copy it as is; the Starter at the current
				 * position is then picked up by the next
				 * loop iteration.
				 */
				i--;
				goto SAVE_THE_CHAR;
			}

			saved_marks[saved_marks_count++] = i;
		}

		if (saved_l == l) {
			/* Nothing composed yet; set aside blocked marks. */
			while (i < last) {
				if (blocked(comb_class, i + 1))
					saved_marks[saved_marks_count++] = ++i;
				else
					break;
			}
			if (i < last) {
				p = saved_p;
				goto TRY_THE_NEXT_MARK;
			}
		} else if (i < last) {
			/* Look up the mapping of the composite just made. */
			p = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (p != NULL) {
				saved_p = p;
				goto TRY_THE_NEXT_MARK;
			}
		}

		/*
		 * There is no more composition possible.
		 *
		 * If there was no composition what so ever then we copy
		 * over the original Starter and then append any non-Starters
		 * remaining at the target string sequentially after that.
		 */

		if (saved_l == l) {
			p = s + start[saved_i];
			size = disp[saved_i];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}

		for (k = 0; k < saved_marks_count; k++) {
			p = s + start[saved_marks[k]];
			size = disp[saved_marks[k]];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}
	}

	/*
	 * If the last character is a Starter and if we have a character
	 * (possibly another Starter) that can be turned into a composite,
	 * we do so and we do so until there is no more of composition
	 * possible.
	 */
	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
		p = *os;
		/* Offset in 't' where the last character begins. */
		saved_l = l - disp[last];

		while (p < oslast) {
			int8_t number_of_bytes = u8_number_of_bytes[*p];

			/*
			 * Stop at anything that is not a complete multi-byte
			 * character within the remaining input.
			 */
			if (number_of_bytes <= 1)
				break;
			size = number_of_bytes;
			if ((p + size) > oslast)
				break;

			saved_p = p;

			for (i = 0; i < size; i++)
				tc[i] = *p++;

			q = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (q == NULL) {
				p = saved_p;
				break;
			}

			match_not_found = B_TRUE;

			/*
			 * Same mapping pair scan as above, this time matching
			 * the candidate character in 'tc'.
			 */
			for (C = *q++; C > 0; C--) {
				for (k = 0; k < size; q++, k++)
					if (*q != tc[k])
						break;

				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
					match_not_found = B_FALSE;

					l = saved_l;

					while (*++q != U8_TBL_ELEMENT_FILLER) {
						/*
						 * This is practically
						 * impossible but we don't
						 * want to take any chances.
						 */
						if (l >=
						    U8_STREAM_SAFE_TEXT_MAX) {
							p = saved_p;
							goto SAFE_RETURN;
						}
						t[l++] = *q;
					}

					break;
				}

				if (*q != U8_TBL_ELEMENT_FILLER)
					while (*++q != U8_TBL_ELEMENT_FILLER)
						;
				while (*++q != U8_TBL_ELEMENT_FILLER)
					;
				q++;
			}

			/* Leave the unmatched character for the caller. */
			if (match_not_found) {
				p = saved_p;
				break;
			}
		}
SAFE_RETURN:
		/* Report how far into the following input we have consumed. */
		*os = p;
	}

	/*
	 * Now we copy over the temporary string to the target string.
	 * Since composition always reduces the number of characters or
	 * the number of characters stay, we don't need to worry about
	 * the buffer overflow here.
	 */
	for (i = 0; i < l; i++)
		s[i] = t[i];
	s[l] = '\0';

	return (l);
}
1369 
/*
 * The collect_a_seq() function examines the given string, collects one
 * sequence of characters into u8s, and returns the byte length of the
 * collected sequence. While it collects a sequence, it also applies case
 * conversion, canonical or compatibility decomposition, canonical
 * composition, or some or all of them and in that order.
 *
 * Arguments:
 *
 *	uv		Selects the Unicode version specific tables.
 *	u8s		Output buffer that receives the collected sequence.
 *	source		'*source' is where collecting starts; on return it
 *			points just past the input bytes that were consumed.
 *	slast		End of the input; bytes at or after it are not read.
 *	is_it_toupper, is_it_tolower
 *			Request the corresponding case conversion.
 *	canonical_decomposition, compatibility_decomposition
 *			Request a decomposition. The compatibility flag is
 *			only tested for being set; the canonical flag is what
 *			is passed to do_decomp().
 *	canonical_composition
 *			Request composition after the decomposition.
 *	errnum		Set to EILSEQ if the first byte is an illegal byte
 *			and to EINVAL if the first character is incomplete.
 *			It is not written to otherwise.
 *	state		Normalization state, updated while collecting.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The return value is the byte length of the sequence,
 * which can be 0 if there was nothing to collect, and it does not include
 * the terminating null byte.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
    boolean_t is_it_toupper,
    boolean_t is_it_tolower,
    boolean_t canonical_decomposition,
    boolean_t compatibility_decomposition,
    boolean_t canonical_composition,
    int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	/*
	 * Per-character bookkeeping for the collected sequence: combining
	 * class, byte length, and byte offset within u8s.
	 */
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	/*
	 * NOTE(review): u8t and tc are not referenced by name in this
	 * function body; presumably U8_SWAP_COMB_MARKS() uses them, along
	 * with other locals. Confirm against the macro before renaming or
	 * removing any local here.
	 */
	uchar_t u8t[U8_MB_CUR_MAX];
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/* An illegal byte is handed back by itself as a one-byte sequence. */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/*
		 * The character is cut short by the end of the input; the
		 * remaining bytes are handed back as they are.
		 */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			/*
			 * From here on, 'sz' is the byte length of the case
			 * converted character, not of the input character.
			 */
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please be noted that
	 * canonical composition is done only when a decomposition is
	 * done.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		if (sz == 1) {
			/* A single byte character is recorded as a Starter. */
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			/* Decompose in place; saved_sz is the new length. */
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			/* Record each character of the decomposition. */
			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/*
		 * Up to this point 'last' is the number of characters
		 * recorded so far. Remember it so that we can tell later
		 * whether the loop below has added anything.
		 */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
				/*
				 * When this label is reached through a goto,
				 * 'i' is 0 and *state is one of the Hangul
				 * states; otherwise 'i' is the combining
				 * class of the mark.
				 */
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					/*
					 * The current input character is not
					 * consumed here since 's' is left
					 * where it is.
					 */
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Keep the current count in 'k' so
					 * that it can be restored if the
					 * decomposition does not fit, and
					 * the input length in 'l' since 'sz'
					 * is reused below.
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					/* A Hangul Jamo is copied as it is. */
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 *
		 * Note that the decrement below turns 'last' from a count
		 * into the index of the last character, which is also what
		 * do_composition() expects.
		 */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		/* Without composition, we terminate the result ourselves. */
		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 *
		 * The do_composition() can consume more of the input and
		 * advance 's' in doing so.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	*source = s;

	return ((size_t)sz);
}
1723 
1724 /*
1725  * The do_norm_compare() function does string comparion based on Unicode
1726  * simple case mappings and Unicode Normalization definitions.
1727  *
1728  * It does so by collecting a sequence of character at a time and comparing
1729  * the collected sequences from the strings.
1730  *
1731  * The meanings on the return values are the same as the usual strcmp().
1732  */
1733 static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)1734 do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1735     int flag, int *errnum)
1736 {
1737 	int result;
1738 	size_t sz1;
1739 	size_t sz2;
1740 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1741 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1742 	uchar_t *s1last;
1743 	uchar_t *s2last;
1744 	boolean_t is_it_toupper;
1745 	boolean_t is_it_tolower;
1746 	boolean_t canonical_decomposition;
1747 	boolean_t compatibility_decomposition;
1748 	boolean_t canonical_composition;
1749 	u8_normalization_states_t state;
1750 
1751 	s1last = s1 + n1;
1752 	s2last = s2 + n2;
1753 
1754 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1755 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1756 	canonical_decomposition = flag & U8_CANON_DECOMP;
1757 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1758 	canonical_composition = flag & U8_CANON_COMP;
1759 
1760 	while (s1 < s1last && s2 < s2last) {
1761 		/*
1762 		 * If the current character is a 7-bit ASCII and the last
1763 		 * character, or, if the current character and the next
1764 		 * character are both some 7-bit ASCII characters then
1765 		 * we treat the current character as a sequence.
1766 		 *
1767 		 * In any other cases, we need to call collect_a_seq().
1768 		 */
1769 
1770 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1771 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1772 			if (is_it_toupper)
1773 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1774 			else if (is_it_tolower)
1775 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1776 			else
1777 				u8s1[0] = *s1;
1778 			u8s1[1] = '\0';
1779 			sz1 = 1;
1780 			s1++;
1781 		} else {
1782 			state = U8_STATE_START;
1783 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1784 			    is_it_toupper, is_it_tolower,
1785 			    canonical_decomposition,
1786 			    compatibility_decomposition,
1787 			    canonical_composition, errnum, &state);
1788 		}
1789 
1790 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1791 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1792 			if (is_it_toupper)
1793 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1794 			else if (is_it_tolower)
1795 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1796 			else
1797 				u8s2[0] = *s2;
1798 			u8s2[1] = '\0';
1799 			sz2 = 1;
1800 			s2++;
1801 		} else {
1802 			state = U8_STATE_START;
1803 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1804 			    is_it_toupper, is_it_tolower,
1805 			    canonical_decomposition,
1806 			    compatibility_decomposition,
1807 			    canonical_composition, errnum, &state);
1808 		}
1809 
1810 		/*
1811 		 * Now compare the two characters. If they are the same,
1812 		 * we move on to the next character sequences.
1813 		 */
1814 		if (sz1 == 1 && sz2 == 1) {
1815 			if (*u8s1 > *u8s2)
1816 				return (1);
1817 			if (*u8s1 < *u8s2)
1818 				return (-1);
1819 		} else {
1820 			result = strcmp((const char *)u8s1, (const char *)u8s2);
1821 			if (result != 0)
1822 				return (result);
1823 		}
1824 	}
1825 
1826 	/*
1827 	 * We compared until the end of either or both strings.
1828 	 *
1829 	 * If we reached to or went over the ends for the both, that means
1830 	 * they are the same.
1831 	 *
1832 	 * If we reached only one end, that means the other string has
1833 	 * something which then can be used to determine the return value.
1834 	 */
1835 	if (s1 >= s1last) {
1836 		if (s2 >= s2last)
1837 			return (0);
1838 		return (-1);
1839 	}
1840 	return (1);
1841 }
1842 
1843 /*
1844  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1845  * the strcmp(). For the comparison, however, Unicode Normalization specific
1846  * equivalency and Unicode simple case conversion mappings based equivalency
1847  * can be requested and checked against.
1848  */
1849 int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)1850 u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1851     int *errnum)
1852 {
1853 	int f;
1854 	size_t n1;
1855 	size_t n2;
1856 
1857 	*errnum = 0;
1858 
1859 	/*
1860 	 * Check on the requested Unicode version, case conversion, and
1861 	 * normalization flag values.
1862 	 */
1863 
1864 	if (uv > U8_UNICODE_LATEST) {
1865 		*errnum = ERANGE;
1866 		uv = U8_UNICODE_LATEST;
1867 	}
1868 
1869 	if (flag == 0) {
1870 		flag = U8_STRCMP_CS;
1871 	} else {
1872 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1873 		    U8_STRCMP_CI_LOWER);
1874 		if (f == 0) {
1875 			flag |= U8_STRCMP_CS;
1876 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1877 		    f != U8_STRCMP_CI_LOWER) {
1878 			*errnum = EBADF;
1879 			flag = U8_STRCMP_CS;
1880 		}
1881 
1882 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1883 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1884 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1885 			*errnum = EBADF;
1886 			flag = U8_STRCMP_CS;
1887 		}
1888 	}
1889 
1890 	if (flag == U8_STRCMP_CS) {
1891 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1892 	}
1893 
1894 	n1 = strlen(s1);
1895 	n2 = strlen(s2);
1896 	if (n != 0) {
1897 		if (n < n1)
1898 			n1 = n;
1899 		if (n < n2)
1900 			n2 = n;
1901 	}
1902 
1903 	/*
1904 	 * Simple case conversion can be done much faster and so we do
1905 	 * them separately here.
1906 	 */
1907 	if (flag == U8_STRCMP_CI_UPPER) {
1908 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1909 		    n1, n2, B_TRUE, errnum));
1910 	} else if (flag == U8_STRCMP_CI_LOWER) {
1911 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1912 		    n1, n2, B_FALSE, errnum));
1913 	}
1914 
1915 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1916 	    flag, errnum));
1917 }
1918 
1919 size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)1920 u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1921     int flag, size_t unicode_version, int *errnum)
1922 {
1923 	int f;
1924 	int sz;
1925 	uchar_t *ib;
1926 	uchar_t *ibtail;
1927 	uchar_t *ob;
1928 	uchar_t *obtail;
1929 	boolean_t do_not_ignore_null;
1930 	boolean_t do_not_ignore_invalid;
1931 	boolean_t is_it_toupper;
1932 	boolean_t is_it_tolower;
1933 	boolean_t canonical_decomposition;
1934 	boolean_t compatibility_decomposition;
1935 	boolean_t canonical_composition;
1936 	size_t ret_val;
1937 	size_t i;
1938 	size_t j;
1939 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1940 	u8_normalization_states_t state;
1941 
1942 	if (unicode_version > U8_UNICODE_LATEST) {
1943 		*errnum = ERANGE;
1944 		return ((size_t)-1);
1945 	}
1946 
1947 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1948 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1949 		*errnum = EBADF;
1950 		return ((size_t)-1);
1951 	}
1952 
1953 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1954 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1955 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1956 		*errnum = EBADF;
1957 		return ((size_t)-1);
1958 	}
1959 
1960 	if (inarray == NULL || *inlen == 0)
1961 		return (0);
1962 
1963 	if (outarray == NULL) {
1964 		*errnum = E2BIG;
1965 		return ((size_t)-1);
1966 	}
1967 
1968 	ib = (uchar_t *)inarray;
1969 	ob = (uchar_t *)outarray;
1970 	ibtail = ib + *inlen;
1971 	obtail = ob + *outlen;
1972 
1973 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1974 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1975 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1976 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1977 
1978 	ret_val = 0;
1979 
1980 	/*
1981 	 * If we don't have a normalization flag set, we do the simple case
1982 	 * conversion based text preparation separately below. Text
1983 	 * preparation involving Normalization will be done in the false task
1984 	 * block, again, separately since it will take much more time and
1985 	 * resource than doing simple case conversions.
1986 	 */
1987 	if (f == 0) {
1988 		while (ib < ibtail) {
1989 			if (*ib == '\0' && do_not_ignore_null)
1990 				break;
1991 
1992 			sz = u8_number_of_bytes[*ib];
1993 
1994 			if (sz < 0) {
1995 				if (do_not_ignore_invalid) {
1996 					*errnum = EILSEQ;
1997 					ret_val = (size_t)-1;
1998 					break;
1999 				}
2000 
2001 				sz = 1;
2002 				ret_val++;
2003 			}
2004 
2005 			if (sz == 1) {
2006 				if (ob >= obtail) {
2007 					*errnum = E2BIG;
2008 					ret_val = (size_t)-1;
2009 					break;
2010 				}
2011 
2012 				if (is_it_toupper)
2013 					*ob = U8_ASCII_TOUPPER(*ib);
2014 				else if (is_it_tolower)
2015 					*ob = U8_ASCII_TOLOWER(*ib);
2016 				else
2017 					*ob = *ib;
2018 				ib++;
2019 				ob++;
2020 			} else if ((ib + sz) > ibtail) {
2021 				if (do_not_ignore_invalid) {
2022 					*errnum = EINVAL;
2023 					ret_val = (size_t)-1;
2024 					break;
2025 				}
2026 
2027 				if ((obtail - ob) < (ibtail - ib)) {
2028 					*errnum = E2BIG;
2029 					ret_val = (size_t)-1;
2030 					break;
2031 				}
2032 
2033 				/*
2034 				 * We treat the remaining incomplete character
2035 				 * bytes as a character.
2036 				 */
2037 				ret_val++;
2038 
2039 				while (ib < ibtail)
2040 					*ob++ = *ib++;
2041 			} else {
2042 				if (is_it_toupper || is_it_tolower) {
2043 					i = do_case_conv(unicode_version, u8s,
2044 					    ib, sz, is_it_toupper);
2045 
2046 					if ((obtail - ob) < i) {
2047 						*errnum = E2BIG;
2048 						ret_val = (size_t)-1;
2049 						break;
2050 					}
2051 
2052 					ib += sz;
2053 
2054 					for (sz = 0; sz < i; sz++)
2055 						*ob++ = u8s[sz];
2056 				} else {
2057 					if ((obtail - ob) < sz) {
2058 						*errnum = E2BIG;
2059 						ret_val = (size_t)-1;
2060 						break;
2061 					}
2062 
2063 					for (i = 0; i < sz; i++)
2064 						*ob++ = *ib++;
2065 				}
2066 			}
2067 		}
2068 	} else {
2069 		canonical_decomposition = flag & U8_CANON_DECOMP;
2070 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2071 		canonical_composition = flag & U8_CANON_COMP;
2072 
2073 		while (ib < ibtail) {
2074 			if (*ib == '\0' && do_not_ignore_null)
2075 				break;
2076 
2077 			/*
2078 			 * If the current character is a 7-bit ASCII
2079 			 * character and it is the last character, or,
2080 			 * if the current character is a 7-bit ASCII
2081 			 * character and the next character is also a 7-bit
2082 			 * ASCII character, then, we copy over this
2083 			 * character without going through collect_a_seq().
2084 			 *
2085 			 * In all other cases, we need to look further with
2086 			 * the collect_a_seq() function.
2087 			 */
2088 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2089 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2090 				if (ob >= obtail) {
2091 					*errnum = E2BIG;
2092 					ret_val = (size_t)-1;
2093 					break;
2094 				}
2095 
2096 				if (is_it_toupper)
2097 					*ob = U8_ASCII_TOUPPER(*ib);
2098 				else if (is_it_tolower)
2099 					*ob = U8_ASCII_TOLOWER(*ib);
2100 				else
2101 					*ob = *ib;
2102 				ib++;
2103 				ob++;
2104 			} else {
2105 				*errnum = 0;
2106 				state = U8_STATE_START;
2107 
2108 				j = collect_a_seq(unicode_version, u8s,
2109 				    &ib, ibtail,
2110 				    is_it_toupper,
2111 				    is_it_tolower,
2112 				    canonical_decomposition,
2113 				    compatibility_decomposition,
2114 				    canonical_composition,
2115 				    errnum, &state);
2116 
2117 				if (*errnum && do_not_ignore_invalid) {
2118 					ret_val = (size_t)-1;
2119 					break;
2120 				}
2121 
2122 				if ((obtail - ob) < j) {
2123 					*errnum = E2BIG;
2124 					ret_val = (size_t)-1;
2125 					break;
2126 				}
2127 
2128 				for (i = 0; i < j; i++)
2129 					*ob++ = u8s[i];
2130 			}
2131 		}
2132 	}
2133 
2134 	*inlen = ibtail - ib;
2135 	*outlen = obtail - ob;
2136 
2137 	return (ret_val);
2138 }
2139