xref: /illumos-gate/usr/src/common/unicode/u8_textprep.c (revision 2671fc51ca6b63fc20cd2e852a326c63e2a958d4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2022 MNX Cloud, Inc.
28  */
29 
30 
31 /*
32  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
33  *
34  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
35  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
36  * the section 3C man pages.
37  * Interface stability: Committed.
38  */
39 
40 #include <sys/types.h>
41 #ifdef	_KERNEL
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/debug.h>
46 #include <sys/kmem.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #else
50 #include <sys/u8_textprep.h>
51 #include <strings.h>
52 #endif	/* _KERNEL */
53 #include <sys/byteorder.h>
54 #include <sys/errno.h>
55 #include <sys/u8_textprep_data.h>
56 #include <sys/sysmacros.h>
57 
58 
/* The maximum possible number of bytes in a single UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence
 * (U8_MAX_CHARS_A_SEQ) and the actual upper bound limit of a
 * combining/conjoining sequence (U8_UPPER_LIMIT_IN_A_SEQ).
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
80 
/*
 * Hangul related macros.
 *
 * The first and the last Unicode scalar values of the Hangul syllables and
 * of the Hangul Jamo leading consonants (L), vowels (V), and optional
 * trailing consonants (T).
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below and not the
 * actual U+11A8. The trailing consonant is optional, so the value has been
 * pre-calculated by subtracting one. This is also why U8_HANGUL_JAMO_T()
 * uses a strict "greater than" comparison against it.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character may be a Hangul Jamo, but a matching first byte does
 * not guarantee that it is a Hangul Jamo; it only means that it most likely
 * is one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value (b) as a three byte UTF-8 sequence into
 * (s)[(i)], (s)[(j)], and (s)[(k)].
 *
 * Note that this expands into three separate statements, each terminated
 * with a semicolon; it is not wrapped in a do/while (0). Use it only as
 * a stand-alone statement inside a brace-enclosed block, never as the
 * unbraced body of an if, else, or loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

/* True if the scalar value (u) is a Hangul Jamo leading consonant. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* True if the scalar value (u) is a Hangul Jamo vowel. */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/*
 * True if the scalar value (u) is a Hangul Jamo trailing consonant. The
 * lower bound is exclusive since U8_HANGUL_JAMO_T_FIRST is one less than
 * the first actual trailing consonant.
 */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/*
 * True if the scalar value (u) is anywhere between the first leading
 * consonant and the last trailing consonant, inclusive. This is a single
 * range check and so it also accepts values that lie between the L, V,
 * and T ranges checked by the three macros above.
 */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* True if the scalar value (u) is a precomposed Hangul syllable. */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if the state (s) is "seen L" and (u) is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* True if the state (s) is "seen LV" and (u) is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
141 
/*
 * The types of decomposition mappings; these are the possible values of
 * the leading type byte of a byte sequence in the decomposition final
 * table. See the comment in do_decomp() for the layout.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/*
 * The indicator for a 16-bit table: a table id that is greater than or
 * equal to this value refers to a 16-bit b4 table.
 */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the three bytes, (b1), (b2), and (b3), of a three byte UTF-8
 * character into the scalar value (u). The expansion is an assignment
 * statement that already carries its terminating semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
		(uint32_t)(b3) & 0x3F;

/*
 * Swap (a) and (b) using (t) as the temporary. This expands into three
 * separate statements and so it must not be used as the unbraced body of
 * an if, else, or loop.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

/* ASCII-only case mappings; any other value is returned unchanged. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True if (c), taken as an unsigned byte, is a 7-bit ASCII value. */
#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
/*
 * The following macro swaps the bytes of the two combining marks at the
 * indices (a) and (b) and their combining class and displacement entries.
 *
 * It assumes that the two characters that are to be swapped are adjacent
 * to each other and that 'a' comes before 'b'. If the assumptions are not
 * met, then, the macro will fail.
 *
 * The macro is not self-contained: it uses the variables k, u8s, u8t,
 * start, disp, comb_class, and tc from the scope in which it is expanded.
 * It also expands into multiple statements and so it must be used inside
 * a brace-enclosed block.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
182 
/*
 * The possible states during normalization. The Hangul states record which
 * parts of a Hangul syllable have been seen so far; see do_decomp() for
 * how they are set.
 */
typedef enum {
	/* Nothing special is known about the previous character. */
	U8_STATE_START = 0,
	/* A Hangul Jamo leading consonant has been seen. */
	U8_STATE_HANGUL_L = 1,
	/* A leading consonant and a vowel have been seen. */
	U8_STATE_HANGUL_LV = 2,
	/* A leading consonant, a vowel, and a trailing consonant. */
	U8_STATE_HANGUL_LVT = 3,
	/* A vowel that did not follow a leading consonant. */
	U8_STATE_HANGUL_V = 4,
	/* A trailing consonant that did not follow an LV pair. */
	U8_STATE_HANGUL_T = 5,
	/* Presumably a combining mark has been seen; set by other code. */
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;
193 
/*
 * The three vectors below are used to check that the bytes of a given
 * UTF-8 character are valid and do not contain any malformed byte values.
 *
 * We used to have a quite relaxed UTF-8 binary representation but then
 * there were some security related issues and so the Unicode Consortium
 * defined and announced the UTF-8 Corrigendum at Unicode 3.1 and then
 * refined it one more time at Unicode 3.2. The following three tables are
 * based on that.
 */

/*
 * True if (c) cannot be a third or a fourth byte of a UTF-8 character,
 * i.e., it is outside of 0x80 - 0xBF. The second byte has tighter,
 * first-byte dependent limits; see the two tables that follow.
 */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)

/* Short-hands to keep the table rows aligned; undefined after the table. */
#define	I_				U8_ILLEGAL_CHAR
#define	O_				U8_OUT_OF_RANGE_CHAR

/*
 * The byte length of a UTF-8 character indexed by its first byte. Bytes
 * that cannot start a character are marked as illegal (I_) and first bytes
 * beyond 0xF4 are marked as out of range (O_).
 */
const int8_t u8_number_of_bytes[0x100] = {
/*	00 - 7F: single byte characters */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef	I_
#undef	O_
247 
/*
 * The smallest valid second byte value indexed by the first byte of
 * a multi-byte UTF-8 character. Entries for first bytes that do not start
 * a multi-byte character (00 - C1 and F5 - FF) are zero and are not
 * meaningful. The E0 and F0 entries are raised to 0xa0 and 0x90,
 * respectively, as in the well-formed UTF-8 byte sequence table of
 * the Unicode Standard.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
/*	00 - BF: not a first byte of a multi-byte character */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
289 
/*
 * The largest valid second byte value indexed by the first byte of
 * a multi-byte UTF-8 character. Entries for first bytes that do not start
 * a multi-byte character (00 - C1 and F5 - FF) are zero and are not
 * meaningful. The ED and F4 entries are lowered to 0x9f and 0x8f,
 * respectively, as in the well-formed UTF-8 byte sequence table of
 * the Unicode Standard.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
/*	00 - BF: not a first byte of a multi-byte character */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
331 
332 
333 /*
334  * The u8_validate() validates on the given UTF-8 character string and
335  * calculate the byte length. It is quite similar to mblen(3C) except that
336  * this will validate against the list of characters if required and
337  * specific to UTF-8 and Unicode.
338  */
339 int
340 u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
341 {
342 	uchar_t *ib;
343 	uchar_t *ibtail;
344 	uchar_t **p;
345 	uchar_t *s1;
346 	uchar_t *s2;
347 	uchar_t f;
348 	int sz;
349 	size_t i;
350 	int ret_val;
351 	boolean_t second;
352 	boolean_t no_need_to_validate_entire;
353 	boolean_t check_additional;
354 	boolean_t validate_ucs2_range_only;
355 
356 	if (! u8str)
357 		return (0);
358 
359 	ib = (uchar_t *)u8str;
360 	ibtail = ib + n;
361 
362 	ret_val = 0;
363 
364 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
365 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
366 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
367 
368 	while (ib < ibtail) {
369 		/*
370 		 * The first byte of a UTF-8 character tells how many
371 		 * bytes will follow for the character. If the first byte
372 		 * is an illegal byte value or out of range value, we just
373 		 * return -1 with an appropriate error number.
374 		 */
375 		sz = u8_number_of_bytes[*ib];
376 		if (sz == U8_ILLEGAL_CHAR) {
377 			*errnum = EILSEQ;
378 			return (-1);
379 		}
380 
381 		if (sz == U8_OUT_OF_RANGE_CHAR ||
382 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
383 			*errnum = ERANGE;
384 			return (-1);
385 		}
386 
387 		/*
388 		 * If we don't have enough bytes to check on, that's also
389 		 * an error. As you can see, we give illegal byte sequence
390 		 * checking higher priority then EINVAL cases.
391 		 */
392 		if ((ibtail - ib) < sz) {
393 			*errnum = EINVAL;
394 			return (-1);
395 		}
396 
397 		if (sz == 1) {
398 			ib++;
399 			ret_val++;
400 		} else {
401 			/*
402 			 * Check on the multi-byte UTF-8 character. For more
403 			 * details on this, see comment added for the used
404 			 * data structures at the beginning of the file.
405 			 */
406 			f = *ib++;
407 			ret_val++;
408 			second = B_TRUE;
409 			for (i = 1; i < sz; i++) {
410 				if (second) {
411 					if (*ib < u8_valid_min_2nd_byte[f] ||
412 					    *ib > u8_valid_max_2nd_byte[f]) {
413 						*errnum = EILSEQ;
414 						return (-1);
415 					}
416 					second = B_FALSE;
417 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
418 					*errnum = EILSEQ;
419 					return (-1);
420 				}
421 				ib++;
422 				ret_val++;
423 			}
424 		}
425 
426 		if (check_additional) {
427 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
428 				s1 = ib - sz;
429 				s2 = p[i];
430 				while (s1 < ib) {
431 					if (*s1 != *s2 || *s2 == '\0')
432 						break;
433 					s1++;
434 					s2++;
435 				}
436 
437 				if (s1 >= ib && *s2 == '\0') {
438 					*errnum = EBADF;
439 					return (-1);
440 				}
441 			}
442 		}
443 
444 		if (no_need_to_validate_entire)
445 			break;
446 	}
447 
448 	return (ret_val);
449 }
450 
451 /*
452  * The do_case_conv() looks at the mapping tables and returns found
453  * bytes if any. If not found, the input bytes are returned. The function
454  * always terminate the return bytes with a null character assuming that
455  * there are plenty of room to do so.
456  *
457  * The case conversions are simple case conversions mapping a character to
458  * another character as specified in the Unicode data. The byte size of
459  * the mapped character could be different from that of the input character.
460  *
461  * The return value is the byte length of the returned character excluding
462  * the terminating null byte.
463  */
464 static size_t
465 do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
466 {
467 	size_t i;
468 	uint16_t b1 = 0;
469 	uint16_t b2 = 0;
470 	uint16_t b3 = 0;
471 	uint16_t b3_tbl;
472 	uint16_t b3_base;
473 	uint16_t b4 = 0;
474 	size_t start_id;
475 	size_t end_id;
476 
477 	/*
478 	 * At this point, the only possible values for sz are 2, 3, and 4.
479 	 * The u8s should point to a vector that is well beyond the size of
480 	 * 5 bytes.
481 	 */
482 	if (sz == 2) {
483 		b3 = u8s[0] = s[0];
484 		b4 = u8s[1] = s[1];
485 	} else if (sz == 3) {
486 		b2 = u8s[0] = s[0];
487 		b3 = u8s[1] = s[1];
488 		b4 = u8s[2] = s[2];
489 	} else if (sz == 4) {
490 		b1 = u8s[0] = s[0];
491 		b2 = u8s[1] = s[1];
492 		b3 = u8s[2] = s[2];
493 		b4 = u8s[3] = s[3];
494 	} else {
495 		/* This is not possible but just in case as a fallback. */
496 		if (is_it_toupper)
497 			*u8s = U8_ASCII_TOUPPER(*s);
498 		else
499 			*u8s = U8_ASCII_TOLOWER(*s);
500 		u8s[1] = '\0';
501 
502 		return (1);
503 	}
504 	u8s[sz] = '\0';
505 
506 	/*
507 	 * Let's find out if we have a corresponding character.
508 	 */
509 	b1 = u8_common_b1_tbl[uv][b1];
510 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
511 		return ((size_t)sz);
512 
513 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
514 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
515 		return ((size_t)sz);
516 
517 	if (is_it_toupper) {
518 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
519 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
520 			return ((size_t)sz);
521 
522 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
523 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
524 
525 		/* Either there is no match or an error at the table. */
526 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
527 			return ((size_t)sz);
528 
529 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
530 
531 		for (i = 0; start_id < end_id; start_id++)
532 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
533 	} else {
534 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
535 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
536 			return ((size_t)sz);
537 
538 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
539 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
540 
541 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
542 			return ((size_t)sz);
543 
544 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
545 
546 		for (i = 0; start_id < end_id; start_id++)
547 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
548 	}
549 
550 	/*
551 	 * If i is still zero, that means there is no corresponding character.
552 	 */
553 	if (i == 0)
554 		return ((size_t)sz);
555 
556 	u8s[i] = '\0';
557 
558 	return (i);
559 }
560 
561 /*
562  * The do_case_compare() function compares the two input strings, s1 and s2,
563  * one character at a time doing case conversions if applicable and return
564  * the comparison result as like strcmp().
565  *
566  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
567  * we treat the 7-bit ASCII characters as a special case trying to yield
568  * faster processing time.
569  */
570 static int
571 do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
572     size_t n2, boolean_t is_it_toupper, int *errnum)
573 {
574 	int f;
575 	int sz1;
576 	int sz2;
577 	size_t j;
578 	size_t i1;
579 	size_t i2;
580 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
581 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
582 
583 	i1 = i2 = 0;
584 	while (i1 < n1 && i2 < n2) {
585 		/*
586 		 * Find out what would be the byte length for this UTF-8
587 		 * character at string s1 and also find out if this is
588 		 * an illegal start byte or not and if so, issue a proper
589 		 * error number and yet treat this byte as a character.
590 		 */
591 		sz1 = u8_number_of_bytes[*s1];
592 		if (sz1 < 0) {
593 			*errnum = EILSEQ;
594 			sz1 = 1;
595 		}
596 
597 		/*
598 		 * For 7-bit ASCII characters mainly, we do a quick case
599 		 * conversion right at here.
600 		 *
601 		 * If we don't have enough bytes for this character, issue
602 		 * an EINVAL error and use what are available.
603 		 *
604 		 * If we have enough bytes, find out if there is
605 		 * a corresponding uppercase character and if so, copy over
606 		 * the bytes for a comparison later. If there is no
607 		 * corresponding uppercase character, then, use what we have
608 		 * for the comparison.
609 		 */
610 		if (sz1 == 1) {
611 			if (is_it_toupper)
612 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
613 			else
614 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
615 			s1++;
616 			u8s1[1] = '\0';
617 		} else if ((i1 + sz1) > n1) {
618 			*errnum = EINVAL;
619 			for (j = 0; (i1 + j) < n1; )
620 				u8s1[j++] = *s1++;
621 			u8s1[j] = '\0';
622 		} else {
623 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
624 			s1 += sz1;
625 		}
626 
627 		/* Do the same for the string s2. */
628 		sz2 = u8_number_of_bytes[*s2];
629 		if (sz2 < 0) {
630 			*errnum = EILSEQ;
631 			sz2 = 1;
632 		}
633 
634 		if (sz2 == 1) {
635 			if (is_it_toupper)
636 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
637 			else
638 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
639 			s2++;
640 			u8s2[1] = '\0';
641 		} else if ((i2 + sz2) > n2) {
642 			*errnum = EINVAL;
643 			for (j = 0; (i2 + j) < n2; )
644 				u8s2[j++] = *s2++;
645 			u8s2[j] = '\0';
646 		} else {
647 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
648 			s2 += sz2;
649 		}
650 
651 		/* Now compare the two characters. */
652 		if (sz1 == 1 && sz2 == 1) {
653 			if (*u8s1 > *u8s2)
654 				return (1);
655 			if (*u8s1 < *u8s2)
656 				return (-1);
657 		} else {
658 			f = strcmp((const char *)u8s1, (const char *)u8s2);
659 			if (f != 0)
660 				return (f);
661 		}
662 
663 		/*
664 		 * They were the same. Let's move on to the next
665 		 * characters then.
666 		 */
667 		i1 += sz1;
668 		i2 += sz2;
669 	}
670 
671 	/*
672 	 * We compared until the end of either or both strings.
673 	 *
674 	 * If we reached to or went over the ends for the both, that means
675 	 * they are the same.
676 	 *
677 	 * If we reached only one of the two ends, that means the other string
678 	 * has something which then the fact can be used to determine
679 	 * the return value.
680 	 */
681 	if (i1 >= n1) {
682 		if (i2 >= n2)
683 			return (0);
684 		return (-1);
685 	}
686 	return (1);
687 }
688 
689 /*
690  * The combining_class() function checks on the given bytes and find out
691  * the corresponding Unicode combining class value. The return value 0 means
692  * it is a Starter. Any illegal UTF-8 character will also be treated as
693  * a Starter.
694  */
695 static uchar_t
696 combining_class(size_t uv, uchar_t *s, size_t sz)
697 {
698 	uint16_t b1 = 0;
699 	uint16_t b2 = 0;
700 	uint16_t b3 = 0;
701 	uint16_t b4 = 0;
702 
703 	if (sz == 1 || sz > 4)
704 		return (0);
705 
706 	if (sz == 2) {
707 		b3 = s[0];
708 		b4 = s[1];
709 	} else if (sz == 3) {
710 		b2 = s[0];
711 		b3 = s[1];
712 		b4 = s[2];
713 	} else if (sz == 4) {
714 		b1 = s[0];
715 		b2 = s[1];
716 		b3 = s[2];
717 		b4 = s[3];
718 	}
719 
720 	b1 = u8_common_b1_tbl[uv][b1];
721 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
722 		return (0);
723 
724 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
725 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
726 		return (0);
727 
728 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
729 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
730 		return (0);
731 
732 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
733 }
734 
/*
 * The do_decomp() function finds a matching decomposition, if any, for
 * the character of sz bytes at s and stores it into u8s. If there is no
 * match, the input bytes are copied into u8s instead. The function also
 * checks if the character is a Hangul syllable and, if so, decomposes it
 * into Hangul Jamos arithmetically without using the tables.
 *
 * To save time, a single byte 7-bit ASCII character should be handled by
 * the caller.
 *
 * Arguments:
 *	uv	The first index of the mapping tables, i.e., presumably
 *		the Unicode version to use.
 *	u8s	The output buffer. It must have room for the longest
 *		mapping in the tables plus a null byte, and for at least
 *		ten bytes for a decomposed Hangul syllable.
 *	s	The input character.
 *	sz	The byte length of the input character: 2, 3, or 4.
 *	canonical_decomposition
 *		B_TRUE for canonical decomposition only; B_FALSE for
 *		compatibility decomposition.
 *	state	On input, the state left by the previous character; it is
 *		consulted for Hangul Jamo vowels and trailing consonants.
 *		On output, a Hangul state if a Hangul syllable or Jamo was
 *		seen; U8_STATE_START otherwise.
 *
 * The function returns the number of bytes stored into u8s, not counting
 * the null byte that always terminates them.
 */
static size_t
do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
    boolean_t canonical_decomposition, u8_normalization_states_t *state)
{
	uint16_t b1 = 0;
	uint16_t b2 = 0;
	uint16_t b3 = 0;
	uint16_t b3_tbl;
	uint16_t b3_base;
	uint16_t b4 = 0;
	size_t start_id;
	size_t end_id;
	size_t i;
	uint32_t u1;

	/*
	 * Copy the input over and, at the same time, right align the bytes
	 * into b1 to b4 which will be used as the table indices later.
	 */
	if (sz == 2) {
		b3 = u8s[0] = s[0];
		b4 = u8s[1] = s[1];
		u8s[2] = '\0';
	} else if (sz == 3) {
		/* Convert it to a Unicode scalar value. */
		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);

		/*
		 * If this is a Hangul syllable, we decompose it into
		 * a leading consonant, a vowel, and an optional trailing
		 * consonant and then return.
		 *
		 * Within this branch, b1, b2, and b3 temporarily hold
		 * the scalar values of the Jamos and not table indices.
		 */
		if (U8_HANGUL_SYLLABLE(u1)) {
			u1 -= U8_HANGUL_SYL_FIRST;

			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
			    / U8_HANGUL_T_COUNT;
			b3 = u1 % U8_HANGUL_T_COUNT;

			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
			/* A zero b3 means there is no trailing consonant. */
			if (b3) {
				b3 += U8_HANGUL_JAMO_T_FIRST;
				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);

				u8s[9] = '\0';
				*state = U8_STATE_HANGUL_LVT;
				return (9);
			}

			u8s[6] = '\0';
			*state = U8_STATE_HANGUL_LV;
			return (6);
		}

		b2 = u8s[0] = s[0];
		b3 = u8s[1] = s[1];
		b4 = u8s[2] = s[2];
		u8s[3] = '\0';

		/*
		 * If this is a Hangul Jamo, we know there is nothing
		 * further that we can decompose. We just keep track of
		 * how far the current L, V, and T sequence has got by
		 * using the state that the previous character left.
		 */
		if (U8_HANGUL_JAMO_L(u1)) {
			*state = U8_STATE_HANGUL_L;
			return (3);
		}

		if (U8_HANGUL_JAMO_V(u1)) {
			if (*state == U8_STATE_HANGUL_L)
				*state = U8_STATE_HANGUL_LV;
			else
				*state = U8_STATE_HANGUL_V;
			return (3);
		}

		if (U8_HANGUL_JAMO_T(u1)) {
			if (*state == U8_STATE_HANGUL_LV)
				*state = U8_STATE_HANGUL_LVT;
			else
				*state = U8_STATE_HANGUL_T;
			return (3);
		}
	} else if (sz == 4) {
		b1 = u8s[0] = s[0];
		b2 = u8s[1] = s[1];
		b3 = u8s[2] = s[2];
		b4 = u8s[3] = s[3];
		u8s[4] = '\0';
	} else {
		/*
		 * This is a fallback and should not happen if the function
		 * was called properly.
		 */
		u8s[0] = s[0];
		u8s[1] = '\0';
		*state = U8_STATE_START;
		return (1);
	}

	/*
	 * At this point, this routine does not know what it would get.
	 * The caller should sort it out if the state isn't a Hangul one.
	 */
	*state = U8_STATE_START;

	/*
	 * Try to find a matching decomposition mapping byte sequence. Any
	 * undefined table element on the way means that there is no
	 * mapping and so the copy of the input made above is the result.
	 */
	b1 = u8_common_b1_tbl[uv][b1];
	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	b2 = u8_decomp_b2_tbl[uv][b1][b2];
	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
		return ((size_t)sz);

	/*
	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
	 * which is 0x8000, this means we couldn't fit the mappings into
	 * the cardinality of an unsigned byte and so the 16-bit table is
	 * used instead.
	 *
	 * NOTE(review): only the 8-bit table id is range checked below;
	 * the 16-bit table id is trusted as it is. Confirm whether that is
	 * intended.
	 */
	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
	} else {
		if (b3_tbl >= ARRAY_SIZE(u8_decomp_b4_tbl[uv]))
			return ((size_t)sz);
		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
	}

	/* This also means there wasn't any matching decomposition. */
	if (start_id >= end_id)
		return ((size_t)sz);

	/*
	 * The final table for decomposition mappings has three types of
	 * byte sequences depending on whether a mapping is for compatibility
	 * decomposition, canonical decomposition, or both like the following:
	 *
	 * (1) Compatibility decomposition mappings:
	 *
	 *	+---+---+-...-+---+
	 *	| B0| B1| ... | Bm|
	 *	+---+---+-...-+---+
	 *
	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
	 *
	 * (2) Canonical decomposition mappings:
	 *
	 *	+---+---+---+-...-+---+
	 *	| T | b0| b1| ... | bn|
	 *	+---+---+---+-...-+---+
	 *
	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
	 *
	 * (3) Both mappings:
	 *
	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
	 *
	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
	 *	compatibility mapping bytes.
	 *
	 * Note that compatibility decomposition means doing recursive
	 * decompositions using both compatibility decomposition mappings and
	 * canonical decomposition mappings. On the other hand, canonical
	 * decomposition means doing recursive decompositions using only
	 * canonical decomposition mappings. Since the table we have has gone
	 * through the recursions already, we do not need to do so during
	 * runtime, i.e., the table has been completely flattened out
	 * already.
	 */

	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;

	/*
	 * Get the type, T, of the byte sequence. Note that b1 is reused
	 * here to hold the type; it is no longer a table index.
	 */
	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];

	/*
	 * If necessary, adjust start_id, end_id, or both. Note that if
	 * this is compatibility decomposition mapping, there is no
	 * adjustment.
	 */
	if (canonical_decomposition) {
		/* Is the mapping only for compatibility decomposition? */
		if (b1 < U8_DECOMP_BOTH)
			return ((size_t)sz);

		/* Skip the type byte, T. */
		start_id++;

		/*
		 * For the both type, the canonical mapping bytes end where
		 * the displacement byte, D, says, counted from D itself.
		 * Then skip D as well.
		 */
		if (b1 == U8_DECOMP_BOTH) {
			end_id = start_id +
			    u8_decomp_final_tbl[uv][b3_base + start_id];
			start_id++;
		}
	} else {
		/*
		 * Unless this is a compatibility decomposition mapping,
		 * we adjust the start_id: for the both type, skip T and
		 * then jump over the canonical mapping bytes by using D;
		 * for the canonical type, skip T only.
		 */
		if (b1 == U8_DECOMP_BOTH) {
			start_id++;
			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
		} else if (b1 == U8_DECOMP_CANONICAL) {
			start_id++;
		}
	}

	/* Overwrite the copy of the input with the mapping bytes. */
	for (i = 0; start_id < end_id; start_id++)
		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
	u8s[i] = '\0';

	return (i);
}
967 
968 /*
969  * The find_composition_start() function uses the character bytes given and
970  * find out the matching composition mappings if any and return the address
971  * to the composition mappings as explained in the do_composition().
972  */
973 static uchar_t *
974 find_composition_start(size_t uv, uchar_t *s, size_t sz)
975 {
976 	uint16_t b1 = 0;
977 	uint16_t b2 = 0;
978 	uint16_t b3 = 0;
979 	uint16_t b3_tbl;
980 	uint16_t b3_base;
981 	uint16_t b4 = 0;
982 	size_t start_id;
983 	size_t end_id;
984 
985 	if (sz == 1) {
986 		b4 = s[0];
987 	} else if (sz == 2) {
988 		b3 = s[0];
989 		b4 = s[1];
990 	} else if (sz == 3) {
991 		b2 = s[0];
992 		b3 = s[1];
993 		b4 = s[2];
994 	} else if (sz == 4) {
995 		b1 = s[0];
996 		b2 = s[1];
997 		b3 = s[2];
998 		b4 = s[3];
999 	} else {
1000 		/*
1001 		 * This is a fallback and should not happen if the function
1002 		 * was called properly.
1003 		 */
1004 		return (NULL);
1005 	}
1006 
1007 	b1 = u8_composition_b1_tbl[uv][b1];
1008 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1009 		return (NULL);
1010 
1011 	b2 = u8_composition_b2_tbl[uv][b1][b2];
1012 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1013 		return (NULL);
1014 
1015 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1016 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1017 		return (NULL);
1018 
1019 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1020 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1021 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1022 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1023 	} else {
1024 		if (b3_tbl >= ARRAY_SIZE(u8_composition_b4_tbl[uv]))
1025 			return (NULL);
1026 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1027 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1028 	}
1029 
1030 	if (start_id >= end_id)
1031 		return (NULL);
1032 
1033 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1034 
1035 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1036 }
1037 
1038 /*
1039  * The blocked() function checks on the combining class values of previous
1040  * characters in this sequence and return whether it is blocked or not.
1041  */
1042 static boolean_t
1043 blocked(uchar_t *comb_class, size_t last)
1044 {
1045 	uchar_t my_comb_class;
1046 	size_t i;
1047 
1048 	my_comb_class = comb_class[last];
1049 	for (i = 1; i < last; i++)
1050 		if (comb_class[i] >= my_comb_class ||
1051 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1052 			return (B_TRUE);
1053 
1054 	return (B_FALSE);
1055 }
1056 
/*
 * The do_composition() reads the character string pointed by 's' and
 * do necessary canonical composition and then copy over the result back to
 * the 's'.
 *
 * The input argument 's' cannot contain more than 32 characters.
 *
 * Arguments as used by the code below:
 *
 *	uv		selects the Unicode version of the composition tables.
 *	s		the bytes of the collected sequence; it is overwritten
 *			with the composed result and terminated with a null
 *			byte.
 *	comb_class[i]	the combining class of the i-th character of 's'.
 *	start[i]	the byte offset of the i-th character within 's'.
 *	disp[i]		the byte length of the i-th character.
 *	last		the index of the last character (not the count).
 *	os, oslast	'*os' points at the not yet collected source bytes and
 *			'oslast' at their end. If the last character of 's' is
 *			a Starter, the characters at '*os' are consumed for as
 *			long as they compose with it and '*os' is advanced past
 *			the consumed ones.
 *
 * The return value is the byte length of the result stored at 's', not
 * counting the terminating null byte.
 */
static size_t
do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
    uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
{
	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc[U8_MB_CUR_MAX];
	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
	size_t saved_marks_count;
	uchar_t *p;
	uchar_t *saved_p;
	uchar_t *q;
	size_t i;
	size_t saved_i;
	size_t j;
	size_t k;
	size_t l;
	size_t C;
	size_t saved_l;
	size_t size;
	uint32_t u1;
	uint32_t u2;
	boolean_t match_not_found = B_TRUE;

	/*
	 * This should never happen unless the callers are doing some strange
	 * and unexpected things.
	 *
	 * The "last" is the index pointing to the last character not last + 1.
	 */
	if (last >= U8_MAX_CHARS_A_SEQ)
		last = U8_UPPER_LIMIT_IN_A_SEQ;

	/*
	 * 'i' walks the characters of 's' and 'l' is the number of bytes
	 * written to the temporary buffer t[] so far.
	 */
	for (i = l = 0; i <= last; i++) {
		/*
		 * The last or any non-Starters at the beginning, we don't
		 * have any chance to do composition and so we just copy them
		 * to the temporary buffer.
		 */
		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
SAVE_THE_CHAR:
			p = s + start[i];
			size = disp[i];
			for (k = 0; k < size; k++)
				t[l++] = *p++;
			continue;
		}

		/*
		 * If this could be a start of Hangul Jamos, then, we try to
		 * conjoin them.
		 */
		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
			/*
			 * u1 is this character and u2 is made of the three
			 * bytes that follow it. Since i < last here, at least
			 * one more character follows.
			 */
			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
			    s[start[i] + 1], s[start[i] + 2]);
			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
			    s[start[i] + 4], s[start[i] + 5]);

			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
				u1 -= U8_HANGUL_JAMO_L_FIRST;
				u2 -= U8_HANGUL_JAMO_V_FIRST;
				u1 = U8_HANGUL_SYL_FIRST +
				    (u1 * U8_HANGUL_V_COUNT + u2) *
				    U8_HANGUL_T_COUNT;

				/*
				 * Move to the character after the V and, if
				 * there is one and it is a T, add it to the
				 * syllable as well.
				 */
				i += 2;
				if (i <= last) {
					U8_PUT_3BYTES_INTO_UTF32(u2,
					    s[start[i]], s[start[i] + 1],
					    s[start[i] + 2]);

					if (U8_HANGUL_JAMO_T(u2)) {
						u1 += u2 -
						    U8_HANGUL_JAMO_T_FIRST;
						i++;
					}
				}

				/*
				 * Back up by one since the for() loop will
				 * increment 'i' again on the continue.
				 */
				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
				i--;
				l += 3;
				continue;
			}
		}

		/*
		 * Let's then find out if this Starter has composition
		 * mapping.
		 */
		p = find_composition_start(uv, s + start[i], disp[i]);
		if (p == NULL)
			goto SAVE_THE_CHAR;

		/*
		 * We have a Starter with composition mapping and the next
		 * character is a non-Starter. Let's try to find out if
		 * we can do composition.
		 */

		/*
		 * Remember where this Starter is so that we can either restart
		 * the table scan for another mark (saved_p), copy the original
		 * Starter if nothing composes (saved_i), or overwrite the
		 * output with a composite (saved_l).
		 */
		saved_p = p;
		saved_i = i;
		saved_l = l;
		saved_marks_count = 0;

TRY_THE_NEXT_MARK:
		q = s + start[++i];
		size = disp[i];

		/*
		 * The next for() loop compares the non-Starter pointed by
		 * 'q' with the possible (joinable) characters pointed by 'p'.
		 *
		 * The composition final table entry pointed by the 'p'
		 * looks like the following:
		 *
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
		 *
		 * where C is the count byte indicating the number of
		 * mapping pairs where each pair would be look like
		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
		 * character of a canonical decomposition and the B0-Bm are
		 * the bytes of a matching composite character. The F is
		 * a filler byte after each character as the separator.
		 */

		match_not_found = B_TRUE;

		for (C = *p++; C > 0; C--) {
			for (k = 0; k < size; p++, k++)
				if (*p != q[k])
					break;

			/* Have we found it? */
			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
				match_not_found = B_FALSE;

				/*
				 * Replace whatever has been written for this
				 * Starter with the bytes of the composite.
				 */
				l = saved_l;

				while (*++p != U8_TBL_ELEMENT_FILLER)
					t[l++] = *p;

				break;
			}

			/*
			 * We didn't find; skip to the next pair. That is,
			 * move to the filler closing the first character of
			 * this pair unless we are already there, then to
			 * the filler closing the composite, and finally step
			 * onto the first byte of the next pair.
			 */
			if (*p != U8_TBL_ELEMENT_FILLER)
				while (*++p != U8_TBL_ELEMENT_FILLER)
					;
			while (*++p != U8_TBL_ELEMENT_FILLER)
				;
			p++;
		}

		/*
		 * If there was no match, we will need to save the combining
		 * mark for later appending. After that, if the next one
		 * is a non-Starter and not blocked, then, we try once
		 * again to do composition with the next non-Starter.
		 *
		 * If there was no match and this was a Starter, then,
		 * this is a new start.
		 *
		 * If there was a match and a composition done and we have
		 * more to check on, then, we retrieve a new composition final
		 * table entry for the composite and then try to do the
		 * composition again.
		 */

		if (match_not_found) {
			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
				i--;
				goto SAVE_THE_CHAR;
			}

			saved_marks[saved_marks_count++] = i;
		}

		/*
		 * saved_l == l means that no composite has been written for
		 * this Starter yet.
		 */
		if (saved_l == l) {
			/* Set aside the following marks that are blocked. */
			while (i < last) {
				if (blocked(comb_class, i + 1))
					saved_marks[saved_marks_count++] = ++i;
				else
					break;
			}
			if (i < last) {
				p = saved_p;
				goto TRY_THE_NEXT_MARK;
			}
		} else if (i < last) {
			p = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (p != NULL) {
				saved_p = p;
				goto TRY_THE_NEXT_MARK;
			}
		}

		/*
		 * There is no more composition possible.
		 *
		 * If there was no composition what so ever then we copy
		 * over the original Starter and then append any non-Starters
		 * remaining at the target string sequentially after that.
		 */

		if (saved_l == l) {
			p = s + start[saved_i];
			size = disp[saved_i];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}

		for (k = 0; k < saved_marks_count; k++) {
			p = s + start[saved_marks[k]];
			size = disp[saved_marks[k]];
			for (j = 0; j < size; j++)
				t[l++] = *p++;
		}
	}

	/*
	 * If the last character is a Starter and if we have a character
	 * (possibly another Starter) that can be turned into a composite,
	 * we do so and we do so until there is no more of composition
	 * possible.
	 */
	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
		p = *os;
		/* The offset in t[] where the last character begins. */
		saved_l = l - disp[last];

		while (p < oslast) {
			int8_t number_of_bytes = u8_number_of_bytes[*p];

			/*
			 * Stop at anything that is not a complete multi-byte
			 * character within the remaining source bytes.
			 */
			if (number_of_bytes <= 1)
				break;
			size = number_of_bytes;
			if ((p + size) > oslast)
				break;

			saved_p = p;

			/* Take a copy of the candidate character into tc[]. */
			for (i = 0; i < size; i++)
				tc[i] = *p++;

			q = find_composition_start(uv, t + saved_l,
			    l - saved_l);
			if (q == NULL) {
				p = saved_p;
				break;
			}

			match_not_found = B_TRUE;

			/*
			 * Scan the mapping pairs in the same way as the for()
			 * loop after the TRY_THE_NEXT_MARK label does.
			 */
			for (C = *q++; C > 0; C--) {
				for (k = 0; k < size; q++, k++)
					if (*q != tc[k])
						break;

				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
					match_not_found = B_FALSE;

					l = saved_l;

					while (*++q != U8_TBL_ELEMENT_FILLER) {
						/*
						 * This is practically
						 * impossible but we don't
						 * want to take any chances.
						 */
						if (l >=
						    U8_STREAM_SAFE_TEXT_MAX) {
							p = saved_p;
							goto SAFE_RETURN;
						}
						t[l++] = *q;
					}

					break;
				}

				if (*q != U8_TBL_ELEMENT_FILLER)
					while (*++q != U8_TBL_ELEMENT_FILLER)
						;
				while (*++q != U8_TBL_ELEMENT_FILLER)
					;
				q++;
			}

			/* Leave the character that did not compose unread. */
			if (match_not_found) {
				p = saved_p;
				break;
			}
		}
SAFE_RETURN:
		/* Tell the caller how far into the source we have consumed. */
		*os = p;
	}

	/*
	 * Now we copy over the temporary string to the target string.
	 * Since composition always reduces the number of characters or
	 * the number of characters stay, we don't need to worry about
	 * the buffer overflow here.
	 */
	for (i = 0; i < l; i++)
		s[i] = t[i];
	s[l] = '\0';

	return (l);
}
1374 
/*
 * The collect_a_seq() function checks on the given string s, collect
 * a sequence of characters at u8s, and return the sequence. While it collects
 * a sequence, it also applies case conversion, canonical or compatibility
 * decomposition, canonical composition, or some or all of them and
 * in that order.
 *
 * The collected sequence cannot be bigger than 32 characters since if
 * it is having more than 31 characters, the sequence will be terminated
 * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
 * a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte and the return value is the byte length of the sequence,
 * which can be 0. The return value does not include the terminating
 * null byte.
 *
 * On entry, '*source' points at the first byte to examine and 'slast' at
 * the end of the source string. On return, '*source' has been advanced past
 * the bytes that were consumed.
 *
 * The '*errnum' is set to EILSEQ if the first byte is an illegal byte and
 * to EINVAL if the first character is incomplete; in both cases the
 * offending bytes are copied to u8s as they are. The '*errnum' is not
 * touched otherwise and thus callers are expected to clear it beforehand.
 *
 * The '*state' is read and updated while characters are collected; the
 * callers in this file set it to U8_STATE_START before each call.
 */
static size_t
collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
    boolean_t is_it_toupper,
    boolean_t is_it_tolower,
    boolean_t canonical_decomposition,
    boolean_t compatibility_decomposition,
    boolean_t canonical_composition,
    int *errnum, u8_normalization_states_t *state)
{
	uchar_t *s;
	int sz;
	int saved_sz;
	size_t i;
	size_t j;
	size_t k;
	size_t l;
	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
	uchar_t disp[U8_MAX_CHARS_A_SEQ];
	uchar_t start[U8_MAX_CHARS_A_SEQ];
	uchar_t u8t[U8_MB_CUR_MAX];
	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
	uchar_t tc;
	size_t last;
	size_t saved_last;
	uint32_t u1;

	/*
	 * Save the source string pointer which we will return a changed
	 * pointer if we do processing.
	 */
	s = *source;

	/*
	 * The following is a fallback for just in case callers are not
	 * checking the string boundaries before the calling.
	 */
	if (s >= slast) {
		u8s[0] = '\0';

		return (0);
	}

	/*
	 * As the first thing, let's collect a character and do case
	 * conversion if necessary.
	 */

	sz = u8_number_of_bytes[*s];

	/* An illegal byte is returned alone as a one byte sequence. */
	if (sz < 0) {
		*errnum = EILSEQ;

		u8s[0] = *s++;
		u8s[1] = '\0';

		*source = s;

		return (1);
	}

	if (sz == 1) {
		if (is_it_toupper)
			u8s[0] = U8_ASCII_TOUPPER(*s);
		else if (is_it_tolower)
			u8s[0] = U8_ASCII_TOLOWER(*s);
		else
			u8s[0] = *s;
		s++;
		u8s[1] = '\0';
	} else if ((s + sz) > slast) {
		/*
		 * The character is incomplete; return all of the remaining
		 * bytes as a sequence.
		 */
		*errnum = EINVAL;

		for (i = 0; s < slast; )
			u8s[i++] = *s++;
		u8s[i] = '\0';

		*source = s;

		return (i);
	} else {
		if (is_it_toupper || is_it_tolower) {
			/*
			 * The case conversion result is stored at u8s and
			 * from now on 'sz' is its byte length.
			 */
			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
			s += sz;
			sz = i;
		} else {
			for (i = 0; i < sz; )
				u8s[i++] = *s++;
			u8s[i] = '\0';
		}
	}

	/*
	 * And then canonical/compatibility decomposition followed by
	 * an optional canonical composition. Please be noted that
	 * canonical composition is done only when a decomposition is
	 * done.
	 */
	if (canonical_decomposition || compatibility_decomposition) {
		/*
		 * In this block, 'saved_sz' is the number of bytes collected
		 * at u8s and 'last' is the number of characters collected.
		 * For each collected character, comb_class[], start[], and
		 * disp[] keep its combining class, its byte offset in u8s,
		 * and its byte length, respectively.
		 */
		if (sz == 1) {
			/*
			 * A single byte character is recorded as a Starter
			 * without going through the decomposition.
			 */
			*state = U8_STATE_START;

			saved_sz = 1;

			comb_class[0] = 0;
			start[0] = 0;
			disp[0] = 1;

			last = 1;
		} else {
			saved_sz = do_decomp(uv, u8s, u8s, sz,
			    canonical_decomposition, state);

			last = 0;

			for (i = 0; i < saved_sz; ) {
				sz = u8_number_of_bytes[u8s[i]];

				comb_class[last] = combining_class(uv,
				    u8s + i, sz);
				start[last] = i;
				disp[last] = sz;

				last++;
				i += sz;
			}

			/*
			 * Decomposition yields various Hangul related
			 * states but not on combining marks. We need to
			 * find out at here by checking on the last
			 * character.
			 */
			if (*state == U8_STATE_START) {
				if (comb_class[last - 1])
					*state = U8_STATE_COMBINING_MARK;
			}
		}

		/* The number of characters the first character has yielded. */
		saved_last = last;

		while (s < slast) {
			sz = u8_number_of_bytes[*s];

			/*
			 * If this is an illegal character, an incomplete
			 * character, or an 7-bit ASCII Starter character,
			 * then we have collected a sequence; break and let
			 * the next call deal with the two cases.
			 *
			 * Note that this is okay only if you are using this
			 * function with a fixed length string, not on
			 * a buffer with multiple calls of one chunk at a time.
			 */
			if (sz <= 1) {
				break;
			} else if ((s + sz) > slast) {
				break;
			} else {
				/*
				 * If the previous character was a Hangul Jamo
				 * and this character is a Hangul Jamo that
				 * can be conjoined, we collect the Jamo.
				 */
				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
					U8_PUT_3BYTES_INTO_UTF32(u1,
					    *s, *(s + 1), *(s + 2));

					/*
					 * In both cases below, 'i' is set to 0
					 * since it is stored as the combining
					 * class of the Jamo after the jump.
					 */
					if (U8_HANGUL_COMPOSABLE_L_V(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LV;
						goto COLLECT_A_HANGUL;
					}

					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
					    u1)) {
						i = 0;
						*state = U8_STATE_HANGUL_LVT;
						goto COLLECT_A_HANGUL;
					}
				}

				/*
				 * Regardless of whatever it was, if this is
				 * a Starter, we don't collect the character
				 * since that's a new start and we will deal
				 * with it at the next time.
				 */
				i = combining_class(uv, s, sz);
				if (i == U8_COMBINING_CLASS_STARTER)
					break;

				/*
				 * We know the current character is a combining
				 * mark. If the previous character wasn't
				 * a Starter (not Hangul) or a combining mark,
				 * then, we don't collect this combining mark.
				 */
				if (*state != U8_STATE_START &&
				    *state != U8_STATE_COMBINING_MARK)
					break;

				*state = U8_STATE_COMBINING_MARK;
COLLECT_A_HANGUL:
				/*
				 * If we collected a Starter and combining
				 * marks up to 30, i.e., total 31 characters,
				 * then, we terminate this degenerately long
				 * combining sequence with a U+034F COMBINING
				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
				 * UTF-8 and turn this into a Stream-Safe
				 * Text. This will be extremely rare but
				 * possible.
				 *
				 * The following will also guarantee that
				 * we are not writing more than 32 characters
				 * plus a NULL at u8s[].
				 */
				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
TURN_STREAM_SAFE:
					*state = U8_STATE_START;
					comb_class[last] = 0;
					start[last] = saved_sz;
					disp[last] = 2;
					last++;

					u8s[saved_sz++] = 0xCD;
					u8s[saved_sz++] = 0x8F;

					break;
				}

				/*
				 * Some combining marks also do decompose into
				 * another combining mark or marks.
				 */
				if (*state == U8_STATE_COMBINING_MARK) {
					/*
					 * Remember the current count at 'k'
					 * so that it can be restored if the
					 * decomposed marks do not fit, and
					 * the source byte length of this mark
					 * at 'l'.
					 */
					k = last;
					l = sz;
					i = do_decomp(uv, uts, s, sz,
					    canonical_decomposition, state);
					for (j = 0; j < i; ) {
						sz = u8_number_of_bytes[uts[j]];

						comb_class[last] =
						    combining_class(uv,
						    uts + j, sz);
						start[last] = saved_sz + j;
						disp[last] = sz;

						last++;
						if (last >=
						    U8_UPPER_LIMIT_IN_A_SEQ) {
							last = k;
							goto TURN_STREAM_SAFE;
						}
						j += sz;
					}

					*state = U8_STATE_COMBINING_MARK;
					sz = i;
					s += l;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = uts[i];
				} else {
					/* A Hangul Jamo is collected as is. */
					comb_class[last] = i;
					start[last] = saved_sz;
					disp[last] = sz;
					last++;

					for (i = 0; i < sz; i++)
						u8s[saved_sz++] = *s++;
				}

				/*
				 * If this is U+0345 COMBINING GREEK
				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
				 * iota subscript, and need to be converted to
				 * uppercase letter, convert it to U+0399 GREEK
				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
				 * i.e., convert to capital adscript form as
				 * specified in the Unicode standard.
				 *
				 * This is the only special case of (ambiguous)
				 * case conversion at combining marks and
				 * probably the standard will never have
				 * anything similar like this in future.
				 */
				if (is_it_toupper && sz >= 2 &&
				    u8s[saved_sz - 2] == 0xCD &&
				    u8s[saved_sz - 1] == 0x85) {
					u8s[saved_sz - 2] = 0xCE;
					u8s[saved_sz - 1] = 0x99;
				}
			}
		}

		/*
		 * Let's try to ensure a canonical ordering for the collected
		 * combining marks. We do this only if we have collected
		 * at least one more non-Starter. (The decomposition mapping
		 * data tables have fully (and recursively) expanded and
		 * canonically ordered decompositions.)
		 *
		 * The U8_SWAP_COMB_MARKS() convenience macro has some
		 * assumptions and we are meeting the assumptions.
		 *
		 * Note that, from here on, the 'last' is the index of the
		 * last collected character and no longer the number of
		 * the collected characters.
		 */
		last--;
		if (last >= saved_last) {
			for (i = 0; i < last; i++)
				for (j = last; j > i; j--)
					if (comb_class[j] &&
					    comb_class[j - 1] > comb_class[j]) {
						U8_SWAP_COMB_MARKS(j - 1, j);
					}
		}

		*source = s;

		if (! canonical_composition) {
			u8s[saved_sz] = '\0';
			return (saved_sz);
		}

		/*
		 * Now do the canonical composition. Note that we do this
		 * only after a canonical or compatibility decomposition to
		 * finish up NFC or NFKC.
		 *
		 * The do_composition() may consume more source bytes and,
		 * if so, it advances the 's' accordingly.
		 */
		sz = do_composition(uv, u8s, comb_class, start, disp, last,
		    &s, slast);
	}

	*source = s;

	return ((size_t)sz);
}
1728 
1729 /*
1730  * The do_norm_compare() function does string comparion based on Unicode
1731  * simple case mappings and Unicode Normalization definitions.
1732  *
1733  * It does so by collecting a sequence of character at a time and comparing
1734  * the collected sequences from the strings.
1735  *
1736  * The meanings on the return values are the same as the usual strcmp().
1737  */
1738 static int
1739 do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1740     int flag, int *errnum)
1741 {
1742 	int result;
1743 	size_t sz1;
1744 	size_t sz2;
1745 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1746 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1747 	uchar_t *s1last;
1748 	uchar_t *s2last;
1749 	boolean_t is_it_toupper;
1750 	boolean_t is_it_tolower;
1751 	boolean_t canonical_decomposition;
1752 	boolean_t compatibility_decomposition;
1753 	boolean_t canonical_composition;
1754 	u8_normalization_states_t state;
1755 
1756 	s1last = s1 + n1;
1757 	s2last = s2 + n2;
1758 
1759 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1760 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1761 	canonical_decomposition = flag & U8_CANON_DECOMP;
1762 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1763 	canonical_composition = flag & U8_CANON_COMP;
1764 
1765 	while (s1 < s1last && s2 < s2last) {
1766 		/*
1767 		 * If the current character is a 7-bit ASCII and the last
1768 		 * character, or, if the current character and the next
1769 		 * character are both some 7-bit ASCII characters then
1770 		 * we treat the current character as a sequence.
1771 		 *
1772 		 * In any other cases, we need to call collect_a_seq().
1773 		 */
1774 
1775 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1776 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1777 			if (is_it_toupper)
1778 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1779 			else if (is_it_tolower)
1780 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1781 			else
1782 				u8s1[0] = *s1;
1783 			u8s1[1] = '\0';
1784 			sz1 = 1;
1785 			s1++;
1786 		} else {
1787 			state = U8_STATE_START;
1788 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1789 			    is_it_toupper, is_it_tolower,
1790 			    canonical_decomposition,
1791 			    compatibility_decomposition,
1792 			    canonical_composition, errnum, &state);
1793 		}
1794 
1795 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1796 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1797 			if (is_it_toupper)
1798 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1799 			else if (is_it_tolower)
1800 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1801 			else
1802 				u8s2[0] = *s2;
1803 			u8s2[1] = '\0';
1804 			sz2 = 1;
1805 			s2++;
1806 		} else {
1807 			state = U8_STATE_START;
1808 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1809 			    is_it_toupper, is_it_tolower,
1810 			    canonical_decomposition,
1811 			    compatibility_decomposition,
1812 			    canonical_composition, errnum, &state);
1813 		}
1814 
1815 		/*
1816 		 * Now compare the two characters. If they are the same,
1817 		 * we move on to the next character sequences.
1818 		 */
1819 		if (sz1 == 1 && sz2 == 1) {
1820 			if (*u8s1 > *u8s2)
1821 				return (1);
1822 			if (*u8s1 < *u8s2)
1823 				return (-1);
1824 		} else {
1825 			result = strcmp((const char *)u8s1, (const char *)u8s2);
1826 			if (result != 0)
1827 				return (result);
1828 		}
1829 	}
1830 
1831 	/*
1832 	 * We compared until the end of either or both strings.
1833 	 *
1834 	 * If we reached to or went over the ends for the both, that means
1835 	 * they are the same.
1836 	 *
1837 	 * If we reached only one end, that means the other string has
1838 	 * something which then can be used to determine the return value.
1839 	 */
1840 	if (s1 >= s1last) {
1841 		if (s2 >= s2last)
1842 			return (0);
1843 		return (-1);
1844 	}
1845 	return (1);
1846 }
1847 
1848 /*
1849  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1850  * the strcmp(). For the comparison, however, Unicode Normalization specific
1851  * equivalency and Unicode simple case conversion mappings based equivalency
1852  * can be requested and checked against.
1853  */
1854 int
1855 u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1856     int *errnum)
1857 {
1858 	int f;
1859 	size_t n1;
1860 	size_t n2;
1861 
1862 	*errnum = 0;
1863 
1864 	/*
1865 	 * Check on the requested Unicode version, case conversion, and
1866 	 * normalization flag values.
1867 	 */
1868 
1869 	if (uv > U8_UNICODE_LATEST) {
1870 		*errnum = ERANGE;
1871 		uv = U8_UNICODE_LATEST;
1872 	}
1873 
1874 	if (flag == 0) {
1875 		flag = U8_STRCMP_CS;
1876 	} else {
1877 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1878 		    U8_STRCMP_CI_LOWER);
1879 		if (f == 0) {
1880 			flag |= U8_STRCMP_CS;
1881 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1882 		    f != U8_STRCMP_CI_LOWER) {
1883 			*errnum = EBADF;
1884 			flag = U8_STRCMP_CS;
1885 		}
1886 
1887 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1888 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1889 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1890 			*errnum = EBADF;
1891 			flag = U8_STRCMP_CS;
1892 		}
1893 	}
1894 
1895 	if (flag == U8_STRCMP_CS) {
1896 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1897 	}
1898 
1899 	n1 = strlen(s1);
1900 	n2 = strlen(s2);
1901 	if (n != 0) {
1902 		if (n < n1)
1903 			n1 = n;
1904 		if (n < n2)
1905 			n2 = n;
1906 	}
1907 
1908 	/*
1909 	 * Simple case conversion can be done much faster and so we do
1910 	 * them separately here.
1911 	 */
1912 	if (flag == U8_STRCMP_CI_UPPER) {
1913 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1914 		    n1, n2, B_TRUE, errnum));
1915 	} else if (flag == U8_STRCMP_CI_LOWER) {
1916 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1917 		    n1, n2, B_FALSE, errnum));
1918 	}
1919 
1920 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1921 	    flag, errnum));
1922 }
1923 
1924 size_t
1925 u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1926     int flag, size_t unicode_version, int *errnum)
1927 {
1928 	int f;
1929 	int sz;
1930 	uchar_t *ib;
1931 	uchar_t *ibtail;
1932 	uchar_t *ob;
1933 	uchar_t *obtail;
1934 	boolean_t do_not_ignore_null;
1935 	boolean_t do_not_ignore_invalid;
1936 	boolean_t is_it_toupper;
1937 	boolean_t is_it_tolower;
1938 	boolean_t canonical_decomposition;
1939 	boolean_t compatibility_decomposition;
1940 	boolean_t canonical_composition;
1941 	size_t ret_val;
1942 	size_t i;
1943 	size_t j;
1944 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1945 	u8_normalization_states_t state;
1946 
1947 	if (unicode_version > U8_UNICODE_LATEST) {
1948 		*errnum = ERANGE;
1949 		return ((size_t)-1);
1950 	}
1951 
1952 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1953 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1954 		*errnum = EBADF;
1955 		return ((size_t)-1);
1956 	}
1957 
1958 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1959 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1960 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1961 		*errnum = EBADF;
1962 		return ((size_t)-1);
1963 	}
1964 
1965 	if (inarray == NULL || *inlen == 0)
1966 		return (0);
1967 
1968 	if (outarray == NULL) {
1969 		*errnum = E2BIG;
1970 		return ((size_t)-1);
1971 	}
1972 
1973 	ib = (uchar_t *)inarray;
1974 	ob = (uchar_t *)outarray;
1975 	ibtail = ib + *inlen;
1976 	obtail = ob + *outlen;
1977 
1978 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1979 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1980 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1981 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1982 
1983 	ret_val = 0;
1984 
1985 	/*
1986 	 * If we don't have a normalization flag set, we do the simple case
1987 	 * conversion based text preparation separately below. Text
1988 	 * preparation involving Normalization will be done in the false task
1989 	 * block, again, separately since it will take much more time and
1990 	 * resource than doing simple case conversions.
1991 	 */
1992 	if (f == 0) {
1993 		while (ib < ibtail) {
1994 			if (*ib == '\0' && do_not_ignore_null)
1995 				break;
1996 
1997 			sz = u8_number_of_bytes[*ib];
1998 
1999 			if (sz < 0) {
2000 				if (do_not_ignore_invalid) {
2001 					*errnum = EILSEQ;
2002 					ret_val = (size_t)-1;
2003 					break;
2004 				}
2005 
2006 				sz = 1;
2007 				ret_val++;
2008 			}
2009 
2010 			if (sz == 1) {
2011 				if (ob >= obtail) {
2012 					*errnum = E2BIG;
2013 					ret_val = (size_t)-1;
2014 					break;
2015 				}
2016 
2017 				if (is_it_toupper)
2018 					*ob = U8_ASCII_TOUPPER(*ib);
2019 				else if (is_it_tolower)
2020 					*ob = U8_ASCII_TOLOWER(*ib);
2021 				else
2022 					*ob = *ib;
2023 				ib++;
2024 				ob++;
2025 			} else if ((ib + sz) > ibtail) {
2026 				if (do_not_ignore_invalid) {
2027 					*errnum = EINVAL;
2028 					ret_val = (size_t)-1;
2029 					break;
2030 				}
2031 
2032 				if ((obtail - ob) < (ibtail - ib)) {
2033 					*errnum = E2BIG;
2034 					ret_val = (size_t)-1;
2035 					break;
2036 				}
2037 
2038 				/*
2039 				 * We treat the remaining incomplete character
2040 				 * bytes as a character.
2041 				 */
2042 				ret_val++;
2043 
2044 				while (ib < ibtail)
2045 					*ob++ = *ib++;
2046 			} else {
2047 				if (is_it_toupper || is_it_tolower) {
2048 					i = do_case_conv(unicode_version, u8s,
2049 					    ib, sz, is_it_toupper);
2050 
2051 					if ((obtail - ob) < i) {
2052 						*errnum = E2BIG;
2053 						ret_val = (size_t)-1;
2054 						break;
2055 					}
2056 
2057 					ib += sz;
2058 
2059 					for (sz = 0; sz < i; sz++)
2060 						*ob++ = u8s[sz];
2061 				} else {
2062 					if ((obtail - ob) < sz) {
2063 						*errnum = E2BIG;
2064 						ret_val = (size_t)-1;
2065 						break;
2066 					}
2067 
2068 					for (i = 0; i < sz; i++)
2069 						*ob++ = *ib++;
2070 				}
2071 			}
2072 		}
2073 	} else {
2074 		canonical_decomposition = flag & U8_CANON_DECOMP;
2075 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2076 		canonical_composition = flag & U8_CANON_COMP;
2077 
2078 		while (ib < ibtail) {
2079 			if (*ib == '\0' && do_not_ignore_null)
2080 				break;
2081 
2082 			/*
2083 			 * If the current character is a 7-bit ASCII
2084 			 * character and it is the last character, or,
2085 			 * if the current character is a 7-bit ASCII
2086 			 * character and the next character is also a 7-bit
2087 			 * ASCII character, then, we copy over this
2088 			 * character without going through collect_a_seq().
2089 			 *
2090 			 * In all other cases, we need to look further with
2091 			 * the collect_a_seq() function.
2092 			 */
2093 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2094 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2095 				if (ob >= obtail) {
2096 					*errnum = E2BIG;
2097 					ret_val = (size_t)-1;
2098 					break;
2099 				}
2100 
2101 				if (is_it_toupper)
2102 					*ob = U8_ASCII_TOUPPER(*ib);
2103 				else if (is_it_tolower)
2104 					*ob = U8_ASCII_TOLOWER(*ib);
2105 				else
2106 					*ob = *ib;
2107 				ib++;
2108 				ob++;
2109 			} else {
2110 				*errnum = 0;
2111 				state = U8_STATE_START;
2112 
2113 				j = collect_a_seq(unicode_version, u8s,
2114 				    &ib, ibtail,
2115 				    is_it_toupper,
2116 				    is_it_tolower,
2117 				    canonical_decomposition,
2118 				    compatibility_decomposition,
2119 				    canonical_composition,
2120 				    errnum, &state);
2121 
2122 				if (*errnum && do_not_ignore_invalid) {
2123 					ret_val = (size_t)-1;
2124 					break;
2125 				}
2126 
2127 				if ((obtail - ob) < j) {
2128 					*errnum = E2BIG;
2129 					ret_val = (size_t)-1;
2130 					break;
2131 				}
2132 
2133 				for (i = 0; i < j; i++)
2134 					*ob++ = u8s[i];
2135 			}
2136 		}
2137 	}
2138 
2139 	*inlen = ibtail - ib;
2140 	*outlen = obtail - ob;
2141 
2142 	return (ret_val);
2143 }
2144