xref: /titanic_41/usr/src/common/unicode/u8_textprep.c (revision 85bb5f1d642e430b889478fb1200b511338085d7)
14703203dSis /*
24703203dSis  * CDDL HEADER START
34703203dSis  *
44703203dSis  * The contents of this file are subject to the terms of the
54703203dSis  * Common Development and Distribution License (the "License").
64703203dSis  * You may not use this file except in compliance with the License.
74703203dSis  *
84703203dSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
94703203dSis  * or http://www.opensolaris.org/os/licensing.
104703203dSis  * See the License for the specific language governing permissions
114703203dSis  * and limitations under the License.
124703203dSis  *
134703203dSis  * When distributing Covered Code, include this CDDL HEADER in each
144703203dSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
154703203dSis  * If applicable, add the following below this CDDL HEADER, with the
164703203dSis  * fields enclosed by brackets "[]" replaced with your own identifying
174703203dSis  * information: Portions Copyright [yyyy] [name of copyright owner]
184703203dSis  *
194703203dSis  * CDDL HEADER END
204703203dSis  */
214703203dSis /*
22*85bb5f1dSis  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
234703203dSis  * Use is subject to license terms.
244703203dSis  */
254703203dSis 
264703203dSis #pragma ident	"%Z%%M%	%I%	%E% SMI"
274703203dSis 
284703203dSis 
294703203dSis /*
304703203dSis  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
314703203dSis  *
324703203dSis  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
334703203dSis  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
344703203dSis  * the section 3C man pages.
354703203dSis  * Interface stability: Committed.
364703203dSis  */
374703203dSis 
384703203dSis #include <sys/types.h>
394703203dSis #ifdef	_KERNEL
404703203dSis #include <sys/param.h>
414703203dSis #include <sys/sysmacros.h>
424703203dSis #include <sys/systm.h>
434703203dSis #include <sys/debug.h>
444703203dSis #include <sys/kmem.h>
454703203dSis #include <sys/ddi.h>
464703203dSis #include <sys/sunddi.h>
474703203dSis #else
484703203dSis #include <sys/u8_textprep.h>
494703203dSis #include <strings.h>
504703203dSis #endif	/* _KERNEL */
514703203dSis #include <sys/byteorder.h>
524703203dSis #include <sys/errno.h>
534703203dSis #include <sys/u8_textprep_data.h>
544703203dSis 
554703203dSis 
/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)

/*
 * Some Hangul related macros follow.
 *
 * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
 * Vowels, and optional Trailing consonants in Unicode scalar values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
 * U+11A8. This is because the trailing consonant is optional and thus we
 * pre-calculate the subtraction of one. For the same reason the
 * U8_HANGUL_JAMO_T() test uses a strict '>' on its lower bound.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character may be a Hangul Jamo, but a match does not guarantee
 * that it is a Hangul Jamo; it only means that it most likely is one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k].
 *
 * NOTE: this is a multi-statement macro that carries its own trailing
 * semicolon; do not use it as the unbraced body of an if/else/loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))

/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * The following are some convenience macros.
 *
 * U8_PUT_3BYTES_INTO_UTF32() assembles the payload bits of a three-byte
 * UTF-8 sequence (b1, b2, b3) into 'u'. Like the other statement macros
 * here it carries its own trailing semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = (((uint32_t)(b1) & 0x0F) << 12) | \
		(((uint32_t)(b2) & 0x3F) << 6) | \
		((uint32_t)(b3) & 0x3F);

/* Exchange 'a' and 'b' using 't' as the scratch variable. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * The following macro assumes that the two characters that are to be
 * swapped are adjacent to each other and 'a' comes before 'b'.
 *
 * If the assumptions are not met, then, the macro will fail.
 *
 * The macro is not self-contained: it uses the names k, tc, u8s, u8t,
 * start, disp, and comb_class, which must be in scope at the expansion
 * site.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
1794703203dSis 
/*
 * The possible states during normalization. The Hangul states are the
 * ones tested by U8_HANGUL_COMPOSABLE_L_V() and U8_HANGUL_COMPOSABLE_LV_T().
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L = 1,
	U8_STATE_HANGUL_LV = 2,
	U8_STATE_HANGUL_LVT = 3,
	U8_STATE_HANGUL_V = 4,
	U8_STATE_HANGUL_T = 5,
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;
1904703203dSis 
1914703203dSis /*
1924703203dSis  * The three vectors at below are used to check bytes of a given UTF-8
1934703203dSis  * character are valid and not containing any malformed byte values.
1944703203dSis  *
1954703203dSis  * We used to have a quite relaxed UTF-8 binary representation but then there
1964703203dSis  * was some security related issues and so the Unicode Consortium defined
1974703203dSis  * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
1984703203dSis  * one more time at the Unicode 3.2. The following three tables are based on
1994703203dSis  * that.
2004703203dSis  */
2014703203dSis 
2024703203dSis #define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
2034703203dSis 
2044703203dSis #define	I_				U8_ILLEGAL_CHAR
2054703203dSis #define	O_				U8_OUT_OF_RANGE_CHAR
2064703203dSis 
2074703203dSis const int8_t u8_number_of_bytes[0x100] = {
2084703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2094703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2104703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2114703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2124703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2134703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2144703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2154703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2164703203dSis 
2174703203dSis /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
2184703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2194703203dSis 
2204703203dSis /*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
2214703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2224703203dSis 
2234703203dSis /*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
2244703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2254703203dSis 
2264703203dSis /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
2274703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2284703203dSis 
2294703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
2304703203dSis 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2314703203dSis 
2324703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
2334703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2344703203dSis 
2354703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
2364703203dSis 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
2374703203dSis 
2384703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
2394703203dSis 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2404703203dSis };
2414703203dSis 
2424703203dSis #undef	I_
2434703203dSis #undef	O_
2444703203dSis 
/*
 * Indexed by the first byte of a multi-byte character: the smallest value
 * the second byte may have. Entries for bytes that do not start a
 * multi-byte character are 0.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
/*	00 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	40 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	80 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
2864703203dSis 
/*
 * Indexed by the first byte of a multi-byte character: the largest value
 * the second byte may have. Entries for bytes that do not start a
 * multi-byte character are 0.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
/*	00 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	40 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	80 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0,
};
3284703203dSis 
3294703203dSis 
3304703203dSis /*
3314703203dSis  * The u8_validate() validates on the given UTF-8 character string and
3324703203dSis  * calculate the byte length. It is quite similar to mblen(3C) except that
3334703203dSis  * this will validate against the list of characters if required and
3344703203dSis  * specific to UTF-8 and Unicode.
3354703203dSis  */
3364703203dSis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)337*85bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3384703203dSis {
3394703203dSis 	uchar_t *ib;
3404703203dSis 	uchar_t *ibtail;
3414703203dSis 	uchar_t **p;
3424703203dSis 	uchar_t *s1;
3434703203dSis 	uchar_t *s2;
3444703203dSis 	uchar_t f;
3454703203dSis 	int sz;
3464703203dSis 	size_t i;
3474703203dSis 	int ret_val;
3484703203dSis 	boolean_t second;
3494703203dSis 	boolean_t no_need_to_validate_entire;
3504703203dSis 	boolean_t check_additional;
3514703203dSis 	boolean_t validate_ucs2_range_only;
3524703203dSis 
3534703203dSis 	if (! u8str)
3544703203dSis 		return (0);
3554703203dSis 
3564703203dSis 	ib = (uchar_t *)u8str;
3574703203dSis 	ibtail = ib + n;
3584703203dSis 
3594703203dSis 	ret_val = 0;
3604703203dSis 
3614703203dSis 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3624703203dSis 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3634703203dSis 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3644703203dSis 
3654703203dSis 	while (ib < ibtail) {
3664703203dSis 		/*
3674703203dSis 		 * The first byte of a UTF-8 character tells how many
3684703203dSis 		 * bytes will follow for the character. If the first byte
3694703203dSis 		 * is an illegal byte value or out of range value, we just
3704703203dSis 		 * return -1 with an appropriate error number.
3714703203dSis 		 */
3724703203dSis 		sz = u8_number_of_bytes[*ib];
3734703203dSis 		if (sz == U8_ILLEGAL_CHAR) {
374*85bb5f1dSis 			*errnum = EILSEQ;
3754703203dSis 			return (-1);
3764703203dSis 		}
3774703203dSis 
3784703203dSis 		if (sz == U8_OUT_OF_RANGE_CHAR ||
3794703203dSis 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*85bb5f1dSis 			*errnum = ERANGE;
3814703203dSis 			return (-1);
3824703203dSis 		}
3834703203dSis 
3844703203dSis 		/*
3854703203dSis 		 * If we don't have enough bytes to check on, that's also
3864703203dSis 		 * an error. As you can see, we give illegal byte sequence
3874703203dSis 		 * checking higher priority then EINVAL cases.
3884703203dSis 		 */
3894703203dSis 		if ((ibtail - ib) < sz) {
390*85bb5f1dSis 			*errnum = EINVAL;
3914703203dSis 			return (-1);
3924703203dSis 		}
3934703203dSis 
3944703203dSis 		if (sz == 1) {
3954703203dSis 			ib++;
3964703203dSis 			ret_val++;
3974703203dSis 		} else {
3984703203dSis 			/*
3994703203dSis 			 * Check on the multi-byte UTF-8 character. For more
4004703203dSis 			 * details on this, see comment added for the used
4014703203dSis 			 * data structures at the beginning of the file.
4024703203dSis 			 */
4034703203dSis 			f = *ib++;
4044703203dSis 			ret_val++;
4054703203dSis 			second = B_TRUE;
4064703203dSis 			for (i = 1; i < sz; i++) {
4074703203dSis 				if (second) {
4084703203dSis 					if (*ib < u8_valid_min_2nd_byte[f] ||
4094703203dSis 					    *ib > u8_valid_max_2nd_byte[f]) {
410*85bb5f1dSis 						*errnum = EILSEQ;
4114703203dSis 						return (-1);
4124703203dSis 					}
4134703203dSis 					second = B_FALSE;
4144703203dSis 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*85bb5f1dSis 					*errnum = EILSEQ;
4164703203dSis 					return (-1);
4174703203dSis 				}
4184703203dSis 				ib++;
4194703203dSis 				ret_val++;
4204703203dSis 			}
4214703203dSis 		}
4224703203dSis 
4234703203dSis 		if (check_additional) {
4244703203dSis 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4254703203dSis 				s1 = ib - sz;
4264703203dSis 				s2 = p[i];
4274703203dSis 				while (s1 < ib) {
4284703203dSis 					if (*s1 != *s2 || *s2 == '\0')
4294703203dSis 						break;
4304703203dSis 					s1++;
4314703203dSis 					s2++;
4324703203dSis 				}
4334703203dSis 
4344703203dSis 				if (s1 >= ib && *s2 == '\0') {
435*85bb5f1dSis 					*errnum = EBADF;
4364703203dSis 					return (-1);
4374703203dSis 				}
4384703203dSis 			}
4394703203dSis 		}
4404703203dSis 
4414703203dSis 		if (no_need_to_validate_entire)
4424703203dSis 			break;
4434703203dSis 	}
4444703203dSis 
4454703203dSis 	return (ret_val);
4464703203dSis }
4474703203dSis 
4484703203dSis /*
4494703203dSis  * The do_case_conv() looks at the mapping tables and returns found
4504703203dSis  * bytes if any. If not found, the input bytes are returned. The function
4514703203dSis  * always terminate the return bytes with a null character assuming that
4524703203dSis  * there are plenty of room to do so.
4534703203dSis  *
4544703203dSis  * The case conversions are simple case conversions mapping a character to
4554703203dSis  * another character as specified in the Unicode data. The byte size of
4564703203dSis  * the mapped character could be different from that of the input character.
4574703203dSis  *
4584703203dSis  * The return value is the byte length of the returned character excluding
4594703203dSis  * the terminating null byte.
4604703203dSis  */
4614703203dSis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4624703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4634703203dSis {
4644703203dSis 	size_t i;
4654703203dSis 	uint16_t b1 = 0;
4664703203dSis 	uint16_t b2 = 0;
4674703203dSis 	uint16_t b3 = 0;
4684703203dSis 	uint16_t b3_tbl;
4694703203dSis 	uint16_t b3_base;
4704703203dSis 	uint16_t b4 = 0;
4714703203dSis 	size_t start_id;
4724703203dSis 	size_t end_id;
4734703203dSis 
4744703203dSis 	/*
4754703203dSis 	 * At this point, the only possible values for sz are 2, 3, and 4.
4764703203dSis 	 * The u8s should point to a vector that is well beyond the size of
4774703203dSis 	 * 5 bytes.
4784703203dSis 	 */
4794703203dSis 	if (sz == 2) {
4804703203dSis 		b3 = u8s[0] = s[0];
4814703203dSis 		b4 = u8s[1] = s[1];
4824703203dSis 	} else if (sz == 3) {
4834703203dSis 		b2 = u8s[0] = s[0];
4844703203dSis 		b3 = u8s[1] = s[1];
4854703203dSis 		b4 = u8s[2] = s[2];
4864703203dSis 	} else if (sz == 4) {
4874703203dSis 		b1 = u8s[0] = s[0];
4884703203dSis 		b2 = u8s[1] = s[1];
4894703203dSis 		b3 = u8s[2] = s[2];
4904703203dSis 		b4 = u8s[3] = s[3];
4914703203dSis 	} else {
4924703203dSis 		/* This is not possible but just in case as a fallback. */
4934703203dSis 		if (is_it_toupper)
4944703203dSis 			*u8s = U8_ASCII_TOUPPER(*s);
4954703203dSis 		else
4964703203dSis 			*u8s = U8_ASCII_TOLOWER(*s);
4974703203dSis 		u8s[1] = '\0';
4984703203dSis 
4994703203dSis 		return (1);
5004703203dSis 	}
5014703203dSis 	u8s[sz] = '\0';
5024703203dSis 
5034703203dSis 	/*
5044703203dSis 	 * Let's find out if we have a corresponding character.
5054703203dSis 	 */
5064703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
5074703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5084703203dSis 		return ((size_t)sz);
5094703203dSis 
5104703203dSis 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
5114703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5124703203dSis 		return ((size_t)sz);
5134703203dSis 
5144703203dSis 	if (is_it_toupper) {
5154703203dSis 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5164703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5174703203dSis 			return ((size_t)sz);
5184703203dSis 
5194703203dSis 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5204703203dSis 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5214703203dSis 
5224703203dSis 		/* Either there is no match or an error at the table. */
5234703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5244703203dSis 			return ((size_t)sz);
5254703203dSis 
5264703203dSis 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5274703203dSis 
5284703203dSis 		for (i = 0; start_id < end_id; start_id++)
5294703203dSis 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5304703203dSis 	} else {
5314703203dSis 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5324703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5334703203dSis 			return ((size_t)sz);
5344703203dSis 
5354703203dSis 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5364703203dSis 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5374703203dSis 
5384703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5394703203dSis 			return ((size_t)sz);
5404703203dSis 
5414703203dSis 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5424703203dSis 
5434703203dSis 		for (i = 0; start_id < end_id; start_id++)
5444703203dSis 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5454703203dSis 	}
5464703203dSis 
5474703203dSis 	/*
5484703203dSis 	 * If i is still zero, that means there is no corresponding character.
5494703203dSis 	 */
5504703203dSis 	if (i == 0)
5514703203dSis 		return ((size_t)sz);
5524703203dSis 
5534703203dSis 	u8s[i] = '\0';
5544703203dSis 
5554703203dSis 	return (i);
5564703203dSis }
5574703203dSis 
5584703203dSis /*
5594703203dSis  * The do_case_compare() function compares the two input strings, s1 and s2,
5604703203dSis  * one character at a time doing case conversions if applicable and return
5614703203dSis  * the comparison result as like strcmp().
5624703203dSis  *
5634703203dSis  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5644703203dSis  * we treat the 7-bit ASCII characters as a special case trying to yield
5654703203dSis  * faster processing time.
5664703203dSis  */
5674703203dSis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5684703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
569*85bb5f1dSis 	size_t n2, boolean_t is_it_toupper, int *errnum)
5704703203dSis {
5714703203dSis 	int f;
5724703203dSis 	int sz1;
5734703203dSis 	int sz2;
5744703203dSis 	size_t j;
5754703203dSis 	size_t i1;
5764703203dSis 	size_t i2;
5774703203dSis 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
5784703203dSis 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
5794703203dSis 
5804703203dSis 	i1 = i2 = 0;
5814703203dSis 	while (i1 < n1 && i2 < n2) {
5824703203dSis 		/*
5834703203dSis 		 * Find out what would be the byte length for this UTF-8
5844703203dSis 		 * character at string s1 and also find out if this is
5854703203dSis 		 * an illegal start byte or not and if so, issue a proper
586*85bb5f1dSis 		 * error number and yet treat this byte as a character.
5874703203dSis 		 */
5884703203dSis 		sz1 = u8_number_of_bytes[*s1];
5894703203dSis 		if (sz1 < 0) {
590*85bb5f1dSis 			*errnum = EILSEQ;
5914703203dSis 			sz1 = 1;
5924703203dSis 		}
5934703203dSis 
5944703203dSis 		/*
5954703203dSis 		 * For 7-bit ASCII characters mainly, we do a quick case
5964703203dSis 		 * conversion right at here.
5974703203dSis 		 *
5984703203dSis 		 * If we don't have enough bytes for this character, issue
5994703203dSis 		 * an EINVAL error and use what are available.
6004703203dSis 		 *
6014703203dSis 		 * If we have enough bytes, find out if there is
6024703203dSis 		 * a corresponding uppercase character and if so, copy over
6034703203dSis 		 * the bytes for a comparison later. If there is no
6044703203dSis 		 * corresponding uppercase character, then, use what we have
6054703203dSis 		 * for the comparison.
6064703203dSis 		 */
6074703203dSis 		if (sz1 == 1) {
6084703203dSis 			if (is_it_toupper)
6094703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
6104703203dSis 			else
6114703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
6124703203dSis 			s1++;
6134703203dSis 			u8s1[1] = '\0';
6144703203dSis 		} else if ((i1 + sz1) > n1) {
615*85bb5f1dSis 			*errnum = EINVAL;
6164703203dSis 			for (j = 0; (i1 + j) < n1; )
6174703203dSis 				u8s1[j++] = *s1++;
6184703203dSis 			u8s1[j] = '\0';
6194703203dSis 		} else {
6204703203dSis 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6214703203dSis 			s1 += sz1;
6224703203dSis 		}
6234703203dSis 
6244703203dSis 		/* Do the same for the string s2. */
6254703203dSis 		sz2 = u8_number_of_bytes[*s2];
6264703203dSis 		if (sz2 < 0) {
627*85bb5f1dSis 			*errnum = EILSEQ;
6284703203dSis 			sz2 = 1;
6294703203dSis 		}
6304703203dSis 
6314703203dSis 		if (sz2 == 1) {
6324703203dSis 			if (is_it_toupper)
6334703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
6344703203dSis 			else
6354703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
6364703203dSis 			s2++;
6374703203dSis 			u8s2[1] = '\0';
6384703203dSis 		} else if ((i2 + sz2) > n2) {
639*85bb5f1dSis 			*errnum = EINVAL;
6404703203dSis 			for (j = 0; (i2 + j) < n2; )
6414703203dSis 				u8s2[j++] = *s2++;
6424703203dSis 			u8s2[j] = '\0';
6434703203dSis 		} else {
6444703203dSis 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6454703203dSis 			s2 += sz2;
6464703203dSis 		}
6474703203dSis 
6484703203dSis 		/* Now compare the two characters. */
6494703203dSis 		if (sz1 == 1 && sz2 == 1) {
6504703203dSis 			if (*u8s1 > *u8s2)
6514703203dSis 				return (1);
6524703203dSis 			if (*u8s1 < *u8s2)
6534703203dSis 				return (-1);
6544703203dSis 		} else {
6554703203dSis 			f = strcmp((const char *)u8s1, (const char *)u8s2);
6564703203dSis 			if (f != 0)
6574703203dSis 				return (f);
6584703203dSis 		}
6594703203dSis 
6604703203dSis 		/*
6614703203dSis 		 * They were the same. Let's move on to the next
6624703203dSis 		 * characters then.
6634703203dSis 		 */
6644703203dSis 		i1 += sz1;
6654703203dSis 		i2 += sz2;
6664703203dSis 	}
6674703203dSis 
6684703203dSis 	/*
6694703203dSis 	 * We compared until the end of either or both strings.
6704703203dSis 	 *
6714703203dSis 	 * If we reached to or went over the ends for the both, that means
6724703203dSis 	 * they are the same.
6734703203dSis 	 *
6744703203dSis 	 * If we reached only one of the two ends, that means the other string
6754703203dSis 	 * has something which then the fact can be used to determine
6764703203dSis 	 * the return value.
6774703203dSis 	 */
6784703203dSis 	if (i1 >= n1) {
6794703203dSis 		if (i2 >= n2)
6804703203dSis 			return (0);
6814703203dSis 		return (-1);
6824703203dSis 	}
6834703203dSis 	return (1);
6844703203dSis }
6854703203dSis 
6864703203dSis /*
6874703203dSis  * The combining_class() function checks on the given bytes and find out
6884703203dSis  * the corresponding Unicode combining class value. The return value 0 means
6894703203dSis  * it is a Starter. Any illegal UTF-8 character will also be treated as
6904703203dSis  * a Starter.
6914703203dSis  */
6924703203dSis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6934703203dSis combining_class(size_t uv, uchar_t *s, size_t sz)
6944703203dSis {
6954703203dSis 	uint16_t b1 = 0;
6964703203dSis 	uint16_t b2 = 0;
6974703203dSis 	uint16_t b3 = 0;
6984703203dSis 	uint16_t b4 = 0;
6994703203dSis 
7004703203dSis 	if (sz == 1 || sz > 4)
7014703203dSis 		return (0);
7024703203dSis 
7034703203dSis 	if (sz == 2) {
7044703203dSis 		b3 = s[0];
7054703203dSis 		b4 = s[1];
7064703203dSis 	} else if (sz == 3) {
7074703203dSis 		b2 = s[0];
7084703203dSis 		b3 = s[1];
7094703203dSis 		b4 = s[2];
7104703203dSis 	} else if (sz == 4) {
7114703203dSis 		b1 = s[0];
7124703203dSis 		b2 = s[1];
7134703203dSis 		b3 = s[2];
7144703203dSis 		b4 = s[3];
7154703203dSis 	}
7164703203dSis 
7174703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
7184703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7194703203dSis 		return (0);
7204703203dSis 
7214703203dSis 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7224703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7234703203dSis 		return (0);
7244703203dSis 
7254703203dSis 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7264703203dSis 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7274703203dSis 		return (0);
7284703203dSis 
7294703203dSis 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
7304703203dSis }
7314703203dSis 
7324703203dSis /*
7334703203dSis  * The do_decomp() function finds out a matching decomposition if any
7344703203dSis  * and return. If there is no match, the input bytes are copied and returned.
7354703203dSis  * The function also checks if there is a Hangul, decomposes it if necessary
7364703203dSis  * and returns.
7374703203dSis  *
7384703203dSis  * To save time, a single byte 7-bit ASCII character should be handled by
7394703203dSis  * the caller.
7404703203dSis  *
7414703203dSis  * The function returns the number of bytes returned sans always terminating
7424703203dSis  * the null byte. It will also return a state that will tell if there was
7434703203dSis  * a Hangul character decomposed which then will be used by the caller.
7444703203dSis  */
7454703203dSis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7464703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7474703203dSis 	boolean_t canonical_decomposition, u8_normalization_states_t *state)
7484703203dSis {
7494703203dSis 	uint16_t b1 = 0;
7504703203dSis 	uint16_t b2 = 0;
7514703203dSis 	uint16_t b3 = 0;
7524703203dSis 	uint16_t b3_tbl;
7534703203dSis 	uint16_t b3_base;
7544703203dSis 	uint16_t b4 = 0;
7554703203dSis 	size_t start_id;
7564703203dSis 	size_t end_id;
7574703203dSis 	size_t i;
7584703203dSis 	uint32_t u1;
7594703203dSis 
7604703203dSis 	if (sz == 2) {
7614703203dSis 		b3 = u8s[0] = s[0];
7624703203dSis 		b4 = u8s[1] = s[1];
7634703203dSis 		u8s[2] = '\0';
7644703203dSis 	} else if (sz == 3) {
7654703203dSis 		/* Convert it to a Unicode scalar value. */
7664703203dSis 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7674703203dSis 
7684703203dSis 		/*
7694703203dSis 		 * If this is a Hangul syllable, we decompose it into
7704703203dSis 		 * a leading consonant, a vowel, and an optional trailing
7714703203dSis 		 * consonant and then return.
7724703203dSis 		 */
7734703203dSis 		if (U8_HANGUL_SYLLABLE(u1)) {
7744703203dSis 			u1 -= U8_HANGUL_SYL_FIRST;
7754703203dSis 
7764703203dSis 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7774703203dSis 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7784703203dSis 			    / U8_HANGUL_T_COUNT;
7794703203dSis 			b3 = u1 % U8_HANGUL_T_COUNT;
7804703203dSis 
7814703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7824703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7834703203dSis 			if (b3) {
7844703203dSis 				b3 += U8_HANGUL_JAMO_T_FIRST;
7854703203dSis 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7864703203dSis 
7874703203dSis 				u8s[9] = '\0';
7884703203dSis 				*state = U8_STATE_HANGUL_LVT;
7894703203dSis 				return (9);
7904703203dSis 			}
7914703203dSis 
7924703203dSis 			u8s[6] = '\0';
7934703203dSis 			*state = U8_STATE_HANGUL_LV;
7944703203dSis 			return (6);
7954703203dSis 		}
7964703203dSis 
7974703203dSis 		b2 = u8s[0] = s[0];
7984703203dSis 		b3 = u8s[1] = s[1];
7994703203dSis 		b4 = u8s[2] = s[2];
8004703203dSis 		u8s[3] = '\0';
8014703203dSis 
8024703203dSis 		/*
8034703203dSis 		 * If this is a Hangul Jamo, we know there is nothing
8044703203dSis 		 * further that we can decompose.
8054703203dSis 		 */
8064703203dSis 		if (U8_HANGUL_JAMO_L(u1)) {
8074703203dSis 			*state = U8_STATE_HANGUL_L;
8084703203dSis 			return (3);
8094703203dSis 		}
8104703203dSis 
8114703203dSis 		if (U8_HANGUL_JAMO_V(u1)) {
8124703203dSis 			if (*state == U8_STATE_HANGUL_L)
8134703203dSis 				*state = U8_STATE_HANGUL_LV;
8144703203dSis 			else
8154703203dSis 				*state = U8_STATE_HANGUL_V;
8164703203dSis 			return (3);
8174703203dSis 		}
8184703203dSis 
8194703203dSis 		if (U8_HANGUL_JAMO_T(u1)) {
8204703203dSis 			if (*state == U8_STATE_HANGUL_LV)
8214703203dSis 				*state = U8_STATE_HANGUL_LVT;
8224703203dSis 			else
8234703203dSis 				*state = U8_STATE_HANGUL_T;
8244703203dSis 			return (3);
8254703203dSis 		}
8264703203dSis 	} else if (sz == 4) {
8274703203dSis 		b1 = u8s[0] = s[0];
8284703203dSis 		b2 = u8s[1] = s[1];
8294703203dSis 		b3 = u8s[2] = s[2];
8304703203dSis 		b4 = u8s[3] = s[3];
8314703203dSis 		u8s[4] = '\0';
8324703203dSis 	} else {
8334703203dSis 		/*
8344703203dSis 		 * This is a fallback and should not happen if the function
8354703203dSis 		 * was called properly.
8364703203dSis 		 */
8374703203dSis 		u8s[0] = s[0];
8384703203dSis 		u8s[1] = '\0';
8394703203dSis 		*state = U8_STATE_START;
8404703203dSis 		return (1);
8414703203dSis 	}
8424703203dSis 
8434703203dSis 	/*
8444703203dSis 	 * At this point, this rountine does not know what it would get.
8454703203dSis 	 * The caller should sort it out if the state isn't a Hangul one.
8464703203dSis 	 */
8474703203dSis 	*state = U8_STATE_START;
8484703203dSis 
8494703203dSis 	/* Try to find matching decomposition mapping byte sequence. */
8504703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
8514703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8524703203dSis 		return ((size_t)sz);
8534703203dSis 
8544703203dSis 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
8554703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8564703203dSis 		return ((size_t)sz);
8574703203dSis 
8584703203dSis 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8594703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8604703203dSis 		return ((size_t)sz);
8614703203dSis 
8624703203dSis 	/*
8634703203dSis 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8644703203dSis 	 * which is 0x8000, this means we couldn't fit the mappings into
8654703203dSis 	 * the cardinality of a unsigned byte.
8664703203dSis 	 */
8674703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8684703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8694703203dSis 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8704703203dSis 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8714703203dSis 	} else {
8724703203dSis 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8734703203dSis 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8744703203dSis 	}
8754703203dSis 
8764703203dSis 	/* This also means there wasn't any matching decomposition. */
8774703203dSis 	if (start_id >= end_id)
8784703203dSis 		return ((size_t)sz);
8794703203dSis 
8804703203dSis 	/*
8814703203dSis 	 * The final table for decomposition mappings has three types of
8824703203dSis 	 * byte sequences depending on whether a mapping is for compatibility
8834703203dSis 	 * decomposition, canonical decomposition, or both like the following:
8844703203dSis 	 *
8854703203dSis 	 * (1) Compatibility decomposition mappings:
8864703203dSis 	 *
8874703203dSis 	 *	+---+---+-...-+---+
8884703203dSis 	 *	| B0| B1| ... | Bm|
8894703203dSis 	 *	+---+---+-...-+---+
8904703203dSis 	 *
8914703203dSis 	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8924703203dSis 	 *
8934703203dSis 	 * (2) Canonical decomposition mappings:
8944703203dSis 	 *
8954703203dSis 	 *	+---+---+---+-...-+---+
8964703203dSis 	 *	| T | b0| b1| ... | bn|
8974703203dSis 	 *	+---+---+---+-...-+---+
8984703203dSis 	 *
8994703203dSis 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9004703203dSis 	 *
9014703203dSis 	 * (3) Both mappings:
9024703203dSis 	 *
9034703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9044703203dSis 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9054703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
9064703203dSis 	 *
9074703203dSis 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9084703203dSis 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9094703203dSis 	 *	compatibility mapping bytes.
9104703203dSis 	 *
9114703203dSis 	 * Note that compatibility decomposition means doing recursive
9124703203dSis 	 * decompositions using both compatibility decomposition mappings and
9134703203dSis 	 * canonical decomposition mappings. On the other hand, canonical
9144703203dSis 	 * decomposition means doing recursive decompositions using only
9154703203dSis 	 * canonical decomposition mappings. Since the table we have has gone
9164703203dSis 	 * through the recursions already, we do not need to do so during
9174703203dSis 	 * runtime, i.e., the table has been completely flattened out
9184703203dSis 	 * already.
9194703203dSis 	 */
9204703203dSis 
9214703203dSis 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9224703203dSis 
9234703203dSis 	/* Get the type, T, of the byte sequence. */
9244703203dSis 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9254703203dSis 
9264703203dSis 	/*
9274703203dSis 	 * If necessary, adjust start_id, end_id, or both. Note that if
9284703203dSis 	 * this is compatibility decomposition mapping, there is no
9294703203dSis 	 * adjustment.
9304703203dSis 	 */
9314703203dSis 	if (canonical_decomposition) {
9324703203dSis 		/* Is the mapping only for compatibility decomposition? */
9334703203dSis 		if (b1 < U8_DECOMP_BOTH)
9344703203dSis 			return ((size_t)sz);
9354703203dSis 
9364703203dSis 		start_id++;
9374703203dSis 
9384703203dSis 		if (b1 == U8_DECOMP_BOTH) {
9394703203dSis 			end_id = start_id +
9404703203dSis 			    u8_decomp_final_tbl[uv][b3_base + start_id];
9414703203dSis 			start_id++;
9424703203dSis 		}
9434703203dSis 	} else {
9444703203dSis 		/*
9454703203dSis 		 * Unless this is a compatibility decomposition mapping,
9464703203dSis 		 * we adjust the start_id.
9474703203dSis 		 */
9484703203dSis 		if (b1 == U8_DECOMP_BOTH) {
9494703203dSis 			start_id++;
9504703203dSis 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9514703203dSis 		} else if (b1 == U8_DECOMP_CANONICAL) {
9524703203dSis 			start_id++;
9534703203dSis 		}
9544703203dSis 	}
9554703203dSis 
9564703203dSis 	for (i = 0; start_id < end_id; start_id++)
9574703203dSis 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9584703203dSis 	u8s[i] = '\0';
9594703203dSis 
9604703203dSis 	return (i);
9614703203dSis }
9624703203dSis 
9634703203dSis /*
9644703203dSis  * The find_composition_start() function uses the character bytes given and
9654703203dSis  * find out the matching composition mappings if any and return the address
9664703203dSis  * to the composition mappings as explained in the do_composition().
9674703203dSis  */
9684703203dSis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9694703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9704703203dSis {
9714703203dSis 	uint16_t b1 = 0;
9724703203dSis 	uint16_t b2 = 0;
9734703203dSis 	uint16_t b3 = 0;
9744703203dSis 	uint16_t b3_tbl;
9754703203dSis 	uint16_t b3_base;
9764703203dSis 	uint16_t b4 = 0;
9774703203dSis 	size_t start_id;
9784703203dSis 	size_t end_id;
9794703203dSis 
9804703203dSis 	if (sz == 1) {
9814703203dSis 		b4 = s[0];
9824703203dSis 	} else if (sz == 2) {
9834703203dSis 		b3 = s[0];
9844703203dSis 		b4 = s[1];
9854703203dSis 	} else if (sz == 3) {
9864703203dSis 		b2 = s[0];
9874703203dSis 		b3 = s[1];
9884703203dSis 		b4 = s[2];
9894703203dSis 	} else if (sz == 4) {
9904703203dSis 		b1 = s[0];
9914703203dSis 		b2 = s[1];
9924703203dSis 		b3 = s[2];
9934703203dSis 		b4 = s[3];
9944703203dSis 	} else {
9954703203dSis 		/*
9964703203dSis 		 * This is a fallback and should not happen if the function
9974703203dSis 		 * was called properly.
9984703203dSis 		 */
9994703203dSis 		return (NULL);
10004703203dSis 	}
10014703203dSis 
10024703203dSis 	b1 = u8_composition_b1_tbl[uv][b1];
10034703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10044703203dSis 		return (NULL);
10054703203dSis 
10064703203dSis 	b2 = u8_composition_b2_tbl[uv][b1][b2];
10074703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10084703203dSis 		return (NULL);
10094703203dSis 
10104703203dSis 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10114703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10124703203dSis 		return (NULL);
10134703203dSis 
10144703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10154703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10164703203dSis 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10174703203dSis 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10184703203dSis 	} else {
10194703203dSis 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10204703203dSis 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10214703203dSis 	}
10224703203dSis 
10234703203dSis 	if (start_id >= end_id)
10244703203dSis 		return (NULL);
10254703203dSis 
10264703203dSis 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10274703203dSis 
10284703203dSis 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10294703203dSis }
10304703203dSis 
10314703203dSis /*
10324703203dSis  * The blocked() function checks on the combining class values of previous
10334703203dSis  * characters in this sequence and return whether it is blocked or not.
10344703203dSis  */
10354703203dSis static boolean_t
blocked(uchar_t * comb_class,size_t last)10364703203dSis blocked(uchar_t *comb_class, size_t last)
10374703203dSis {
10384703203dSis 	uchar_t my_comb_class;
10394703203dSis 	size_t i;
10404703203dSis 
10414703203dSis 	my_comb_class = comb_class[last];
10424703203dSis 	for (i = 1; i < last; i++)
10434703203dSis 		if (comb_class[i] >= my_comb_class ||
10444703203dSis 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
10454703203dSis 			return (B_TRUE);
10464703203dSis 
10474703203dSis 	return (B_FALSE);
10484703203dSis }
10494703203dSis 
10504703203dSis /*
10514703203dSis  * The do_composition() reads the character string pointed by 's' and
10524703203dSis  * do necessary canonical composition and then copy over the result back to
10534703203dSis  * the 's'.
10544703203dSis  *
10554703203dSis  * The input argument 's' cannot contain more than 32 characters.
10564703203dSis  */
10574703203dSis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10584703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10594703203dSis 	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10604703203dSis {
10614703203dSis 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10624703203dSis 	uchar_t tc[U8_MB_CUR_MAX];
10634703203dSis 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10644703203dSis 	size_t saved_marks_count;
10654703203dSis 	uchar_t *p;
10664703203dSis 	uchar_t *saved_p;
10674703203dSis 	uchar_t *q;
10684703203dSis 	size_t i;
10694703203dSis 	size_t saved_i;
10704703203dSis 	size_t j;
10714703203dSis 	size_t k;
10724703203dSis 	size_t l;
10734703203dSis 	size_t C;
10744703203dSis 	size_t saved_l;
10754703203dSis 	size_t size;
10764703203dSis 	uint32_t u1;
10774703203dSis 	uint32_t u2;
10784703203dSis 	boolean_t match_not_found = B_TRUE;
10794703203dSis 
10804703203dSis 	/*
10814703203dSis 	 * This should never happen unless the callers are doing some strange
10824703203dSis 	 * and unexpected things.
10834703203dSis 	 *
10844703203dSis 	 * The "last" is the index pointing to the last character not last + 1.
10854703203dSis 	 */
10864703203dSis 	if (last >= U8_MAX_CHARS_A_SEQ)
10874703203dSis 		last = U8_UPPER_LIMIT_IN_A_SEQ;
10884703203dSis 
10894703203dSis 	for (i = l = 0; i <= last; i++) {
10904703203dSis 		/*
10914703203dSis 		 * The last or any non-Starters at the beginning, we don't
10924703203dSis 		 * have any chance to do composition and so we just copy them
10934703203dSis 		 * to the temporary buffer.
10944703203dSis 		 */
10954703203dSis 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10964703203dSis SAVE_THE_CHAR:
10974703203dSis 			p = s + start[i];
10984703203dSis 			size = disp[i];
10994703203dSis 			for (k = 0; k < size; k++)
11004703203dSis 				t[l++] = *p++;
11014703203dSis 			continue;
11024703203dSis 		}
11034703203dSis 
11044703203dSis 		/*
11054703203dSis 		 * If this could be a start of Hangul Jamos, then, we try to
11064703203dSis 		 * conjoin them.
11074703203dSis 		 */
11084703203dSis 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11094703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11104703203dSis 			    s[start[i] + 1], s[start[i] + 2]);
11114703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11124703203dSis 			    s[start[i] + 4], s[start[i] + 5]);
11134703203dSis 
11144703203dSis 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11154703203dSis 				u1 -= U8_HANGUL_JAMO_L_FIRST;
11164703203dSis 				u2 -= U8_HANGUL_JAMO_V_FIRST;
11174703203dSis 				u1 = U8_HANGUL_SYL_FIRST +
11184703203dSis 				    (u1 * U8_HANGUL_V_COUNT + u2) *
11194703203dSis 				    U8_HANGUL_T_COUNT;
11204703203dSis 
11214703203dSis 				i += 2;
11224703203dSis 				if (i <= last) {
11234703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u2,
11244703203dSis 					    s[start[i]], s[start[i] + 1],
11254703203dSis 					    s[start[i] + 2]);
11264703203dSis 
11274703203dSis 					if (U8_HANGUL_JAMO_T(u2)) {
11284703203dSis 						u1 += u2 -
11294703203dSis 						    U8_HANGUL_JAMO_T_FIRST;
11304703203dSis 						i++;
11314703203dSis 					}
11324703203dSis 				}
11334703203dSis 
11344703203dSis 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11354703203dSis 				i--;
11364703203dSis 				l += 3;
11374703203dSis 				continue;
11384703203dSis 			}
11394703203dSis 		}
11404703203dSis 
11414703203dSis 		/*
11424703203dSis 		 * Let's then find out if this Starter has composition
11434703203dSis 		 * mapping.
11444703203dSis 		 */
11454703203dSis 		p = find_composition_start(uv, s + start[i], disp[i]);
11464703203dSis 		if (p == NULL)
11474703203dSis 			goto SAVE_THE_CHAR;
11484703203dSis 
11494703203dSis 		/*
11504703203dSis 		 * We have a Starter with composition mapping and the next
11514703203dSis 		 * character is a non-Starter. Let's try to find out if
11524703203dSis 		 * we can do composition.
11534703203dSis 		 */
11544703203dSis 
11554703203dSis 		saved_p = p;
11564703203dSis 		saved_i = i;
11574703203dSis 		saved_l = l;
11584703203dSis 		saved_marks_count = 0;
11594703203dSis 
11604703203dSis TRY_THE_NEXT_MARK:
11614703203dSis 		q = s + start[++i];
11624703203dSis 		size = disp[i];
11634703203dSis 
11644703203dSis 		/*
11654703203dSis 		 * The next for() loop compares the non-Starter pointed by
11664703203dSis 		 * 'q' with the possible (joinable) characters pointed by 'p'.
11674703203dSis 		 *
11684703203dSis 		 * The composition final table entry pointed by the 'p'
11694703203dSis 		 * looks like the following:
11704703203dSis 		 *
11714703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11724703203dSis 		 * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11734703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11744703203dSis 		 *
11754703203dSis 		 * where C is the count byte indicating the number of
11764703203dSis 		 * mapping pairs where each pair would be look like
11774703203dSis 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11784703203dSis 		 * character of a canonical decomposition and the B0-Bm are
11794703203dSis 		 * the bytes of a matching composite character. The F is
11804703203dSis 		 * a filler byte after each character as the separator.
11814703203dSis 		 */
11824703203dSis 
11834703203dSis 		match_not_found = B_TRUE;
11844703203dSis 
11854703203dSis 		for (C = *p++; C > 0; C--) {
11864703203dSis 			for (k = 0; k < size; p++, k++)
11874703203dSis 				if (*p != q[k])
11884703203dSis 					break;
11894703203dSis 
11904703203dSis 			/* Have we found it? */
11914703203dSis 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11924703203dSis 				match_not_found = B_FALSE;
11934703203dSis 
11944703203dSis 				l = saved_l;
11954703203dSis 
11964703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
11974703203dSis 					t[l++] = *p;
11984703203dSis 
11994703203dSis 				break;
12004703203dSis 			}
12014703203dSis 
12024703203dSis 			/* We didn't find; skip to the next pair. */
12034703203dSis 			if (*p != U8_TBL_ELEMENT_FILLER)
12044703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
12054703203dSis 					;
12064703203dSis 			while (*++p != U8_TBL_ELEMENT_FILLER)
12074703203dSis 				;
12084703203dSis 			p++;
12094703203dSis 		}
12104703203dSis 
12114703203dSis 		/*
12124703203dSis 		 * If there was no match, we will need to save the combining
12134703203dSis 		 * mark for later appending. After that, if the next one
12144703203dSis 		 * is a non-Starter and not blocked, then, we try once
12154703203dSis 		 * again to do composition with the next non-Starter.
12164703203dSis 		 *
12174703203dSis 		 * If there was no match and this was a Starter, then,
12184703203dSis 		 * this is a new start.
12194703203dSis 		 *
12204703203dSis 		 * If there was a match and a composition done and we have
12214703203dSis 		 * more to check on, then, we retrieve a new composition final
12224703203dSis 		 * table entry for the composite and then try to do the
12234703203dSis 		 * composition again.
12244703203dSis 		 */
12254703203dSis 
12264703203dSis 		if (match_not_found) {
12274703203dSis 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12284703203dSis 				i--;
12294703203dSis 				goto SAVE_THE_CHAR;
12304703203dSis 			}
12314703203dSis 
12324703203dSis 			saved_marks[saved_marks_count++] = i;
12334703203dSis 		}
12344703203dSis 
12354703203dSis 		if (saved_l == l) {
12364703203dSis 			while (i < last) {
12374703203dSis 				if (blocked(comb_class, i + 1))
12384703203dSis 					saved_marks[saved_marks_count++] = ++i;
12394703203dSis 				else
12404703203dSis 					break;
12414703203dSis 			}
12424703203dSis 			if (i < last) {
12434703203dSis 				p = saved_p;
12444703203dSis 				goto TRY_THE_NEXT_MARK;
12454703203dSis 			}
12464703203dSis 		} else if (i < last) {
12474703203dSis 			p = find_composition_start(uv, t + saved_l,
12484703203dSis 			    l - saved_l);
12494703203dSis 			if (p != NULL) {
12504703203dSis 				saved_p = p;
12514703203dSis 				goto TRY_THE_NEXT_MARK;
12524703203dSis 			}
12534703203dSis 		}
12544703203dSis 
12554703203dSis 		/*
12564703203dSis 		 * There is no more composition possible.
12574703203dSis 		 *
12584703203dSis 		 * If there was no composition what so ever then we copy
12594703203dSis 		 * over the original Starter and then append any non-Starters
12604703203dSis 		 * remaining at the target string sequentially after that.
12614703203dSis 		 */
12624703203dSis 
12634703203dSis 		if (saved_l == l) {
12644703203dSis 			p = s + start[saved_i];
12654703203dSis 			size = disp[saved_i];
12664703203dSis 			for (j = 0; j < size; j++)
12674703203dSis 				t[l++] = *p++;
12684703203dSis 		}
12694703203dSis 
12704703203dSis 		for (k = 0; k < saved_marks_count; k++) {
12714703203dSis 			p = s + start[saved_marks[k]];
12724703203dSis 			size = disp[saved_marks[k]];
12734703203dSis 			for (j = 0; j < size; j++)
12744703203dSis 				t[l++] = *p++;
12754703203dSis 		}
12764703203dSis 	}
12774703203dSis 
12784703203dSis 	/*
12794703203dSis 	 * If the last character is a Starter and if we have a character
12804703203dSis 	 * (possibly another Starter) that can be turned into a composite,
12814703203dSis 	 * we do so and we do so until there is no more of composition
12824703203dSis 	 * possible.
12834703203dSis 	 */
12844703203dSis 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12854703203dSis 		p = *os;
12864703203dSis 		saved_l = l - disp[last];
12874703203dSis 
12884703203dSis 		while (p < oslast) {
12894703203dSis 			size = u8_number_of_bytes[*p];
12904703203dSis 			if (size <= 1 || (p + size) > oslast)
12914703203dSis 				break;
12924703203dSis 
12934703203dSis 			saved_p = p;
12944703203dSis 
12954703203dSis 			for (i = 0; i < size; i++)
12964703203dSis 				tc[i] = *p++;
12974703203dSis 
12984703203dSis 			q = find_composition_start(uv, t + saved_l,
12994703203dSis 			    l - saved_l);
13004703203dSis 			if (q == NULL) {
13014703203dSis 				p = saved_p;
13024703203dSis 				break;
13034703203dSis 			}
13044703203dSis 
13054703203dSis 			match_not_found = B_TRUE;
13064703203dSis 
13074703203dSis 			for (C = *q++; C > 0; C--) {
13084703203dSis 				for (k = 0; k < size; q++, k++)
13094703203dSis 					if (*q != tc[k])
13104703203dSis 						break;
13114703203dSis 
13124703203dSis 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13134703203dSis 					match_not_found = B_FALSE;
13144703203dSis 
13154703203dSis 					l = saved_l;
13164703203dSis 
13174703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER) {
13184703203dSis 						/*
13194703203dSis 						 * This is practically
13204703203dSis 						 * impossible but we don't
13214703203dSis 						 * want to take any chances.
13224703203dSis 						 */
13234703203dSis 						if (l >=
13244703203dSis 						    U8_STREAM_SAFE_TEXT_MAX) {
13254703203dSis 							p = saved_p;
13264703203dSis 							goto SAFE_RETURN;
13274703203dSis 						}
13284703203dSis 						t[l++] = *q;
13294703203dSis 					}
13304703203dSis 
13314703203dSis 					break;
13324703203dSis 				}
13334703203dSis 
13344703203dSis 				if (*q != U8_TBL_ELEMENT_FILLER)
13354703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER)
13364703203dSis 						;
13374703203dSis 				while (*++q != U8_TBL_ELEMENT_FILLER)
13384703203dSis 					;
13394703203dSis 				q++;
13404703203dSis 			}
13414703203dSis 
13424703203dSis 			if (match_not_found) {
13434703203dSis 				p = saved_p;
13444703203dSis 				break;
13454703203dSis 			}
13464703203dSis 		}
13474703203dSis SAFE_RETURN:
13484703203dSis 		*os = p;
13494703203dSis 	}
13504703203dSis 
13514703203dSis 	/*
13524703203dSis 	 * Now we copy over the temporary string to the target string.
13534703203dSis 	 * Since composition always reduces the number of characters or
13544703203dSis 	 * the number of characters stay, we don't need to worry about
13554703203dSis 	 * the buffer overflow here.
13564703203dSis 	 */
13574703203dSis 	for (i = 0; i < l; i++)
13584703203dSis 		s[i] = t[i];
13594703203dSis 	s[l] = '\0';
13604703203dSis 
13614703203dSis 	return (l);
13624703203dSis }
13634703203dSis 
13644703203dSis /*
13654703203dSis  * The collect_a_seq() function checks on the given string s, collect
13664703203dSis  * a sequence of characters at u8s, and return the sequence. While it collects
13674703203dSis  * a sequence, it also applies case conversion, canonical or compatibility
13684703203dSis  * decomposition, canonical composition, or some or all of them and
13694703203dSis  * in that order.
13704703203dSis  *
13714703203dSis  * The collected sequence cannot be bigger than 32 characters since if
13724703203dSis  * it is having more than 31 characters, the sequence will be terminated
13734703203dSis  * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
13744703203dSis  * a Stream-Safe Text. The collected sequence is always terminated with
13754703203dSis  * a null byte and the return value is the byte length of the sequence
13764703203dSis  * including 0. The return value does not include the terminating
13774703203dSis  * null byte.
13784703203dSis  */
13794703203dSis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13804703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
13814703203dSis 	boolean_t is_it_toupper,
13824703203dSis 	boolean_t is_it_tolower,
13834703203dSis 	boolean_t canonical_decomposition,
13844703203dSis 	boolean_t compatibility_decomposition,
13854703203dSis 	boolean_t canonical_composition,
1386*85bb5f1dSis 	int *errnum, u8_normalization_states_t *state)
13874703203dSis {
13884703203dSis 	uchar_t *s;
13894703203dSis 	int sz;
13904703203dSis 	int saved_sz;
13914703203dSis 	size_t i;
13924703203dSis 	size_t j;
13934703203dSis 	size_t k;
13944703203dSis 	size_t l;
13954703203dSis 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
13964703203dSis 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
13974703203dSis 	uchar_t start[U8_MAX_CHARS_A_SEQ];
13984703203dSis 	uchar_t u8t[U8_MB_CUR_MAX];
13994703203dSis 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14004703203dSis 	uchar_t tc;
14014703203dSis 	size_t last;
14024703203dSis 	size_t saved_last;
14034703203dSis 	uint32_t u1;
14044703203dSis 
14054703203dSis 	/*
14064703203dSis 	 * Save the source string pointer which we will return a changed
14074703203dSis 	 * pointer if we do processing.
14084703203dSis 	 */
14094703203dSis 	s = *source;
14104703203dSis 
14114703203dSis 	/*
14124703203dSis 	 * The following is a fallback for just in case callers are not
14134703203dSis 	 * checking the string boundaries before the calling.
14144703203dSis 	 */
14154703203dSis 	if (s >= slast) {
14164703203dSis 		u8s[0] = '\0';
14174703203dSis 
14184703203dSis 		return (0);
14194703203dSis 	}
14204703203dSis 
14214703203dSis 	/*
14224703203dSis 	 * As the first thing, let's collect a character and do case
14234703203dSis 	 * conversion if necessary.
14244703203dSis 	 */
14254703203dSis 
14264703203dSis 	sz = u8_number_of_bytes[*s];
14274703203dSis 
14284703203dSis 	if (sz < 0) {
1429*85bb5f1dSis 		*errnum = EILSEQ;
14304703203dSis 
14314703203dSis 		u8s[0] = *s++;
14324703203dSis 		u8s[1] = '\0';
14334703203dSis 
14344703203dSis 		*source = s;
14354703203dSis 
14364703203dSis 		return (1);
14374703203dSis 	}
14384703203dSis 
14394703203dSis 	if (sz == 1) {
14404703203dSis 		if (is_it_toupper)
14414703203dSis 			u8s[0] = U8_ASCII_TOUPPER(*s);
14424703203dSis 		else if (is_it_tolower)
14434703203dSis 			u8s[0] = U8_ASCII_TOLOWER(*s);
14444703203dSis 		else
14454703203dSis 			u8s[0] = *s;
14464703203dSis 		s++;
14474703203dSis 		u8s[1] = '\0';
14484703203dSis 	} else if ((s + sz) > slast) {
1449*85bb5f1dSis 		*errnum = EINVAL;
14504703203dSis 
14514703203dSis 		for (i = 0; s < slast; )
14524703203dSis 			u8s[i++] = *s++;
14534703203dSis 		u8s[i] = '\0';
14544703203dSis 
14554703203dSis 		*source = s;
14564703203dSis 
14574703203dSis 		return (i);
14584703203dSis 	} else {
14594703203dSis 		if (is_it_toupper || is_it_tolower) {
14604703203dSis 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14614703203dSis 			s += sz;
14624703203dSis 			sz = i;
14634703203dSis 		} else {
14644703203dSis 			for (i = 0; i < sz; )
14654703203dSis 				u8s[i++] = *s++;
14664703203dSis 			u8s[i] = '\0';
14674703203dSis 		}
14684703203dSis 	}
14694703203dSis 
14704703203dSis 	/*
14714703203dSis 	 * And then canonical/compatibility decomposition followed by
14724703203dSis 	 * an optional canonical composition. Please be noted that
14734703203dSis 	 * canonical composition is done only when a decomposition is
14744703203dSis 	 * done.
14754703203dSis 	 */
14764703203dSis 	if (canonical_decomposition || compatibility_decomposition) {
14774703203dSis 		if (sz == 1) {
14784703203dSis 			*state = U8_STATE_START;
14794703203dSis 
14804703203dSis 			saved_sz = 1;
14814703203dSis 
14824703203dSis 			comb_class[0] = 0;
14834703203dSis 			start[0] = 0;
14844703203dSis 			disp[0] = 1;
14854703203dSis 
14864703203dSis 			last = 1;
14874703203dSis 		} else {
14884703203dSis 			saved_sz = do_decomp(uv, u8s, u8s, sz,
14894703203dSis 			    canonical_decomposition, state);
14904703203dSis 
14914703203dSis 			last = 0;
14924703203dSis 
14934703203dSis 			for (i = 0; i < saved_sz; ) {
14944703203dSis 				sz = u8_number_of_bytes[u8s[i]];
14954703203dSis 
14964703203dSis 				comb_class[last] = combining_class(uv,
14974703203dSis 				    u8s + i, sz);
14984703203dSis 				start[last] = i;
14994703203dSis 				disp[last] = sz;
15004703203dSis 
15014703203dSis 				last++;
15024703203dSis 				i += sz;
15034703203dSis 			}
15044703203dSis 
15054703203dSis 			/*
15064703203dSis 			 * Decomposition yields various Hangul related
15074703203dSis 			 * states but not on combining marks. We need to
15084703203dSis 			 * find out at here by checking on the last
15094703203dSis 			 * character.
15104703203dSis 			 */
15114703203dSis 			if (*state == U8_STATE_START) {
15124703203dSis 				if (comb_class[last - 1])
15134703203dSis 					*state = U8_STATE_COMBINING_MARK;
15144703203dSis 			}
15154703203dSis 		}
15164703203dSis 
15174703203dSis 		saved_last = last;
15184703203dSis 
15194703203dSis 		while (s < slast) {
15204703203dSis 			sz = u8_number_of_bytes[*s];
15214703203dSis 
15224703203dSis 			/*
15234703203dSis 			 * If this is an illegal character, an incomplete
15244703203dSis 			 * character, or an 7-bit ASCII Starter character,
15254703203dSis 			 * then we have collected a sequence; break and let
15264703203dSis 			 * the next call deal with the two cases.
15274703203dSis 			 *
15284703203dSis 			 * Note that this is okay only if you are using this
15294703203dSis 			 * function with a fixed length string, not on
15304703203dSis 			 * a buffer with multiple calls of one chunk at a time.
15314703203dSis 			 */
15324703203dSis 			if (sz <= 1) {
15334703203dSis 				break;
15344703203dSis 			} else if ((s + sz) > slast) {
15354703203dSis 				break;
15364703203dSis 			} else {
15374703203dSis 				/*
15384703203dSis 				 * If the previous character was a Hangul Jamo
15394703203dSis 				 * and this character is a Hangul Jamo that
15404703203dSis 				 * can be conjoined, we collect the Jamo.
15414703203dSis 				 */
15424703203dSis 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15434703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u1,
15444703203dSis 					    *s, *(s + 1), *(s + 2));
15454703203dSis 
15464703203dSis 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
15474703203dSis 					    u1)) {
15484703203dSis 						i = 0;
15494703203dSis 						*state = U8_STATE_HANGUL_LV;
15504703203dSis 						goto COLLECT_A_HANGUL;
15514703203dSis 					}
15524703203dSis 
15534703203dSis 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15544703203dSis 					    u1)) {
15554703203dSis 						i = 0;
15564703203dSis 						*state = U8_STATE_HANGUL_LVT;
15574703203dSis 						goto COLLECT_A_HANGUL;
15584703203dSis 					}
15594703203dSis 				}
15604703203dSis 
15614703203dSis 				/*
15624703203dSis 				 * Regardless of whatever it was, if this is
15634703203dSis 				 * a Starter, we don't collect the character
15644703203dSis 				 * since that's a new start and we will deal
15654703203dSis 				 * with it at the next time.
15664703203dSis 				 */
15674703203dSis 				i = combining_class(uv, s, sz);
15684703203dSis 				if (i == U8_COMBINING_CLASS_STARTER)
15694703203dSis 					break;
15704703203dSis 
15714703203dSis 				/*
15724703203dSis 				 * We know the current character is a combining
15734703203dSis 				 * mark. If the previous character wasn't
15744703203dSis 				 * a Starter (not Hangul) or a combining mark,
15754703203dSis 				 * then, we don't collect this combining mark.
15764703203dSis 				 */
15774703203dSis 				if (*state != U8_STATE_START &&
15784703203dSis 				    *state != U8_STATE_COMBINING_MARK)
15794703203dSis 					break;
15804703203dSis 
15814703203dSis 				*state = U8_STATE_COMBINING_MARK;
15824703203dSis COLLECT_A_HANGUL:
15834703203dSis 				/*
15844703203dSis 				 * If we collected a Starter and combining
15854703203dSis 				 * marks up to 30, i.e., total 31 characters,
15864703203dSis 				 * then, we terminate this degenerately long
15874703203dSis 				 * combining sequence with a U+034F COMBINING
15884703203dSis 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15894703203dSis 				 * UTF-8 and turn this into a Stream-Safe
15904703203dSis 				 * Text. This will be extremely rare but
15914703203dSis 				 * possible.
15924703203dSis 				 *
15934703203dSis 				 * The following will also guarantee that
15944703203dSis 				 * we are not writing more than 32 characters
15954703203dSis 				 * plus a NULL at u8s[].
15964703203dSis 				 */
15974703203dSis 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
15984703203dSis TURN_STREAM_SAFE:
15994703203dSis 					*state = U8_STATE_START;
16004703203dSis 					comb_class[last] = 0;
16014703203dSis 					start[last] = saved_sz;
16024703203dSis 					disp[last] = 2;
16034703203dSis 					last++;
16044703203dSis 
16054703203dSis 					u8s[saved_sz++] = 0xCD;
16064703203dSis 					u8s[saved_sz++] = 0x8F;
16074703203dSis 
16084703203dSis 					break;
16094703203dSis 				}
16104703203dSis 
16114703203dSis 				/*
16124703203dSis 				 * Some combining marks also do decompose into
16134703203dSis 				 * another combining mark or marks.
16144703203dSis 				 */
16154703203dSis 				if (*state == U8_STATE_COMBINING_MARK) {
16164703203dSis 					k = last;
16174703203dSis 					l = sz;
16184703203dSis 					i = do_decomp(uv, uts, s, sz,
16194703203dSis 					    canonical_decomposition, state);
16204703203dSis 					for (j = 0; j < i; ) {
16214703203dSis 						sz = u8_number_of_bytes[uts[j]];
16224703203dSis 
16234703203dSis 						comb_class[last] =
16244703203dSis 						    combining_class(uv,
16254703203dSis 						    uts + j, sz);
16264703203dSis 						start[last] = saved_sz + j;
16274703203dSis 						disp[last] = sz;
16284703203dSis 
16294703203dSis 						last++;
16304703203dSis 						if (last >=
16314703203dSis 						    U8_UPPER_LIMIT_IN_A_SEQ) {
16324703203dSis 							last = k;
16334703203dSis 							goto TURN_STREAM_SAFE;
16344703203dSis 						}
16354703203dSis 						j += sz;
16364703203dSis 					}
16374703203dSis 
16384703203dSis 					*state = U8_STATE_COMBINING_MARK;
16394703203dSis 					sz = i;
16404703203dSis 					s += l;
16414703203dSis 
16424703203dSis 					for (i = 0; i < sz; i++)
16434703203dSis 						u8s[saved_sz++] = uts[i];
16444703203dSis 				} else {
16454703203dSis 					comb_class[last] = i;
16464703203dSis 					start[last] = saved_sz;
16474703203dSis 					disp[last] = sz;
16484703203dSis 					last++;
16494703203dSis 
16504703203dSis 					for (i = 0; i < sz; i++)
16514703203dSis 						u8s[saved_sz++] = *s++;
16524703203dSis 				}
16534703203dSis 
16544703203dSis 				/*
16554703203dSis 				 * If this is U+0345 COMBINING GREEK
16564703203dSis 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16574703203dSis 				 * iota subscript, and need to be converted to
16584703203dSis 				 * uppercase letter, convert it to U+0399 GREEK
16594703203dSis 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16604703203dSis 				 * i.e., convert to capital adscript form as
16614703203dSis 				 * specified in the Unicode standard.
16624703203dSis 				 *
16634703203dSis 				 * This is the only special case of (ambiguous)
16644703203dSis 				 * case conversion at combining marks and
16654703203dSis 				 * probably the standard will never have
16664703203dSis 				 * anything similar like this in future.
16674703203dSis 				 */
16684703203dSis 				if (is_it_toupper && sz >= 2 &&
16694703203dSis 				    u8s[saved_sz - 2] == 0xCD &&
16704703203dSis 				    u8s[saved_sz - 1] == 0x85) {
16714703203dSis 					u8s[saved_sz - 2] = 0xCE;
16724703203dSis 					u8s[saved_sz - 1] = 0x99;
16734703203dSis 				}
16744703203dSis 			}
16754703203dSis 		}
16764703203dSis 
16774703203dSis 		/*
16784703203dSis 		 * Let's try to ensure a canonical ordering for the collected
16794703203dSis 		 * combining marks. We do this only if we have collected
16804703203dSis 		 * at least one more non-Starter. (The decomposition mapping
16814703203dSis 		 * data tables have fully (and recursively) expanded and
16824703203dSis 		 * canonically ordered decompositions.)
16834703203dSis 		 *
16844703203dSis 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
16854703203dSis 		 * assumptions and we are meeting the assumptions.
16864703203dSis 		 */
16874703203dSis 		last--;
16884703203dSis 		if (last >= saved_last) {
16894703203dSis 			for (i = 0; i < last; i++)
16904703203dSis 				for (j = last; j > i; j--)
16914703203dSis 					if (comb_class[j] &&
16924703203dSis 					    comb_class[j - 1] > comb_class[j]) {
16934703203dSis 						U8_SWAP_COMB_MARKS(j - 1, j);
16944703203dSis 					}
16954703203dSis 		}
16964703203dSis 
16974703203dSis 		*source = s;
16984703203dSis 
16994703203dSis 		if (! canonical_composition) {
17004703203dSis 			u8s[saved_sz] = '\0';
17014703203dSis 			return (saved_sz);
17024703203dSis 		}
17034703203dSis 
17044703203dSis 		/*
17054703203dSis 		 * Now do the canonical composition. Note that we do this
17064703203dSis 		 * only after a canonical or compatibility decomposition to
17074703203dSis 		 * finish up NFC or NFKC.
17084703203dSis 		 */
17094703203dSis 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
17104703203dSis 		    &s, slast);
17114703203dSis 	}
17124703203dSis 
17134703203dSis 	*source = s;
17144703203dSis 
17154703203dSis 	return ((size_t)sz);
17164703203dSis }
17174703203dSis 
17184703203dSis /*
17194703203dSis  * The do_norm_compare() function does string comparion based on Unicode
17204703203dSis  * simple case mappings and Unicode Normalization definitions.
17214703203dSis  *
17224703203dSis  * It does so by collecting a sequence of character at a time and comparing
17234703203dSis  * the collected sequences from the strings.
17244703203dSis  *
17254703203dSis  * The meanings on the return values are the same as the usual strcmp().
17264703203dSis  */
17274703203dSis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17284703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729*85bb5f1dSis 	int flag, int *errnum)
17304703203dSis {
17314703203dSis 	int result;
17324703203dSis 	size_t sz1;
17334703203dSis 	size_t sz2;
17344703203dSis 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17354703203dSis 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17364703203dSis 	uchar_t *s1last;
17374703203dSis 	uchar_t *s2last;
17384703203dSis 	boolean_t is_it_toupper;
17394703203dSis 	boolean_t is_it_tolower;
17404703203dSis 	boolean_t canonical_decomposition;
17414703203dSis 	boolean_t compatibility_decomposition;
17424703203dSis 	boolean_t canonical_composition;
17434703203dSis 	u8_normalization_states_t state;
17444703203dSis 
17454703203dSis 	s1last = s1 + n1;
17464703203dSis 	s2last = s2 + n2;
17474703203dSis 
17484703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17494703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17504703203dSis 	canonical_decomposition = flag & U8_CANON_DECOMP;
17514703203dSis 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17524703203dSis 	canonical_composition = flag & U8_CANON_COMP;
17534703203dSis 
17544703203dSis 	while (s1 < s1last && s2 < s2last) {
17554703203dSis 		/*
17564703203dSis 		 * If the current character is a 7-bit ASCII and the last
17574703203dSis 		 * character, or, if the current character and the next
17584703203dSis 		 * character are both some 7-bit ASCII characters then
17594703203dSis 		 * we treat the current character as a sequence.
17604703203dSis 		 *
17614703203dSis 		 * In any other cases, we need to call collect_a_seq().
17624703203dSis 		 */
17634703203dSis 
17644703203dSis 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17654703203dSis 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17664703203dSis 			if (is_it_toupper)
17674703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
17684703203dSis 			else if (is_it_tolower)
17694703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
17704703203dSis 			else
17714703203dSis 				u8s1[0] = *s1;
17724703203dSis 			u8s1[1] = '\0';
17734703203dSis 			sz1 = 1;
17744703203dSis 			s1++;
17754703203dSis 		} else {
17764703203dSis 			state = U8_STATE_START;
17774703203dSis 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17784703203dSis 			    is_it_toupper, is_it_tolower,
17794703203dSis 			    canonical_decomposition,
17804703203dSis 			    compatibility_decomposition,
1781*85bb5f1dSis 			    canonical_composition, errnum, &state);
17824703203dSis 		}
17834703203dSis 
17844703203dSis 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17854703203dSis 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17864703203dSis 			if (is_it_toupper)
17874703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
17884703203dSis 			else if (is_it_tolower)
17894703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
17904703203dSis 			else
17914703203dSis 				u8s2[0] = *s2;
17924703203dSis 			u8s2[1] = '\0';
17934703203dSis 			sz2 = 1;
17944703203dSis 			s2++;
17954703203dSis 		} else {
17964703203dSis 			state = U8_STATE_START;
17974703203dSis 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
17984703203dSis 			    is_it_toupper, is_it_tolower,
17994703203dSis 			    canonical_decomposition,
18004703203dSis 			    compatibility_decomposition,
1801*85bb5f1dSis 			    canonical_composition, errnum, &state);
18024703203dSis 		}
18034703203dSis 
18044703203dSis 		/*
18054703203dSis 		 * Now compare the two characters. If they are the same,
18064703203dSis 		 * we move on to the next character sequences.
18074703203dSis 		 */
18084703203dSis 		if (sz1 == 1 && sz2 == 1) {
18094703203dSis 			if (*u8s1 > *u8s2)
18104703203dSis 				return (1);
18114703203dSis 			if (*u8s1 < *u8s2)
18124703203dSis 				return (-1);
18134703203dSis 		} else {
18144703203dSis 			result = strcmp((const char *)u8s1, (const char *)u8s2);
18154703203dSis 			if (result != 0)
18164703203dSis 				return (result);
18174703203dSis 		}
18184703203dSis 	}
18194703203dSis 
18204703203dSis 	/*
18214703203dSis 	 * We compared until the end of either or both strings.
18224703203dSis 	 *
18234703203dSis 	 * If we reached to or went over the ends for the both, that means
18244703203dSis 	 * they are the same.
18254703203dSis 	 *
18264703203dSis 	 * If we reached only one end, that means the other string has
18274703203dSis 	 * something which then can be used to determine the return value.
18284703203dSis 	 */
18294703203dSis 	if (s1 >= s1last) {
18304703203dSis 		if (s2 >= s2last)
18314703203dSis 			return (0);
18324703203dSis 		return (-1);
18334703203dSis 	}
18344703203dSis 	return (1);
18354703203dSis }
18364703203dSis 
18374703203dSis /*
18384703203dSis  * The u8_strcmp() function compares two UTF-8 strings quite similar to
18394703203dSis  * the strcmp(). For the comparison, however, Unicode Normalization specific
18404703203dSis  * equivalency and Unicode simple case conversion mappings based equivalency
18414703203dSis  * can be requested and checked against.
18424703203dSis  */
18434703203dSis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18444703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845*85bb5f1dSis 		int *errnum)
18464703203dSis {
18474703203dSis 	int f;
18484703203dSis 	size_t n1;
18494703203dSis 	size_t n2;
18504703203dSis 
1851*85bb5f1dSis 	*errnum = 0;
18524703203dSis 
18534703203dSis 	/*
18544703203dSis 	 * Check on the requested Unicode version, case conversion, and
18554703203dSis 	 * normalization flag values.
18564703203dSis 	 */
18574703203dSis 
18584703203dSis 	if (uv > U8_UNICODE_LATEST) {
1859*85bb5f1dSis 		*errnum = ERANGE;
18604703203dSis 		uv = U8_UNICODE_LATEST;
18614703203dSis 	}
18624703203dSis 
18634703203dSis 	if (flag == 0) {
18644703203dSis 		flag = U8_STRCMP_CS;
18654703203dSis 	} else {
18664703203dSis 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18674703203dSis 		    U8_STRCMP_CI_LOWER);
18684703203dSis 		if (f == 0) {
18694703203dSis 			flag |= U8_STRCMP_CS;
18704703203dSis 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18714703203dSis 		    f != U8_STRCMP_CI_LOWER) {
1872*85bb5f1dSis 			*errnum = EBADF;
18734703203dSis 			flag = U8_STRCMP_CS;
18744703203dSis 		}
18754703203dSis 
18764703203dSis 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18774703203dSis 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18784703203dSis 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879*85bb5f1dSis 			*errnum = EBADF;
18804703203dSis 			flag = U8_STRCMP_CS;
18814703203dSis 		}
18824703203dSis 	}
18834703203dSis 
18844703203dSis 	if (flag == U8_STRCMP_CS) {
18854703203dSis 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18864703203dSis 	}
18874703203dSis 
18884703203dSis 	n1 = strlen(s1);
18894703203dSis 	n2 = strlen(s2);
18904703203dSis 	if (n != 0) {
18914703203dSis 		if (n < n1)
18924703203dSis 			n1 = n;
18934703203dSis 		if (n < n2)
18944703203dSis 			n2 = n;
18954703203dSis 	}
18964703203dSis 
18974703203dSis 	/*
18984703203dSis 	 * Simple case conversion can be done much faster and so we do
18994703203dSis 	 * them separately here.
19004703203dSis 	 */
19014703203dSis 	if (flag == U8_STRCMP_CI_UPPER) {
19024703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903*85bb5f1dSis 		    n1, n2, B_TRUE, errnum));
19044703203dSis 	} else if (flag == U8_STRCMP_CI_LOWER) {
19054703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906*85bb5f1dSis 		    n1, n2, B_FALSE, errnum));
19074703203dSis 	}
19084703203dSis 
19094703203dSis 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910*85bb5f1dSis 	    flag, errnum));
19114703203dSis }
19124703203dSis 
19134703203dSis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19144703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915*85bb5f1dSis 	int flag, size_t unicode_version, int *errnum)
19164703203dSis {
19174703203dSis 	int f;
19184703203dSis 	int sz;
19194703203dSis 	uchar_t *ib;
19204703203dSis 	uchar_t *ibtail;
19214703203dSis 	uchar_t *ob;
19224703203dSis 	uchar_t *obtail;
19234703203dSis 	boolean_t do_not_ignore_null;
19244703203dSis 	boolean_t do_not_ignore_invalid;
19254703203dSis 	boolean_t is_it_toupper;
19264703203dSis 	boolean_t is_it_tolower;
19274703203dSis 	boolean_t canonical_decomposition;
19284703203dSis 	boolean_t compatibility_decomposition;
19294703203dSis 	boolean_t canonical_composition;
19304703203dSis 	size_t ret_val;
19314703203dSis 	size_t i;
19324703203dSis 	size_t j;
19334703203dSis 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19344703203dSis 	u8_normalization_states_t state;
19354703203dSis 
19364703203dSis 	if (unicode_version > U8_UNICODE_LATEST) {
1937*85bb5f1dSis 		*errnum = ERANGE;
19384703203dSis 		return ((size_t)-1);
19394703203dSis 	}
19404703203dSis 
19414703203dSis 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19424703203dSis 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943*85bb5f1dSis 		*errnum = EBADF;
19444703203dSis 		return ((size_t)-1);
19454703203dSis 	}
19464703203dSis 
19474703203dSis 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19484703203dSis 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19494703203dSis 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950*85bb5f1dSis 		*errnum = EBADF;
19514703203dSis 		return ((size_t)-1);
19524703203dSis 	}
19534703203dSis 
19544703203dSis 	if (inarray == NULL || *inlen == 0)
19554703203dSis 		return (0);
19564703203dSis 
19574703203dSis 	if (outarray == NULL) {
1958*85bb5f1dSis 		*errnum = E2BIG;
19594703203dSis 		return ((size_t)-1);
19604703203dSis 	}
19614703203dSis 
19624703203dSis 	ib = (uchar_t *)inarray;
19634703203dSis 	ob = (uchar_t *)outarray;
19644703203dSis 	ibtail = ib + *inlen;
19654703203dSis 	obtail = ob + *outlen;
19664703203dSis 
19674703203dSis 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19684703203dSis 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19694703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19704703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19714703203dSis 
19724703203dSis 	ret_val = 0;
19734703203dSis 
19744703203dSis 	/*
19754703203dSis 	 * If we don't have a normalization flag set, we do the simple case
19764703203dSis 	 * conversion based text preparation separately below. Text
19774703203dSis 	 * preparation involving Normalization will be done in the false task
19784703203dSis 	 * block, again, separately since it will take much more time and
19794703203dSis 	 * resource than doing simple case conversions.
19804703203dSis 	 */
19814703203dSis 	if (f == 0) {
19824703203dSis 		while (ib < ibtail) {
19834703203dSis 			if (*ib == '\0' && do_not_ignore_null)
19844703203dSis 				break;
19854703203dSis 
19864703203dSis 			sz = u8_number_of_bytes[*ib];
19874703203dSis 
19884703203dSis 			if (sz < 0) {
19894703203dSis 				if (do_not_ignore_invalid) {
1990*85bb5f1dSis 					*errnum = EILSEQ;
19914703203dSis 					ret_val = (size_t)-1;
19924703203dSis 					break;
19934703203dSis 				}
19944703203dSis 
19954703203dSis 				sz = 1;
19964703203dSis 				ret_val++;
19974703203dSis 			}
19984703203dSis 
19994703203dSis 			if (sz == 1) {
20004703203dSis 				if (ob >= obtail) {
2001*85bb5f1dSis 					*errnum = E2BIG;
20024703203dSis 					ret_val = (size_t)-1;
20034703203dSis 					break;
20044703203dSis 				}
20054703203dSis 
20064703203dSis 				if (is_it_toupper)
20074703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
20084703203dSis 				else if (is_it_tolower)
20094703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
20104703203dSis 				else
20114703203dSis 					*ob = *ib;
20124703203dSis 				ib++;
20134703203dSis 				ob++;
20144703203dSis 			} else if ((ib + sz) > ibtail) {
20154703203dSis 				if (do_not_ignore_invalid) {
2016*85bb5f1dSis 					*errnum = EINVAL;
20174703203dSis 					ret_val = (size_t)-1;
20184703203dSis 					break;
20194703203dSis 				}
20204703203dSis 
20214703203dSis 				if ((obtail - ob) < (ibtail - ib)) {
2022*85bb5f1dSis 					*errnum = E2BIG;
20234703203dSis 					ret_val = (size_t)-1;
20244703203dSis 					break;
20254703203dSis 				}
20264703203dSis 
20274703203dSis 				/*
20284703203dSis 				 * We treat the remaining incomplete character
20294703203dSis 				 * bytes as a character.
20304703203dSis 				 */
20314703203dSis 				ret_val++;
20324703203dSis 
20334703203dSis 				while (ib < ibtail)
20344703203dSis 					*ob++ = *ib++;
20354703203dSis 			} else {
20364703203dSis 				if (is_it_toupper || is_it_tolower) {
20374703203dSis 					i = do_case_conv(unicode_version, u8s,
20384703203dSis 					    ib, sz, is_it_toupper);
20394703203dSis 
20404703203dSis 					if ((obtail - ob) < i) {
2041*85bb5f1dSis 						*errnum = E2BIG;
20424703203dSis 						ret_val = (size_t)-1;
20434703203dSis 						break;
20444703203dSis 					}
20454703203dSis 
20464703203dSis 					ib += sz;
20474703203dSis 
20484703203dSis 					for (sz = 0; sz < i; sz++)
20494703203dSis 						*ob++ = u8s[sz];
20504703203dSis 				} else {
20514703203dSis 					if ((obtail - ob) < sz) {
2052*85bb5f1dSis 						*errnum = E2BIG;
20534703203dSis 						ret_val = (size_t)-1;
20544703203dSis 						break;
20554703203dSis 					}
20564703203dSis 
20574703203dSis 					for (i = 0; i < sz; i++)
20584703203dSis 						*ob++ = *ib++;
20594703203dSis 				}
20604703203dSis 			}
20614703203dSis 		}
20624703203dSis 	} else {
20634703203dSis 		canonical_decomposition = flag & U8_CANON_DECOMP;
20644703203dSis 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20654703203dSis 		canonical_composition = flag & U8_CANON_COMP;
20664703203dSis 
20674703203dSis 		while (ib < ibtail) {
20684703203dSis 			if (*ib == '\0' && do_not_ignore_null)
20694703203dSis 				break;
20704703203dSis 
20714703203dSis 			/*
20724703203dSis 			 * If the current character is a 7-bit ASCII
20734703203dSis 			 * character and it is the last character, or,
20744703203dSis 			 * if the current character is a 7-bit ASCII
20754703203dSis 			 * character and the next character is also a 7-bit
20764703203dSis 			 * ASCII character, then, we copy over this
20774703203dSis 			 * character without going through collect_a_seq().
20784703203dSis 			 *
20794703203dSis 			 * In any other cases, we need to look further with
20804703203dSis 			 * the collect_a_seq() function.
20814703203dSis 			 */
20824703203dSis 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20834703203dSis 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20844703203dSis 				if (ob >= obtail) {
2085*85bb5f1dSis 					*errnum = E2BIG;
20864703203dSis 					ret_val = (size_t)-1;
20874703203dSis 					break;
20884703203dSis 				}
20894703203dSis 
20904703203dSis 				if (is_it_toupper)
20914703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
20924703203dSis 				else if (is_it_tolower)
20934703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
20944703203dSis 				else
20954703203dSis 					*ob = *ib;
20964703203dSis 				ib++;
20974703203dSis 				ob++;
20984703203dSis 			} else {
2099*85bb5f1dSis 				*errnum = 0;
21004703203dSis 				state = U8_STATE_START;
21014703203dSis 
21024703203dSis 				j = collect_a_seq(unicode_version, u8s,
21034703203dSis 				    &ib, ibtail,
21044703203dSis 				    is_it_toupper,
21054703203dSis 				    is_it_tolower,
21064703203dSis 				    canonical_decomposition,
21074703203dSis 				    compatibility_decomposition,
21084703203dSis 				    canonical_composition,
2109*85bb5f1dSis 				    errnum, &state);
21104703203dSis 
2111*85bb5f1dSis 				if (*errnum && do_not_ignore_invalid) {
21124703203dSis 					ret_val = (size_t)-1;
21134703203dSis 					break;
21144703203dSis 				}
21154703203dSis 
21164703203dSis 				if ((obtail - ob) < j) {
2117*85bb5f1dSis 					*errnum = E2BIG;
21184703203dSis 					ret_val = (size_t)-1;
21194703203dSis 					break;
21204703203dSis 				}
21214703203dSis 
21224703203dSis 				for (i = 0; i < j; i++)
21234703203dSis 					*ob++ = u8s[i];
21244703203dSis 			}
21254703203dSis 		}
21264703203dSis 	}
21274703203dSis 
21284703203dSis 	*inlen = ibtail - ib;
21294703203dSis 	*outlen = obtail - ob;
21304703203dSis 
21314703203dSis 	return (ret_val);
21324703203dSis }
2133