14703203dSis /*
24703203dSis * CDDL HEADER START
34703203dSis *
44703203dSis * The contents of this file are subject to the terms of the
54703203dSis * Common Development and Distribution License (the "License").
64703203dSis * You may not use this file except in compliance with the License.
74703203dSis *
84703203dSis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
94703203dSis * or http://www.opensolaris.org/os/licensing.
104703203dSis * See the License for the specific language governing permissions
114703203dSis * and limitations under the License.
124703203dSis *
134703203dSis * When distributing Covered Code, include this CDDL HEADER in each
144703203dSis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
154703203dSis * If applicable, add the following below this CDDL HEADER, with the
164703203dSis * fields enclosed by brackets "[]" replaced with your own identifying
174703203dSis * information: Portions Copyright [yyyy] [name of copyright owner]
184703203dSis *
194703203dSis * CDDL HEADER END
204703203dSis */
214703203dSis /*
22*85bb5f1dSis * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
234703203dSis * Use is subject to license terms.
244703203dSis */
254703203dSis
264703203dSis #pragma ident "%Z%%M% %I% %E% SMI"
274703203dSis
284703203dSis
294703203dSis /*
304703203dSis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
314703203dSis *
324703203dSis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
334703203dSis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
344703203dSis * the section 3C man pages.
354703203dSis * Interface stability: Committed.
364703203dSis */
374703203dSis
384703203dSis #include <sys/types.h>
394703203dSis #ifdef _KERNEL
404703203dSis #include <sys/param.h>
414703203dSis #include <sys/sysmacros.h>
424703203dSis #include <sys/systm.h>
434703203dSis #include <sys/debug.h>
444703203dSis #include <sys/kmem.h>
454703203dSis #include <sys/ddi.h>
464703203dSis #include <sys/sunddi.h>
474703203dSis #else
484703203dSis #include <sys/u8_textprep.h>
494703203dSis #include <strings.h>
504703203dSis #endif /* _KERNEL */
514703203dSis #include <sys/byteorder.h>
524703203dSis #include <sys/errno.h>
534703203dSis #include <sys/u8_textprep_data.h>
544703203dSis
554703203dSis
/* Largest number of bytes a single UTF-8 character can occupy. */
#define	U8_MB_CUR_MAX			(4)

/*
 * Largest number of bytes a UTF-8 character needs when it lies within
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* Largest possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * Largest number of characters in a combining/conjoining sequence, and
 * the actual upper-bound limit applied to such a sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* Combining class value that identifies a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)
774703203dSis
/*
 * Hangul related constants and macros.
 *
 * The constants give, as Unicode scalar values, the first and the last
 * Hangul syllables and the first and the last Hangul Jamo leading
 * consonants (L), vowels (V), and optional trailing consonants (T).
 *
 * Note that U8_HANGUL_JAMO_T_FIRST is 0x11A7, not the actual first trailing
 * consonant U+11A8. The trailing consonant is optional, so the value is
 * pre-computed with one subtracted; accordingly, U8_HANGUL_JAMO_T() below
 * excludes U8_HANGUL_JAMO_T_FIRST itself from the range it accepts.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * U8_HANGUL_JAMO_1ST_BYTE can be used as a quick first-byte test for
 * a Hangul Jamo, but a match does not guarantee that the character is
 * a Hangul Jamo; it only means that it most likely is one.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value 'b' as a three-byte UTF-8 sequence into
 * s[i], s[j], and s[k].
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * exactly one statement (e.g., under an unbraced if/else), and 'b' is
 * evaluated only once. Every use must be followed by a semicolon.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		uint32_t u8_hangul_sv_ = (uint32_t)(b); \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    ((u8_hangul_sv_ & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    ((u8_hangul_sv_ & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | (u8_hangul_sv_ & 0x003FU)); \
	} while (0)

/* Range checks on a Unicode scalar value 'u'. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* The lower bound is exclusive; see the note on U8_HANGUL_JAMO_T_FIRST. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/*
 * Given the current normalization state 's' and the next scalar value 'u',
 * tell whether an L can take the V, or an LV can take the T.
 */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
1384703203dSis
/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/*
 * The following are some convenience macros.
 *
 * The statement-like macros are wrapped in do { } while (0) so that each of
 * them expands to exactly one statement and stays correct under an unbraced
 * if/else or loop. Every use must be followed by a semicolon.
 */

/* Assemble the scalar value 'u' from the three bytes of a UTF-8 character. */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	do { \
		(u) = (((uint32_t)(b1) & 0x0F) << 12) | \
		    (((uint32_t)(b2) & 0x3F) << 6) | \
		    ((uint32_t)(b3) & 0x3F); \
	} while (0)

/* Exchange 'a' and 'b' using 't' as the scratch variable. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)

#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)	(((uchar_t)(c)) < 0x80U)

/*
 * Exchange the bytes, the combining classes, and the byte lengths of
 * the two characters at indices 'a' and 'b'.
 *
 * The macro assumes that the two characters are adjacent to each other
 * and that 'a' comes before 'b'. If the assumptions are not met, the macro
 * will fail.
 *
 * It is not self-contained: it uses the variables k, u8s, u8t, start, disp,
 * comb_class, and tc from the scope in which it is expanded.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
1794703203dSis
/*
 * The states that normalization can be in. Enumerators are numbered
 * consecutively starting from zero.
 */
typedef enum {
	U8_STATE_START = 0,		/* 0 */
	U8_STATE_HANGUL_L,		/* 1 */
	U8_STATE_HANGUL_LV,		/* 2 */
	U8_STATE_HANGUL_LVT,		/* 3 */
	U8_STATE_HANGUL_V,		/* 4 */
	U8_STATE_HANGUL_T,		/* 5 */
	U8_STATE_COMBINING_MARK		/* 6 */
} u8_normalization_states_t;
1904703203dSis
1914703203dSis /*
1924703203dSis * The three vectors at below are used to check bytes of a given UTF-8
1934703203dSis * character are valid and not containing any malformed byte values.
1944703203dSis *
1954703203dSis * We used to have a quite relaxed UTF-8 binary representation but then there
1964703203dSis * was some security related issues and so the Unicode Consortium defined
1974703203dSis * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
1984703203dSis * one more time at the Unicode 3.2. The following three tables are based on
1994703203dSis * that.
2004703203dSis */
2014703203dSis
2024703203dSis #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
2034703203dSis
2044703203dSis #define I_ U8_ILLEGAL_CHAR
2054703203dSis #define O_ U8_OUT_OF_RANGE_CHAR
2064703203dSis
2074703203dSis const int8_t u8_number_of_bytes[0x100] = {
2084703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2094703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2104703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2114703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2124703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2134703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2144703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2154703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2164703203dSis
2174703203dSis /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
2184703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2194703203dSis
2204703203dSis /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
2214703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2224703203dSis
2234703203dSis /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
2244703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2254703203dSis
2264703203dSis /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
2274703203dSis I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
2284703203dSis
2294703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
2304703203dSis I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2314703203dSis
2324703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
2334703203dSis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2344703203dSis
2354703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
2364703203dSis 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2374703203dSis
2384703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
2394703203dSis 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
2404703203dSis };
2414703203dSis
2424703203dSis #undef I_
2434703203dSis #undef O_
2444703203dSis
/*
 * Indexed by the first byte of a character: the smallest value that is
 * accepted for the second byte. Only the first bytes 0xC2 - 0xF4 have
 * a non-zero entry.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 0x00 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xC0 - 0xC7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xC8 - 0xCF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xD0 - 0xD7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xD8 - 0xDF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xE0 - 0xE7: E0 has the raised minimum 0xA0 */
	0xA0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xE8 - 0xEF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* 0xF0 - 0xF7: F0 has the raised minimum 0x90 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* 0xF8 - 0xFF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
2864703203dSis
/*
 * Indexed by the first byte of a character: the largest value that is
 * accepted for the second byte. Only the first bytes 0xC2 - 0xF4 have
 * a non-zero entry.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 0x00 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xC0 - 0xC7 */
	0,    0,    0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
	/* 0xC8 - 0xCF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
	/* 0xD0 - 0xD7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
	/* 0xD8 - 0xDF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
	/* 0xE0 - 0xE7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
	/* 0xE8 - 0xEF: ED has the lowered maximum 0x9F */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0xBF,
	/* 0xF0 - 0xF7: F4 has the lowered maximum 0x8F */
	0xBF, 0xBF, 0xBF, 0xBF, 0x8F, 0,    0,    0,
	/* 0xF8 - 0xFF */
	0,    0,    0,    0,    0,    0,    0,    0,
};
3284703203dSis
3294703203dSis
3304703203dSis /*
3314703203dSis * The u8_validate() validates on the given UTF-8 character string and
3324703203dSis * calculate the byte length. It is quite similar to mblen(3C) except that
3334703203dSis * this will validate against the list of characters if required and
3344703203dSis * specific to UTF-8 and Unicode.
3354703203dSis */
3364703203dSis int
u8_validate(char * u8str,size_t n,char ** list,int flag,int * errnum)337*85bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum)
3384703203dSis {
3394703203dSis uchar_t *ib;
3404703203dSis uchar_t *ibtail;
3414703203dSis uchar_t **p;
3424703203dSis uchar_t *s1;
3434703203dSis uchar_t *s2;
3444703203dSis uchar_t f;
3454703203dSis int sz;
3464703203dSis size_t i;
3474703203dSis int ret_val;
3484703203dSis boolean_t second;
3494703203dSis boolean_t no_need_to_validate_entire;
3504703203dSis boolean_t check_additional;
3514703203dSis boolean_t validate_ucs2_range_only;
3524703203dSis
3534703203dSis if (! u8str)
3544703203dSis return (0);
3554703203dSis
3564703203dSis ib = (uchar_t *)u8str;
3574703203dSis ibtail = ib + n;
3584703203dSis
3594703203dSis ret_val = 0;
3604703203dSis
3614703203dSis no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
3624703203dSis check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
3634703203dSis validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
3644703203dSis
3654703203dSis while (ib < ibtail) {
3664703203dSis /*
3674703203dSis * The first byte of a UTF-8 character tells how many
3684703203dSis * bytes will follow for the character. If the first byte
3694703203dSis * is an illegal byte value or out of range value, we just
3704703203dSis * return -1 with an appropriate error number.
3714703203dSis */
3724703203dSis sz = u8_number_of_bytes[*ib];
3734703203dSis if (sz == U8_ILLEGAL_CHAR) {
374*85bb5f1dSis *errnum = EILSEQ;
3754703203dSis return (-1);
3764703203dSis }
3774703203dSis
3784703203dSis if (sz == U8_OUT_OF_RANGE_CHAR ||
3794703203dSis (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*85bb5f1dSis *errnum = ERANGE;
3814703203dSis return (-1);
3824703203dSis }
3834703203dSis
3844703203dSis /*
3854703203dSis * If we don't have enough bytes to check on, that's also
3864703203dSis * an error. As you can see, we give illegal byte sequence
3874703203dSis * checking higher priority then EINVAL cases.
3884703203dSis */
3894703203dSis if ((ibtail - ib) < sz) {
390*85bb5f1dSis *errnum = EINVAL;
3914703203dSis return (-1);
3924703203dSis }
3934703203dSis
3944703203dSis if (sz == 1) {
3954703203dSis ib++;
3964703203dSis ret_val++;
3974703203dSis } else {
3984703203dSis /*
3994703203dSis * Check on the multi-byte UTF-8 character. For more
4004703203dSis * details on this, see comment added for the used
4014703203dSis * data structures at the beginning of the file.
4024703203dSis */
4034703203dSis f = *ib++;
4044703203dSis ret_val++;
4054703203dSis second = B_TRUE;
4064703203dSis for (i = 1; i < sz; i++) {
4074703203dSis if (second) {
4084703203dSis if (*ib < u8_valid_min_2nd_byte[f] ||
4094703203dSis *ib > u8_valid_max_2nd_byte[f]) {
410*85bb5f1dSis *errnum = EILSEQ;
4114703203dSis return (-1);
4124703203dSis }
4134703203dSis second = B_FALSE;
4144703203dSis } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*85bb5f1dSis *errnum = EILSEQ;
4164703203dSis return (-1);
4174703203dSis }
4184703203dSis ib++;
4194703203dSis ret_val++;
4204703203dSis }
4214703203dSis }
4224703203dSis
4234703203dSis if (check_additional) {
4244703203dSis for (p = (uchar_t **)list, i = 0; p[i]; i++) {
4254703203dSis s1 = ib - sz;
4264703203dSis s2 = p[i];
4274703203dSis while (s1 < ib) {
4284703203dSis if (*s1 != *s2 || *s2 == '\0')
4294703203dSis break;
4304703203dSis s1++;
4314703203dSis s2++;
4324703203dSis }
4334703203dSis
4344703203dSis if (s1 >= ib && *s2 == '\0') {
435*85bb5f1dSis *errnum = EBADF;
4364703203dSis return (-1);
4374703203dSis }
4384703203dSis }
4394703203dSis }
4404703203dSis
4414703203dSis if (no_need_to_validate_entire)
4424703203dSis break;
4434703203dSis }
4444703203dSis
4454703203dSis return (ret_val);
4464703203dSis }
4474703203dSis
4484703203dSis /*
4494703203dSis * The do_case_conv() looks at the mapping tables and returns found
4504703203dSis * bytes if any. If not found, the input bytes are returned. The function
4514703203dSis * always terminate the return bytes with a null character assuming that
4524703203dSis * there are plenty of room to do so.
4534703203dSis *
4544703203dSis * The case conversions are simple case conversions mapping a character to
4554703203dSis * another character as specified in the Unicode data. The byte size of
4564703203dSis * the mapped character could be different from that of the input character.
4574703203dSis *
4584703203dSis * The return value is the byte length of the returned character excluding
4594703203dSis * the terminating null byte.
4604703203dSis */
4614703203dSis static size_t
do_case_conv(int uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t is_it_toupper)4624703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
4634703203dSis {
4644703203dSis size_t i;
4654703203dSis uint16_t b1 = 0;
4664703203dSis uint16_t b2 = 0;
4674703203dSis uint16_t b3 = 0;
4684703203dSis uint16_t b3_tbl;
4694703203dSis uint16_t b3_base;
4704703203dSis uint16_t b4 = 0;
4714703203dSis size_t start_id;
4724703203dSis size_t end_id;
4734703203dSis
4744703203dSis /*
4754703203dSis * At this point, the only possible values for sz are 2, 3, and 4.
4764703203dSis * The u8s should point to a vector that is well beyond the size of
4774703203dSis * 5 bytes.
4784703203dSis */
4794703203dSis if (sz == 2) {
4804703203dSis b3 = u8s[0] = s[0];
4814703203dSis b4 = u8s[1] = s[1];
4824703203dSis } else if (sz == 3) {
4834703203dSis b2 = u8s[0] = s[0];
4844703203dSis b3 = u8s[1] = s[1];
4854703203dSis b4 = u8s[2] = s[2];
4864703203dSis } else if (sz == 4) {
4874703203dSis b1 = u8s[0] = s[0];
4884703203dSis b2 = u8s[1] = s[1];
4894703203dSis b3 = u8s[2] = s[2];
4904703203dSis b4 = u8s[3] = s[3];
4914703203dSis } else {
4924703203dSis /* This is not possible but just in case as a fallback. */
4934703203dSis if (is_it_toupper)
4944703203dSis *u8s = U8_ASCII_TOUPPER(*s);
4954703203dSis else
4964703203dSis *u8s = U8_ASCII_TOLOWER(*s);
4974703203dSis u8s[1] = '\0';
4984703203dSis
4994703203dSis return (1);
5004703203dSis }
5014703203dSis u8s[sz] = '\0';
5024703203dSis
5034703203dSis /*
5044703203dSis * Let's find out if we have a corresponding character.
5054703203dSis */
5064703203dSis b1 = u8_common_b1_tbl[uv][b1];
5074703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
5084703203dSis return ((size_t)sz);
5094703203dSis
5104703203dSis b2 = u8_case_common_b2_tbl[uv][b1][b2];
5114703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
5124703203dSis return ((size_t)sz);
5134703203dSis
5144703203dSis if (is_it_toupper) {
5154703203dSis b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
5164703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5174703203dSis return ((size_t)sz);
5184703203dSis
5194703203dSis start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
5204703203dSis end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
5214703203dSis
5224703203dSis /* Either there is no match or an error at the table. */
5234703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5244703203dSis return ((size_t)sz);
5254703203dSis
5264703203dSis b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
5274703203dSis
5284703203dSis for (i = 0; start_id < end_id; start_id++)
5294703203dSis u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
5304703203dSis } else {
5314703203dSis b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
5324703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
5334703203dSis return ((size_t)sz);
5344703203dSis
5354703203dSis start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
5364703203dSis end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
5374703203dSis
5384703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
5394703203dSis return ((size_t)sz);
5404703203dSis
5414703203dSis b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
5424703203dSis
5434703203dSis for (i = 0; start_id < end_id; start_id++)
5444703203dSis u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
5454703203dSis }
5464703203dSis
5474703203dSis /*
5484703203dSis * If i is still zero, that means there is no corresponding character.
5494703203dSis */
5504703203dSis if (i == 0)
5514703203dSis return ((size_t)sz);
5524703203dSis
5534703203dSis u8s[i] = '\0';
5544703203dSis
5554703203dSis return (i);
5564703203dSis }
5574703203dSis
5584703203dSis /*
5594703203dSis * The do_case_compare() function compares the two input strings, s1 and s2,
5604703203dSis * one character at a time doing case conversions if applicable and return
5614703203dSis * the comparison result as like strcmp().
5624703203dSis *
5634703203dSis * Since, in empirical sense, most of text data are 7-bit ASCII characters,
5644703203dSis * we treat the 7-bit ASCII characters as a special case trying to yield
5654703203dSis * faster processing time.
5664703203dSis */
5674703203dSis static int
do_case_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,boolean_t is_it_toupper,int * errnum)5684703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
569*85bb5f1dSis size_t n2, boolean_t is_it_toupper, int *errnum)
5704703203dSis {
5714703203dSis int f;
5724703203dSis int sz1;
5734703203dSis int sz2;
5744703203dSis size_t j;
5754703203dSis size_t i1;
5764703203dSis size_t i2;
5774703203dSis uchar_t u8s1[U8_MB_CUR_MAX + 1];
5784703203dSis uchar_t u8s2[U8_MB_CUR_MAX + 1];
5794703203dSis
5804703203dSis i1 = i2 = 0;
5814703203dSis while (i1 < n1 && i2 < n2) {
5824703203dSis /*
5834703203dSis * Find out what would be the byte length for this UTF-8
5844703203dSis * character at string s1 and also find out if this is
5854703203dSis * an illegal start byte or not and if so, issue a proper
586*85bb5f1dSis * error number and yet treat this byte as a character.
5874703203dSis */
5884703203dSis sz1 = u8_number_of_bytes[*s1];
5894703203dSis if (sz1 < 0) {
590*85bb5f1dSis *errnum = EILSEQ;
5914703203dSis sz1 = 1;
5924703203dSis }
5934703203dSis
5944703203dSis /*
5954703203dSis * For 7-bit ASCII characters mainly, we do a quick case
5964703203dSis * conversion right at here.
5974703203dSis *
5984703203dSis * If we don't have enough bytes for this character, issue
5994703203dSis * an EINVAL error and use what are available.
6004703203dSis *
6014703203dSis * If we have enough bytes, find out if there is
6024703203dSis * a corresponding uppercase character and if so, copy over
6034703203dSis * the bytes for a comparison later. If there is no
6044703203dSis * corresponding uppercase character, then, use what we have
6054703203dSis * for the comparison.
6064703203dSis */
6074703203dSis if (sz1 == 1) {
6084703203dSis if (is_it_toupper)
6094703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1);
6104703203dSis else
6114703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1);
6124703203dSis s1++;
6134703203dSis u8s1[1] = '\0';
6144703203dSis } else if ((i1 + sz1) > n1) {
615*85bb5f1dSis *errnum = EINVAL;
6164703203dSis for (j = 0; (i1 + j) < n1; )
6174703203dSis u8s1[j++] = *s1++;
6184703203dSis u8s1[j] = '\0';
6194703203dSis } else {
6204703203dSis (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
6214703203dSis s1 += sz1;
6224703203dSis }
6234703203dSis
6244703203dSis /* Do the same for the string s2. */
6254703203dSis sz2 = u8_number_of_bytes[*s2];
6264703203dSis if (sz2 < 0) {
627*85bb5f1dSis *errnum = EILSEQ;
6284703203dSis sz2 = 1;
6294703203dSis }
6304703203dSis
6314703203dSis if (sz2 == 1) {
6324703203dSis if (is_it_toupper)
6334703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2);
6344703203dSis else
6354703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2);
6364703203dSis s2++;
6374703203dSis u8s2[1] = '\0';
6384703203dSis } else if ((i2 + sz2) > n2) {
639*85bb5f1dSis *errnum = EINVAL;
6404703203dSis for (j = 0; (i2 + j) < n2; )
6414703203dSis u8s2[j++] = *s2++;
6424703203dSis u8s2[j] = '\0';
6434703203dSis } else {
6444703203dSis (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
6454703203dSis s2 += sz2;
6464703203dSis }
6474703203dSis
6484703203dSis /* Now compare the two characters. */
6494703203dSis if (sz1 == 1 && sz2 == 1) {
6504703203dSis if (*u8s1 > *u8s2)
6514703203dSis return (1);
6524703203dSis if (*u8s1 < *u8s2)
6534703203dSis return (-1);
6544703203dSis } else {
6554703203dSis f = strcmp((const char *)u8s1, (const char *)u8s2);
6564703203dSis if (f != 0)
6574703203dSis return (f);
6584703203dSis }
6594703203dSis
6604703203dSis /*
6614703203dSis * They were the same. Let's move on to the next
6624703203dSis * characters then.
6634703203dSis */
6644703203dSis i1 += sz1;
6654703203dSis i2 += sz2;
6664703203dSis }
6674703203dSis
6684703203dSis /*
6694703203dSis * We compared until the end of either or both strings.
6704703203dSis *
6714703203dSis * If we reached to or went over the ends for the both, that means
6724703203dSis * they are the same.
6734703203dSis *
6744703203dSis * If we reached only one of the two ends, that means the other string
6754703203dSis * has something which then the fact can be used to determine
6764703203dSis * the return value.
6774703203dSis */
6784703203dSis if (i1 >= n1) {
6794703203dSis if (i2 >= n2)
6804703203dSis return (0);
6814703203dSis return (-1);
6824703203dSis }
6834703203dSis return (1);
6844703203dSis }
6854703203dSis
6864703203dSis /*
6874703203dSis * The combining_class() function checks on the given bytes and find out
6884703203dSis * the corresponding Unicode combining class value. The return value 0 means
6894703203dSis * it is a Starter. Any illegal UTF-8 character will also be treated as
6904703203dSis * a Starter.
6914703203dSis */
6924703203dSis static uchar_t
combining_class(size_t uv,uchar_t * s,size_t sz)6934703203dSis combining_class(size_t uv, uchar_t *s, size_t sz)
6944703203dSis {
6954703203dSis uint16_t b1 = 0;
6964703203dSis uint16_t b2 = 0;
6974703203dSis uint16_t b3 = 0;
6984703203dSis uint16_t b4 = 0;
6994703203dSis
7004703203dSis if (sz == 1 || sz > 4)
7014703203dSis return (0);
7024703203dSis
7034703203dSis if (sz == 2) {
7044703203dSis b3 = s[0];
7054703203dSis b4 = s[1];
7064703203dSis } else if (sz == 3) {
7074703203dSis b2 = s[0];
7084703203dSis b3 = s[1];
7094703203dSis b4 = s[2];
7104703203dSis } else if (sz == 4) {
7114703203dSis b1 = s[0];
7124703203dSis b2 = s[1];
7134703203dSis b3 = s[2];
7144703203dSis b4 = s[3];
7154703203dSis }
7164703203dSis
7174703203dSis b1 = u8_common_b1_tbl[uv][b1];
7184703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
7194703203dSis return (0);
7204703203dSis
7214703203dSis b2 = u8_combining_class_b2_tbl[uv][b1][b2];
7224703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
7234703203dSis return (0);
7244703203dSis
7254703203dSis b3 = u8_combining_class_b3_tbl[uv][b2][b3];
7264703203dSis if (b3 == U8_TBL_ELEMENT_NOT_DEF)
7274703203dSis return (0);
7284703203dSis
7294703203dSis return (u8_combining_class_b4_tbl[uv][b3][b4]);
7304703203dSis }
7314703203dSis
7324703203dSis /*
7334703203dSis * The do_decomp() function finds out a matching decomposition if any
7344703203dSis * and return. If there is no match, the input bytes are copied and returned.
7354703203dSis * The function also checks if there is a Hangul, decomposes it if necessary
7364703203dSis * and returns.
7374703203dSis *
7384703203dSis * To save time, a single byte 7-bit ASCII character should be handled by
7394703203dSis * the caller.
7404703203dSis *
7414703203dSis * The function returns the number of bytes returned sans always terminating
7424703203dSis * the null byte. It will also return a state that will tell if there was
7434703203dSis * a Hangul character decomposed which then will be used by the caller.
7444703203dSis */
7454703203dSis static size_t
do_decomp(size_t uv,uchar_t * u8s,uchar_t * s,int sz,boolean_t canonical_decomposition,u8_normalization_states_t * state)7464703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
7474703203dSis boolean_t canonical_decomposition, u8_normalization_states_t *state)
7484703203dSis {
7494703203dSis uint16_t b1 = 0;
7504703203dSis uint16_t b2 = 0;
7514703203dSis uint16_t b3 = 0;
7524703203dSis uint16_t b3_tbl;
7534703203dSis uint16_t b3_base;
7544703203dSis uint16_t b4 = 0;
7554703203dSis size_t start_id;
7564703203dSis size_t end_id;
7574703203dSis size_t i;
7584703203dSis uint32_t u1;
7594703203dSis
7604703203dSis if (sz == 2) {
7614703203dSis b3 = u8s[0] = s[0];
7624703203dSis b4 = u8s[1] = s[1];
7634703203dSis u8s[2] = '\0';
7644703203dSis } else if (sz == 3) {
7654703203dSis /* Convert it to a Unicode scalar value. */
7664703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
7674703203dSis
7684703203dSis /*
7694703203dSis * If this is a Hangul syllable, we decompose it into
7704703203dSis * a leading consonant, a vowel, and an optional trailing
7714703203dSis * consonant and then return.
7724703203dSis */
7734703203dSis if (U8_HANGUL_SYLLABLE(u1)) {
7744703203dSis u1 -= U8_HANGUL_SYL_FIRST;
7754703203dSis
7764703203dSis b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
7774703203dSis b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
7784703203dSis / U8_HANGUL_T_COUNT;
7794703203dSis b3 = u1 % U8_HANGUL_T_COUNT;
7804703203dSis
7814703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
7824703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
7834703203dSis if (b3) {
7844703203dSis b3 += U8_HANGUL_JAMO_T_FIRST;
7854703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
7864703203dSis
7874703203dSis u8s[9] = '\0';
7884703203dSis *state = U8_STATE_HANGUL_LVT;
7894703203dSis return (9);
7904703203dSis }
7914703203dSis
7924703203dSis u8s[6] = '\0';
7934703203dSis *state = U8_STATE_HANGUL_LV;
7944703203dSis return (6);
7954703203dSis }
7964703203dSis
7974703203dSis b2 = u8s[0] = s[0];
7984703203dSis b3 = u8s[1] = s[1];
7994703203dSis b4 = u8s[2] = s[2];
8004703203dSis u8s[3] = '\0';
8014703203dSis
8024703203dSis /*
8034703203dSis * If this is a Hangul Jamo, we know there is nothing
8044703203dSis * further that we can decompose.
8054703203dSis */
8064703203dSis if (U8_HANGUL_JAMO_L(u1)) {
8074703203dSis *state = U8_STATE_HANGUL_L;
8084703203dSis return (3);
8094703203dSis }
8104703203dSis
8114703203dSis if (U8_HANGUL_JAMO_V(u1)) {
8124703203dSis if (*state == U8_STATE_HANGUL_L)
8134703203dSis *state = U8_STATE_HANGUL_LV;
8144703203dSis else
8154703203dSis *state = U8_STATE_HANGUL_V;
8164703203dSis return (3);
8174703203dSis }
8184703203dSis
8194703203dSis if (U8_HANGUL_JAMO_T(u1)) {
8204703203dSis if (*state == U8_STATE_HANGUL_LV)
8214703203dSis *state = U8_STATE_HANGUL_LVT;
8224703203dSis else
8234703203dSis *state = U8_STATE_HANGUL_T;
8244703203dSis return (3);
8254703203dSis }
8264703203dSis } else if (sz == 4) {
8274703203dSis b1 = u8s[0] = s[0];
8284703203dSis b2 = u8s[1] = s[1];
8294703203dSis b3 = u8s[2] = s[2];
8304703203dSis b4 = u8s[3] = s[3];
8314703203dSis u8s[4] = '\0';
8324703203dSis } else {
8334703203dSis /*
8344703203dSis * This is a fallback and should not happen if the function
8354703203dSis * was called properly.
8364703203dSis */
8374703203dSis u8s[0] = s[0];
8384703203dSis u8s[1] = '\0';
8394703203dSis *state = U8_STATE_START;
8404703203dSis return (1);
8414703203dSis }
8424703203dSis
8434703203dSis /*
8444703203dSis * At this point, this rountine does not know what it would get.
8454703203dSis * The caller should sort it out if the state isn't a Hangul one.
8464703203dSis */
8474703203dSis *state = U8_STATE_START;
8484703203dSis
8494703203dSis /* Try to find matching decomposition mapping byte sequence. */
8504703203dSis b1 = u8_common_b1_tbl[uv][b1];
8514703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
8524703203dSis return ((size_t)sz);
8534703203dSis
8544703203dSis b2 = u8_decomp_b2_tbl[uv][b1][b2];
8554703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
8564703203dSis return ((size_t)sz);
8574703203dSis
8584703203dSis b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
8594703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
8604703203dSis return ((size_t)sz);
8614703203dSis
8624703203dSis /*
8634703203dSis * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
8644703203dSis * which is 0x8000, this means we couldn't fit the mappings into
8654703203dSis * the cardinality of a unsigned byte.
8664703203dSis */
8674703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
8684703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
8694703203dSis start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
8704703203dSis end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
8714703203dSis } else {
8724703203dSis start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
8734703203dSis end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
8744703203dSis }
8754703203dSis
8764703203dSis /* This also means there wasn't any matching decomposition. */
8774703203dSis if (start_id >= end_id)
8784703203dSis return ((size_t)sz);
8794703203dSis
8804703203dSis /*
8814703203dSis * The final table for decomposition mappings has three types of
8824703203dSis * byte sequences depending on whether a mapping is for compatibility
8834703203dSis * decomposition, canonical decomposition, or both like the following:
8844703203dSis *
8854703203dSis * (1) Compatibility decomposition mappings:
8864703203dSis *
8874703203dSis * +---+---+-...-+---+
8884703203dSis * | B0| B1| ... | Bm|
8894703203dSis * +---+---+-...-+---+
8904703203dSis *
8914703203dSis * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
8924703203dSis *
8934703203dSis * (2) Canonical decomposition mappings:
8944703203dSis *
8954703203dSis * +---+---+---+-...-+---+
8964703203dSis * | T | b0| b1| ... | bn|
8974703203dSis * +---+---+---+-...-+---+
8984703203dSis *
8994703203dSis * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
9004703203dSis *
9014703203dSis * (3) Both mappings:
9024703203dSis *
9034703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+
9044703203dSis * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
9054703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+
9064703203dSis *
9074703203dSis * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
9084703203dSis * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
9094703203dSis * compatibility mapping bytes.
9104703203dSis *
9114703203dSis * Note that compatibility decomposition means doing recursive
9124703203dSis * decompositions using both compatibility decomposition mappings and
9134703203dSis * canonical decomposition mappings. On the other hand, canonical
9144703203dSis * decomposition means doing recursive decompositions using only
9154703203dSis * canonical decomposition mappings. Since the table we have has gone
9164703203dSis * through the recursions already, we do not need to do so during
9174703203dSis * runtime, i.e., the table has been completely flattened out
9184703203dSis * already.
9194703203dSis */
9204703203dSis
9214703203dSis b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
9224703203dSis
9234703203dSis /* Get the type, T, of the byte sequence. */
9244703203dSis b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
9254703203dSis
9264703203dSis /*
9274703203dSis * If necessary, adjust start_id, end_id, or both. Note that if
9284703203dSis * this is compatibility decomposition mapping, there is no
9294703203dSis * adjustment.
9304703203dSis */
9314703203dSis if (canonical_decomposition) {
9324703203dSis /* Is the mapping only for compatibility decomposition? */
9334703203dSis if (b1 < U8_DECOMP_BOTH)
9344703203dSis return ((size_t)sz);
9354703203dSis
9364703203dSis start_id++;
9374703203dSis
9384703203dSis if (b1 == U8_DECOMP_BOTH) {
9394703203dSis end_id = start_id +
9404703203dSis u8_decomp_final_tbl[uv][b3_base + start_id];
9414703203dSis start_id++;
9424703203dSis }
9434703203dSis } else {
9444703203dSis /*
9454703203dSis * Unless this is a compatibility decomposition mapping,
9464703203dSis * we adjust the start_id.
9474703203dSis */
9484703203dSis if (b1 == U8_DECOMP_BOTH) {
9494703203dSis start_id++;
9504703203dSis start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
9514703203dSis } else if (b1 == U8_DECOMP_CANONICAL) {
9524703203dSis start_id++;
9534703203dSis }
9544703203dSis }
9554703203dSis
9564703203dSis for (i = 0; start_id < end_id; start_id++)
9574703203dSis u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
9584703203dSis u8s[i] = '\0';
9594703203dSis
9604703203dSis return (i);
9614703203dSis }
9624703203dSis
9634703203dSis /*
9644703203dSis * The find_composition_start() function uses the character bytes given and
9654703203dSis * find out the matching composition mappings if any and return the address
9664703203dSis * to the composition mappings as explained in the do_composition().
9674703203dSis */
9684703203dSis static uchar_t *
find_composition_start(size_t uv,uchar_t * s,size_t sz)9694703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
9704703203dSis {
9714703203dSis uint16_t b1 = 0;
9724703203dSis uint16_t b2 = 0;
9734703203dSis uint16_t b3 = 0;
9744703203dSis uint16_t b3_tbl;
9754703203dSis uint16_t b3_base;
9764703203dSis uint16_t b4 = 0;
9774703203dSis size_t start_id;
9784703203dSis size_t end_id;
9794703203dSis
9804703203dSis if (sz == 1) {
9814703203dSis b4 = s[0];
9824703203dSis } else if (sz == 2) {
9834703203dSis b3 = s[0];
9844703203dSis b4 = s[1];
9854703203dSis } else if (sz == 3) {
9864703203dSis b2 = s[0];
9874703203dSis b3 = s[1];
9884703203dSis b4 = s[2];
9894703203dSis } else if (sz == 4) {
9904703203dSis b1 = s[0];
9914703203dSis b2 = s[1];
9924703203dSis b3 = s[2];
9934703203dSis b4 = s[3];
9944703203dSis } else {
9954703203dSis /*
9964703203dSis * This is a fallback and should not happen if the function
9974703203dSis * was called properly.
9984703203dSis */
9994703203dSis return (NULL);
10004703203dSis }
10014703203dSis
10024703203dSis b1 = u8_composition_b1_tbl[uv][b1];
10034703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF)
10044703203dSis return (NULL);
10054703203dSis
10064703203dSis b2 = u8_composition_b2_tbl[uv][b1][b2];
10074703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF)
10084703203dSis return (NULL);
10094703203dSis
10104703203dSis b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
10114703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
10124703203dSis return (NULL);
10134703203dSis
10144703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
10154703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR;
10164703203dSis start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
10174703203dSis end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
10184703203dSis } else {
10194703203dSis start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
10204703203dSis end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
10214703203dSis }
10224703203dSis
10234703203dSis if (start_id >= end_id)
10244703203dSis return (NULL);
10254703203dSis
10264703203dSis b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
10274703203dSis
10284703203dSis return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
10294703203dSis }
10304703203dSis
10314703203dSis /*
10324703203dSis * The blocked() function checks on the combining class values of previous
10334703203dSis * characters in this sequence and return whether it is blocked or not.
10344703203dSis */
10354703203dSis static boolean_t
blocked(uchar_t * comb_class,size_t last)10364703203dSis blocked(uchar_t *comb_class, size_t last)
10374703203dSis {
10384703203dSis uchar_t my_comb_class;
10394703203dSis size_t i;
10404703203dSis
10414703203dSis my_comb_class = comb_class[last];
10424703203dSis for (i = 1; i < last; i++)
10434703203dSis if (comb_class[i] >= my_comb_class ||
10444703203dSis comb_class[i] == U8_COMBINING_CLASS_STARTER)
10454703203dSis return (B_TRUE);
10464703203dSis
10474703203dSis return (B_FALSE);
10484703203dSis }
10494703203dSis
10504703203dSis /*
10514703203dSis * The do_composition() reads the character string pointed by 's' and
10524703203dSis * do necessary canonical composition and then copy over the result back to
10534703203dSis * the 's'.
10544703203dSis *
10554703203dSis * The input argument 's' cannot contain more than 32 characters.
10564703203dSis */
10574703203dSis static size_t
do_composition(size_t uv,uchar_t * s,uchar_t * comb_class,uchar_t * start,uchar_t * disp,size_t last,uchar_t ** os,uchar_t * oslast)10584703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
10594703203dSis uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
10604703203dSis {
10614703203dSis uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
10624703203dSis uchar_t tc[U8_MB_CUR_MAX];
10634703203dSis uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
10644703203dSis size_t saved_marks_count;
10654703203dSis uchar_t *p;
10664703203dSis uchar_t *saved_p;
10674703203dSis uchar_t *q;
10684703203dSis size_t i;
10694703203dSis size_t saved_i;
10704703203dSis size_t j;
10714703203dSis size_t k;
10724703203dSis size_t l;
10734703203dSis size_t C;
10744703203dSis size_t saved_l;
10754703203dSis size_t size;
10764703203dSis uint32_t u1;
10774703203dSis uint32_t u2;
10784703203dSis boolean_t match_not_found = B_TRUE;
10794703203dSis
10804703203dSis /*
10814703203dSis * This should never happen unless the callers are doing some strange
10824703203dSis * and unexpected things.
10834703203dSis *
10844703203dSis * The "last" is the index pointing to the last character not last + 1.
10854703203dSis */
10864703203dSis if (last >= U8_MAX_CHARS_A_SEQ)
10874703203dSis last = U8_UPPER_LIMIT_IN_A_SEQ;
10884703203dSis
10894703203dSis for (i = l = 0; i <= last; i++) {
10904703203dSis /*
10914703203dSis * The last or any non-Starters at the beginning, we don't
10924703203dSis * have any chance to do composition and so we just copy them
10934703203dSis * to the temporary buffer.
10944703203dSis */
10954703203dSis if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
10964703203dSis SAVE_THE_CHAR:
10974703203dSis p = s + start[i];
10984703203dSis size = disp[i];
10994703203dSis for (k = 0; k < size; k++)
11004703203dSis t[l++] = *p++;
11014703203dSis continue;
11024703203dSis }
11034703203dSis
11044703203dSis /*
11054703203dSis * If this could be a start of Hangul Jamos, then, we try to
11064703203dSis * conjoin them.
11074703203dSis */
11084703203dSis if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
11094703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
11104703203dSis s[start[i] + 1], s[start[i] + 2]);
11114703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
11124703203dSis s[start[i] + 4], s[start[i] + 5]);
11134703203dSis
11144703203dSis if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
11154703203dSis u1 -= U8_HANGUL_JAMO_L_FIRST;
11164703203dSis u2 -= U8_HANGUL_JAMO_V_FIRST;
11174703203dSis u1 = U8_HANGUL_SYL_FIRST +
11184703203dSis (u1 * U8_HANGUL_V_COUNT + u2) *
11194703203dSis U8_HANGUL_T_COUNT;
11204703203dSis
11214703203dSis i += 2;
11224703203dSis if (i <= last) {
11234703203dSis U8_PUT_3BYTES_INTO_UTF32(u2,
11244703203dSis s[start[i]], s[start[i] + 1],
11254703203dSis s[start[i] + 2]);
11264703203dSis
11274703203dSis if (U8_HANGUL_JAMO_T(u2)) {
11284703203dSis u1 += u2 -
11294703203dSis U8_HANGUL_JAMO_T_FIRST;
11304703203dSis i++;
11314703203dSis }
11324703203dSis }
11334703203dSis
11344703203dSis U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
11354703203dSis i--;
11364703203dSis l += 3;
11374703203dSis continue;
11384703203dSis }
11394703203dSis }
11404703203dSis
11414703203dSis /*
11424703203dSis * Let's then find out if this Starter has composition
11434703203dSis * mapping.
11444703203dSis */
11454703203dSis p = find_composition_start(uv, s + start[i], disp[i]);
11464703203dSis if (p == NULL)
11474703203dSis goto SAVE_THE_CHAR;
11484703203dSis
11494703203dSis /*
11504703203dSis * We have a Starter with composition mapping and the next
11514703203dSis * character is a non-Starter. Let's try to find out if
11524703203dSis * we can do composition.
11534703203dSis */
11544703203dSis
11554703203dSis saved_p = p;
11564703203dSis saved_i = i;
11574703203dSis saved_l = l;
11584703203dSis saved_marks_count = 0;
11594703203dSis
11604703203dSis TRY_THE_NEXT_MARK:
11614703203dSis q = s + start[++i];
11624703203dSis size = disp[i];
11634703203dSis
11644703203dSis /*
11654703203dSis * The next for() loop compares the non-Starter pointed by
11664703203dSis * 'q' with the possible (joinable) characters pointed by 'p'.
11674703203dSis *
11684703203dSis * The composition final table entry pointed by the 'p'
11694703203dSis * looks like the following:
11704703203dSis *
11714703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11724703203dSis * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F |
11734703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+
11744703203dSis *
11754703203dSis * where C is the count byte indicating the number of
11764703203dSis * mapping pairs where each pair would be look like
11774703203dSis * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
11784703203dSis * character of a canonical decomposition and the B0-Bm are
11794703203dSis * the bytes of a matching composite character. The F is
11804703203dSis * a filler byte after each character as the separator.
11814703203dSis */
11824703203dSis
11834703203dSis match_not_found = B_TRUE;
11844703203dSis
11854703203dSis for (C = *p++; C > 0; C--) {
11864703203dSis for (k = 0; k < size; p++, k++)
11874703203dSis if (*p != q[k])
11884703203dSis break;
11894703203dSis
11904703203dSis /* Have we found it? */
11914703203dSis if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
11924703203dSis match_not_found = B_FALSE;
11934703203dSis
11944703203dSis l = saved_l;
11954703203dSis
11964703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
11974703203dSis t[l++] = *p;
11984703203dSis
11994703203dSis break;
12004703203dSis }
12014703203dSis
12024703203dSis /* We didn't find; skip to the next pair. */
12034703203dSis if (*p != U8_TBL_ELEMENT_FILLER)
12044703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
12054703203dSis ;
12064703203dSis while (*++p != U8_TBL_ELEMENT_FILLER)
12074703203dSis ;
12084703203dSis p++;
12094703203dSis }
12104703203dSis
12114703203dSis /*
12124703203dSis * If there was no match, we will need to save the combining
12134703203dSis * mark for later appending. After that, if the next one
12144703203dSis * is a non-Starter and not blocked, then, we try once
12154703203dSis * again to do composition with the next non-Starter.
12164703203dSis *
12174703203dSis * If there was no match and this was a Starter, then,
12184703203dSis * this is a new start.
12194703203dSis *
12204703203dSis * If there was a match and a composition done and we have
12214703203dSis * more to check on, then, we retrieve a new composition final
12224703203dSis * table entry for the composite and then try to do the
12234703203dSis * composition again.
12244703203dSis */
12254703203dSis
12264703203dSis if (match_not_found) {
12274703203dSis if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
12284703203dSis i--;
12294703203dSis goto SAVE_THE_CHAR;
12304703203dSis }
12314703203dSis
12324703203dSis saved_marks[saved_marks_count++] = i;
12334703203dSis }
12344703203dSis
12354703203dSis if (saved_l == l) {
12364703203dSis while (i < last) {
12374703203dSis if (blocked(comb_class, i + 1))
12384703203dSis saved_marks[saved_marks_count++] = ++i;
12394703203dSis else
12404703203dSis break;
12414703203dSis }
12424703203dSis if (i < last) {
12434703203dSis p = saved_p;
12444703203dSis goto TRY_THE_NEXT_MARK;
12454703203dSis }
12464703203dSis } else if (i < last) {
12474703203dSis p = find_composition_start(uv, t + saved_l,
12484703203dSis l - saved_l);
12494703203dSis if (p != NULL) {
12504703203dSis saved_p = p;
12514703203dSis goto TRY_THE_NEXT_MARK;
12524703203dSis }
12534703203dSis }
12544703203dSis
12554703203dSis /*
12564703203dSis * There is no more composition possible.
12574703203dSis *
12584703203dSis * If there was no composition what so ever then we copy
12594703203dSis * over the original Starter and then append any non-Starters
12604703203dSis * remaining at the target string sequentially after that.
12614703203dSis */
12624703203dSis
12634703203dSis if (saved_l == l) {
12644703203dSis p = s + start[saved_i];
12654703203dSis size = disp[saved_i];
12664703203dSis for (j = 0; j < size; j++)
12674703203dSis t[l++] = *p++;
12684703203dSis }
12694703203dSis
12704703203dSis for (k = 0; k < saved_marks_count; k++) {
12714703203dSis p = s + start[saved_marks[k]];
12724703203dSis size = disp[saved_marks[k]];
12734703203dSis for (j = 0; j < size; j++)
12744703203dSis t[l++] = *p++;
12754703203dSis }
12764703203dSis }
12774703203dSis
12784703203dSis /*
12794703203dSis * If the last character is a Starter and if we have a character
12804703203dSis * (possibly another Starter) that can be turned into a composite,
12814703203dSis * we do so and we do so until there is no more of composition
12824703203dSis * possible.
12834703203dSis */
12844703203dSis if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
12854703203dSis p = *os;
12864703203dSis saved_l = l - disp[last];
12874703203dSis
12884703203dSis while (p < oslast) {
12894703203dSis size = u8_number_of_bytes[*p];
12904703203dSis if (size <= 1 || (p + size) > oslast)
12914703203dSis break;
12924703203dSis
12934703203dSis saved_p = p;
12944703203dSis
12954703203dSis for (i = 0; i < size; i++)
12964703203dSis tc[i] = *p++;
12974703203dSis
12984703203dSis q = find_composition_start(uv, t + saved_l,
12994703203dSis l - saved_l);
13004703203dSis if (q == NULL) {
13014703203dSis p = saved_p;
13024703203dSis break;
13034703203dSis }
13044703203dSis
13054703203dSis match_not_found = B_TRUE;
13064703203dSis
13074703203dSis for (C = *q++; C > 0; C--) {
13084703203dSis for (k = 0; k < size; q++, k++)
13094703203dSis if (*q != tc[k])
13104703203dSis break;
13114703203dSis
13124703203dSis if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
13134703203dSis match_not_found = B_FALSE;
13144703203dSis
13154703203dSis l = saved_l;
13164703203dSis
13174703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) {
13184703203dSis /*
13194703203dSis * This is practically
13204703203dSis * impossible but we don't
13214703203dSis * want to take any chances.
13224703203dSis */
13234703203dSis if (l >=
13244703203dSis U8_STREAM_SAFE_TEXT_MAX) {
13254703203dSis p = saved_p;
13264703203dSis goto SAFE_RETURN;
13274703203dSis }
13284703203dSis t[l++] = *q;
13294703203dSis }
13304703203dSis
13314703203dSis break;
13324703203dSis }
13334703203dSis
13344703203dSis if (*q != U8_TBL_ELEMENT_FILLER)
13354703203dSis while (*++q != U8_TBL_ELEMENT_FILLER)
13364703203dSis ;
13374703203dSis while (*++q != U8_TBL_ELEMENT_FILLER)
13384703203dSis ;
13394703203dSis q++;
13404703203dSis }
13414703203dSis
13424703203dSis if (match_not_found) {
13434703203dSis p = saved_p;
13444703203dSis break;
13454703203dSis }
13464703203dSis }
13474703203dSis SAFE_RETURN:
13484703203dSis *os = p;
13494703203dSis }
13504703203dSis
13514703203dSis /*
13524703203dSis * Now we copy over the temporary string to the target string.
13534703203dSis * Since composition always reduces the number of characters or
13544703203dSis * the number of characters stay, we don't need to worry about
13554703203dSis * the buffer overflow here.
13564703203dSis */
13574703203dSis for (i = 0; i < l; i++)
13584703203dSis s[i] = t[i];
13594703203dSis s[l] = '\0';
13604703203dSis
13614703203dSis return (l);
13624703203dSis }
13634703203dSis
/*
 * The collect_a_seq() function examines the given string s, collects
 * a sequence of characters at u8s, and returns the sequence. While it
 * collects a sequence, it also applies case conversion, canonical or
 * compatibility decomposition, canonical composition, or some or all of
 * them, in that order.
 *
 * The collected sequence cannot be bigger than 32 characters: if it would
 * have more than 31 characters, the sequence is terminated with
 * a U+034F COMBINING GRAPHEME JOINER (CGJ) character and thereby turned
 * into a Stream-Safe Text. The collected sequence is always terminated with
 * a null byte. The return value is the byte length of the sequence, which
 * can be 0, and does not include the terminating null byte.
 */
13794703203dSis static size_t
collect_a_seq(size_t uv,uchar_t * u8s,uchar_t ** source,uchar_t * slast,boolean_t is_it_toupper,boolean_t is_it_tolower,boolean_t canonical_decomposition,boolean_t compatibility_decomposition,boolean_t canonical_composition,int * errnum,u8_normalization_states_t * state)13804703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
13814703203dSis boolean_t is_it_toupper,
13824703203dSis boolean_t is_it_tolower,
13834703203dSis boolean_t canonical_decomposition,
13844703203dSis boolean_t compatibility_decomposition,
13854703203dSis boolean_t canonical_composition,
1386*85bb5f1dSis int *errnum, u8_normalization_states_t *state)
13874703203dSis {
13884703203dSis uchar_t *s;
13894703203dSis int sz;
13904703203dSis int saved_sz;
13914703203dSis size_t i;
13924703203dSis size_t j;
13934703203dSis size_t k;
13944703203dSis size_t l;
13954703203dSis uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
13964703203dSis uchar_t disp[U8_MAX_CHARS_A_SEQ];
13974703203dSis uchar_t start[U8_MAX_CHARS_A_SEQ];
13984703203dSis uchar_t u8t[U8_MB_CUR_MAX];
13994703203dSis uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
14004703203dSis uchar_t tc;
14014703203dSis size_t last;
14024703203dSis size_t saved_last;
14034703203dSis uint32_t u1;
14044703203dSis
14054703203dSis /*
14064703203dSis * Save the source string pointer which we will return a changed
14074703203dSis * pointer if we do processing.
14084703203dSis */
14094703203dSis s = *source;
14104703203dSis
14114703203dSis /*
14124703203dSis * The following is a fallback for just in case callers are not
14134703203dSis * checking the string boundaries before the calling.
14144703203dSis */
14154703203dSis if (s >= slast) {
14164703203dSis u8s[0] = '\0';
14174703203dSis
14184703203dSis return (0);
14194703203dSis }
14204703203dSis
14214703203dSis /*
14224703203dSis * As the first thing, let's collect a character and do case
14234703203dSis * conversion if necessary.
14244703203dSis */
14254703203dSis
14264703203dSis sz = u8_number_of_bytes[*s];
14274703203dSis
14284703203dSis if (sz < 0) {
1429*85bb5f1dSis *errnum = EILSEQ;
14304703203dSis
14314703203dSis u8s[0] = *s++;
14324703203dSis u8s[1] = '\0';
14334703203dSis
14344703203dSis *source = s;
14354703203dSis
14364703203dSis return (1);
14374703203dSis }
14384703203dSis
14394703203dSis if (sz == 1) {
14404703203dSis if (is_it_toupper)
14414703203dSis u8s[0] = U8_ASCII_TOUPPER(*s);
14424703203dSis else if (is_it_tolower)
14434703203dSis u8s[0] = U8_ASCII_TOLOWER(*s);
14444703203dSis else
14454703203dSis u8s[0] = *s;
14464703203dSis s++;
14474703203dSis u8s[1] = '\0';
14484703203dSis } else if ((s + sz) > slast) {
1449*85bb5f1dSis *errnum = EINVAL;
14504703203dSis
14514703203dSis for (i = 0; s < slast; )
14524703203dSis u8s[i++] = *s++;
14534703203dSis u8s[i] = '\0';
14544703203dSis
14554703203dSis *source = s;
14564703203dSis
14574703203dSis return (i);
14584703203dSis } else {
14594703203dSis if (is_it_toupper || is_it_tolower) {
14604703203dSis i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
14614703203dSis s += sz;
14624703203dSis sz = i;
14634703203dSis } else {
14644703203dSis for (i = 0; i < sz; )
14654703203dSis u8s[i++] = *s++;
14664703203dSis u8s[i] = '\0';
14674703203dSis }
14684703203dSis }
14694703203dSis
14704703203dSis /*
14714703203dSis * And then canonical/compatibility decomposition followed by
14724703203dSis * an optional canonical composition. Please be noted that
14734703203dSis * canonical composition is done only when a decomposition is
14744703203dSis * done.
14754703203dSis */
14764703203dSis if (canonical_decomposition || compatibility_decomposition) {
14774703203dSis if (sz == 1) {
14784703203dSis *state = U8_STATE_START;
14794703203dSis
14804703203dSis saved_sz = 1;
14814703203dSis
14824703203dSis comb_class[0] = 0;
14834703203dSis start[0] = 0;
14844703203dSis disp[0] = 1;
14854703203dSis
14864703203dSis last = 1;
14874703203dSis } else {
14884703203dSis saved_sz = do_decomp(uv, u8s, u8s, sz,
14894703203dSis canonical_decomposition, state);
14904703203dSis
14914703203dSis last = 0;
14924703203dSis
14934703203dSis for (i = 0; i < saved_sz; ) {
14944703203dSis sz = u8_number_of_bytes[u8s[i]];
14954703203dSis
14964703203dSis comb_class[last] = combining_class(uv,
14974703203dSis u8s + i, sz);
14984703203dSis start[last] = i;
14994703203dSis disp[last] = sz;
15004703203dSis
15014703203dSis last++;
15024703203dSis i += sz;
15034703203dSis }
15044703203dSis
15054703203dSis /*
15064703203dSis * Decomposition yields various Hangul related
15074703203dSis * states but not on combining marks. We need to
15084703203dSis * find out at here by checking on the last
15094703203dSis * character.
15104703203dSis */
15114703203dSis if (*state == U8_STATE_START) {
15124703203dSis if (comb_class[last - 1])
15134703203dSis *state = U8_STATE_COMBINING_MARK;
15144703203dSis }
15154703203dSis }
15164703203dSis
15174703203dSis saved_last = last;
15184703203dSis
15194703203dSis while (s < slast) {
15204703203dSis sz = u8_number_of_bytes[*s];
15214703203dSis
15224703203dSis /*
15234703203dSis * If this is an illegal character, an incomplete
15244703203dSis * character, or an 7-bit ASCII Starter character,
15254703203dSis * then we have collected a sequence; break and let
15264703203dSis * the next call deal with the two cases.
15274703203dSis *
15284703203dSis * Note that this is okay only if you are using this
15294703203dSis * function with a fixed length string, not on
15304703203dSis * a buffer with multiple calls of one chunk at a time.
15314703203dSis */
15324703203dSis if (sz <= 1) {
15334703203dSis break;
15344703203dSis } else if ((s + sz) > slast) {
15354703203dSis break;
15364703203dSis } else {
15374703203dSis /*
15384703203dSis * If the previous character was a Hangul Jamo
15394703203dSis * and this character is a Hangul Jamo that
15404703203dSis * can be conjoined, we collect the Jamo.
15414703203dSis */
15424703203dSis if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
15434703203dSis U8_PUT_3BYTES_INTO_UTF32(u1,
15444703203dSis *s, *(s + 1), *(s + 2));
15454703203dSis
15464703203dSis if (U8_HANGUL_COMPOSABLE_L_V(*state,
15474703203dSis u1)) {
15484703203dSis i = 0;
15494703203dSis *state = U8_STATE_HANGUL_LV;
15504703203dSis goto COLLECT_A_HANGUL;
15514703203dSis }
15524703203dSis
15534703203dSis if (U8_HANGUL_COMPOSABLE_LV_T(*state,
15544703203dSis u1)) {
15554703203dSis i = 0;
15564703203dSis *state = U8_STATE_HANGUL_LVT;
15574703203dSis goto COLLECT_A_HANGUL;
15584703203dSis }
15594703203dSis }
15604703203dSis
15614703203dSis /*
15624703203dSis * Regardless of whatever it was, if this is
15634703203dSis * a Starter, we don't collect the character
15644703203dSis * since that's a new start and we will deal
15654703203dSis * with it at the next time.
15664703203dSis */
15674703203dSis i = combining_class(uv, s, sz);
15684703203dSis if (i == U8_COMBINING_CLASS_STARTER)
15694703203dSis break;
15704703203dSis
15714703203dSis /*
15724703203dSis * We know the current character is a combining
15734703203dSis * mark. If the previous character wasn't
15744703203dSis * a Starter (not Hangul) or a combining mark,
15754703203dSis * then, we don't collect this combining mark.
15764703203dSis */
15774703203dSis if (*state != U8_STATE_START &&
15784703203dSis *state != U8_STATE_COMBINING_MARK)
15794703203dSis break;
15804703203dSis
15814703203dSis *state = U8_STATE_COMBINING_MARK;
15824703203dSis COLLECT_A_HANGUL:
15834703203dSis /*
15844703203dSis * If we collected a Starter and combining
15854703203dSis * marks up to 30, i.e., total 31 characters,
15864703203dSis * then, we terminate this degenerately long
15874703203dSis * combining sequence with a U+034F COMBINING
15884703203dSis * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
15894703203dSis * UTF-8 and turn this into a Stream-Safe
15904703203dSis * Text. This will be extremely rare but
15914703203dSis * possible.
15924703203dSis *
15934703203dSis * The following will also guarantee that
15944703203dSis * we are not writing more than 32 characters
15954703203dSis * plus a NULL at u8s[].
15964703203dSis */
15974703203dSis if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
15984703203dSis TURN_STREAM_SAFE:
15994703203dSis *state = U8_STATE_START;
16004703203dSis comb_class[last] = 0;
16014703203dSis start[last] = saved_sz;
16024703203dSis disp[last] = 2;
16034703203dSis last++;
16044703203dSis
16054703203dSis u8s[saved_sz++] = 0xCD;
16064703203dSis u8s[saved_sz++] = 0x8F;
16074703203dSis
16084703203dSis break;
16094703203dSis }
16104703203dSis
16114703203dSis /*
16124703203dSis * Some combining marks also do decompose into
16134703203dSis * another combining mark or marks.
16144703203dSis */
16154703203dSis if (*state == U8_STATE_COMBINING_MARK) {
16164703203dSis k = last;
16174703203dSis l = sz;
16184703203dSis i = do_decomp(uv, uts, s, sz,
16194703203dSis canonical_decomposition, state);
16204703203dSis for (j = 0; j < i; ) {
16214703203dSis sz = u8_number_of_bytes[uts[j]];
16224703203dSis
16234703203dSis comb_class[last] =
16244703203dSis combining_class(uv,
16254703203dSis uts + j, sz);
16264703203dSis start[last] = saved_sz + j;
16274703203dSis disp[last] = sz;
16284703203dSis
16294703203dSis last++;
16304703203dSis if (last >=
16314703203dSis U8_UPPER_LIMIT_IN_A_SEQ) {
16324703203dSis last = k;
16334703203dSis goto TURN_STREAM_SAFE;
16344703203dSis }
16354703203dSis j += sz;
16364703203dSis }
16374703203dSis
16384703203dSis *state = U8_STATE_COMBINING_MARK;
16394703203dSis sz = i;
16404703203dSis s += l;
16414703203dSis
16424703203dSis for (i = 0; i < sz; i++)
16434703203dSis u8s[saved_sz++] = uts[i];
16444703203dSis } else {
16454703203dSis comb_class[last] = i;
16464703203dSis start[last] = saved_sz;
16474703203dSis disp[last] = sz;
16484703203dSis last++;
16494703203dSis
16504703203dSis for (i = 0; i < sz; i++)
16514703203dSis u8s[saved_sz++] = *s++;
16524703203dSis }
16534703203dSis
16544703203dSis /*
16554703203dSis * If this is U+0345 COMBINING GREEK
16564703203dSis * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
16574703203dSis * iota subscript, and need to be converted to
16584703203dSis * uppercase letter, convert it to U+0399 GREEK
16594703203dSis * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
16604703203dSis * i.e., convert to capital adscript form as
16614703203dSis * specified in the Unicode standard.
16624703203dSis *
16634703203dSis * This is the only special case of (ambiguous)
16644703203dSis * case conversion at combining marks and
16654703203dSis * probably the standard will never have
16664703203dSis * anything similar like this in future.
16674703203dSis */
16684703203dSis if (is_it_toupper && sz >= 2 &&
16694703203dSis u8s[saved_sz - 2] == 0xCD &&
16704703203dSis u8s[saved_sz - 1] == 0x85) {
16714703203dSis u8s[saved_sz - 2] = 0xCE;
16724703203dSis u8s[saved_sz - 1] = 0x99;
16734703203dSis }
16744703203dSis }
16754703203dSis }
16764703203dSis
16774703203dSis /*
16784703203dSis * Let's try to ensure a canonical ordering for the collected
16794703203dSis * combining marks. We do this only if we have collected
16804703203dSis * at least one more non-Starter. (The decomposition mapping
16814703203dSis * data tables have fully (and recursively) expanded and
16824703203dSis * canonically ordered decompositions.)
16834703203dSis *
16844703203dSis * The U8_SWAP_COMB_MARKS() convenience macro has some
16854703203dSis * assumptions and we are meeting the assumptions.
16864703203dSis */
16874703203dSis last--;
16884703203dSis if (last >= saved_last) {
16894703203dSis for (i = 0; i < last; i++)
16904703203dSis for (j = last; j > i; j--)
16914703203dSis if (comb_class[j] &&
16924703203dSis comb_class[j - 1] > comb_class[j]) {
16934703203dSis U8_SWAP_COMB_MARKS(j - 1, j);
16944703203dSis }
16954703203dSis }
16964703203dSis
16974703203dSis *source = s;
16984703203dSis
16994703203dSis if (! canonical_composition) {
17004703203dSis u8s[saved_sz] = '\0';
17014703203dSis return (saved_sz);
17024703203dSis }
17034703203dSis
17044703203dSis /*
17054703203dSis * Now do the canonical composition. Note that we do this
17064703203dSis * only after a canonical or compatibility decomposition to
17074703203dSis * finish up NFC or NFKC.
17084703203dSis */
17094703203dSis sz = do_composition(uv, u8s, comb_class, start, disp, last,
17104703203dSis &s, slast);
17114703203dSis }
17124703203dSis
17134703203dSis *source = s;
17144703203dSis
17154703203dSis return ((size_t)sz);
17164703203dSis }
17174703203dSis
17184703203dSis /*
17194703203dSis * The do_norm_compare() function does string comparion based on Unicode
17204703203dSis * simple case mappings and Unicode Normalization definitions.
17214703203dSis *
17224703203dSis * It does so by collecting a sequence of character at a time and comparing
17234703203dSis * the collected sequences from the strings.
17244703203dSis *
17254703203dSis * The meanings on the return values are the same as the usual strcmp().
17264703203dSis */
17274703203dSis static int
do_norm_compare(size_t uv,uchar_t * s1,uchar_t * s2,size_t n1,size_t n2,int flag,int * errnum)17284703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729*85bb5f1dSis int flag, int *errnum)
17304703203dSis {
17314703203dSis int result;
17324703203dSis size_t sz1;
17334703203dSis size_t sz2;
17344703203dSis uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
17354703203dSis uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
17364703203dSis uchar_t *s1last;
17374703203dSis uchar_t *s2last;
17384703203dSis boolean_t is_it_toupper;
17394703203dSis boolean_t is_it_tolower;
17404703203dSis boolean_t canonical_decomposition;
17414703203dSis boolean_t compatibility_decomposition;
17424703203dSis boolean_t canonical_composition;
17434703203dSis u8_normalization_states_t state;
17444703203dSis
17454703203dSis s1last = s1 + n1;
17464703203dSis s2last = s2 + n2;
17474703203dSis
17484703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
17494703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
17504703203dSis canonical_decomposition = flag & U8_CANON_DECOMP;
17514703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
17524703203dSis canonical_composition = flag & U8_CANON_COMP;
17534703203dSis
17544703203dSis while (s1 < s1last && s2 < s2last) {
17554703203dSis /*
17564703203dSis * If the current character is a 7-bit ASCII and the last
17574703203dSis * character, or, if the current character and the next
17584703203dSis * character are both some 7-bit ASCII characters then
17594703203dSis * we treat the current character as a sequence.
17604703203dSis *
17614703203dSis * In any other cases, we need to call collect_a_seq().
17624703203dSis */
17634703203dSis
17644703203dSis if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
17654703203dSis ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
17664703203dSis if (is_it_toupper)
17674703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1);
17684703203dSis else if (is_it_tolower)
17694703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1);
17704703203dSis else
17714703203dSis u8s1[0] = *s1;
17724703203dSis u8s1[1] = '\0';
17734703203dSis sz1 = 1;
17744703203dSis s1++;
17754703203dSis } else {
17764703203dSis state = U8_STATE_START;
17774703203dSis sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
17784703203dSis is_it_toupper, is_it_tolower,
17794703203dSis canonical_decomposition,
17804703203dSis compatibility_decomposition,
1781*85bb5f1dSis canonical_composition, errnum, &state);
17824703203dSis }
17834703203dSis
17844703203dSis if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
17854703203dSis ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
17864703203dSis if (is_it_toupper)
17874703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2);
17884703203dSis else if (is_it_tolower)
17894703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2);
17904703203dSis else
17914703203dSis u8s2[0] = *s2;
17924703203dSis u8s2[1] = '\0';
17934703203dSis sz2 = 1;
17944703203dSis s2++;
17954703203dSis } else {
17964703203dSis state = U8_STATE_START;
17974703203dSis sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
17984703203dSis is_it_toupper, is_it_tolower,
17994703203dSis canonical_decomposition,
18004703203dSis compatibility_decomposition,
1801*85bb5f1dSis canonical_composition, errnum, &state);
18024703203dSis }
18034703203dSis
18044703203dSis /*
18054703203dSis * Now compare the two characters. If they are the same,
18064703203dSis * we move on to the next character sequences.
18074703203dSis */
18084703203dSis if (sz1 == 1 && sz2 == 1) {
18094703203dSis if (*u8s1 > *u8s2)
18104703203dSis return (1);
18114703203dSis if (*u8s1 < *u8s2)
18124703203dSis return (-1);
18134703203dSis } else {
18144703203dSis result = strcmp((const char *)u8s1, (const char *)u8s2);
18154703203dSis if (result != 0)
18164703203dSis return (result);
18174703203dSis }
18184703203dSis }
18194703203dSis
18204703203dSis /*
18214703203dSis * We compared until the end of either or both strings.
18224703203dSis *
18234703203dSis * If we reached to or went over the ends for the both, that means
18244703203dSis * they are the same.
18254703203dSis *
18264703203dSis * If we reached only one end, that means the other string has
18274703203dSis * something which then can be used to determine the return value.
18284703203dSis */
18294703203dSis if (s1 >= s1last) {
18304703203dSis if (s2 >= s2last)
18314703203dSis return (0);
18324703203dSis return (-1);
18334703203dSis }
18344703203dSis return (1);
18354703203dSis }
18364703203dSis
18374703203dSis /*
18384703203dSis * The u8_strcmp() function compares two UTF-8 strings quite similar to
18394703203dSis * the strcmp(). For the comparison, however, Unicode Normalization specific
18404703203dSis * equivalency and Unicode simple case conversion mappings based equivalency
18414703203dSis * can be requested and checked against.
18424703203dSis */
18434703203dSis int
u8_strcmp(const char * s1,const char * s2,size_t n,int flag,size_t uv,int * errnum)18444703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845*85bb5f1dSis int *errnum)
18464703203dSis {
18474703203dSis int f;
18484703203dSis size_t n1;
18494703203dSis size_t n2;
18504703203dSis
1851*85bb5f1dSis *errnum = 0;
18524703203dSis
18534703203dSis /*
18544703203dSis * Check on the requested Unicode version, case conversion, and
18554703203dSis * normalization flag values.
18564703203dSis */
18574703203dSis
18584703203dSis if (uv > U8_UNICODE_LATEST) {
1859*85bb5f1dSis *errnum = ERANGE;
18604703203dSis uv = U8_UNICODE_LATEST;
18614703203dSis }
18624703203dSis
18634703203dSis if (flag == 0) {
18644703203dSis flag = U8_STRCMP_CS;
18654703203dSis } else {
18664703203dSis f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
18674703203dSis U8_STRCMP_CI_LOWER);
18684703203dSis if (f == 0) {
18694703203dSis flag |= U8_STRCMP_CS;
18704703203dSis } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
18714703203dSis f != U8_STRCMP_CI_LOWER) {
1872*85bb5f1dSis *errnum = EBADF;
18734703203dSis flag = U8_STRCMP_CS;
18744703203dSis }
18754703203dSis
18764703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
18774703203dSis if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
18784703203dSis f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879*85bb5f1dSis *errnum = EBADF;
18804703203dSis flag = U8_STRCMP_CS;
18814703203dSis }
18824703203dSis }
18834703203dSis
18844703203dSis if (flag == U8_STRCMP_CS) {
18854703203dSis return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
18864703203dSis }
18874703203dSis
18884703203dSis n1 = strlen(s1);
18894703203dSis n2 = strlen(s2);
18904703203dSis if (n != 0) {
18914703203dSis if (n < n1)
18924703203dSis n1 = n;
18934703203dSis if (n < n2)
18944703203dSis n2 = n;
18954703203dSis }
18964703203dSis
18974703203dSis /*
18984703203dSis * Simple case conversion can be done much faster and so we do
18994703203dSis * them separately here.
19004703203dSis */
19014703203dSis if (flag == U8_STRCMP_CI_UPPER) {
19024703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903*85bb5f1dSis n1, n2, B_TRUE, errnum));
19044703203dSis } else if (flag == U8_STRCMP_CI_LOWER) {
19054703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906*85bb5f1dSis n1, n2, B_FALSE, errnum));
19074703203dSis }
19084703203dSis
19094703203dSis return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910*85bb5f1dSis flag, errnum));
19114703203dSis }
19124703203dSis
19134703203dSis size_t
u8_textprep_str(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,size_t unicode_version,int * errnum)19144703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915*85bb5f1dSis int flag, size_t unicode_version, int *errnum)
19164703203dSis {
19174703203dSis int f;
19184703203dSis int sz;
19194703203dSis uchar_t *ib;
19204703203dSis uchar_t *ibtail;
19214703203dSis uchar_t *ob;
19224703203dSis uchar_t *obtail;
19234703203dSis boolean_t do_not_ignore_null;
19244703203dSis boolean_t do_not_ignore_invalid;
19254703203dSis boolean_t is_it_toupper;
19264703203dSis boolean_t is_it_tolower;
19274703203dSis boolean_t canonical_decomposition;
19284703203dSis boolean_t compatibility_decomposition;
19294703203dSis boolean_t canonical_composition;
19304703203dSis size_t ret_val;
19314703203dSis size_t i;
19324703203dSis size_t j;
19334703203dSis uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
19344703203dSis u8_normalization_states_t state;
19354703203dSis
19364703203dSis if (unicode_version > U8_UNICODE_LATEST) {
1937*85bb5f1dSis *errnum = ERANGE;
19384703203dSis return ((size_t)-1);
19394703203dSis }
19404703203dSis
19414703203dSis f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
19424703203dSis if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943*85bb5f1dSis *errnum = EBADF;
19444703203dSis return ((size_t)-1);
19454703203dSis }
19464703203dSis
19474703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
19484703203dSis if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
19494703203dSis f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950*85bb5f1dSis *errnum = EBADF;
19514703203dSis return ((size_t)-1);
19524703203dSis }
19534703203dSis
19544703203dSis if (inarray == NULL || *inlen == 0)
19554703203dSis return (0);
19564703203dSis
19574703203dSis if (outarray == NULL) {
1958*85bb5f1dSis *errnum = E2BIG;
19594703203dSis return ((size_t)-1);
19604703203dSis }
19614703203dSis
19624703203dSis ib = (uchar_t *)inarray;
19634703203dSis ob = (uchar_t *)outarray;
19644703203dSis ibtail = ib + *inlen;
19654703203dSis obtail = ob + *outlen;
19664703203dSis
19674703203dSis do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
19684703203dSis do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
19694703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
19704703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
19714703203dSis
19724703203dSis ret_val = 0;
19734703203dSis
19744703203dSis /*
19754703203dSis * If we don't have a normalization flag set, we do the simple case
19764703203dSis * conversion based text preparation separately below. Text
19774703203dSis * preparation involving Normalization will be done in the false task
19784703203dSis * block, again, separately since it will take much more time and
19794703203dSis * resource than doing simple case conversions.
19804703203dSis */
19814703203dSis if (f == 0) {
19824703203dSis while (ib < ibtail) {
19834703203dSis if (*ib == '\0' && do_not_ignore_null)
19844703203dSis break;
19854703203dSis
19864703203dSis sz = u8_number_of_bytes[*ib];
19874703203dSis
19884703203dSis if (sz < 0) {
19894703203dSis if (do_not_ignore_invalid) {
1990*85bb5f1dSis *errnum = EILSEQ;
19914703203dSis ret_val = (size_t)-1;
19924703203dSis break;
19934703203dSis }
19944703203dSis
19954703203dSis sz = 1;
19964703203dSis ret_val++;
19974703203dSis }
19984703203dSis
19994703203dSis if (sz == 1) {
20004703203dSis if (ob >= obtail) {
2001*85bb5f1dSis *errnum = E2BIG;
20024703203dSis ret_val = (size_t)-1;
20034703203dSis break;
20044703203dSis }
20054703203dSis
20064703203dSis if (is_it_toupper)
20074703203dSis *ob = U8_ASCII_TOUPPER(*ib);
20084703203dSis else if (is_it_tolower)
20094703203dSis *ob = U8_ASCII_TOLOWER(*ib);
20104703203dSis else
20114703203dSis *ob = *ib;
20124703203dSis ib++;
20134703203dSis ob++;
20144703203dSis } else if ((ib + sz) > ibtail) {
20154703203dSis if (do_not_ignore_invalid) {
2016*85bb5f1dSis *errnum = EINVAL;
20174703203dSis ret_val = (size_t)-1;
20184703203dSis break;
20194703203dSis }
20204703203dSis
20214703203dSis if ((obtail - ob) < (ibtail - ib)) {
2022*85bb5f1dSis *errnum = E2BIG;
20234703203dSis ret_val = (size_t)-1;
20244703203dSis break;
20254703203dSis }
20264703203dSis
20274703203dSis /*
20284703203dSis * We treat the remaining incomplete character
20294703203dSis * bytes as a character.
20304703203dSis */
20314703203dSis ret_val++;
20324703203dSis
20334703203dSis while (ib < ibtail)
20344703203dSis *ob++ = *ib++;
20354703203dSis } else {
20364703203dSis if (is_it_toupper || is_it_tolower) {
20374703203dSis i = do_case_conv(unicode_version, u8s,
20384703203dSis ib, sz, is_it_toupper);
20394703203dSis
20404703203dSis if ((obtail - ob) < i) {
2041*85bb5f1dSis *errnum = E2BIG;
20424703203dSis ret_val = (size_t)-1;
20434703203dSis break;
20444703203dSis }
20454703203dSis
20464703203dSis ib += sz;
20474703203dSis
20484703203dSis for (sz = 0; sz < i; sz++)
20494703203dSis *ob++ = u8s[sz];
20504703203dSis } else {
20514703203dSis if ((obtail - ob) < sz) {
2052*85bb5f1dSis *errnum = E2BIG;
20534703203dSis ret_val = (size_t)-1;
20544703203dSis break;
20554703203dSis }
20564703203dSis
20574703203dSis for (i = 0; i < sz; i++)
20584703203dSis *ob++ = *ib++;
20594703203dSis }
20604703203dSis }
20614703203dSis }
20624703203dSis } else {
20634703203dSis canonical_decomposition = flag & U8_CANON_DECOMP;
20644703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP;
20654703203dSis canonical_composition = flag & U8_CANON_COMP;
20664703203dSis
20674703203dSis while (ib < ibtail) {
20684703203dSis if (*ib == '\0' && do_not_ignore_null)
20694703203dSis break;
20704703203dSis
20714703203dSis /*
20724703203dSis * If the current character is a 7-bit ASCII
20734703203dSis * character and it is the last character, or,
20744703203dSis * if the current character is a 7-bit ASCII
20754703203dSis * character and the next character is also a 7-bit
20764703203dSis * ASCII character, then, we copy over this
20774703203dSis * character without going through collect_a_seq().
20784703203dSis *
20794703203dSis * In any other cases, we need to look further with
20804703203dSis * the collect_a_seq() function.
20814703203dSis */
20824703203dSis if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
20834703203dSis ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
20844703203dSis if (ob >= obtail) {
2085*85bb5f1dSis *errnum = E2BIG;
20864703203dSis ret_val = (size_t)-1;
20874703203dSis break;
20884703203dSis }
20894703203dSis
20904703203dSis if (is_it_toupper)
20914703203dSis *ob = U8_ASCII_TOUPPER(*ib);
20924703203dSis else if (is_it_tolower)
20934703203dSis *ob = U8_ASCII_TOLOWER(*ib);
20944703203dSis else
20954703203dSis *ob = *ib;
20964703203dSis ib++;
20974703203dSis ob++;
20984703203dSis } else {
2099*85bb5f1dSis *errnum = 0;
21004703203dSis state = U8_STATE_START;
21014703203dSis
21024703203dSis j = collect_a_seq(unicode_version, u8s,
21034703203dSis &ib, ibtail,
21044703203dSis is_it_toupper,
21054703203dSis is_it_tolower,
21064703203dSis canonical_decomposition,
21074703203dSis compatibility_decomposition,
21084703203dSis canonical_composition,
2109*85bb5f1dSis errnum, &state);
21104703203dSis
2111*85bb5f1dSis if (*errnum && do_not_ignore_invalid) {
21124703203dSis ret_val = (size_t)-1;
21134703203dSis break;
21144703203dSis }
21154703203dSis
21164703203dSis if ((obtail - ob) < j) {
2117*85bb5f1dSis *errnum = E2BIG;
21184703203dSis ret_val = (size_t)-1;
21194703203dSis break;
21204703203dSis }
21214703203dSis
21224703203dSis for (i = 0; i < j; i++)
21234703203dSis *ob++ = u8s[i];
21244703203dSis }
21254703203dSis }
21264703203dSis }
21274703203dSis
21284703203dSis *inlen = ibtail - ib;
21294703203dSis *outlen = obtail - ob;
21304703203dSis
21314703203dSis return (ret_val);
21324703203dSis }
2133