/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"


/*
 * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
 *
 * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
 * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed.
*/

#include <sys/types.h>
#ifdef	_KERNEL
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#else
#include <sys/u8_textprep.h>
#include <strings.h>
#endif	/* _KERNEL */
#include <sys/byteorder.h>
#include <sys/errno.h>
#include <sys/u8_textprep_data.h>


/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper-bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)

/*
 * Hangul related macros.
 *
 * The first and the last of the Hangul syllables, Hangul Jamo Leading
 * consonants, Vowels, and optional Trailing consonants as Unicode scalar
 * values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below, not the actual
 * U+11A8. Because the trailing consonant is optional, the value is
 * pre-decremented by one so that a trailing index of zero can stand for
 * "no trailing consonant".
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * Please also bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used as
 * a quick check on whether a character may be a Hangul Jamo, but a match
 * does not guarantee that it is one; it only means that it is likely.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value b as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k].
 *
 * NOTE: this expands to three separate statements, each with its own
 * terminating semicolon, so it must not be used as the body of an unbraced
 * if/else/loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

/* Range checks on a Unicode scalar value u. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/* Strictly greater-than: U8_HANGUL_JAMO_T_FIRST is pre-decremented. */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if, in normalization state s, the scalar value u can be composed. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))

/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the payload bits of the three UTF-8 bytes b1, b2, and b3 into
 * the scalar value u. The expansion is an assignment statement that
 * already carries its own terminating semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
		(uint32_t)(b3) & 0x3F;

/* Swap a and b through the temporary t; expands to three statements. */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

/* ASCII-only case mappings; note that c is evaluated more than once. */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)

/*
 * The following macro assumes that the two characters that are to be
 * swapped are adjacent to each other and 'a' comes before 'b'.
 *
 * If the assumptions are not met, then, the macro will fail.
 *
 * The expansion refers to names that are not macro parameters: k, disp,
 * u8t, u8s, start, comb_class, and tc must all be in scope at the point
 * of use. It expands to multiple statements and so needs a braced context.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);

/*
 * The possible states during normalization. The Hangul states record
 * which Jamo (leading consonant L, vowel V, trailing consonant T) have
 * been seen so far for the current syllable.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L = 1,
	U8_STATE_HANGUL_LV = 2,
	U8_STATE_HANGUL_LVT = 3,
	U8_STATE_HANGUL_V = 4,
	U8_STATE_HANGUL_T = 5,
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;

/*
 * The three vectors below are used to check that the bytes of a given UTF-8
 * character are valid and do not contain any malformed byte values.
 *
 * We used to have a quite relaxed UTF-8 binary representation but then there
 * were some security related issues and so the Unicode Consortium defined
 * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
 * one more time at the Unicode 3.2. The following three tables are based on
 * that.
*/

/* True if c cannot be a UTF-8 continuation (2nd, 3rd, or 4th) byte. */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)

/* Short aliases used only to keep the table below readable. */
#define	I_				U8_ILLEGAL_CHAR
#define	O_				U8_OUT_OF_RANGE_CHAR

/*
 * Indexed by the first byte of a UTF-8 character: the total number of
 * bytes in that character (1 to 4), or U8_ILLEGAL_CHAR for a byte that
 * cannot start a character (0x80-0xC1), or U8_OUT_OF_RANGE_CHAR for
 * 0xF5-0xFF.
 */
const int8_t u8_number_of_bytes[0x100] = {
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef	I_
#undef	O_

/*
 * Indexed by the first byte of a multi-byte UTF-8 character: the smallest
 * value the second byte may have. The entries for 0xE0 (0xa0) and 0xF0
 * (0x90) are higher than 0x80, which rejects the non-shortest forms.
 * Entries for bytes that do not start a multi-byte character are zero.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};

/*
 * Indexed by the first byte of a multi-byte UTF-8 character: the largest
 * value the second byte may have. The entry for 0xED (0x9f) excludes the
 * surrogate range and the entry for 0xF4 (0x8f) caps the coding space at
 * U+10FFFF. Entries for bytes that do not start a multi-byte character
 * are zero.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * The u8_validate() validates the given UTF-8 character string and
 * calculates the byte length.
It is quite similar to mblen(3C) except that 3334703203dSis * this will validate against the list of characters if required and 3344703203dSis * specific to UTF-8 and Unicode. 3354703203dSis */ 3364703203dSis int 337*85bb5f1dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) 3384703203dSis { 3394703203dSis uchar_t *ib; 3404703203dSis uchar_t *ibtail; 3414703203dSis uchar_t **p; 3424703203dSis uchar_t *s1; 3434703203dSis uchar_t *s2; 3444703203dSis uchar_t f; 3454703203dSis int sz; 3464703203dSis size_t i; 3474703203dSis int ret_val; 3484703203dSis boolean_t second; 3494703203dSis boolean_t no_need_to_validate_entire; 3504703203dSis boolean_t check_additional; 3514703203dSis boolean_t validate_ucs2_range_only; 3524703203dSis 3534703203dSis if (! u8str) 3544703203dSis return (0); 3554703203dSis 3564703203dSis ib = (uchar_t *)u8str; 3574703203dSis ibtail = ib + n; 3584703203dSis 3594703203dSis ret_val = 0; 3604703203dSis 3614703203dSis no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); 3624703203dSis check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; 3634703203dSis validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; 3644703203dSis 3654703203dSis while (ib < ibtail) { 3664703203dSis /* 3674703203dSis * The first byte of a UTF-8 character tells how many 3684703203dSis * bytes will follow for the character. If the first byte 3694703203dSis * is an illegal byte value or out of range value, we just 3704703203dSis * return -1 with an appropriate error number. 
3714703203dSis */ 3724703203dSis sz = u8_number_of_bytes[*ib]; 3734703203dSis if (sz == U8_ILLEGAL_CHAR) { 374*85bb5f1dSis *errnum = EILSEQ; 3754703203dSis return (-1); 3764703203dSis } 3774703203dSis 3784703203dSis if (sz == U8_OUT_OF_RANGE_CHAR || 3794703203dSis (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { 380*85bb5f1dSis *errnum = ERANGE; 3814703203dSis return (-1); 3824703203dSis } 3834703203dSis 3844703203dSis /* 3854703203dSis * If we don't have enough bytes to check on, that's also 3864703203dSis * an error. As you can see, we give illegal byte sequence 3874703203dSis * checking higher priority then EINVAL cases. 3884703203dSis */ 3894703203dSis if ((ibtail - ib) < sz) { 390*85bb5f1dSis *errnum = EINVAL; 3914703203dSis return (-1); 3924703203dSis } 3934703203dSis 3944703203dSis if (sz == 1) { 3954703203dSis ib++; 3964703203dSis ret_val++; 3974703203dSis } else { 3984703203dSis /* 3994703203dSis * Check on the multi-byte UTF-8 character. For more 4004703203dSis * details on this, see comment added for the used 4014703203dSis * data structures at the beginning of the file. 
4024703203dSis */ 4034703203dSis f = *ib++; 4044703203dSis ret_val++; 4054703203dSis second = B_TRUE; 4064703203dSis for (i = 1; i < sz; i++) { 4074703203dSis if (second) { 4084703203dSis if (*ib < u8_valid_min_2nd_byte[f] || 4094703203dSis *ib > u8_valid_max_2nd_byte[f]) { 410*85bb5f1dSis *errnum = EILSEQ; 4114703203dSis return (-1); 4124703203dSis } 4134703203dSis second = B_FALSE; 4144703203dSis } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { 415*85bb5f1dSis *errnum = EILSEQ; 4164703203dSis return (-1); 4174703203dSis } 4184703203dSis ib++; 4194703203dSis ret_val++; 4204703203dSis } 4214703203dSis } 4224703203dSis 4234703203dSis if (check_additional) { 4244703203dSis for (p = (uchar_t **)list, i = 0; p[i]; i++) { 4254703203dSis s1 = ib - sz; 4264703203dSis s2 = p[i]; 4274703203dSis while (s1 < ib) { 4284703203dSis if (*s1 != *s2 || *s2 == '\0') 4294703203dSis break; 4304703203dSis s1++; 4314703203dSis s2++; 4324703203dSis } 4334703203dSis 4344703203dSis if (s1 >= ib && *s2 == '\0') { 435*85bb5f1dSis *errnum = EBADF; 4364703203dSis return (-1); 4374703203dSis } 4384703203dSis } 4394703203dSis } 4404703203dSis 4414703203dSis if (no_need_to_validate_entire) 4424703203dSis break; 4434703203dSis } 4444703203dSis 4454703203dSis return (ret_val); 4464703203dSis } 4474703203dSis 4484703203dSis /* 4494703203dSis * The do_case_conv() looks at the mapping tables and returns found 4504703203dSis * bytes if any. If not found, the input bytes are returned. The function 4514703203dSis * always terminate the return bytes with a null character assuming that 4524703203dSis * there are plenty of room to do so. 4534703203dSis * 4544703203dSis * The case conversions are simple case conversions mapping a character to 4554703203dSis * another character as specified in the Unicode data. The byte size of 4564703203dSis * the mapped character could be different from that of the input character. 
4574703203dSis * 4584703203dSis * The return value is the byte length of the returned character excluding 4594703203dSis * the terminating null byte. 4604703203dSis */ 4614703203dSis static size_t 4624703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) 4634703203dSis { 4644703203dSis size_t i; 4654703203dSis uint16_t b1 = 0; 4664703203dSis uint16_t b2 = 0; 4674703203dSis uint16_t b3 = 0; 4684703203dSis uint16_t b3_tbl; 4694703203dSis uint16_t b3_base; 4704703203dSis uint16_t b4 = 0; 4714703203dSis size_t start_id; 4724703203dSis size_t end_id; 4734703203dSis 4744703203dSis /* 4754703203dSis * At this point, the only possible values for sz are 2, 3, and 4. 4764703203dSis * The u8s should point to a vector that is well beyond the size of 4774703203dSis * 5 bytes. 4784703203dSis */ 4794703203dSis if (sz == 2) { 4804703203dSis b3 = u8s[0] = s[0]; 4814703203dSis b4 = u8s[1] = s[1]; 4824703203dSis } else if (sz == 3) { 4834703203dSis b2 = u8s[0] = s[0]; 4844703203dSis b3 = u8s[1] = s[1]; 4854703203dSis b4 = u8s[2] = s[2]; 4864703203dSis } else if (sz == 4) { 4874703203dSis b1 = u8s[0] = s[0]; 4884703203dSis b2 = u8s[1] = s[1]; 4894703203dSis b3 = u8s[2] = s[2]; 4904703203dSis b4 = u8s[3] = s[3]; 4914703203dSis } else { 4924703203dSis /* This is not possible but just in case as a fallback. */ 4934703203dSis if (is_it_toupper) 4944703203dSis *u8s = U8_ASCII_TOUPPER(*s); 4954703203dSis else 4964703203dSis *u8s = U8_ASCII_TOLOWER(*s); 4974703203dSis u8s[1] = '\0'; 4984703203dSis 4994703203dSis return (1); 5004703203dSis } 5014703203dSis u8s[sz] = '\0'; 5024703203dSis 5034703203dSis /* 5044703203dSis * Let's find out if we have a corresponding character. 
5054703203dSis */ 5064703203dSis b1 = u8_common_b1_tbl[uv][b1]; 5074703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 5084703203dSis return ((size_t)sz); 5094703203dSis 5104703203dSis b2 = u8_case_common_b2_tbl[uv][b1][b2]; 5114703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 5124703203dSis return ((size_t)sz); 5134703203dSis 5144703203dSis if (is_it_toupper) { 5154703203dSis b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; 5164703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 5174703203dSis return ((size_t)sz); 5184703203dSis 5194703203dSis start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; 5204703203dSis end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; 5214703203dSis 5224703203dSis /* Either there is no match or an error at the table. */ 5234703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 5244703203dSis return ((size_t)sz); 5254703203dSis 5264703203dSis b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; 5274703203dSis 5284703203dSis for (i = 0; start_id < end_id; start_id++) 5294703203dSis u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; 5304703203dSis } else { 5314703203dSis b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; 5324703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 5334703203dSis return ((size_t)sz); 5344703203dSis 5354703203dSis start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; 5364703203dSis end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; 5374703203dSis 5384703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 5394703203dSis return ((size_t)sz); 5404703203dSis 5414703203dSis b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; 5424703203dSis 5434703203dSis for (i = 0; start_id < end_id; start_id++) 5444703203dSis u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; 5454703203dSis } 5464703203dSis 5474703203dSis /* 5484703203dSis * If i is still zero, that means there is no corresponding character. 
5494703203dSis */ 5504703203dSis if (i == 0) 5514703203dSis return ((size_t)sz); 5524703203dSis 5534703203dSis u8s[i] = '\0'; 5544703203dSis 5554703203dSis return (i); 5564703203dSis } 5574703203dSis 5584703203dSis /* 5594703203dSis * The do_case_compare() function compares the two input strings, s1 and s2, 5604703203dSis * one character at a time doing case conversions if applicable and return 5614703203dSis * the comparison result as like strcmp(). 5624703203dSis * 5634703203dSis * Since, in empirical sense, most of text data are 7-bit ASCII characters, 5644703203dSis * we treat the 7-bit ASCII characters as a special case trying to yield 5654703203dSis * faster processing time. 5664703203dSis */ 5674703203dSis static int 5684703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, 569*85bb5f1dSis size_t n2, boolean_t is_it_toupper, int *errnum) 5704703203dSis { 5714703203dSis int f; 5724703203dSis int sz1; 5734703203dSis int sz2; 5744703203dSis size_t j; 5754703203dSis size_t i1; 5764703203dSis size_t i2; 5774703203dSis uchar_t u8s1[U8_MB_CUR_MAX + 1]; 5784703203dSis uchar_t u8s2[U8_MB_CUR_MAX + 1]; 5794703203dSis 5804703203dSis i1 = i2 = 0; 5814703203dSis while (i1 < n1 && i2 < n2) { 5824703203dSis /* 5834703203dSis * Find out what would be the byte length for this UTF-8 5844703203dSis * character at string s1 and also find out if this is 5854703203dSis * an illegal start byte or not and if so, issue a proper 586*85bb5f1dSis * error number and yet treat this byte as a character. 5874703203dSis */ 5884703203dSis sz1 = u8_number_of_bytes[*s1]; 5894703203dSis if (sz1 < 0) { 590*85bb5f1dSis *errnum = EILSEQ; 5914703203dSis sz1 = 1; 5924703203dSis } 5934703203dSis 5944703203dSis /* 5954703203dSis * For 7-bit ASCII characters mainly, we do a quick case 5964703203dSis * conversion right at here. 5974703203dSis * 5984703203dSis * If we don't have enough bytes for this character, issue 5994703203dSis * an EINVAL error and use what are available. 
6004703203dSis * 6014703203dSis * If we have enough bytes, find out if there is 6024703203dSis * a corresponding uppercase character and if so, copy over 6034703203dSis * the bytes for a comparison later. If there is no 6044703203dSis * corresponding uppercase character, then, use what we have 6054703203dSis * for the comparison. 6064703203dSis */ 6074703203dSis if (sz1 == 1) { 6084703203dSis if (is_it_toupper) 6094703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1); 6104703203dSis else 6114703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1); 6124703203dSis s1++; 6134703203dSis u8s1[1] = '\0'; 6144703203dSis } else if ((i1 + sz1) > n1) { 615*85bb5f1dSis *errnum = EINVAL; 6164703203dSis for (j = 0; (i1 + j) < n1; ) 6174703203dSis u8s1[j++] = *s1++; 6184703203dSis u8s1[j] = '\0'; 6194703203dSis } else { 6204703203dSis (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); 6214703203dSis s1 += sz1; 6224703203dSis } 6234703203dSis 6244703203dSis /* Do the same for the string s2. */ 6254703203dSis sz2 = u8_number_of_bytes[*s2]; 6264703203dSis if (sz2 < 0) { 627*85bb5f1dSis *errnum = EILSEQ; 6284703203dSis sz2 = 1; 6294703203dSis } 6304703203dSis 6314703203dSis if (sz2 == 1) { 6324703203dSis if (is_it_toupper) 6334703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2); 6344703203dSis else 6354703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2); 6364703203dSis s2++; 6374703203dSis u8s2[1] = '\0'; 6384703203dSis } else if ((i2 + sz2) > n2) { 639*85bb5f1dSis *errnum = EINVAL; 6404703203dSis for (j = 0; (i2 + j) < n2; ) 6414703203dSis u8s2[j++] = *s2++; 6424703203dSis u8s2[j] = '\0'; 6434703203dSis } else { 6444703203dSis (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); 6454703203dSis s2 += sz2; 6464703203dSis } 6474703203dSis 6484703203dSis /* Now compare the two characters. 
*/ 6494703203dSis if (sz1 == 1 && sz2 == 1) { 6504703203dSis if (*u8s1 > *u8s2) 6514703203dSis return (1); 6524703203dSis if (*u8s1 < *u8s2) 6534703203dSis return (-1); 6544703203dSis } else { 6554703203dSis f = strcmp((const char *)u8s1, (const char *)u8s2); 6564703203dSis if (f != 0) 6574703203dSis return (f); 6584703203dSis } 6594703203dSis 6604703203dSis /* 6614703203dSis * They were the same. Let's move on to the next 6624703203dSis * characters then. 6634703203dSis */ 6644703203dSis i1 += sz1; 6654703203dSis i2 += sz2; 6664703203dSis } 6674703203dSis 6684703203dSis /* 6694703203dSis * We compared until the end of either or both strings. 6704703203dSis * 6714703203dSis * If we reached to or went over the ends for the both, that means 6724703203dSis * they are the same. 6734703203dSis * 6744703203dSis * If we reached only one of the two ends, that means the other string 6754703203dSis * has something which then the fact can be used to determine 6764703203dSis * the return value. 6774703203dSis */ 6784703203dSis if (i1 >= n1) { 6794703203dSis if (i2 >= n2) 6804703203dSis return (0); 6814703203dSis return (-1); 6824703203dSis } 6834703203dSis return (1); 6844703203dSis } 6854703203dSis 6864703203dSis /* 6874703203dSis * The combining_class() function checks on the given bytes and find out 6884703203dSis * the corresponding Unicode combining class value. The return value 0 means 6894703203dSis * it is a Starter. Any illegal UTF-8 character will also be treated as 6904703203dSis * a Starter. 
6914703203dSis */ 6924703203dSis static uchar_t 6934703203dSis combining_class(size_t uv, uchar_t *s, size_t sz) 6944703203dSis { 6954703203dSis uint16_t b1 = 0; 6964703203dSis uint16_t b2 = 0; 6974703203dSis uint16_t b3 = 0; 6984703203dSis uint16_t b4 = 0; 6994703203dSis 7004703203dSis if (sz == 1 || sz > 4) 7014703203dSis return (0); 7024703203dSis 7034703203dSis if (sz == 2) { 7044703203dSis b3 = s[0]; 7054703203dSis b4 = s[1]; 7064703203dSis } else if (sz == 3) { 7074703203dSis b2 = s[0]; 7084703203dSis b3 = s[1]; 7094703203dSis b4 = s[2]; 7104703203dSis } else if (sz == 4) { 7114703203dSis b1 = s[0]; 7124703203dSis b2 = s[1]; 7134703203dSis b3 = s[2]; 7144703203dSis b4 = s[3]; 7154703203dSis } 7164703203dSis 7174703203dSis b1 = u8_common_b1_tbl[uv][b1]; 7184703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 7194703203dSis return (0); 7204703203dSis 7214703203dSis b2 = u8_combining_class_b2_tbl[uv][b1][b2]; 7224703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 7234703203dSis return (0); 7244703203dSis 7254703203dSis b3 = u8_combining_class_b3_tbl[uv][b2][b3]; 7264703203dSis if (b3 == U8_TBL_ELEMENT_NOT_DEF) 7274703203dSis return (0); 7284703203dSis 7294703203dSis return (u8_combining_class_b4_tbl[uv][b3][b4]); 7304703203dSis } 7314703203dSis 7324703203dSis /* 7334703203dSis * The do_decomp() function finds out a matching decomposition if any 7344703203dSis * and return. If there is no match, the input bytes are copied and returned. 7354703203dSis * The function also checks if there is a Hangul, decomposes it if necessary 7364703203dSis * and returns. 7374703203dSis * 7384703203dSis * To save time, a single byte 7-bit ASCII character should be handled by 7394703203dSis * the caller. 7404703203dSis * 7414703203dSis * The function returns the number of bytes returned sans always terminating 7424703203dSis * the null byte. It will also return a state that will tell if there was 7434703203dSis * a Hangul character decomposed which then will be used by the caller. 
7444703203dSis */ 7454703203dSis static size_t 7464703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, 7474703203dSis boolean_t canonical_decomposition, u8_normalization_states_t *state) 7484703203dSis { 7494703203dSis uint16_t b1 = 0; 7504703203dSis uint16_t b2 = 0; 7514703203dSis uint16_t b3 = 0; 7524703203dSis uint16_t b3_tbl; 7534703203dSis uint16_t b3_base; 7544703203dSis uint16_t b4 = 0; 7554703203dSis size_t start_id; 7564703203dSis size_t end_id; 7574703203dSis size_t i; 7584703203dSis uint32_t u1; 7594703203dSis 7604703203dSis if (sz == 2) { 7614703203dSis b3 = u8s[0] = s[0]; 7624703203dSis b4 = u8s[1] = s[1]; 7634703203dSis u8s[2] = '\0'; 7644703203dSis } else if (sz == 3) { 7654703203dSis /* Convert it to a Unicode scalar value. */ 7664703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); 7674703203dSis 7684703203dSis /* 7694703203dSis * If this is a Hangul syllable, we decompose it into 7704703203dSis * a leading consonant, a vowel, and an optional trailing 7714703203dSis * consonant and then return. 
7724703203dSis */ 7734703203dSis if (U8_HANGUL_SYLLABLE(u1)) { 7744703203dSis u1 -= U8_HANGUL_SYL_FIRST; 7754703203dSis 7764703203dSis b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; 7774703203dSis b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) 7784703203dSis / U8_HANGUL_T_COUNT; 7794703203dSis b3 = u1 % U8_HANGUL_T_COUNT; 7804703203dSis 7814703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); 7824703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); 7834703203dSis if (b3) { 7844703203dSis b3 += U8_HANGUL_JAMO_T_FIRST; 7854703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); 7864703203dSis 7874703203dSis u8s[9] = '\0'; 7884703203dSis *state = U8_STATE_HANGUL_LVT; 7894703203dSis return (9); 7904703203dSis } 7914703203dSis 7924703203dSis u8s[6] = '\0'; 7934703203dSis *state = U8_STATE_HANGUL_LV; 7944703203dSis return (6); 7954703203dSis } 7964703203dSis 7974703203dSis b2 = u8s[0] = s[0]; 7984703203dSis b3 = u8s[1] = s[1]; 7994703203dSis b4 = u8s[2] = s[2]; 8004703203dSis u8s[3] = '\0'; 8014703203dSis 8024703203dSis /* 8034703203dSis * If this is a Hangul Jamo, we know there is nothing 8044703203dSis * further that we can decompose. 
8054703203dSis */ 8064703203dSis if (U8_HANGUL_JAMO_L(u1)) { 8074703203dSis *state = U8_STATE_HANGUL_L; 8084703203dSis return (3); 8094703203dSis } 8104703203dSis 8114703203dSis if (U8_HANGUL_JAMO_V(u1)) { 8124703203dSis if (*state == U8_STATE_HANGUL_L) 8134703203dSis *state = U8_STATE_HANGUL_LV; 8144703203dSis else 8154703203dSis *state = U8_STATE_HANGUL_V; 8164703203dSis return (3); 8174703203dSis } 8184703203dSis 8194703203dSis if (U8_HANGUL_JAMO_T(u1)) { 8204703203dSis if (*state == U8_STATE_HANGUL_LV) 8214703203dSis *state = U8_STATE_HANGUL_LVT; 8224703203dSis else 8234703203dSis *state = U8_STATE_HANGUL_T; 8244703203dSis return (3); 8254703203dSis } 8264703203dSis } else if (sz == 4) { 8274703203dSis b1 = u8s[0] = s[0]; 8284703203dSis b2 = u8s[1] = s[1]; 8294703203dSis b3 = u8s[2] = s[2]; 8304703203dSis b4 = u8s[3] = s[3]; 8314703203dSis u8s[4] = '\0'; 8324703203dSis } else { 8334703203dSis /* 8344703203dSis * This is a fallback and should not happen if the function 8354703203dSis * was called properly. 8364703203dSis */ 8374703203dSis u8s[0] = s[0]; 8384703203dSis u8s[1] = '\0'; 8394703203dSis *state = U8_STATE_START; 8404703203dSis return (1); 8414703203dSis } 8424703203dSis 8434703203dSis /* 8444703203dSis * At this point, this rountine does not know what it would get. 8454703203dSis * The caller should sort it out if the state isn't a Hangul one. 8464703203dSis */ 8474703203dSis *state = U8_STATE_START; 8484703203dSis 8494703203dSis /* Try to find matching decomposition mapping byte sequence. 
*/ 8504703203dSis b1 = u8_common_b1_tbl[uv][b1]; 8514703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 8524703203dSis return ((size_t)sz); 8534703203dSis 8544703203dSis b2 = u8_decomp_b2_tbl[uv][b1][b2]; 8554703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 8564703203dSis return ((size_t)sz); 8574703203dSis 8584703203dSis b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; 8594703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 8604703203dSis return ((size_t)sz); 8614703203dSis 8624703203dSis /* 8634703203dSis * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR 8644703203dSis * which is 0x8000, this means we couldn't fit the mappings into 8654703203dSis * the cardinality of a unsigned byte. 8664703203dSis */ 8674703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 8684703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR; 8694703203dSis start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; 8704703203dSis end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 8714703203dSis } else { 8724703203dSis start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; 8734703203dSis end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; 8744703203dSis } 8754703203dSis 8764703203dSis /* This also means there wasn't any matching decomposition. */ 8774703203dSis if (start_id >= end_id) 8784703203dSis return ((size_t)sz); 8794703203dSis 8804703203dSis /* 8814703203dSis * The final table for decomposition mappings has three types of 8824703203dSis * byte sequences depending on whether a mapping is for compatibility 8834703203dSis * decomposition, canonical decomposition, or both like the following: 8844703203dSis * 8854703203dSis * (1) Compatibility decomposition mappings: 8864703203dSis * 8874703203dSis * +---+---+-...-+---+ 8884703203dSis * | B0| B1| ... | Bm| 8894703203dSis * +---+---+-...-+---+ 8904703203dSis * 8914703203dSis * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). 
8924703203dSis * 8934703203dSis * (2) Canonical decomposition mappings: 8944703203dSis * 8954703203dSis * +---+---+---+-...-+---+ 8964703203dSis * | T | b0| b1| ... | bn| 8974703203dSis * +---+---+---+-...-+---+ 8984703203dSis * 8994703203dSis * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). 9004703203dSis * 9014703203dSis * (3) Both mappings: 9024703203dSis * 9034703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+ 9044703203dSis * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| 9054703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+ 9064703203dSis * 9074703203dSis * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement 9084703203dSis * byte, b0 to bn are canonical mapping bytes and B0 to Bm are 9094703203dSis * compatibility mapping bytes. 9104703203dSis * 9114703203dSis * Note that compatibility decomposition means doing recursive 9124703203dSis * decompositions using both compatibility decomposition mappings and 9134703203dSis * canonical decomposition mappings. On the other hand, canonical 9144703203dSis * decomposition means doing recursive decompositions using only 9154703203dSis * canonical decomposition mappings. Since the table we have has gone 9164703203dSis * through the recursions already, we do not need to do so during 9174703203dSis * runtime, i.e., the table has been completely flattened out 9184703203dSis * already. 9194703203dSis */ 9204703203dSis 9214703203dSis b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; 9224703203dSis 9234703203dSis /* Get the type, T, of the byte sequence. */ 9244703203dSis b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; 9254703203dSis 9264703203dSis /* 9274703203dSis * If necessary, adjust start_id, end_id, or both. Note that if 9284703203dSis * this is compatibility decomposition mapping, there is no 9294703203dSis * adjustment. 9304703203dSis */ 9314703203dSis if (canonical_decomposition) { 9324703203dSis /* Is the mapping only for compatibility decomposition? 
*/ 9334703203dSis if (b1 < U8_DECOMP_BOTH) 9344703203dSis return ((size_t)sz); 9354703203dSis 9364703203dSis start_id++; 9374703203dSis 9384703203dSis if (b1 == U8_DECOMP_BOTH) { 9394703203dSis end_id = start_id + 9404703203dSis u8_decomp_final_tbl[uv][b3_base + start_id]; 9414703203dSis start_id++; 9424703203dSis } 9434703203dSis } else { 9444703203dSis /* 9454703203dSis * Unless this is a compatibility decomposition mapping, 9464703203dSis * we adjust the start_id. 9474703203dSis */ 9484703203dSis if (b1 == U8_DECOMP_BOTH) { 9494703203dSis start_id++; 9504703203dSis start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; 9514703203dSis } else if (b1 == U8_DECOMP_CANONICAL) { 9524703203dSis start_id++; 9534703203dSis } 9544703203dSis } 9554703203dSis 9564703203dSis for (i = 0; start_id < end_id; start_id++) 9574703203dSis u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; 9584703203dSis u8s[i] = '\0'; 9594703203dSis 9604703203dSis return (i); 9614703203dSis } 9624703203dSis 9634703203dSis /* 9644703203dSis * The find_composition_start() function uses the character bytes given and 9654703203dSis * find out the matching composition mappings if any and return the address 9664703203dSis * to the composition mappings as explained in the do_composition(). 
9674703203dSis */ 9684703203dSis static uchar_t * 9694703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz) 9704703203dSis { 9714703203dSis uint16_t b1 = 0; 9724703203dSis uint16_t b2 = 0; 9734703203dSis uint16_t b3 = 0; 9744703203dSis uint16_t b3_tbl; 9754703203dSis uint16_t b3_base; 9764703203dSis uint16_t b4 = 0; 9774703203dSis size_t start_id; 9784703203dSis size_t end_id; 9794703203dSis 9804703203dSis if (sz == 1) { 9814703203dSis b4 = s[0]; 9824703203dSis } else if (sz == 2) { 9834703203dSis b3 = s[0]; 9844703203dSis b4 = s[1]; 9854703203dSis } else if (sz == 3) { 9864703203dSis b2 = s[0]; 9874703203dSis b3 = s[1]; 9884703203dSis b4 = s[2]; 9894703203dSis } else if (sz == 4) { 9904703203dSis b1 = s[0]; 9914703203dSis b2 = s[1]; 9924703203dSis b3 = s[2]; 9934703203dSis b4 = s[3]; 9944703203dSis } else { 9954703203dSis /* 9964703203dSis * This is a fallback and should not happen if the function 9974703203dSis * was called properly. 9984703203dSis */ 9994703203dSis return (NULL); 10004703203dSis } 10014703203dSis 10024703203dSis b1 = u8_composition_b1_tbl[uv][b1]; 10034703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 10044703203dSis return (NULL); 10054703203dSis 10064703203dSis b2 = u8_composition_b2_tbl[uv][b1][b2]; 10074703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 10084703203dSis return (NULL); 10094703203dSis 10104703203dSis b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; 10114703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 10124703203dSis return (NULL); 10134703203dSis 10144703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 10154703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR; 10164703203dSis start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; 10174703203dSis end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 10184703203dSis } else { 10194703203dSis start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; 10204703203dSis end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; 10214703203dSis } 10224703203dSis 10234703203dSis if (start_id >= 
end_id) 10244703203dSis return (NULL); 10254703203dSis 10264703203dSis b3_base = u8_composition_b3_tbl[uv][b2][b3].base; 10274703203dSis 10284703203dSis return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); 10294703203dSis } 10304703203dSis 10314703203dSis /* 10324703203dSis * The blocked() function checks on the combining class values of previous 10334703203dSis * characters in this sequence and return whether it is blocked or not. 10344703203dSis */ 10354703203dSis static boolean_t 10364703203dSis blocked(uchar_t *comb_class, size_t last) 10374703203dSis { 10384703203dSis uchar_t my_comb_class; 10394703203dSis size_t i; 10404703203dSis 10414703203dSis my_comb_class = comb_class[last]; 10424703203dSis for (i = 1; i < last; i++) 10434703203dSis if (comb_class[i] >= my_comb_class || 10444703203dSis comb_class[i] == U8_COMBINING_CLASS_STARTER) 10454703203dSis return (B_TRUE); 10464703203dSis 10474703203dSis return (B_FALSE); 10484703203dSis } 10494703203dSis 10504703203dSis /* 10514703203dSis * The do_composition() reads the character string pointed by 's' and 10524703203dSis * do necessary canonical composition and then copy over the result back to 10534703203dSis * the 's'. 10544703203dSis * 10554703203dSis * The input argument 's' cannot contain more than 32 characters. 
10564703203dSis */ 10574703203dSis static size_t 10584703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, 10594703203dSis uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) 10604703203dSis { 10614703203dSis uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; 10624703203dSis uchar_t tc[U8_MB_CUR_MAX]; 10634703203dSis uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; 10644703203dSis size_t saved_marks_count; 10654703203dSis uchar_t *p; 10664703203dSis uchar_t *saved_p; 10674703203dSis uchar_t *q; 10684703203dSis size_t i; 10694703203dSis size_t saved_i; 10704703203dSis size_t j; 10714703203dSis size_t k; 10724703203dSis size_t l; 10734703203dSis size_t C; 10744703203dSis size_t saved_l; 10754703203dSis size_t size; 10764703203dSis uint32_t u1; 10774703203dSis uint32_t u2; 10784703203dSis boolean_t match_not_found = B_TRUE; 10794703203dSis 10804703203dSis /* 10814703203dSis * This should never happen unless the callers are doing some strange 10824703203dSis * and unexpected things. 10834703203dSis * 10844703203dSis * The "last" is the index pointing to the last character not last + 1. 10854703203dSis */ 10864703203dSis if (last >= U8_MAX_CHARS_A_SEQ) 10874703203dSis last = U8_UPPER_LIMIT_IN_A_SEQ; 10884703203dSis 10894703203dSis for (i = l = 0; i <= last; i++) { 10904703203dSis /* 10914703203dSis * The last or any non-Starters at the beginning, we don't 10924703203dSis * have any chance to do composition and so we just copy them 10934703203dSis * to the temporary buffer. 10944703203dSis */ 10954703203dSis if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { 10964703203dSis SAVE_THE_CHAR: 10974703203dSis p = s + start[i]; 10984703203dSis size = disp[i]; 10994703203dSis for (k = 0; k < size; k++) 11004703203dSis t[l++] = *p++; 11014703203dSis continue; 11024703203dSis } 11034703203dSis 11044703203dSis /* 11054703203dSis * If this could be a start of Hangul Jamos, then, we try to 11064703203dSis * conjoin them. 
11074703203dSis */ 11084703203dSis if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { 11094703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], 11104703203dSis s[start[i] + 1], s[start[i] + 2]); 11114703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], 11124703203dSis s[start[i] + 4], s[start[i] + 5]); 11134703203dSis 11144703203dSis if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { 11154703203dSis u1 -= U8_HANGUL_JAMO_L_FIRST; 11164703203dSis u2 -= U8_HANGUL_JAMO_V_FIRST; 11174703203dSis u1 = U8_HANGUL_SYL_FIRST + 11184703203dSis (u1 * U8_HANGUL_V_COUNT + u2) * 11194703203dSis U8_HANGUL_T_COUNT; 11204703203dSis 11214703203dSis i += 2; 11224703203dSis if (i <= last) { 11234703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, 11244703203dSis s[start[i]], s[start[i] + 1], 11254703203dSis s[start[i] + 2]); 11264703203dSis 11274703203dSis if (U8_HANGUL_JAMO_T(u2)) { 11284703203dSis u1 += u2 - 11294703203dSis U8_HANGUL_JAMO_T_FIRST; 11304703203dSis i++; 11314703203dSis } 11324703203dSis } 11334703203dSis 11344703203dSis U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); 11354703203dSis i--; 11364703203dSis l += 3; 11374703203dSis continue; 11384703203dSis } 11394703203dSis } 11404703203dSis 11414703203dSis /* 11424703203dSis * Let's then find out if this Starter has composition 11434703203dSis * mapping. 11444703203dSis */ 11454703203dSis p = find_composition_start(uv, s + start[i], disp[i]); 11464703203dSis if (p == NULL) 11474703203dSis goto SAVE_THE_CHAR; 11484703203dSis 11494703203dSis /* 11504703203dSis * We have a Starter with composition mapping and the next 11514703203dSis * character is a non-Starter. Let's try to find out if 11524703203dSis * we can do composition. 
11534703203dSis */ 11544703203dSis 11554703203dSis saved_p = p; 11564703203dSis saved_i = i; 11574703203dSis saved_l = l; 11584703203dSis saved_marks_count = 0; 11594703203dSis 11604703203dSis TRY_THE_NEXT_MARK: 11614703203dSis q = s + start[++i]; 11624703203dSis size = disp[i]; 11634703203dSis 11644703203dSis /* 11654703203dSis * The next for() loop compares the non-Starter pointed by 11664703203dSis * 'q' with the possible (joinable) characters pointed by 'p'. 11674703203dSis * 11684703203dSis * The composition final table entry pointed by the 'p' 11694703203dSis * looks like the following: 11704703203dSis * 11714703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 11724703203dSis * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | 11734703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 11744703203dSis * 11754703203dSis * where C is the count byte indicating the number of 11764703203dSis * mapping pairs where each pair would be look like 11774703203dSis * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second 11784703203dSis * character of a canonical decomposition and the B0-Bm are 11794703203dSis * the bytes of a matching composite character. The F is 11804703203dSis * a filler byte after each character as the separator. 11814703203dSis */ 11824703203dSis 11834703203dSis match_not_found = B_TRUE; 11844703203dSis 11854703203dSis for (C = *p++; C > 0; C--) { 11864703203dSis for (k = 0; k < size; p++, k++) 11874703203dSis if (*p != q[k]) 11884703203dSis break; 11894703203dSis 11904703203dSis /* Have we found it? */ 11914703203dSis if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { 11924703203dSis match_not_found = B_FALSE; 11934703203dSis 11944703203dSis l = saved_l; 11954703203dSis 11964703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 11974703203dSis t[l++] = *p; 11984703203dSis 11994703203dSis break; 12004703203dSis } 12014703203dSis 12024703203dSis /* We didn't find; skip to the next pair. 
*/ 12034703203dSis if (*p != U8_TBL_ELEMENT_FILLER) 12044703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 12054703203dSis ; 12064703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 12074703203dSis ; 12084703203dSis p++; 12094703203dSis } 12104703203dSis 12114703203dSis /* 12124703203dSis * If there was no match, we will need to save the combining 12134703203dSis * mark for later appending. After that, if the next one 12144703203dSis * is a non-Starter and not blocked, then, we try once 12154703203dSis * again to do composition with the next non-Starter. 12164703203dSis * 12174703203dSis * If there was no match and this was a Starter, then, 12184703203dSis * this is a new start. 12194703203dSis * 12204703203dSis * If there was a match and a composition done and we have 12214703203dSis * more to check on, then, we retrieve a new composition final 12224703203dSis * table entry for the composite and then try to do the 12234703203dSis * composition again. 12244703203dSis */ 12254703203dSis 12264703203dSis if (match_not_found) { 12274703203dSis if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { 12284703203dSis i--; 12294703203dSis goto SAVE_THE_CHAR; 12304703203dSis } 12314703203dSis 12324703203dSis saved_marks[saved_marks_count++] = i; 12334703203dSis } 12344703203dSis 12354703203dSis if (saved_l == l) { 12364703203dSis while (i < last) { 12374703203dSis if (blocked(comb_class, i + 1)) 12384703203dSis saved_marks[saved_marks_count++] = ++i; 12394703203dSis else 12404703203dSis break; 12414703203dSis } 12424703203dSis if (i < last) { 12434703203dSis p = saved_p; 12444703203dSis goto TRY_THE_NEXT_MARK; 12454703203dSis } 12464703203dSis } else if (i < last) { 12474703203dSis p = find_composition_start(uv, t + saved_l, 12484703203dSis l - saved_l); 12494703203dSis if (p != NULL) { 12504703203dSis saved_p = p; 12514703203dSis goto TRY_THE_NEXT_MARK; 12524703203dSis } 12534703203dSis } 12544703203dSis 12554703203dSis /* 12564703203dSis * There is no more composition possible. 
12574703203dSis * 12584703203dSis * If there was no composition what so ever then we copy 12594703203dSis * over the original Starter and then append any non-Starters 12604703203dSis * remaining at the target string sequentially after that. 12614703203dSis */ 12624703203dSis 12634703203dSis if (saved_l == l) { 12644703203dSis p = s + start[saved_i]; 12654703203dSis size = disp[saved_i]; 12664703203dSis for (j = 0; j < size; j++) 12674703203dSis t[l++] = *p++; 12684703203dSis } 12694703203dSis 12704703203dSis for (k = 0; k < saved_marks_count; k++) { 12714703203dSis p = s + start[saved_marks[k]]; 12724703203dSis size = disp[saved_marks[k]]; 12734703203dSis for (j = 0; j < size; j++) 12744703203dSis t[l++] = *p++; 12754703203dSis } 12764703203dSis } 12774703203dSis 12784703203dSis /* 12794703203dSis * If the last character is a Starter and if we have a character 12804703203dSis * (possibly another Starter) that can be turned into a composite, 12814703203dSis * we do so and we do so until there is no more of composition 12824703203dSis * possible. 
12834703203dSis */ 12844703203dSis if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { 12854703203dSis p = *os; 12864703203dSis saved_l = l - disp[last]; 12874703203dSis 12884703203dSis while (p < oslast) { 12894703203dSis size = u8_number_of_bytes[*p]; 12904703203dSis if (size <= 1 || (p + size) > oslast) 12914703203dSis break; 12924703203dSis 12934703203dSis saved_p = p; 12944703203dSis 12954703203dSis for (i = 0; i < size; i++) 12964703203dSis tc[i] = *p++; 12974703203dSis 12984703203dSis q = find_composition_start(uv, t + saved_l, 12994703203dSis l - saved_l); 13004703203dSis if (q == NULL) { 13014703203dSis p = saved_p; 13024703203dSis break; 13034703203dSis } 13044703203dSis 13054703203dSis match_not_found = B_TRUE; 13064703203dSis 13074703203dSis for (C = *q++; C > 0; C--) { 13084703203dSis for (k = 0; k < size; q++, k++) 13094703203dSis if (*q != tc[k]) 13104703203dSis break; 13114703203dSis 13124703203dSis if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { 13134703203dSis match_not_found = B_FALSE; 13144703203dSis 13154703203dSis l = saved_l; 13164703203dSis 13174703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) { 13184703203dSis /* 13194703203dSis * This is practically 13204703203dSis * impossible but we don't 13214703203dSis * want to take any chances. 
13224703203dSis */ 13234703203dSis if (l >= 13244703203dSis U8_STREAM_SAFE_TEXT_MAX) { 13254703203dSis p = saved_p; 13264703203dSis goto SAFE_RETURN; 13274703203dSis } 13284703203dSis t[l++] = *q; 13294703203dSis } 13304703203dSis 13314703203dSis break; 13324703203dSis } 13334703203dSis 13344703203dSis if (*q != U8_TBL_ELEMENT_FILLER) 13354703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) 13364703203dSis ; 13374703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) 13384703203dSis ; 13394703203dSis q++; 13404703203dSis } 13414703203dSis 13424703203dSis if (match_not_found) { 13434703203dSis p = saved_p; 13444703203dSis break; 13454703203dSis } 13464703203dSis } 13474703203dSis SAFE_RETURN: 13484703203dSis *os = p; 13494703203dSis } 13504703203dSis 13514703203dSis /* 13524703203dSis * Now we copy over the temporary string to the target string. 13534703203dSis * Since composition always reduces the number of characters or 13544703203dSis * the number of characters stay, we don't need to worry about 13554703203dSis * the buffer overflow here. 13564703203dSis */ 13574703203dSis for (i = 0; i < l; i++) 13584703203dSis s[i] = t[i]; 13594703203dSis s[l] = '\0'; 13604703203dSis 13614703203dSis return (l); 13624703203dSis } 13634703203dSis 13644703203dSis /* 13654703203dSis * The collect_a_seq() function checks on the given string s, collect 13664703203dSis * a sequence of characters at u8s, and return the sequence. While it collects 13674703203dSis * a sequence, it also applies case conversion, canonical or compatibility 13684703203dSis * decomposition, canonical decomposition, or some or all of them and 13694703203dSis * in that order. 13704703203dSis * 13714703203dSis * The collected sequence cannot be bigger than 32 characters since if 13724703203dSis * it is having more than 31 characters, the sequence will be terminated 13734703203dSis * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into 13744703203dSis * a Stream-Safe Text. 
The collected sequence is always terminated with 13754703203dSis * a null byte and the return value is the byte length of the sequence 13764703203dSis * including 0. The return value does not include the terminating 13774703203dSis * null byte. 13784703203dSis */ 13794703203dSis static size_t 13804703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, 13814703203dSis boolean_t is_it_toupper, 13824703203dSis boolean_t is_it_tolower, 13834703203dSis boolean_t canonical_decomposition, 13844703203dSis boolean_t compatibility_decomposition, 13854703203dSis boolean_t canonical_composition, 1386*85bb5f1dSis int *errnum, u8_normalization_states_t *state) 13874703203dSis { 13884703203dSis uchar_t *s; 13894703203dSis int sz; 13904703203dSis int saved_sz; 13914703203dSis size_t i; 13924703203dSis size_t j; 13934703203dSis size_t k; 13944703203dSis size_t l; 13954703203dSis uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; 13964703203dSis uchar_t disp[U8_MAX_CHARS_A_SEQ]; 13974703203dSis uchar_t start[U8_MAX_CHARS_A_SEQ]; 13984703203dSis uchar_t u8t[U8_MB_CUR_MAX]; 13994703203dSis uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; 14004703203dSis uchar_t tc; 14014703203dSis size_t last; 14024703203dSis size_t saved_last; 14034703203dSis uint32_t u1; 14044703203dSis 14054703203dSis /* 14064703203dSis * Save the source string pointer which we will return a changed 14074703203dSis * pointer if we do processing. 14084703203dSis */ 14094703203dSis s = *source; 14104703203dSis 14114703203dSis /* 14124703203dSis * The following is a fallback for just in case callers are not 14134703203dSis * checking the string boundaries before the calling. 14144703203dSis */ 14154703203dSis if (s >= slast) { 14164703203dSis u8s[0] = '\0'; 14174703203dSis 14184703203dSis return (0); 14194703203dSis } 14204703203dSis 14214703203dSis /* 14224703203dSis * As the first thing, let's collect a character and do case 14234703203dSis * conversion if necessary. 
14244703203dSis */ 14254703203dSis 14264703203dSis sz = u8_number_of_bytes[*s]; 14274703203dSis 14284703203dSis if (sz < 0) { 1429*85bb5f1dSis *errnum = EILSEQ; 14304703203dSis 14314703203dSis u8s[0] = *s++; 14324703203dSis u8s[1] = '\0'; 14334703203dSis 14344703203dSis *source = s; 14354703203dSis 14364703203dSis return (1); 14374703203dSis } 14384703203dSis 14394703203dSis if (sz == 1) { 14404703203dSis if (is_it_toupper) 14414703203dSis u8s[0] = U8_ASCII_TOUPPER(*s); 14424703203dSis else if (is_it_tolower) 14434703203dSis u8s[0] = U8_ASCII_TOLOWER(*s); 14444703203dSis else 14454703203dSis u8s[0] = *s; 14464703203dSis s++; 14474703203dSis u8s[1] = '\0'; 14484703203dSis } else if ((s + sz) > slast) { 1449*85bb5f1dSis *errnum = EINVAL; 14504703203dSis 14514703203dSis for (i = 0; s < slast; ) 14524703203dSis u8s[i++] = *s++; 14534703203dSis u8s[i] = '\0'; 14544703203dSis 14554703203dSis *source = s; 14564703203dSis 14574703203dSis return (i); 14584703203dSis } else { 14594703203dSis if (is_it_toupper || is_it_tolower) { 14604703203dSis i = do_case_conv(uv, u8s, s, sz, is_it_toupper); 14614703203dSis s += sz; 14624703203dSis sz = i; 14634703203dSis } else { 14644703203dSis for (i = 0; i < sz; ) 14654703203dSis u8s[i++] = *s++; 14664703203dSis u8s[i] = '\0'; 14674703203dSis } 14684703203dSis } 14694703203dSis 14704703203dSis /* 14714703203dSis * And then canonical/compatibility decomposition followed by 14724703203dSis * an optional canonical composition. Please be noted that 14734703203dSis * canonical composition is done only when a decomposition is 14744703203dSis * done. 
14754703203dSis */ 14764703203dSis if (canonical_decomposition || compatibility_decomposition) { 14774703203dSis if (sz == 1) { 14784703203dSis *state = U8_STATE_START; 14794703203dSis 14804703203dSis saved_sz = 1; 14814703203dSis 14824703203dSis comb_class[0] = 0; 14834703203dSis start[0] = 0; 14844703203dSis disp[0] = 1; 14854703203dSis 14864703203dSis last = 1; 14874703203dSis } else { 14884703203dSis saved_sz = do_decomp(uv, u8s, u8s, sz, 14894703203dSis canonical_decomposition, state); 14904703203dSis 14914703203dSis last = 0; 14924703203dSis 14934703203dSis for (i = 0; i < saved_sz; ) { 14944703203dSis sz = u8_number_of_bytes[u8s[i]]; 14954703203dSis 14964703203dSis comb_class[last] = combining_class(uv, 14974703203dSis u8s + i, sz); 14984703203dSis start[last] = i; 14994703203dSis disp[last] = sz; 15004703203dSis 15014703203dSis last++; 15024703203dSis i += sz; 15034703203dSis } 15044703203dSis 15054703203dSis /* 15064703203dSis * Decomposition yields various Hangul related 15074703203dSis * states but not on combining marks. We need to 15084703203dSis * find out at here by checking on the last 15094703203dSis * character. 15104703203dSis */ 15114703203dSis if (*state == U8_STATE_START) { 15124703203dSis if (comb_class[last - 1]) 15134703203dSis *state = U8_STATE_COMBINING_MARK; 15144703203dSis } 15154703203dSis } 15164703203dSis 15174703203dSis saved_last = last; 15184703203dSis 15194703203dSis while (s < slast) { 15204703203dSis sz = u8_number_of_bytes[*s]; 15214703203dSis 15224703203dSis /* 15234703203dSis * If this is an illegal character, an incomplete 15244703203dSis * character, or an 7-bit ASCII Starter character, 15254703203dSis * then we have collected a sequence; break and let 15264703203dSis * the next call deal with the two cases. 15274703203dSis * 15284703203dSis * Note that this is okay only if you are using this 15294703203dSis * function with a fixed length string, not on 15304703203dSis * a buffer with multiple calls of one chunk at a time. 
15314703203dSis */ 15324703203dSis if (sz <= 1) { 15334703203dSis break; 15344703203dSis } else if ((s + sz) > slast) { 15354703203dSis break; 15364703203dSis } else { 15374703203dSis /* 15384703203dSis * If the previous character was a Hangul Jamo 15394703203dSis * and this character is a Hangul Jamo that 15404703203dSis * can be conjoined, we collect the Jamo. 15414703203dSis */ 15424703203dSis if (*s == U8_HANGUL_JAMO_1ST_BYTE) { 15434703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, 15444703203dSis *s, *(s + 1), *(s + 2)); 15454703203dSis 15464703203dSis if (U8_HANGUL_COMPOSABLE_L_V(*state, 15474703203dSis u1)) { 15484703203dSis i = 0; 15494703203dSis *state = U8_STATE_HANGUL_LV; 15504703203dSis goto COLLECT_A_HANGUL; 15514703203dSis } 15524703203dSis 15534703203dSis if (U8_HANGUL_COMPOSABLE_LV_T(*state, 15544703203dSis u1)) { 15554703203dSis i = 0; 15564703203dSis *state = U8_STATE_HANGUL_LVT; 15574703203dSis goto COLLECT_A_HANGUL; 15584703203dSis } 15594703203dSis } 15604703203dSis 15614703203dSis /* 15624703203dSis * Regardless of whatever it was, if this is 15634703203dSis * a Starter, we don't collect the character 15644703203dSis * since that's a new start and we will deal 15654703203dSis * with it at the next time. 15664703203dSis */ 15674703203dSis i = combining_class(uv, s, sz); 15684703203dSis if (i == U8_COMBINING_CLASS_STARTER) 15694703203dSis break; 15704703203dSis 15714703203dSis /* 15724703203dSis * We know the current character is a combining 15734703203dSis * mark. If the previous character wasn't 15744703203dSis * a Starter (not Hangul) or a combining mark, 15754703203dSis * then, we don't collect this combining mark. 
15764703203dSis */ 15774703203dSis if (*state != U8_STATE_START && 15784703203dSis *state != U8_STATE_COMBINING_MARK) 15794703203dSis break; 15804703203dSis 15814703203dSis *state = U8_STATE_COMBINING_MARK; 15824703203dSis COLLECT_A_HANGUL: 15834703203dSis /* 15844703203dSis * If we collected a Starter and combining 15854703203dSis * marks up to 30, i.e., total 31 characters, 15864703203dSis * then, we terminate this degenerately long 15874703203dSis * combining sequence with a U+034F COMBINING 15884703203dSis * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in 15894703203dSis * UTF-8 and turn this into a Stream-Safe 15904703203dSis * Text. This will be extremely rare but 15914703203dSis * possible. 15924703203dSis * 15934703203dSis * The following will also guarantee that 15944703203dSis * we are not writing more than 32 characters 15954703203dSis * plus a NULL at u8s[]. 15964703203dSis */ 15974703203dSis if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { 15984703203dSis TURN_STREAM_SAFE: 15994703203dSis *state = U8_STATE_START; 16004703203dSis comb_class[last] = 0; 16014703203dSis start[last] = saved_sz; 16024703203dSis disp[last] = 2; 16034703203dSis last++; 16044703203dSis 16054703203dSis u8s[saved_sz++] = 0xCD; 16064703203dSis u8s[saved_sz++] = 0x8F; 16074703203dSis 16084703203dSis break; 16094703203dSis } 16104703203dSis 16114703203dSis /* 16124703203dSis * Some combining marks also do decompose into 16134703203dSis * another combining mark or marks. 
16144703203dSis */ 16154703203dSis if (*state == U8_STATE_COMBINING_MARK) { 16164703203dSis k = last; 16174703203dSis l = sz; 16184703203dSis i = do_decomp(uv, uts, s, sz, 16194703203dSis canonical_decomposition, state); 16204703203dSis for (j = 0; j < i; ) { 16214703203dSis sz = u8_number_of_bytes[uts[j]]; 16224703203dSis 16234703203dSis comb_class[last] = 16244703203dSis combining_class(uv, 16254703203dSis uts + j, sz); 16264703203dSis start[last] = saved_sz + j; 16274703203dSis disp[last] = sz; 16284703203dSis 16294703203dSis last++; 16304703203dSis if (last >= 16314703203dSis U8_UPPER_LIMIT_IN_A_SEQ) { 16324703203dSis last = k; 16334703203dSis goto TURN_STREAM_SAFE; 16344703203dSis } 16354703203dSis j += sz; 16364703203dSis } 16374703203dSis 16384703203dSis *state = U8_STATE_COMBINING_MARK; 16394703203dSis sz = i; 16404703203dSis s += l; 16414703203dSis 16424703203dSis for (i = 0; i < sz; i++) 16434703203dSis u8s[saved_sz++] = uts[i]; 16444703203dSis } else { 16454703203dSis comb_class[last] = i; 16464703203dSis start[last] = saved_sz; 16474703203dSis disp[last] = sz; 16484703203dSis last++; 16494703203dSis 16504703203dSis for (i = 0; i < sz; i++) 16514703203dSis u8s[saved_sz++] = *s++; 16524703203dSis } 16534703203dSis 16544703203dSis /* 16554703203dSis * If this is U+0345 COMBINING GREEK 16564703203dSis * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., 16574703203dSis * iota subscript, and need to be converted to 16584703203dSis * uppercase letter, convert it to U+0399 GREEK 16594703203dSis * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), 16604703203dSis * i.e., convert to capital adscript form as 16614703203dSis * specified in the Unicode standard. 16624703203dSis * 16634703203dSis * This is the only special case of (ambiguous) 16644703203dSis * case conversion at combining marks and 16654703203dSis * probably the standard will never have 16664703203dSis * anything similar like this in future. 
16674703203dSis */ 16684703203dSis if (is_it_toupper && sz >= 2 && 16694703203dSis u8s[saved_sz - 2] == 0xCD && 16704703203dSis u8s[saved_sz - 1] == 0x85) { 16714703203dSis u8s[saved_sz - 2] = 0xCE; 16724703203dSis u8s[saved_sz - 1] = 0x99; 16734703203dSis } 16744703203dSis } 16754703203dSis } 16764703203dSis 16774703203dSis /* 16784703203dSis * Let's try to ensure a canonical ordering for the collected 16794703203dSis * combining marks. We do this only if we have collected 16804703203dSis * at least one more non-Starter. (The decomposition mapping 16814703203dSis * data tables have fully (and recursively) expanded and 16824703203dSis * canonically ordered decompositions.) 16834703203dSis * 16844703203dSis * The U8_SWAP_COMB_MARKS() convenience macro has some 16854703203dSis * assumptions and we are meeting the assumptions. 16864703203dSis */ 16874703203dSis last--; 16884703203dSis if (last >= saved_last) { 16894703203dSis for (i = 0; i < last; i++) 16904703203dSis for (j = last; j > i; j--) 16914703203dSis if (comb_class[j] && 16924703203dSis comb_class[j - 1] > comb_class[j]) { 16934703203dSis U8_SWAP_COMB_MARKS(j - 1, j); 16944703203dSis } 16954703203dSis } 16964703203dSis 16974703203dSis *source = s; 16984703203dSis 16994703203dSis if (! canonical_composition) { 17004703203dSis u8s[saved_sz] = '\0'; 17014703203dSis return (saved_sz); 17024703203dSis } 17034703203dSis 17044703203dSis /* 17054703203dSis * Now do the canonical composition. Note that we do this 17064703203dSis * only after a canonical or compatibility decomposition to 17074703203dSis * finish up NFC or NFKC. 
17084703203dSis */ 17094703203dSis sz = do_composition(uv, u8s, comb_class, start, disp, last, 17104703203dSis &s, slast); 17114703203dSis } 17124703203dSis 17134703203dSis *source = s; 17144703203dSis 17154703203dSis return ((size_t)sz); 17164703203dSis } 17174703203dSis 17184703203dSis /* 17194703203dSis * The do_norm_compare() function does string comparion based on Unicode 17204703203dSis * simple case mappings and Unicode Normalization definitions. 17214703203dSis * 17224703203dSis * It does so by collecting a sequence of character at a time and comparing 17234703203dSis * the collected sequences from the strings. 17244703203dSis * 17254703203dSis * The meanings on the return values are the same as the usual strcmp(). 17264703203dSis */ 17274703203dSis static int 17284703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, 1729*85bb5f1dSis int flag, int *errnum) 17304703203dSis { 17314703203dSis int result; 17324703203dSis size_t sz1; 17334703203dSis size_t sz2; 17344703203dSis uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; 17354703203dSis uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; 17364703203dSis uchar_t *s1last; 17374703203dSis uchar_t *s2last; 17384703203dSis boolean_t is_it_toupper; 17394703203dSis boolean_t is_it_tolower; 17404703203dSis boolean_t canonical_decomposition; 17414703203dSis boolean_t compatibility_decomposition; 17424703203dSis boolean_t canonical_composition; 17434703203dSis u8_normalization_states_t state; 17444703203dSis 17454703203dSis s1last = s1 + n1; 17464703203dSis s2last = s2 + n2; 17474703203dSis 17484703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 17494703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 17504703203dSis canonical_decomposition = flag & U8_CANON_DECOMP; 17514703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP; 17524703203dSis canonical_composition = flag & U8_CANON_COMP; 17534703203dSis 17544703203dSis while (s1 < s1last && s2 < s2last) { 17554703203dSis /* 
17564703203dSis * If the current character is a 7-bit ASCII and the last 17574703203dSis * character, or, if the current character and the next 17584703203dSis * character are both some 7-bit ASCII characters then 17594703203dSis * we treat the current character as a sequence. 17604703203dSis * 17614703203dSis * In any other cases, we need to call collect_a_seq(). 17624703203dSis */ 17634703203dSis 17644703203dSis if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || 17654703203dSis ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { 17664703203dSis if (is_it_toupper) 17674703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1); 17684703203dSis else if (is_it_tolower) 17694703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1); 17704703203dSis else 17714703203dSis u8s1[0] = *s1; 17724703203dSis u8s1[1] = '\0'; 17734703203dSis sz1 = 1; 17744703203dSis s1++; 17754703203dSis } else { 17764703203dSis state = U8_STATE_START; 17774703203dSis sz1 = collect_a_seq(uv, u8s1, &s1, s1last, 17784703203dSis is_it_toupper, is_it_tolower, 17794703203dSis canonical_decomposition, 17804703203dSis compatibility_decomposition, 1781*85bb5f1dSis canonical_composition, errnum, &state); 17824703203dSis } 17834703203dSis 17844703203dSis if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || 17854703203dSis ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { 17864703203dSis if (is_it_toupper) 17874703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2); 17884703203dSis else if (is_it_tolower) 17894703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2); 17904703203dSis else 17914703203dSis u8s2[0] = *s2; 17924703203dSis u8s2[1] = '\0'; 17934703203dSis sz2 = 1; 17944703203dSis s2++; 17954703203dSis } else { 17964703203dSis state = U8_STATE_START; 17974703203dSis sz2 = collect_a_seq(uv, u8s2, &s2, s2last, 17984703203dSis is_it_toupper, is_it_tolower, 17994703203dSis canonical_decomposition, 18004703203dSis compatibility_decomposition, 1801*85bb5f1dSis canonical_composition, errnum, &state); 18024703203dSis } 18034703203dSis 18044703203dSis /* 18054703203dSis * Now 
compare the two characters. If they are the same, 18064703203dSis * we move on to the next character sequences. 18074703203dSis */ 18084703203dSis if (sz1 == 1 && sz2 == 1) { 18094703203dSis if (*u8s1 > *u8s2) 18104703203dSis return (1); 18114703203dSis if (*u8s1 < *u8s2) 18124703203dSis return (-1); 18134703203dSis } else { 18144703203dSis result = strcmp((const char *)u8s1, (const char *)u8s2); 18154703203dSis if (result != 0) 18164703203dSis return (result); 18174703203dSis } 18184703203dSis } 18194703203dSis 18204703203dSis /* 18214703203dSis * We compared until the end of either or both strings. 18224703203dSis * 18234703203dSis * If we reached to or went over the ends for the both, that means 18244703203dSis * they are the same. 18254703203dSis * 18264703203dSis * If we reached only one end, that means the other string has 18274703203dSis * something which then can be used to determine the return value. 18284703203dSis */ 18294703203dSis if (s1 >= s1last) { 18304703203dSis if (s2 >= s2last) 18314703203dSis return (0); 18324703203dSis return (-1); 18334703203dSis } 18344703203dSis return (1); 18354703203dSis } 18364703203dSis 18374703203dSis /* 18384703203dSis * The u8_strcmp() function compares two UTF-8 strings quite similar to 18394703203dSis * the strcmp(). For the comparison, however, Unicode Normalization specific 18404703203dSis * equivalency and Unicode simple case conversion mappings based equivalency 18414703203dSis * can be requested and checked against. 18424703203dSis */ 18434703203dSis int 18444703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, 1845*85bb5f1dSis int *errnum) 18464703203dSis { 18474703203dSis int f; 18484703203dSis size_t n1; 18494703203dSis size_t n2; 18504703203dSis 1851*85bb5f1dSis *errnum = 0; 18524703203dSis 18534703203dSis /* 18544703203dSis * Check on the requested Unicode version, case conversion, and 18554703203dSis * normalization flag values. 
18564703203dSis */ 18574703203dSis 18584703203dSis if (uv > U8_UNICODE_LATEST) { 1859*85bb5f1dSis *errnum = ERANGE; 18604703203dSis uv = U8_UNICODE_LATEST; 18614703203dSis } 18624703203dSis 18634703203dSis if (flag == 0) { 18644703203dSis flag = U8_STRCMP_CS; 18654703203dSis } else { 18664703203dSis f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | 18674703203dSis U8_STRCMP_CI_LOWER); 18684703203dSis if (f == 0) { 18694703203dSis flag |= U8_STRCMP_CS; 18704703203dSis } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && 18714703203dSis f != U8_STRCMP_CI_LOWER) { 1872*85bb5f1dSis *errnum = EBADF; 18734703203dSis flag = U8_STRCMP_CS; 18744703203dSis } 18754703203dSis 18764703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 18774703203dSis if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && 18784703203dSis f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { 1879*85bb5f1dSis *errnum = EBADF; 18804703203dSis flag = U8_STRCMP_CS; 18814703203dSis } 18824703203dSis } 18834703203dSis 18844703203dSis if (flag == U8_STRCMP_CS) { 18854703203dSis return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); 18864703203dSis } 18874703203dSis 18884703203dSis n1 = strlen(s1); 18894703203dSis n2 = strlen(s2); 18904703203dSis if (n != 0) { 18914703203dSis if (n < n1) 18924703203dSis n1 = n; 18934703203dSis if (n < n2) 18944703203dSis n2 = n; 18954703203dSis } 18964703203dSis 18974703203dSis /* 18984703203dSis * Simple case conversion can be done much faster and so we do 18994703203dSis * them separately here. 
19004703203dSis */ 19014703203dSis if (flag == U8_STRCMP_CI_UPPER) { 19024703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1903*85bb5f1dSis n1, n2, B_TRUE, errnum)); 19044703203dSis } else if (flag == U8_STRCMP_CI_LOWER) { 19054703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1906*85bb5f1dSis n1, n2, B_FALSE, errnum)); 19074703203dSis } 19084703203dSis 19094703203dSis return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, 1910*85bb5f1dSis flag, errnum)); 19114703203dSis } 19124703203dSis 19134703203dSis size_t 19144703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, 1915*85bb5f1dSis int flag, size_t unicode_version, int *errnum) 19164703203dSis { 19174703203dSis int f; 19184703203dSis int sz; 19194703203dSis uchar_t *ib; 19204703203dSis uchar_t *ibtail; 19214703203dSis uchar_t *ob; 19224703203dSis uchar_t *obtail; 19234703203dSis boolean_t do_not_ignore_null; 19244703203dSis boolean_t do_not_ignore_invalid; 19254703203dSis boolean_t is_it_toupper; 19264703203dSis boolean_t is_it_tolower; 19274703203dSis boolean_t canonical_decomposition; 19284703203dSis boolean_t compatibility_decomposition; 19294703203dSis boolean_t canonical_composition; 19304703203dSis size_t ret_val; 19314703203dSis size_t i; 19324703203dSis size_t j; 19334703203dSis uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; 19344703203dSis u8_normalization_states_t state; 19354703203dSis 19364703203dSis if (unicode_version > U8_UNICODE_LATEST) { 1937*85bb5f1dSis *errnum = ERANGE; 19384703203dSis return ((size_t)-1); 19394703203dSis } 19404703203dSis 19414703203dSis f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); 19424703203dSis if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { 1943*85bb5f1dSis *errnum = EBADF; 19444703203dSis return ((size_t)-1); 19454703203dSis } 19464703203dSis 19474703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 19484703203dSis if (f && f != U8_TEXTPREP_NFD && f 
!= U8_TEXTPREP_NFC && 19494703203dSis f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { 1950*85bb5f1dSis *errnum = EBADF; 19514703203dSis return ((size_t)-1); 19524703203dSis } 19534703203dSis 19544703203dSis if (inarray == NULL || *inlen == 0) 19554703203dSis return (0); 19564703203dSis 19574703203dSis if (outarray == NULL) { 1958*85bb5f1dSis *errnum = E2BIG; 19594703203dSis return ((size_t)-1); 19604703203dSis } 19614703203dSis 19624703203dSis ib = (uchar_t *)inarray; 19634703203dSis ob = (uchar_t *)outarray; 19644703203dSis ibtail = ib + *inlen; 19654703203dSis obtail = ob + *outlen; 19664703203dSis 19674703203dSis do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); 19684703203dSis do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); 19694703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 19704703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 19714703203dSis 19724703203dSis ret_val = 0; 19734703203dSis 19744703203dSis /* 19754703203dSis * If we don't have a normalization flag set, we do the simple case 19764703203dSis * conversion based text preparation separately below. Text 19774703203dSis * preparation involving Normalization will be done in the false task 19784703203dSis * block, again, separately since it will take much more time and 19794703203dSis * resource than doing simple case conversions. 
19804703203dSis */ 19814703203dSis if (f == 0) { 19824703203dSis while (ib < ibtail) { 19834703203dSis if (*ib == '\0' && do_not_ignore_null) 19844703203dSis break; 19854703203dSis 19864703203dSis sz = u8_number_of_bytes[*ib]; 19874703203dSis 19884703203dSis if (sz < 0) { 19894703203dSis if (do_not_ignore_invalid) { 1990*85bb5f1dSis *errnum = EILSEQ; 19914703203dSis ret_val = (size_t)-1; 19924703203dSis break; 19934703203dSis } 19944703203dSis 19954703203dSis sz = 1; 19964703203dSis ret_val++; 19974703203dSis } 19984703203dSis 19994703203dSis if (sz == 1) { 20004703203dSis if (ob >= obtail) { 2001*85bb5f1dSis *errnum = E2BIG; 20024703203dSis ret_val = (size_t)-1; 20034703203dSis break; 20044703203dSis } 20054703203dSis 20064703203dSis if (is_it_toupper) 20074703203dSis *ob = U8_ASCII_TOUPPER(*ib); 20084703203dSis else if (is_it_tolower) 20094703203dSis *ob = U8_ASCII_TOLOWER(*ib); 20104703203dSis else 20114703203dSis *ob = *ib; 20124703203dSis ib++; 20134703203dSis ob++; 20144703203dSis } else if ((ib + sz) > ibtail) { 20154703203dSis if (do_not_ignore_invalid) { 2016*85bb5f1dSis *errnum = EINVAL; 20174703203dSis ret_val = (size_t)-1; 20184703203dSis break; 20194703203dSis } 20204703203dSis 20214703203dSis if ((obtail - ob) < (ibtail - ib)) { 2022*85bb5f1dSis *errnum = E2BIG; 20234703203dSis ret_val = (size_t)-1; 20244703203dSis break; 20254703203dSis } 20264703203dSis 20274703203dSis /* 20284703203dSis * We treat the remaining incomplete character 20294703203dSis * bytes as a character. 
20304703203dSis */ 20314703203dSis ret_val++; 20324703203dSis 20334703203dSis while (ib < ibtail) 20344703203dSis *ob++ = *ib++; 20354703203dSis } else { 20364703203dSis if (is_it_toupper || is_it_tolower) { 20374703203dSis i = do_case_conv(unicode_version, u8s, 20384703203dSis ib, sz, is_it_toupper); 20394703203dSis 20404703203dSis if ((obtail - ob) < i) { 2041*85bb5f1dSis *errnum = E2BIG; 20424703203dSis ret_val = (size_t)-1; 20434703203dSis break; 20444703203dSis } 20454703203dSis 20464703203dSis ib += sz; 20474703203dSis 20484703203dSis for (sz = 0; sz < i; sz++) 20494703203dSis *ob++ = u8s[sz]; 20504703203dSis } else { 20514703203dSis if ((obtail - ob) < sz) { 2052*85bb5f1dSis *errnum = E2BIG; 20534703203dSis ret_val = (size_t)-1; 20544703203dSis break; 20554703203dSis } 20564703203dSis 20574703203dSis for (i = 0; i < sz; i++) 20584703203dSis *ob++ = *ib++; 20594703203dSis } 20604703203dSis } 20614703203dSis } 20624703203dSis } else { 20634703203dSis canonical_decomposition = flag & U8_CANON_DECOMP; 20644703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP; 20654703203dSis canonical_composition = flag & U8_CANON_COMP; 20664703203dSis 20674703203dSis while (ib < ibtail) { 20684703203dSis if (*ib == '\0' && do_not_ignore_null) 20694703203dSis break; 20704703203dSis 20714703203dSis /* 20724703203dSis * If the current character is a 7-bit ASCII 20734703203dSis * character and it is the last character, or, 20744703203dSis * if the current character is a 7-bit ASCII 20754703203dSis * character and the next character is also a 7-bit 20764703203dSis * ASCII character, then, we copy over this 20774703203dSis * character without going through collect_a_seq(). 20784703203dSis * 20794703203dSis * In any other cases, we need to look further with 20804703203dSis * the collect_a_seq() function. 
20814703203dSis */ 20824703203dSis if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || 20834703203dSis ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { 20844703203dSis if (ob >= obtail) { 2085*85bb5f1dSis *errnum = E2BIG; 20864703203dSis ret_val = (size_t)-1; 20874703203dSis break; 20884703203dSis } 20894703203dSis 20904703203dSis if (is_it_toupper) 20914703203dSis *ob = U8_ASCII_TOUPPER(*ib); 20924703203dSis else if (is_it_tolower) 20934703203dSis *ob = U8_ASCII_TOLOWER(*ib); 20944703203dSis else 20954703203dSis *ob = *ib; 20964703203dSis ib++; 20974703203dSis ob++; 20984703203dSis } else { 2099*85bb5f1dSis *errnum = 0; 21004703203dSis state = U8_STATE_START; 21014703203dSis 21024703203dSis j = collect_a_seq(unicode_version, u8s, 21034703203dSis &ib, ibtail, 21044703203dSis is_it_toupper, 21054703203dSis is_it_tolower, 21064703203dSis canonical_decomposition, 21074703203dSis compatibility_decomposition, 21084703203dSis canonical_composition, 2109*85bb5f1dSis errnum, &state); 21104703203dSis 2111*85bb5f1dSis if (*errnum && do_not_ignore_invalid) { 21124703203dSis ret_val = (size_t)-1; 21134703203dSis break; 21144703203dSis } 21154703203dSis 21164703203dSis if ((obtail - ob) < j) { 2117*85bb5f1dSis *errnum = E2BIG; 21184703203dSis ret_val = (size_t)-1; 21194703203dSis break; 21204703203dSis } 21214703203dSis 21224703203dSis for (i = 0; i < j; i++) 21234703203dSis *ob++ = u8s[i]; 21244703203dSis } 21254703203dSis } 21264703203dSis } 21274703203dSis 21284703203dSis *inlen = ibtail - ib; 21294703203dSis *outlen = obtail - ob; 21304703203dSis 21314703203dSis return (ret_val); 21324703203dSis } 2133