1*4703203dSis /* 2*4703203dSis * CDDL HEADER START 3*4703203dSis * 4*4703203dSis * The contents of this file are subject to the terms of the 5*4703203dSis * Common Development and Distribution License (the "License"). 6*4703203dSis * You may not use this file except in compliance with the License. 7*4703203dSis * 8*4703203dSis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*4703203dSis * or http://www.opensolaris.org/os/licensing. 10*4703203dSis * See the License for the specific language governing permissions 11*4703203dSis * and limitations under the License. 12*4703203dSis * 13*4703203dSis * When distributing Covered Code, include this CDDL HEADER in each 14*4703203dSis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*4703203dSis * If applicable, add the following below this CDDL HEADER, with the 16*4703203dSis * fields enclosed by brackets "[]" replaced with your own identifying 17*4703203dSis * information: Portions Copyright [yyyy] [name of copyright owner] 18*4703203dSis * 19*4703203dSis * CDDL HEADER END 20*4703203dSis */ 21*4703203dSis /* 22*4703203dSis * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23*4703203dSis * Use is subject to license terms. 24*4703203dSis */ 25*4703203dSis 26*4703203dSis #pragma ident "%Z%%M% %I% %E% SMI" 27*4703203dSis 28*4703203dSis 29*4703203dSis /* 30*4703203dSis * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). 31*4703203dSis * 32*4703203dSis * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), 33*4703203dSis * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also 34*4703203dSis * the section 3C man pages. 35*4703203dSis * Interface stability: Committed. 
 */

#include <sys/types.h>
#ifdef	_KERNEL
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#else
#include <sys/u8_textprep.h>
#include <strings.h>
#endif	/* _KERNEL */
#include <sys/byteorder.h>
#include <sys/errno.h>
#include <sys/u8_textprep_data.h>


/* The maximum possible number of bytes in a UTF-8 character. */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/* The maximum possible number of bytes in a Stream-Safe Text. */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/* The combining class value for a Starter. */
#define	U8_COMBINING_CLASS_STARTER	(0)

/*
 * Hangul related macros.
 *
 * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
 * Vowels, and optional Trailing consonants in Unicode scalar values.
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
 * U+11A8. The trailing consonant is optional, so the value is pre-calculated
 * with one already subtracted; this is also why U8_HANGUL_JAMO_T() tests
 * with '>' rather than '>='.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character may be a Hangul Jamo, but a match does not guarantee
 * that it is one; it only means that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)

/*
 * Store the scalar value b as a three-byte UTF-8 sequence at s[i], s[j],
 * and s[k].
 *
 * This expands to three separate statements and already carries a trailing
 * semicolon, so it must only be used where a statement list is valid (e.g.
 * inside braces), never as the sole body of an unbraced if/else/loop.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));

/* True if the scalar value u is a modern leading consonant Jamo. */
#define	U8_HANGUL_JAMO_L(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)

/* True if the scalar value u is a modern vowel Jamo. */
#define	U8_HANGUL_JAMO_V(u) \
	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)

/*
 * True if the scalar value u is a modern trailing consonant Jamo. The lower
 * bound is exclusive because U8_HANGUL_JAMO_T_FIRST is one less than the
 * first real trailing consonant.
 */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/*
 * True if u lies anywhere between the first leading consonant and the last
 * trailing consonant, which also covers the values between the L, V, and T
 * ranges tested by the three macros above.
 */
#define	U8_HANGUL_JAMO(u) \
	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* True if the scalar value u is a precomposed Hangul syllable. */
#define	U8_HANGUL_SYLLABLE(u) \
	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)

/* True if the state s is "L seen" and u is a vowel Jamo. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* True if the state s is "LV seen" and u is a trailing consonant Jamo. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))

/* The types of decomposition mappings. */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/* The indicator for 16-bit table. */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)

/* The following are some convenience macros. */

/*
 * Assemble the Unicode scalar value u from the three bytes b1, b2, and b3
 * of a three-byte UTF-8 character. The expansion is a complete assignment
 * statement including its semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
		(uint32_t)(b3) & 0x3F;

/*
 * Exchange a and b using t as scratch storage. Expands to three statements
 * with a trailing semicolon; see the caveat at U8_SAVE_HANGUL_AS_UTF8.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	(t) = (a); \
	(a) = (b); \
	(b) = (t);

/*
 * ASCII-only case conversions. Both macros evaluate their argument more
 * than once, so the argument must be free of side effects.
 */
#define	U8_ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))

#define	U8_ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))

/* True if c, taken as an unsigned byte, is a 7-bit ASCII value. */
#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
/*
 * The following macro assumes that the two characters that are to be
 * swapped are adjacent to each other and 'a' comes before 'b'.
 *
 * If the assumptions are not met, the macro will fail.
 *
 * The macro is not self-contained: it reads and writes the identifiers
 * k, u8t, u8s, start, disp, comb_class, and tc, all of which must exist in
 * the scope where it is expanded. It also expands to several statements,
 * so it must be used inside braces.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	for (k = 0; k < disp[(a)]; k++) \
		u8t[k] = u8s[start[(a)] + k]; \
	for (k = 0; k < disp[(b)]; k++) \
		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
	start[(b)] = start[(a)] + disp[(b)]; \
	for (k = 0; k < disp[(a)]; k++) \
		u8s[start[(b)] + k] = u8t[k]; \
	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);

/* The possible states during normalization. */
typedef enum {
	/* No Hangul context; also what do_decomp() reports by default. */
	U8_STATE_START = 0,
	/* A leading consonant Jamo was seen. */
	U8_STATE_HANGUL_L = 1,
	/* A leading consonant followed by a vowel was seen. */
	U8_STATE_HANGUL_LV = 2,
	/* A leading consonant, a vowel, and a trailing consonant were seen. */
	U8_STATE_HANGUL_LVT = 3,
	/* A vowel Jamo that did not follow a leading consonant. */
	U8_STATE_HANGUL_V = 4,
	/* A trailing consonant Jamo that did not follow an LV. */
	U8_STATE_HANGUL_T = 5,
	/* NOTE(review): presumably "a non-Starter was seen" -- confirm. */
	U8_STATE_COMBINING_MARK = 6
} u8_normalization_states_t;

/*
 * The three vectors below are used to check that the bytes of a given UTF-8
 * character are valid and do not contain any malformed byte values.
 *
 * We used to have a quite relaxed UTF-8 binary representation but then there
 * were some security related issues and so the Unicode Consortium defined
 * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
 * one more time at Unicode 3.2. The following three tables are based on
 * that.
 *
 * All three tables are indexed by the first byte of a character.
 * u8_number_of_bytes gives the total byte length of the character, or one
 * of the U8_ILLEGAL_CHAR / U8_OUT_OF_RANGE_CHAR markers for a byte that
 * cannot start a character. u8_valid_min_2nd_byte and u8_valid_max_2nd_byte
 * give the inclusive range allowed for the second byte; this is narrower
 * than 0x80 - 0xBF for the first bytes E0, ED, F0, and F4.
 */

/* True if c cannot be a third or fourth byte of a UTF-8 character. */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)

/* Short aliases that keep the table rows below readable. */
#define	I_				U8_ILLEGAL_CHAR
#define	O_				U8_OUT_OF_RANGE_CHAR

const int8_t u8_number_of_bytes[0x100] = {
	/* 00 - 7F: single byte (7-bit ASCII) characters. */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
};

#undef	I_
#undef	O_

/*
 * The smallest valid second byte for each first byte. Entries for first
 * bytes that do not start a multi-byte character are 0 and are not
 * consulted by u8_validate().
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

/*
 * The largest valid second byte for each first byte, laid out exactly like
 * u8_valid_min_2nd_byte above.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8 - FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * The u8_validate() validates on the given UTF-8
character string and 332*4703203dSis * calculate the byte length. It is quite similar to mblen(3C) except that 333*4703203dSis * this will validate against the list of characters if required and 334*4703203dSis * specific to UTF-8 and Unicode. 335*4703203dSis */ 336*4703203dSis int 337*4703203dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errno) 338*4703203dSis { 339*4703203dSis uchar_t *ib; 340*4703203dSis uchar_t *ibtail; 341*4703203dSis uchar_t **p; 342*4703203dSis uchar_t *s1; 343*4703203dSis uchar_t *s2; 344*4703203dSis uchar_t f; 345*4703203dSis int sz; 346*4703203dSis size_t i; 347*4703203dSis int ret_val; 348*4703203dSis boolean_t second; 349*4703203dSis boolean_t no_need_to_validate_entire; 350*4703203dSis boolean_t check_additional; 351*4703203dSis boolean_t validate_ucs2_range_only; 352*4703203dSis 353*4703203dSis if (! u8str) 354*4703203dSis return (0); 355*4703203dSis 356*4703203dSis ib = (uchar_t *)u8str; 357*4703203dSis ibtail = ib + n; 358*4703203dSis 359*4703203dSis ret_val = 0; 360*4703203dSis 361*4703203dSis no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); 362*4703203dSis check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; 363*4703203dSis validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; 364*4703203dSis 365*4703203dSis while (ib < ibtail) { 366*4703203dSis /* 367*4703203dSis * The first byte of a UTF-8 character tells how many 368*4703203dSis * bytes will follow for the character. If the first byte 369*4703203dSis * is an illegal byte value or out of range value, we just 370*4703203dSis * return -1 with an appropriate error number. 
371*4703203dSis */ 372*4703203dSis sz = u8_number_of_bytes[*ib]; 373*4703203dSis if (sz == U8_ILLEGAL_CHAR) { 374*4703203dSis *errno = EILSEQ; 375*4703203dSis return (-1); 376*4703203dSis } 377*4703203dSis 378*4703203dSis if (sz == U8_OUT_OF_RANGE_CHAR || 379*4703203dSis (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { 380*4703203dSis *errno = ERANGE; 381*4703203dSis return (-1); 382*4703203dSis } 383*4703203dSis 384*4703203dSis /* 385*4703203dSis * If we don't have enough bytes to check on, that's also 386*4703203dSis * an error. As you can see, we give illegal byte sequence 387*4703203dSis * checking higher priority then EINVAL cases. 388*4703203dSis */ 389*4703203dSis if ((ibtail - ib) < sz) { 390*4703203dSis *errno = EINVAL; 391*4703203dSis return (-1); 392*4703203dSis } 393*4703203dSis 394*4703203dSis if (sz == 1) { 395*4703203dSis ib++; 396*4703203dSis ret_val++; 397*4703203dSis } else { 398*4703203dSis /* 399*4703203dSis * Check on the multi-byte UTF-8 character. For more 400*4703203dSis * details on this, see comment added for the used 401*4703203dSis * data structures at the beginning of the file. 
402*4703203dSis */ 403*4703203dSis f = *ib++; 404*4703203dSis ret_val++; 405*4703203dSis second = B_TRUE; 406*4703203dSis for (i = 1; i < sz; i++) { 407*4703203dSis if (second) { 408*4703203dSis if (*ib < u8_valid_min_2nd_byte[f] || 409*4703203dSis *ib > u8_valid_max_2nd_byte[f]) { 410*4703203dSis *errno = EILSEQ; 411*4703203dSis return (-1); 412*4703203dSis } 413*4703203dSis second = B_FALSE; 414*4703203dSis } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { 415*4703203dSis *errno = EILSEQ; 416*4703203dSis return (-1); 417*4703203dSis } 418*4703203dSis ib++; 419*4703203dSis ret_val++; 420*4703203dSis } 421*4703203dSis } 422*4703203dSis 423*4703203dSis if (check_additional) { 424*4703203dSis for (p = (uchar_t **)list, i = 0; p[i]; i++) { 425*4703203dSis s1 = ib - sz; 426*4703203dSis s2 = p[i]; 427*4703203dSis while (s1 < ib) { 428*4703203dSis if (*s1 != *s2 || *s2 == '\0') 429*4703203dSis break; 430*4703203dSis s1++; 431*4703203dSis s2++; 432*4703203dSis } 433*4703203dSis 434*4703203dSis if (s1 >= ib && *s2 == '\0') { 435*4703203dSis *errno = EBADF; 436*4703203dSis return (-1); 437*4703203dSis } 438*4703203dSis } 439*4703203dSis } 440*4703203dSis 441*4703203dSis if (no_need_to_validate_entire) 442*4703203dSis break; 443*4703203dSis } 444*4703203dSis 445*4703203dSis return (ret_val); 446*4703203dSis } 447*4703203dSis 448*4703203dSis /* 449*4703203dSis * The do_case_conv() looks at the mapping tables and returns found 450*4703203dSis * bytes if any. If not found, the input bytes are returned. The function 451*4703203dSis * always terminate the return bytes with a null character assuming that 452*4703203dSis * there are plenty of room to do so. 453*4703203dSis * 454*4703203dSis * The case conversions are simple case conversions mapping a character to 455*4703203dSis * another character as specified in the Unicode data. The byte size of 456*4703203dSis * the mapped character could be different from that of the input character. 
457*4703203dSis * 458*4703203dSis * The return value is the byte length of the returned character excluding 459*4703203dSis * the terminating null byte. 460*4703203dSis */ 461*4703203dSis static size_t 462*4703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) 463*4703203dSis { 464*4703203dSis size_t i; 465*4703203dSis uint16_t b1 = 0; 466*4703203dSis uint16_t b2 = 0; 467*4703203dSis uint16_t b3 = 0; 468*4703203dSis uint16_t b3_tbl; 469*4703203dSis uint16_t b3_base; 470*4703203dSis uint16_t b4 = 0; 471*4703203dSis size_t start_id; 472*4703203dSis size_t end_id; 473*4703203dSis 474*4703203dSis /* 475*4703203dSis * At this point, the only possible values for sz are 2, 3, and 4. 476*4703203dSis * The u8s should point to a vector that is well beyond the size of 477*4703203dSis * 5 bytes. 478*4703203dSis */ 479*4703203dSis if (sz == 2) { 480*4703203dSis b3 = u8s[0] = s[0]; 481*4703203dSis b4 = u8s[1] = s[1]; 482*4703203dSis } else if (sz == 3) { 483*4703203dSis b2 = u8s[0] = s[0]; 484*4703203dSis b3 = u8s[1] = s[1]; 485*4703203dSis b4 = u8s[2] = s[2]; 486*4703203dSis } else if (sz == 4) { 487*4703203dSis b1 = u8s[0] = s[0]; 488*4703203dSis b2 = u8s[1] = s[1]; 489*4703203dSis b3 = u8s[2] = s[2]; 490*4703203dSis b4 = u8s[3] = s[3]; 491*4703203dSis } else { 492*4703203dSis /* This is not possible but just in case as a fallback. */ 493*4703203dSis if (is_it_toupper) 494*4703203dSis *u8s = U8_ASCII_TOUPPER(*s); 495*4703203dSis else 496*4703203dSis *u8s = U8_ASCII_TOLOWER(*s); 497*4703203dSis u8s[1] = '\0'; 498*4703203dSis 499*4703203dSis return (1); 500*4703203dSis } 501*4703203dSis u8s[sz] = '\0'; 502*4703203dSis 503*4703203dSis /* 504*4703203dSis * Let's find out if we have a corresponding character. 
505*4703203dSis */ 506*4703203dSis b1 = u8_common_b1_tbl[uv][b1]; 507*4703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 508*4703203dSis return ((size_t)sz); 509*4703203dSis 510*4703203dSis b2 = u8_case_common_b2_tbl[uv][b1][b2]; 511*4703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 512*4703203dSis return ((size_t)sz); 513*4703203dSis 514*4703203dSis if (is_it_toupper) { 515*4703203dSis b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; 516*4703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 517*4703203dSis return ((size_t)sz); 518*4703203dSis 519*4703203dSis start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; 520*4703203dSis end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; 521*4703203dSis 522*4703203dSis /* Either there is no match or an error at the table. */ 523*4703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 524*4703203dSis return ((size_t)sz); 525*4703203dSis 526*4703203dSis b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; 527*4703203dSis 528*4703203dSis for (i = 0; start_id < end_id; start_id++) 529*4703203dSis u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; 530*4703203dSis } else { 531*4703203dSis b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; 532*4703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 533*4703203dSis return ((size_t)sz); 534*4703203dSis 535*4703203dSis start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; 536*4703203dSis end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; 537*4703203dSis 538*4703203dSis if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) 539*4703203dSis return ((size_t)sz); 540*4703203dSis 541*4703203dSis b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; 542*4703203dSis 543*4703203dSis for (i = 0; start_id < end_id; start_id++) 544*4703203dSis u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; 545*4703203dSis } 546*4703203dSis 547*4703203dSis /* 548*4703203dSis * If i is still zero, that means there is no corresponding character. 
549*4703203dSis */ 550*4703203dSis if (i == 0) 551*4703203dSis return ((size_t)sz); 552*4703203dSis 553*4703203dSis u8s[i] = '\0'; 554*4703203dSis 555*4703203dSis return (i); 556*4703203dSis } 557*4703203dSis 558*4703203dSis /* 559*4703203dSis * The do_case_compare() function compares the two input strings, s1 and s2, 560*4703203dSis * one character at a time doing case conversions if applicable and return 561*4703203dSis * the comparison result as like strcmp(). 562*4703203dSis * 563*4703203dSis * Since, in empirical sense, most of text data are 7-bit ASCII characters, 564*4703203dSis * we treat the 7-bit ASCII characters as a special case trying to yield 565*4703203dSis * faster processing time. 566*4703203dSis */ 567*4703203dSis static int 568*4703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, 569*4703203dSis size_t n2, boolean_t is_it_toupper, int *errno) 570*4703203dSis { 571*4703203dSis int f; 572*4703203dSis int sz1; 573*4703203dSis int sz2; 574*4703203dSis size_t j; 575*4703203dSis size_t i1; 576*4703203dSis size_t i2; 577*4703203dSis uchar_t u8s1[U8_MB_CUR_MAX + 1]; 578*4703203dSis uchar_t u8s2[U8_MB_CUR_MAX + 1]; 579*4703203dSis 580*4703203dSis i1 = i2 = 0; 581*4703203dSis while (i1 < n1 && i2 < n2) { 582*4703203dSis /* 583*4703203dSis * Find out what would be the byte length for this UTF-8 584*4703203dSis * character at string s1 and also find out if this is 585*4703203dSis * an illegal start byte or not and if so, issue a proper 586*4703203dSis * errno and yet treat this byte as a character. 587*4703203dSis */ 588*4703203dSis sz1 = u8_number_of_bytes[*s1]; 589*4703203dSis if (sz1 < 0) { 590*4703203dSis *errno = EILSEQ; 591*4703203dSis sz1 = 1; 592*4703203dSis } 593*4703203dSis 594*4703203dSis /* 595*4703203dSis * For 7-bit ASCII characters mainly, we do a quick case 596*4703203dSis * conversion right at here. 
597*4703203dSis * 598*4703203dSis * If we don't have enough bytes for this character, issue 599*4703203dSis * an EINVAL error and use what are available. 600*4703203dSis * 601*4703203dSis * If we have enough bytes, find out if there is 602*4703203dSis * a corresponding uppercase character and if so, copy over 603*4703203dSis * the bytes for a comparison later. If there is no 604*4703203dSis * corresponding uppercase character, then, use what we have 605*4703203dSis * for the comparison. 606*4703203dSis */ 607*4703203dSis if (sz1 == 1) { 608*4703203dSis if (is_it_toupper) 609*4703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1); 610*4703203dSis else 611*4703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1); 612*4703203dSis s1++; 613*4703203dSis u8s1[1] = '\0'; 614*4703203dSis } else if ((i1 + sz1) > n1) { 615*4703203dSis *errno = EINVAL; 616*4703203dSis for (j = 0; (i1 + j) < n1; ) 617*4703203dSis u8s1[j++] = *s1++; 618*4703203dSis u8s1[j] = '\0'; 619*4703203dSis } else { 620*4703203dSis (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); 621*4703203dSis s1 += sz1; 622*4703203dSis } 623*4703203dSis 624*4703203dSis /* Do the same for the string s2. */ 625*4703203dSis sz2 = u8_number_of_bytes[*s2]; 626*4703203dSis if (sz2 < 0) { 627*4703203dSis *errno = EILSEQ; 628*4703203dSis sz2 = 1; 629*4703203dSis } 630*4703203dSis 631*4703203dSis if (sz2 == 1) { 632*4703203dSis if (is_it_toupper) 633*4703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2); 634*4703203dSis else 635*4703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2); 636*4703203dSis s2++; 637*4703203dSis u8s2[1] = '\0'; 638*4703203dSis } else if ((i2 + sz2) > n2) { 639*4703203dSis *errno = EINVAL; 640*4703203dSis for (j = 0; (i2 + j) < n2; ) 641*4703203dSis u8s2[j++] = *s2++; 642*4703203dSis u8s2[j] = '\0'; 643*4703203dSis } else { 644*4703203dSis (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); 645*4703203dSis s2 += sz2; 646*4703203dSis } 647*4703203dSis 648*4703203dSis /* Now compare the two characters. 
*/ 649*4703203dSis if (sz1 == 1 && sz2 == 1) { 650*4703203dSis if (*u8s1 > *u8s2) 651*4703203dSis return (1); 652*4703203dSis if (*u8s1 < *u8s2) 653*4703203dSis return (-1); 654*4703203dSis } else { 655*4703203dSis f = strcmp((const char *)u8s1, (const char *)u8s2); 656*4703203dSis if (f != 0) 657*4703203dSis return (f); 658*4703203dSis } 659*4703203dSis 660*4703203dSis /* 661*4703203dSis * They were the same. Let's move on to the next 662*4703203dSis * characters then. 663*4703203dSis */ 664*4703203dSis i1 += sz1; 665*4703203dSis i2 += sz2; 666*4703203dSis } 667*4703203dSis 668*4703203dSis /* 669*4703203dSis * We compared until the end of either or both strings. 670*4703203dSis * 671*4703203dSis * If we reached to or went over the ends for the both, that means 672*4703203dSis * they are the same. 673*4703203dSis * 674*4703203dSis * If we reached only one of the two ends, that means the other string 675*4703203dSis * has something which then the fact can be used to determine 676*4703203dSis * the return value. 677*4703203dSis */ 678*4703203dSis if (i1 >= n1) { 679*4703203dSis if (i2 >= n2) 680*4703203dSis return (0); 681*4703203dSis return (-1); 682*4703203dSis } 683*4703203dSis return (1); 684*4703203dSis } 685*4703203dSis 686*4703203dSis /* 687*4703203dSis * The combining_class() function checks on the given bytes and find out 688*4703203dSis * the corresponding Unicode combining class value. The return value 0 means 689*4703203dSis * it is a Starter. Any illegal UTF-8 character will also be treated as 690*4703203dSis * a Starter. 
691*4703203dSis */ 692*4703203dSis static uchar_t 693*4703203dSis combining_class(size_t uv, uchar_t *s, size_t sz) 694*4703203dSis { 695*4703203dSis uint16_t b1 = 0; 696*4703203dSis uint16_t b2 = 0; 697*4703203dSis uint16_t b3 = 0; 698*4703203dSis uint16_t b4 = 0; 699*4703203dSis 700*4703203dSis if (sz == 1 || sz > 4) 701*4703203dSis return (0); 702*4703203dSis 703*4703203dSis if (sz == 2) { 704*4703203dSis b3 = s[0]; 705*4703203dSis b4 = s[1]; 706*4703203dSis } else if (sz == 3) { 707*4703203dSis b2 = s[0]; 708*4703203dSis b3 = s[1]; 709*4703203dSis b4 = s[2]; 710*4703203dSis } else if (sz == 4) { 711*4703203dSis b1 = s[0]; 712*4703203dSis b2 = s[1]; 713*4703203dSis b3 = s[2]; 714*4703203dSis b4 = s[3]; 715*4703203dSis } 716*4703203dSis 717*4703203dSis b1 = u8_common_b1_tbl[uv][b1]; 718*4703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 719*4703203dSis return (0); 720*4703203dSis 721*4703203dSis b2 = u8_combining_class_b2_tbl[uv][b1][b2]; 722*4703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 723*4703203dSis return (0); 724*4703203dSis 725*4703203dSis b3 = u8_combining_class_b3_tbl[uv][b2][b3]; 726*4703203dSis if (b3 == U8_TBL_ELEMENT_NOT_DEF) 727*4703203dSis return (0); 728*4703203dSis 729*4703203dSis return (u8_combining_class_b4_tbl[uv][b3][b4]); 730*4703203dSis } 731*4703203dSis 732*4703203dSis /* 733*4703203dSis * The do_decomp() function finds out a matching decomposition if any 734*4703203dSis * and return. If there is no match, the input bytes are copied and returned. 735*4703203dSis * The function also checks if there is a Hangul, decomposes it if necessary 736*4703203dSis * and returns. 737*4703203dSis * 738*4703203dSis * To save time, a single byte 7-bit ASCII character should be handled by 739*4703203dSis * the caller. 740*4703203dSis * 741*4703203dSis * The function returns the number of bytes returned sans always terminating 742*4703203dSis * the null byte. 
It will also return a state that will tell if there was 743*4703203dSis * a Hangul character decomposed which then will be used by the caller. 744*4703203dSis */ 745*4703203dSis static size_t 746*4703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, 747*4703203dSis boolean_t canonical_decomposition, u8_normalization_states_t *state) 748*4703203dSis { 749*4703203dSis uint16_t b1 = 0; 750*4703203dSis uint16_t b2 = 0; 751*4703203dSis uint16_t b3 = 0; 752*4703203dSis uint16_t b3_tbl; 753*4703203dSis uint16_t b3_base; 754*4703203dSis uint16_t b4 = 0; 755*4703203dSis size_t start_id; 756*4703203dSis size_t end_id; 757*4703203dSis size_t i; 758*4703203dSis uint32_t u1; 759*4703203dSis 760*4703203dSis if (sz == 2) { 761*4703203dSis b3 = u8s[0] = s[0]; 762*4703203dSis b4 = u8s[1] = s[1]; 763*4703203dSis u8s[2] = '\0'; 764*4703203dSis } else if (sz == 3) { 765*4703203dSis /* Convert it to a Unicode scalar value. */ 766*4703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); 767*4703203dSis 768*4703203dSis /* 769*4703203dSis * If this is a Hangul syllable, we decompose it into 770*4703203dSis * a leading consonant, a vowel, and an optional trailing 771*4703203dSis * consonant and then return. 
772*4703203dSis */ 773*4703203dSis if (U8_HANGUL_SYLLABLE(u1)) { 774*4703203dSis u1 -= U8_HANGUL_SYL_FIRST; 775*4703203dSis 776*4703203dSis b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; 777*4703203dSis b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) 778*4703203dSis / U8_HANGUL_T_COUNT; 779*4703203dSis b3 = u1 % U8_HANGUL_T_COUNT; 780*4703203dSis 781*4703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); 782*4703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); 783*4703203dSis if (b3) { 784*4703203dSis b3 += U8_HANGUL_JAMO_T_FIRST; 785*4703203dSis U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); 786*4703203dSis 787*4703203dSis u8s[9] = '\0'; 788*4703203dSis *state = U8_STATE_HANGUL_LVT; 789*4703203dSis return (9); 790*4703203dSis } 791*4703203dSis 792*4703203dSis u8s[6] = '\0'; 793*4703203dSis *state = U8_STATE_HANGUL_LV; 794*4703203dSis return (6); 795*4703203dSis } 796*4703203dSis 797*4703203dSis b2 = u8s[0] = s[0]; 798*4703203dSis b3 = u8s[1] = s[1]; 799*4703203dSis b4 = u8s[2] = s[2]; 800*4703203dSis u8s[3] = '\0'; 801*4703203dSis 802*4703203dSis /* 803*4703203dSis * If this is a Hangul Jamo, we know there is nothing 804*4703203dSis * further that we can decompose. 
805*4703203dSis */ 806*4703203dSis if (U8_HANGUL_JAMO_L(u1)) { 807*4703203dSis *state = U8_STATE_HANGUL_L; 808*4703203dSis return (3); 809*4703203dSis } 810*4703203dSis 811*4703203dSis if (U8_HANGUL_JAMO_V(u1)) { 812*4703203dSis if (*state == U8_STATE_HANGUL_L) 813*4703203dSis *state = U8_STATE_HANGUL_LV; 814*4703203dSis else 815*4703203dSis *state = U8_STATE_HANGUL_V; 816*4703203dSis return (3); 817*4703203dSis } 818*4703203dSis 819*4703203dSis if (U8_HANGUL_JAMO_T(u1)) { 820*4703203dSis if (*state == U8_STATE_HANGUL_LV) 821*4703203dSis *state = U8_STATE_HANGUL_LVT; 822*4703203dSis else 823*4703203dSis *state = U8_STATE_HANGUL_T; 824*4703203dSis return (3); 825*4703203dSis } 826*4703203dSis } else if (sz == 4) { 827*4703203dSis b1 = u8s[0] = s[0]; 828*4703203dSis b2 = u8s[1] = s[1]; 829*4703203dSis b3 = u8s[2] = s[2]; 830*4703203dSis b4 = u8s[3] = s[3]; 831*4703203dSis u8s[4] = '\0'; 832*4703203dSis } else { 833*4703203dSis /* 834*4703203dSis * This is a fallback and should not happen if the function 835*4703203dSis * was called properly. 836*4703203dSis */ 837*4703203dSis u8s[0] = s[0]; 838*4703203dSis u8s[1] = '\0'; 839*4703203dSis *state = U8_STATE_START; 840*4703203dSis return (1); 841*4703203dSis } 842*4703203dSis 843*4703203dSis /* 844*4703203dSis * At this point, this rountine does not know what it would get. 845*4703203dSis * The caller should sort it out if the state isn't a Hangul one. 846*4703203dSis */ 847*4703203dSis *state = U8_STATE_START; 848*4703203dSis 849*4703203dSis /* Try to find matching decomposition mapping byte sequence. 
*/ 850*4703203dSis b1 = u8_common_b1_tbl[uv][b1]; 851*4703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 852*4703203dSis return ((size_t)sz); 853*4703203dSis 854*4703203dSis b2 = u8_decomp_b2_tbl[uv][b1][b2]; 855*4703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 856*4703203dSis return ((size_t)sz); 857*4703203dSis 858*4703203dSis b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; 859*4703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 860*4703203dSis return ((size_t)sz); 861*4703203dSis 862*4703203dSis /* 863*4703203dSis * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR 864*4703203dSis * which is 0x8000, this means we couldn't fit the mappings into 865*4703203dSis * the cardinality of a unsigned byte. 866*4703203dSis */ 867*4703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 868*4703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR; 869*4703203dSis start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; 870*4703203dSis end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 871*4703203dSis } else { 872*4703203dSis start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; 873*4703203dSis end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; 874*4703203dSis } 875*4703203dSis 876*4703203dSis /* This also means there wasn't any matching decomposition. */ 877*4703203dSis if (start_id >= end_id) 878*4703203dSis return ((size_t)sz); 879*4703203dSis 880*4703203dSis /* 881*4703203dSis * The final table for decomposition mappings has three types of 882*4703203dSis * byte sequences depending on whether a mapping is for compatibility 883*4703203dSis * decomposition, canonical decomposition, or both like the following: 884*4703203dSis * 885*4703203dSis * (1) Compatibility decomposition mappings: 886*4703203dSis * 887*4703203dSis * +---+---+-...-+---+ 888*4703203dSis * | B0| B1| ... | Bm| 889*4703203dSis * +---+---+-...-+---+ 890*4703203dSis * 891*4703203dSis * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). 
892*4703203dSis * 893*4703203dSis * (2) Canonical decomposition mappings: 894*4703203dSis * 895*4703203dSis * +---+---+---+-...-+---+ 896*4703203dSis * | T | b0| b1| ... | bn| 897*4703203dSis * +---+---+---+-...-+---+ 898*4703203dSis * 899*4703203dSis * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). 900*4703203dSis * 901*4703203dSis * (3) Both mappings: 902*4703203dSis * 903*4703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+ 904*4703203dSis * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| 905*4703203dSis * +---+---+---+---+-...-+---+---+---+-...-+---+ 906*4703203dSis * 907*4703203dSis * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement 908*4703203dSis * byte, b0 to bn are canonical mapping bytes and B0 to Bm are 909*4703203dSis * compatibility mapping bytes. 910*4703203dSis * 911*4703203dSis * Note that compatibility decomposition means doing recursive 912*4703203dSis * decompositions using both compatibility decomposition mappings and 913*4703203dSis * canonical decomposition mappings. On the other hand, canonical 914*4703203dSis * decomposition means doing recursive decompositions using only 915*4703203dSis * canonical decomposition mappings. Since the table we have has gone 916*4703203dSis * through the recursions already, we do not need to do so during 917*4703203dSis * runtime, i.e., the table has been completely flattened out 918*4703203dSis * already. 919*4703203dSis */ 920*4703203dSis 921*4703203dSis b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; 922*4703203dSis 923*4703203dSis /* Get the type, T, of the byte sequence. */ 924*4703203dSis b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; 925*4703203dSis 926*4703203dSis /* 927*4703203dSis * If necessary, adjust start_id, end_id, or both. Note that if 928*4703203dSis * this is compatibility decomposition mapping, there is no 929*4703203dSis * adjustment. 
930*4703203dSis */ 931*4703203dSis if (canonical_decomposition) { 932*4703203dSis /* Is the mapping only for compatibility decomposition? */ 933*4703203dSis if (b1 < U8_DECOMP_BOTH) 934*4703203dSis return ((size_t)sz); 935*4703203dSis 936*4703203dSis start_id++; 937*4703203dSis 938*4703203dSis if (b1 == U8_DECOMP_BOTH) { 939*4703203dSis end_id = start_id + 940*4703203dSis u8_decomp_final_tbl[uv][b3_base + start_id]; 941*4703203dSis start_id++; 942*4703203dSis } 943*4703203dSis } else { 944*4703203dSis /* 945*4703203dSis * Unless this is a compatibility decomposition mapping, 946*4703203dSis * we adjust the start_id. 947*4703203dSis */ 948*4703203dSis if (b1 == U8_DECOMP_BOTH) { 949*4703203dSis start_id++; 950*4703203dSis start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; 951*4703203dSis } else if (b1 == U8_DECOMP_CANONICAL) { 952*4703203dSis start_id++; 953*4703203dSis } 954*4703203dSis } 955*4703203dSis 956*4703203dSis for (i = 0; start_id < end_id; start_id++) 957*4703203dSis u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; 958*4703203dSis u8s[i] = '\0'; 959*4703203dSis 960*4703203dSis return (i); 961*4703203dSis } 962*4703203dSis 963*4703203dSis /* 964*4703203dSis * The find_composition_start() function uses the character bytes given and 965*4703203dSis * find out the matching composition mappings if any and return the address 966*4703203dSis * to the composition mappings as explained in the do_composition(). 
967*4703203dSis */ 968*4703203dSis static uchar_t * 969*4703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz) 970*4703203dSis { 971*4703203dSis uint16_t b1 = 0; 972*4703203dSis uint16_t b2 = 0; 973*4703203dSis uint16_t b3 = 0; 974*4703203dSis uint16_t b3_tbl; 975*4703203dSis uint16_t b3_base; 976*4703203dSis uint16_t b4 = 0; 977*4703203dSis size_t start_id; 978*4703203dSis size_t end_id; 979*4703203dSis 980*4703203dSis if (sz == 1) { 981*4703203dSis b4 = s[0]; 982*4703203dSis } else if (sz == 2) { 983*4703203dSis b3 = s[0]; 984*4703203dSis b4 = s[1]; 985*4703203dSis } else if (sz == 3) { 986*4703203dSis b2 = s[0]; 987*4703203dSis b3 = s[1]; 988*4703203dSis b4 = s[2]; 989*4703203dSis } else if (sz == 4) { 990*4703203dSis b1 = s[0]; 991*4703203dSis b2 = s[1]; 992*4703203dSis b3 = s[2]; 993*4703203dSis b4 = s[3]; 994*4703203dSis } else { 995*4703203dSis /* 996*4703203dSis * This is a fallback and should not happen if the function 997*4703203dSis * was called properly. 998*4703203dSis */ 999*4703203dSis return (NULL); 1000*4703203dSis } 1001*4703203dSis 1002*4703203dSis b1 = u8_composition_b1_tbl[uv][b1]; 1003*4703203dSis if (b1 == U8_TBL_ELEMENT_NOT_DEF) 1004*4703203dSis return (NULL); 1005*4703203dSis 1006*4703203dSis b2 = u8_composition_b2_tbl[uv][b1][b2]; 1007*4703203dSis if (b2 == U8_TBL_ELEMENT_NOT_DEF) 1008*4703203dSis return (NULL); 1009*4703203dSis 1010*4703203dSis b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; 1011*4703203dSis if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) 1012*4703203dSis return (NULL); 1013*4703203dSis 1014*4703203dSis if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { 1015*4703203dSis b3_tbl -= U8_16BIT_TABLE_INDICATOR; 1016*4703203dSis start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; 1017*4703203dSis end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; 1018*4703203dSis } else { 1019*4703203dSis start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; 1020*4703203dSis end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; 
1021*4703203dSis } 1022*4703203dSis 1023*4703203dSis if (start_id >= end_id) 1024*4703203dSis return (NULL); 1025*4703203dSis 1026*4703203dSis b3_base = u8_composition_b3_tbl[uv][b2][b3].base; 1027*4703203dSis 1028*4703203dSis return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); 1029*4703203dSis } 1030*4703203dSis 1031*4703203dSis /* 1032*4703203dSis * The blocked() function checks on the combining class values of previous 1033*4703203dSis * characters in this sequence and return whether it is blocked or not. 1034*4703203dSis */ 1035*4703203dSis static boolean_t 1036*4703203dSis blocked(uchar_t *comb_class, size_t last) 1037*4703203dSis { 1038*4703203dSis uchar_t my_comb_class; 1039*4703203dSis size_t i; 1040*4703203dSis 1041*4703203dSis my_comb_class = comb_class[last]; 1042*4703203dSis for (i = 1; i < last; i++) 1043*4703203dSis if (comb_class[i] >= my_comb_class || 1044*4703203dSis comb_class[i] == U8_COMBINING_CLASS_STARTER) 1045*4703203dSis return (B_TRUE); 1046*4703203dSis 1047*4703203dSis return (B_FALSE); 1048*4703203dSis } 1049*4703203dSis 1050*4703203dSis /* 1051*4703203dSis * The do_composition() reads the character string pointed by 's' and 1052*4703203dSis * do necessary canonical composition and then copy over the result back to 1053*4703203dSis * the 's'. 1054*4703203dSis * 1055*4703203dSis * The input argument 's' cannot contain more than 32 characters. 
1056*4703203dSis */ 1057*4703203dSis static size_t 1058*4703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, 1059*4703203dSis uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) 1060*4703203dSis { 1061*4703203dSis uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; 1062*4703203dSis uchar_t tc[U8_MB_CUR_MAX]; 1063*4703203dSis uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; 1064*4703203dSis size_t saved_marks_count; 1065*4703203dSis uchar_t *p; 1066*4703203dSis uchar_t *saved_p; 1067*4703203dSis uchar_t *q; 1068*4703203dSis size_t i; 1069*4703203dSis size_t saved_i; 1070*4703203dSis size_t j; 1071*4703203dSis size_t k; 1072*4703203dSis size_t l; 1073*4703203dSis size_t C; 1074*4703203dSis size_t saved_l; 1075*4703203dSis size_t size; 1076*4703203dSis uint32_t u1; 1077*4703203dSis uint32_t u2; 1078*4703203dSis boolean_t match_not_found = B_TRUE; 1079*4703203dSis 1080*4703203dSis /* 1081*4703203dSis * This should never happen unless the callers are doing some strange 1082*4703203dSis * and unexpected things. 1083*4703203dSis * 1084*4703203dSis * The "last" is the index pointing to the last character not last + 1. 1085*4703203dSis */ 1086*4703203dSis if (last >= U8_MAX_CHARS_A_SEQ) 1087*4703203dSis last = U8_UPPER_LIMIT_IN_A_SEQ; 1088*4703203dSis 1089*4703203dSis for (i = l = 0; i <= last; i++) { 1090*4703203dSis /* 1091*4703203dSis * The last or any non-Starters at the beginning, we don't 1092*4703203dSis * have any chance to do composition and so we just copy them 1093*4703203dSis * to the temporary buffer. 
1094*4703203dSis */ 1095*4703203dSis if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { 1096*4703203dSis SAVE_THE_CHAR: 1097*4703203dSis p = s + start[i]; 1098*4703203dSis size = disp[i]; 1099*4703203dSis for (k = 0; k < size; k++) 1100*4703203dSis t[l++] = *p++; 1101*4703203dSis continue; 1102*4703203dSis } 1103*4703203dSis 1104*4703203dSis /* 1105*4703203dSis * If this could be a start of Hangul Jamos, then, we try to 1106*4703203dSis * conjoin them. 1107*4703203dSis */ 1108*4703203dSis if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { 1109*4703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], 1110*4703203dSis s[start[i] + 1], s[start[i] + 2]); 1111*4703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], 1112*4703203dSis s[start[i] + 4], s[start[i] + 5]); 1113*4703203dSis 1114*4703203dSis if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { 1115*4703203dSis u1 -= U8_HANGUL_JAMO_L_FIRST; 1116*4703203dSis u2 -= U8_HANGUL_JAMO_V_FIRST; 1117*4703203dSis u1 = U8_HANGUL_SYL_FIRST + 1118*4703203dSis (u1 * U8_HANGUL_V_COUNT + u2) * 1119*4703203dSis U8_HANGUL_T_COUNT; 1120*4703203dSis 1121*4703203dSis i += 2; 1122*4703203dSis if (i <= last) { 1123*4703203dSis U8_PUT_3BYTES_INTO_UTF32(u2, 1124*4703203dSis s[start[i]], s[start[i] + 1], 1125*4703203dSis s[start[i] + 2]); 1126*4703203dSis 1127*4703203dSis if (U8_HANGUL_JAMO_T(u2)) { 1128*4703203dSis u1 += u2 - 1129*4703203dSis U8_HANGUL_JAMO_T_FIRST; 1130*4703203dSis i++; 1131*4703203dSis } 1132*4703203dSis } 1133*4703203dSis 1134*4703203dSis U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); 1135*4703203dSis i--; 1136*4703203dSis l += 3; 1137*4703203dSis continue; 1138*4703203dSis } 1139*4703203dSis } 1140*4703203dSis 1141*4703203dSis /* 1142*4703203dSis * Let's then find out if this Starter has composition 1143*4703203dSis * mapping. 
1144*4703203dSis */ 1145*4703203dSis p = find_composition_start(uv, s + start[i], disp[i]); 1146*4703203dSis if (p == NULL) 1147*4703203dSis goto SAVE_THE_CHAR; 1148*4703203dSis 1149*4703203dSis /* 1150*4703203dSis * We have a Starter with composition mapping and the next 1151*4703203dSis * character is a non-Starter. Let's try to find out if 1152*4703203dSis * we can do composition. 1153*4703203dSis */ 1154*4703203dSis 1155*4703203dSis saved_p = p; 1156*4703203dSis saved_i = i; 1157*4703203dSis saved_l = l; 1158*4703203dSis saved_marks_count = 0; 1159*4703203dSis 1160*4703203dSis TRY_THE_NEXT_MARK: 1161*4703203dSis q = s + start[++i]; 1162*4703203dSis size = disp[i]; 1163*4703203dSis 1164*4703203dSis /* 1165*4703203dSis * The next for() loop compares the non-Starter pointed by 1166*4703203dSis * 'q' with the possible (joinable) characters pointed by 'p'. 1167*4703203dSis * 1168*4703203dSis * The composition final table entry pointed by the 'p' 1169*4703203dSis * looks like the following: 1170*4703203dSis * 1171*4703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1172*4703203dSis * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | 1173*4703203dSis * +---+---+---+-...-+---+---+---+---+-...-+---+---+ 1174*4703203dSis * 1175*4703203dSis * where C is the count byte indicating the number of 1176*4703203dSis * mapping pairs where each pair would be look like 1177*4703203dSis * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second 1178*4703203dSis * character of a canonical decomposition and the B0-Bm are 1179*4703203dSis * the bytes of a matching composite character. The F is 1180*4703203dSis * a filler byte after each character as the separator. 1181*4703203dSis */ 1182*4703203dSis 1183*4703203dSis match_not_found = B_TRUE; 1184*4703203dSis 1185*4703203dSis for (C = *p++; C > 0; C--) { 1186*4703203dSis for (k = 0; k < size; p++, k++) 1187*4703203dSis if (*p != q[k]) 1188*4703203dSis break; 1189*4703203dSis 1190*4703203dSis /* Have we found it? 
*/ 1191*4703203dSis if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { 1192*4703203dSis match_not_found = B_FALSE; 1193*4703203dSis 1194*4703203dSis l = saved_l; 1195*4703203dSis 1196*4703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 1197*4703203dSis t[l++] = *p; 1198*4703203dSis 1199*4703203dSis break; 1200*4703203dSis } 1201*4703203dSis 1202*4703203dSis /* We didn't find; skip to the next pair. */ 1203*4703203dSis if (*p != U8_TBL_ELEMENT_FILLER) 1204*4703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 1205*4703203dSis ; 1206*4703203dSis while (*++p != U8_TBL_ELEMENT_FILLER) 1207*4703203dSis ; 1208*4703203dSis p++; 1209*4703203dSis } 1210*4703203dSis 1211*4703203dSis /* 1212*4703203dSis * If there was no match, we will need to save the combining 1213*4703203dSis * mark for later appending. After that, if the next one 1214*4703203dSis * is a non-Starter and not blocked, then, we try once 1215*4703203dSis * again to do composition with the next non-Starter. 1216*4703203dSis * 1217*4703203dSis * If there was no match and this was a Starter, then, 1218*4703203dSis * this is a new start. 1219*4703203dSis * 1220*4703203dSis * If there was a match and a composition done and we have 1221*4703203dSis * more to check on, then, we retrieve a new composition final 1222*4703203dSis * table entry for the composite and then try to do the 1223*4703203dSis * composition again. 
1224*4703203dSis */ 1225*4703203dSis 1226*4703203dSis if (match_not_found) { 1227*4703203dSis if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { 1228*4703203dSis i--; 1229*4703203dSis goto SAVE_THE_CHAR; 1230*4703203dSis } 1231*4703203dSis 1232*4703203dSis saved_marks[saved_marks_count++] = i; 1233*4703203dSis } 1234*4703203dSis 1235*4703203dSis if (saved_l == l) { 1236*4703203dSis while (i < last) { 1237*4703203dSis if (blocked(comb_class, i + 1)) 1238*4703203dSis saved_marks[saved_marks_count++] = ++i; 1239*4703203dSis else 1240*4703203dSis break; 1241*4703203dSis } 1242*4703203dSis if (i < last) { 1243*4703203dSis p = saved_p; 1244*4703203dSis goto TRY_THE_NEXT_MARK; 1245*4703203dSis } 1246*4703203dSis } else if (i < last) { 1247*4703203dSis p = find_composition_start(uv, t + saved_l, 1248*4703203dSis l - saved_l); 1249*4703203dSis if (p != NULL) { 1250*4703203dSis saved_p = p; 1251*4703203dSis goto TRY_THE_NEXT_MARK; 1252*4703203dSis } 1253*4703203dSis } 1254*4703203dSis 1255*4703203dSis /* 1256*4703203dSis * There is no more composition possible. 1257*4703203dSis * 1258*4703203dSis * If there was no composition what so ever then we copy 1259*4703203dSis * over the original Starter and then append any non-Starters 1260*4703203dSis * remaining at the target string sequentially after that. 
1261*4703203dSis */ 1262*4703203dSis 1263*4703203dSis if (saved_l == l) { 1264*4703203dSis p = s + start[saved_i]; 1265*4703203dSis size = disp[saved_i]; 1266*4703203dSis for (j = 0; j < size; j++) 1267*4703203dSis t[l++] = *p++; 1268*4703203dSis } 1269*4703203dSis 1270*4703203dSis for (k = 0; k < saved_marks_count; k++) { 1271*4703203dSis p = s + start[saved_marks[k]]; 1272*4703203dSis size = disp[saved_marks[k]]; 1273*4703203dSis for (j = 0; j < size; j++) 1274*4703203dSis t[l++] = *p++; 1275*4703203dSis } 1276*4703203dSis } 1277*4703203dSis 1278*4703203dSis /* 1279*4703203dSis * If the last character is a Starter and if we have a character 1280*4703203dSis * (possibly another Starter) that can be turned into a composite, 1281*4703203dSis * we do so and we do so until there is no more of composition 1282*4703203dSis * possible. 1283*4703203dSis */ 1284*4703203dSis if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { 1285*4703203dSis p = *os; 1286*4703203dSis saved_l = l - disp[last]; 1287*4703203dSis 1288*4703203dSis while (p < oslast) { 1289*4703203dSis size = u8_number_of_bytes[*p]; 1290*4703203dSis if (size <= 1 || (p + size) > oslast) 1291*4703203dSis break; 1292*4703203dSis 1293*4703203dSis saved_p = p; 1294*4703203dSis 1295*4703203dSis for (i = 0; i < size; i++) 1296*4703203dSis tc[i] = *p++; 1297*4703203dSis 1298*4703203dSis q = find_composition_start(uv, t + saved_l, 1299*4703203dSis l - saved_l); 1300*4703203dSis if (q == NULL) { 1301*4703203dSis p = saved_p; 1302*4703203dSis break; 1303*4703203dSis } 1304*4703203dSis 1305*4703203dSis match_not_found = B_TRUE; 1306*4703203dSis 1307*4703203dSis for (C = *q++; C > 0; C--) { 1308*4703203dSis for (k = 0; k < size; q++, k++) 1309*4703203dSis if (*q != tc[k]) 1310*4703203dSis break; 1311*4703203dSis 1312*4703203dSis if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { 1313*4703203dSis match_not_found = B_FALSE; 1314*4703203dSis 1315*4703203dSis l = saved_l; 1316*4703203dSis 1317*4703203dSis while (*++q != 
U8_TBL_ELEMENT_FILLER) { 1318*4703203dSis /* 1319*4703203dSis * This is practically 1320*4703203dSis * impossible but we don't 1321*4703203dSis * want to take any chances. 1322*4703203dSis */ 1323*4703203dSis if (l >= 1324*4703203dSis U8_STREAM_SAFE_TEXT_MAX) { 1325*4703203dSis p = saved_p; 1326*4703203dSis goto SAFE_RETURN; 1327*4703203dSis } 1328*4703203dSis t[l++] = *q; 1329*4703203dSis } 1330*4703203dSis 1331*4703203dSis break; 1332*4703203dSis } 1333*4703203dSis 1334*4703203dSis if (*q != U8_TBL_ELEMENT_FILLER) 1335*4703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) 1336*4703203dSis ; 1337*4703203dSis while (*++q != U8_TBL_ELEMENT_FILLER) 1338*4703203dSis ; 1339*4703203dSis q++; 1340*4703203dSis } 1341*4703203dSis 1342*4703203dSis if (match_not_found) { 1343*4703203dSis p = saved_p; 1344*4703203dSis break; 1345*4703203dSis } 1346*4703203dSis } 1347*4703203dSis SAFE_RETURN: 1348*4703203dSis *os = p; 1349*4703203dSis } 1350*4703203dSis 1351*4703203dSis /* 1352*4703203dSis * Now we copy over the temporary string to the target string. 1353*4703203dSis * Since composition always reduces the number of characters or 1354*4703203dSis * the number of characters stay, we don't need to worry about 1355*4703203dSis * the buffer overflow here. 1356*4703203dSis */ 1357*4703203dSis for (i = 0; i < l; i++) 1358*4703203dSis s[i] = t[i]; 1359*4703203dSis s[l] = '\0'; 1360*4703203dSis 1361*4703203dSis return (l); 1362*4703203dSis } 1363*4703203dSis 1364*4703203dSis /* 1365*4703203dSis * The collect_a_seq() function checks on the given string s, collect 1366*4703203dSis * a sequence of characters at u8s, and return the sequence. While it collects 1367*4703203dSis * a sequence, it also applies case conversion, canonical or compatibility 1368*4703203dSis * decomposition, canonical decomposition, or some or all of them and 1369*4703203dSis * in that order. 
1370*4703203dSis * 1371*4703203dSis * The collected sequence cannot be bigger than 32 characters since if 1372*4703203dSis * it is having more than 31 characters, the sequence will be terminated 1373*4703203dSis * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into 1374*4703203dSis * a Stream-Safe Text. The collected sequence is always terminated with 1375*4703203dSis * a null byte and the return value is the byte length of the sequence 1376*4703203dSis * including 0. The return value does not include the terminating 1377*4703203dSis * null byte. 1378*4703203dSis */ 1379*4703203dSis static size_t 1380*4703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, 1381*4703203dSis boolean_t is_it_toupper, 1382*4703203dSis boolean_t is_it_tolower, 1383*4703203dSis boolean_t canonical_decomposition, 1384*4703203dSis boolean_t compatibility_decomposition, 1385*4703203dSis boolean_t canonical_composition, 1386*4703203dSis int *errno, u8_normalization_states_t *state) 1387*4703203dSis { 1388*4703203dSis uchar_t *s; 1389*4703203dSis int sz; 1390*4703203dSis int saved_sz; 1391*4703203dSis size_t i; 1392*4703203dSis size_t j; 1393*4703203dSis size_t k; 1394*4703203dSis size_t l; 1395*4703203dSis uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; 1396*4703203dSis uchar_t disp[U8_MAX_CHARS_A_SEQ]; 1397*4703203dSis uchar_t start[U8_MAX_CHARS_A_SEQ]; 1398*4703203dSis uchar_t u8t[U8_MB_CUR_MAX]; 1399*4703203dSis uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; 1400*4703203dSis uchar_t tc; 1401*4703203dSis size_t last; 1402*4703203dSis size_t saved_last; 1403*4703203dSis uint32_t u1; 1404*4703203dSis 1405*4703203dSis /* 1406*4703203dSis * Save the source string pointer which we will return a changed 1407*4703203dSis * pointer if we do processing. 
1408*4703203dSis */ 1409*4703203dSis s = *source; 1410*4703203dSis 1411*4703203dSis /* 1412*4703203dSis * The following is a fallback for just in case callers are not 1413*4703203dSis * checking the string boundaries before the calling. 1414*4703203dSis */ 1415*4703203dSis if (s >= slast) { 1416*4703203dSis u8s[0] = '\0'; 1417*4703203dSis 1418*4703203dSis return (0); 1419*4703203dSis } 1420*4703203dSis 1421*4703203dSis /* 1422*4703203dSis * As the first thing, let's collect a character and do case 1423*4703203dSis * conversion if necessary. 1424*4703203dSis */ 1425*4703203dSis 1426*4703203dSis sz = u8_number_of_bytes[*s]; 1427*4703203dSis 1428*4703203dSis if (sz < 0) { 1429*4703203dSis *errno = EILSEQ; 1430*4703203dSis 1431*4703203dSis u8s[0] = *s++; 1432*4703203dSis u8s[1] = '\0'; 1433*4703203dSis 1434*4703203dSis *source = s; 1435*4703203dSis 1436*4703203dSis return (1); 1437*4703203dSis } 1438*4703203dSis 1439*4703203dSis if (sz == 1) { 1440*4703203dSis if (is_it_toupper) 1441*4703203dSis u8s[0] = U8_ASCII_TOUPPER(*s); 1442*4703203dSis else if (is_it_tolower) 1443*4703203dSis u8s[0] = U8_ASCII_TOLOWER(*s); 1444*4703203dSis else 1445*4703203dSis u8s[0] = *s; 1446*4703203dSis s++; 1447*4703203dSis u8s[1] = '\0'; 1448*4703203dSis } else if ((s + sz) > slast) { 1449*4703203dSis *errno = EINVAL; 1450*4703203dSis 1451*4703203dSis for (i = 0; s < slast; ) 1452*4703203dSis u8s[i++] = *s++; 1453*4703203dSis u8s[i] = '\0'; 1454*4703203dSis 1455*4703203dSis *source = s; 1456*4703203dSis 1457*4703203dSis return (i); 1458*4703203dSis } else { 1459*4703203dSis if (is_it_toupper || is_it_tolower) { 1460*4703203dSis i = do_case_conv(uv, u8s, s, sz, is_it_toupper); 1461*4703203dSis s += sz; 1462*4703203dSis sz = i; 1463*4703203dSis } else { 1464*4703203dSis for (i = 0; i < sz; ) 1465*4703203dSis u8s[i++] = *s++; 1466*4703203dSis u8s[i] = '\0'; 1467*4703203dSis } 1468*4703203dSis } 1469*4703203dSis 1470*4703203dSis /* 1471*4703203dSis * And then canonical/compatibility 
decomposition followed by 1472*4703203dSis * an optional canonical composition. Please be noted that 1473*4703203dSis * canonical composition is done only when a decomposition is 1474*4703203dSis * done. 1475*4703203dSis */ 1476*4703203dSis if (canonical_decomposition || compatibility_decomposition) { 1477*4703203dSis if (sz == 1) { 1478*4703203dSis *state = U8_STATE_START; 1479*4703203dSis 1480*4703203dSis saved_sz = 1; 1481*4703203dSis 1482*4703203dSis comb_class[0] = 0; 1483*4703203dSis start[0] = 0; 1484*4703203dSis disp[0] = 1; 1485*4703203dSis 1486*4703203dSis last = 1; 1487*4703203dSis } else { 1488*4703203dSis saved_sz = do_decomp(uv, u8s, u8s, sz, 1489*4703203dSis canonical_decomposition, state); 1490*4703203dSis 1491*4703203dSis last = 0; 1492*4703203dSis 1493*4703203dSis for (i = 0; i < saved_sz; ) { 1494*4703203dSis sz = u8_number_of_bytes[u8s[i]]; 1495*4703203dSis 1496*4703203dSis comb_class[last] = combining_class(uv, 1497*4703203dSis u8s + i, sz); 1498*4703203dSis start[last] = i; 1499*4703203dSis disp[last] = sz; 1500*4703203dSis 1501*4703203dSis last++; 1502*4703203dSis i += sz; 1503*4703203dSis } 1504*4703203dSis 1505*4703203dSis /* 1506*4703203dSis * Decomposition yields various Hangul related 1507*4703203dSis * states but not on combining marks. We need to 1508*4703203dSis * find out at here by checking on the last 1509*4703203dSis * character. 
1510*4703203dSis */ 1511*4703203dSis if (*state == U8_STATE_START) { 1512*4703203dSis if (comb_class[last - 1]) 1513*4703203dSis *state = U8_STATE_COMBINING_MARK; 1514*4703203dSis } 1515*4703203dSis } 1516*4703203dSis 1517*4703203dSis saved_last = last; 1518*4703203dSis 1519*4703203dSis while (s < slast) { 1520*4703203dSis sz = u8_number_of_bytes[*s]; 1521*4703203dSis 1522*4703203dSis /* 1523*4703203dSis * If this is an illegal character, an incomplete 1524*4703203dSis * character, or an 7-bit ASCII Starter character, 1525*4703203dSis * then we have collected a sequence; break and let 1526*4703203dSis * the next call deal with the two cases. 1527*4703203dSis * 1528*4703203dSis * Note that this is okay only if you are using this 1529*4703203dSis * function with a fixed length string, not on 1530*4703203dSis * a buffer with multiple calls of one chunk at a time. 1531*4703203dSis */ 1532*4703203dSis if (sz <= 1) { 1533*4703203dSis break; 1534*4703203dSis } else if ((s + sz) > slast) { 1535*4703203dSis break; 1536*4703203dSis } else { 1537*4703203dSis /* 1538*4703203dSis * If the previous character was a Hangul Jamo 1539*4703203dSis * and this character is a Hangul Jamo that 1540*4703203dSis * can be conjoined, we collect the Jamo. 
1541*4703203dSis */ 1542*4703203dSis if (*s == U8_HANGUL_JAMO_1ST_BYTE) { 1543*4703203dSis U8_PUT_3BYTES_INTO_UTF32(u1, 1544*4703203dSis *s, *(s + 1), *(s + 2)); 1545*4703203dSis 1546*4703203dSis if (U8_HANGUL_COMPOSABLE_L_V(*state, 1547*4703203dSis u1)) { 1548*4703203dSis i = 0; 1549*4703203dSis *state = U8_STATE_HANGUL_LV; 1550*4703203dSis goto COLLECT_A_HANGUL; 1551*4703203dSis } 1552*4703203dSis 1553*4703203dSis if (U8_HANGUL_COMPOSABLE_LV_T(*state, 1554*4703203dSis u1)) { 1555*4703203dSis i = 0; 1556*4703203dSis *state = U8_STATE_HANGUL_LVT; 1557*4703203dSis goto COLLECT_A_HANGUL; 1558*4703203dSis } 1559*4703203dSis } 1560*4703203dSis 1561*4703203dSis /* 1562*4703203dSis * Regardless of whatever it was, if this is 1563*4703203dSis * a Starter, we don't collect the character 1564*4703203dSis * since that's a new start and we will deal 1565*4703203dSis * with it at the next time. 1566*4703203dSis */ 1567*4703203dSis i = combining_class(uv, s, sz); 1568*4703203dSis if (i == U8_COMBINING_CLASS_STARTER) 1569*4703203dSis break; 1570*4703203dSis 1571*4703203dSis /* 1572*4703203dSis * We know the current character is a combining 1573*4703203dSis * mark. If the previous character wasn't 1574*4703203dSis * a Starter (not Hangul) or a combining mark, 1575*4703203dSis * then, we don't collect this combining mark. 1576*4703203dSis */ 1577*4703203dSis if (*state != U8_STATE_START && 1578*4703203dSis *state != U8_STATE_COMBINING_MARK) 1579*4703203dSis break; 1580*4703203dSis 1581*4703203dSis *state = U8_STATE_COMBINING_MARK; 1582*4703203dSis COLLECT_A_HANGUL: 1583*4703203dSis /* 1584*4703203dSis * If we collected a Starter and combining 1585*4703203dSis * marks up to 30, i.e., total 31 characters, 1586*4703203dSis * then, we terminate this degenerately long 1587*4703203dSis * combining sequence with a U+034F COMBINING 1588*4703203dSis * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in 1589*4703203dSis * UTF-8 and turn this into a Stream-Safe 1590*4703203dSis * Text. 
This will be extremely rare but 1591*4703203dSis * possible. 1592*4703203dSis * 1593*4703203dSis * The following will also guarantee that 1594*4703203dSis * we are not writing more than 32 characters 1595*4703203dSis * plus a NULL at u8s[]. 1596*4703203dSis */ 1597*4703203dSis if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { 1598*4703203dSis TURN_STREAM_SAFE: 1599*4703203dSis *state = U8_STATE_START; 1600*4703203dSis comb_class[last] = 0; 1601*4703203dSis start[last] = saved_sz; 1602*4703203dSis disp[last] = 2; 1603*4703203dSis last++; 1604*4703203dSis 1605*4703203dSis u8s[saved_sz++] = 0xCD; 1606*4703203dSis u8s[saved_sz++] = 0x8F; 1607*4703203dSis 1608*4703203dSis break; 1609*4703203dSis } 1610*4703203dSis 1611*4703203dSis /* 1612*4703203dSis * Some combining marks also do decompose into 1613*4703203dSis * another combining mark or marks. 1614*4703203dSis */ 1615*4703203dSis if (*state == U8_STATE_COMBINING_MARK) { 1616*4703203dSis k = last; 1617*4703203dSis l = sz; 1618*4703203dSis i = do_decomp(uv, uts, s, sz, 1619*4703203dSis canonical_decomposition, state); 1620*4703203dSis for (j = 0; j < i; ) { 1621*4703203dSis sz = u8_number_of_bytes[uts[j]]; 1622*4703203dSis 1623*4703203dSis comb_class[last] = 1624*4703203dSis combining_class(uv, 1625*4703203dSis uts + j, sz); 1626*4703203dSis start[last] = saved_sz + j; 1627*4703203dSis disp[last] = sz; 1628*4703203dSis 1629*4703203dSis last++; 1630*4703203dSis if (last >= 1631*4703203dSis U8_UPPER_LIMIT_IN_A_SEQ) { 1632*4703203dSis last = k; 1633*4703203dSis goto TURN_STREAM_SAFE; 1634*4703203dSis } 1635*4703203dSis j += sz; 1636*4703203dSis } 1637*4703203dSis 1638*4703203dSis *state = U8_STATE_COMBINING_MARK; 1639*4703203dSis sz = i; 1640*4703203dSis s += l; 1641*4703203dSis 1642*4703203dSis for (i = 0; i < sz; i++) 1643*4703203dSis u8s[saved_sz++] = uts[i]; 1644*4703203dSis } else { 1645*4703203dSis comb_class[last] = i; 1646*4703203dSis start[last] = saved_sz; 1647*4703203dSis disp[last] = sz; 1648*4703203dSis last++; 
1649*4703203dSis 1650*4703203dSis for (i = 0; i < sz; i++) 1651*4703203dSis u8s[saved_sz++] = *s++; 1652*4703203dSis } 1653*4703203dSis 1654*4703203dSis /* 1655*4703203dSis * If this is U+0345 COMBINING GREEK 1656*4703203dSis * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., 1657*4703203dSis * iota subscript, and need to be converted to 1658*4703203dSis * uppercase letter, convert it to U+0399 GREEK 1659*4703203dSis * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), 1660*4703203dSis * i.e., convert to capital adscript form as 1661*4703203dSis * specified in the Unicode standard. 1662*4703203dSis * 1663*4703203dSis * This is the only special case of (ambiguous) 1664*4703203dSis * case conversion at combining marks and 1665*4703203dSis * probably the standard will never have 1666*4703203dSis * anything similar like this in future. 1667*4703203dSis */ 1668*4703203dSis if (is_it_toupper && sz >= 2 && 1669*4703203dSis u8s[saved_sz - 2] == 0xCD && 1670*4703203dSis u8s[saved_sz - 1] == 0x85) { 1671*4703203dSis u8s[saved_sz - 2] = 0xCE; 1672*4703203dSis u8s[saved_sz - 1] = 0x99; 1673*4703203dSis } 1674*4703203dSis } 1675*4703203dSis } 1676*4703203dSis 1677*4703203dSis /* 1678*4703203dSis * Let's try to ensure a canonical ordering for the collected 1679*4703203dSis * combining marks. We do this only if we have collected 1680*4703203dSis * at least one more non-Starter. (The decomposition mapping 1681*4703203dSis * data tables have fully (and recursively) expanded and 1682*4703203dSis * canonically ordered decompositions.) 1683*4703203dSis * 1684*4703203dSis * The U8_SWAP_COMB_MARKS() convenience macro has some 1685*4703203dSis * assumptions and we are meeting the assumptions. 
1686*4703203dSis */ 1687*4703203dSis last--; 1688*4703203dSis if (last >= saved_last) { 1689*4703203dSis for (i = 0; i < last; i++) 1690*4703203dSis for (j = last; j > i; j--) 1691*4703203dSis if (comb_class[j] && 1692*4703203dSis comb_class[j - 1] > comb_class[j]) { 1693*4703203dSis U8_SWAP_COMB_MARKS(j - 1, j); 1694*4703203dSis } 1695*4703203dSis } 1696*4703203dSis 1697*4703203dSis *source = s; 1698*4703203dSis 1699*4703203dSis if (! canonical_composition) { 1700*4703203dSis u8s[saved_sz] = '\0'; 1701*4703203dSis return (saved_sz); 1702*4703203dSis } 1703*4703203dSis 1704*4703203dSis /* 1705*4703203dSis * Now do the canonical composition. Note that we do this 1706*4703203dSis * only after a canonical or compatibility decomposition to 1707*4703203dSis * finish up NFC or NFKC. 1708*4703203dSis */ 1709*4703203dSis sz = do_composition(uv, u8s, comb_class, start, disp, last, 1710*4703203dSis &s, slast); 1711*4703203dSis } 1712*4703203dSis 1713*4703203dSis *source = s; 1714*4703203dSis 1715*4703203dSis return ((size_t)sz); 1716*4703203dSis } 1717*4703203dSis 1718*4703203dSis /* 1719*4703203dSis * The do_norm_compare() function does string comparion based on Unicode 1720*4703203dSis * simple case mappings and Unicode Normalization definitions. 1721*4703203dSis * 1722*4703203dSis * It does so by collecting a sequence of character at a time and comparing 1723*4703203dSis * the collected sequences from the strings. 1724*4703203dSis * 1725*4703203dSis * The meanings on the return values are the same as the usual strcmp(). 
1726*4703203dSis */ 1727*4703203dSis static int 1728*4703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, 1729*4703203dSis int flag, int *errno) 1730*4703203dSis { 1731*4703203dSis int result; 1732*4703203dSis size_t sz1; 1733*4703203dSis size_t sz2; 1734*4703203dSis uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; 1735*4703203dSis uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; 1736*4703203dSis uchar_t *s1last; 1737*4703203dSis uchar_t *s2last; 1738*4703203dSis boolean_t is_it_toupper; 1739*4703203dSis boolean_t is_it_tolower; 1740*4703203dSis boolean_t canonical_decomposition; 1741*4703203dSis boolean_t compatibility_decomposition; 1742*4703203dSis boolean_t canonical_composition; 1743*4703203dSis u8_normalization_states_t state; 1744*4703203dSis 1745*4703203dSis s1last = s1 + n1; 1746*4703203dSis s2last = s2 + n2; 1747*4703203dSis 1748*4703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 1749*4703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 1750*4703203dSis canonical_decomposition = flag & U8_CANON_DECOMP; 1751*4703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP; 1752*4703203dSis canonical_composition = flag & U8_CANON_COMP; 1753*4703203dSis 1754*4703203dSis while (s1 < s1last && s2 < s2last) { 1755*4703203dSis /* 1756*4703203dSis * If the current character is a 7-bit ASCII and the last 1757*4703203dSis * character, or, if the current character and the next 1758*4703203dSis * character are both some 7-bit ASCII characters then 1759*4703203dSis * we treat the current character as a sequence. 1760*4703203dSis * 1761*4703203dSis * In any other cases, we need to call collect_a_seq(). 
1762*4703203dSis */ 1763*4703203dSis 1764*4703203dSis if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || 1765*4703203dSis ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { 1766*4703203dSis if (is_it_toupper) 1767*4703203dSis u8s1[0] = U8_ASCII_TOUPPER(*s1); 1768*4703203dSis else if (is_it_tolower) 1769*4703203dSis u8s1[0] = U8_ASCII_TOLOWER(*s1); 1770*4703203dSis else 1771*4703203dSis u8s1[0] = *s1; 1772*4703203dSis u8s1[1] = '\0'; 1773*4703203dSis sz1 = 1; 1774*4703203dSis s1++; 1775*4703203dSis } else { 1776*4703203dSis state = U8_STATE_START; 1777*4703203dSis sz1 = collect_a_seq(uv, u8s1, &s1, s1last, 1778*4703203dSis is_it_toupper, is_it_tolower, 1779*4703203dSis canonical_decomposition, 1780*4703203dSis compatibility_decomposition, 1781*4703203dSis canonical_composition, errno, &state); 1782*4703203dSis } 1783*4703203dSis 1784*4703203dSis if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || 1785*4703203dSis ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { 1786*4703203dSis if (is_it_toupper) 1787*4703203dSis u8s2[0] = U8_ASCII_TOUPPER(*s2); 1788*4703203dSis else if (is_it_tolower) 1789*4703203dSis u8s2[0] = U8_ASCII_TOLOWER(*s2); 1790*4703203dSis else 1791*4703203dSis u8s2[0] = *s2; 1792*4703203dSis u8s2[1] = '\0'; 1793*4703203dSis sz2 = 1; 1794*4703203dSis s2++; 1795*4703203dSis } else { 1796*4703203dSis state = U8_STATE_START; 1797*4703203dSis sz2 = collect_a_seq(uv, u8s2, &s2, s2last, 1798*4703203dSis is_it_toupper, is_it_tolower, 1799*4703203dSis canonical_decomposition, 1800*4703203dSis compatibility_decomposition, 1801*4703203dSis canonical_composition, errno, &state); 1802*4703203dSis } 1803*4703203dSis 1804*4703203dSis /* 1805*4703203dSis * Now compare the two characters. If they are the same, 1806*4703203dSis * we move on to the next character sequences. 
1807*4703203dSis */ 1808*4703203dSis if (sz1 == 1 && sz2 == 1) { 1809*4703203dSis if (*u8s1 > *u8s2) 1810*4703203dSis return (1); 1811*4703203dSis if (*u8s1 < *u8s2) 1812*4703203dSis return (-1); 1813*4703203dSis } else { 1814*4703203dSis result = strcmp((const char *)u8s1, (const char *)u8s2); 1815*4703203dSis if (result != 0) 1816*4703203dSis return (result); 1817*4703203dSis } 1818*4703203dSis } 1819*4703203dSis 1820*4703203dSis /* 1821*4703203dSis * We compared until the end of either or both strings. 1822*4703203dSis * 1823*4703203dSis * If we reached to or went over the ends for the both, that means 1824*4703203dSis * they are the same. 1825*4703203dSis * 1826*4703203dSis * If we reached only one end, that means the other string has 1827*4703203dSis * something which then can be used to determine the return value. 1828*4703203dSis */ 1829*4703203dSis if (s1 >= s1last) { 1830*4703203dSis if (s2 >= s2last) 1831*4703203dSis return (0); 1832*4703203dSis return (-1); 1833*4703203dSis } 1834*4703203dSis return (1); 1835*4703203dSis } 1836*4703203dSis 1837*4703203dSis /* 1838*4703203dSis * The u8_strcmp() function compares two UTF-8 strings quite similar to 1839*4703203dSis * the strcmp(). For the comparison, however, Unicode Normalization specific 1840*4703203dSis * equivalency and Unicode simple case conversion mappings based equivalency 1841*4703203dSis * can be requested and checked against. 1842*4703203dSis */ 1843*4703203dSis int 1844*4703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, 1845*4703203dSis int *errno) 1846*4703203dSis { 1847*4703203dSis int f; 1848*4703203dSis size_t n1; 1849*4703203dSis size_t n2; 1850*4703203dSis 1851*4703203dSis *errno = 0; 1852*4703203dSis 1853*4703203dSis /* 1854*4703203dSis * Check on the requested Unicode version, case conversion, and 1855*4703203dSis * normalization flag values. 
1856*4703203dSis */ 1857*4703203dSis 1858*4703203dSis if (uv > U8_UNICODE_LATEST) { 1859*4703203dSis *errno = ERANGE; 1860*4703203dSis uv = U8_UNICODE_LATEST; 1861*4703203dSis } 1862*4703203dSis 1863*4703203dSis if (flag == 0) { 1864*4703203dSis flag = U8_STRCMP_CS; 1865*4703203dSis } else { 1866*4703203dSis f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | 1867*4703203dSis U8_STRCMP_CI_LOWER); 1868*4703203dSis if (f == 0) { 1869*4703203dSis flag |= U8_STRCMP_CS; 1870*4703203dSis } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && 1871*4703203dSis f != U8_STRCMP_CI_LOWER) { 1872*4703203dSis *errno = EBADF; 1873*4703203dSis flag = U8_STRCMP_CS; 1874*4703203dSis } 1875*4703203dSis 1876*4703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 1877*4703203dSis if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && 1878*4703203dSis f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { 1879*4703203dSis *errno = EBADF; 1880*4703203dSis flag = U8_STRCMP_CS; 1881*4703203dSis } 1882*4703203dSis } 1883*4703203dSis 1884*4703203dSis if (flag == U8_STRCMP_CS) { 1885*4703203dSis return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); 1886*4703203dSis } 1887*4703203dSis 1888*4703203dSis n1 = strlen(s1); 1889*4703203dSis n2 = strlen(s2); 1890*4703203dSis if (n != 0) { 1891*4703203dSis if (n < n1) 1892*4703203dSis n1 = n; 1893*4703203dSis if (n < n2) 1894*4703203dSis n2 = n; 1895*4703203dSis } 1896*4703203dSis 1897*4703203dSis /* 1898*4703203dSis * Simple case conversion can be done much faster and so we do 1899*4703203dSis * them separately here. 
1900*4703203dSis */ 1901*4703203dSis if (flag == U8_STRCMP_CI_UPPER) { 1902*4703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1903*4703203dSis n1, n2, B_TRUE, errno)); 1904*4703203dSis } else if (flag == U8_STRCMP_CI_LOWER) { 1905*4703203dSis return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, 1906*4703203dSis n1, n2, B_FALSE, errno)); 1907*4703203dSis } 1908*4703203dSis 1909*4703203dSis return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, 1910*4703203dSis flag, errno)); 1911*4703203dSis } 1912*4703203dSis 1913*4703203dSis size_t 1914*4703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, 1915*4703203dSis int flag, size_t unicode_version, int *errno) 1916*4703203dSis { 1917*4703203dSis int f; 1918*4703203dSis int sz; 1919*4703203dSis uchar_t *ib; 1920*4703203dSis uchar_t *ibtail; 1921*4703203dSis uchar_t *ob; 1922*4703203dSis uchar_t *obtail; 1923*4703203dSis boolean_t do_not_ignore_null; 1924*4703203dSis boolean_t do_not_ignore_invalid; 1925*4703203dSis boolean_t is_it_toupper; 1926*4703203dSis boolean_t is_it_tolower; 1927*4703203dSis boolean_t canonical_decomposition; 1928*4703203dSis boolean_t compatibility_decomposition; 1929*4703203dSis boolean_t canonical_composition; 1930*4703203dSis size_t ret_val; 1931*4703203dSis size_t i; 1932*4703203dSis size_t j; 1933*4703203dSis uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; 1934*4703203dSis u8_normalization_states_t state; 1935*4703203dSis 1936*4703203dSis if (unicode_version > U8_UNICODE_LATEST) { 1937*4703203dSis *errno = ERANGE; 1938*4703203dSis return ((size_t)-1); 1939*4703203dSis } 1940*4703203dSis 1941*4703203dSis f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); 1942*4703203dSis if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { 1943*4703203dSis *errno = EBADF; 1944*4703203dSis return ((size_t)-1); 1945*4703203dSis } 1946*4703203dSis 1947*4703203dSis f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); 
1948*4703203dSis if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && 1949*4703203dSis f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { 1950*4703203dSis *errno = EBADF; 1951*4703203dSis return ((size_t)-1); 1952*4703203dSis } 1953*4703203dSis 1954*4703203dSis if (inarray == NULL || *inlen == 0) 1955*4703203dSis return (0); 1956*4703203dSis 1957*4703203dSis if (outarray == NULL) { 1958*4703203dSis *errno = E2BIG; 1959*4703203dSis return ((size_t)-1); 1960*4703203dSis } 1961*4703203dSis 1962*4703203dSis ib = (uchar_t *)inarray; 1963*4703203dSis ob = (uchar_t *)outarray; 1964*4703203dSis ibtail = ib + *inlen; 1965*4703203dSis obtail = ob + *outlen; 1966*4703203dSis 1967*4703203dSis do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); 1968*4703203dSis do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); 1969*4703203dSis is_it_toupper = flag & U8_TEXTPREP_TOUPPER; 1970*4703203dSis is_it_tolower = flag & U8_TEXTPREP_TOLOWER; 1971*4703203dSis 1972*4703203dSis ret_val = 0; 1973*4703203dSis 1974*4703203dSis /* 1975*4703203dSis * If we don't have a normalization flag set, we do the simple case 1976*4703203dSis * conversion based text preparation separately below. Text 1977*4703203dSis * preparation involving Normalization will be done in the false task 1978*4703203dSis * block, again, separately since it will take much more time and 1979*4703203dSis * resource than doing simple case conversions. 
1980*4703203dSis */ 1981*4703203dSis if (f == 0) { 1982*4703203dSis while (ib < ibtail) { 1983*4703203dSis if (*ib == '\0' && do_not_ignore_null) 1984*4703203dSis break; 1985*4703203dSis 1986*4703203dSis sz = u8_number_of_bytes[*ib]; 1987*4703203dSis 1988*4703203dSis if (sz < 0) { 1989*4703203dSis if (do_not_ignore_invalid) { 1990*4703203dSis *errno = EILSEQ; 1991*4703203dSis ret_val = (size_t)-1; 1992*4703203dSis break; 1993*4703203dSis } 1994*4703203dSis 1995*4703203dSis sz = 1; 1996*4703203dSis ret_val++; 1997*4703203dSis } 1998*4703203dSis 1999*4703203dSis if (sz == 1) { 2000*4703203dSis if (ob >= obtail) { 2001*4703203dSis *errno = E2BIG; 2002*4703203dSis ret_val = (size_t)-1; 2003*4703203dSis break; 2004*4703203dSis } 2005*4703203dSis 2006*4703203dSis if (is_it_toupper) 2007*4703203dSis *ob = U8_ASCII_TOUPPER(*ib); 2008*4703203dSis else if (is_it_tolower) 2009*4703203dSis *ob = U8_ASCII_TOLOWER(*ib); 2010*4703203dSis else 2011*4703203dSis *ob = *ib; 2012*4703203dSis ib++; 2013*4703203dSis ob++; 2014*4703203dSis } else if ((ib + sz) > ibtail) { 2015*4703203dSis if (do_not_ignore_invalid) { 2016*4703203dSis *errno = EINVAL; 2017*4703203dSis ret_val = (size_t)-1; 2018*4703203dSis break; 2019*4703203dSis } 2020*4703203dSis 2021*4703203dSis if ((obtail - ob) < (ibtail - ib)) { 2022*4703203dSis *errno = E2BIG; 2023*4703203dSis ret_val = (size_t)-1; 2024*4703203dSis break; 2025*4703203dSis } 2026*4703203dSis 2027*4703203dSis /* 2028*4703203dSis * We treat the remaining incomplete character 2029*4703203dSis * bytes as a character. 
2030*4703203dSis */ 2031*4703203dSis ret_val++; 2032*4703203dSis 2033*4703203dSis while (ib < ibtail) 2034*4703203dSis *ob++ = *ib++; 2035*4703203dSis } else { 2036*4703203dSis if (is_it_toupper || is_it_tolower) { 2037*4703203dSis i = do_case_conv(unicode_version, u8s, 2038*4703203dSis ib, sz, is_it_toupper); 2039*4703203dSis 2040*4703203dSis if ((obtail - ob) < i) { 2041*4703203dSis *errno = E2BIG; 2042*4703203dSis ret_val = (size_t)-1; 2043*4703203dSis break; 2044*4703203dSis } 2045*4703203dSis 2046*4703203dSis ib += sz; 2047*4703203dSis 2048*4703203dSis for (sz = 0; sz < i; sz++) 2049*4703203dSis *ob++ = u8s[sz]; 2050*4703203dSis } else { 2051*4703203dSis if ((obtail - ob) < sz) { 2052*4703203dSis *errno = E2BIG; 2053*4703203dSis ret_val = (size_t)-1; 2054*4703203dSis break; 2055*4703203dSis } 2056*4703203dSis 2057*4703203dSis for (i = 0; i < sz; i++) 2058*4703203dSis *ob++ = *ib++; 2059*4703203dSis } 2060*4703203dSis } 2061*4703203dSis } 2062*4703203dSis } else { 2063*4703203dSis canonical_decomposition = flag & U8_CANON_DECOMP; 2064*4703203dSis compatibility_decomposition = flag & U8_COMPAT_DECOMP; 2065*4703203dSis canonical_composition = flag & U8_CANON_COMP; 2066*4703203dSis 2067*4703203dSis while (ib < ibtail) { 2068*4703203dSis if (*ib == '\0' && do_not_ignore_null) 2069*4703203dSis break; 2070*4703203dSis 2071*4703203dSis /* 2072*4703203dSis * If the current character is a 7-bit ASCII 2073*4703203dSis * character and it is the last character, or, 2074*4703203dSis * if the current character is a 7-bit ASCII 2075*4703203dSis * character and the next character is also a 7-bit 2076*4703203dSis * ASCII character, then, we copy over this 2077*4703203dSis * character without going through collect_a_seq(). 2078*4703203dSis * 2079*4703203dSis * In any other cases, we need to look further with 2080*4703203dSis * the collect_a_seq() function. 
2081*4703203dSis */ 2082*4703203dSis if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || 2083*4703203dSis ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { 2084*4703203dSis if (ob >= obtail) { 2085*4703203dSis *errno = E2BIG; 2086*4703203dSis ret_val = (size_t)-1; 2087*4703203dSis break; 2088*4703203dSis } 2089*4703203dSis 2090*4703203dSis if (is_it_toupper) 2091*4703203dSis *ob = U8_ASCII_TOUPPER(*ib); 2092*4703203dSis else if (is_it_tolower) 2093*4703203dSis *ob = U8_ASCII_TOLOWER(*ib); 2094*4703203dSis else 2095*4703203dSis *ob = *ib; 2096*4703203dSis ib++; 2097*4703203dSis ob++; 2098*4703203dSis } else { 2099*4703203dSis *errno = 0; 2100*4703203dSis state = U8_STATE_START; 2101*4703203dSis 2102*4703203dSis j = collect_a_seq(unicode_version, u8s, 2103*4703203dSis &ib, ibtail, 2104*4703203dSis is_it_toupper, 2105*4703203dSis is_it_tolower, 2106*4703203dSis canonical_decomposition, 2107*4703203dSis compatibility_decomposition, 2108*4703203dSis canonical_composition, 2109*4703203dSis errno, &state); 2110*4703203dSis 2111*4703203dSis if (*errno && do_not_ignore_invalid) { 2112*4703203dSis ret_val = (size_t)-1; 2113*4703203dSis break; 2114*4703203dSis } 2115*4703203dSis 2116*4703203dSis if ((obtail - ob) < j) { 2117*4703203dSis *errno = E2BIG; 2118*4703203dSis ret_val = (size_t)-1; 2119*4703203dSis break; 2120*4703203dSis } 2121*4703203dSis 2122*4703203dSis for (i = 0; i < j; i++) 2123*4703203dSis *ob++ = u8s[i]; 2124*4703203dSis } 2125*4703203dSis } 2126*4703203dSis } 2127*4703203dSis 2128*4703203dSis *inlen = ibtail - ib; 2129*4703203dSis *outlen = obtail - ob; 2130*4703203dSis 2131*4703203dSis return (ret_val); 2132*4703203dSis } 2133