xref: /titanic_51/usr/src/common/unicode/u8_textprep.c (revision 4703203d9b3e06246d73931f07359a7ef70f47bf)
1*4703203dSis /*
2*4703203dSis  * CDDL HEADER START
3*4703203dSis  *
4*4703203dSis  * The contents of this file are subject to the terms of the
5*4703203dSis  * Common Development and Distribution License (the "License").
6*4703203dSis  * You may not use this file except in compliance with the License.
7*4703203dSis  *
8*4703203dSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*4703203dSis  * or http://www.opensolaris.org/os/licensing.
10*4703203dSis  * See the License for the specific language governing permissions
11*4703203dSis  * and limitations under the License.
12*4703203dSis  *
13*4703203dSis  * When distributing Covered Code, include this CDDL HEADER in each
14*4703203dSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*4703203dSis  * If applicable, add the following below this CDDL HEADER, with the
16*4703203dSis  * fields enclosed by brackets "[]" replaced with your own identifying
17*4703203dSis  * information: Portions Copyright [yyyy] [name of copyright owner]
18*4703203dSis  *
19*4703203dSis  * CDDL HEADER END
20*4703203dSis  */
21*4703203dSis /*
22*4703203dSis  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23*4703203dSis  * Use is subject to license terms.
24*4703203dSis  */
25*4703203dSis 
26*4703203dSis #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*4703203dSis 
28*4703203dSis 
29*4703203dSis /*
30*4703203dSis  * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
31*4703203dSis  *
32*4703203dSis  * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
33*4703203dSis  * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
34*4703203dSis  * the section 3C man pages.
35*4703203dSis  * Interface stability: Committed.
36*4703203dSis  */
37*4703203dSis 
38*4703203dSis #include <sys/types.h>
39*4703203dSis #ifdef	_KERNEL
40*4703203dSis #include <sys/param.h>
41*4703203dSis #include <sys/sysmacros.h>
42*4703203dSis #include <sys/systm.h>
43*4703203dSis #include <sys/debug.h>
44*4703203dSis #include <sys/kmem.h>
45*4703203dSis #include <sys/ddi.h>
46*4703203dSis #include <sys/sunddi.h>
47*4703203dSis #else
48*4703203dSis #include <sys/u8_textprep.h>
49*4703203dSis #include <strings.h>
50*4703203dSis #endif	/* _KERNEL */
51*4703203dSis #include <sys/byteorder.h>
52*4703203dSis #include <sys/errno.h>
53*4703203dSis #include <sys/u8_textprep_data.h>
54*4703203dSis 
55*4703203dSis 
/*
 * The maximum possible number of bytes in a UTF-8 character. It also bounds
 * the length of a single case conversion result in do_case_conv().
 */
#define	U8_MB_CUR_MAX			(4)

/*
 * The maximum number of bytes needed for a UTF-8 character to cover
 * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2.
 * u8_validate() rejects any character longer than this when
 * U8_VALIDATE_UCS2_RANGE is requested.
 */
#define	U8_MAX_BYTES_UCS2		(3)

/*
 * The maximum possible number of bytes in a Stream-Safe Text.
 * NOTE(review): presumably U8_MAX_CHARS_A_SEQ * U8_MB_CUR_MAX (32 * 4);
 * confirm before changing either of those values.
 */
#define	U8_STREAM_SAFE_TEXT_MAX		(128)

/*
 * The maximum number of characters in a combining/conjoining sequence and
 * the actual upper bound limit of a combining/conjoining sequence.
 */
#define	U8_MAX_CHARS_A_SEQ		(32)
#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)

/*
 * The combining class value for a Starter; combining_class() returns this
 * value for anything that it cannot find in the tables.
 */
#define	U8_COMBINING_CLASS_STARTER	(0)
77*4703203dSis 
/*
 * Hangul related macros.
 *
 * The first and the last Unicode scalar values of the Hangul syllables,
 * the Hangul Jamo leading consonants (L), the vowels (V), and the optional
 * trailing consonants (T).
 *
 * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
 * U+11A8. Since the trailing consonant is optional, the value has been
 * pre-calculated by subtracting one; that is also why U8_HANGUL_JAMO_T()
 * tests its lower bound with '>' instead of '>='.
 *
 * Each of the 19 modern leading consonants has a total of 588 possible
 * syllables since Hangul has 21 modern vowels and 27 modern trailing
 * consonants plus 1 for the no trailing consonant case, i.e.,
 * 21 x 28 = 588.
 *
 * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
 * whether a character might be a Hangul Jamo, but a match does not
 * guarantee that it is one; it only means that it most likely is.
 */
#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
#define	U8_HANGUL_SYL_LAST		(0xD7A3U)

#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)

/* The vowel count, the vowel x trailing count (21 x 28), and the T count. */
#define	U8_HANGUL_V_COUNT		(21)
#define	U8_HANGUL_VT_COUNT		(588)
#define	U8_HANGUL_T_COUNT		(28)

/* The first UTF-8 byte shared by the Hangul Jamo range checked above. */
#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
112*4703203dSis 
/*
 * Store the three byte UTF-8 form of the scalar value (b) into (s)[(i)],
 * (s)[(j)], and (s)[(k)], leading byte first. Only the low 16 bits of (b)
 * are used, which is sufficient for the Hangul range this macro serves.
 *
 * The three stores are wrapped in do { } while (0) so that the macro
 * behaves as one statement even under an unbraced if/else. Invoke it as
 * a statement followed by a semicolon. Note that (b) is evaluated three
 * times.
 */
#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
	do { \
		(s)[(i)] = (uchar_t)(0xE0U | \
		    (((uint32_t)(b) & 0xF000U) >> 12)); \
		(s)[(j)] = (uchar_t)(0x80U | \
		    (((uint32_t)(b) & 0x0FC0U) >> 6)); \
		(s)[(k)] = (uchar_t)(0x80U | \
		    ((uint32_t)(b) & 0x003FU)); \
	} while (0)
117*4703203dSis 
/*
 * Classification predicates on a Unicode scalar value (u). Each of them
 * evaluates (u) more than once, so do not pass an expression that has
 * side effects.
 */

/* True if (u) lies in the closed range [(first), (last)]. */
#define	U8_WITHIN_INCLUSIVE(u, first, last) \
	((u) >= (first) && (u) <= (last))

/* A modern leading consonant. */
#define	U8_HANGUL_JAMO_L(u) \
	U8_WITHIN_INCLUSIVE((u), U8_HANGUL_JAMO_L_FIRST, U8_HANGUL_JAMO_L_LAST)

/* A modern vowel. */
#define	U8_HANGUL_JAMO_V(u) \
	U8_WITHIN_INCLUSIVE((u), U8_HANGUL_JAMO_V_FIRST, U8_HANGUL_JAMO_V_LAST)

/*
 * A modern trailing consonant. The lower bound is exclusive since
 * U8_HANGUL_JAMO_T_FIRST has been pre-decremented by one.
 */
#define	U8_HANGUL_JAMO_T(u) \
	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)

/* Anything from the first leading to the last trailing consonant. */
#define	U8_HANGUL_JAMO(u) \
	U8_WITHIN_INCLUSIVE((u), U8_HANGUL_JAMO_L_FIRST, U8_HANGUL_JAMO_T_LAST)

/* A precomposed Hangul syllable. */
#define	U8_HANGUL_SYLLABLE(u) \
	U8_WITHIN_INCLUSIVE((u), U8_HANGUL_SYL_FIRST, U8_HANGUL_SYL_LAST)

/* In state (s) a leading consonant was seen and (u) is a vowel. */
#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))

/* In state (s) an LV was seen and (u) is a trailing consonant. */
#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
138*4703203dSis 
/*
 * The types of decomposition mappings.
 * NOTE(review): presumably these are marker byte values stored in the
 * decomposition tables of <sys/u8_textprep_data.h>; the code that uses
 * them is further down in the file -- confirm there before changing.
 */
#define	U8_DECOMP_BOTH			(0xF5U)
#define	U8_DECOMP_CANONICAL		(0xF6U)

/*
 * The indicator for 16-bit table.
 * NOTE(review): looks like a flag carried in the top bit of a 16-bit table
 * value -- confirm against the code that tests it.
 */
#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
145*4703203dSis 
146*4703203dSis /* The following are some convenience macros. */
/*
 * Assemble the scalar value of a three byte UTF-8 character from its
 * bytes (b1), (b2), and (b3) and assign the result to (u): the low four
 * bits of (b1) and the low six bits of each of (b2) and (b3) are used.
 *
 * Each operand of '|' is fully parenthesized and the assignment is wrapped
 * in do { } while (0) so that the macro is safe under an unbraced if/else.
 * Invoke it as a statement followed by a semicolon.
 */
#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
	do { \
		(u) = (((uint32_t)(b1) & 0x0F) << 12) | \
		    (((uint32_t)(b2) & 0x3F) << 6) | \
		    ((uint32_t)(b3) & 0x3F); \
	} while (0)
150*4703203dSis 
/*
 * Exchange the values of (a) and (b) using (t) as the temporary storage.
 *
 * The three assignments are wrapped in do { } while (0) so that the macro
 * acts as a single statement; previously only the first assignment would
 * have been guarded by an unbraced if. Invoke it as a statement followed
 * by a semicolon.
 */
#define	U8_SIMPLE_SWAP(a, b, t) \
	do { \
		(t) = (a); \
		(a) = (b); \
		(b) = (t); \
	} while (0)
155*4703203dSis 
/*
 * ASCII-only case mapping: anything outside of 'a' - 'z' (or 'A' - 'Z')
 * is handed back unchanged. (c) is evaluated more than once.
 */
#define	U8_ASCII_TOUPPER(c) \
	(((c) < 'a' || (c) > 'z') ? (c) : (c) - 'a' + 'A')

#define	U8_ASCII_TOLOWER(c) \
	(((c) < 'A' || (c) > 'Z') ? (c) : (c) - 'A' + 'a')

/* True if (c), taken as an unsigned byte, has its top bit clear. */
#define	U8_ISASCII(c)			((((uchar_t)(c)) & 0x80U) == 0)
/*
 * Swap the two combining marks at the sequence positions (a) and (b).
 *
 * The macro assumes that the two characters that are to be swapped are
 * adjacent to each other and that (a) comes before (b). If the
 * assumptions are not met, the macro will fail.
 *
 * It is not self-contained; it uses the following names from the scope
 * in which it is expanded: the byte buffer u8s, the scratch buffer u8t,
 * the per-character vectors start[] (byte offset into u8s), disp[] (byte
 * length), and comb_class[], the loop index k, and the temporary tc.
 *
 * The bytes of (a) are saved into u8t, the bytes of (b) are moved down to
 * where (a) started, start[(b)] is recomputed right after them, the saved
 * bytes are written back there, and finally the comb_class[] and disp[]
 * entries are exchanged.
 *
 * The body is wrapped in do { } while (0) so that the loops and the
 * assignments are treated as a single statement, e.g., under an unbraced
 * if. Invoke it as a statement followed by a semicolon.
 */
#define	U8_SWAP_COMB_MARKS(a, b) \
	do { \
		for (k = 0; k < disp[(a)]; k++) \
			u8t[k] = u8s[start[(a)] + k]; \
		for (k = 0; k < disp[(b)]; k++) \
			u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
		start[(b)] = start[(a)] + disp[(b)]; \
		for (k = 0; k < disp[(a)]; k++) \
			u8s[start[(b)] + k] = u8t[k]; \
		U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
		U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); \
	} while (0)
179*4703203dSis 
/*
 * The possible states during normalization. The enumerators are numbered
 * consecutively from zero; the value of each is noted next to it.
 */
typedef enum {
	U8_STATE_START = 0,
	U8_STATE_HANGUL_L,		/* 1 */
	U8_STATE_HANGUL_LV,		/* 2 */
	U8_STATE_HANGUL_LVT,		/* 3 */
	U8_STATE_HANGUL_V,		/* 4 */
	U8_STATE_HANGUL_T,		/* 5 */
	U8_STATE_COMBINING_MARK		/* 6 */
} u8_normalization_states_t;
190*4703203dSis 
/*
 * The three vectors below are used to check that the bytes of a given
 * UTF-8 character are valid and do not contain any malformed byte values.
 *
 * UTF-8 used to have a quite relaxed binary representation, but that led
 * to some security related issues, and so the Unicode Consortium defined
 * and announced the UTF-8 Corrigendum in Unicode 3.1 and then refined it
 * one more time in Unicode 3.2. The following three tables are based on
 * that.
 */
201*4703203dSis 
/* True unless (c) is in 0x80 - 0xBF, the range of a third or fourth byte. */
#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	(!((c) >= 0x80 && (c) <= 0xBF))
203*4703203dSis 
204*4703203dSis #define	I_				U8_ILLEGAL_CHAR
205*4703203dSis #define	O_				U8_OUT_OF_RANGE_CHAR
206*4703203dSis 
207*4703203dSis const int8_t u8_number_of_bytes[0x100] = {
208*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
209*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
210*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
211*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
212*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
213*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
214*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
215*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
216*4703203dSis 
217*4703203dSis /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
218*4703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
219*4703203dSis 
220*4703203dSis /*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
221*4703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
222*4703203dSis 
223*4703203dSis /*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
224*4703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
225*4703203dSis 
226*4703203dSis /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
227*4703203dSis 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
228*4703203dSis 
229*4703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
230*4703203dSis 	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
231*4703203dSis 
232*4703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
233*4703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
234*4703203dSis 
235*4703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
236*4703203dSis 	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
237*4703203dSis 
238*4703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
239*4703203dSis 	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
240*4703203dSis };
241*4703203dSis 
242*4703203dSis #undef	I_
243*4703203dSis #undef	O_
244*4703203dSis 
/*
 * u8_valid_min_2nd_byte[] is indexed by the first byte of a multi-byte
 * UTF-8 character and yields the smallest value that is acceptable as its
 * second byte. The entries of the bytes that do not start a multi-byte
 * character are zero. The byte range covered by each row is noted at the
 * end of the row.
 */
const uint8_t u8_valid_min_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,	/* 00 - 07 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 08 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 10 - 17 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 18 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 20 - 27 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 28 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 30 - 37 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 38 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 40 - 47 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 48 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 50 - 57 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 58 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 60 - 67 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 68 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 70 - 77 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 78 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 80 - 87 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 88 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 90 - 97 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 98 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A0 - A7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A8 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B0 - B7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B8 - BF */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C0 - C7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* C8 - CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D0 - D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* D8 - DF */
	0xA0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E0 - E7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,	/* E8 - EF */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
286*4703203dSis 
/*
 * u8_valid_max_2nd_byte[] is indexed by the first byte of a multi-byte
 * UTF-8 character and yields the largest value that is acceptable as its
 * second byte. The entries of the bytes that do not start a multi-byte
 * character are zero. The byte range covered by each row is noted at the
 * end of the row.
 */
const uint8_t u8_valid_max_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,	/* 00 - 07 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 08 - 0F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 10 - 17 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 18 - 1F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 20 - 27 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 28 - 2F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 30 - 37 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 38 - 3F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 40 - 47 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 48 - 4F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 50 - 57 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 58 - 5F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 60 - 67 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 68 - 6F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 70 - 77 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 78 - 7F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 80 - 87 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 88 - 8F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 90 - 97 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* 98 - 9F */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A0 - A7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* A8 - AF */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B0 - B7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* B8 - BF */
	0,    0,    0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C0 - C7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* C8 - CF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D0 - D7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* D8 - DF */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,	/* E0 - E7 */
	0xBF, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0xBF,	/* E8 - EF */
	0xBF, 0xBF, 0xBF, 0xBF, 0x8F, 0,    0,    0,	/* F0 - F7 */
	0,    0,    0,    0,    0,    0,    0,    0,	/* F8 - FF */
};
328*4703203dSis 
329*4703203dSis 
330*4703203dSis /*
331*4703203dSis  * The u8_validate() validates on the given UTF-8 character string and
332*4703203dSis  * calculate the byte length. It is quite similar to mblen(3C) except that
333*4703203dSis  * this will validate against the list of characters if required and
334*4703203dSis  * specific to UTF-8 and Unicode.
335*4703203dSis  */
336*4703203dSis int
337*4703203dSis u8_validate(char *u8str, size_t n, char **list, int flag, int *errno)
338*4703203dSis {
339*4703203dSis 	uchar_t *ib;
340*4703203dSis 	uchar_t *ibtail;
341*4703203dSis 	uchar_t **p;
342*4703203dSis 	uchar_t *s1;
343*4703203dSis 	uchar_t *s2;
344*4703203dSis 	uchar_t f;
345*4703203dSis 	int sz;
346*4703203dSis 	size_t i;
347*4703203dSis 	int ret_val;
348*4703203dSis 	boolean_t second;
349*4703203dSis 	boolean_t no_need_to_validate_entire;
350*4703203dSis 	boolean_t check_additional;
351*4703203dSis 	boolean_t validate_ucs2_range_only;
352*4703203dSis 
353*4703203dSis 	if (! u8str)
354*4703203dSis 		return (0);
355*4703203dSis 
356*4703203dSis 	ib = (uchar_t *)u8str;
357*4703203dSis 	ibtail = ib + n;
358*4703203dSis 
359*4703203dSis 	ret_val = 0;
360*4703203dSis 
361*4703203dSis 	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
362*4703203dSis 	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
363*4703203dSis 	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
364*4703203dSis 
365*4703203dSis 	while (ib < ibtail) {
366*4703203dSis 		/*
367*4703203dSis 		 * The first byte of a UTF-8 character tells how many
368*4703203dSis 		 * bytes will follow for the character. If the first byte
369*4703203dSis 		 * is an illegal byte value or out of range value, we just
370*4703203dSis 		 * return -1 with an appropriate error number.
371*4703203dSis 		 */
372*4703203dSis 		sz = u8_number_of_bytes[*ib];
373*4703203dSis 		if (sz == U8_ILLEGAL_CHAR) {
374*4703203dSis 			*errno = EILSEQ;
375*4703203dSis 			return (-1);
376*4703203dSis 		}
377*4703203dSis 
378*4703203dSis 		if (sz == U8_OUT_OF_RANGE_CHAR ||
379*4703203dSis 		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
380*4703203dSis 			*errno = ERANGE;
381*4703203dSis 			return (-1);
382*4703203dSis 		}
383*4703203dSis 
384*4703203dSis 		/*
385*4703203dSis 		 * If we don't have enough bytes to check on, that's also
386*4703203dSis 		 * an error. As you can see, we give illegal byte sequence
387*4703203dSis 		 * checking higher priority then EINVAL cases.
388*4703203dSis 		 */
389*4703203dSis 		if ((ibtail - ib) < sz) {
390*4703203dSis 			*errno = EINVAL;
391*4703203dSis 			return (-1);
392*4703203dSis 		}
393*4703203dSis 
394*4703203dSis 		if (sz == 1) {
395*4703203dSis 			ib++;
396*4703203dSis 			ret_val++;
397*4703203dSis 		} else {
398*4703203dSis 			/*
399*4703203dSis 			 * Check on the multi-byte UTF-8 character. For more
400*4703203dSis 			 * details on this, see comment added for the used
401*4703203dSis 			 * data structures at the beginning of the file.
402*4703203dSis 			 */
403*4703203dSis 			f = *ib++;
404*4703203dSis 			ret_val++;
405*4703203dSis 			second = B_TRUE;
406*4703203dSis 			for (i = 1; i < sz; i++) {
407*4703203dSis 				if (second) {
408*4703203dSis 					if (*ib < u8_valid_min_2nd_byte[f] ||
409*4703203dSis 					    *ib > u8_valid_max_2nd_byte[f]) {
410*4703203dSis 						*errno = EILSEQ;
411*4703203dSis 						return (-1);
412*4703203dSis 					}
413*4703203dSis 					second = B_FALSE;
414*4703203dSis 				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
415*4703203dSis 					*errno = EILSEQ;
416*4703203dSis 					return (-1);
417*4703203dSis 				}
418*4703203dSis 				ib++;
419*4703203dSis 				ret_val++;
420*4703203dSis 			}
421*4703203dSis 		}
422*4703203dSis 
423*4703203dSis 		if (check_additional) {
424*4703203dSis 			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
425*4703203dSis 				s1 = ib - sz;
426*4703203dSis 				s2 = p[i];
427*4703203dSis 				while (s1 < ib) {
428*4703203dSis 					if (*s1 != *s2 || *s2 == '\0')
429*4703203dSis 						break;
430*4703203dSis 					s1++;
431*4703203dSis 					s2++;
432*4703203dSis 				}
433*4703203dSis 
434*4703203dSis 				if (s1 >= ib && *s2 == '\0') {
435*4703203dSis 					*errno = EBADF;
436*4703203dSis 					return (-1);
437*4703203dSis 				}
438*4703203dSis 			}
439*4703203dSis 		}
440*4703203dSis 
441*4703203dSis 		if (no_need_to_validate_entire)
442*4703203dSis 			break;
443*4703203dSis 	}
444*4703203dSis 
445*4703203dSis 	return (ret_val);
446*4703203dSis }
447*4703203dSis 
448*4703203dSis /*
449*4703203dSis  * The do_case_conv() looks at the mapping tables and returns found
450*4703203dSis  * bytes if any. If not found, the input bytes are returned. The function
451*4703203dSis  * always terminate the return bytes with a null character assuming that
452*4703203dSis  * there are plenty of room to do so.
453*4703203dSis  *
454*4703203dSis  * The case conversions are simple case conversions mapping a character to
455*4703203dSis  * another character as specified in the Unicode data. The byte size of
456*4703203dSis  * the mapped character could be different from that of the input character.
457*4703203dSis  *
458*4703203dSis  * The return value is the byte length of the returned character excluding
459*4703203dSis  * the terminating null byte.
460*4703203dSis  */
461*4703203dSis static size_t
462*4703203dSis do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
463*4703203dSis {
464*4703203dSis 	size_t i;
465*4703203dSis 	uint16_t b1 = 0;
466*4703203dSis 	uint16_t b2 = 0;
467*4703203dSis 	uint16_t b3 = 0;
468*4703203dSis 	uint16_t b3_tbl;
469*4703203dSis 	uint16_t b3_base;
470*4703203dSis 	uint16_t b4 = 0;
471*4703203dSis 	size_t start_id;
472*4703203dSis 	size_t end_id;
473*4703203dSis 
474*4703203dSis 	/*
475*4703203dSis 	 * At this point, the only possible values for sz are 2, 3, and 4.
476*4703203dSis 	 * The u8s should point to a vector that is well beyond the size of
477*4703203dSis 	 * 5 bytes.
478*4703203dSis 	 */
479*4703203dSis 	if (sz == 2) {
480*4703203dSis 		b3 = u8s[0] = s[0];
481*4703203dSis 		b4 = u8s[1] = s[1];
482*4703203dSis 	} else if (sz == 3) {
483*4703203dSis 		b2 = u8s[0] = s[0];
484*4703203dSis 		b3 = u8s[1] = s[1];
485*4703203dSis 		b4 = u8s[2] = s[2];
486*4703203dSis 	} else if (sz == 4) {
487*4703203dSis 		b1 = u8s[0] = s[0];
488*4703203dSis 		b2 = u8s[1] = s[1];
489*4703203dSis 		b3 = u8s[2] = s[2];
490*4703203dSis 		b4 = u8s[3] = s[3];
491*4703203dSis 	} else {
492*4703203dSis 		/* This is not possible but just in case as a fallback. */
493*4703203dSis 		if (is_it_toupper)
494*4703203dSis 			*u8s = U8_ASCII_TOUPPER(*s);
495*4703203dSis 		else
496*4703203dSis 			*u8s = U8_ASCII_TOLOWER(*s);
497*4703203dSis 		u8s[1] = '\0';
498*4703203dSis 
499*4703203dSis 		return (1);
500*4703203dSis 	}
501*4703203dSis 	u8s[sz] = '\0';
502*4703203dSis 
503*4703203dSis 	/*
504*4703203dSis 	 * Let's find out if we have a corresponding character.
505*4703203dSis 	 */
506*4703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
507*4703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
508*4703203dSis 		return ((size_t)sz);
509*4703203dSis 
510*4703203dSis 	b2 = u8_case_common_b2_tbl[uv][b1][b2];
511*4703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
512*4703203dSis 		return ((size_t)sz);
513*4703203dSis 
514*4703203dSis 	if (is_it_toupper) {
515*4703203dSis 		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
516*4703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
517*4703203dSis 			return ((size_t)sz);
518*4703203dSis 
519*4703203dSis 		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
520*4703203dSis 		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
521*4703203dSis 
522*4703203dSis 		/* Either there is no match or an error at the table. */
523*4703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
524*4703203dSis 			return ((size_t)sz);
525*4703203dSis 
526*4703203dSis 		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
527*4703203dSis 
528*4703203dSis 		for (i = 0; start_id < end_id; start_id++)
529*4703203dSis 			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
530*4703203dSis 	} else {
531*4703203dSis 		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
532*4703203dSis 		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
533*4703203dSis 			return ((size_t)sz);
534*4703203dSis 
535*4703203dSis 		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
536*4703203dSis 		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
537*4703203dSis 
538*4703203dSis 		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
539*4703203dSis 			return ((size_t)sz);
540*4703203dSis 
541*4703203dSis 		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
542*4703203dSis 
543*4703203dSis 		for (i = 0; start_id < end_id; start_id++)
544*4703203dSis 			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
545*4703203dSis 	}
546*4703203dSis 
547*4703203dSis 	/*
548*4703203dSis 	 * If i is still zero, that means there is no corresponding character.
549*4703203dSis 	 */
550*4703203dSis 	if (i == 0)
551*4703203dSis 		return ((size_t)sz);
552*4703203dSis 
553*4703203dSis 	u8s[i] = '\0';
554*4703203dSis 
555*4703203dSis 	return (i);
556*4703203dSis }
557*4703203dSis 
558*4703203dSis /*
559*4703203dSis  * The do_case_compare() function compares the two input strings, s1 and s2,
560*4703203dSis  * one character at a time doing case conversions if applicable and return
561*4703203dSis  * the comparison result as like strcmp().
562*4703203dSis  *
563*4703203dSis  * Since, in empirical sense, most of text data are 7-bit ASCII characters,
564*4703203dSis  * we treat the 7-bit ASCII characters as a special case trying to yield
565*4703203dSis  * faster processing time.
566*4703203dSis  */
567*4703203dSis static int
568*4703203dSis do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
569*4703203dSis 	size_t n2, boolean_t is_it_toupper, int *errno)
570*4703203dSis {
571*4703203dSis 	int f;
572*4703203dSis 	int sz1;
573*4703203dSis 	int sz2;
574*4703203dSis 	size_t j;
575*4703203dSis 	size_t i1;
576*4703203dSis 	size_t i2;
577*4703203dSis 	uchar_t u8s1[U8_MB_CUR_MAX + 1];
578*4703203dSis 	uchar_t u8s2[U8_MB_CUR_MAX + 1];
579*4703203dSis 
580*4703203dSis 	i1 = i2 = 0;
581*4703203dSis 	while (i1 < n1 && i2 < n2) {
582*4703203dSis 		/*
583*4703203dSis 		 * Find out what would be the byte length for this UTF-8
584*4703203dSis 		 * character at string s1 and also find out if this is
585*4703203dSis 		 * an illegal start byte or not and if so, issue a proper
586*4703203dSis 		 * errno and yet treat this byte as a character.
587*4703203dSis 		 */
588*4703203dSis 		sz1 = u8_number_of_bytes[*s1];
589*4703203dSis 		if (sz1 < 0) {
590*4703203dSis 			*errno = EILSEQ;
591*4703203dSis 			sz1 = 1;
592*4703203dSis 		}
593*4703203dSis 
594*4703203dSis 		/*
595*4703203dSis 		 * For 7-bit ASCII characters mainly, we do a quick case
596*4703203dSis 		 * conversion right at here.
597*4703203dSis 		 *
598*4703203dSis 		 * If we don't have enough bytes for this character, issue
599*4703203dSis 		 * an EINVAL error and use what are available.
600*4703203dSis 		 *
601*4703203dSis 		 * If we have enough bytes, find out if there is
602*4703203dSis 		 * a corresponding uppercase character and if so, copy over
603*4703203dSis 		 * the bytes for a comparison later. If there is no
604*4703203dSis 		 * corresponding uppercase character, then, use what we have
605*4703203dSis 		 * for the comparison.
606*4703203dSis 		 */
607*4703203dSis 		if (sz1 == 1) {
608*4703203dSis 			if (is_it_toupper)
609*4703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
610*4703203dSis 			else
611*4703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
612*4703203dSis 			s1++;
613*4703203dSis 			u8s1[1] = '\0';
614*4703203dSis 		} else if ((i1 + sz1) > n1) {
615*4703203dSis 			*errno = EINVAL;
616*4703203dSis 			for (j = 0; (i1 + j) < n1; )
617*4703203dSis 				u8s1[j++] = *s1++;
618*4703203dSis 			u8s1[j] = '\0';
619*4703203dSis 		} else {
620*4703203dSis 			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
621*4703203dSis 			s1 += sz1;
622*4703203dSis 		}
623*4703203dSis 
624*4703203dSis 		/* Do the same for the string s2. */
625*4703203dSis 		sz2 = u8_number_of_bytes[*s2];
626*4703203dSis 		if (sz2 < 0) {
627*4703203dSis 			*errno = EILSEQ;
628*4703203dSis 			sz2 = 1;
629*4703203dSis 		}
630*4703203dSis 
631*4703203dSis 		if (sz2 == 1) {
632*4703203dSis 			if (is_it_toupper)
633*4703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
634*4703203dSis 			else
635*4703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
636*4703203dSis 			s2++;
637*4703203dSis 			u8s2[1] = '\0';
638*4703203dSis 		} else if ((i2 + sz2) > n2) {
639*4703203dSis 			*errno = EINVAL;
640*4703203dSis 			for (j = 0; (i2 + j) < n2; )
641*4703203dSis 				u8s2[j++] = *s2++;
642*4703203dSis 			u8s2[j] = '\0';
643*4703203dSis 		} else {
644*4703203dSis 			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
645*4703203dSis 			s2 += sz2;
646*4703203dSis 		}
647*4703203dSis 
648*4703203dSis 		/* Now compare the two characters. */
649*4703203dSis 		if (sz1 == 1 && sz2 == 1) {
650*4703203dSis 			if (*u8s1 > *u8s2)
651*4703203dSis 				return (1);
652*4703203dSis 			if (*u8s1 < *u8s2)
653*4703203dSis 				return (-1);
654*4703203dSis 		} else {
655*4703203dSis 			f = strcmp((const char *)u8s1, (const char *)u8s2);
656*4703203dSis 			if (f != 0)
657*4703203dSis 				return (f);
658*4703203dSis 		}
659*4703203dSis 
660*4703203dSis 		/*
661*4703203dSis 		 * They were the same. Let's move on to the next
662*4703203dSis 		 * characters then.
663*4703203dSis 		 */
664*4703203dSis 		i1 += sz1;
665*4703203dSis 		i2 += sz2;
666*4703203dSis 	}
667*4703203dSis 
668*4703203dSis 	/*
669*4703203dSis 	 * We compared until the end of either or both strings.
670*4703203dSis 	 *
671*4703203dSis 	 * If we reached to or went over the ends for the both, that means
672*4703203dSis 	 * they are the same.
673*4703203dSis 	 *
674*4703203dSis 	 * If we reached only one of the two ends, that means the other string
675*4703203dSis 	 * has something which then the fact can be used to determine
676*4703203dSis 	 * the return value.
677*4703203dSis 	 */
678*4703203dSis 	if (i1 >= n1) {
679*4703203dSis 		if (i2 >= n2)
680*4703203dSis 			return (0);
681*4703203dSis 		return (-1);
682*4703203dSis 	}
683*4703203dSis 	return (1);
684*4703203dSis }
685*4703203dSis 
686*4703203dSis /*
687*4703203dSis  * The combining_class() function checks on the given bytes and find out
688*4703203dSis  * the corresponding Unicode combining class value. The return value 0 means
689*4703203dSis  * it is a Starter. Any illegal UTF-8 character will also be treated as
690*4703203dSis  * a Starter.
691*4703203dSis  */
692*4703203dSis static uchar_t
693*4703203dSis combining_class(size_t uv, uchar_t *s, size_t sz)
694*4703203dSis {
695*4703203dSis 	uint16_t b1 = 0;
696*4703203dSis 	uint16_t b2 = 0;
697*4703203dSis 	uint16_t b3 = 0;
698*4703203dSis 	uint16_t b4 = 0;
699*4703203dSis 
700*4703203dSis 	if (sz == 1 || sz > 4)
701*4703203dSis 		return (0);
702*4703203dSis 
703*4703203dSis 	if (sz == 2) {
704*4703203dSis 		b3 = s[0];
705*4703203dSis 		b4 = s[1];
706*4703203dSis 	} else if (sz == 3) {
707*4703203dSis 		b2 = s[0];
708*4703203dSis 		b3 = s[1];
709*4703203dSis 		b4 = s[2];
710*4703203dSis 	} else if (sz == 4) {
711*4703203dSis 		b1 = s[0];
712*4703203dSis 		b2 = s[1];
713*4703203dSis 		b3 = s[2];
714*4703203dSis 		b4 = s[3];
715*4703203dSis 	}
716*4703203dSis 
717*4703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
718*4703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
719*4703203dSis 		return (0);
720*4703203dSis 
721*4703203dSis 	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
722*4703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
723*4703203dSis 		return (0);
724*4703203dSis 
725*4703203dSis 	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
726*4703203dSis 	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
727*4703203dSis 		return (0);
728*4703203dSis 
729*4703203dSis 	return (u8_combining_class_b4_tbl[uv][b3][b4]);
730*4703203dSis }
731*4703203dSis 
732*4703203dSis /*
733*4703203dSis  * The do_decomp() function looks up a decomposition mapping for the
734*4703203dSis  * character of 'sz' bytes at 's' and stores the result at 'u8s'. If there
735*4703203dSis  * is no mapping, the input bytes are copied to 'u8s' as they are. A Hangul
736*4703203dSis  * syllable is decomposed arithmetically into its Jamos and returned.
737*4703203dSis  *
 * 'uv' is the first index into every lookup table used here and
 * 'canonical_decomposition' selects the canonical mapping (B_TRUE) or the
 * compatibility mapping (B_FALSE). The caller visible in this file,
 * collect_a_seq(), passes the same buffer as both 'u8s' and 's', so the
 * input bytes are always read before the output is written over them.
 *
738*4703203dSis  * To save time, a single byte 7-bit ASCII character should be handled by
739*4703203dSis  * the caller.
740*4703203dSis  *
741*4703203dSis  * The function returns the number of bytes stored at 'u8s', not counting
742*4703203dSis  * the null byte that always terminates them. It also sets '*state' so that
743*4703203dSis  * the caller can tell whether a Hangul syllable or Jamo was processed.
744*4703203dSis  */
745*4703203dSis static size_t
746*4703203dSis do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
747*4703203dSis 	boolean_t canonical_decomposition, u8_normalization_states_t *state)
748*4703203dSis {
749*4703203dSis 	uint16_t b1 = 0;
750*4703203dSis 	uint16_t b2 = 0;
751*4703203dSis 	uint16_t b3 = 0;
752*4703203dSis 	uint16_t b3_tbl;
753*4703203dSis 	uint16_t b3_base;
754*4703203dSis 	uint16_t b4 = 0;
755*4703203dSis 	size_t start_id;
756*4703203dSis 	size_t end_id;
757*4703203dSis 	size_t i;
758*4703203dSis 	uint32_t u1;
759*4703203dSis 
	/*
	 * Copy the character to u8s and, at the same time, right-align its
	 * bytes in b1..b4 which later serve as the table indices; the
	 * positions that a shorter character does not use stay at 0.
	 */
760*4703203dSis 	if (sz == 2) {
761*4703203dSis 		b3 = u8s[0] = s[0];
762*4703203dSis 		b4 = u8s[1] = s[1];
763*4703203dSis 		u8s[2] = '\0';
764*4703203dSis 	} else if (sz == 3) {
765*4703203dSis 		/* Convert it to a Unicode scalar value. */
766*4703203dSis 		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
767*4703203dSis 
768*4703203dSis 		/*
769*4703203dSis 		 * If this is a Hangul syllable, we decompose it into
770*4703203dSis 		 * a leading consonant, a vowel, and an optional trailing
771*4703203dSis 		 * consonant and then return.
		 *
		 * In this branch b1, b2, and b3 temporarily hold the L, V,
		 * and T Jamo values handed to U8_SAVE_HANGUL_AS_UTF8(); they
		 * are not table indices here.
772*4703203dSis 		 */
773*4703203dSis 		if (U8_HANGUL_SYLLABLE(u1)) {
774*4703203dSis 			u1 -= U8_HANGUL_SYL_FIRST;
775*4703203dSis 
776*4703203dSis 			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
777*4703203dSis 			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
778*4703203dSis 			    / U8_HANGUL_T_COUNT;
779*4703203dSis 			b3 = u1 % U8_HANGUL_T_COUNT;
780*4703203dSis 
781*4703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
782*4703203dSis 			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
			/* A zero remainder means there is no trailing consonant. */
783*4703203dSis 			if (b3) {
784*4703203dSis 				b3 += U8_HANGUL_JAMO_T_FIRST;
785*4703203dSis 				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
786*4703203dSis 
787*4703203dSis 				u8s[9] = '\0';
788*4703203dSis 				*state = U8_STATE_HANGUL_LVT;
789*4703203dSis 				return (9);
790*4703203dSis 			}
791*4703203dSis 
792*4703203dSis 			u8s[6] = '\0';
793*4703203dSis 			*state = U8_STATE_HANGUL_LV;
794*4703203dSis 			return (6);
795*4703203dSis 		}
796*4703203dSis 
797*4703203dSis 		b2 = u8s[0] = s[0];
798*4703203dSis 		b3 = u8s[1] = s[1];
799*4703203dSis 		b4 = u8s[2] = s[2];
800*4703203dSis 		u8s[3] = '\0';
801*4703203dSis 
802*4703203dSis 		/*
803*4703203dSis 		 * If this is a Hangul Jamo, we know there is nothing
804*4703203dSis 		 * further that we can decompose.
		 *
		 * For a V or a T Jamo, the new state depends on the state
		 * the caller passed in, i.e., on what preceded this Jamo.
805*4703203dSis 		 */
806*4703203dSis 		if (U8_HANGUL_JAMO_L(u1)) {
807*4703203dSis 			*state = U8_STATE_HANGUL_L;
808*4703203dSis 			return (3);
809*4703203dSis 		}
810*4703203dSis 
811*4703203dSis 		if (U8_HANGUL_JAMO_V(u1)) {
812*4703203dSis 			if (*state == U8_STATE_HANGUL_L)
813*4703203dSis 				*state = U8_STATE_HANGUL_LV;
814*4703203dSis 			else
815*4703203dSis 				*state = U8_STATE_HANGUL_V;
816*4703203dSis 			return (3);
817*4703203dSis 		}
818*4703203dSis 
819*4703203dSis 		if (U8_HANGUL_JAMO_T(u1)) {
820*4703203dSis 			if (*state == U8_STATE_HANGUL_LV)
821*4703203dSis 				*state = U8_STATE_HANGUL_LVT;
822*4703203dSis 			else
823*4703203dSis 				*state = U8_STATE_HANGUL_T;
824*4703203dSis 			return (3);
825*4703203dSis 		}
826*4703203dSis 	} else if (sz == 4) {
827*4703203dSis 		b1 = u8s[0] = s[0];
828*4703203dSis 		b2 = u8s[1] = s[1];
829*4703203dSis 		b3 = u8s[2] = s[2];
830*4703203dSis 		b4 = u8s[3] = s[3];
831*4703203dSis 		u8s[4] = '\0';
832*4703203dSis 	} else {
833*4703203dSis 		/*
834*4703203dSis 		 * This is a fallback and should not happen if the function
835*4703203dSis 		 * was called properly.
836*4703203dSis 		 */
837*4703203dSis 		u8s[0] = s[0];
838*4703203dSis 		u8s[1] = '\0';
839*4703203dSis 		*state = U8_STATE_START;
840*4703203dSis 		return (1);
841*4703203dSis 	}
842*4703203dSis 
843*4703203dSis 	/*
844*4703203dSis 	 * At this point, this routine does not know what it would get.
845*4703203dSis 	 * The caller should sort it out if the state isn't a Hangul one.
846*4703203dSis 	 */
847*4703203dSis 	*state = U8_STATE_START;
848*4703203dSis 
849*4703203dSis 	/* Try to find matching decomposition mapping byte sequence. */
	/*
	 * Every early return below leaves the plain copy made above in u8s
	 * and reports its length, sz.
	 */
850*4703203dSis 	b1 = u8_common_b1_tbl[uv][b1];
851*4703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
852*4703203dSis 		return ((size_t)sz);
853*4703203dSis 
854*4703203dSis 	b2 = u8_decomp_b2_tbl[uv][b1][b2];
855*4703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
856*4703203dSis 		return ((size_t)sz);
857*4703203dSis 
858*4703203dSis 	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
859*4703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
860*4703203dSis 		return ((size_t)sz);
861*4703203dSis 
862*4703203dSis 	/*
863*4703203dSis 	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
864*4703203dSis 	 * which is 0x8000, this means we couldn't fit the mappings into
865*4703203dSis 	 * the cardinality of an unsigned byte.
	 *
	 * In either table, the entries at b4 and b4 + 1 give the start and
	 * the end (exclusive) of the mapping bytes in the final table.
866*4703203dSis 	 */
867*4703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
868*4703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
869*4703203dSis 		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
870*4703203dSis 		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
871*4703203dSis 	} else {
872*4703203dSis 		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
873*4703203dSis 		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
874*4703203dSis 	}
875*4703203dSis 
876*4703203dSis 	/* This also means there wasn't any matching decomposition. */
877*4703203dSis 	if (start_id >= end_id)
878*4703203dSis 		return ((size_t)sz);
879*4703203dSis 
880*4703203dSis 	/*
881*4703203dSis 	 * The final table for decomposition mappings has three types of
882*4703203dSis 	 * byte sequences depending on whether a mapping is for compatibility
883*4703203dSis 	 * decomposition, canonical decomposition, or both like the following:
884*4703203dSis 	 *
885*4703203dSis 	 * (1) Compatibility decomposition mappings:
886*4703203dSis 	 *
887*4703203dSis 	 *	+---+---+-...-+---+
888*4703203dSis 	 *	| B0| B1| ... | Bm|
889*4703203dSis 	 *	+---+---+-...-+---+
890*4703203dSis 	 *
891*4703203dSis 	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
892*4703203dSis 	 *
893*4703203dSis 	 * (2) Canonical decomposition mappings:
894*4703203dSis 	 *
895*4703203dSis 	 *	+---+---+---+-...-+---+
896*4703203dSis 	 *	| T | b0| b1| ... | bn|
897*4703203dSis 	 *	+---+---+---+-...-+---+
898*4703203dSis 	 *
899*4703203dSis 	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
900*4703203dSis 	 *
901*4703203dSis 	 * (3) Both mappings:
902*4703203dSis 	 *
903*4703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
904*4703203dSis 	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
905*4703203dSis 	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
906*4703203dSis 	 *
907*4703203dSis 	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
908*4703203dSis 	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
909*4703203dSis 	 *	compatibility mapping bytes.
910*4703203dSis 	 *
911*4703203dSis 	 * Note that compatibility decomposition means doing recursive
912*4703203dSis 	 * decompositions using both compatibility decomposition mappings and
913*4703203dSis 	 * canonical decomposition mappings. On the other hand, canonical
914*4703203dSis 	 * decomposition means doing recursive decompositions using only
915*4703203dSis 	 * canonical decomposition mappings. Since the table we have has gone
916*4703203dSis 	 * through the recursions already, we do not need to do so during
917*4703203dSis 	 * runtime, i.e., the table has been completely flattened out
918*4703203dSis 	 * already.
919*4703203dSis 	 */
920*4703203dSis 
921*4703203dSis 	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
922*4703203dSis 
923*4703203dSis 	/* Get the type, T, of the byte sequence. */
	/* b1 is reused here; its table index role is over. */
924*4703203dSis 	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
925*4703203dSis 
926*4703203dSis 	/*
927*4703203dSis 	 * If necessary, adjust start_id, end_id, or both. Note that if
928*4703203dSis 	 * this is compatibility decomposition mapping, there is no
929*4703203dSis 	 * adjustment.
930*4703203dSis 	 */
931*4703203dSis 	if (canonical_decomposition) {
932*4703203dSis 		/* Is the mapping only for compatibility decomposition? */
933*4703203dSis 		if (b1 < U8_DECOMP_BOTH)
934*4703203dSis 			return ((size_t)sz);
935*4703203dSis 
		/* Skip the type byte, T. */
936*4703203dSis 		start_id++;
937*4703203dSis 
		/*
		 * start_id is now at D. The canonical bytes end D bytes
		 * past the position of D and start right after D.
		 */
938*4703203dSis 		if (b1 == U8_DECOMP_BOTH) {
939*4703203dSis 			end_id = start_id +
940*4703203dSis 			    u8_decomp_final_tbl[uv][b3_base + start_id];
941*4703203dSis 			start_id++;
942*4703203dSis 		}
943*4703203dSis 	} else {
944*4703203dSis 		/*
945*4703203dSis 		 * Unless this is a compatibility decomposition mapping,
946*4703203dSis 		 * we adjust the start_id.
		 *
		 * For the both case, we skip T and then jump D bytes from
		 * the position of D to reach B0. For the canonical only
		 * case, we skip T and use the canonical bytes.
947*4703203dSis 		 */
948*4703203dSis 		if (b1 == U8_DECOMP_BOTH) {
949*4703203dSis 			start_id++;
950*4703203dSis 			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
951*4703203dSis 		} else if (b1 == U8_DECOMP_CANONICAL) {
952*4703203dSis 			start_id++;
953*4703203dSis 		}
954*4703203dSis 	}
955*4703203dSis 
	/* Overwrite the plain copy in u8s with the selected mapping bytes. */
956*4703203dSis 	for (i = 0; start_id < end_id; start_id++)
957*4703203dSis 		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
958*4703203dSis 	u8s[i] = '\0';
959*4703203dSis 
960*4703203dSis 	return (i);
961*4703203dSis }
962*4703203dSis 
963*4703203dSis /*
964*4703203dSis  * The find_composition_start() function uses the character bytes given and
965*4703203dSis  * find out the matching composition mappings if any and return the address
966*4703203dSis  * to the composition mappings as explained in the do_composition().
967*4703203dSis  */
968*4703203dSis static uchar_t *
969*4703203dSis find_composition_start(size_t uv, uchar_t *s, size_t sz)
970*4703203dSis {
971*4703203dSis 	uint16_t b1 = 0;
972*4703203dSis 	uint16_t b2 = 0;
973*4703203dSis 	uint16_t b3 = 0;
974*4703203dSis 	uint16_t b3_tbl;
975*4703203dSis 	uint16_t b3_base;
976*4703203dSis 	uint16_t b4 = 0;
977*4703203dSis 	size_t start_id;
978*4703203dSis 	size_t end_id;
979*4703203dSis 
980*4703203dSis 	if (sz == 1) {
981*4703203dSis 		b4 = s[0];
982*4703203dSis 	} else if (sz == 2) {
983*4703203dSis 		b3 = s[0];
984*4703203dSis 		b4 = s[1];
985*4703203dSis 	} else if (sz == 3) {
986*4703203dSis 		b2 = s[0];
987*4703203dSis 		b3 = s[1];
988*4703203dSis 		b4 = s[2];
989*4703203dSis 	} else if (sz == 4) {
990*4703203dSis 		b1 = s[0];
991*4703203dSis 		b2 = s[1];
992*4703203dSis 		b3 = s[2];
993*4703203dSis 		b4 = s[3];
994*4703203dSis 	} else {
995*4703203dSis 		/*
996*4703203dSis 		 * This is a fallback and should not happen if the function
997*4703203dSis 		 * was called properly.
998*4703203dSis 		 */
999*4703203dSis 		return (NULL);
1000*4703203dSis 	}
1001*4703203dSis 
1002*4703203dSis 	b1 = u8_composition_b1_tbl[uv][b1];
1003*4703203dSis 	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
1004*4703203dSis 		return (NULL);
1005*4703203dSis 
1006*4703203dSis 	b2 = u8_composition_b2_tbl[uv][b1][b2];
1007*4703203dSis 	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
1008*4703203dSis 		return (NULL);
1009*4703203dSis 
1010*4703203dSis 	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
1011*4703203dSis 	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
1012*4703203dSis 		return (NULL);
1013*4703203dSis 
1014*4703203dSis 	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
1015*4703203dSis 		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
1016*4703203dSis 		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
1017*4703203dSis 		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
1018*4703203dSis 	} else {
1019*4703203dSis 		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
1020*4703203dSis 		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
1021*4703203dSis 	}
1022*4703203dSis 
1023*4703203dSis 	if (start_id >= end_id)
1024*4703203dSis 		return (NULL);
1025*4703203dSis 
1026*4703203dSis 	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
1027*4703203dSis 
1028*4703203dSis 	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
1029*4703203dSis }
1030*4703203dSis 
1031*4703203dSis /*
1032*4703203dSis  * The blocked() function checks on the combining class values of previous
1033*4703203dSis  * characters in this sequence and return whether it is blocked or not.
1034*4703203dSis  */
1035*4703203dSis static boolean_t
1036*4703203dSis blocked(uchar_t *comb_class, size_t last)
1037*4703203dSis {
1038*4703203dSis 	uchar_t my_comb_class;
1039*4703203dSis 	size_t i;
1040*4703203dSis 
1041*4703203dSis 	my_comb_class = comb_class[last];
1042*4703203dSis 	for (i = 1; i < last; i++)
1043*4703203dSis 		if (comb_class[i] >= my_comb_class ||
1044*4703203dSis 		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
1045*4703203dSis 			return (B_TRUE);
1046*4703203dSis 
1047*4703203dSis 	return (B_FALSE);
1048*4703203dSis }
1049*4703203dSis 
1050*4703203dSis /*
1051*4703203dSis  * The do_composition() reads the character string pointed by 's',
1052*4703203dSis  * does necessary canonical composition, and then copies the result back to
1053*4703203dSis  * the 's'.
1054*4703203dSis  *
1055*4703203dSis  * The input argument 's' cannot contain more than 32 characters.
 *
 * The 'comb_class', 'start', and 'disp' arrays are indexed by character:
 * comb_class[i] is the combining class of the i-th character, start[i] is
 * its byte offset in 's', and disp[i] is its byte length. The 'last' is the
 * index of the last character. The 'uv' selects the composition tables.
 *
 * The '*os' points at the source bytes that follow the collected sequence
 * and 'oslast' is the end of them. If the last character is a Starter, the
 * characters at '*os' that compose with it are consumed and '*os' is moved
 * past them.
 *
 * The return value is the byte length of the result in 's', not counting
 * the terminating null byte.
1056*4703203dSis  */
1057*4703203dSis static size_t
1058*4703203dSis do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
1059*4703203dSis 	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
1060*4703203dSis {
1061*4703203dSis 	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
1062*4703203dSis 	uchar_t tc[U8_MB_CUR_MAX];
1063*4703203dSis 	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
1064*4703203dSis 	size_t saved_marks_count;
1065*4703203dSis 	uchar_t *p;
1066*4703203dSis 	uchar_t *saved_p;
1067*4703203dSis 	uchar_t *q;
1068*4703203dSis 	size_t i;
1069*4703203dSis 	size_t saved_i;
1070*4703203dSis 	size_t j;
1071*4703203dSis 	size_t k;
1072*4703203dSis 	size_t l;
1073*4703203dSis 	size_t C;
1074*4703203dSis 	size_t saved_l;
1075*4703203dSis 	size_t size;
1076*4703203dSis 	uint32_t u1;
1077*4703203dSis 	uint32_t u2;
1078*4703203dSis 	boolean_t match_not_found = B_TRUE;
1079*4703203dSis 
1080*4703203dSis 	/*
1081*4703203dSis 	 * This should never happen unless the callers are doing some strange
1082*4703203dSis 	 * and unexpected things.
1083*4703203dSis 	 *
1084*4703203dSis 	 * The "last" is the index pointing to the last character not last + 1.
1085*4703203dSis 	 */
1086*4703203dSis 	if (last >= U8_MAX_CHARS_A_SEQ)
1087*4703203dSis 		last = U8_UPPER_LIMIT_IN_A_SEQ;
1088*4703203dSis 
	/*
	 * The 'i' is the index of the current character and the 'l' is
	 * the number of bytes written to the temporary buffer, t, so far.
	 */
1089*4703203dSis 	for (i = l = 0; i <= last; i++) {
1090*4703203dSis 		/*
1091*4703203dSis 		 * For the last character or any non-Starter at the beginning,
1092*4703203dSis 		 * we don't have any chance to do composition and so we just
1093*4703203dSis 		 * copy them to the temporary buffer.
1094*4703203dSis 		 */
1095*4703203dSis 		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
1096*4703203dSis SAVE_THE_CHAR:
1097*4703203dSis 			p = s + start[i];
1098*4703203dSis 			size = disp[i];
1099*4703203dSis 			for (k = 0; k < size; k++)
1100*4703203dSis 				t[l++] = *p++;
1101*4703203dSis 			continue;
1102*4703203dSis 		}
1103*4703203dSis 
1104*4703203dSis 		/*
1105*4703203dSis 		 * If this could be a start of Hangul Jamos, then, we try to
1106*4703203dSis 		 * conjoin them.
1107*4703203dSis 		 */
1108*4703203dSis 		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
			/*
			 * The u1 is this character and the u2 is built from
			 * the three bytes right after it.
			 */
1109*4703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
1110*4703203dSis 			    s[start[i] + 1], s[start[i] + 2]);
1111*4703203dSis 			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
1112*4703203dSis 			    s[start[i] + 4], s[start[i] + 5]);
1113*4703203dSis 
1114*4703203dSis 			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
1115*4703203dSis 				u1 -= U8_HANGUL_JAMO_L_FIRST;
1116*4703203dSis 				u2 -= U8_HANGUL_JAMO_V_FIRST;
1117*4703203dSis 				u1 = U8_HANGUL_SYL_FIRST +
1118*4703203dSis 				    (u1 * U8_HANGUL_V_COUNT + u2) *
1119*4703203dSis 				    U8_HANGUL_T_COUNT;
1120*4703203dSis 
				/*
				 * Move past the L and the V and, if the next
				 * one is a T, fold it in and move past it too.
				 */
1121*4703203dSis 				i += 2;
1122*4703203dSis 				if (i <= last) {
1123*4703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u2,
1124*4703203dSis 					    s[start[i]], s[start[i] + 1],
1125*4703203dSis 					    s[start[i] + 2]);
1126*4703203dSis 
1127*4703203dSis 					if (U8_HANGUL_JAMO_T(u2)) {
1128*4703203dSis 						u1 += u2 -
1129*4703203dSis 						    U8_HANGUL_JAMO_T_FIRST;
1130*4703203dSis 						i++;
1131*4703203dSis 					}
1132*4703203dSis 				}
1133*4703203dSis 
1134*4703203dSis 				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
				/* Compensate for the i++ of the for() loop. */
1135*4703203dSis 				i--;
1136*4703203dSis 				l += 3;
1137*4703203dSis 				continue;
1138*4703203dSis 			}
1139*4703203dSis 		}
1140*4703203dSis 
1141*4703203dSis 		/*
1142*4703203dSis 		 * Let's then find out if this Starter has composition
1143*4703203dSis 		 * mapping.
1144*4703203dSis 		 */
1145*4703203dSis 		p = find_composition_start(uv, s + start[i], disp[i]);
1146*4703203dSis 		if (p == NULL)
1147*4703203dSis 			goto SAVE_THE_CHAR;
1148*4703203dSis 
1149*4703203dSis 		/*
1150*4703203dSis 		 * We have a Starter with composition mapping and the next
1151*4703203dSis 		 * character is a non-Starter. Let's try to find out if
1152*4703203dSis 		 * we can do composition.
		 *
		 * We remember the mapping entry, the index of the Starter,
		 * and the output position so that we can retry with other
		 * marks or fall back to copying the original Starter.
1153*4703203dSis 		 */
1154*4703203dSis 
1155*4703203dSis 		saved_p = p;
1156*4703203dSis 		saved_i = i;
1157*4703203dSis 		saved_l = l;
1158*4703203dSis 		saved_marks_count = 0;
1159*4703203dSis 
1160*4703203dSis TRY_THE_NEXT_MARK:
1161*4703203dSis 		q = s + start[++i];
1162*4703203dSis 		size = disp[i];
1163*4703203dSis 
1164*4703203dSis 		/*
1165*4703203dSis 		 * The next for() loop compares the non-Starter pointed by
1166*4703203dSis 		 * 'q' with the possible (joinable) characters pointed by 'p'.
1167*4703203dSis 		 *
1168*4703203dSis 		 * The composition final table entry pointed by the 'p'
1169*4703203dSis 		 * looks like the following:
1170*4703203dSis 		 *
1171*4703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1172*4703203dSis 		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
1173*4703203dSis 		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
1174*4703203dSis 		 *
1175*4703203dSis 		 * where C is the count byte indicating the number of
1176*4703203dSis 		 * mapping pairs where each pair would look like
1177*4703203dSis 		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
1178*4703203dSis 		 * character of a canonical decomposition and the B0-Bm are
1179*4703203dSis 		 * the bytes of a matching composite character. The F is
1180*4703203dSis 		 * a filler byte after each character as the separator.
1181*4703203dSis 		 */
1182*4703203dSis 
1183*4703203dSis 		match_not_found = B_TRUE;
1184*4703203dSis 
1185*4703203dSis 		for (C = *p++; C > 0; C--) {
1186*4703203dSis 			for (k = 0; k < size; p++, k++)
1187*4703203dSis 				if (*p != q[k])
1188*4703203dSis 					break;
1189*4703203dSis 
1190*4703203dSis 			/* Have we found it? */
1191*4703203dSis 			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
1192*4703203dSis 				match_not_found = B_FALSE;
1193*4703203dSis 
				/*
				 * Write the composite over whatever has been
				 * written for this Starter so far.
				 */
1194*4703203dSis 				l = saved_l;
1195*4703203dSis 
1196*4703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
1197*4703203dSis 					t[l++] = *p;
1198*4703203dSis 
1199*4703203dSis 				break;
1200*4703203dSis 			}
1201*4703203dSis 
1202*4703203dSis 			/* We didn't find; skip to the next pair. */
1203*4703203dSis 			if (*p != U8_TBL_ELEMENT_FILLER)
1204*4703203dSis 				while (*++p != U8_TBL_ELEMENT_FILLER)
1205*4703203dSis 					;
1206*4703203dSis 			while (*++p != U8_TBL_ELEMENT_FILLER)
1207*4703203dSis 				;
1208*4703203dSis 			p++;
1209*4703203dSis 		}
1210*4703203dSis 
1211*4703203dSis 		/*
1212*4703203dSis 		 * If there was no match, we will need to save the combining
1213*4703203dSis 		 * mark for later appending. After that, if the next one
1214*4703203dSis 		 * is a non-Starter and not blocked, then, we try once
1215*4703203dSis 		 * again to do composition with the next non-Starter.
1216*4703203dSis 		 *
1217*4703203dSis 		 * If there was no match and this was a Starter, then,
1218*4703203dSis 		 * this is a new start.
1219*4703203dSis 		 *
1220*4703203dSis 		 * If there was a match and a composition done and we have
1221*4703203dSis 		 * more to check on, then, we retrieve a new composition final
1222*4703203dSis 		 * table entry for the composite and then try to do the
1223*4703203dSis 		 * composition again.
1224*4703203dSis 		 */
1225*4703203dSis 
1226*4703203dSis 		if (match_not_found) {
1227*4703203dSis 			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
1228*4703203dSis 				i--;
1229*4703203dSis 				goto SAVE_THE_CHAR;
1230*4703203dSis 			}
1231*4703203dSis 
1232*4703203dSis 			saved_marks[saved_marks_count++] = i;
1233*4703203dSis 		}
1234*4703203dSis 
		/*
		 * The saved_l == l means that nothing has been written for
		 * this Starter yet, i.e., no composition has been done.
		 */
1235*4703203dSis 		if (saved_l == l) {
1236*4703203dSis 			while (i < last) {
1237*4703203dSis 				if (blocked(comb_class, i + 1))
1238*4703203dSis 					saved_marks[saved_marks_count++] = ++i;
1239*4703203dSis 				else
1240*4703203dSis 					break;
1241*4703203dSis 			}
1242*4703203dSis 			if (i < last) {
1243*4703203dSis 				p = saved_p;
1244*4703203dSis 				goto TRY_THE_NEXT_MARK;
1245*4703203dSis 			}
1246*4703203dSis 		} else if (i < last) {
1247*4703203dSis 			p = find_composition_start(uv, t + saved_l,
1248*4703203dSis 			    l - saved_l);
1249*4703203dSis 			if (p != NULL) {
1250*4703203dSis 				saved_p = p;
1251*4703203dSis 				goto TRY_THE_NEXT_MARK;
1252*4703203dSis 			}
1253*4703203dSis 		}
1254*4703203dSis 
1255*4703203dSis 		/*
1256*4703203dSis 		 * There is no more composition possible.
1257*4703203dSis 		 *
1258*4703203dSis 		 * If there was no composition whatsoever then we copy
1259*4703203dSis 		 * over the original Starter and then append any non-Starters
1260*4703203dSis 		 * remaining at the target string sequentially after that.
1261*4703203dSis 		 */
1262*4703203dSis 
1263*4703203dSis 		if (saved_l == l) {
1264*4703203dSis 			p = s + start[saved_i];
1265*4703203dSis 			size = disp[saved_i];
1266*4703203dSis 			for (j = 0; j < size; j++)
1267*4703203dSis 				t[l++] = *p++;
1268*4703203dSis 		}
1269*4703203dSis 
1270*4703203dSis 		for (k = 0; k < saved_marks_count; k++) {
1271*4703203dSis 			p = s + start[saved_marks[k]];
1272*4703203dSis 			size = disp[saved_marks[k]];
1273*4703203dSis 			for (j = 0; j < size; j++)
1274*4703203dSis 				t[l++] = *p++;
1275*4703203dSis 		}
1276*4703203dSis 	}
1277*4703203dSis 
1278*4703203dSis 	/*
1279*4703203dSis 	 * If the last character is a Starter and if we have a character
1280*4703203dSis 	 * (possibly another Starter) that can be turned into a composite,
1281*4703203dSis 	 * we do so and we do so until there is no more of composition
1282*4703203dSis 	 * possible.
	 *
	 * The candidates are read from the source at *os. On the way out,
	 * *os is set to the first character that was not consumed.
1283*4703203dSis 	 */
1284*4703203dSis 	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
1285*4703203dSis 		p = *os;
		/* The last Starter occupies the tail of the t from here. */
1286*4703203dSis 		saved_l = l - disp[last];
1287*4703203dSis 
1288*4703203dSis 		while (p < oslast) {
			/*
			 * We stop at a single byte or illegal character and
			 * at a character that is cut short by the oslast.
			 */
1289*4703203dSis 			size = u8_number_of_bytes[*p];
1290*4703203dSis 			if (size <= 1 || (p + size) > oslast)
1291*4703203dSis 				break;
1292*4703203dSis 
1293*4703203dSis 			saved_p = p;
1294*4703203dSis 
1295*4703203dSis 			for (i = 0; i < size; i++)
1296*4703203dSis 				tc[i] = *p++;
1297*4703203dSis 
1298*4703203dSis 			q = find_composition_start(uv, t + saved_l,
1299*4703203dSis 			    l - saved_l);
1300*4703203dSis 			if (q == NULL) {
1301*4703203dSis 				p = saved_p;
1302*4703203dSis 				break;
1303*4703203dSis 			}
1304*4703203dSis 
1305*4703203dSis 			match_not_found = B_TRUE;
1306*4703203dSis 
			/* Same pair search as above; 'q' walks the entry. */
1307*4703203dSis 			for (C = *q++; C > 0; C--) {
1308*4703203dSis 				for (k = 0; k < size; q++, k++)
1309*4703203dSis 					if (*q != tc[k])
1310*4703203dSis 						break;
1311*4703203dSis 
1312*4703203dSis 				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
1313*4703203dSis 					match_not_found = B_FALSE;
1314*4703203dSis 
1315*4703203dSis 					l = saved_l;
1316*4703203dSis 
1317*4703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER) {
1318*4703203dSis 						/*
1319*4703203dSis 						 * This is practically
1320*4703203dSis 						 * impossible but we don't
1321*4703203dSis 						 * want to take any chances.
1322*4703203dSis 						 */
1323*4703203dSis 						if (l >=
1324*4703203dSis 						    U8_STREAM_SAFE_TEXT_MAX) {
1325*4703203dSis 							p = saved_p;
1326*4703203dSis 							goto SAFE_RETURN;
1327*4703203dSis 						}
1328*4703203dSis 						t[l++] = *q;
1329*4703203dSis 					}
1330*4703203dSis 
1331*4703203dSis 					break;
1332*4703203dSis 				}
1333*4703203dSis 
1334*4703203dSis 				if (*q != U8_TBL_ELEMENT_FILLER)
1335*4703203dSis 					while (*++q != U8_TBL_ELEMENT_FILLER)
1336*4703203dSis 						;
1337*4703203dSis 				while (*++q != U8_TBL_ELEMENT_FILLER)
1338*4703203dSis 					;
1339*4703203dSis 				q++;
1340*4703203dSis 			}
1341*4703203dSis 
			/* No match; give the character back to the source. */
1342*4703203dSis 			if (match_not_found) {
1343*4703203dSis 				p = saved_p;
1344*4703203dSis 				break;
1345*4703203dSis 			}
1346*4703203dSis 		}
1347*4703203dSis SAFE_RETURN:
1348*4703203dSis 		*os = p;
1349*4703203dSis 	}
1350*4703203dSis 
1351*4703203dSis 	/*
1352*4703203dSis 	 * Now we copy over the temporary string to the target string.
1353*4703203dSis 	 * Since composition always reduces the number of characters or
1354*4703203dSis 	 * the number of characters stays the same, we don't need to worry
1355*4703203dSis 	 * about the buffer overflow here.
1356*4703203dSis 	 */
1357*4703203dSis 	for (i = 0; i < l; i++)
1358*4703203dSis 		s[i] = t[i];
1359*4703203dSis 	s[l] = '\0';
1360*4703203dSis 
1361*4703203dSis 	return (l);
1362*4703203dSis }
1363*4703203dSis 
1364*4703203dSis /*
1365*4703203dSis  * The collect_a_seq() function checks on the given string s, collects
1366*4703203dSis  * a sequence of characters at u8s, and returns the sequence. While it
1367*4703203dSis  * collects a sequence, it also applies case conversion, canonical or
1368*4703203dSis  * compatibility decomposition, canonical composition, or some or all of
1369*4703203dSis  * them and in that order.
1370*4703203dSis  *
1371*4703203dSis  * The collected sequence cannot be bigger than 32 characters since if
1372*4703203dSis  * it is having more than 31 characters, the sequence will be terminated
1373*4703203dSis  * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
1374*4703203dSis  * a Stream-Safe Text. The collected sequence is always terminated with
1375*4703203dSis  * a null byte and the return value is the byte length of the sequence,
1376*4703203dSis  * which can be 0. The return value does not include the terminating
1377*4703203dSis  * null byte.
1378*4703203dSis  */
1379*4703203dSis static size_t
1380*4703203dSis collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
1381*4703203dSis 	boolean_t is_it_toupper,
1382*4703203dSis 	boolean_t is_it_tolower,
1383*4703203dSis 	boolean_t canonical_decomposition,
1384*4703203dSis 	boolean_t compatibility_decomposition,
1385*4703203dSis 	boolean_t canonical_composition,
1386*4703203dSis 	int *errno, u8_normalization_states_t *state)
1387*4703203dSis {
1388*4703203dSis 	uchar_t *s;
1389*4703203dSis 	int sz;
1390*4703203dSis 	int saved_sz;
1391*4703203dSis 	size_t i;
1392*4703203dSis 	size_t j;
1393*4703203dSis 	size_t k;
1394*4703203dSis 	size_t l;
1395*4703203dSis 	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
1396*4703203dSis 	uchar_t disp[U8_MAX_CHARS_A_SEQ];
1397*4703203dSis 	uchar_t start[U8_MAX_CHARS_A_SEQ];
1398*4703203dSis 	uchar_t u8t[U8_MB_CUR_MAX];
1399*4703203dSis 	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
1400*4703203dSis 	uchar_t tc;
1401*4703203dSis 	size_t last;
1402*4703203dSis 	size_t saved_last;
1403*4703203dSis 	uint32_t u1;
1404*4703203dSis 
1405*4703203dSis 	/*
1406*4703203dSis 	 * Save the source string pointer which we will return a changed
1407*4703203dSis 	 * pointer if we do processing.
1408*4703203dSis 	 */
1409*4703203dSis 	s = *source;
1410*4703203dSis 
1411*4703203dSis 	/*
1412*4703203dSis 	 * The following is a fallback for just in case callers are not
1413*4703203dSis 	 * checking the string boundaries before the calling.
1414*4703203dSis 	 */
1415*4703203dSis 	if (s >= slast) {
1416*4703203dSis 		u8s[0] = '\0';
1417*4703203dSis 
1418*4703203dSis 		return (0);
1419*4703203dSis 	}
1420*4703203dSis 
1421*4703203dSis 	/*
1422*4703203dSis 	 * As the first thing, let's collect a character and do case
1423*4703203dSis 	 * conversion if necessary.
1424*4703203dSis 	 */
1425*4703203dSis 
1426*4703203dSis 	sz = u8_number_of_bytes[*s];
1427*4703203dSis 
1428*4703203dSis 	if (sz < 0) {
1429*4703203dSis 		*errno = EILSEQ;
1430*4703203dSis 
1431*4703203dSis 		u8s[0] = *s++;
1432*4703203dSis 		u8s[1] = '\0';
1433*4703203dSis 
1434*4703203dSis 		*source = s;
1435*4703203dSis 
1436*4703203dSis 		return (1);
1437*4703203dSis 	}
1438*4703203dSis 
1439*4703203dSis 	if (sz == 1) {
1440*4703203dSis 		if (is_it_toupper)
1441*4703203dSis 			u8s[0] = U8_ASCII_TOUPPER(*s);
1442*4703203dSis 		else if (is_it_tolower)
1443*4703203dSis 			u8s[0] = U8_ASCII_TOLOWER(*s);
1444*4703203dSis 		else
1445*4703203dSis 			u8s[0] = *s;
1446*4703203dSis 		s++;
1447*4703203dSis 		u8s[1] = '\0';
1448*4703203dSis 	} else if ((s + sz) > slast) {
1449*4703203dSis 		*errno = EINVAL;
1450*4703203dSis 
1451*4703203dSis 		for (i = 0; s < slast; )
1452*4703203dSis 			u8s[i++] = *s++;
1453*4703203dSis 		u8s[i] = '\0';
1454*4703203dSis 
1455*4703203dSis 		*source = s;
1456*4703203dSis 
1457*4703203dSis 		return (i);
1458*4703203dSis 	} else {
1459*4703203dSis 		if (is_it_toupper || is_it_tolower) {
1460*4703203dSis 			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
1461*4703203dSis 			s += sz;
1462*4703203dSis 			sz = i;
1463*4703203dSis 		} else {
1464*4703203dSis 			for (i = 0; i < sz; )
1465*4703203dSis 				u8s[i++] = *s++;
1466*4703203dSis 			u8s[i] = '\0';
1467*4703203dSis 		}
1468*4703203dSis 	}
1469*4703203dSis 
1470*4703203dSis 	/*
1471*4703203dSis 	 * And then canonical/compatibility decomposition followed by
1472*4703203dSis 	 * an optional canonical composition. Please be noted that
1473*4703203dSis 	 * canonical composition is done only when a decomposition is
1474*4703203dSis 	 * done.
1475*4703203dSis 	 */
1476*4703203dSis 	if (canonical_decomposition || compatibility_decomposition) {
1477*4703203dSis 		if (sz == 1) {
1478*4703203dSis 			*state = U8_STATE_START;
1479*4703203dSis 
1480*4703203dSis 			saved_sz = 1;
1481*4703203dSis 
1482*4703203dSis 			comb_class[0] = 0;
1483*4703203dSis 			start[0] = 0;
1484*4703203dSis 			disp[0] = 1;
1485*4703203dSis 
1486*4703203dSis 			last = 1;
1487*4703203dSis 		} else {
1488*4703203dSis 			saved_sz = do_decomp(uv, u8s, u8s, sz,
1489*4703203dSis 			    canonical_decomposition, state);
1490*4703203dSis 
1491*4703203dSis 			last = 0;
1492*4703203dSis 
1493*4703203dSis 			for (i = 0; i < saved_sz; ) {
1494*4703203dSis 				sz = u8_number_of_bytes[u8s[i]];
1495*4703203dSis 
1496*4703203dSis 				comb_class[last] = combining_class(uv,
1497*4703203dSis 				    u8s + i, sz);
1498*4703203dSis 				start[last] = i;
1499*4703203dSis 				disp[last] = sz;
1500*4703203dSis 
1501*4703203dSis 				last++;
1502*4703203dSis 				i += sz;
1503*4703203dSis 			}
1504*4703203dSis 
1505*4703203dSis 			/*
1506*4703203dSis 			 * Decomposition yields various Hangul related
1507*4703203dSis 			 * states but not on combining marks. We need to
1508*4703203dSis 			 * find out at here by checking on the last
1509*4703203dSis 			 * character.
1510*4703203dSis 			 */
1511*4703203dSis 			if (*state == U8_STATE_START) {
1512*4703203dSis 				if (comb_class[last - 1])
1513*4703203dSis 					*state = U8_STATE_COMBINING_MARK;
1514*4703203dSis 			}
1515*4703203dSis 		}
1516*4703203dSis 
1517*4703203dSis 		saved_last = last;
1518*4703203dSis 
1519*4703203dSis 		while (s < slast) {
1520*4703203dSis 			sz = u8_number_of_bytes[*s];
1521*4703203dSis 
1522*4703203dSis 			/*
1523*4703203dSis 			 * If this is an illegal character, an incomplete
1524*4703203dSis 			 * character, or a 7-bit ASCII Starter character,
1525*4703203dSis 			 * then we have collected a sequence; break and let
1526*4703203dSis 			 * the next call deal with the two cases.
1527*4703203dSis 			 *
1528*4703203dSis 			 * Note that this is okay only if you are using this
1529*4703203dSis 			 * function with a fixed length string, not on
1530*4703203dSis 			 * a buffer with multiple calls of one chunk at a time.
1531*4703203dSis 			 */
1532*4703203dSis 			if (sz <= 1) {
1533*4703203dSis 				break;
1534*4703203dSis 			} else if ((s + sz) > slast) {
1535*4703203dSis 				break;
1536*4703203dSis 			} else {
1537*4703203dSis 				/*
1538*4703203dSis 				 * If the previous character was a Hangul Jamo
1539*4703203dSis 				 * and this character is a Hangul Jamo that
1540*4703203dSis 				 * can be conjoined, we collect the Jamo.
1541*4703203dSis 				 */
1542*4703203dSis 				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
1543*4703203dSis 					U8_PUT_3BYTES_INTO_UTF32(u1,
1544*4703203dSis 					    *s, *(s + 1), *(s + 2));
1545*4703203dSis 
1546*4703203dSis 					if (U8_HANGUL_COMPOSABLE_L_V(*state,
1547*4703203dSis 					    u1)) {
1548*4703203dSis 						i = 0;
1549*4703203dSis 						*state = U8_STATE_HANGUL_LV;
1550*4703203dSis 						goto COLLECT_A_HANGUL;
1551*4703203dSis 					}
1552*4703203dSis 
1553*4703203dSis 					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
1554*4703203dSis 					    u1)) {
1555*4703203dSis 						i = 0;
1556*4703203dSis 						*state = U8_STATE_HANGUL_LVT;
1557*4703203dSis 						goto COLLECT_A_HANGUL;
1558*4703203dSis 					}
1559*4703203dSis 				}
1560*4703203dSis 
1561*4703203dSis 				/*
1562*4703203dSis 				 * Regardless of whatever it was, if this is
1563*4703203dSis 				 * a Starter, we don't collect the character
1564*4703203dSis 				 * since that's a new start and we will deal
1565*4703203dSis 				 * with it at the next time.
1566*4703203dSis 				 */
1567*4703203dSis 				i = combining_class(uv, s, sz);
1568*4703203dSis 				if (i == U8_COMBINING_CLASS_STARTER)
1569*4703203dSis 					break;
1570*4703203dSis 
1571*4703203dSis 				/*
1572*4703203dSis 				 * We know the current character is a combining
1573*4703203dSis 				 * mark. If the previous character wasn't
1574*4703203dSis 				 * a Starter (not Hangul) or a combining mark,
1575*4703203dSis 				 * then, we don't collect this combining mark.
1576*4703203dSis 				 */
1577*4703203dSis 				if (*state != U8_STATE_START &&
1578*4703203dSis 				    *state != U8_STATE_COMBINING_MARK)
1579*4703203dSis 					break;
1580*4703203dSis 
1581*4703203dSis 				*state = U8_STATE_COMBINING_MARK;
1582*4703203dSis COLLECT_A_HANGUL:
1583*4703203dSis 				/*
1584*4703203dSis 				 * If we collected a Starter and combining
1585*4703203dSis 				 * marks up to 30, i.e., total 31 characters,
1586*4703203dSis 				 * then, we terminate this degenerately long
1587*4703203dSis 				 * combining sequence with a U+034F COMBINING
1588*4703203dSis 				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
1589*4703203dSis 				 * UTF-8 and turn this into a Stream-Safe
1590*4703203dSis 				 * Text. This will be extremely rare but
1591*4703203dSis 				 * possible.
1592*4703203dSis 				 *
1593*4703203dSis 				 * The following will also guarantee that
1594*4703203dSis 				 * we are not writing more than 32 characters
1595*4703203dSis 				 * plus a NULL at u8s[].
1596*4703203dSis 				 */
1597*4703203dSis 				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
1598*4703203dSis TURN_STREAM_SAFE:
1599*4703203dSis 					*state = U8_STATE_START;
1600*4703203dSis 					comb_class[last] = 0;
1601*4703203dSis 					start[last] = saved_sz;
1602*4703203dSis 					disp[last] = 2;
1603*4703203dSis 					last++;
1604*4703203dSis 
1605*4703203dSis 					u8s[saved_sz++] = 0xCD;
1606*4703203dSis 					u8s[saved_sz++] = 0x8F;
1607*4703203dSis 
1608*4703203dSis 					break;
1609*4703203dSis 				}
1610*4703203dSis 
1611*4703203dSis 				/*
1612*4703203dSis 				 * Some combining marks also do decompose into
1613*4703203dSis 				 * another combining mark or marks.
1614*4703203dSis 				 */
1615*4703203dSis 				if (*state == U8_STATE_COMBINING_MARK) {
1616*4703203dSis 					k = last;
1617*4703203dSis 					l = sz;
1618*4703203dSis 					i = do_decomp(uv, uts, s, sz,
1619*4703203dSis 					    canonical_decomposition, state);
1620*4703203dSis 					for (j = 0; j < i; ) {
1621*4703203dSis 						sz = u8_number_of_bytes[uts[j]];
1622*4703203dSis 
1623*4703203dSis 						comb_class[last] =
1624*4703203dSis 						    combining_class(uv,
1625*4703203dSis 						    uts + j, sz);
1626*4703203dSis 						start[last] = saved_sz + j;
1627*4703203dSis 						disp[last] = sz;
1628*4703203dSis 
1629*4703203dSis 						last++;
1630*4703203dSis 						if (last >=
1631*4703203dSis 						    U8_UPPER_LIMIT_IN_A_SEQ) {
1632*4703203dSis 							last = k;
1633*4703203dSis 							goto TURN_STREAM_SAFE;
1634*4703203dSis 						}
1635*4703203dSis 						j += sz;
1636*4703203dSis 					}
1637*4703203dSis 
1638*4703203dSis 					*state = U8_STATE_COMBINING_MARK;
1639*4703203dSis 					sz = i;
1640*4703203dSis 					s += l;
1641*4703203dSis 
1642*4703203dSis 					for (i = 0; i < sz; i++)
1643*4703203dSis 						u8s[saved_sz++] = uts[i];
1644*4703203dSis 				} else {
1645*4703203dSis 					comb_class[last] = i;
1646*4703203dSis 					start[last] = saved_sz;
1647*4703203dSis 					disp[last] = sz;
1648*4703203dSis 					last++;
1649*4703203dSis 
1650*4703203dSis 					for (i = 0; i < sz; i++)
1651*4703203dSis 						u8s[saved_sz++] = *s++;
1652*4703203dSis 				}
1653*4703203dSis 
1654*4703203dSis 				/*
1655*4703203dSis 				 * If this is U+0345 COMBINING GREEK
1656*4703203dSis 				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
1657*4703203dSis 				 * iota subscript, and need to be converted to
1658*4703203dSis 				 * uppercase letter, convert it to U+0399 GREEK
1659*4703203dSis 				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
1660*4703203dSis 				 * i.e., convert to capital adscript form as
1661*4703203dSis 				 * specified in the Unicode standard.
1662*4703203dSis 				 *
1663*4703203dSis 				 * This is the only special case of (ambiguous)
1664*4703203dSis 				 * case conversion at combining marks and
1665*4703203dSis 				 * probably the standard will never have
1666*4703203dSis 				 * anything similar like this in future.
1667*4703203dSis 				 */
1668*4703203dSis 				if (is_it_toupper && sz >= 2 &&
1669*4703203dSis 				    u8s[saved_sz - 2] == 0xCD &&
1670*4703203dSis 				    u8s[saved_sz - 1] == 0x85) {
1671*4703203dSis 					u8s[saved_sz - 2] = 0xCE;
1672*4703203dSis 					u8s[saved_sz - 1] = 0x99;
1673*4703203dSis 				}
1674*4703203dSis 			}
1675*4703203dSis 		}
1676*4703203dSis 
1677*4703203dSis 		/*
1678*4703203dSis 		 * Let's try to ensure a canonical ordering for the collected
1679*4703203dSis 		 * combining marks. We do this only if we have collected
1680*4703203dSis 		 * at least one more non-Starter. (The decomposition mapping
1681*4703203dSis 		 * data tables have fully (and recursively) expanded and
1682*4703203dSis 		 * canonically ordered decompositions.)
1683*4703203dSis 		 *
1684*4703203dSis 		 * The U8_SWAP_COMB_MARKS() convenience macro has some
1685*4703203dSis 		 * assumptions and we are meeting the assumptions.
1686*4703203dSis 		 */
1687*4703203dSis 		last--;
1688*4703203dSis 		if (last >= saved_last) {
1689*4703203dSis 			for (i = 0; i < last; i++)
1690*4703203dSis 				for (j = last; j > i; j--)
1691*4703203dSis 					if (comb_class[j] &&
1692*4703203dSis 					    comb_class[j - 1] > comb_class[j]) {
1693*4703203dSis 						U8_SWAP_COMB_MARKS(j - 1, j);
1694*4703203dSis 					}
1695*4703203dSis 		}
1696*4703203dSis 
1697*4703203dSis 		*source = s;
1698*4703203dSis 
1699*4703203dSis 		if (! canonical_composition) {
1700*4703203dSis 			u8s[saved_sz] = '\0';
1701*4703203dSis 			return (saved_sz);
1702*4703203dSis 		}
1703*4703203dSis 
1704*4703203dSis 		/*
1705*4703203dSis 		 * Now do the canonical composition. Note that we do this
1706*4703203dSis 		 * only after a canonical or compatibility decomposition to
1707*4703203dSis 		 * finish up NFC or NFKC.
1708*4703203dSis 		 */
1709*4703203dSis 		sz = do_composition(uv, u8s, comb_class, start, disp, last,
1710*4703203dSis 		    &s, slast);
1711*4703203dSis 	}
1712*4703203dSis 
1713*4703203dSis 	*source = s;
1714*4703203dSis 
1715*4703203dSis 	return ((size_t)sz);
1716*4703203dSis }
1717*4703203dSis 
1718*4703203dSis /*
1719*4703203dSis  * The do_norm_compare() function does string comparison based on Unicode
1720*4703203dSis  * simple case mappings and Unicode Normalization definitions.
1721*4703203dSis  *
1722*4703203dSis  * It does so by collecting one sequence of characters at a time and comparing
1723*4703203dSis  * the collected sequences from the strings.
1724*4703203dSis  *
1725*4703203dSis  * The meanings on the return values are the same as the usual strcmp().
                  *
                  * Arguments:
                  *	uv	Passed through unchanged to collect_a_seq().
                  *	s1, s2	The byte strings to compare.
                  *	n1, n2	Number of bytes to examine in s1 and s2; the end pointers
                  *		are computed as s1 + n1 and s2 + n2.
                  *	flag	Tested against U8_TEXTPREP_TOUPPER, U8_TEXTPREP_TOLOWER,
                  *		U8_CANON_DECOMP, U8_COMPAT_DECOMP, and U8_CANON_COMP.
                  *	errno	Handed to collect_a_seq(); never read here, so the
                  *		comparison carries on whatever value it ends up holding.
                  *
                  * The first differing pair of sequences decides the result: 1 or -1 when
                  * both sequences are a single byte, the strcmp() result otherwise. When
                  * no difference is found before an end is reached, the result is 0 if
                  * both strings are exhausted, -1 if only s1 is, and 1 if only s2 is.
1726*4703203dSis  */
1727*4703203dSis static int
1728*4703203dSis do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
1729*4703203dSis 	int flag, int *errno)
1730*4703203dSis {
1731*4703203dSis 	int result;
1732*4703203dSis 	size_t sz1;
1733*4703203dSis 	size_t sz2;
1734*4703203dSis 	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
1735*4703203dSis 	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
1736*4703203dSis 	uchar_t *s1last;
1737*4703203dSis 	uchar_t *s2last;
1738*4703203dSis 	boolean_t is_it_toupper;
1739*4703203dSis 	boolean_t is_it_tolower;
1740*4703203dSis 	boolean_t canonical_decomposition;
1741*4703203dSis 	boolean_t compatibility_decomposition;
1742*4703203dSis 	boolean_t canonical_composition;
1743*4703203dSis 	u8_normalization_states_t state;
1744*4703203dSis 
1745*4703203dSis 	s1last = s1 + n1;
1746*4703203dSis 	s2last = s2 + n2;
1747*4703203dSis 
1748*4703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1749*4703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1750*4703203dSis 	canonical_decomposition = flag & U8_CANON_DECOMP;
1751*4703203dSis 	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
1752*4703203dSis 	canonical_composition = flag & U8_CANON_COMP;
1753*4703203dSis 
1754*4703203dSis 	while (s1 < s1last && s2 < s2last) {
1755*4703203dSis 		/*
1756*4703203dSis 		 * If the current character is a 7-bit ASCII and the last
1757*4703203dSis 		 * character, or, if the current character and the next
1758*4703203dSis 		 * character are both some 7-bit ASCII characters then
1759*4703203dSis 		 * we treat the current character as a sequence.
1760*4703203dSis 		 *
1761*4703203dSis 		 * In any other cases, we need to call collect_a_seq().
1762*4703203dSis 		 */
1763*4703203dSis 
1764*4703203dSis 		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
1765*4703203dSis 		    ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
1766*4703203dSis 			if (is_it_toupper)
1767*4703203dSis 				u8s1[0] = U8_ASCII_TOUPPER(*s1);
1768*4703203dSis 			else if (is_it_tolower)
1769*4703203dSis 				u8s1[0] = U8_ASCII_TOLOWER(*s1);
1770*4703203dSis 			else
1771*4703203dSis 				u8s1[0] = *s1;
1772*4703203dSis 			u8s1[1] = '\0';
1773*4703203dSis 			sz1 = 1;
1774*4703203dSis 			s1++;
1775*4703203dSis 		} else {
                 			/* Every sequence is collected from a fresh state. */
1776*4703203dSis 			state = U8_STATE_START;
1777*4703203dSis 			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
1778*4703203dSis 			    is_it_toupper, is_it_tolower,
1779*4703203dSis 			    canonical_decomposition,
1780*4703203dSis 			    compatibility_decomposition,
1781*4703203dSis 			    canonical_composition, errno, &state);
1782*4703203dSis 		}
1783*4703203dSis 
                 		/* Same treatment for the second string. */
1784*4703203dSis 		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
1785*4703203dSis 		    ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
1786*4703203dSis 			if (is_it_toupper)
1787*4703203dSis 				u8s2[0] = U8_ASCII_TOUPPER(*s2);
1788*4703203dSis 			else if (is_it_tolower)
1789*4703203dSis 				u8s2[0] = U8_ASCII_TOLOWER(*s2);
1790*4703203dSis 			else
1791*4703203dSis 				u8s2[0] = *s2;
1792*4703203dSis 			u8s2[1] = '\0';
1793*4703203dSis 			sz2 = 1;
1794*4703203dSis 			s2++;
1795*4703203dSis 		} else {
1796*4703203dSis 			state = U8_STATE_START;
1797*4703203dSis 			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
1798*4703203dSis 			    is_it_toupper, is_it_tolower,
1799*4703203dSis 			    canonical_decomposition,
1800*4703203dSis 			    compatibility_decomposition,
1801*4703203dSis 			    canonical_composition, errno, &state);
1802*4703203dSis 		}
1803*4703203dSis 
1804*4703203dSis 		/*
1805*4703203dSis 		 * Now compare the two characters. If they are the same,
1806*4703203dSis 		 * we move on to the next character sequences.
1807*4703203dSis 		 */
1808*4703203dSis 		if (sz1 == 1 && sz2 == 1) {
                 			/*
                 			 * Both sequences are a single byte, so compare
                 			 * the bytes directly instead of calling strcmp().
                 			 */
1809*4703203dSis 			if (*u8s1 > *u8s2)
1810*4703203dSis 				return (1);
1811*4703203dSis 			if (*u8s1 < *u8s2)
1812*4703203dSis 				return (-1);
1813*4703203dSis 		} else {
1814*4703203dSis 			result = strcmp((const char *)u8s1, (const char *)u8s2);
1815*4703203dSis 			if (result != 0)
1816*4703203dSis 				return (result);
1817*4703203dSis 		}
1818*4703203dSis 	}
1819*4703203dSis 
1820*4703203dSis 	/*
1821*4703203dSis 	 * We compared until the end of either or both strings.
1822*4703203dSis 	 *
1823*4703203dSis 	 * If we reached to or went over the ends for the both, that means
1824*4703203dSis 	 * they are the same.
1825*4703203dSis 	 *
1826*4703203dSis 	 * If we reached only one end, that means the other string has
1827*4703203dSis 	 * something which then can be used to determine the return value.
1828*4703203dSis 	 */
1829*4703203dSis 	if (s1 >= s1last) {
1830*4703203dSis 		if (s2 >= s2last)
1831*4703203dSis 			return (0);
1832*4703203dSis 		return (-1);
1833*4703203dSis 	}
1834*4703203dSis 	return (1);
1835*4703203dSis }
1836*4703203dSis 
1837*4703203dSis /*
1838*4703203dSis  * The u8_strcmp() function compares two UTF-8 strings quite similar to
1839*4703203dSis  * the strcmp(). For the comparison, however, Unicode Normalization specific
1840*4703203dSis  * equivalency and Unicode simple case conversion mappings based equivalency
1841*4703203dSis  * can be requested and checked against.
                  *
                  * Arguments:
                  *	s1, s2	NUL-terminated strings (strcmp()/strlen() are applied).
                  *	n	When non-zero, at most n bytes of each string are
                  *		compared; 0 means the strings are compared in full.
                  *	flag	0 is treated as U8_STRCMP_CS. Otherwise at most one of
                  *		U8_STRCMP_CS, U8_STRCMP_CI_UPPER, and U8_STRCMP_CI_LOWER
                  *		(U8_STRCMP_CS is added when none is given), optionally
                  *		combined with one of U8_STRCMP_NFD, U8_STRCMP_NFC,
                  *		U8_STRCMP_NFKD, and U8_STRCMP_NFKC.
                  *	uv	Requested Unicode version. A value above
                  *		U8_UNICODE_LATEST sets *errno to ERANGE and is replaced
                  *		by U8_UNICODE_LATEST.
                  *	errno	Cleared to 0 on entry. Set to EBADF when flag holds an
                  *		invalid combination, in which case the comparison falls
                  *		back to plain U8_STRCMP_CS.
                  *
                  * Neither ERANGE nor EBADF stops the comparison: a strcmp()-style result
                  * is always returned, so callers have to check *errno separately.
1842*4703203dSis  */
1843*4703203dSis int
1844*4703203dSis u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
1845*4703203dSis 		int *errno)
1846*4703203dSis {
1847*4703203dSis 	int f;
1848*4703203dSis 	size_t n1;
1849*4703203dSis 	size_t n2;
1850*4703203dSis 
1851*4703203dSis 	*errno = 0;
1852*4703203dSis 
1853*4703203dSis 	/*
1854*4703203dSis 	 * Check on the requested Unicode version, case conversion, and
1855*4703203dSis 	 * normalization flag values.
1856*4703203dSis 	 */
1857*4703203dSis 
1858*4703203dSis 	if (uv > U8_UNICODE_LATEST) {
1859*4703203dSis 		*errno = ERANGE;
1860*4703203dSis 		uv = U8_UNICODE_LATEST;
1861*4703203dSis 	}
1862*4703203dSis 
1863*4703203dSis 	if (flag == 0) {
1864*4703203dSis 		flag = U8_STRCMP_CS;
1865*4703203dSis 	} else {
                 		/* Exactly one case-handling mode may be selected. */
1866*4703203dSis 		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
1867*4703203dSis 		    U8_STRCMP_CI_LOWER);
1868*4703203dSis 		if (f == 0) {
1869*4703203dSis 			flag |= U8_STRCMP_CS;
1870*4703203dSis 		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
1871*4703203dSis 		    f != U8_STRCMP_CI_LOWER) {
1872*4703203dSis 			*errno = EBADF;
1873*4703203dSis 			flag = U8_STRCMP_CS;
1874*4703203dSis 		}
1875*4703203dSis 
                 		/* The normalization bits must form one known form. */
1876*4703203dSis 		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1877*4703203dSis 		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
1878*4703203dSis 		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
1879*4703203dSis 			*errno = EBADF;
1880*4703203dSis 			flag = U8_STRCMP_CS;
1881*4703203dSis 		}
1882*4703203dSis 	}
1883*4703203dSis 
                 	/*
                 	 * Case sensitive with no normalization requested: the C library
                 	 * comparison functions are sufficient.
                 	 */
1884*4703203dSis 	if (flag == U8_STRCMP_CS) {
1885*4703203dSis 		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
1886*4703203dSis 	}
1887*4703203dSis 
                 	/* Bound both lengths by n when a byte limit was supplied. */
1888*4703203dSis 	n1 = strlen(s1);
1889*4703203dSis 	n2 = strlen(s2);
1890*4703203dSis 	if (n != 0) {
1891*4703203dSis 		if (n < n1)
1892*4703203dSis 			n1 = n;
1893*4703203dSis 		if (n < n2)
1894*4703203dSis 			n2 = n;
1895*4703203dSis 	}
1896*4703203dSis 
1897*4703203dSis 	/*
1898*4703203dSis 	 * Simple case conversion can be done much faster and so we do
1899*4703203dSis 	 * them separately here.
1900*4703203dSis 	 */
1901*4703203dSis 	if (flag == U8_STRCMP_CI_UPPER) {
1902*4703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1903*4703203dSis 		    n1, n2, B_TRUE, errno));
1904*4703203dSis 	} else if (flag == U8_STRCMP_CI_LOWER) {
1905*4703203dSis 		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
1906*4703203dSis 		    n1, n2, B_FALSE, errno));
1907*4703203dSis 	}
1908*4703203dSis 
                 	/* Anything left carries a normalization request. */
1909*4703203dSis 	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
1910*4703203dSis 	    flag, errno));
1911*4703203dSis }
1912*4703203dSis 
                 /*
                  * The u8_textprep_str() function prepares the text in inarray and writes
                  * the result to outarray, applying the case conversion and/or Unicode
                  * Normalization selected by flag.
                  *
                  * Arguments:
                  *	inarray		Input bytes. If NULL, 0 is returned right away.
                  *	inlen		In: number of input bytes (0 returns 0 right away).
                  *			Out: number of input bytes not consumed.
                  *	outarray	Output buffer. NULL fails with E2BIG.
                  *	outlen		In: size of outarray in bytes.
                  *			Out: number of bytes still free in outarray.
                  *	flag		U8_TEXTPREP_TOUPPER or U8_TEXTPREP_TOLOWER (not both),
                  *			U8_TEXTPREP_IGNORE_NULL, U8_TEXTPREP_IGNORE_INVALID,
                  *			and at most one of U8_TEXTPREP_NFD, U8_TEXTPREP_NFC,
                  *			U8_TEXTPREP_NFKD, and U8_TEXTPREP_NFKC.
                  *	unicode_version	Must not exceed U8_UNICODE_LATEST.
                  *	errno		Receives the error code on failure.
                  *
                  * Returns (size_t)-1 with *errno set on failure:
                  *	ERANGE	unicode_version is greater than U8_UNICODE_LATEST.
                  *	EBADF	flag asks for both case conversions or holds an invalid
                  *		normalization combination.
                  *	E2BIG	outarray is NULL or has no room for the next output.
                  *	EILSEQ	(non-normalizing path) an illegal byte was found and
                  *		U8_TEXTPREP_IGNORE_INVALID is not set.
                  *	EINVAL	(non-normalizing path) the input ends in an incomplete
                  *		character and U8_TEXTPREP_IGNORE_INVALID is not set.
                  * In the normalizing path, any non-zero *errno left by collect_a_seq()
                  * is treated as a failure unless U8_TEXTPREP_IGNORE_INVALID is set.
                  * For ERANGE, EBADF, and a NULL outarray the function returns before
                  * *inlen and *outlen are updated.
                  *
                  * Otherwise the return value is the number of illegal or incomplete
                  * characters the non-normalizing path copied through under
                  * U8_TEXTPREP_IGNORE_INVALID; the normalizing path never increments it.
                  *
                  * Unless U8_TEXTPREP_IGNORE_NULL is set, processing stops at the first
                  * NUL byte, which is left unconsumed. No terminating NUL is appended to
                  * outarray. *errno is not cleared on entry; only the normalizing path
                  * resets it, before each collect_a_seq() call.
                  */
1913*4703203dSis size_t
1914*4703203dSis u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
1915*4703203dSis 	int flag, size_t unicode_version, int *errno)
1916*4703203dSis {
1917*4703203dSis 	int f;
1918*4703203dSis 	int sz;
1919*4703203dSis 	uchar_t *ib;
1920*4703203dSis 	uchar_t *ibtail;
1921*4703203dSis 	uchar_t *ob;
1922*4703203dSis 	uchar_t *obtail;
1923*4703203dSis 	boolean_t do_not_ignore_null;
1924*4703203dSis 	boolean_t do_not_ignore_invalid;
1925*4703203dSis 	boolean_t is_it_toupper;
1926*4703203dSis 	boolean_t is_it_tolower;
1927*4703203dSis 	boolean_t canonical_decomposition;
1928*4703203dSis 	boolean_t compatibility_decomposition;
1929*4703203dSis 	boolean_t canonical_composition;
1930*4703203dSis 	size_t ret_val;
1931*4703203dSis 	size_t i;
1932*4703203dSis 	size_t j;
1933*4703203dSis 	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
1934*4703203dSis 	u8_normalization_states_t state;
1935*4703203dSis 
1936*4703203dSis 	if (unicode_version > U8_UNICODE_LATEST) {
1937*4703203dSis 		*errno = ERANGE;
1938*4703203dSis 		return ((size_t)-1);
1939*4703203dSis 	}
1940*4703203dSis 
                 	/* Upper and lower case conversion are mutually exclusive. */
1941*4703203dSis 	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
1942*4703203dSis 	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
1943*4703203dSis 		*errno = EBADF;
1944*4703203dSis 		return ((size_t)-1);
1945*4703203dSis 	}
1946*4703203dSis 
                 	/*
                 	 * The normalization bits must form one known form. Note that f
                 	 * keeps this value and selects the processing path further down.
                 	 */
1947*4703203dSis 	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
1948*4703203dSis 	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
1949*4703203dSis 	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
1950*4703203dSis 		*errno = EBADF;
1951*4703203dSis 		return ((size_t)-1);
1952*4703203dSis 	}
1953*4703203dSis 
1954*4703203dSis 	if (inarray == NULL || *inlen == 0)
1955*4703203dSis 		return (0);
1956*4703203dSis 
1957*4703203dSis 	if (outarray == NULL) {
1958*4703203dSis 		*errno = E2BIG;
1959*4703203dSis 		return ((size_t)-1);
1960*4703203dSis 	}
1961*4703203dSis 
1962*4703203dSis 	ib = (uchar_t *)inarray;
1963*4703203dSis 	ob = (uchar_t *)outarray;
1964*4703203dSis 	ibtail = ib + *inlen;
1965*4703203dSis 	obtail = ob + *outlen;
1966*4703203dSis 
1967*4703203dSis 	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
1968*4703203dSis 	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
1969*4703203dSis 	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
1970*4703203dSis 	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
1971*4703203dSis 
1972*4703203dSis 	ret_val = 0;
1973*4703203dSis 
1974*4703203dSis 	/*
1975*4703203dSis 	 * If we don't have a normalization flag set, we do the simple case
1976*4703203dSis 	 * conversion based text preparation separately below. Text
1977*4703203dSis 	 * preparation involving Normalization will be done in the else
1978*4703203dSis 	 * block, again, separately since it will take much more time and
1979*4703203dSis 	 * resource than doing simple case conversions.
1980*4703203dSis 	 */
1981*4703203dSis 	if (f == 0) {
1982*4703203dSis 		while (ib < ibtail) {
1983*4703203dSis 			if (*ib == '\0' && do_not_ignore_null)
1984*4703203dSis 				break;
1985*4703203dSis 
1986*4703203dSis 			sz = u8_number_of_bytes[*ib];
1987*4703203dSis 
1988*4703203dSis 			if (sz < 0) {
1989*4703203dSis 				if (do_not_ignore_invalid) {
1990*4703203dSis 					*errno = EILSEQ;
1991*4703203dSis 					ret_val = (size_t)-1;
1992*4703203dSis 					break;
1993*4703203dSis 				}
1994*4703203dSis 
                 				/*
                 				 * Pass the illegal byte through as a
                 				 * one-byte character and count it.
                 				 */
1995*4703203dSis 				sz = 1;
1996*4703203dSis 				ret_val++;
1997*4703203dSis 			}
1998*4703203dSis 
1999*4703203dSis 			if (sz == 1) {
2000*4703203dSis 				if (ob >= obtail) {
2001*4703203dSis 					*errno = E2BIG;
2002*4703203dSis 					ret_val = (size_t)-1;
2003*4703203dSis 					break;
2004*4703203dSis 				}
2005*4703203dSis 
2006*4703203dSis 				if (is_it_toupper)
2007*4703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
2008*4703203dSis 				else if (is_it_tolower)
2009*4703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
2010*4703203dSis 				else
2011*4703203dSis 					*ob = *ib;
2012*4703203dSis 				ib++;
2013*4703203dSis 				ob++;
2014*4703203dSis 			} else if ((ib + sz) > ibtail) {
2015*4703203dSis 				if (do_not_ignore_invalid) {
2016*4703203dSis 					*errno = EINVAL;
2017*4703203dSis 					ret_val = (size_t)-1;
2018*4703203dSis 					break;
2019*4703203dSis 				}
2020*4703203dSis 
2021*4703203dSis 				if ((obtail - ob) < (ibtail - ib)) {
2022*4703203dSis 					*errno = E2BIG;
2023*4703203dSis 					ret_val = (size_t)-1;
2024*4703203dSis 					break;
2025*4703203dSis 				}
2026*4703203dSis 
2027*4703203dSis 				/*
2028*4703203dSis 				 * We treat the remaining incomplete character
2029*4703203dSis 				 * bytes as a character.
2030*4703203dSis 				 */
2031*4703203dSis 				ret_val++;
2032*4703203dSis 
2033*4703203dSis 				while (ib < ibtail)
2034*4703203dSis 					*ob++ = *ib++;
2035*4703203dSis 			} else {
2036*4703203dSis 				if (is_it_toupper || is_it_tolower) {
                 					/*
                 					 * Convert into the scratch buffer
                 					 * first; ib is advanced only once the
                 					 * result is known to fit.
                 					 */
2037*4703203dSis 					i = do_case_conv(unicode_version, u8s,
2038*4703203dSis 					    ib, sz, is_it_toupper);
2039*4703203dSis 
2040*4703203dSis 					if ((obtail - ob) < i) {
2041*4703203dSis 						*errno = E2BIG;
2042*4703203dSis 						ret_val = (size_t)-1;
2043*4703203dSis 						break;
2044*4703203dSis 					}
2045*4703203dSis 
2046*4703203dSis 					ib += sz;
2047*4703203dSis 
                 					/* sz is reused as the copy index. */
2048*4703203dSis 					for (sz = 0; sz < i; sz++)
2049*4703203dSis 						*ob++ = u8s[sz];
2050*4703203dSis 				} else {
2051*4703203dSis 					if ((obtail - ob) < sz) {
2052*4703203dSis 						*errno = E2BIG;
2053*4703203dSis 						ret_val = (size_t)-1;
2054*4703203dSis 						break;
2055*4703203dSis 					}
2056*4703203dSis 
2057*4703203dSis 					for (i = 0; i < sz; i++)
2058*4703203dSis 						*ob++ = *ib++;
2059*4703203dSis 				}
2060*4703203dSis 			}
2061*4703203dSis 		}
2062*4703203dSis 	} else {
2063*4703203dSis 		canonical_decomposition = flag & U8_CANON_DECOMP;
2064*4703203dSis 		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
2065*4703203dSis 		canonical_composition = flag & U8_CANON_COMP;
2066*4703203dSis 
2067*4703203dSis 		while (ib < ibtail) {
2068*4703203dSis 			if (*ib == '\0' && do_not_ignore_null)
2069*4703203dSis 				break;
2070*4703203dSis 
2071*4703203dSis 			/*
2072*4703203dSis 			 * If the current character is a 7-bit ASCII
2073*4703203dSis 			 * character and it is the last character, or,
2074*4703203dSis 			 * if the current character is a 7-bit ASCII
2075*4703203dSis 			 * character and the next character is also a 7-bit
2076*4703203dSis 			 * ASCII character, then, we copy over this
2077*4703203dSis 			 * character without going through collect_a_seq().
2078*4703203dSis 			 *
2079*4703203dSis 			 * In any other cases, we need to look further with
2080*4703203dSis 			 * the collect_a_seq() function.
2081*4703203dSis 			 */
2082*4703203dSis 			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
2083*4703203dSis 			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
2084*4703203dSis 				if (ob >= obtail) {
2085*4703203dSis 					*errno = E2BIG;
2086*4703203dSis 					ret_val = (size_t)-1;
2087*4703203dSis 					break;
2088*4703203dSis 				}
2089*4703203dSis 
2090*4703203dSis 				if (is_it_toupper)
2091*4703203dSis 					*ob = U8_ASCII_TOUPPER(*ib);
2092*4703203dSis 				else if (is_it_tolower)
2093*4703203dSis 					*ob = U8_ASCII_TOLOWER(*ib);
2094*4703203dSis 				else
2095*4703203dSis 					*ob = *ib;
2096*4703203dSis 				ib++;
2097*4703203dSis 				ob++;
2098*4703203dSis 			} else {
                 				/*
                 				 * Clear *errno so that a non-zero value
                 				 * afterwards can only come from this call.
                 				 */
2099*4703203dSis 				*errno = 0;
2100*4703203dSis 				state = U8_STATE_START;
2101*4703203dSis 
2102*4703203dSis 				j = collect_a_seq(unicode_version, u8s,
2103*4703203dSis 				    &ib, ibtail,
2104*4703203dSis 				    is_it_toupper,
2105*4703203dSis 				    is_it_tolower,
2106*4703203dSis 				    canonical_decomposition,
2107*4703203dSis 				    compatibility_decomposition,
2108*4703203dSis 				    canonical_composition,
2109*4703203dSis 				    errno, &state);
2110*4703203dSis 
                 				/*
                 				 * NOTE(review): collect_a_seq() is handed
                 				 * &ib and appears to advance it past the
                 				 * collected bytes before returning. If so,
                 				 * the two error exits below leave *inlen
                 				 * excluding input for which nothing was
                 				 * written, unlike the non-normalizing path
                 				 * above, which leaves ib on the offending
                 				 * character. Confirm whether callers depend
                 				 * on this before changing it.
                 				 */
2111*4703203dSis 				if (*errno && do_not_ignore_invalid) {
2112*4703203dSis 					ret_val = (size_t)-1;
2113*4703203dSis 					break;
2114*4703203dSis 				}
2115*4703203dSis 
2116*4703203dSis 				if ((obtail - ob) < j) {
2117*4703203dSis 					*errno = E2BIG;
2118*4703203dSis 					ret_val = (size_t)-1;
2119*4703203dSis 					break;
2120*4703203dSis 				}
2121*4703203dSis 
2122*4703203dSis 				for (i = 0; i < j; i++)
2123*4703203dSis 					*ob++ = u8s[i];
2124*4703203dSis 			}
2125*4703203dSis 		}
2126*4703203dSis 	}
2127*4703203dSis 
                 	/* Report what is left of the input and of the output space. */
2128*4703203dSis 	*inlen = ibtail - ib;
2129*4703203dSis 	*outlen = obtail - ob;
2130*4703203dSis 
2131*4703203dSis 	return (ret_val);
2132*4703203dSis }
2133