xref: /titanic_53/usr/src/common/unicode/uconv.c (revision 4703203d9b3e06246d73931f07359a7ef70f47bf)
1*4703203dSis /*
2*4703203dSis  * CDDL HEADER START
3*4703203dSis  *
4*4703203dSis  * The contents of this file are subject to the terms of the
5*4703203dSis  * Common Development and Distribution License (the "License").
6*4703203dSis  * You may not use this file except in compliance with the License.
7*4703203dSis  *
8*4703203dSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*4703203dSis  * or http://www.opensolaris.org/os/licensing.
10*4703203dSis  * See the License for the specific language governing permissions
11*4703203dSis  * and limitations under the License.
12*4703203dSis  *
13*4703203dSis  * When distributing Covered Code, include this CDDL HEADER in each
14*4703203dSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*4703203dSis  * If applicable, add the following below this CDDL HEADER, with the
16*4703203dSis  * fields enclosed by brackets "[]" replaced with your own identifying
17*4703203dSis  * information: Portions Copyright [yyyy] [name of copyright owner]
18*4703203dSis  *
19*4703203dSis  * CDDL HEADER END
20*4703203dSis  */
21*4703203dSis /*
22*4703203dSis  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23*4703203dSis  * Use is subject to license terms.
24*4703203dSis  */
25*4703203dSis 
26*4703203dSis #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*4703203dSis 
28*4703203dSis /*
29*4703203dSis  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30*4703203dSis  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31*4703203dSis  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32*4703203dSis  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33*4703203dSis  * the section 3C man pages.
34*4703203dSis  * Interface stability: Committed
35*4703203dSis  */
36*4703203dSis 
37*4703203dSis #include <sys/types.h>
38*4703203dSis #ifdef	_KERNEL
39*4703203dSis #include <sys/param.h>
40*4703203dSis #include <sys/sysmacros.h>
41*4703203dSis #include <sys/systm.h>
42*4703203dSis #include <sys/debug.h>
43*4703203dSis #include <sys/kmem.h>
44*4703203dSis #include <sys/sunddi.h>
45*4703203dSis #else
46*4703203dSis #include <sys/u8_textprep.h>
47*4703203dSis #endif	/* _KERNEL */
48*4703203dSis #include <sys/byteorder.h>
49*4703203dSis #include <sys/errno.h>
50*4703203dSis 
51*4703203dSis 
52*4703203dSis /*
53*4703203dSis  * The max and min values of high and low surrogate pairs of UTF-16,
54*4703203dSis  * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
55*4703203dSis  */
56*4703203dSis #define	UCONV_U16_HI_MIN	(0xd800U)
57*4703203dSis #define	UCONV_U16_HI_MAX	(0xdbffU)
58*4703203dSis #define	UCONV_U16_LO_MIN	(0xdc00U)
59*4703203dSis #define	UCONV_U16_LO_MAX	(0xdfffU)
60*4703203dSis #define	UCONV_U16_BIT_SHIFT	(0x0400U)
61*4703203dSis #define	UCONV_U16_BIT_MASK	(0x0fffffU)
62*4703203dSis #define	UCONV_U16_START		(0x010000U)
63*4703203dSis 
64*4703203dSis /* The maximum value of Unicode coding space and ASCII coding space. */
65*4703203dSis #define	UCONV_UNICODE_MAX	(0x10ffffU)
66*4703203dSis #define	UCONV_ASCII_MAX		(0x7fU)
67*4703203dSis 
68*4703203dSis /* The mask values for input and output endians. */
69*4703203dSis #define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
70*4703203dSis #define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
71*4703203dSis 
72*4703203dSis /* Native and reversed endian macros. */
73*4703203dSis #ifdef	_BIG_ENDIAN
74*4703203dSis #define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
75*4703203dSis #define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
76*4703203dSis #define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
77*4703203dSis #define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
78*4703203dSis #else
79*4703203dSis #define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
80*4703203dSis #define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
81*4703203dSis #define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
82*4703203dSis #define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
83*4703203dSis #endif	/* _BIG_ENDIAN */
84*4703203dSis 
85*4703203dSis /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
86*4703203dSis #define	UCONV_BOM_NORMAL	(0xfeffU)
87*4703203dSis #define	UCONV_BOM_SWAPPED	(0xfffeU)
88*4703203dSis #define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)
89*4703203dSis 
90*4703203dSis /* UTF-32 boundaries based on UTF-8 character byte lengths. */
91*4703203dSis #define	UCONV_U8_ONE_BYTE	(0x7fU)
92*4703203dSis #define	UCONV_U8_TWO_BYTES	(0x7ffU)
93*4703203dSis #define	UCONV_U8_THREE_BYTES	(0xffffU)
94*4703203dSis #define	UCONV_U8_FOUR_BYTES	(0x10ffffU)
95*4703203dSis 
96*4703203dSis /* The common minimum and maximum values at the UTF-8 character bytes. */
97*4703203dSis #define	UCONV_U8_BYTE_MIN	(0x80U)
98*4703203dSis #define	UCONV_U8_BYTE_MAX	(0xbfU)
99*4703203dSis 
100*4703203dSis /*
101*4703203dSis  * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
102*4703203dSis  * UTF-8 character bytes.
103*4703203dSis  */
104*4703203dSis #define	UCONV_U8_BIT_SHIFT	6
105*4703203dSis #define	UCONV_U8_BIT_MASK	0x3f
106*4703203dSis 
107*4703203dSis /*
108*4703203dSis  * The following vector shows remaining bytes in a UTF-8 character.
109*4703203dSis  * Index will be the first byte of the character.
110*4703203dSis  */
111*4703203dSis static const uchar_t remaining_bytes_tbl[0x100] = {
112*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
113*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
114*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
115*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
116*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
117*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
118*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
119*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
120*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
121*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
122*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
123*4703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
124*4703203dSis 
125*4703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
126*4703203dSis 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
127*4703203dSis 
128*4703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
129*4703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
130*4703203dSis 
131*4703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
132*4703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
133*4703203dSis 
134*4703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
135*4703203dSis 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
136*4703203dSis };
137*4703203dSis 
138*4703203dSis /*
139*4703203dSis  * The following is a vector of bit-masks to get used bits in
140*4703203dSis  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
141*4703203dSis  * the character.
142*4703203dSis  */
143*4703203dSis static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
144*4703203dSis 
145*4703203dSis /*
146*4703203dSis  * The following two vectors are to provide valid minimum and
147*4703203dSis  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
148*4703203dSis  * better illegal sequence checking. The index value must be the value of
149*4703203dSis  * the first byte of the UTF-8 character.
150*4703203dSis  */
151*4703203dSis static const uchar_t valid_min_2nd_byte[0x100] = {
152*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
153*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
154*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
155*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
156*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
157*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
158*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
159*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
160*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
161*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
162*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
163*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
164*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
165*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
166*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
167*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
168*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
169*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
170*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
171*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
172*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
173*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
174*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
175*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
176*4703203dSis 
177*4703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
178*4703203dSis 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
179*4703203dSis 
180*4703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
181*4703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
182*4703203dSis 
183*4703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
184*4703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
185*4703203dSis 
186*4703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
187*4703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
188*4703203dSis 
189*4703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
190*4703203dSis 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
191*4703203dSis 
192*4703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
193*4703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
194*4703203dSis 
195*4703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
196*4703203dSis 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
197*4703203dSis 
198*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
199*4703203dSis };
200*4703203dSis 
201*4703203dSis static const uchar_t valid_max_2nd_byte[0x100] = {
202*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
203*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
204*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
205*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
206*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
207*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
208*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
209*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
210*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
211*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
212*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
213*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
214*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
215*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
216*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
217*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
218*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
219*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
220*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
221*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
222*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
223*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
224*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
225*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
226*4703203dSis 
227*4703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
228*4703203dSis 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
229*4703203dSis 
230*4703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
231*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
232*4703203dSis 
233*4703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
234*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
235*4703203dSis 
236*4703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
237*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
238*4703203dSis 
239*4703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
240*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
241*4703203dSis 
242*4703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
243*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
244*4703203dSis 
245*4703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
246*4703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
247*4703203dSis 
248*4703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
249*4703203dSis };
250*4703203dSis 
251*4703203dSis 
252*4703203dSis static int
253*4703203dSis check_endian(int flag, int *in, int *out)
254*4703203dSis {
255*4703203dSis 	*in = flag & UCONV_IN_ENDIAN_MASKS;
256*4703203dSis 
257*4703203dSis 	/* You cannot have both. */
258*4703203dSis 	if (*in == UCONV_IN_ENDIAN_MASKS)
259*4703203dSis 		return (EBADF);
260*4703203dSis 
261*4703203dSis 	if (*in == 0)
262*4703203dSis 		*in = UCONV_IN_NAT_ENDIAN;
263*4703203dSis 
264*4703203dSis 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
265*4703203dSis 
266*4703203dSis 	/* You cannot have both. */
267*4703203dSis 	if (*out == UCONV_OUT_ENDIAN_MASKS)
268*4703203dSis 		return (EBADF);
269*4703203dSis 
270*4703203dSis 	if (*out == 0)
271*4703203dSis 		*out = UCONV_OUT_NAT_ENDIAN;
272*4703203dSis 
273*4703203dSis 	return (0);
274*4703203dSis }
275*4703203dSis 
276*4703203dSis static boolean_t
277*4703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
278*4703203dSis {
279*4703203dSis 	if (u16l > 0) {
280*4703203dSis 		if (*u16s == UCONV_BOM_NORMAL) {
281*4703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
282*4703203dSis 			return (B_TRUE);
283*4703203dSis 		}
284*4703203dSis 		if (*u16s == UCONV_BOM_SWAPPED) {
285*4703203dSis 			*in = UCONV_IN_REV_ENDIAN;
286*4703203dSis 			return (B_TRUE);
287*4703203dSis 		}
288*4703203dSis 	}
289*4703203dSis 
290*4703203dSis 	return (B_FALSE);
291*4703203dSis }
292*4703203dSis 
293*4703203dSis static boolean_t
294*4703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
295*4703203dSis {
296*4703203dSis 	if (u32l > 0) {
297*4703203dSis 		if (*u32s == UCONV_BOM_NORMAL) {
298*4703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
299*4703203dSis 			return (B_TRUE);
300*4703203dSis 		}
301*4703203dSis 		if (*u32s == UCONV_BOM_SWAPPED_32) {
302*4703203dSis 			*in = UCONV_IN_REV_ENDIAN;
303*4703203dSis 			return (B_TRUE);
304*4703203dSis 		}
305*4703203dSis 	}
306*4703203dSis 
307*4703203dSis 	return (B_FALSE);
308*4703203dSis }
309*4703203dSis 
310*4703203dSis int
311*4703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
312*4703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
313*4703203dSis {
314*4703203dSis 	int inendian;
315*4703203dSis 	int outendian;
316*4703203dSis 	size_t u16l;
317*4703203dSis 	size_t u32l;
318*4703203dSis 	uint32_t hi;
319*4703203dSis 	uint32_t lo;
320*4703203dSis 	boolean_t do_not_ignore_null;
321*4703203dSis 
322*4703203dSis 	/*
323*4703203dSis 	 * Do preliminary validity checks on parameters and collect info on
324*4703203dSis 	 * endians.
325*4703203dSis 	 */
326*4703203dSis 	if (u16s == NULL || utf16len == NULL)
327*4703203dSis 		return (EILSEQ);
328*4703203dSis 
329*4703203dSis 	if (u32s == NULL || utf32len == NULL)
330*4703203dSis 		return (E2BIG);
331*4703203dSis 
332*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
333*4703203dSis 		return (EBADF);
334*4703203dSis 
335*4703203dSis 	/*
336*4703203dSis 	 * Initialize input and output parameter buffer indices and
337*4703203dSis 	 * temporary variables.
338*4703203dSis 	 */
339*4703203dSis 	u16l = u32l = 0;
340*4703203dSis 	hi = 0;
341*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
342*4703203dSis 
343*4703203dSis 	/*
344*4703203dSis 	 * Check on the BOM at the beginning of the input buffer if required
345*4703203dSis 	 * and if there is indeed one, process it.
346*4703203dSis 	 */
347*4703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
348*4703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
349*4703203dSis 		u16l++;
350*4703203dSis 
351*4703203dSis 	/*
352*4703203dSis 	 * Reset inendian and outendian so that after this point, those can be
353*4703203dSis 	 * used as condition values.
354*4703203dSis 	 */
355*4703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
356*4703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
357*4703203dSis 
358*4703203dSis 	/*
359*4703203dSis 	 * If there is something in the input buffer and if necessary and
360*4703203dSis 	 * requested, save the BOM at the output buffer.
361*4703203dSis 	 */
362*4703203dSis 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
363*4703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
364*4703203dSis 		    UCONV_BOM_SWAPPED_32;
365*4703203dSis 
366*4703203dSis 	/*
367*4703203dSis 	 * Do conversion; if encounter a surrogate pair, assemble high and
368*4703203dSis 	 * low pair values to form a UTF-32 character. If a half of a pair
369*4703203dSis 	 * exists alone, then, either it is an illegal (EILSEQ) or
370*4703203dSis 	 * invalid (EINVAL) value.
371*4703203dSis 	 */
372*4703203dSis 	for (; u16l < *utf16len; u16l++) {
373*4703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
374*4703203dSis 			break;
375*4703203dSis 
376*4703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
377*4703203dSis 
378*4703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
379*4703203dSis 			if (hi)
380*4703203dSis 				return (EILSEQ);
381*4703203dSis 			hi = lo;
382*4703203dSis 			continue;
383*4703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
384*4703203dSis 			if (! hi)
385*4703203dSis 				return (EILSEQ);
386*4703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
387*4703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
388*4703203dSis 			    + UCONV_U16_START;
389*4703203dSis 			hi = 0;
390*4703203dSis 		} else if (hi) {
391*4703203dSis 			return (EILSEQ);
392*4703203dSis 		}
393*4703203dSis 
394*4703203dSis 		if (u32l >= *utf32len)
395*4703203dSis 			return (E2BIG);
396*4703203dSis 
397*4703203dSis 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
398*4703203dSis 	}
399*4703203dSis 
400*4703203dSis 	/*
401*4703203dSis 	 * If high half didn't see low half, then, it's most likely the input
402*4703203dSis 	 * parameter is incomplete.
403*4703203dSis 	 */
404*4703203dSis 	if (hi)
405*4703203dSis 		return (EINVAL);
406*4703203dSis 
407*4703203dSis 	/*
408*4703203dSis 	 * Save the number of consumed and saved characters. They do not
409*4703203dSis 	 * include terminating NULL character (U+0000) at the end of
410*4703203dSis 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
411*4703203dSis 	 * the input buffer length is big enough to include the terminating
412*4703203dSis 	 * NULL character).
413*4703203dSis 	 */
414*4703203dSis 	*utf16len = u16l;
415*4703203dSis 	*utf32len = u32l;
416*4703203dSis 
417*4703203dSis 	return (0);
418*4703203dSis }
419*4703203dSis 
420*4703203dSis int
421*4703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
422*4703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
423*4703203dSis {
424*4703203dSis 	int inendian;
425*4703203dSis 	int outendian;
426*4703203dSis 	size_t u16l;
427*4703203dSis 	size_t u8l;
428*4703203dSis 	uint32_t hi;
429*4703203dSis 	uint32_t lo;
430*4703203dSis 	boolean_t do_not_ignore_null;
431*4703203dSis 
432*4703203dSis 	if (u16s == NULL || utf16len == NULL)
433*4703203dSis 		return (EILSEQ);
434*4703203dSis 
435*4703203dSis 	if (u8s == NULL || utf8len == NULL)
436*4703203dSis 		return (E2BIG);
437*4703203dSis 
438*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
439*4703203dSis 		return (EBADF);
440*4703203dSis 
441*4703203dSis 	u16l = u8l = 0;
442*4703203dSis 	hi = 0;
443*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
444*4703203dSis 
445*4703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
446*4703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
447*4703203dSis 		u16l++;
448*4703203dSis 
449*4703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
450*4703203dSis 
451*4703203dSis 	for (; u16l < *utf16len; u16l++) {
452*4703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
453*4703203dSis 			break;
454*4703203dSis 
455*4703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
456*4703203dSis 
457*4703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
458*4703203dSis 			if (hi)
459*4703203dSis 				return (EILSEQ);
460*4703203dSis 			hi = lo;
461*4703203dSis 			continue;
462*4703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
463*4703203dSis 			if (! hi)
464*4703203dSis 				return (EILSEQ);
465*4703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
466*4703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
467*4703203dSis 			    + UCONV_U16_START;
468*4703203dSis 			hi = 0;
469*4703203dSis 		} else if (hi) {
470*4703203dSis 			return (EILSEQ);
471*4703203dSis 		}
472*4703203dSis 
473*4703203dSis 		/*
474*4703203dSis 		 * Now we convert a UTF-32 character into a UTF-8 character.
475*4703203dSis 		 * Unicode coding space is between U+0000 and U+10FFFF;
476*4703203dSis 		 * anything bigger is an illegal character.
477*4703203dSis 		 */
478*4703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
479*4703203dSis 			if (u8l >= *utf8len)
480*4703203dSis 				return (E2BIG);
481*4703203dSis 			u8s[u8l++] = (uchar_t)lo;
482*4703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
483*4703203dSis 			if ((u8l + 1) >= *utf8len)
484*4703203dSis 				return (E2BIG);
485*4703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
486*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
487*4703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
488*4703203dSis 			if ((u8l + 2) >= *utf8len)
489*4703203dSis 				return (E2BIG);
490*4703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
491*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
492*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
493*4703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
494*4703203dSis 			if ((u8l + 3) >= *utf8len)
495*4703203dSis 				return (E2BIG);
496*4703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
497*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
498*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
499*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
500*4703203dSis 		} else {
501*4703203dSis 			return (EILSEQ);
502*4703203dSis 		}
503*4703203dSis 	}
504*4703203dSis 
505*4703203dSis 	if (hi)
506*4703203dSis 		return (EINVAL);
507*4703203dSis 
508*4703203dSis 	*utf16len = u16l;
509*4703203dSis 	*utf8len = u8l;
510*4703203dSis 
511*4703203dSis 	return (0);
512*4703203dSis }
513*4703203dSis 
514*4703203dSis int
515*4703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
516*4703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
517*4703203dSis {
518*4703203dSis 	int inendian;
519*4703203dSis 	int outendian;
520*4703203dSis 	size_t u16l;
521*4703203dSis 	size_t u32l;
522*4703203dSis 	uint32_t hi;
523*4703203dSis 	uint32_t lo;
524*4703203dSis 	boolean_t do_not_ignore_null;
525*4703203dSis 
526*4703203dSis 	if (u32s == NULL || utf32len == NULL)
527*4703203dSis 		return (EILSEQ);
528*4703203dSis 
529*4703203dSis 	if (u16s == NULL || utf16len == NULL)
530*4703203dSis 		return (E2BIG);
531*4703203dSis 
532*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
533*4703203dSis 		return (EBADF);
534*4703203dSis 
535*4703203dSis 	u16l = u32l = 0;
536*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
537*4703203dSis 
538*4703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
539*4703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
540*4703203dSis 		u32l++;
541*4703203dSis 
542*4703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
543*4703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
544*4703203dSis 
545*4703203dSis 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
546*4703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
547*4703203dSis 		    UCONV_BOM_SWAPPED;
548*4703203dSis 
549*4703203dSis 	for (; u32l < *utf32len; u32l++) {
550*4703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
551*4703203dSis 			break;
552*4703203dSis 
553*4703203dSis 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
554*4703203dSis 
555*4703203dSis 		/*
556*4703203dSis 		 * Anything bigger than the Unicode coding space, i.e.,
557*4703203dSis 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
558*4703203dSis 		 * character.
559*4703203dSis 		 */
560*4703203dSis 		if (hi > UCONV_UNICODE_MAX)
561*4703203dSis 			return (EILSEQ);
562*4703203dSis 
563*4703203dSis 		/*
564*4703203dSis 		 * Anything bigger than U+FFFF must be converted into
565*4703203dSis 		 * a surrogate pair in UTF-16.
566*4703203dSis 		 */
567*4703203dSis 		if (hi >= UCONV_U16_START) {
568*4703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
569*4703203dSis 			    UCONV_U16_LO_MIN;
570*4703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
571*4703203dSis 			    UCONV_U16_HI_MIN;
572*4703203dSis 
573*4703203dSis 			if ((u16l + 1) >= *utf16len)
574*4703203dSis 				return (E2BIG);
575*4703203dSis 
576*4703203dSis 			if (outendian) {
577*4703203dSis 				u16s[u16l++] = (uint16_t)hi;
578*4703203dSis 				u16s[u16l++] = (uint16_t)lo;
579*4703203dSis 			} else {
580*4703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
581*4703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
582*4703203dSis 			}
583*4703203dSis 		} else {
584*4703203dSis 			if (u16l >= *utf16len)
585*4703203dSis 				return (E2BIG);
586*4703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
587*4703203dSis 			    BSWAP_16(((uint16_t)hi));
588*4703203dSis 		}
589*4703203dSis 	}
590*4703203dSis 
591*4703203dSis 	*utf16len = u16l;
592*4703203dSis 	*utf32len = u32l;
593*4703203dSis 
594*4703203dSis 	return (0);
595*4703203dSis }
596*4703203dSis 
597*4703203dSis int
598*4703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
599*4703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
600*4703203dSis {
601*4703203dSis 	int inendian;
602*4703203dSis 	int outendian;
603*4703203dSis 	size_t u32l;
604*4703203dSis 	size_t u8l;
605*4703203dSis 	uint32_t lo;
606*4703203dSis 	boolean_t do_not_ignore_null;
607*4703203dSis 
608*4703203dSis 	if (u32s == NULL || utf32len == NULL)
609*4703203dSis 		return (EILSEQ);
610*4703203dSis 
611*4703203dSis 	if (u8s == NULL || utf8len == NULL)
612*4703203dSis 		return (E2BIG);
613*4703203dSis 
614*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
615*4703203dSis 		return (EBADF);
616*4703203dSis 
617*4703203dSis 	u32l = u8l = 0;
618*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
619*4703203dSis 
620*4703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
621*4703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
622*4703203dSis 		u32l++;
623*4703203dSis 
624*4703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
625*4703203dSis 
626*4703203dSis 	for (; u32l < *utf32len; u32l++) {
627*4703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
628*4703203dSis 			break;
629*4703203dSis 
630*4703203dSis 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
631*4703203dSis 
632*4703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
633*4703203dSis 			if (u8l >= *utf8len)
634*4703203dSis 				return (E2BIG);
635*4703203dSis 			u8s[u8l++] = (uchar_t)lo;
636*4703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
637*4703203dSis 			if ((u8l + 1) >= *utf8len)
638*4703203dSis 				return (E2BIG);
639*4703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
640*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
641*4703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
642*4703203dSis 			if ((u8l + 2) >= *utf8len)
643*4703203dSis 				return (E2BIG);
644*4703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
645*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
646*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
647*4703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
648*4703203dSis 			if ((u8l + 3) >= *utf8len)
649*4703203dSis 				return (E2BIG);
650*4703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
651*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
652*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
653*4703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
654*4703203dSis 		} else {
655*4703203dSis 			return (EILSEQ);
656*4703203dSis 		}
657*4703203dSis 	}
658*4703203dSis 
659*4703203dSis 	*utf32len = u32l;
660*4703203dSis 	*utf8len = u8l;
661*4703203dSis 
662*4703203dSis 	return (0);
663*4703203dSis }
664*4703203dSis 
665*4703203dSis int
666*4703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
667*4703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
668*4703203dSis {
669*4703203dSis 	int inendian;
670*4703203dSis 	int outendian;
671*4703203dSis 	size_t u16l;
672*4703203dSis 	size_t u8l;
673*4703203dSis 	uint32_t hi;
674*4703203dSis 	uint32_t lo;
675*4703203dSis 	int remaining_bytes;
676*4703203dSis 	int first_b;
677*4703203dSis 	boolean_t do_not_ignore_null;
678*4703203dSis 
679*4703203dSis 	if (u8s == NULL || utf8len == NULL)
680*4703203dSis 		return (EILSEQ);
681*4703203dSis 
682*4703203dSis 	if (u16s == NULL || utf16len == NULL)
683*4703203dSis 		return (E2BIG);
684*4703203dSis 
685*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
686*4703203dSis 		return (EBADF);
687*4703203dSis 
688*4703203dSis 	u16l = u8l = 0;
689*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
690*4703203dSis 
691*4703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
692*4703203dSis 
693*4703203dSis 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
694*4703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
695*4703203dSis 		    UCONV_BOM_SWAPPED;
696*4703203dSis 
697*4703203dSis 	for (; u8l < *utf8len; ) {
698*4703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
699*4703203dSis 			break;
700*4703203dSis 
701*4703203dSis 		/*
702*4703203dSis 		 * Collect a UTF-8 character and convert it to a UTF-32
703*4703203dSis 		 * character. In doing so, we screen out illegally formed
704*4703203dSis 		 * UTF-8 characters and treat such as illegal characters.
705*4703203dSis 		 * The algorithm at below also screens out anything bigger
706*4703203dSis 		 * than the U+10FFFF.
707*4703203dSis 		 *
708*4703203dSis 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
709*4703203dSis 		 * more details on the illegal values of UTF-8 character
710*4703203dSis 		 * bytes.
711*4703203dSis 		 */
712*4703203dSis 		hi = (uint32_t)u8s[u8l++];
713*4703203dSis 
714*4703203dSis 		if (hi > UCONV_ASCII_MAX) {
715*4703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
716*4703203dSis 				return (EILSEQ);
717*4703203dSis 
718*4703203dSis 			first_b = hi;
719*4703203dSis 			hi = hi & masks_tbl[remaining_bytes];
720*4703203dSis 
721*4703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
722*4703203dSis 				/*
723*4703203dSis 				 * If we have no more bytes, the current
724*4703203dSis 				 * UTF-8 character is incomplete.
725*4703203dSis 				 */
726*4703203dSis 				if (u8l >= *utf8len)
727*4703203dSis 					return (EINVAL);
728*4703203dSis 
729*4703203dSis 				lo = (uint32_t)u8s[u8l++];
730*4703203dSis 
731*4703203dSis 				if (first_b) {
732*4703203dSis 					if (lo < valid_min_2nd_byte[first_b] ||
733*4703203dSis 					    lo > valid_max_2nd_byte[first_b])
734*4703203dSis 						return (EILSEQ);
735*4703203dSis 					first_b = 0;
736*4703203dSis 				} else if (lo < UCONV_U8_BYTE_MIN ||
737*4703203dSis 				    lo > UCONV_U8_BYTE_MAX) {
738*4703203dSis 					return (EILSEQ);
739*4703203dSis 				}
740*4703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
741*4703203dSis 				    (lo & UCONV_U8_BIT_MASK);
742*4703203dSis 			}
743*4703203dSis 		}
744*4703203dSis 
745*4703203dSis 		if (hi >= UCONV_U16_START) {
746*4703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
747*4703203dSis 			    UCONV_U16_LO_MIN;
748*4703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
749*4703203dSis 			    UCONV_U16_HI_MIN;
750*4703203dSis 
751*4703203dSis 			if ((u16l + 1) >= *utf16len)
752*4703203dSis 				return (E2BIG);
753*4703203dSis 
754*4703203dSis 			if (outendian) {
755*4703203dSis 				u16s[u16l++] = (uint16_t)hi;
756*4703203dSis 				u16s[u16l++] = (uint16_t)lo;
757*4703203dSis 			} else {
758*4703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
759*4703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
760*4703203dSis 			}
761*4703203dSis 		} else {
762*4703203dSis 			if (u16l >= *utf16len)
763*4703203dSis 				return (E2BIG);
764*4703203dSis 
765*4703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
766*4703203dSis 			    BSWAP_16(((uint16_t)hi));
767*4703203dSis 		}
768*4703203dSis 	}
769*4703203dSis 
770*4703203dSis 	*utf16len = u16l;
771*4703203dSis 	*utf8len = u8l;
772*4703203dSis 
773*4703203dSis 	return (0);
774*4703203dSis }
775*4703203dSis 
776*4703203dSis int
777*4703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
778*4703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
779*4703203dSis {
780*4703203dSis 	int inendian;
781*4703203dSis 	int outendian;
782*4703203dSis 	size_t u32l;
783*4703203dSis 	size_t u8l;
784*4703203dSis 	uint32_t hi;
785*4703203dSis 	uint32_t c;
786*4703203dSis 	int remaining_bytes;
787*4703203dSis 	int first_b;
788*4703203dSis 	boolean_t do_not_ignore_null;
789*4703203dSis 
790*4703203dSis 	if (u8s == NULL || utf8len == NULL)
791*4703203dSis 		return (EILSEQ);
792*4703203dSis 
793*4703203dSis 	if (u32s == NULL || utf32len == NULL)
794*4703203dSis 		return (E2BIG);
795*4703203dSis 
796*4703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
797*4703203dSis 		return (EBADF);
798*4703203dSis 
799*4703203dSis 	u32l = u8l = 0;
800*4703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
801*4703203dSis 
802*4703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
803*4703203dSis 
804*4703203dSis 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
805*4703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
806*4703203dSis 		    UCONV_BOM_SWAPPED_32;
807*4703203dSis 
808*4703203dSis 	for (; u8l < *utf8len; ) {
809*4703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
810*4703203dSis 			break;
811*4703203dSis 
812*4703203dSis 		hi = (uint32_t)u8s[u8l++];
813*4703203dSis 
814*4703203dSis 		if (hi > UCONV_ASCII_MAX) {
815*4703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
816*4703203dSis 				return (EILSEQ);
817*4703203dSis 
818*4703203dSis 			first_b = hi;
819*4703203dSis 			hi = hi & masks_tbl[remaining_bytes];
820*4703203dSis 
821*4703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
822*4703203dSis 				if (u8l >= *utf8len)
823*4703203dSis 					return (EINVAL);
824*4703203dSis 
825*4703203dSis 				c = (uint32_t)u8s[u8l++];
826*4703203dSis 
827*4703203dSis 				if (first_b) {
828*4703203dSis 					if (c < valid_min_2nd_byte[first_b] ||
829*4703203dSis 					    c > valid_max_2nd_byte[first_b])
830*4703203dSis 						return (EILSEQ);
831*4703203dSis 					first_b = 0;
832*4703203dSis 				} else if (c < UCONV_U8_BYTE_MIN ||
833*4703203dSis 				    c > UCONV_U8_BYTE_MAX) {
834*4703203dSis 					return (EILSEQ);
835*4703203dSis 				}
836*4703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
837*4703203dSis 				    (c & UCONV_U8_BIT_MASK);
838*4703203dSis 			}
839*4703203dSis 		}
840*4703203dSis 
841*4703203dSis 		if (u32l >= *utf32len)
842*4703203dSis 			return (E2BIG);
843*4703203dSis 
844*4703203dSis 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
845*4703203dSis 	}
846*4703203dSis 
847*4703203dSis 	*utf32len = u32l;
848*4703203dSis 	*utf8len = u8l;
849*4703203dSis 
850*4703203dSis 	return (0);
851*4703203dSis }
852