xref: /illumos-gate/usr/src/common/unicode/uconv.c (revision c40a6cd785e883b3f052b122c332e21174fc1871)
14703203dSis /*
24703203dSis  * CDDL HEADER START
34703203dSis  *
44703203dSis  * The contents of this file are subject to the terms of the
54703203dSis  * Common Development and Distribution License (the "License").
64703203dSis  * You may not use this file except in compliance with the License.
74703203dSis  *
84703203dSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
94703203dSis  * or http://www.opensolaris.org/os/licensing.
104703203dSis  * See the License for the specific language governing permissions
114703203dSis  * and limitations under the License.
124703203dSis  *
134703203dSis  * When distributing Covered Code, include this CDDL HEADER in each
144703203dSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
154703203dSis  * If applicable, add the following below this CDDL HEADER, with the
164703203dSis  * fields enclosed by brackets "[]" replaced with your own identifying
174703203dSis  * information: Portions Copyright [yyyy] [name of copyright owner]
184703203dSis  *
194703203dSis  * CDDL HEADER END
204703203dSis  */
214703203dSis /*
22*15d9d0b5Syy154373  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
234703203dSis  * Use is subject to license terms.
244703203dSis  */
254703203dSis 
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
344703203dSis 
354703203dSis #include <sys/types.h>
364703203dSis #ifdef	_KERNEL
374703203dSis #include <sys/param.h>
384703203dSis #include <sys/sysmacros.h>
394703203dSis #include <sys/systm.h>
404703203dSis #include <sys/debug.h>
414703203dSis #include <sys/kmem.h>
424703203dSis #include <sys/sunddi.h>
434703203dSis #else
444703203dSis #include <sys/u8_textprep.h>
454703203dSis #endif	/* _KERNEL */
464703203dSis #include <sys/byteorder.h>
474703203dSis #include <sys/errno.h>
484703203dSis 
494703203dSis 
/*
 * UTF-16 surrogate constants: the inclusive ranges of the high and the low
 * surrogate code units, the factor (0x400) by which the high half of a pair
 * is scaled relative to the low half, the mask covering the 20 bits that
 * a pair carries, and the first code point that lies outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* Largest value of the Unicode coding space and of the ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/*
 * Masks that select both byte order flags of the input side and of the
 * output side, respectively, out of a conversion flag value.
 */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Byte order flags expressed relative to the machine this file is compiled
 * for: "NAT" is the machine's own byte order, "REV" is the opposite one.
 */
#ifndef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#endif	/* !_BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) as it reads in native byte order, and as it
 * reads when a 16-bit or a 32-bit BOM is seen in the reversed byte order.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * Largest UTF-32 value that fits into a UTF-8 character of one, two,
 * three, and four bytes.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* Smallest and largest value of a UTF-8 continuation byte in general. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * A UTF-8 continuation byte has the bit layout "10xx xxxx": it contributes
 * six payload bits (hence the shift of 6) selected by the mask 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
1044703203dSis 
1054703203dSis /*
1064703203dSis  * The following vector shows remaining bytes in a UTF-8 character.
1074703203dSis  * Index will be the first byte of the character.
1084703203dSis  */
1094703203dSis static const uchar_t remaining_bytes_tbl[0x100] = {
1104703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1114703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1124703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1134703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1144703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1154703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1164703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1174703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1184703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1194703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1204703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1214703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1224703203dSis 
1234703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
1244703203dSis 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1254703203dSis 
1264703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
1274703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1284703203dSis 
1294703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
1304703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
1314703203dSis 
1324703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
1334703203dSis 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1344703203dSis };
1354703203dSis 
1364703203dSis /*
1374703203dSis  * The following is a vector of bit-masks to get used bits in
1384703203dSis  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
1394703203dSis  * the character.
1404703203dSis  */
141*15d9d0b5Syy154373 #ifdef	_KERNEL
142*15d9d0b5Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143*15d9d0b5Syy154373 #else
144*15d9d0b5Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*15d9d0b5Syy154373 #endif	/* _KERNEL */
1464703203dSis 
1474703203dSis /*
1484703203dSis  * The following two vectors are to provide valid minimum and
1494703203dSis  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1504703203dSis  * better illegal sequence checking. The index value must be the value of
1514703203dSis  * the first byte of the UTF-8 character.
1524703203dSis  */
1534703203dSis static const uchar_t valid_min_2nd_byte[0x100] = {
1544703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1554703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1564703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1574703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1584703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1594703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1604703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1614703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1624703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1634703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1644703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1654703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1664703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1674703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1684703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1694703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1704703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1714703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1724703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1734703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1744703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1754703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1764703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1774703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1784703203dSis 
1794703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
1804703203dSis 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1814703203dSis 
1824703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
1834703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1844703203dSis 
1854703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
1864703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1874703203dSis 
1884703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
1894703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1904703203dSis 
1914703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
1924703203dSis 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1934703203dSis 
1944703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
1954703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1964703203dSis 
1974703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
1984703203dSis 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
1994703203dSis 
2004703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
2014703203dSis };
2024703203dSis 
2034703203dSis static const uchar_t valid_max_2nd_byte[0x100] = {
2044703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2054703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2064703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2074703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2084703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2094703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2104703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2114703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2124703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2134703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2144703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2154703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2164703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2174703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2184703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2194703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2204703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2214703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2224703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2234703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2244703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2254703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2264703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2274703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2284703203dSis 
2294703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
2304703203dSis 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2314703203dSis 
2324703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
2334703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2344703203dSis 
2354703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
2364703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2374703203dSis 
2384703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
2394703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2404703203dSis 
2414703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
2424703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2434703203dSis 
2444703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
2454703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2464703203dSis 
2474703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
2484703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
2494703203dSis 
2504703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
2514703203dSis };
2524703203dSis 
2534703203dSis 
2544703203dSis static int
check_endian(int flag,int * in,int * out)2554703203dSis check_endian(int flag, int *in, int *out)
2564703203dSis {
2574703203dSis 	*in = flag & UCONV_IN_ENDIAN_MASKS;
2584703203dSis 
2594703203dSis 	/* You cannot have both. */
2604703203dSis 	if (*in == UCONV_IN_ENDIAN_MASKS)
2614703203dSis 		return (EBADF);
2624703203dSis 
2634703203dSis 	if (*in == 0)
2644703203dSis 		*in = UCONV_IN_NAT_ENDIAN;
2654703203dSis 
2664703203dSis 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
2674703203dSis 
2684703203dSis 	/* You cannot have both. */
2694703203dSis 	if (*out == UCONV_OUT_ENDIAN_MASKS)
2704703203dSis 		return (EBADF);
2714703203dSis 
2724703203dSis 	if (*out == 0)
2734703203dSis 		*out = UCONV_OUT_NAT_ENDIAN;
2744703203dSis 
2754703203dSis 	return (0);
2764703203dSis }
2774703203dSis 
2784703203dSis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2794703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2804703203dSis {
2814703203dSis 	if (u16l > 0) {
2824703203dSis 		if (*u16s == UCONV_BOM_NORMAL) {
2834703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
2844703203dSis 			return (B_TRUE);
2854703203dSis 		}
2864703203dSis 		if (*u16s == UCONV_BOM_SWAPPED) {
2874703203dSis 			*in = UCONV_IN_REV_ENDIAN;
2884703203dSis 			return (B_TRUE);
2894703203dSis 		}
2904703203dSis 	}
2914703203dSis 
2924703203dSis 	return (B_FALSE);
2934703203dSis }
2944703203dSis 
2954703203dSis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2964703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2974703203dSis {
2984703203dSis 	if (u32l > 0) {
2994703203dSis 		if (*u32s == UCONV_BOM_NORMAL) {
3004703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
3014703203dSis 			return (B_TRUE);
3024703203dSis 		}
3034703203dSis 		if (*u32s == UCONV_BOM_SWAPPED_32) {
3044703203dSis 			*in = UCONV_IN_REV_ENDIAN;
3054703203dSis 			return (B_TRUE);
3064703203dSis 		}
3074703203dSis 	}
3084703203dSis 
3094703203dSis 	return (B_FALSE);
3104703203dSis }
3114703203dSis 
3124703203dSis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3134703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3144703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
3154703203dSis {
3164703203dSis 	int inendian;
3174703203dSis 	int outendian;
3184703203dSis 	size_t u16l;
3194703203dSis 	size_t u32l;
3204703203dSis 	uint32_t hi;
3214703203dSis 	uint32_t lo;
3224703203dSis 	boolean_t do_not_ignore_null;
3234703203dSis 
3244703203dSis 	/*
3254703203dSis 	 * Do preliminary validity checks on parameters and collect info on
3264703203dSis 	 * endians.
3274703203dSis 	 */
3284703203dSis 	if (u16s == NULL || utf16len == NULL)
3294703203dSis 		return (EILSEQ);
3304703203dSis 
3314703203dSis 	if (u32s == NULL || utf32len == NULL)
3324703203dSis 		return (E2BIG);
3334703203dSis 
3344703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
3354703203dSis 		return (EBADF);
3364703203dSis 
3374703203dSis 	/*
3384703203dSis 	 * Initialize input and output parameter buffer indices and
3394703203dSis 	 * temporary variables.
3404703203dSis 	 */
3414703203dSis 	u16l = u32l = 0;
3424703203dSis 	hi = 0;
3434703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3444703203dSis 
3454703203dSis 	/*
3464703203dSis 	 * Check on the BOM at the beginning of the input buffer if required
3474703203dSis 	 * and if there is indeed one, process it.
3484703203dSis 	 */
3494703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
3504703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
3514703203dSis 		u16l++;
3524703203dSis 
3534703203dSis 	/*
3544703203dSis 	 * Reset inendian and outendian so that after this point, those can be
3554703203dSis 	 * used as condition values.
3564703203dSis 	 */
3574703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
3584703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
3594703203dSis 
3604703203dSis 	/*
3614703203dSis 	 * If there is something in the input buffer and if necessary and
3624703203dSis 	 * requested, save the BOM at the output buffer.
3634703203dSis 	 */
3644703203dSis 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3654703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3664703203dSis 		    UCONV_BOM_SWAPPED_32;
3674703203dSis 
3684703203dSis 	/*
3694703203dSis 	 * Do conversion; if encounter a surrogate pair, assemble high and
3704703203dSis 	 * low pair values to form a UTF-32 character. If a half of a pair
3714703203dSis 	 * exists alone, then, either it is an illegal (EILSEQ) or
3724703203dSis 	 * invalid (EINVAL) value.
3734703203dSis 	 */
3744703203dSis 	for (; u16l < *utf16len; u16l++) {
3754703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
3764703203dSis 			break;
3774703203dSis 
3784703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3794703203dSis 
3804703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3814703203dSis 			if (hi)
3824703203dSis 				return (EILSEQ);
3834703203dSis 			hi = lo;
3844703203dSis 			continue;
3854703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3864703203dSis 			if (! hi)
3874703203dSis 				return (EILSEQ);
3884703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3894703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3904703203dSis 			    + UCONV_U16_START;
3914703203dSis 			hi = 0;
3924703203dSis 		} else if (hi) {
3934703203dSis 			return (EILSEQ);
3944703203dSis 		}
3954703203dSis 
3964703203dSis 		if (u32l >= *utf32len)
3974703203dSis 			return (E2BIG);
3984703203dSis 
3994703203dSis 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4004703203dSis 	}
4014703203dSis 
4024703203dSis 	/*
4034703203dSis 	 * If high half didn't see low half, then, it's most likely the input
4044703203dSis 	 * parameter is incomplete.
4054703203dSis 	 */
4064703203dSis 	if (hi)
4074703203dSis 		return (EINVAL);
4084703203dSis 
4094703203dSis 	/*
4104703203dSis 	 * Save the number of consumed and saved characters. They do not
4114703203dSis 	 * include terminating NULL character (U+0000) at the end of
4124703203dSis 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4134703203dSis 	 * the input buffer length is big enough to include the terminating
4144703203dSis 	 * NULL character).
4154703203dSis 	 */
4164703203dSis 	*utf16len = u16l;
4174703203dSis 	*utf32len = u32l;
4184703203dSis 
4194703203dSis 	return (0);
4204703203dSis }
4214703203dSis 
4224703203dSis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4234703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4244703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
4254703203dSis {
4264703203dSis 	int inendian;
4274703203dSis 	int outendian;
4284703203dSis 	size_t u16l;
4294703203dSis 	size_t u8l;
4304703203dSis 	uint32_t hi;
4314703203dSis 	uint32_t lo;
4324703203dSis 	boolean_t do_not_ignore_null;
4334703203dSis 
4344703203dSis 	if (u16s == NULL || utf16len == NULL)
4354703203dSis 		return (EILSEQ);
4364703203dSis 
4374703203dSis 	if (u8s == NULL || utf8len == NULL)
4384703203dSis 		return (E2BIG);
4394703203dSis 
4404703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
4414703203dSis 		return (EBADF);
4424703203dSis 
4434703203dSis 	u16l = u8l = 0;
4444703203dSis 	hi = 0;
4454703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4464703203dSis 
4474703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
4484703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
4494703203dSis 		u16l++;
4504703203dSis 
4514703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
4524703203dSis 
4534703203dSis 	for (; u16l < *utf16len; u16l++) {
4544703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
4554703203dSis 			break;
4564703203dSis 
4574703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4584703203dSis 
4594703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4604703203dSis 			if (hi)
4614703203dSis 				return (EILSEQ);
4624703203dSis 			hi = lo;
4634703203dSis 			continue;
4644703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4654703203dSis 			if (! hi)
4664703203dSis 				return (EILSEQ);
4674703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4684703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4694703203dSis 			    + UCONV_U16_START;
4704703203dSis 			hi = 0;
4714703203dSis 		} else if (hi) {
4724703203dSis 			return (EILSEQ);
4734703203dSis 		}
4744703203dSis 
4754703203dSis 		/*
4764703203dSis 		 * Now we convert a UTF-32 character into a UTF-8 character.
4774703203dSis 		 * Unicode coding space is between U+0000 and U+10FFFF;
4784703203dSis 		 * anything bigger is an illegal character.
4794703203dSis 		 */
4804703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
4814703203dSis 			if (u8l >= *utf8len)
4824703203dSis 				return (E2BIG);
4834703203dSis 			u8s[u8l++] = (uchar_t)lo;
4844703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
4854703203dSis 			if ((u8l + 1) >= *utf8len)
4864703203dSis 				return (E2BIG);
4874703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4884703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
4894703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
4904703203dSis 			if ((u8l + 2) >= *utf8len)
4914703203dSis 				return (E2BIG);
4924703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4934703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4944703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
4954703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
4964703203dSis 			if ((u8l + 3) >= *utf8len)
4974703203dSis 				return (E2BIG);
4984703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
4994703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5004703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5014703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
5024703203dSis 		} else {
5034703203dSis 			return (EILSEQ);
5044703203dSis 		}
5054703203dSis 	}
5064703203dSis 
5074703203dSis 	if (hi)
5084703203dSis 		return (EINVAL);
5094703203dSis 
5104703203dSis 	*utf16len = u16l;
5114703203dSis 	*utf8len = u8l;
5124703203dSis 
5134703203dSis 	return (0);
5144703203dSis }
5154703203dSis 
5164703203dSis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5174703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5184703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
5194703203dSis {
5204703203dSis 	int inendian;
5214703203dSis 	int outendian;
5224703203dSis 	size_t u16l;
5234703203dSis 	size_t u32l;
5244703203dSis 	uint32_t hi;
5254703203dSis 	uint32_t lo;
5264703203dSis 	boolean_t do_not_ignore_null;
5274703203dSis 
5284703203dSis 	if (u32s == NULL || utf32len == NULL)
5294703203dSis 		return (EILSEQ);
5304703203dSis 
5314703203dSis 	if (u16s == NULL || utf16len == NULL)
5324703203dSis 		return (E2BIG);
5334703203dSis 
5344703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
5354703203dSis 		return (EBADF);
5364703203dSis 
5374703203dSis 	u16l = u32l = 0;
5384703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5394703203dSis 
5404703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
5414703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
5424703203dSis 		u32l++;
5434703203dSis 
5444703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
5454703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
5464703203dSis 
5474703203dSis 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5484703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5494703203dSis 		    UCONV_BOM_SWAPPED;
5504703203dSis 
5514703203dSis 	for (; u32l < *utf32len; u32l++) {
5524703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
5534703203dSis 			break;
5544703203dSis 
5554703203dSis 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5564703203dSis 
5574703203dSis 		/*
5584703203dSis 		 * Anything bigger than the Unicode coding space, i.e.,
5594703203dSis 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
5604703203dSis 		 * character.
5614703203dSis 		 */
5624703203dSis 		if (hi > UCONV_UNICODE_MAX)
5634703203dSis 			return (EILSEQ);
5644703203dSis 
5654703203dSis 		/*
5664703203dSis 		 * Anything bigger than U+FFFF must be converted into
5674703203dSis 		 * a surrogate pair in UTF-16.
5684703203dSis 		 */
5694703203dSis 		if (hi >= UCONV_U16_START) {
5704703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5714703203dSis 			    UCONV_U16_LO_MIN;
5724703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5734703203dSis 			    UCONV_U16_HI_MIN;
5744703203dSis 
5754703203dSis 			if ((u16l + 1) >= *utf16len)
5764703203dSis 				return (E2BIG);
5774703203dSis 
5784703203dSis 			if (outendian) {
5794703203dSis 				u16s[u16l++] = (uint16_t)hi;
5804703203dSis 				u16s[u16l++] = (uint16_t)lo;
5814703203dSis 			} else {
5824703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5834703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5844703203dSis 			}
5854703203dSis 		} else {
5864703203dSis 			if (u16l >= *utf16len)
5874703203dSis 				return (E2BIG);
5884703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
5894703203dSis 			    BSWAP_16(((uint16_t)hi));
5904703203dSis 		}
5914703203dSis 	}
5924703203dSis 
5934703203dSis 	*utf16len = u16l;
5944703203dSis 	*utf32len = u32l;
5954703203dSis 
5964703203dSis 	return (0);
5974703203dSis }
5984703203dSis 
5994703203dSis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6004703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6014703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
6024703203dSis {
6034703203dSis 	int inendian;
6044703203dSis 	int outendian;
6054703203dSis 	size_t u32l;
6064703203dSis 	size_t u8l;
6074703203dSis 	uint32_t lo;
6084703203dSis 	boolean_t do_not_ignore_null;
6094703203dSis 
6104703203dSis 	if (u32s == NULL || utf32len == NULL)
6114703203dSis 		return (EILSEQ);
6124703203dSis 
6134703203dSis 	if (u8s == NULL || utf8len == NULL)
6144703203dSis 		return (E2BIG);
6154703203dSis 
6164703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
6174703203dSis 		return (EBADF);
6184703203dSis 
6194703203dSis 	u32l = u8l = 0;
6204703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6214703203dSis 
6224703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
6234703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
6244703203dSis 		u32l++;
6254703203dSis 
6264703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
6274703203dSis 
6284703203dSis 	for (; u32l < *utf32len; u32l++) {
6294703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
6304703203dSis 			break;
6314703203dSis 
6324703203dSis 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6334703203dSis 
6344703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
6354703203dSis 			if (u8l >= *utf8len)
6364703203dSis 				return (E2BIG);
6374703203dSis 			u8s[u8l++] = (uchar_t)lo;
6384703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
6394703203dSis 			if ((u8l + 1) >= *utf8len)
6404703203dSis 				return (E2BIG);
6414703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6424703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
6434703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
6444703203dSis 			if ((u8l + 2) >= *utf8len)
6454703203dSis 				return (E2BIG);
6464703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6474703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6484703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
6494703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
6504703203dSis 			if ((u8l + 3) >= *utf8len)
6514703203dSis 				return (E2BIG);
6524703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6534703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6544703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6554703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
6564703203dSis 		} else {
6574703203dSis 			return (EILSEQ);
6584703203dSis 		}
6594703203dSis 	}
6604703203dSis 
6614703203dSis 	*utf32len = u32l;
6624703203dSis 	*utf8len = u8l;
6634703203dSis 
6644703203dSis 	return (0);
6654703203dSis }
6664703203dSis 
6674703203dSis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6684703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6694703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
6704703203dSis {
6714703203dSis 	int inendian;
6724703203dSis 	int outendian;
6734703203dSis 	size_t u16l;
6744703203dSis 	size_t u8l;
6754703203dSis 	uint32_t hi;
6764703203dSis 	uint32_t lo;
6774703203dSis 	int remaining_bytes;
6784703203dSis 	int first_b;
6794703203dSis 	boolean_t do_not_ignore_null;
6804703203dSis 
6814703203dSis 	if (u8s == NULL || utf8len == NULL)
6824703203dSis 		return (EILSEQ);
6834703203dSis 
6844703203dSis 	if (u16s == NULL || utf16len == NULL)
6854703203dSis 		return (E2BIG);
6864703203dSis 
6874703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
6884703203dSis 		return (EBADF);
6894703203dSis 
6904703203dSis 	u16l = u8l = 0;
6914703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6924703203dSis 
6934703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
6944703203dSis 
6954703203dSis 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6964703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6974703203dSis 		    UCONV_BOM_SWAPPED;
6984703203dSis 
6994703203dSis 	for (; u8l < *utf8len; ) {
7004703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
7014703203dSis 			break;
7024703203dSis 
7034703203dSis 		/*
7044703203dSis 		 * Collect a UTF-8 character and convert it to a UTF-32
7054703203dSis 		 * character. In doing so, we screen out illegally formed
7064703203dSis 		 * UTF-8 characters and treat such as illegal characters.
7074703203dSis 		 * The algorithm at below also screens out anything bigger
7084703203dSis 		 * than the U+10FFFF.
7094703203dSis 		 *
7104703203dSis 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7114703203dSis 		 * more details on the illegal values of UTF-8 character
7124703203dSis 		 * bytes.
7134703203dSis 		 */
7144703203dSis 		hi = (uint32_t)u8s[u8l++];
7154703203dSis 
7164703203dSis 		if (hi > UCONV_ASCII_MAX) {
7174703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7184703203dSis 				return (EILSEQ);
7194703203dSis 
7204703203dSis 			first_b = hi;
721*15d9d0b5Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
7224703203dSis 
7234703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
7244703203dSis 				/*
7254703203dSis 				 * If we have no more bytes, the current
7264703203dSis 				 * UTF-8 character is incomplete.
7274703203dSis 				 */
7284703203dSis 				if (u8l >= *utf8len)
7294703203dSis 					return (EINVAL);
7304703203dSis 
7314703203dSis 				lo = (uint32_t)u8s[u8l++];
7324703203dSis 
7334703203dSis 				if (first_b) {
7344703203dSis 					if (lo < valid_min_2nd_byte[first_b] ||
7354703203dSis 					    lo > valid_max_2nd_byte[first_b])
7364703203dSis 						return (EILSEQ);
7374703203dSis 					first_b = 0;
7384703203dSis 				} else if (lo < UCONV_U8_BYTE_MIN ||
7394703203dSis 				    lo > UCONV_U8_BYTE_MAX) {
7404703203dSis 					return (EILSEQ);
7414703203dSis 				}
7424703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
7434703203dSis 				    (lo & UCONV_U8_BIT_MASK);
7444703203dSis 			}
7454703203dSis 		}
7464703203dSis 
7474703203dSis 		if (hi >= UCONV_U16_START) {
7484703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7494703203dSis 			    UCONV_U16_LO_MIN;
7504703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7514703203dSis 			    UCONV_U16_HI_MIN;
7524703203dSis 
7534703203dSis 			if ((u16l + 1) >= *utf16len)
7544703203dSis 				return (E2BIG);
7554703203dSis 
7564703203dSis 			if (outendian) {
7574703203dSis 				u16s[u16l++] = (uint16_t)hi;
7584703203dSis 				u16s[u16l++] = (uint16_t)lo;
7594703203dSis 			} else {
7604703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7614703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7624703203dSis 			}
7634703203dSis 		} else {
7644703203dSis 			if (u16l >= *utf16len)
7654703203dSis 				return (E2BIG);
7664703203dSis 
7674703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
7684703203dSis 			    BSWAP_16(((uint16_t)hi));
7694703203dSis 		}
7704703203dSis 	}
7714703203dSis 
7724703203dSis 	*utf16len = u16l;
7734703203dSis 	*utf8len = u8l;
7744703203dSis 
7754703203dSis 	return (0);
7764703203dSis }
7774703203dSis 
7784703203dSis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7794703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7804703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
7814703203dSis {
7824703203dSis 	int inendian;
7834703203dSis 	int outendian;
7844703203dSis 	size_t u32l;
7854703203dSis 	size_t u8l;
7864703203dSis 	uint32_t hi;
7874703203dSis 	uint32_t c;
7884703203dSis 	int remaining_bytes;
7894703203dSis 	int first_b;
7904703203dSis 	boolean_t do_not_ignore_null;
7914703203dSis 
7924703203dSis 	if (u8s == NULL || utf8len == NULL)
7934703203dSis 		return (EILSEQ);
7944703203dSis 
7954703203dSis 	if (u32s == NULL || utf32len == NULL)
7964703203dSis 		return (E2BIG);
7974703203dSis 
7984703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
7994703203dSis 		return (EBADF);
8004703203dSis 
8014703203dSis 	u32l = u8l = 0;
8024703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8034703203dSis 
8044703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
8054703203dSis 
8064703203dSis 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8074703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8084703203dSis 		    UCONV_BOM_SWAPPED_32;
8094703203dSis 
8104703203dSis 	for (; u8l < *utf8len; ) {
8114703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
8124703203dSis 			break;
8134703203dSis 
8144703203dSis 		hi = (uint32_t)u8s[u8l++];
8154703203dSis 
8164703203dSis 		if (hi > UCONV_ASCII_MAX) {
8174703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8184703203dSis 				return (EILSEQ);
8194703203dSis 
8204703203dSis 			first_b = hi;
821*15d9d0b5Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
8224703203dSis 
8234703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
8244703203dSis 				if (u8l >= *utf8len)
8254703203dSis 					return (EINVAL);
8264703203dSis 
8274703203dSis 				c = (uint32_t)u8s[u8l++];
8284703203dSis 
8294703203dSis 				if (first_b) {
8304703203dSis 					if (c < valid_min_2nd_byte[first_b] ||
8314703203dSis 					    c > valid_max_2nd_byte[first_b])
8324703203dSis 						return (EILSEQ);
8334703203dSis 					first_b = 0;
8344703203dSis 				} else if (c < UCONV_U8_BYTE_MIN ||
8354703203dSis 				    c > UCONV_U8_BYTE_MAX) {
8364703203dSis 					return (EILSEQ);
8374703203dSis 				}
8384703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
8394703203dSis 				    (c & UCONV_U8_BIT_MASK);
8404703203dSis 			}
8414703203dSis 		}
8424703203dSis 
8434703203dSis 		if (u32l >= *utf32len)
8444703203dSis 			return (E2BIG);
8454703203dSis 
8464703203dSis 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8474703203dSis 	}
8484703203dSis 
8494703203dSis 	*utf32len = u32l;
8504703203dSis 	*utf8len = u8l;
8514703203dSis 
8524703203dSis 	return (0);
8534703203dSis }
854