xref: /titanic_44/usr/src/common/unicode/uconv.c (revision 15d9d0b528387242011cdcc6190c9e598cfe3a07)
14703203dSis /*
24703203dSis  * CDDL HEADER START
34703203dSis  *
44703203dSis  * The contents of this file are subject to the terms of the
54703203dSis  * Common Development and Distribution License (the "License").
64703203dSis  * You may not use this file except in compliance with the License.
74703203dSis  *
84703203dSis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
94703203dSis  * or http://www.opensolaris.org/os/licensing.
104703203dSis  * See the License for the specific language governing permissions
114703203dSis  * and limitations under the License.
124703203dSis  *
134703203dSis  * When distributing Covered Code, include this CDDL HEADER in each
144703203dSis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
154703203dSis  * If applicable, add the following below this CDDL HEADER, with the
164703203dSis  * fields enclosed by brackets "[]" replaced with your own identifying
174703203dSis  * information: Portions Copyright [yyyy] [name of copyright owner]
184703203dSis  *
194703203dSis  * CDDL HEADER END
204703203dSis  */
214703203dSis /*
22*15d9d0b5Syy154373  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
234703203dSis  * Use is subject to license terms.
244703203dSis  */
254703203dSis 
264703203dSis #pragma ident	"%Z%%M%	%I%	%E% SMI"
274703203dSis 
284703203dSis /*
294703203dSis  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
304703203dSis  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
314703203dSis  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
324703203dSis  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
334703203dSis  * the section 3C man pages.
344703203dSis  * Interface stability: Committed
354703203dSis  */
364703203dSis 
374703203dSis #include <sys/types.h>
384703203dSis #ifdef	_KERNEL
394703203dSis #include <sys/param.h>
404703203dSis #include <sys/sysmacros.h>
414703203dSis #include <sys/systm.h>
424703203dSis #include <sys/debug.h>
434703203dSis #include <sys/kmem.h>
444703203dSis #include <sys/sunddi.h>
454703203dSis #else
464703203dSis #include <sys/u8_textprep.h>
474703203dSis #endif	/* _KERNEL */
484703203dSis #include <sys/byteorder.h>
494703203dSis #include <sys/errno.h>
504703203dSis 
514703203dSis 
/*
 * UTF-16 surrogate handling.
 *
 * A high (leading) surrogate lies in [UCONV_U16_HI_MIN, UCONV_U16_HI_MAX]
 * and a low (trailing) surrogate in [UCONV_U16_LO_MIN, UCONV_U16_LO_MAX].
 * UCONV_U16_BIT_SHIFT (0x400, i.e., 1 << 10) is used as a multiplier or
 * a divisor when a surrogate pair is assembled or split, UCONV_U16_BIT_MASK
 * keeps the 20 payload bits of an assembled pair, and UCONV_U16_START is
 * the first value outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum values of the Unicode and the ASCII coding spaces. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* All input endian flag bits and all output endian flag bits. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Flag bits that stand for the native byte order of the machine we are
 * compiled for and for the reversed (byte swapped) order.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character as it reads in the native byte order
 * and as it reads when its bytes are swapped, the latter for both a 16-bit
 * and a 32-bit unit.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * The biggest UTF-32 values that fit in a UTF-8 character of one, two,
 * three, and four bytes, respectively.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/*
 * The common minimum and maximum values of the second and later bytes of
 * a multibyte UTF-8 character.
 */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
 * of the second and later bytes of a UTF-8 character: each such byte carries
 * six payload bits.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
1064703203dSis 
1074703203dSis /*
1084703203dSis  * The following vector shows remaining bytes in a UTF-8 character.
1094703203dSis  * Index will be the first byte of the character.
1104703203dSis  */
1114703203dSis static const uchar_t remaining_bytes_tbl[0x100] = {
1124703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1134703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1144703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1154703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1164703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1174703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1184703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1194703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1204703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1214703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1224703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1234703203dSis 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1244703203dSis 
1254703203dSis /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
1264703203dSis 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1274703203dSis 
1284703203dSis /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
1294703203dSis 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
1304703203dSis 
1314703203dSis /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
1324703203dSis 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
1334703203dSis 
1344703203dSis /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
1354703203dSis 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1364703203dSis };
1374703203dSis 
1384703203dSis /*
1394703203dSis  * The following is a vector of bit-masks to get used bits in
1404703203dSis  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
1414703203dSis  * the character.
1424703203dSis  */
143*15d9d0b5Syy154373 #ifdef	_KERNEL
144*15d9d0b5Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*15d9d0b5Syy154373 #else
146*15d9d0b5Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147*15d9d0b5Syy154373 #endif	/* _KERNEL */
1484703203dSis 
1494703203dSis /*
1504703203dSis  * The following two vectors are to provide valid minimum and
1514703203dSis  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1524703203dSis  * better illegal sequence checking. The index value must be the value of
1534703203dSis  * the first byte of the UTF-8 character.
1544703203dSis  */
1554703203dSis static const uchar_t valid_min_2nd_byte[0x100] = {
1564703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1574703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1584703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1594703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1604703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1614703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1624703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1634703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1644703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1654703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1664703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1674703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1684703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1694703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1704703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1714703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1724703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1734703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1744703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1754703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1764703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1774703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1784703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1794703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
1804703203dSis 
1814703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
1824703203dSis 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1834703203dSis 
1844703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
1854703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1864703203dSis 
1874703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
1884703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1894703203dSis 
1904703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
1914703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1924703203dSis 
1934703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
1944703203dSis 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1954703203dSis 
1964703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
1974703203dSis 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1984703203dSis 
1994703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
2004703203dSis 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
2014703203dSis 
2024703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
2034703203dSis };
2044703203dSis 
2054703203dSis static const uchar_t valid_max_2nd_byte[0x100] = {
2064703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2074703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2084703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2094703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2104703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2114703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2124703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2134703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2144703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2154703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2164703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2174703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2184703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2194703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2204703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2214703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2224703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2234703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2244703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2254703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2264703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2274703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2284703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2294703203dSis 	0,    0,    0,    0,    0,    0,    0,    0,
2304703203dSis 
2314703203dSis /*	C0    C1    C2    C3    C4    C5    C6    C7 */
2324703203dSis 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2334703203dSis 
2344703203dSis /*	C8    C9    CA    CB    CC    CD    CE    CF */
2354703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2364703203dSis 
2374703203dSis /*	D0    D1    D2    D3    D4    D5    D6    D7 */
2384703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2394703203dSis 
2404703203dSis /*	D8    D9    DA    DB    DC    DD    DE    DF */
2414703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2424703203dSis 
2434703203dSis /*	E0    E1    E2    E3    E4    E5    E6    E7 */
2444703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2454703203dSis 
2464703203dSis /*	E8    E9    EA    EB    EC    ED    EE    EF */
2474703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2484703203dSis 
2494703203dSis /*	F0    F1    F2    F3    F4    F5    F6    F7 */
2504703203dSis 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
2514703203dSis 
2524703203dSis 	0,    0,    0,    0,    0,    0,    0,    0
2534703203dSis };
2544703203dSis 
2554703203dSis 
2564703203dSis static int
check_endian(int flag,int * in,int * out)2574703203dSis check_endian(int flag, int *in, int *out)
2584703203dSis {
2594703203dSis 	*in = flag & UCONV_IN_ENDIAN_MASKS;
2604703203dSis 
2614703203dSis 	/* You cannot have both. */
2624703203dSis 	if (*in == UCONV_IN_ENDIAN_MASKS)
2634703203dSis 		return (EBADF);
2644703203dSis 
2654703203dSis 	if (*in == 0)
2664703203dSis 		*in = UCONV_IN_NAT_ENDIAN;
2674703203dSis 
2684703203dSis 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
2694703203dSis 
2704703203dSis 	/* You cannot have both. */
2714703203dSis 	if (*out == UCONV_OUT_ENDIAN_MASKS)
2724703203dSis 		return (EBADF);
2734703203dSis 
2744703203dSis 	if (*out == 0)
2754703203dSis 		*out = UCONV_OUT_NAT_ENDIAN;
2764703203dSis 
2774703203dSis 	return (0);
2784703203dSis }
2794703203dSis 
2804703203dSis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2814703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2824703203dSis {
2834703203dSis 	if (u16l > 0) {
2844703203dSis 		if (*u16s == UCONV_BOM_NORMAL) {
2854703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
2864703203dSis 			return (B_TRUE);
2874703203dSis 		}
2884703203dSis 		if (*u16s == UCONV_BOM_SWAPPED) {
2894703203dSis 			*in = UCONV_IN_REV_ENDIAN;
2904703203dSis 			return (B_TRUE);
2914703203dSis 		}
2924703203dSis 	}
2934703203dSis 
2944703203dSis 	return (B_FALSE);
2954703203dSis }
2964703203dSis 
2974703203dSis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2984703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2994703203dSis {
3004703203dSis 	if (u32l > 0) {
3014703203dSis 		if (*u32s == UCONV_BOM_NORMAL) {
3024703203dSis 			*in = UCONV_IN_NAT_ENDIAN;
3034703203dSis 			return (B_TRUE);
3044703203dSis 		}
3054703203dSis 		if (*u32s == UCONV_BOM_SWAPPED_32) {
3064703203dSis 			*in = UCONV_IN_REV_ENDIAN;
3074703203dSis 			return (B_TRUE);
3084703203dSis 		}
3094703203dSis 	}
3104703203dSis 
3114703203dSis 	return (B_FALSE);
3124703203dSis }
3134703203dSis 
3144703203dSis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3154703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3164703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
3174703203dSis {
3184703203dSis 	int inendian;
3194703203dSis 	int outendian;
3204703203dSis 	size_t u16l;
3214703203dSis 	size_t u32l;
3224703203dSis 	uint32_t hi;
3234703203dSis 	uint32_t lo;
3244703203dSis 	boolean_t do_not_ignore_null;
3254703203dSis 
3264703203dSis 	/*
3274703203dSis 	 * Do preliminary validity checks on parameters and collect info on
3284703203dSis 	 * endians.
3294703203dSis 	 */
3304703203dSis 	if (u16s == NULL || utf16len == NULL)
3314703203dSis 		return (EILSEQ);
3324703203dSis 
3334703203dSis 	if (u32s == NULL || utf32len == NULL)
3344703203dSis 		return (E2BIG);
3354703203dSis 
3364703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
3374703203dSis 		return (EBADF);
3384703203dSis 
3394703203dSis 	/*
3404703203dSis 	 * Initialize input and output parameter buffer indices and
3414703203dSis 	 * temporary variables.
3424703203dSis 	 */
3434703203dSis 	u16l = u32l = 0;
3444703203dSis 	hi = 0;
3454703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3464703203dSis 
3474703203dSis 	/*
3484703203dSis 	 * Check on the BOM at the beginning of the input buffer if required
3494703203dSis 	 * and if there is indeed one, process it.
3504703203dSis 	 */
3514703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
3524703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
3534703203dSis 		u16l++;
3544703203dSis 
3554703203dSis 	/*
3564703203dSis 	 * Reset inendian and outendian so that after this point, those can be
3574703203dSis 	 * used as condition values.
3584703203dSis 	 */
3594703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
3604703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
3614703203dSis 
3624703203dSis 	/*
3634703203dSis 	 * If there is something in the input buffer and if necessary and
3644703203dSis 	 * requested, save the BOM at the output buffer.
3654703203dSis 	 */
3664703203dSis 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3674703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3684703203dSis 		    UCONV_BOM_SWAPPED_32;
3694703203dSis 
3704703203dSis 	/*
3714703203dSis 	 * Do conversion; if encounter a surrogate pair, assemble high and
3724703203dSis 	 * low pair values to form a UTF-32 character. If a half of a pair
3734703203dSis 	 * exists alone, then, either it is an illegal (EILSEQ) or
3744703203dSis 	 * invalid (EINVAL) value.
3754703203dSis 	 */
3764703203dSis 	for (; u16l < *utf16len; u16l++) {
3774703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
3784703203dSis 			break;
3794703203dSis 
3804703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3814703203dSis 
3824703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3834703203dSis 			if (hi)
3844703203dSis 				return (EILSEQ);
3854703203dSis 			hi = lo;
3864703203dSis 			continue;
3874703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3884703203dSis 			if (! hi)
3894703203dSis 				return (EILSEQ);
3904703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3914703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3924703203dSis 			    + UCONV_U16_START;
3934703203dSis 			hi = 0;
3944703203dSis 		} else if (hi) {
3954703203dSis 			return (EILSEQ);
3964703203dSis 		}
3974703203dSis 
3984703203dSis 		if (u32l >= *utf32len)
3994703203dSis 			return (E2BIG);
4004703203dSis 
4014703203dSis 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4024703203dSis 	}
4034703203dSis 
4044703203dSis 	/*
4054703203dSis 	 * If high half didn't see low half, then, it's most likely the input
4064703203dSis 	 * parameter is incomplete.
4074703203dSis 	 */
4084703203dSis 	if (hi)
4094703203dSis 		return (EINVAL);
4104703203dSis 
4114703203dSis 	/*
4124703203dSis 	 * Save the number of consumed and saved characters. They do not
4134703203dSis 	 * include terminating NULL character (U+0000) at the end of
4144703203dSis 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4154703203dSis 	 * the input buffer length is big enough to include the terminating
4164703203dSis 	 * NULL character).
4174703203dSis 	 */
4184703203dSis 	*utf16len = u16l;
4194703203dSis 	*utf32len = u32l;
4204703203dSis 
4214703203dSis 	return (0);
4224703203dSis }
4234703203dSis 
4244703203dSis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4254703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4264703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
4274703203dSis {
4284703203dSis 	int inendian;
4294703203dSis 	int outendian;
4304703203dSis 	size_t u16l;
4314703203dSis 	size_t u8l;
4324703203dSis 	uint32_t hi;
4334703203dSis 	uint32_t lo;
4344703203dSis 	boolean_t do_not_ignore_null;
4354703203dSis 
4364703203dSis 	if (u16s == NULL || utf16len == NULL)
4374703203dSis 		return (EILSEQ);
4384703203dSis 
4394703203dSis 	if (u8s == NULL || utf8len == NULL)
4404703203dSis 		return (E2BIG);
4414703203dSis 
4424703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
4434703203dSis 		return (EBADF);
4444703203dSis 
4454703203dSis 	u16l = u8l = 0;
4464703203dSis 	hi = 0;
4474703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4484703203dSis 
4494703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
4504703203dSis 	    check_bom16(u16s, *utf16len, &inendian))
4514703203dSis 		u16l++;
4524703203dSis 
4534703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
4544703203dSis 
4554703203dSis 	for (; u16l < *utf16len; u16l++) {
4564703203dSis 		if (u16s[u16l] == 0 && do_not_ignore_null)
4574703203dSis 			break;
4584703203dSis 
4594703203dSis 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4604703203dSis 
4614703203dSis 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4624703203dSis 			if (hi)
4634703203dSis 				return (EILSEQ);
4644703203dSis 			hi = lo;
4654703203dSis 			continue;
4664703203dSis 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4674703203dSis 			if (! hi)
4684703203dSis 				return (EILSEQ);
4694703203dSis 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4704703203dSis 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4714703203dSis 			    + UCONV_U16_START;
4724703203dSis 			hi = 0;
4734703203dSis 		} else if (hi) {
4744703203dSis 			return (EILSEQ);
4754703203dSis 		}
4764703203dSis 
4774703203dSis 		/*
4784703203dSis 		 * Now we convert a UTF-32 character into a UTF-8 character.
4794703203dSis 		 * Unicode coding space is between U+0000 and U+10FFFF;
4804703203dSis 		 * anything bigger is an illegal character.
4814703203dSis 		 */
4824703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
4834703203dSis 			if (u8l >= *utf8len)
4844703203dSis 				return (E2BIG);
4854703203dSis 			u8s[u8l++] = (uchar_t)lo;
4864703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
4874703203dSis 			if ((u8l + 1) >= *utf8len)
4884703203dSis 				return (E2BIG);
4894703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4904703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
4914703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
4924703203dSis 			if ((u8l + 2) >= *utf8len)
4934703203dSis 				return (E2BIG);
4944703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4954703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4964703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
4974703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
4984703203dSis 			if ((u8l + 3) >= *utf8len)
4994703203dSis 				return (E2BIG);
5004703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
5014703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5024703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5034703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
5044703203dSis 		} else {
5054703203dSis 			return (EILSEQ);
5064703203dSis 		}
5074703203dSis 	}
5084703203dSis 
5094703203dSis 	if (hi)
5104703203dSis 		return (EINVAL);
5114703203dSis 
5124703203dSis 	*utf16len = u16l;
5134703203dSis 	*utf8len = u8l;
5144703203dSis 
5154703203dSis 	return (0);
5164703203dSis }
5174703203dSis 
5184703203dSis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5194703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5204703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
5214703203dSis {
5224703203dSis 	int inendian;
5234703203dSis 	int outendian;
5244703203dSis 	size_t u16l;
5254703203dSis 	size_t u32l;
5264703203dSis 	uint32_t hi;
5274703203dSis 	uint32_t lo;
5284703203dSis 	boolean_t do_not_ignore_null;
5294703203dSis 
5304703203dSis 	if (u32s == NULL || utf32len == NULL)
5314703203dSis 		return (EILSEQ);
5324703203dSis 
5334703203dSis 	if (u16s == NULL || utf16len == NULL)
5344703203dSis 		return (E2BIG);
5354703203dSis 
5364703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
5374703203dSis 		return (EBADF);
5384703203dSis 
5394703203dSis 	u16l = u32l = 0;
5404703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5414703203dSis 
5424703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
5434703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
5444703203dSis 		u32l++;
5454703203dSis 
5464703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
5474703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
5484703203dSis 
5494703203dSis 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5504703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5514703203dSis 		    UCONV_BOM_SWAPPED;
5524703203dSis 
5534703203dSis 	for (; u32l < *utf32len; u32l++) {
5544703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
5554703203dSis 			break;
5564703203dSis 
5574703203dSis 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5584703203dSis 
5594703203dSis 		/*
5604703203dSis 		 * Anything bigger than the Unicode coding space, i.e.,
5614703203dSis 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
5624703203dSis 		 * character.
5634703203dSis 		 */
5644703203dSis 		if (hi > UCONV_UNICODE_MAX)
5654703203dSis 			return (EILSEQ);
5664703203dSis 
5674703203dSis 		/*
5684703203dSis 		 * Anything bigger than U+FFFF must be converted into
5694703203dSis 		 * a surrogate pair in UTF-16.
5704703203dSis 		 */
5714703203dSis 		if (hi >= UCONV_U16_START) {
5724703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5734703203dSis 			    UCONV_U16_LO_MIN;
5744703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5754703203dSis 			    UCONV_U16_HI_MIN;
5764703203dSis 
5774703203dSis 			if ((u16l + 1) >= *utf16len)
5784703203dSis 				return (E2BIG);
5794703203dSis 
5804703203dSis 			if (outendian) {
5814703203dSis 				u16s[u16l++] = (uint16_t)hi;
5824703203dSis 				u16s[u16l++] = (uint16_t)lo;
5834703203dSis 			} else {
5844703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5854703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5864703203dSis 			}
5874703203dSis 		} else {
5884703203dSis 			if (u16l >= *utf16len)
5894703203dSis 				return (E2BIG);
5904703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
5914703203dSis 			    BSWAP_16(((uint16_t)hi));
5924703203dSis 		}
5934703203dSis 	}
5944703203dSis 
5954703203dSis 	*utf16len = u16l;
5964703203dSis 	*utf32len = u32l;
5974703203dSis 
5984703203dSis 	return (0);
5994703203dSis }
6004703203dSis 
6014703203dSis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6024703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6034703203dSis     uchar_t *u8s, size_t *utf8len, int flag)
6044703203dSis {
6054703203dSis 	int inendian;
6064703203dSis 	int outendian;
6074703203dSis 	size_t u32l;
6084703203dSis 	size_t u8l;
6094703203dSis 	uint32_t lo;
6104703203dSis 	boolean_t do_not_ignore_null;
6114703203dSis 
6124703203dSis 	if (u32s == NULL || utf32len == NULL)
6134703203dSis 		return (EILSEQ);
6144703203dSis 
6154703203dSis 	if (u8s == NULL || utf8len == NULL)
6164703203dSis 		return (E2BIG);
6174703203dSis 
6184703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
6194703203dSis 		return (EBADF);
6204703203dSis 
6214703203dSis 	u32l = u8l = 0;
6224703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6234703203dSis 
6244703203dSis 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
6254703203dSis 	    check_bom32(u32s, *utf32len, &inendian))
6264703203dSis 		u32l++;
6274703203dSis 
6284703203dSis 	inendian &= UCONV_IN_NAT_ENDIAN;
6294703203dSis 
6304703203dSis 	for (; u32l < *utf32len; u32l++) {
6314703203dSis 		if (u32s[u32l] == 0 && do_not_ignore_null)
6324703203dSis 			break;
6334703203dSis 
6344703203dSis 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6354703203dSis 
6364703203dSis 		if (lo <= UCONV_U8_ONE_BYTE) {
6374703203dSis 			if (u8l >= *utf8len)
6384703203dSis 				return (E2BIG);
6394703203dSis 			u8s[u8l++] = (uchar_t)lo;
6404703203dSis 		} else if (lo <= UCONV_U8_TWO_BYTES) {
6414703203dSis 			if ((u8l + 1) >= *utf8len)
6424703203dSis 				return (E2BIG);
6434703203dSis 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6444703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
6454703203dSis 		} else if (lo <= UCONV_U8_THREE_BYTES) {
6464703203dSis 			if ((u8l + 2) >= *utf8len)
6474703203dSis 				return (E2BIG);
6484703203dSis 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6494703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6504703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
6514703203dSis 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
6524703203dSis 			if ((u8l + 3) >= *utf8len)
6534703203dSis 				return (E2BIG);
6544703203dSis 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6554703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6564703203dSis 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6574703203dSis 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
6584703203dSis 		} else {
6594703203dSis 			return (EILSEQ);
6604703203dSis 		}
6614703203dSis 	}
6624703203dSis 
6634703203dSis 	*utf32len = u32l;
6644703203dSis 	*utf8len = u8l;
6654703203dSis 
6664703203dSis 	return (0);
6674703203dSis }
6684703203dSis 
6694703203dSis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6704703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6714703203dSis     uint16_t *u16s, size_t *utf16len, int flag)
6724703203dSis {
6734703203dSis 	int inendian;
6744703203dSis 	int outendian;
6754703203dSis 	size_t u16l;
6764703203dSis 	size_t u8l;
6774703203dSis 	uint32_t hi;
6784703203dSis 	uint32_t lo;
6794703203dSis 	int remaining_bytes;
6804703203dSis 	int first_b;
6814703203dSis 	boolean_t do_not_ignore_null;
6824703203dSis 
6834703203dSis 	if (u8s == NULL || utf8len == NULL)
6844703203dSis 		return (EILSEQ);
6854703203dSis 
6864703203dSis 	if (u16s == NULL || utf16len == NULL)
6874703203dSis 		return (E2BIG);
6884703203dSis 
6894703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
6904703203dSis 		return (EBADF);
6914703203dSis 
6924703203dSis 	u16l = u8l = 0;
6934703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6944703203dSis 
6954703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
6964703203dSis 
6974703203dSis 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6984703203dSis 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6994703203dSis 		    UCONV_BOM_SWAPPED;
7004703203dSis 
7014703203dSis 	for (; u8l < *utf8len; ) {
7024703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
7034703203dSis 			break;
7044703203dSis 
7054703203dSis 		/*
7064703203dSis 		 * Collect a UTF-8 character and convert it to a UTF-32
7074703203dSis 		 * character. In doing so, we screen out illegally formed
7084703203dSis 		 * UTF-8 characters and treat such as illegal characters.
7094703203dSis 		 * The algorithm at below also screens out anything bigger
7104703203dSis 		 * than the U+10FFFF.
7114703203dSis 		 *
7124703203dSis 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7134703203dSis 		 * more details on the illegal values of UTF-8 character
7144703203dSis 		 * bytes.
7154703203dSis 		 */
7164703203dSis 		hi = (uint32_t)u8s[u8l++];
7174703203dSis 
7184703203dSis 		if (hi > UCONV_ASCII_MAX) {
7194703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7204703203dSis 				return (EILSEQ);
7214703203dSis 
7224703203dSis 			first_b = hi;
723*15d9d0b5Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
7244703203dSis 
7254703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
7264703203dSis 				/*
7274703203dSis 				 * If we have no more bytes, the current
7284703203dSis 				 * UTF-8 character is incomplete.
7294703203dSis 				 */
7304703203dSis 				if (u8l >= *utf8len)
7314703203dSis 					return (EINVAL);
7324703203dSis 
7334703203dSis 				lo = (uint32_t)u8s[u8l++];
7344703203dSis 
7354703203dSis 				if (first_b) {
7364703203dSis 					if (lo < valid_min_2nd_byte[first_b] ||
7374703203dSis 					    lo > valid_max_2nd_byte[first_b])
7384703203dSis 						return (EILSEQ);
7394703203dSis 					first_b = 0;
7404703203dSis 				} else if (lo < UCONV_U8_BYTE_MIN ||
7414703203dSis 				    lo > UCONV_U8_BYTE_MAX) {
7424703203dSis 					return (EILSEQ);
7434703203dSis 				}
7444703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
7454703203dSis 				    (lo & UCONV_U8_BIT_MASK);
7464703203dSis 			}
7474703203dSis 		}
7484703203dSis 
7494703203dSis 		if (hi >= UCONV_U16_START) {
7504703203dSis 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7514703203dSis 			    UCONV_U16_LO_MIN;
7524703203dSis 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7534703203dSis 			    UCONV_U16_HI_MIN;
7544703203dSis 
7554703203dSis 			if ((u16l + 1) >= *utf16len)
7564703203dSis 				return (E2BIG);
7574703203dSis 
7584703203dSis 			if (outendian) {
7594703203dSis 				u16s[u16l++] = (uint16_t)hi;
7604703203dSis 				u16s[u16l++] = (uint16_t)lo;
7614703203dSis 			} else {
7624703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7634703203dSis 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7644703203dSis 			}
7654703203dSis 		} else {
7664703203dSis 			if (u16l >= *utf16len)
7674703203dSis 				return (E2BIG);
7684703203dSis 
7694703203dSis 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
7704703203dSis 			    BSWAP_16(((uint16_t)hi));
7714703203dSis 		}
7724703203dSis 	}
7734703203dSis 
7744703203dSis 	*utf16len = u16l;
7754703203dSis 	*utf8len = u8l;
7764703203dSis 
7774703203dSis 	return (0);
7784703203dSis }
7794703203dSis 
7804703203dSis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7814703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7824703203dSis     uint32_t *u32s, size_t *utf32len, int flag)
7834703203dSis {
7844703203dSis 	int inendian;
7854703203dSis 	int outendian;
7864703203dSis 	size_t u32l;
7874703203dSis 	size_t u8l;
7884703203dSis 	uint32_t hi;
7894703203dSis 	uint32_t c;
7904703203dSis 	int remaining_bytes;
7914703203dSis 	int first_b;
7924703203dSis 	boolean_t do_not_ignore_null;
7934703203dSis 
7944703203dSis 	if (u8s == NULL || utf8len == NULL)
7954703203dSis 		return (EILSEQ);
7964703203dSis 
7974703203dSis 	if (u32s == NULL || utf32len == NULL)
7984703203dSis 		return (E2BIG);
7994703203dSis 
8004703203dSis 	if (check_endian(flag, &inendian, &outendian) != 0)
8014703203dSis 		return (EBADF);
8024703203dSis 
8034703203dSis 	u32l = u8l = 0;
8044703203dSis 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8054703203dSis 
8064703203dSis 	outendian &= UCONV_OUT_NAT_ENDIAN;
8074703203dSis 
8084703203dSis 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8094703203dSis 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8104703203dSis 		    UCONV_BOM_SWAPPED_32;
8114703203dSis 
8124703203dSis 	for (; u8l < *utf8len; ) {
8134703203dSis 		if (u8s[u8l] == 0 && do_not_ignore_null)
8144703203dSis 			break;
8154703203dSis 
8164703203dSis 		hi = (uint32_t)u8s[u8l++];
8174703203dSis 
8184703203dSis 		if (hi > UCONV_ASCII_MAX) {
8194703203dSis 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8204703203dSis 				return (EILSEQ);
8214703203dSis 
8224703203dSis 			first_b = hi;
823*15d9d0b5Syy154373 			hi = hi & u8_masks_tbl[remaining_bytes];
8244703203dSis 
8254703203dSis 			for (; remaining_bytes > 0; remaining_bytes--) {
8264703203dSis 				if (u8l >= *utf8len)
8274703203dSis 					return (EINVAL);
8284703203dSis 
8294703203dSis 				c = (uint32_t)u8s[u8l++];
8304703203dSis 
8314703203dSis 				if (first_b) {
8324703203dSis 					if (c < valid_min_2nd_byte[first_b] ||
8334703203dSis 					    c > valid_max_2nd_byte[first_b])
8344703203dSis 						return (EILSEQ);
8354703203dSis 					first_b = 0;
8364703203dSis 				} else if (c < UCONV_U8_BYTE_MIN ||
8374703203dSis 				    c > UCONV_U8_BYTE_MAX) {
8384703203dSis 					return (EILSEQ);
8394703203dSis 				}
8404703203dSis 				hi = (hi << UCONV_U8_BIT_SHIFT) |
8414703203dSis 				    (c & UCONV_U8_BIT_MASK);
8424703203dSis 			}
8434703203dSis 		}
8444703203dSis 
8454703203dSis 		if (u32l >= *utf32len)
8464703203dSis 			return (E2BIG);
8474703203dSis 
8484703203dSis 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8494703203dSis 	}
8504703203dSis 
8514703203dSis 	*utf32len = u32l;
8524703203dSis 	*utf8len = u8l;
8534703203dSis 
8544703203dSis 	return (0);
8554703203dSis }
856