/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
254703203dSis
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
344703203dSis
354703203dSis #include <sys/types.h>
364703203dSis #ifdef _KERNEL
374703203dSis #include <sys/param.h>
384703203dSis #include <sys/sysmacros.h>
394703203dSis #include <sys/systm.h>
404703203dSis #include <sys/debug.h>
414703203dSis #include <sys/kmem.h>
424703203dSis #include <sys/sunddi.h>
434703203dSis #else
444703203dSis #include <sys/u8_textprep.h>
454703203dSis #endif /* _KERNEL */
464703203dSis #include <sys/byteorder.h>
474703203dSis #include <sys/errno.h>
484703203dSis
494703203dSis
504703203dSis /*
514703203dSis * The max and min values of high and low surrogate pairs of UTF-16,
524703203dSis * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
534703203dSis */
544703203dSis #define UCONV_U16_HI_MIN (0xd800U)
554703203dSis #define UCONV_U16_HI_MAX (0xdbffU)
564703203dSis #define UCONV_U16_LO_MIN (0xdc00U)
574703203dSis #define UCONV_U16_LO_MAX (0xdfffU)
584703203dSis #define UCONV_U16_BIT_SHIFT (0x0400U)
594703203dSis #define UCONV_U16_BIT_MASK (0x0fffffU)
604703203dSis #define UCONV_U16_START (0x010000U)
614703203dSis
624703203dSis /* The maximum value of Unicode coding space and ASCII coding space. */
634703203dSis #define UCONV_UNICODE_MAX (0x10ffffU)
644703203dSis #define UCONV_ASCII_MAX (0x7fU)
654703203dSis
664703203dSis /* The mask values for input and output endians. */
674703203dSis #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
684703203dSis #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
694703203dSis
704703203dSis /* Native and reversed endian macros. */
714703203dSis #ifdef _BIG_ENDIAN
724703203dSis #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
734703203dSis #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
744703203dSis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
754703203dSis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
764703203dSis #else
774703203dSis #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
784703203dSis #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
794703203dSis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
804703203dSis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
814703203dSis #endif /* _BIG_ENDIAN */
824703203dSis
834703203dSis /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
844703203dSis #define UCONV_BOM_NORMAL (0xfeffU)
854703203dSis #define UCONV_BOM_SWAPPED (0xfffeU)
864703203dSis #define UCONV_BOM_SWAPPED_32 (0xfffe0000U)
874703203dSis
884703203dSis /* UTF-32 boundaries based on UTF-8 character byte lengths. */
894703203dSis #define UCONV_U8_ONE_BYTE (0x7fU)
904703203dSis #define UCONV_U8_TWO_BYTES (0x7ffU)
914703203dSis #define UCONV_U8_THREE_BYTES (0xffffU)
924703203dSis #define UCONV_U8_FOUR_BYTES (0x10ffffU)
934703203dSis
944703203dSis /* The common minimum and maximum values at the UTF-8 character bytes. */
954703203dSis #define UCONV_U8_BYTE_MIN (0x80U)
964703203dSis #define UCONV_U8_BYTE_MAX (0xbfU)
974703203dSis
984703203dSis /*
994703203dSis * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
1004703203dSis * UTF-8 character bytes.
1014703203dSis */
1024703203dSis #define UCONV_U8_BIT_SHIFT 6
1034703203dSis #define UCONV_U8_BIT_MASK 0x3f
1044703203dSis
1054703203dSis /*
1064703203dSis * The following vector shows remaining bytes in a UTF-8 character.
1074703203dSis * Index will be the first byte of the character.
1084703203dSis */
1094703203dSis static const uchar_t remaining_bytes_tbl[0x100] = {
1104703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1114703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1124703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1134703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1144703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1154703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1164703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1174703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1184703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1194703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1204703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1214703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1224703203dSis
1234703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
1244703203dSis 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1254703203dSis
1264703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
1274703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1284703203dSis
1294703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
1304703203dSis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1314703203dSis
1324703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
1334703203dSis 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1344703203dSis };
1354703203dSis
1364703203dSis /*
1374703203dSis * The following is a vector of bit-masks to get used bits in
1384703203dSis * the first byte of a UTF-8 character. Index is remaining bytes at above of
1394703203dSis * the character.
1404703203dSis */
141*15d9d0b5Syy154373 #ifdef _KERNEL
142*15d9d0b5Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143*15d9d0b5Syy154373 #else
144*15d9d0b5Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*15d9d0b5Syy154373 #endif /* _KERNEL */
1464703203dSis
1474703203dSis /*
1484703203dSis * The following two vectors are to provide valid minimum and
1494703203dSis * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1504703203dSis * better illegal sequence checking. The index value must be the value of
1514703203dSis * the first byte of the UTF-8 character.
1524703203dSis */
1534703203dSis static const uchar_t valid_min_2nd_byte[0x100] = {
1544703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1554703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1564703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1574703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1584703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1594703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1604703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1614703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1624703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1634703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1644703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1654703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1664703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1674703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1684703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1694703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1704703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1714703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1724703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1734703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1744703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1754703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1764703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1774703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1784703203dSis
1794703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */
1804703203dSis 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1814703203dSis
1824703203dSis /* C8 C9 CA CB CC CD CE CF */
1834703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1844703203dSis
1854703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */
1864703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1874703203dSis
1884703203dSis /* D8 D9 DA DB DC DD DE DF */
1894703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1904703203dSis
1914703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */
1924703203dSis 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1934703203dSis
1944703203dSis /* E8 E9 EA EB EC ED EE EF */
1954703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1964703203dSis
1974703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */
1984703203dSis 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
1994703203dSis
2004703203dSis 0, 0, 0, 0, 0, 0, 0, 0
2014703203dSis };
2024703203dSis
2034703203dSis static const uchar_t valid_max_2nd_byte[0x100] = {
2044703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2054703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2064703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2074703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2084703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2094703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2104703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2114703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2124703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2134703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2144703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2154703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2164703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2174703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2184703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2194703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2204703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2214703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2224703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2234703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2244703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2254703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2264703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2274703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2284703203dSis
2294703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */
2304703203dSis 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2314703203dSis
2324703203dSis /* C8 C9 CA CB CC CD CE CF */
2334703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2344703203dSis
2354703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */
2364703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2374703203dSis
2384703203dSis /* D8 D9 DA DB DC DD DE DF */
2394703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2404703203dSis
2414703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */
2424703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2434703203dSis
2444703203dSis /* E8 E9 EA EB EC ED EE EF */
2454703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2464703203dSis
2474703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */
2484703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
2494703203dSis
2504703203dSis 0, 0, 0, 0, 0, 0, 0, 0
2514703203dSis };
2524703203dSis
2534703203dSis
2544703203dSis static int
check_endian(int flag,int * in,int * out)2554703203dSis check_endian(int flag, int *in, int *out)
2564703203dSis {
2574703203dSis *in = flag & UCONV_IN_ENDIAN_MASKS;
2584703203dSis
2594703203dSis /* You cannot have both. */
2604703203dSis if (*in == UCONV_IN_ENDIAN_MASKS)
2614703203dSis return (EBADF);
2624703203dSis
2634703203dSis if (*in == 0)
2644703203dSis *in = UCONV_IN_NAT_ENDIAN;
2654703203dSis
2664703203dSis *out = flag & UCONV_OUT_ENDIAN_MASKS;
2674703203dSis
2684703203dSis /* You cannot have both. */
2694703203dSis if (*out == UCONV_OUT_ENDIAN_MASKS)
2704703203dSis return (EBADF);
2714703203dSis
2724703203dSis if (*out == 0)
2734703203dSis *out = UCONV_OUT_NAT_ENDIAN;
2744703203dSis
2754703203dSis return (0);
2764703203dSis }
2774703203dSis
2784703203dSis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2794703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2804703203dSis {
2814703203dSis if (u16l > 0) {
2824703203dSis if (*u16s == UCONV_BOM_NORMAL) {
2834703203dSis *in = UCONV_IN_NAT_ENDIAN;
2844703203dSis return (B_TRUE);
2854703203dSis }
2864703203dSis if (*u16s == UCONV_BOM_SWAPPED) {
2874703203dSis *in = UCONV_IN_REV_ENDIAN;
2884703203dSis return (B_TRUE);
2894703203dSis }
2904703203dSis }
2914703203dSis
2924703203dSis return (B_FALSE);
2934703203dSis }
2944703203dSis
2954703203dSis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2964703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2974703203dSis {
2984703203dSis if (u32l > 0) {
2994703203dSis if (*u32s == UCONV_BOM_NORMAL) {
3004703203dSis *in = UCONV_IN_NAT_ENDIAN;
3014703203dSis return (B_TRUE);
3024703203dSis }
3034703203dSis if (*u32s == UCONV_BOM_SWAPPED_32) {
3044703203dSis *in = UCONV_IN_REV_ENDIAN;
3054703203dSis return (B_TRUE);
3064703203dSis }
3074703203dSis }
3084703203dSis
3094703203dSis return (B_FALSE);
3104703203dSis }
3114703203dSis
3124703203dSis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3134703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3144703203dSis uint32_t *u32s, size_t *utf32len, int flag)
3154703203dSis {
3164703203dSis int inendian;
3174703203dSis int outendian;
3184703203dSis size_t u16l;
3194703203dSis size_t u32l;
3204703203dSis uint32_t hi;
3214703203dSis uint32_t lo;
3224703203dSis boolean_t do_not_ignore_null;
3234703203dSis
3244703203dSis /*
3254703203dSis * Do preliminary validity checks on parameters and collect info on
3264703203dSis * endians.
3274703203dSis */
3284703203dSis if (u16s == NULL || utf16len == NULL)
3294703203dSis return (EILSEQ);
3304703203dSis
3314703203dSis if (u32s == NULL || utf32len == NULL)
3324703203dSis return (E2BIG);
3334703203dSis
3344703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
3354703203dSis return (EBADF);
3364703203dSis
3374703203dSis /*
3384703203dSis * Initialize input and output parameter buffer indices and
3394703203dSis * temporary variables.
3404703203dSis */
3414703203dSis u16l = u32l = 0;
3424703203dSis hi = 0;
3434703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3444703203dSis
3454703203dSis /*
3464703203dSis * Check on the BOM at the beginning of the input buffer if required
3474703203dSis * and if there is indeed one, process it.
3484703203dSis */
3494703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
3504703203dSis check_bom16(u16s, *utf16len, &inendian))
3514703203dSis u16l++;
3524703203dSis
3534703203dSis /*
3544703203dSis * Reset inendian and outendian so that after this point, those can be
3554703203dSis * used as condition values.
3564703203dSis */
3574703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
3584703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
3594703203dSis
3604703203dSis /*
3614703203dSis * If there is something in the input buffer and if necessary and
3624703203dSis * requested, save the BOM at the output buffer.
3634703203dSis */
3644703203dSis if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3654703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3664703203dSis UCONV_BOM_SWAPPED_32;
3674703203dSis
3684703203dSis /*
3694703203dSis * Do conversion; if encounter a surrogate pair, assemble high and
3704703203dSis * low pair values to form a UTF-32 character. If a half of a pair
3714703203dSis * exists alone, then, either it is an illegal (EILSEQ) or
3724703203dSis * invalid (EINVAL) value.
3734703203dSis */
3744703203dSis for (; u16l < *utf16len; u16l++) {
3754703203dSis if (u16s[u16l] == 0 && do_not_ignore_null)
3764703203dSis break;
3774703203dSis
3784703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3794703203dSis
3804703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3814703203dSis if (hi)
3824703203dSis return (EILSEQ);
3834703203dSis hi = lo;
3844703203dSis continue;
3854703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3864703203dSis if (! hi)
3874703203dSis return (EILSEQ);
3884703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3894703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3904703203dSis + UCONV_U16_START;
3914703203dSis hi = 0;
3924703203dSis } else if (hi) {
3934703203dSis return (EILSEQ);
3944703203dSis }
3954703203dSis
3964703203dSis if (u32l >= *utf32len)
3974703203dSis return (E2BIG);
3984703203dSis
3994703203dSis u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4004703203dSis }
4014703203dSis
4024703203dSis /*
4034703203dSis * If high half didn't see low half, then, it's most likely the input
4044703203dSis * parameter is incomplete.
4054703203dSis */
4064703203dSis if (hi)
4074703203dSis return (EINVAL);
4084703203dSis
4094703203dSis /*
4104703203dSis * Save the number of consumed and saved characters. They do not
4114703203dSis * include terminating NULL character (U+0000) at the end of
4124703203dSis * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4134703203dSis * the input buffer length is big enough to include the terminating
4144703203dSis * NULL character).
4154703203dSis */
4164703203dSis *utf16len = u16l;
4174703203dSis *utf32len = u32l;
4184703203dSis
4194703203dSis return (0);
4204703203dSis }
4214703203dSis
4224703203dSis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4234703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4244703203dSis uchar_t *u8s, size_t *utf8len, int flag)
4254703203dSis {
4264703203dSis int inendian;
4274703203dSis int outendian;
4284703203dSis size_t u16l;
4294703203dSis size_t u8l;
4304703203dSis uint32_t hi;
4314703203dSis uint32_t lo;
4324703203dSis boolean_t do_not_ignore_null;
4334703203dSis
4344703203dSis if (u16s == NULL || utf16len == NULL)
4354703203dSis return (EILSEQ);
4364703203dSis
4374703203dSis if (u8s == NULL || utf8len == NULL)
4384703203dSis return (E2BIG);
4394703203dSis
4404703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
4414703203dSis return (EBADF);
4424703203dSis
4434703203dSis u16l = u8l = 0;
4444703203dSis hi = 0;
4454703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4464703203dSis
4474703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
4484703203dSis check_bom16(u16s, *utf16len, &inendian))
4494703203dSis u16l++;
4504703203dSis
4514703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
4524703203dSis
4534703203dSis for (; u16l < *utf16len; u16l++) {
4544703203dSis if (u16s[u16l] == 0 && do_not_ignore_null)
4554703203dSis break;
4564703203dSis
4574703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4584703203dSis
4594703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4604703203dSis if (hi)
4614703203dSis return (EILSEQ);
4624703203dSis hi = lo;
4634703203dSis continue;
4644703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4654703203dSis if (! hi)
4664703203dSis return (EILSEQ);
4674703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4684703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4694703203dSis + UCONV_U16_START;
4704703203dSis hi = 0;
4714703203dSis } else if (hi) {
4724703203dSis return (EILSEQ);
4734703203dSis }
4744703203dSis
4754703203dSis /*
4764703203dSis * Now we convert a UTF-32 character into a UTF-8 character.
4774703203dSis * Unicode coding space is between U+0000 and U+10FFFF;
4784703203dSis * anything bigger is an illegal character.
4794703203dSis */
4804703203dSis if (lo <= UCONV_U8_ONE_BYTE) {
4814703203dSis if (u8l >= *utf8len)
4824703203dSis return (E2BIG);
4834703203dSis u8s[u8l++] = (uchar_t)lo;
4844703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) {
4854703203dSis if ((u8l + 1) >= *utf8len)
4864703203dSis return (E2BIG);
4874703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4884703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
4894703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) {
4904703203dSis if ((u8l + 2) >= *utf8len)
4914703203dSis return (E2BIG);
4924703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4934703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4944703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
4954703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) {
4964703203dSis if ((u8l + 3) >= *utf8len)
4974703203dSis return (E2BIG);
4984703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
4994703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5004703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5014703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
5024703203dSis } else {
5034703203dSis return (EILSEQ);
5044703203dSis }
5054703203dSis }
5064703203dSis
5074703203dSis if (hi)
5084703203dSis return (EINVAL);
5094703203dSis
5104703203dSis *utf16len = u16l;
5114703203dSis *utf8len = u8l;
5124703203dSis
5134703203dSis return (0);
5144703203dSis }
5154703203dSis
5164703203dSis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5174703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5184703203dSis uint16_t *u16s, size_t *utf16len, int flag)
5194703203dSis {
5204703203dSis int inendian;
5214703203dSis int outendian;
5224703203dSis size_t u16l;
5234703203dSis size_t u32l;
5244703203dSis uint32_t hi;
5254703203dSis uint32_t lo;
5264703203dSis boolean_t do_not_ignore_null;
5274703203dSis
5284703203dSis if (u32s == NULL || utf32len == NULL)
5294703203dSis return (EILSEQ);
5304703203dSis
5314703203dSis if (u16s == NULL || utf16len == NULL)
5324703203dSis return (E2BIG);
5334703203dSis
5344703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
5354703203dSis return (EBADF);
5364703203dSis
5374703203dSis u16l = u32l = 0;
5384703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5394703203dSis
5404703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
5414703203dSis check_bom32(u32s, *utf32len, &inendian))
5424703203dSis u32l++;
5434703203dSis
5444703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
5454703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
5464703203dSis
5474703203dSis if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5484703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5494703203dSis UCONV_BOM_SWAPPED;
5504703203dSis
5514703203dSis for (; u32l < *utf32len; u32l++) {
5524703203dSis if (u32s[u32l] == 0 && do_not_ignore_null)
5534703203dSis break;
5544703203dSis
5554703203dSis hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5564703203dSis
5574703203dSis /*
5584703203dSis * Anything bigger than the Unicode coding space, i.e.,
5594703203dSis * Unicode scalar value bigger than U+10FFFF, is an illegal
5604703203dSis * character.
5614703203dSis */
5624703203dSis if (hi > UCONV_UNICODE_MAX)
5634703203dSis return (EILSEQ);
5644703203dSis
5654703203dSis /*
5664703203dSis * Anything bigger than U+FFFF must be converted into
5674703203dSis * a surrogate pair in UTF-16.
5684703203dSis */
5694703203dSis if (hi >= UCONV_U16_START) {
5704703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5714703203dSis UCONV_U16_LO_MIN;
5724703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5734703203dSis UCONV_U16_HI_MIN;
5744703203dSis
5754703203dSis if ((u16l + 1) >= *utf16len)
5764703203dSis return (E2BIG);
5774703203dSis
5784703203dSis if (outendian) {
5794703203dSis u16s[u16l++] = (uint16_t)hi;
5804703203dSis u16s[u16l++] = (uint16_t)lo;
5814703203dSis } else {
5824703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5834703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5844703203dSis }
5854703203dSis } else {
5864703203dSis if (u16l >= *utf16len)
5874703203dSis return (E2BIG);
5884703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi :
5894703203dSis BSWAP_16(((uint16_t)hi));
5904703203dSis }
5914703203dSis }
5924703203dSis
5934703203dSis *utf16len = u16l;
5944703203dSis *utf32len = u32l;
5954703203dSis
5964703203dSis return (0);
5974703203dSis }
5984703203dSis
5994703203dSis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6004703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6014703203dSis uchar_t *u8s, size_t *utf8len, int flag)
6024703203dSis {
6034703203dSis int inendian;
6044703203dSis int outendian;
6054703203dSis size_t u32l;
6064703203dSis size_t u8l;
6074703203dSis uint32_t lo;
6084703203dSis boolean_t do_not_ignore_null;
6094703203dSis
6104703203dSis if (u32s == NULL || utf32len == NULL)
6114703203dSis return (EILSEQ);
6124703203dSis
6134703203dSis if (u8s == NULL || utf8len == NULL)
6144703203dSis return (E2BIG);
6154703203dSis
6164703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
6174703203dSis return (EBADF);
6184703203dSis
6194703203dSis u32l = u8l = 0;
6204703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6214703203dSis
6224703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
6234703203dSis check_bom32(u32s, *utf32len, &inendian))
6244703203dSis u32l++;
6254703203dSis
6264703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
6274703203dSis
6284703203dSis for (; u32l < *utf32len; u32l++) {
6294703203dSis if (u32s[u32l] == 0 && do_not_ignore_null)
6304703203dSis break;
6314703203dSis
6324703203dSis lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6334703203dSis
6344703203dSis if (lo <= UCONV_U8_ONE_BYTE) {
6354703203dSis if (u8l >= *utf8len)
6364703203dSis return (E2BIG);
6374703203dSis u8s[u8l++] = (uchar_t)lo;
6384703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) {
6394703203dSis if ((u8l + 1) >= *utf8len)
6404703203dSis return (E2BIG);
6414703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6424703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
6434703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) {
6444703203dSis if ((u8l + 2) >= *utf8len)
6454703203dSis return (E2BIG);
6464703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6474703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6484703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
6494703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) {
6504703203dSis if ((u8l + 3) >= *utf8len)
6514703203dSis return (E2BIG);
6524703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6534703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6544703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6554703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
6564703203dSis } else {
6574703203dSis return (EILSEQ);
6584703203dSis }
6594703203dSis }
6604703203dSis
6614703203dSis *utf32len = u32l;
6624703203dSis *utf8len = u8l;
6634703203dSis
6644703203dSis return (0);
6654703203dSis }
6664703203dSis
6674703203dSis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6684703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6694703203dSis uint16_t *u16s, size_t *utf16len, int flag)
6704703203dSis {
6714703203dSis int inendian;
6724703203dSis int outendian;
6734703203dSis size_t u16l;
6744703203dSis size_t u8l;
6754703203dSis uint32_t hi;
6764703203dSis uint32_t lo;
6774703203dSis int remaining_bytes;
6784703203dSis int first_b;
6794703203dSis boolean_t do_not_ignore_null;
6804703203dSis
6814703203dSis if (u8s == NULL || utf8len == NULL)
6824703203dSis return (EILSEQ);
6834703203dSis
6844703203dSis if (u16s == NULL || utf16len == NULL)
6854703203dSis return (E2BIG);
6864703203dSis
6874703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
6884703203dSis return (EBADF);
6894703203dSis
6904703203dSis u16l = u8l = 0;
6914703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6924703203dSis
6934703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
6944703203dSis
6954703203dSis if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6964703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6974703203dSis UCONV_BOM_SWAPPED;
6984703203dSis
6994703203dSis for (; u8l < *utf8len; ) {
7004703203dSis if (u8s[u8l] == 0 && do_not_ignore_null)
7014703203dSis break;
7024703203dSis
7034703203dSis /*
7044703203dSis * Collect a UTF-8 character and convert it to a UTF-32
7054703203dSis * character. In doing so, we screen out illegally formed
7064703203dSis * UTF-8 characters and treat such as illegal characters.
7074703203dSis * The algorithm at below also screens out anything bigger
7084703203dSis * than the U+10FFFF.
7094703203dSis *
7104703203dSis * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7114703203dSis * more details on the illegal values of UTF-8 character
7124703203dSis * bytes.
7134703203dSis */
7144703203dSis hi = (uint32_t)u8s[u8l++];
7154703203dSis
7164703203dSis if (hi > UCONV_ASCII_MAX) {
7174703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7184703203dSis return (EILSEQ);
7194703203dSis
7204703203dSis first_b = hi;
721*15d9d0b5Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
7224703203dSis
7234703203dSis for (; remaining_bytes > 0; remaining_bytes--) {
7244703203dSis /*
7254703203dSis * If we have no more bytes, the current
7264703203dSis * UTF-8 character is incomplete.
7274703203dSis */
7284703203dSis if (u8l >= *utf8len)
7294703203dSis return (EINVAL);
7304703203dSis
7314703203dSis lo = (uint32_t)u8s[u8l++];
7324703203dSis
7334703203dSis if (first_b) {
7344703203dSis if (lo < valid_min_2nd_byte[first_b] ||
7354703203dSis lo > valid_max_2nd_byte[first_b])
7364703203dSis return (EILSEQ);
7374703203dSis first_b = 0;
7384703203dSis } else if (lo < UCONV_U8_BYTE_MIN ||
7394703203dSis lo > UCONV_U8_BYTE_MAX) {
7404703203dSis return (EILSEQ);
7414703203dSis }
7424703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) |
7434703203dSis (lo & UCONV_U8_BIT_MASK);
7444703203dSis }
7454703203dSis }
7464703203dSis
7474703203dSis if (hi >= UCONV_U16_START) {
7484703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7494703203dSis UCONV_U16_LO_MIN;
7504703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7514703203dSis UCONV_U16_HI_MIN;
7524703203dSis
7534703203dSis if ((u16l + 1) >= *utf16len)
7544703203dSis return (E2BIG);
7554703203dSis
7564703203dSis if (outendian) {
7574703203dSis u16s[u16l++] = (uint16_t)hi;
7584703203dSis u16s[u16l++] = (uint16_t)lo;
7594703203dSis } else {
7604703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7614703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7624703203dSis }
7634703203dSis } else {
7644703203dSis if (u16l >= *utf16len)
7654703203dSis return (E2BIG);
7664703203dSis
7674703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi :
7684703203dSis BSWAP_16(((uint16_t)hi));
7694703203dSis }
7704703203dSis }
7714703203dSis
7724703203dSis *utf16len = u16l;
7734703203dSis *utf8len = u8l;
7744703203dSis
7754703203dSis return (0);
7764703203dSis }
7774703203dSis
7784703203dSis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7794703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7804703203dSis uint32_t *u32s, size_t *utf32len, int flag)
7814703203dSis {
7824703203dSis int inendian;
7834703203dSis int outendian;
7844703203dSis size_t u32l;
7854703203dSis size_t u8l;
7864703203dSis uint32_t hi;
7874703203dSis uint32_t c;
7884703203dSis int remaining_bytes;
7894703203dSis int first_b;
7904703203dSis boolean_t do_not_ignore_null;
7914703203dSis
7924703203dSis if (u8s == NULL || utf8len == NULL)
7934703203dSis return (EILSEQ);
7944703203dSis
7954703203dSis if (u32s == NULL || utf32len == NULL)
7964703203dSis return (E2BIG);
7974703203dSis
7984703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
7994703203dSis return (EBADF);
8004703203dSis
8014703203dSis u32l = u8l = 0;
8024703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8034703203dSis
8044703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
8054703203dSis
8064703203dSis if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8074703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8084703203dSis UCONV_BOM_SWAPPED_32;
8094703203dSis
8104703203dSis for (; u8l < *utf8len; ) {
8114703203dSis if (u8s[u8l] == 0 && do_not_ignore_null)
8124703203dSis break;
8134703203dSis
8144703203dSis hi = (uint32_t)u8s[u8l++];
8154703203dSis
8164703203dSis if (hi > UCONV_ASCII_MAX) {
8174703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8184703203dSis return (EILSEQ);
8194703203dSis
8204703203dSis first_b = hi;
821*15d9d0b5Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
8224703203dSis
8234703203dSis for (; remaining_bytes > 0; remaining_bytes--) {
8244703203dSis if (u8l >= *utf8len)
8254703203dSis return (EINVAL);
8264703203dSis
8274703203dSis c = (uint32_t)u8s[u8l++];
8284703203dSis
8294703203dSis if (first_b) {
8304703203dSis if (c < valid_min_2nd_byte[first_b] ||
8314703203dSis c > valid_max_2nd_byte[first_b])
8324703203dSis return (EILSEQ);
8334703203dSis first_b = 0;
8344703203dSis } else if (c < UCONV_U8_BYTE_MIN ||
8354703203dSis c > UCONV_U8_BYTE_MAX) {
8364703203dSis return (EILSEQ);
8374703203dSis }
8384703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) |
8394703203dSis (c & UCONV_U8_BIT_MASK);
8404703203dSis }
8414703203dSis }
8424703203dSis
8434703203dSis if (u32l >= *utf32len)
8444703203dSis return (E2BIG);
8454703203dSis
8464703203dSis u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8474703203dSis }
8484703203dSis
8494703203dSis *utf32len = u32l;
8504703203dSis *utf8len = u8l;
8514703203dSis
8524703203dSis return (0);
8534703203dSis }
854