/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */

#include <sys/types.h>
#ifdef	_KERNEL
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/sunddi.h>
#else
#include <sys/u8_textprep.h>
#endif	/* _KERNEL */
#include <sys/byteorder.h>
#include <sys/errno.h>


/*
 * UTF-16 surrogate constants: the inclusive ranges of the high (leading)
 * and low (trailing) surrogate code units, the factor by which the high
 * surrogate offset is scaled when a pair is combined (0x400; despite its
 * name it is used as a multiplier/divisor, not as a shift count), the mask
 * applied to the combined 20-bit offset, and the first code point outside
 * of the BMP.
 */
#define	UCONV_U16_HI_MIN		(0xd800U)
#define	UCONV_U16_HI_MAX		(0xdbffU)
#define	UCONV_U16_LO_MIN		(0xdc00U)
#define	UCONV_U16_LO_MAX		(0xdfffU)
#define	UCONV_U16_BIT_SHIFT		(0x0400U)
#define	UCONV_U16_BIT_MASK		(0x0fffffU)
#define	UCONV_U16_START			(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX		(0x10ffffU)
#define	UCONV_ASCII_MAX			(0x7fU)

/* The mask values for input and output endians. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros; which flag counts as "native" is
 * selected at compile time by _BIG_ENDIAN.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is the byte-swapped 16-bit form and UCONV_BOM_SWAPPED_32
 * is the byte-swapped 32-bit form.
 */
#define	UCONV_BOM_NORMAL		(0xfeffU)
#define	UCONV_BOM_SWAPPED		(0xfffeU)
#define	UCONV_BOM_SWAPPED_32		(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: the largest
 * code point that fits in a one, two, three, and four byte UTF-8 character.
 */
#define	UCONV_U8_ONE_BYTE		(0x7fU)
#define	UCONV_U8_TWO_BYTES		(0x7ffU)
#define	UCONV_U8_THREE_BYTES		(0xffffU)
#define	UCONV_U8_FOUR_BYTES		(0x10ffffU)

/*
 * The common minimum and maximum values of the second and later bytes of
 * a multibyte UTF-8 character.
 */
#define	UCONV_U8_BYTE_MIN		(0x80U)
#define	UCONV_U8_BYTE_MAX		(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define	UCONV_U8_BIT_SHIFT		6
#define	UCONV_U8_BIT_MASK		0x3f

1074703203dSis /*
1084703203dSis * The following vector shows remaining bytes in a UTF-8 character.
1094703203dSis * Index will be the first byte of the character.
1104703203dSis */
1114703203dSis static const uchar_t remaining_bytes_tbl[0x100] = {
1124703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1134703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1144703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1154703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1164703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1174703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1184703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1194703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1204703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1214703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1224703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1234703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1244703203dSis
1254703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
1264703203dSis 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1274703203dSis
1284703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
1294703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1304703203dSis
1314703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
1324703203dSis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1334703203dSis
1344703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
1354703203dSis 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1364703203dSis };
1374703203dSis
1384703203dSis /*
1394703203dSis * The following is a vector of bit-masks to get used bits in
1404703203dSis * the first byte of a UTF-8 character. Index is remaining bytes at above of
1414703203dSis * the character.
1424703203dSis */
143*15d9d0b5Syy154373 #ifdef _KERNEL
144*15d9d0b5Syy154373 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145*15d9d0b5Syy154373 #else
146*15d9d0b5Syy154373 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147*15d9d0b5Syy154373 #endif /* _KERNEL */
1484703203dSis
1494703203dSis /*
1504703203dSis * The following two vectors are to provide valid minimum and
1514703203dSis * maximum values for the 2'nd byte of a multibyte UTF-8 character for
1524703203dSis * better illegal sequence checking. The index value must be the value of
1534703203dSis * the first byte of the UTF-8 character.
1544703203dSis */
1554703203dSis static const uchar_t valid_min_2nd_byte[0x100] = {
1564703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1574703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1584703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1594703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1604703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1614703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1624703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1634703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1644703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1654703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1664703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1674703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1684703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1694703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1704703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1714703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1724703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1734703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1744703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1754703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1764703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1774703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1784703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1794703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
1804703203dSis
1814703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */
1824703203dSis 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1834703203dSis
1844703203dSis /* C8 C9 CA CB CC CD CE CF */
1854703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1864703203dSis
1874703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */
1884703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1894703203dSis
1904703203dSis /* D8 D9 DA DB DC DD DE DF */
1914703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1924703203dSis
1934703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */
1944703203dSis 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1954703203dSis
1964703203dSis /* E8 E9 EA EB EC ED EE EF */
1974703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1984703203dSis
1994703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */
2004703203dSis 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
2014703203dSis
2024703203dSis 0, 0, 0, 0, 0, 0, 0, 0
2034703203dSis };
2044703203dSis
2054703203dSis static const uchar_t valid_max_2nd_byte[0x100] = {
2064703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2074703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2084703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2094703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2104703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2114703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2124703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2134703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2144703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2154703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2164703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2174703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2184703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2194703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2204703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2214703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2224703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2234703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2244703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2254703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2264703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2274703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2284703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2294703203dSis 0, 0, 0, 0, 0, 0, 0, 0,
2304703203dSis
2314703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */
2324703203dSis 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2334703203dSis
2344703203dSis /* C8 C9 CA CB CC CD CE CF */
2354703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2364703203dSis
2374703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */
2384703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2394703203dSis
2404703203dSis /* D8 D9 DA DB DC DD DE DF */
2414703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2424703203dSis
2434703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */
2444703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
2454703203dSis
2464703203dSis /* E8 E9 EA EB EC ED EE EF */
2474703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
2484703203dSis
2494703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */
2504703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
2514703203dSis
2524703203dSis 0, 0, 0, 0, 0, 0, 0, 0
2534703203dSis };
2544703203dSis
2554703203dSis
2564703203dSis static int
check_endian(int flag,int * in,int * out)2574703203dSis check_endian(int flag, int *in, int *out)
2584703203dSis {
2594703203dSis *in = flag & UCONV_IN_ENDIAN_MASKS;
2604703203dSis
2614703203dSis /* You cannot have both. */
2624703203dSis if (*in == UCONV_IN_ENDIAN_MASKS)
2634703203dSis return (EBADF);
2644703203dSis
2654703203dSis if (*in == 0)
2664703203dSis *in = UCONV_IN_NAT_ENDIAN;
2674703203dSis
2684703203dSis *out = flag & UCONV_OUT_ENDIAN_MASKS;
2694703203dSis
2704703203dSis /* You cannot have both. */
2714703203dSis if (*out == UCONV_OUT_ENDIAN_MASKS)
2724703203dSis return (EBADF);
2734703203dSis
2744703203dSis if (*out == 0)
2754703203dSis *out = UCONV_OUT_NAT_ENDIAN;
2764703203dSis
2774703203dSis return (0);
2784703203dSis }
2794703203dSis
2804703203dSis static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)2814703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in)
2824703203dSis {
2834703203dSis if (u16l > 0) {
2844703203dSis if (*u16s == UCONV_BOM_NORMAL) {
2854703203dSis *in = UCONV_IN_NAT_ENDIAN;
2864703203dSis return (B_TRUE);
2874703203dSis }
2884703203dSis if (*u16s == UCONV_BOM_SWAPPED) {
2894703203dSis *in = UCONV_IN_REV_ENDIAN;
2904703203dSis return (B_TRUE);
2914703203dSis }
2924703203dSis }
2934703203dSis
2944703203dSis return (B_FALSE);
2954703203dSis }
2964703203dSis
2974703203dSis static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)2984703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in)
2994703203dSis {
3004703203dSis if (u32l > 0) {
3014703203dSis if (*u32s == UCONV_BOM_NORMAL) {
3024703203dSis *in = UCONV_IN_NAT_ENDIAN;
3034703203dSis return (B_TRUE);
3044703203dSis }
3054703203dSis if (*u32s == UCONV_BOM_SWAPPED_32) {
3064703203dSis *in = UCONV_IN_REV_ENDIAN;
3074703203dSis return (B_TRUE);
3084703203dSis }
3094703203dSis }
3104703203dSis
3114703203dSis return (B_FALSE);
3124703203dSis }
3134703203dSis
3144703203dSis int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)3154703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
3164703203dSis uint32_t *u32s, size_t *utf32len, int flag)
3174703203dSis {
3184703203dSis int inendian;
3194703203dSis int outendian;
3204703203dSis size_t u16l;
3214703203dSis size_t u32l;
3224703203dSis uint32_t hi;
3234703203dSis uint32_t lo;
3244703203dSis boolean_t do_not_ignore_null;
3254703203dSis
3264703203dSis /*
3274703203dSis * Do preliminary validity checks on parameters and collect info on
3284703203dSis * endians.
3294703203dSis */
3304703203dSis if (u16s == NULL || utf16len == NULL)
3314703203dSis return (EILSEQ);
3324703203dSis
3334703203dSis if (u32s == NULL || utf32len == NULL)
3344703203dSis return (E2BIG);
3354703203dSis
3364703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
3374703203dSis return (EBADF);
3384703203dSis
3394703203dSis /*
3404703203dSis * Initialize input and output parameter buffer indices and
3414703203dSis * temporary variables.
3424703203dSis */
3434703203dSis u16l = u32l = 0;
3444703203dSis hi = 0;
3454703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
3464703203dSis
3474703203dSis /*
3484703203dSis * Check on the BOM at the beginning of the input buffer if required
3494703203dSis * and if there is indeed one, process it.
3504703203dSis */
3514703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
3524703203dSis check_bom16(u16s, *utf16len, &inendian))
3534703203dSis u16l++;
3544703203dSis
3554703203dSis /*
3564703203dSis * Reset inendian and outendian so that after this point, those can be
3574703203dSis * used as condition values.
3584703203dSis */
3594703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
3604703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
3614703203dSis
3624703203dSis /*
3634703203dSis * If there is something in the input buffer and if necessary and
3644703203dSis * requested, save the BOM at the output buffer.
3654703203dSis */
3664703203dSis if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
3674703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
3684703203dSis UCONV_BOM_SWAPPED_32;
3694703203dSis
3704703203dSis /*
3714703203dSis * Do conversion; if encounter a surrogate pair, assemble high and
3724703203dSis * low pair values to form a UTF-32 character. If a half of a pair
3734703203dSis * exists alone, then, either it is an illegal (EILSEQ) or
3744703203dSis * invalid (EINVAL) value.
3754703203dSis */
3764703203dSis for (; u16l < *utf16len; u16l++) {
3774703203dSis if (u16s[u16l] == 0 && do_not_ignore_null)
3784703203dSis break;
3794703203dSis
3804703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
3814703203dSis
3824703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
3834703203dSis if (hi)
3844703203dSis return (EILSEQ);
3854703203dSis hi = lo;
3864703203dSis continue;
3874703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
3884703203dSis if (! hi)
3894703203dSis return (EILSEQ);
3904703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
3914703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
3924703203dSis + UCONV_U16_START;
3934703203dSis hi = 0;
3944703203dSis } else if (hi) {
3954703203dSis return (EILSEQ);
3964703203dSis }
3974703203dSis
3984703203dSis if (u32l >= *utf32len)
3994703203dSis return (E2BIG);
4004703203dSis
4014703203dSis u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
4024703203dSis }
4034703203dSis
4044703203dSis /*
4054703203dSis * If high half didn't see low half, then, it's most likely the input
4064703203dSis * parameter is incomplete.
4074703203dSis */
4084703203dSis if (hi)
4094703203dSis return (EINVAL);
4104703203dSis
4114703203dSis /*
4124703203dSis * Save the number of consumed and saved characters. They do not
4134703203dSis * include terminating NULL character (U+0000) at the end of
4144703203dSis * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
4154703203dSis * the input buffer length is big enough to include the terminating
4164703203dSis * NULL character).
4174703203dSis */
4184703203dSis *utf16len = u16l;
4194703203dSis *utf32len = u32l;
4204703203dSis
4214703203dSis return (0);
4224703203dSis }
4234703203dSis
4244703203dSis int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)4254703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
4264703203dSis uchar_t *u8s, size_t *utf8len, int flag)
4274703203dSis {
4284703203dSis int inendian;
4294703203dSis int outendian;
4304703203dSis size_t u16l;
4314703203dSis size_t u8l;
4324703203dSis uint32_t hi;
4334703203dSis uint32_t lo;
4344703203dSis boolean_t do_not_ignore_null;
4354703203dSis
4364703203dSis if (u16s == NULL || utf16len == NULL)
4374703203dSis return (EILSEQ);
4384703203dSis
4394703203dSis if (u8s == NULL || utf8len == NULL)
4404703203dSis return (E2BIG);
4414703203dSis
4424703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
4434703203dSis return (EBADF);
4444703203dSis
4454703203dSis u16l = u8l = 0;
4464703203dSis hi = 0;
4474703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
4484703203dSis
4494703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
4504703203dSis check_bom16(u16s, *utf16len, &inendian))
4514703203dSis u16l++;
4524703203dSis
4534703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
4544703203dSis
4554703203dSis for (; u16l < *utf16len; u16l++) {
4564703203dSis if (u16s[u16l] == 0 && do_not_ignore_null)
4574703203dSis break;
4584703203dSis
4594703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
4604703203dSis
4614703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
4624703203dSis if (hi)
4634703203dSis return (EILSEQ);
4644703203dSis hi = lo;
4654703203dSis continue;
4664703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
4674703203dSis if (! hi)
4684703203dSis return (EILSEQ);
4694703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
4704703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
4714703203dSis + UCONV_U16_START;
4724703203dSis hi = 0;
4734703203dSis } else if (hi) {
4744703203dSis return (EILSEQ);
4754703203dSis }
4764703203dSis
4774703203dSis /*
4784703203dSis * Now we convert a UTF-32 character into a UTF-8 character.
4794703203dSis * Unicode coding space is between U+0000 and U+10FFFF;
4804703203dSis * anything bigger is an illegal character.
4814703203dSis */
4824703203dSis if (lo <= UCONV_U8_ONE_BYTE) {
4834703203dSis if (u8l >= *utf8len)
4844703203dSis return (E2BIG);
4854703203dSis u8s[u8l++] = (uchar_t)lo;
4864703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) {
4874703203dSis if ((u8l + 1) >= *utf8len)
4884703203dSis return (E2BIG);
4894703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
4904703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
4914703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) {
4924703203dSis if ((u8l + 2) >= *utf8len)
4934703203dSis return (E2BIG);
4944703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
4954703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
4964703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
4974703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) {
4984703203dSis if ((u8l + 3) >= *utf8len)
4994703203dSis return (E2BIG);
5004703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
5014703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
5024703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
5034703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
5044703203dSis } else {
5054703203dSis return (EILSEQ);
5064703203dSis }
5074703203dSis }
5084703203dSis
5094703203dSis if (hi)
5104703203dSis return (EINVAL);
5114703203dSis
5124703203dSis *utf16len = u16l;
5134703203dSis *utf8len = u8l;
5144703203dSis
5154703203dSis return (0);
5164703203dSis }
5174703203dSis
5184703203dSis int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)5194703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
5204703203dSis uint16_t *u16s, size_t *utf16len, int flag)
5214703203dSis {
5224703203dSis int inendian;
5234703203dSis int outendian;
5244703203dSis size_t u16l;
5254703203dSis size_t u32l;
5264703203dSis uint32_t hi;
5274703203dSis uint32_t lo;
5284703203dSis boolean_t do_not_ignore_null;
5294703203dSis
5304703203dSis if (u32s == NULL || utf32len == NULL)
5314703203dSis return (EILSEQ);
5324703203dSis
5334703203dSis if (u16s == NULL || utf16len == NULL)
5344703203dSis return (E2BIG);
5354703203dSis
5364703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
5374703203dSis return (EBADF);
5384703203dSis
5394703203dSis u16l = u32l = 0;
5404703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
5414703203dSis
5424703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
5434703203dSis check_bom32(u32s, *utf32len, &inendian))
5444703203dSis u32l++;
5454703203dSis
5464703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
5474703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
5484703203dSis
5494703203dSis if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
5504703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
5514703203dSis UCONV_BOM_SWAPPED;
5524703203dSis
5534703203dSis for (; u32l < *utf32len; u32l++) {
5544703203dSis if (u32s[u32l] == 0 && do_not_ignore_null)
5554703203dSis break;
5564703203dSis
5574703203dSis hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
5584703203dSis
5594703203dSis /*
5604703203dSis * Anything bigger than the Unicode coding space, i.e.,
5614703203dSis * Unicode scalar value bigger than U+10FFFF, is an illegal
5624703203dSis * character.
5634703203dSis */
5644703203dSis if (hi > UCONV_UNICODE_MAX)
5654703203dSis return (EILSEQ);
5664703203dSis
5674703203dSis /*
5684703203dSis * Anything bigger than U+FFFF must be converted into
5694703203dSis * a surrogate pair in UTF-16.
5704703203dSis */
5714703203dSis if (hi >= UCONV_U16_START) {
5724703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
5734703203dSis UCONV_U16_LO_MIN;
5744703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
5754703203dSis UCONV_U16_HI_MIN;
5764703203dSis
5774703203dSis if ((u16l + 1) >= *utf16len)
5784703203dSis return (E2BIG);
5794703203dSis
5804703203dSis if (outendian) {
5814703203dSis u16s[u16l++] = (uint16_t)hi;
5824703203dSis u16s[u16l++] = (uint16_t)lo;
5834703203dSis } else {
5844703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
5854703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
5864703203dSis }
5874703203dSis } else {
5884703203dSis if (u16l >= *utf16len)
5894703203dSis return (E2BIG);
5904703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi :
5914703203dSis BSWAP_16(((uint16_t)hi));
5924703203dSis }
5934703203dSis }
5944703203dSis
5954703203dSis *utf16len = u16l;
5964703203dSis *utf32len = u32l;
5974703203dSis
5984703203dSis return (0);
5994703203dSis }
6004703203dSis
6014703203dSis int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)6024703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
6034703203dSis uchar_t *u8s, size_t *utf8len, int flag)
6044703203dSis {
6054703203dSis int inendian;
6064703203dSis int outendian;
6074703203dSis size_t u32l;
6084703203dSis size_t u8l;
6094703203dSis uint32_t lo;
6104703203dSis boolean_t do_not_ignore_null;
6114703203dSis
6124703203dSis if (u32s == NULL || utf32len == NULL)
6134703203dSis return (EILSEQ);
6144703203dSis
6154703203dSis if (u8s == NULL || utf8len == NULL)
6164703203dSis return (E2BIG);
6174703203dSis
6184703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
6194703203dSis return (EBADF);
6204703203dSis
6214703203dSis u32l = u8l = 0;
6224703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6234703203dSis
6244703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) &&
6254703203dSis check_bom32(u32s, *utf32len, &inendian))
6264703203dSis u32l++;
6274703203dSis
6284703203dSis inendian &= UCONV_IN_NAT_ENDIAN;
6294703203dSis
6304703203dSis for (; u32l < *utf32len; u32l++) {
6314703203dSis if (u32s[u32l] == 0 && do_not_ignore_null)
6324703203dSis break;
6334703203dSis
6344703203dSis lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
6354703203dSis
6364703203dSis if (lo <= UCONV_U8_ONE_BYTE) {
6374703203dSis if (u8l >= *utf8len)
6384703203dSis return (E2BIG);
6394703203dSis u8s[u8l++] = (uchar_t)lo;
6404703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) {
6414703203dSis if ((u8l + 1) >= *utf8len)
6424703203dSis return (E2BIG);
6434703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
6444703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
6454703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) {
6464703203dSis if ((u8l + 2) >= *utf8len)
6474703203dSis return (E2BIG);
6484703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
6494703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
6504703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
6514703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) {
6524703203dSis if ((u8l + 3) >= *utf8len)
6534703203dSis return (E2BIG);
6544703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
6554703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
6564703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
6574703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
6584703203dSis } else {
6594703203dSis return (EILSEQ);
6604703203dSis }
6614703203dSis }
6624703203dSis
6634703203dSis *utf32len = u32l;
6644703203dSis *utf8len = u8l;
6654703203dSis
6664703203dSis return (0);
6674703203dSis }
6684703203dSis
6694703203dSis int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)6704703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
6714703203dSis uint16_t *u16s, size_t *utf16len, int flag)
6724703203dSis {
6734703203dSis int inendian;
6744703203dSis int outendian;
6754703203dSis size_t u16l;
6764703203dSis size_t u8l;
6774703203dSis uint32_t hi;
6784703203dSis uint32_t lo;
6794703203dSis int remaining_bytes;
6804703203dSis int first_b;
6814703203dSis boolean_t do_not_ignore_null;
6824703203dSis
6834703203dSis if (u8s == NULL || utf8len == NULL)
6844703203dSis return (EILSEQ);
6854703203dSis
6864703203dSis if (u16s == NULL || utf16len == NULL)
6874703203dSis return (E2BIG);
6884703203dSis
6894703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
6904703203dSis return (EBADF);
6914703203dSis
6924703203dSis u16l = u8l = 0;
6934703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
6944703203dSis
6954703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
6964703203dSis
6974703203dSis if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
6984703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
6994703203dSis UCONV_BOM_SWAPPED;
7004703203dSis
7014703203dSis for (; u8l < *utf8len; ) {
7024703203dSis if (u8s[u8l] == 0 && do_not_ignore_null)
7034703203dSis break;
7044703203dSis
7054703203dSis /*
7064703203dSis * Collect a UTF-8 character and convert it to a UTF-32
7074703203dSis * character. In doing so, we screen out illegally formed
7084703203dSis * UTF-8 characters and treat such as illegal characters.
7094703203dSis * The algorithm at below also screens out anything bigger
7104703203dSis * than the U+10FFFF.
7114703203dSis *
7124703203dSis * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
7134703203dSis * more details on the illegal values of UTF-8 character
7144703203dSis * bytes.
7154703203dSis */
7164703203dSis hi = (uint32_t)u8s[u8l++];
7174703203dSis
7184703203dSis if (hi > UCONV_ASCII_MAX) {
7194703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
7204703203dSis return (EILSEQ);
7214703203dSis
7224703203dSis first_b = hi;
723*15d9d0b5Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
7244703203dSis
7254703203dSis for (; remaining_bytes > 0; remaining_bytes--) {
7264703203dSis /*
7274703203dSis * If we have no more bytes, the current
7284703203dSis * UTF-8 character is incomplete.
7294703203dSis */
7304703203dSis if (u8l >= *utf8len)
7314703203dSis return (EINVAL);
7324703203dSis
7334703203dSis lo = (uint32_t)u8s[u8l++];
7344703203dSis
7354703203dSis if (first_b) {
7364703203dSis if (lo < valid_min_2nd_byte[first_b] ||
7374703203dSis lo > valid_max_2nd_byte[first_b])
7384703203dSis return (EILSEQ);
7394703203dSis first_b = 0;
7404703203dSis } else if (lo < UCONV_U8_BYTE_MIN ||
7414703203dSis lo > UCONV_U8_BYTE_MAX) {
7424703203dSis return (EILSEQ);
7434703203dSis }
7444703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) |
7454703203dSis (lo & UCONV_U8_BIT_MASK);
7464703203dSis }
7474703203dSis }
7484703203dSis
7494703203dSis if (hi >= UCONV_U16_START) {
7504703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
7514703203dSis UCONV_U16_LO_MIN;
7524703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
7534703203dSis UCONV_U16_HI_MIN;
7544703203dSis
7554703203dSis if ((u16l + 1) >= *utf16len)
7564703203dSis return (E2BIG);
7574703203dSis
7584703203dSis if (outendian) {
7594703203dSis u16s[u16l++] = (uint16_t)hi;
7604703203dSis u16s[u16l++] = (uint16_t)lo;
7614703203dSis } else {
7624703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi));
7634703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo));
7644703203dSis }
7654703203dSis } else {
7664703203dSis if (u16l >= *utf16len)
7674703203dSis return (E2BIG);
7684703203dSis
7694703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi :
7704703203dSis BSWAP_16(((uint16_t)hi));
7714703203dSis }
7724703203dSis }
7734703203dSis
7744703203dSis *utf16len = u16l;
7754703203dSis *utf8len = u8l;
7764703203dSis
7774703203dSis return (0);
7784703203dSis }
7794703203dSis
7804703203dSis int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)7814703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
7824703203dSis uint32_t *u32s, size_t *utf32len, int flag)
7834703203dSis {
7844703203dSis int inendian;
7854703203dSis int outendian;
7864703203dSis size_t u32l;
7874703203dSis size_t u8l;
7884703203dSis uint32_t hi;
7894703203dSis uint32_t c;
7904703203dSis int remaining_bytes;
7914703203dSis int first_b;
7924703203dSis boolean_t do_not_ignore_null;
7934703203dSis
7944703203dSis if (u8s == NULL || utf8len == NULL)
7954703203dSis return (EILSEQ);
7964703203dSis
7974703203dSis if (u32s == NULL || utf32len == NULL)
7984703203dSis return (E2BIG);
7994703203dSis
8004703203dSis if (check_endian(flag, &inendian, &outendian) != 0)
8014703203dSis return (EBADF);
8024703203dSis
8034703203dSis u32l = u8l = 0;
8044703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
8054703203dSis
8064703203dSis outendian &= UCONV_OUT_NAT_ENDIAN;
8074703203dSis
8084703203dSis if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
8094703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
8104703203dSis UCONV_BOM_SWAPPED_32;
8114703203dSis
8124703203dSis for (; u8l < *utf8len; ) {
8134703203dSis if (u8s[u8l] == 0 && do_not_ignore_null)
8144703203dSis break;
8154703203dSis
8164703203dSis hi = (uint32_t)u8s[u8l++];
8174703203dSis
8184703203dSis if (hi > UCONV_ASCII_MAX) {
8194703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
8204703203dSis return (EILSEQ);
8214703203dSis
8224703203dSis first_b = hi;
823*15d9d0b5Syy154373 hi = hi & u8_masks_tbl[remaining_bytes];
8244703203dSis
8254703203dSis for (; remaining_bytes > 0; remaining_bytes--) {
8264703203dSis if (u8l >= *utf8len)
8274703203dSis return (EINVAL);
8284703203dSis
8294703203dSis c = (uint32_t)u8s[u8l++];
8304703203dSis
8314703203dSis if (first_b) {
8324703203dSis if (c < valid_min_2nd_byte[first_b] ||
8334703203dSis c > valid_max_2nd_byte[first_b])
8344703203dSis return (EILSEQ);
8354703203dSis first_b = 0;
8364703203dSis } else if (c < UCONV_U8_BYTE_MIN ||
8374703203dSis c > UCONV_U8_BYTE_MAX) {
8384703203dSis return (EILSEQ);
8394703203dSis }
8404703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) |
8414703203dSis (c & UCONV_U8_BIT_MASK);
8424703203dSis }
8434703203dSis }
8444703203dSis
8454703203dSis if (u32l >= *utf32len)
8464703203dSis return (E2BIG);
8474703203dSis
8484703203dSis u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
8494703203dSis }
8504703203dSis
8514703203dSis *utf32len = u32l;
8524703203dSis *utf8len = u8l;
8534703203dSis
8544703203dSis return (0);
8554703203dSis }
856