1*4703203dSis /* 2*4703203dSis * CDDL HEADER START 3*4703203dSis * 4*4703203dSis * The contents of this file are subject to the terms of the 5*4703203dSis * Common Development and Distribution License (the "License"). 6*4703203dSis * You may not use this file except in compliance with the License. 7*4703203dSis * 8*4703203dSis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*4703203dSis * or http://www.opensolaris.org/os/licensing. 10*4703203dSis * See the License for the specific language governing permissions 11*4703203dSis * and limitations under the License. 12*4703203dSis * 13*4703203dSis * When distributing Covered Code, include this CDDL HEADER in each 14*4703203dSis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*4703203dSis * If applicable, add the following below this CDDL HEADER, with the 16*4703203dSis * fields enclosed by brackets "[]" replaced with your own identifying 17*4703203dSis * information: Portions Copyright [yyyy] [name of copyright owner] 18*4703203dSis * 19*4703203dSis * CDDL HEADER END 20*4703203dSis */ 21*4703203dSis /* 22*4703203dSis * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23*4703203dSis * Use is subject to license terms. 24*4703203dSis */ 25*4703203dSis 26*4703203dSis #pragma ident "%Z%%M% %I% %E% SMI" 27*4703203dSis 28*4703203dSis /* 29*4703203dSis * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. 30*4703203dSis * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) 31*4703203dSis * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), 32*4703203dSis * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also 33*4703203dSis * the section 3C man pages. 34*4703203dSis * Interface stability: Committed 35*4703203dSis */ 36*4703203dSis 37*4703203dSis #include <sys/types.h> 38*4703203dSis #ifdef _KERNEL 39*4703203dSis #include <sys/param.h> 40*4703203dSis #include <sys/sysmacros.h> 41*4703203dSis #include <sys/systm.h> 42*4703203dSis #include <sys/debug.h> 43*4703203dSis #include <sys/kmem.h> 44*4703203dSis #include <sys/sunddi.h> 45*4703203dSis #else 46*4703203dSis #include <sys/u8_textprep.h> 47*4703203dSis #endif /* _KERNEL */ 48*4703203dSis #include <sys/byteorder.h> 49*4703203dSis #include <sys/errno.h> 50*4703203dSis 51*4703203dSis 52*4703203dSis /* 53*4703203dSis * The max and min values of high and low surrogate pairs of UTF-16, 54*4703203dSis * UTF-16 bit shift value, bit mask, and starting value outside of BMP. 55*4703203dSis */ 56*4703203dSis #define UCONV_U16_HI_MIN (0xd800U) 57*4703203dSis #define UCONV_U16_HI_MAX (0xdbffU) 58*4703203dSis #define UCONV_U16_LO_MIN (0xdc00U) 59*4703203dSis #define UCONV_U16_LO_MAX (0xdfffU) 60*4703203dSis #define UCONV_U16_BIT_SHIFT (0x0400U) 61*4703203dSis #define UCONV_U16_BIT_MASK (0x0fffffU) 62*4703203dSis #define UCONV_U16_START (0x010000U) 63*4703203dSis 64*4703203dSis /* The maximum value of Unicode coding space and ASCII coding space. */ 65*4703203dSis #define UCONV_UNICODE_MAX (0x10ffffU) 66*4703203dSis #define UCONV_ASCII_MAX (0x7fU) 67*4703203dSis 68*4703203dSis /* The mask values for input and output endians. */ 69*4703203dSis #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) 70*4703203dSis #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) 71*4703203dSis 72*4703203dSis /* Native and reversed endian macros. */ 73*4703203dSis #ifdef _BIG_ENDIAN 74*4703203dSis #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN 75*4703203dSis #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN 76*4703203dSis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN 77*4703203dSis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN 78*4703203dSis #else 79*4703203dSis #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN 80*4703203dSis #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN 81*4703203dSis #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN 82*4703203dSis #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN 83*4703203dSis #endif /* _BIG_ENDIAN */ 84*4703203dSis 85*4703203dSis /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ 86*4703203dSis #define UCONV_BOM_NORMAL (0xfeffU) 87*4703203dSis #define UCONV_BOM_SWAPPED (0xfffeU) 88*4703203dSis #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) 89*4703203dSis 90*4703203dSis /* UTF-32 boundaries based on UTF-8 character byte lengths. */ 91*4703203dSis #define UCONV_U8_ONE_BYTE (0x7fU) 92*4703203dSis #define UCONV_U8_TWO_BYTES (0x7ffU) 93*4703203dSis #define UCONV_U8_THREE_BYTES (0xffffU) 94*4703203dSis #define UCONV_U8_FOUR_BYTES (0x10ffffU) 95*4703203dSis 96*4703203dSis /* The common minimum and maximum values at the UTF-8 character bytes. */ 97*4703203dSis #define UCONV_U8_BYTE_MIN (0x80U) 98*4703203dSis #define UCONV_U8_BYTE_MAX (0xbfU) 99*4703203dSis 100*4703203dSis /* 101*4703203dSis * The following "6" and "0x3f" came from "10xx xxxx" bit representation of 102*4703203dSis * UTF-8 character bytes. 103*4703203dSis */ 104*4703203dSis #define UCONV_U8_BIT_SHIFT 6 105*4703203dSis #define UCONV_U8_BIT_MASK 0x3f 106*4703203dSis 107*4703203dSis /* 108*4703203dSis * The following vector shows remaining bytes in a UTF-8 character. 109*4703203dSis * Index will be the first byte of the character. 110*4703203dSis */ 111*4703203dSis static const uchar_t remaining_bytes_tbl[0x100] = { 112*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 114*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124*4703203dSis 125*4703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 126*4703203dSis 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 127*4703203dSis 128*4703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 129*4703203dSis 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 130*4703203dSis 131*4703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 132*4703203dSis 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 133*4703203dSis 134*4703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 135*4703203dSis 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 136*4703203dSis }; 137*4703203dSis 138*4703203dSis /* 139*4703203dSis * The following is a vector of bit-masks to get used bits in 140*4703203dSis * the first byte of a UTF-8 character. Index is remaining bytes at above of 141*4703203dSis * the character. 142*4703203dSis */ 143*4703203dSis static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 144*4703203dSis 145*4703203dSis /* 146*4703203dSis * The following two vectors are to provide valid minimum and 147*4703203dSis * maximum values for the 2'nd byte of a multibyte UTF-8 character for 148*4703203dSis * better illegal sequence checking. The index value must be the value of 149*4703203dSis * the first byte of the UTF-8 character. 150*4703203dSis */ 151*4703203dSis static const uchar_t valid_min_2nd_byte[0x100] = { 152*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 153*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 154*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 155*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 156*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 157*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 158*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 159*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 160*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 161*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 162*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 163*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 164*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 165*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 166*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 167*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 168*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 169*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 170*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 171*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 172*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 173*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 174*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 175*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 176*4703203dSis 177*4703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */ 178*4703203dSis 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 179*4703203dSis 180*4703203dSis /* C8 C9 CA CB CC CD CE CF */ 181*4703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 182*4703203dSis 183*4703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */ 184*4703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 185*4703203dSis 186*4703203dSis /* D8 D9 DA DB DC DD DE DF */ 187*4703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 188*4703203dSis 189*4703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */ 190*4703203dSis 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 191*4703203dSis 192*4703203dSis /* E8 E9 EA EB EC ED EE EF */ 193*4703203dSis 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 194*4703203dSis 195*4703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */ 196*4703203dSis 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 197*4703203dSis 198*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0 199*4703203dSis }; 200*4703203dSis 201*4703203dSis static const uchar_t valid_max_2nd_byte[0x100] = { 202*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 203*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 204*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 205*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 206*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 207*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 208*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 209*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 210*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 211*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 212*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 213*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 214*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 215*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 216*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 217*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 218*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 219*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 220*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 221*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 222*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 223*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 224*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 225*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0, 226*4703203dSis 227*4703203dSis /* C0 C1 C2 C3 C4 C5 C6 C7 */ 228*4703203dSis 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 229*4703203dSis 230*4703203dSis /* C8 C9 CA CB CC CD CE CF */ 231*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 232*4703203dSis 233*4703203dSis /* D0 D1 D2 D3 D4 D5 D6 D7 */ 234*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 235*4703203dSis 236*4703203dSis /* D8 D9 DA DB DC DD DE DF */ 237*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 238*4703203dSis 239*4703203dSis /* E0 E1 E2 E3 E4 E5 E6 E7 */ 240*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 241*4703203dSis 242*4703203dSis /* E8 E9 EA EB EC ED EE EF */ 243*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 244*4703203dSis 245*4703203dSis /* F0 F1 F2 F3 F4 F5 F6 F7 */ 246*4703203dSis 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 247*4703203dSis 248*4703203dSis 0, 0, 0, 0, 0, 0, 0, 0 249*4703203dSis }; 250*4703203dSis 251*4703203dSis 252*4703203dSis static int 253*4703203dSis check_endian(int flag, int *in, int *out) 254*4703203dSis { 255*4703203dSis *in = flag & UCONV_IN_ENDIAN_MASKS; 256*4703203dSis 257*4703203dSis /* You cannot have both. */ 258*4703203dSis if (*in == UCONV_IN_ENDIAN_MASKS) 259*4703203dSis return (EBADF); 260*4703203dSis 261*4703203dSis if (*in == 0) 262*4703203dSis *in = UCONV_IN_NAT_ENDIAN; 263*4703203dSis 264*4703203dSis *out = flag & UCONV_OUT_ENDIAN_MASKS; 265*4703203dSis 266*4703203dSis /* You cannot have both. */ 267*4703203dSis if (*out == UCONV_OUT_ENDIAN_MASKS) 268*4703203dSis return (EBADF); 269*4703203dSis 270*4703203dSis if (*out == 0) 271*4703203dSis *out = UCONV_OUT_NAT_ENDIAN; 272*4703203dSis 273*4703203dSis return (0); 274*4703203dSis } 275*4703203dSis 276*4703203dSis static boolean_t 277*4703203dSis check_bom16(const uint16_t *u16s, size_t u16l, int *in) 278*4703203dSis { 279*4703203dSis if (u16l > 0) { 280*4703203dSis if (*u16s == UCONV_BOM_NORMAL) { 281*4703203dSis *in = UCONV_IN_NAT_ENDIAN; 282*4703203dSis return (B_TRUE); 283*4703203dSis } 284*4703203dSis if (*u16s == UCONV_BOM_SWAPPED) { 285*4703203dSis *in = UCONV_IN_REV_ENDIAN; 286*4703203dSis return (B_TRUE); 287*4703203dSis } 288*4703203dSis } 289*4703203dSis 290*4703203dSis return (B_FALSE); 291*4703203dSis } 292*4703203dSis 293*4703203dSis static boolean_t 294*4703203dSis check_bom32(const uint32_t *u32s, size_t u32l, int *in) 295*4703203dSis { 296*4703203dSis if (u32l > 0) { 297*4703203dSis if (*u32s == UCONV_BOM_NORMAL) { 298*4703203dSis *in = UCONV_IN_NAT_ENDIAN; 299*4703203dSis return (B_TRUE); 300*4703203dSis } 301*4703203dSis if (*u32s == UCONV_BOM_SWAPPED_32) { 302*4703203dSis *in = UCONV_IN_REV_ENDIAN; 303*4703203dSis return (B_TRUE); 304*4703203dSis } 305*4703203dSis } 306*4703203dSis 307*4703203dSis return (B_FALSE); 308*4703203dSis } 309*4703203dSis 310*4703203dSis int 311*4703203dSis uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, 312*4703203dSis uint32_t *u32s, size_t *utf32len, int flag) 313*4703203dSis { 314*4703203dSis int inendian; 315*4703203dSis int outendian; 316*4703203dSis size_t u16l; 317*4703203dSis size_t u32l; 318*4703203dSis uint32_t hi; 319*4703203dSis uint32_t lo; 320*4703203dSis boolean_t do_not_ignore_null; 321*4703203dSis 322*4703203dSis /* 323*4703203dSis * Do preliminary validity checks on parameters and collect info on 324*4703203dSis * endians. 325*4703203dSis */ 326*4703203dSis if (u16s == NULL || utf16len == NULL) 327*4703203dSis return (EILSEQ); 328*4703203dSis 329*4703203dSis if (u32s == NULL || utf32len == NULL) 330*4703203dSis return (E2BIG); 331*4703203dSis 332*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 333*4703203dSis return (EBADF); 334*4703203dSis 335*4703203dSis /* 336*4703203dSis * Initialize input and output parameter buffer indices and 337*4703203dSis * temporary variables. 338*4703203dSis */ 339*4703203dSis u16l = u32l = 0; 340*4703203dSis hi = 0; 341*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 342*4703203dSis 343*4703203dSis /* 344*4703203dSis * Check on the BOM at the beginning of the input buffer if required 345*4703203dSis * and if there is indeed one, process it. 346*4703203dSis */ 347*4703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) && 348*4703203dSis check_bom16(u16s, *utf16len, &inendian)) 349*4703203dSis u16l++; 350*4703203dSis 351*4703203dSis /* 352*4703203dSis * Reset inendian and outendian so that after this point, those can be 353*4703203dSis * used as condition values. 354*4703203dSis */ 355*4703203dSis inendian &= UCONV_IN_NAT_ENDIAN; 356*4703203dSis outendian &= UCONV_OUT_NAT_ENDIAN; 357*4703203dSis 358*4703203dSis /* 359*4703203dSis * If there is something in the input buffer and if necessary and 360*4703203dSis * requested, save the BOM at the output buffer. 361*4703203dSis */ 362*4703203dSis if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 363*4703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 364*4703203dSis UCONV_BOM_SWAPPED_32; 365*4703203dSis 366*4703203dSis /* 367*4703203dSis * Do conversion; if encounter a surrogate pair, assemble high and 368*4703203dSis * low pair values to form a UTF-32 character. If a half of a pair 369*4703203dSis * exists alone, then, either it is an illegal (EILSEQ) or 370*4703203dSis * invalid (EINVAL) value. 371*4703203dSis */ 372*4703203dSis for (; u16l < *utf16len; u16l++) { 373*4703203dSis if (u16s[u16l] == 0 && do_not_ignore_null) 374*4703203dSis break; 375*4703203dSis 376*4703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 377*4703203dSis 378*4703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 379*4703203dSis if (hi) 380*4703203dSis return (EILSEQ); 381*4703203dSis hi = lo; 382*4703203dSis continue; 383*4703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 384*4703203dSis if (! hi) 385*4703203dSis return (EILSEQ); 386*4703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 387*4703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 388*4703203dSis + UCONV_U16_START; 389*4703203dSis hi = 0; 390*4703203dSis } else if (hi) { 391*4703203dSis return (EILSEQ); 392*4703203dSis } 393*4703203dSis 394*4703203dSis if (u32l >= *utf32len) 395*4703203dSis return (E2BIG); 396*4703203dSis 397*4703203dSis u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); 398*4703203dSis } 399*4703203dSis 400*4703203dSis /* 401*4703203dSis * If high half didn't see low half, then, it's most likely the input 402*4703203dSis * parameter is incomplete. 403*4703203dSis */ 404*4703203dSis if (hi) 405*4703203dSis return (EINVAL); 406*4703203dSis 407*4703203dSis /* 408*4703203dSis * Save the number of consumed and saved characters. They do not 409*4703203dSis * include terminating NULL character (U+0000) at the end of 410*4703203dSis * the input buffer (even when UCONV_IGNORE_NULL isn't specified and 411*4703203dSis * the input buffer length is big enough to include the terminating 412*4703203dSis * NULL character). 413*4703203dSis */ 414*4703203dSis *utf16len = u16l; 415*4703203dSis *utf32len = u32l; 416*4703203dSis 417*4703203dSis return (0); 418*4703203dSis } 419*4703203dSis 420*4703203dSis int 421*4703203dSis uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, 422*4703203dSis uchar_t *u8s, size_t *utf8len, int flag) 423*4703203dSis { 424*4703203dSis int inendian; 425*4703203dSis int outendian; 426*4703203dSis size_t u16l; 427*4703203dSis size_t u8l; 428*4703203dSis uint32_t hi; 429*4703203dSis uint32_t lo; 430*4703203dSis boolean_t do_not_ignore_null; 431*4703203dSis 432*4703203dSis if (u16s == NULL || utf16len == NULL) 433*4703203dSis return (EILSEQ); 434*4703203dSis 435*4703203dSis if (u8s == NULL || utf8len == NULL) 436*4703203dSis return (E2BIG); 437*4703203dSis 438*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 439*4703203dSis return (EBADF); 440*4703203dSis 441*4703203dSis u16l = u8l = 0; 442*4703203dSis hi = 0; 443*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 444*4703203dSis 445*4703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) && 446*4703203dSis check_bom16(u16s, *utf16len, &inendian)) 447*4703203dSis u16l++; 448*4703203dSis 449*4703203dSis inendian &= UCONV_IN_NAT_ENDIAN; 450*4703203dSis 451*4703203dSis for (; u16l < *utf16len; u16l++) { 452*4703203dSis if (u16s[u16l] == 0 && do_not_ignore_null) 453*4703203dSis break; 454*4703203dSis 455*4703203dSis lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 456*4703203dSis 457*4703203dSis if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 458*4703203dSis if (hi) 459*4703203dSis return (EILSEQ); 460*4703203dSis hi = lo; 461*4703203dSis continue; 462*4703203dSis } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 463*4703203dSis if (! hi) 464*4703203dSis return (EILSEQ); 465*4703203dSis lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 466*4703203dSis lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 467*4703203dSis + UCONV_U16_START; 468*4703203dSis hi = 0; 469*4703203dSis } else if (hi) { 470*4703203dSis return (EILSEQ); 471*4703203dSis } 472*4703203dSis 473*4703203dSis /* 474*4703203dSis * Now we convert a UTF-32 character into a UTF-8 character. 475*4703203dSis * Unicode coding space is between U+0000 and U+10FFFF; 476*4703203dSis * anything bigger is an illegal character. 477*4703203dSis */ 478*4703203dSis if (lo <= UCONV_U8_ONE_BYTE) { 479*4703203dSis if (u8l >= *utf8len) 480*4703203dSis return (E2BIG); 481*4703203dSis u8s[u8l++] = (uchar_t)lo; 482*4703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) { 483*4703203dSis if ((u8l + 1) >= *utf8len) 484*4703203dSis return (E2BIG); 485*4703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 486*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 487*4703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) { 488*4703203dSis if ((u8l + 2) >= *utf8len) 489*4703203dSis return (E2BIG); 490*4703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 491*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 492*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 493*4703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) { 494*4703203dSis if ((u8l + 3) >= *utf8len) 495*4703203dSis return (E2BIG); 496*4703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 497*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 498*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 499*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 500*4703203dSis } else { 501*4703203dSis return (EILSEQ); 502*4703203dSis } 503*4703203dSis } 504*4703203dSis 505*4703203dSis if (hi) 506*4703203dSis return (EINVAL); 507*4703203dSis 508*4703203dSis *utf16len = u16l; 509*4703203dSis *utf8len = u8l; 510*4703203dSis 511*4703203dSis return (0); 512*4703203dSis } 513*4703203dSis 514*4703203dSis int 515*4703203dSis uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, 516*4703203dSis uint16_t *u16s, size_t *utf16len, int flag) 517*4703203dSis { 518*4703203dSis int inendian; 519*4703203dSis int outendian; 520*4703203dSis size_t u16l; 521*4703203dSis size_t u32l; 522*4703203dSis uint32_t hi; 523*4703203dSis uint32_t lo; 524*4703203dSis boolean_t do_not_ignore_null; 525*4703203dSis 526*4703203dSis if (u32s == NULL || utf32len == NULL) 527*4703203dSis return (EILSEQ); 528*4703203dSis 529*4703203dSis if (u16s == NULL || utf16len == NULL) 530*4703203dSis return (E2BIG); 531*4703203dSis 532*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 533*4703203dSis return (EBADF); 534*4703203dSis 535*4703203dSis u16l = u32l = 0; 536*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 537*4703203dSis 538*4703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) && 539*4703203dSis check_bom32(u32s, *utf32len, &inendian)) 540*4703203dSis u32l++; 541*4703203dSis 542*4703203dSis inendian &= UCONV_IN_NAT_ENDIAN; 543*4703203dSis outendian &= UCONV_OUT_NAT_ENDIAN; 544*4703203dSis 545*4703203dSis if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 546*4703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 547*4703203dSis UCONV_BOM_SWAPPED; 548*4703203dSis 549*4703203dSis for (; u32l < *utf32len; u32l++) { 550*4703203dSis if (u32s[u32l] == 0 && do_not_ignore_null) 551*4703203dSis break; 552*4703203dSis 553*4703203dSis hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 554*4703203dSis 555*4703203dSis /* 556*4703203dSis * Anything bigger than the Unicode coding space, i.e., 557*4703203dSis * Unicode scalar value bigger than U+10FFFF, is an illegal 558*4703203dSis * character. 559*4703203dSis */ 560*4703203dSis if (hi > UCONV_UNICODE_MAX) 561*4703203dSis return (EILSEQ); 562*4703203dSis 563*4703203dSis /* 564*4703203dSis * Anything bigger than U+FFFF must be converted into 565*4703203dSis * a surrogate pair in UTF-16. 566*4703203dSis */ 567*4703203dSis if (hi >= UCONV_U16_START) { 568*4703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 569*4703203dSis UCONV_U16_LO_MIN; 570*4703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 571*4703203dSis UCONV_U16_HI_MIN; 572*4703203dSis 573*4703203dSis if ((u16l + 1) >= *utf16len) 574*4703203dSis return (E2BIG); 575*4703203dSis 576*4703203dSis if (outendian) { 577*4703203dSis u16s[u16l++] = (uint16_t)hi; 578*4703203dSis u16s[u16l++] = (uint16_t)lo; 579*4703203dSis } else { 580*4703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 581*4703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 582*4703203dSis } 583*4703203dSis } else { 584*4703203dSis if (u16l >= *utf16len) 585*4703203dSis return (E2BIG); 586*4703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi : 587*4703203dSis BSWAP_16(((uint16_t)hi)); 588*4703203dSis } 589*4703203dSis } 590*4703203dSis 591*4703203dSis *utf16len = u16l; 592*4703203dSis *utf32len = u32l; 593*4703203dSis 594*4703203dSis return (0); 595*4703203dSis } 596*4703203dSis 597*4703203dSis int 598*4703203dSis uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, 599*4703203dSis uchar_t *u8s, size_t *utf8len, int flag) 600*4703203dSis { 601*4703203dSis int inendian; 602*4703203dSis int outendian; 603*4703203dSis size_t u32l; 604*4703203dSis size_t u8l; 605*4703203dSis uint32_t lo; 606*4703203dSis boolean_t do_not_ignore_null; 607*4703203dSis 608*4703203dSis if (u32s == NULL || utf32len == NULL) 609*4703203dSis return (EILSEQ); 610*4703203dSis 611*4703203dSis if (u8s == NULL || utf8len == NULL) 612*4703203dSis return (E2BIG); 613*4703203dSis 614*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 615*4703203dSis return (EBADF); 616*4703203dSis 617*4703203dSis u32l = u8l = 0; 618*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 619*4703203dSis 620*4703203dSis if ((flag & UCONV_IN_ACCEPT_BOM) && 621*4703203dSis check_bom32(u32s, *utf32len, &inendian)) 622*4703203dSis u32l++; 623*4703203dSis 624*4703203dSis inendian &= UCONV_IN_NAT_ENDIAN; 625*4703203dSis 626*4703203dSis for (; u32l < *utf32len; u32l++) { 627*4703203dSis if (u32s[u32l] == 0 && do_not_ignore_null) 628*4703203dSis break; 629*4703203dSis 630*4703203dSis lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 631*4703203dSis 632*4703203dSis if (lo <= UCONV_U8_ONE_BYTE) { 633*4703203dSis if (u8l >= *utf8len) 634*4703203dSis return (E2BIG); 635*4703203dSis u8s[u8l++] = (uchar_t)lo; 636*4703203dSis } else if (lo <= UCONV_U8_TWO_BYTES) { 637*4703203dSis if ((u8l + 1) >= *utf8len) 638*4703203dSis return (E2BIG); 639*4703203dSis u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 640*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 641*4703203dSis } else if (lo <= UCONV_U8_THREE_BYTES) { 642*4703203dSis if ((u8l + 2) >= *utf8len) 643*4703203dSis return (E2BIG); 644*4703203dSis u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 645*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 646*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 647*4703203dSis } else if (lo <= UCONV_U8_FOUR_BYTES) { 648*4703203dSis if ((u8l + 3) >= *utf8len) 649*4703203dSis return (E2BIG); 650*4703203dSis u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 651*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 652*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 653*4703203dSis u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 654*4703203dSis } else { 655*4703203dSis return (EILSEQ); 656*4703203dSis } 657*4703203dSis } 658*4703203dSis 659*4703203dSis *utf32len = u32l; 660*4703203dSis *utf8len = u8l; 661*4703203dSis 662*4703203dSis return (0); 663*4703203dSis } 664*4703203dSis 665*4703203dSis int 666*4703203dSis uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, 667*4703203dSis uint16_t *u16s, size_t *utf16len, int flag) 668*4703203dSis { 669*4703203dSis int inendian; 670*4703203dSis int outendian; 671*4703203dSis size_t u16l; 672*4703203dSis size_t u8l; 673*4703203dSis uint32_t hi; 674*4703203dSis uint32_t lo; 675*4703203dSis int remaining_bytes; 676*4703203dSis int first_b; 677*4703203dSis boolean_t do_not_ignore_null; 678*4703203dSis 679*4703203dSis if (u8s == NULL || utf8len == NULL) 680*4703203dSis return (EILSEQ); 681*4703203dSis 682*4703203dSis if (u16s == NULL || utf16len == NULL) 683*4703203dSis return (E2BIG); 684*4703203dSis 685*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 686*4703203dSis return (EBADF); 687*4703203dSis 688*4703203dSis u16l = u8l = 0; 689*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 690*4703203dSis 691*4703203dSis outendian &= UCONV_OUT_NAT_ENDIAN; 692*4703203dSis 693*4703203dSis if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 694*4703203dSis u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 695*4703203dSis UCONV_BOM_SWAPPED; 696*4703203dSis 697*4703203dSis for (; u8l < *utf8len; ) { 698*4703203dSis if (u8s[u8l] == 0 && do_not_ignore_null) 699*4703203dSis break; 700*4703203dSis 701*4703203dSis /* 702*4703203dSis * Collect a UTF-8 character and convert it to a UTF-32 703*4703203dSis * character. In doing so, we screen out illegally formed 704*4703203dSis * UTF-8 characters and treat such as illegal characters. 705*4703203dSis * The algorithm at below also screens out anything bigger 706*4703203dSis * than the U+10FFFF. 707*4703203dSis * 708*4703203dSis * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for 709*4703203dSis * more details on the illegal values of UTF-8 character 710*4703203dSis * bytes. 711*4703203dSis */ 712*4703203dSis hi = (uint32_t)u8s[u8l++]; 713*4703203dSis 714*4703203dSis if (hi > UCONV_ASCII_MAX) { 715*4703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 716*4703203dSis return (EILSEQ); 717*4703203dSis 718*4703203dSis first_b = hi; 719*4703203dSis hi = hi & masks_tbl[remaining_bytes]; 720*4703203dSis 721*4703203dSis for (; remaining_bytes > 0; remaining_bytes--) { 722*4703203dSis /* 723*4703203dSis * If we have no more bytes, the current 724*4703203dSis * UTF-8 character is incomplete. 725*4703203dSis */ 726*4703203dSis if (u8l >= *utf8len) 727*4703203dSis return (EINVAL); 728*4703203dSis 729*4703203dSis lo = (uint32_t)u8s[u8l++]; 730*4703203dSis 731*4703203dSis if (first_b) { 732*4703203dSis if (lo < valid_min_2nd_byte[first_b] || 733*4703203dSis lo > valid_max_2nd_byte[first_b]) 734*4703203dSis return (EILSEQ); 735*4703203dSis first_b = 0; 736*4703203dSis } else if (lo < UCONV_U8_BYTE_MIN || 737*4703203dSis lo > UCONV_U8_BYTE_MAX) { 738*4703203dSis return (EILSEQ); 739*4703203dSis } 740*4703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) | 741*4703203dSis (lo & UCONV_U8_BIT_MASK); 742*4703203dSis } 743*4703203dSis } 744*4703203dSis 745*4703203dSis if (hi >= UCONV_U16_START) { 746*4703203dSis lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 747*4703203dSis UCONV_U16_LO_MIN; 748*4703203dSis hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 749*4703203dSis UCONV_U16_HI_MIN; 750*4703203dSis 751*4703203dSis if ((u16l + 1) >= *utf16len) 752*4703203dSis return (E2BIG); 753*4703203dSis 754*4703203dSis if (outendian) { 755*4703203dSis u16s[u16l++] = (uint16_t)hi; 756*4703203dSis u16s[u16l++] = (uint16_t)lo; 757*4703203dSis } else { 758*4703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 759*4703203dSis u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 760*4703203dSis } 761*4703203dSis } else { 762*4703203dSis if (u16l >= *utf16len) 763*4703203dSis return (E2BIG); 764*4703203dSis 765*4703203dSis u16s[u16l++] = (outendian) ? (uint16_t)hi : 766*4703203dSis BSWAP_16(((uint16_t)hi)); 767*4703203dSis } 768*4703203dSis } 769*4703203dSis 770*4703203dSis *utf16len = u16l; 771*4703203dSis *utf8len = u8l; 772*4703203dSis 773*4703203dSis return (0); 774*4703203dSis } 775*4703203dSis 776*4703203dSis int 777*4703203dSis uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, 778*4703203dSis uint32_t *u32s, size_t *utf32len, int flag) 779*4703203dSis { 780*4703203dSis int inendian; 781*4703203dSis int outendian; 782*4703203dSis size_t u32l; 783*4703203dSis size_t u8l; 784*4703203dSis uint32_t hi; 785*4703203dSis uint32_t c; 786*4703203dSis int remaining_bytes; 787*4703203dSis int first_b; 788*4703203dSis boolean_t do_not_ignore_null; 789*4703203dSis 790*4703203dSis if (u8s == NULL || utf8len == NULL) 791*4703203dSis return (EILSEQ); 792*4703203dSis 793*4703203dSis if (u32s == NULL || utf32len == NULL) 794*4703203dSis return (E2BIG); 795*4703203dSis 796*4703203dSis if (check_endian(flag, &inendian, &outendian) != 0) 797*4703203dSis return (EBADF); 798*4703203dSis 799*4703203dSis u32l = u8l = 0; 800*4703203dSis do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 801*4703203dSis 802*4703203dSis outendian &= UCONV_OUT_NAT_ENDIAN; 803*4703203dSis 804*4703203dSis if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 805*4703203dSis u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 806*4703203dSis UCONV_BOM_SWAPPED_32; 807*4703203dSis 808*4703203dSis for (; u8l < *utf8len; ) { 809*4703203dSis if (u8s[u8l] == 0 && do_not_ignore_null) 810*4703203dSis break; 811*4703203dSis 812*4703203dSis hi = (uint32_t)u8s[u8l++]; 813*4703203dSis 814*4703203dSis if (hi > UCONV_ASCII_MAX) { 815*4703203dSis if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 816*4703203dSis return (EILSEQ); 817*4703203dSis 818*4703203dSis first_b = hi; 819*4703203dSis hi = hi & masks_tbl[remaining_bytes]; 820*4703203dSis 821*4703203dSis for (; remaining_bytes > 0; remaining_bytes--) { 822*4703203dSis if (u8l >= *utf8len) 823*4703203dSis return (EINVAL); 824*4703203dSis 825*4703203dSis c = (uint32_t)u8s[u8l++]; 826*4703203dSis 827*4703203dSis if (first_b) { 828*4703203dSis if (c < valid_min_2nd_byte[first_b] || 829*4703203dSis c > valid_max_2nd_byte[first_b]) 830*4703203dSis return (EILSEQ); 831*4703203dSis first_b = 0; 832*4703203dSis } else if (c < UCONV_U8_BYTE_MIN || 833*4703203dSis c > UCONV_U8_BYTE_MAX) { 834*4703203dSis return (EILSEQ); 835*4703203dSis } 836*4703203dSis hi = (hi << UCONV_U8_BIT_SHIFT) | 837*4703203dSis (c & UCONV_U8_BIT_MASK); 838*4703203dSis } 839*4703203dSis } 840*4703203dSis 841*4703203dSis if (u32l >= *utf32len) 842*4703203dSis return (E2BIG); 843*4703203dSis 844*4703203dSis u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); 845*4703203dSis } 846*4703203dSis 847*4703203dSis *utf32len = u32l; 848*4703203dSis *utf8len = u8l; 849*4703203dSis 850*4703203dSis return (0); 851*4703203dSis } 852