1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. 30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) 31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), 32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also 33 * the section 3C man pages. 34 * Interface stability: Committed 35 */ 36 37 #include <sys/types.h> 38 #ifdef _KERNEL 39 #include <sys/param.h> 40 #include <sys/sysmacros.h> 41 #include <sys/systm.h> 42 #include <sys/debug.h> 43 #include <sys/kmem.h> 44 #include <sys/sunddi.h> 45 #else 46 #include <sys/u8_textprep.h> 47 #endif /* _KERNEL */ 48 #include <sys/byteorder.h> 49 #include <sys/errno.h> 50 51 52 /* 53 * The max and min values of high and low surrogate pairs of UTF-16, 54 * UTF-16 bit shift value, bit mask, and starting value outside of BMP. 55 */ 56 #define UCONV_U16_HI_MIN (0xd800U) 57 #define UCONV_U16_HI_MAX (0xdbffU) 58 #define UCONV_U16_LO_MIN (0xdc00U) 59 #define UCONV_U16_LO_MAX (0xdfffU) 60 #define UCONV_U16_BIT_SHIFT (0x0400U) 61 #define UCONV_U16_BIT_MASK (0x0fffffU) 62 #define UCONV_U16_START (0x010000U) 63 64 /* The maximum value of Unicode coding space and ASCII coding space. */ 65 #define UCONV_UNICODE_MAX (0x10ffffU) 66 #define UCONV_ASCII_MAX (0x7fU) 67 68 /* The mask values for input and output endians. */ 69 #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) 70 #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) 71 72 /* Native and reversed endian macros. */ 73 #ifdef _BIG_ENDIAN 74 #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN 75 #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN 76 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN 77 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN 78 #else 79 #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN 80 #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN 81 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN 82 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN 83 #endif /* _BIG_ENDIAN */ 84 85 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ 86 #define UCONV_BOM_NORMAL (0xfeffU) 87 #define UCONV_BOM_SWAPPED (0xfffeU) 88 #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) 89 90 /* UTF-32 boundaries based on UTF-8 character byte lengths. */ 91 #define UCONV_U8_ONE_BYTE (0x7fU) 92 #define UCONV_U8_TWO_BYTES (0x7ffU) 93 #define UCONV_U8_THREE_BYTES (0xffffU) 94 #define UCONV_U8_FOUR_BYTES (0x10ffffU) 95 96 /* The common minimum and maximum values at the UTF-8 character bytes. */ 97 #define UCONV_U8_BYTE_MIN (0x80U) 98 #define UCONV_U8_BYTE_MAX (0xbfU) 99 100 /* 101 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of 102 * UTF-8 character bytes. 103 */ 104 #define UCONV_U8_BIT_SHIFT 6 105 #define UCONV_U8_BIT_MASK 0x3f 106 107 /* 108 * The following vector shows remaining bytes in a UTF-8 character. 109 * Index will be the first byte of the character. 110 */ 111 static const uchar_t remaining_bytes_tbl[0x100] = { 112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 125 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 126 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 127 128 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 130 131 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 132 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 133 134 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 135 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 136 }; 137 138 /* 139 * The following is a vector of bit-masks to get used bits in 140 * the first byte of a UTF-8 character. Index is remaining bytes at above of 141 * the character. 142 */ 143 static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 144 145 /* 146 * The following two vectors are to provide valid minimum and 147 * maximum values for the 2'nd byte of a multibyte UTF-8 character for 148 * better illegal sequence checking. The index value must be the value of 149 * the first byte of the UTF-8 character. 150 */ 151 static const uchar_t valid_min_2nd_byte[0x100] = { 152 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0, 154 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0, 156 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0, 158 0, 0, 0, 0, 0, 0, 0, 0, 159 0, 0, 0, 0, 0, 0, 0, 0, 160 0, 0, 0, 0, 0, 0, 0, 0, 161 0, 0, 0, 0, 0, 0, 0, 0, 162 0, 0, 0, 0, 0, 0, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164 0, 0, 0, 0, 0, 0, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 0, 166 0, 0, 0, 0, 0, 0, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 177 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 178 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 179 180 /* C8 C9 CA CB CC CD CE CF */ 181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 182 183 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 184 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 185 186 /* D8 D9 DA DB DC DD DE DF */ 187 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 188 189 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 190 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 191 192 /* E8 E9 EA EB EC ED EE EF */ 193 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 194 195 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 196 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 197 198 0, 0, 0, 0, 0, 0, 0, 0 199 }; 200 201 static const uchar_t valid_max_2nd_byte[0x100] = { 202 0, 0, 0, 0, 0, 0, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204 0, 0, 0, 0, 0, 0, 0, 0, 205 0, 0, 0, 0, 0, 0, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 227 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 228 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 229 230 /* C8 C9 CA CB CC CD CE CF */ 231 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 232 233 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 234 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 235 236 /* D8 D9 DA DB DC DD DE DF */ 237 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 238 239 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 240 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 241 242 /* E8 E9 EA EB EC ED EE EF */ 243 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 244 245 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 246 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 247 248 0, 0, 0, 0, 0, 0, 0, 0 249 }; 250 251 252 static int 253 check_endian(int flag, int *in, int *out) 254 { 255 *in = flag & UCONV_IN_ENDIAN_MASKS; 256 257 /* You cannot have both. */ 258 if (*in == UCONV_IN_ENDIAN_MASKS) 259 return (EBADF); 260 261 if (*in == 0) 262 *in = UCONV_IN_NAT_ENDIAN; 263 264 *out = flag & UCONV_OUT_ENDIAN_MASKS; 265 266 /* You cannot have both. */ 267 if (*out == UCONV_OUT_ENDIAN_MASKS) 268 return (EBADF); 269 270 if (*out == 0) 271 *out = UCONV_OUT_NAT_ENDIAN; 272 273 return (0); 274 } 275 276 static boolean_t 277 check_bom16(const uint16_t *u16s, size_t u16l, int *in) 278 { 279 if (u16l > 0) { 280 if (*u16s == UCONV_BOM_NORMAL) { 281 *in = UCONV_IN_NAT_ENDIAN; 282 return (B_TRUE); 283 } 284 if (*u16s == UCONV_BOM_SWAPPED) { 285 *in = UCONV_IN_REV_ENDIAN; 286 return (B_TRUE); 287 } 288 } 289 290 return (B_FALSE); 291 } 292 293 static boolean_t 294 check_bom32(const uint32_t *u32s, size_t u32l, int *in) 295 { 296 if (u32l > 0) { 297 if (*u32s == UCONV_BOM_NORMAL) { 298 *in = UCONV_IN_NAT_ENDIAN; 299 return (B_TRUE); 300 } 301 if (*u32s == UCONV_BOM_SWAPPED_32) { 302 *in = UCONV_IN_REV_ENDIAN; 303 return (B_TRUE); 304 } 305 } 306 307 return (B_FALSE); 308 } 309 310 int 311 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, 312 uint32_t *u32s, size_t *utf32len, int flag) 313 { 314 int inendian; 315 int outendian; 316 size_t u16l; 317 size_t u32l; 318 uint32_t hi; 319 uint32_t lo; 320 boolean_t do_not_ignore_null; 321 322 /* 323 * Do preliminary validity checks on parameters and collect info on 324 * endians. 325 */ 326 if (u16s == NULL || utf16len == NULL) 327 return (EILSEQ); 328 329 if (u32s == NULL || utf32len == NULL) 330 return (E2BIG); 331 332 if (check_endian(flag, &inendian, &outendian) != 0) 333 return (EBADF); 334 335 /* 336 * Initialize input and output parameter buffer indices and 337 * temporary variables. 338 */ 339 u16l = u32l = 0; 340 hi = 0; 341 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 342 343 /* 344 * Check on the BOM at the beginning of the input buffer if required 345 * and if there is indeed one, process it. 346 */ 347 if ((flag & UCONV_IN_ACCEPT_BOM) && 348 check_bom16(u16s, *utf16len, &inendian)) 349 u16l++; 350 351 /* 352 * Reset inendian and outendian so that after this point, those can be 353 * used as condition values. 354 */ 355 inendian &= UCONV_IN_NAT_ENDIAN; 356 outendian &= UCONV_OUT_NAT_ENDIAN; 357 358 /* 359 * If there is something in the input buffer and if necessary and 360 * requested, save the BOM at the output buffer. 361 */ 362 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 363 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 364 UCONV_BOM_SWAPPED_32; 365 366 /* 367 * Do conversion; if encounter a surrogate pair, assemble high and 368 * low pair values to form a UTF-32 character. If a half of a pair 369 * exists alone, then, either it is an illegal (EILSEQ) or 370 * invalid (EINVAL) value. 371 */ 372 for (; u16l < *utf16len; u16l++) { 373 if (u16s[u16l] == 0 && do_not_ignore_null) 374 break; 375 376 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 377 378 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 379 if (hi) 380 return (EILSEQ); 381 hi = lo; 382 continue; 383 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 384 if (! hi) 385 return (EILSEQ); 386 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 387 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 388 + UCONV_U16_START; 389 hi = 0; 390 } else if (hi) { 391 return (EILSEQ); 392 } 393 394 if (u32l >= *utf32len) 395 return (E2BIG); 396 397 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); 398 } 399 400 /* 401 * If high half didn't see low half, then, it's most likely the input 402 * parameter is incomplete. 403 */ 404 if (hi) 405 return (EINVAL); 406 407 /* 408 * Save the number of consumed and saved characters. They do not 409 * include terminating NULL character (U+0000) at the end of 410 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and 411 * the input buffer length is big enough to include the terminating 412 * NULL character). 413 */ 414 *utf16len = u16l; 415 *utf32len = u32l; 416 417 return (0); 418 } 419 420 int 421 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, 422 uchar_t *u8s, size_t *utf8len, int flag) 423 { 424 int inendian; 425 int outendian; 426 size_t u16l; 427 size_t u8l; 428 uint32_t hi; 429 uint32_t lo; 430 boolean_t do_not_ignore_null; 431 432 if (u16s == NULL || utf16len == NULL) 433 return (EILSEQ); 434 435 if (u8s == NULL || utf8len == NULL) 436 return (E2BIG); 437 438 if (check_endian(flag, &inendian, &outendian) != 0) 439 return (EBADF); 440 441 u16l = u8l = 0; 442 hi = 0; 443 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 444 445 if ((flag & UCONV_IN_ACCEPT_BOM) && 446 check_bom16(u16s, *utf16len, &inendian)) 447 u16l++; 448 449 inendian &= UCONV_IN_NAT_ENDIAN; 450 451 for (; u16l < *utf16len; u16l++) { 452 if (u16s[u16l] == 0 && do_not_ignore_null) 453 break; 454 455 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 456 457 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 458 if (hi) 459 return (EILSEQ); 460 hi = lo; 461 continue; 462 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 463 if (! hi) 464 return (EILSEQ); 465 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 466 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 467 + UCONV_U16_START; 468 hi = 0; 469 } else if (hi) { 470 return (EILSEQ); 471 } 472 473 /* 474 * Now we convert a UTF-32 character into a UTF-8 character. 475 * Unicode coding space is between U+0000 and U+10FFFF; 476 * anything bigger is an illegal character. 477 */ 478 if (lo <= UCONV_U8_ONE_BYTE) { 479 if (u8l >= *utf8len) 480 return (E2BIG); 481 u8s[u8l++] = (uchar_t)lo; 482 } else if (lo <= UCONV_U8_TWO_BYTES) { 483 if ((u8l + 1) >= *utf8len) 484 return (E2BIG); 485 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 486 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 487 } else if (lo <= UCONV_U8_THREE_BYTES) { 488 if ((u8l + 2) >= *utf8len) 489 return (E2BIG); 490 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 491 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 492 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 493 } else if (lo <= UCONV_U8_FOUR_BYTES) { 494 if ((u8l + 3) >= *utf8len) 495 return (E2BIG); 496 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 497 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 498 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 499 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 500 } else { 501 return (EILSEQ); 502 } 503 } 504 505 if (hi) 506 return (EINVAL); 507 508 *utf16len = u16l; 509 *utf8len = u8l; 510 511 return (0); 512 } 513 514 int 515 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, 516 uint16_t *u16s, size_t *utf16len, int flag) 517 { 518 int inendian; 519 int outendian; 520 size_t u16l; 521 size_t u32l; 522 uint32_t hi; 523 uint32_t lo; 524 boolean_t do_not_ignore_null; 525 526 if (u32s == NULL || utf32len == NULL) 527 return (EILSEQ); 528 529 if (u16s == NULL || utf16len == NULL) 530 return (E2BIG); 531 532 if (check_endian(flag, &inendian, &outendian) != 0) 533 return (EBADF); 534 535 u16l = u32l = 0; 536 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 537 538 if ((flag & UCONV_IN_ACCEPT_BOM) && 539 check_bom32(u32s, *utf32len, &inendian)) 540 u32l++; 541 542 inendian &= UCONV_IN_NAT_ENDIAN; 543 outendian &= UCONV_OUT_NAT_ENDIAN; 544 545 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 546 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 547 UCONV_BOM_SWAPPED; 548 549 for (; u32l < *utf32len; u32l++) { 550 if (u32s[u32l] == 0 && do_not_ignore_null) 551 break; 552 553 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 554 555 /* 556 * Anything bigger than the Unicode coding space, i.e., 557 * Unicode scalar value bigger than U+10FFFF, is an illegal 558 * character. 559 */ 560 if (hi > UCONV_UNICODE_MAX) 561 return (EILSEQ); 562 563 /* 564 * Anything bigger than U+FFFF must be converted into 565 * a surrogate pair in UTF-16. 566 */ 567 if (hi >= UCONV_U16_START) { 568 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 569 UCONV_U16_LO_MIN; 570 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 571 UCONV_U16_HI_MIN; 572 573 if ((u16l + 1) >= *utf16len) 574 return (E2BIG); 575 576 if (outendian) { 577 u16s[u16l++] = (uint16_t)hi; 578 u16s[u16l++] = (uint16_t)lo; 579 } else { 580 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 581 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 582 } 583 } else { 584 if (u16l >= *utf16len) 585 return (E2BIG); 586 u16s[u16l++] = (outendian) ? (uint16_t)hi : 587 BSWAP_16(((uint16_t)hi)); 588 } 589 } 590 591 *utf16len = u16l; 592 *utf32len = u32l; 593 594 return (0); 595 } 596 597 int 598 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, 599 uchar_t *u8s, size_t *utf8len, int flag) 600 { 601 int inendian; 602 int outendian; 603 size_t u32l; 604 size_t u8l; 605 uint32_t lo; 606 boolean_t do_not_ignore_null; 607 608 if (u32s == NULL || utf32len == NULL) 609 return (EILSEQ); 610 611 if (u8s == NULL || utf8len == NULL) 612 return (E2BIG); 613 614 if (check_endian(flag, &inendian, &outendian) != 0) 615 return (EBADF); 616 617 u32l = u8l = 0; 618 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 619 620 if ((flag & UCONV_IN_ACCEPT_BOM) && 621 check_bom32(u32s, *utf32len, &inendian)) 622 u32l++; 623 624 inendian &= UCONV_IN_NAT_ENDIAN; 625 626 for (; u32l < *utf32len; u32l++) { 627 if (u32s[u32l] == 0 && do_not_ignore_null) 628 break; 629 630 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 631 632 if (lo <= UCONV_U8_ONE_BYTE) { 633 if (u8l >= *utf8len) 634 return (E2BIG); 635 u8s[u8l++] = (uchar_t)lo; 636 } else if (lo <= UCONV_U8_TWO_BYTES) { 637 if ((u8l + 1) >= *utf8len) 638 return (E2BIG); 639 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 640 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 641 } else if (lo <= UCONV_U8_THREE_BYTES) { 642 if ((u8l + 2) >= *utf8len) 643 return (E2BIG); 644 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 645 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 646 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 647 } else if (lo <= UCONV_U8_FOUR_BYTES) { 648 if ((u8l + 3) >= *utf8len) 649 return (E2BIG); 650 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 651 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 652 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 653 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 654 } else { 655 return (EILSEQ); 656 } 657 } 658 659 *utf32len = u32l; 660 *utf8len = u8l; 661 662 return (0); 663 } 664 665 int 666 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, 667 uint16_t *u16s, size_t *utf16len, int flag) 668 { 669 int inendian; 670 int outendian; 671 size_t u16l; 672 size_t u8l; 673 uint32_t hi; 674 uint32_t lo; 675 int remaining_bytes; 676 int first_b; 677 boolean_t do_not_ignore_null; 678 679 if (u8s == NULL || utf8len == NULL) 680 return (EILSEQ); 681 682 if (u16s == NULL || utf16len == NULL) 683 return (E2BIG); 684 685 if (check_endian(flag, &inendian, &outendian) != 0) 686 return (EBADF); 687 688 u16l = u8l = 0; 689 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 690 691 outendian &= UCONV_OUT_NAT_ENDIAN; 692 693 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 694 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 695 UCONV_BOM_SWAPPED; 696 697 for (; u8l < *utf8len; ) { 698 if (u8s[u8l] == 0 && do_not_ignore_null) 699 break; 700 701 /* 702 * Collect a UTF-8 character and convert it to a UTF-32 703 * character. In doing so, we screen out illegally formed 704 * UTF-8 characters and treat such as illegal characters. 705 * The algorithm at below also screens out anything bigger 706 * than the U+10FFFF. 707 * 708 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for 709 * more details on the illegal values of UTF-8 character 710 * bytes. 711 */ 712 hi = (uint32_t)u8s[u8l++]; 713 714 if (hi > UCONV_ASCII_MAX) { 715 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 716 return (EILSEQ); 717 718 first_b = hi; 719 hi = hi & masks_tbl[remaining_bytes]; 720 721 for (; remaining_bytes > 0; remaining_bytes--) { 722 /* 723 * If we have no more bytes, the current 724 * UTF-8 character is incomplete. 725 */ 726 if (u8l >= *utf8len) 727 return (EINVAL); 728 729 lo = (uint32_t)u8s[u8l++]; 730 731 if (first_b) { 732 if (lo < valid_min_2nd_byte[first_b] || 733 lo > valid_max_2nd_byte[first_b]) 734 return (EILSEQ); 735 first_b = 0; 736 } else if (lo < UCONV_U8_BYTE_MIN || 737 lo > UCONV_U8_BYTE_MAX) { 738 return (EILSEQ); 739 } 740 hi = (hi << UCONV_U8_BIT_SHIFT) | 741 (lo & UCONV_U8_BIT_MASK); 742 } 743 } 744 745 if (hi >= UCONV_U16_START) { 746 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 747 UCONV_U16_LO_MIN; 748 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 749 UCONV_U16_HI_MIN; 750 751 if ((u16l + 1) >= *utf16len) 752 return (E2BIG); 753 754 if (outendian) { 755 u16s[u16l++] = (uint16_t)hi; 756 u16s[u16l++] = (uint16_t)lo; 757 } else { 758 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 759 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 760 } 761 } else { 762 if (u16l >= *utf16len) 763 return (E2BIG); 764 765 u16s[u16l++] = (outendian) ? (uint16_t)hi : 766 BSWAP_16(((uint16_t)hi)); 767 } 768 } 769 770 *utf16len = u16l; 771 *utf8len = u8l; 772 773 return (0); 774 } 775 776 int 777 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, 778 uint32_t *u32s, size_t *utf32len, int flag) 779 { 780 int inendian; 781 int outendian; 782 size_t u32l; 783 size_t u8l; 784 uint32_t hi; 785 uint32_t c; 786 int remaining_bytes; 787 int first_b; 788 boolean_t do_not_ignore_null; 789 790 if (u8s == NULL || utf8len == NULL) 791 return (EILSEQ); 792 793 if (u32s == NULL || utf32len == NULL) 794 return (E2BIG); 795 796 if (check_endian(flag, &inendian, &outendian) != 0) 797 return (EBADF); 798 799 u32l = u8l = 0; 800 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 801 802 outendian &= UCONV_OUT_NAT_ENDIAN; 803 804 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 805 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 806 UCONV_BOM_SWAPPED_32; 807 808 for (; u8l < *utf8len; ) { 809 if (u8s[u8l] == 0 && do_not_ignore_null) 810 break; 811 812 hi = (uint32_t)u8s[u8l++]; 813 814 if (hi > UCONV_ASCII_MAX) { 815 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 816 return (EILSEQ); 817 818 first_b = hi; 819 hi = hi & masks_tbl[remaining_bytes]; 820 821 for (; remaining_bytes > 0; remaining_bytes--) { 822 if (u8l >= *utf8len) 823 return (EINVAL); 824 825 c = (uint32_t)u8s[u8l++]; 826 827 if (first_b) { 828 if (c < valid_min_2nd_byte[first_b] || 829 c > valid_max_2nd_byte[first_b]) 830 return (EILSEQ); 831 first_b = 0; 832 } else if (c < UCONV_U8_BYTE_MIN || 833 c > UCONV_U8_BYTE_MAX) { 834 return (EILSEQ); 835 } 836 hi = (hi << UCONV_U8_BIT_SHIFT) | 837 (c & UCONV_U8_BIT_MASK); 838 } 839 } 840 841 if (u32l >= *utf32len) 842 return (E2BIG); 843 844 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); 845 } 846 847 *utf32len = u32l; 848 *utf8len = u8l; 849 850 return (0); 851 } 852