1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. 30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) 31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), 32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also 33 * the section 3C man pages. 34 * Interface stability: Committed 35 */ 36 37 #include <sys/types.h> 38 #ifdef _KERNEL 39 #include <sys/param.h> 40 #include <sys/sysmacros.h> 41 #include <sys/systm.h> 42 #include <sys/debug.h> 43 #include <sys/kmem.h> 44 #include <sys/sunddi.h> 45 #else 46 #include <sys/u8_textprep.h> 47 #endif /* _KERNEL */ 48 #include <sys/byteorder.h> 49 #include <sys/errno.h> 50 51 52 /* 53 * The max and min values of high and low surrogate pairs of UTF-16, 54 * UTF-16 bit shift value, bit mask, and starting value outside of BMP. 55 */ 56 #define UCONV_U16_HI_MIN (0xd800U) 57 #define UCONV_U16_HI_MAX (0xdbffU) 58 #define UCONV_U16_LO_MIN (0xdc00U) 59 #define UCONV_U16_LO_MAX (0xdfffU) 60 #define UCONV_U16_BIT_SHIFT (0x0400U) 61 #define UCONV_U16_BIT_MASK (0x0fffffU) 62 #define UCONV_U16_START (0x010000U) 63 64 /* The maximum value of Unicode coding space and ASCII coding space. */ 65 #define UCONV_UNICODE_MAX (0x10ffffU) 66 #define UCONV_ASCII_MAX (0x7fU) 67 68 /* The mask values for input and output endians. */ 69 #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) 70 #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) 71 72 /* Native and reversed endian macros. */ 73 #ifdef _BIG_ENDIAN 74 #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN 75 #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN 76 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN 77 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN 78 #else 79 #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN 80 #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN 81 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN 82 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN 83 #endif /* _BIG_ENDIAN */ 84 85 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ 86 #define UCONV_BOM_NORMAL (0xfeffU) 87 #define UCONV_BOM_SWAPPED (0xfffeU) 88 #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) 89 90 /* UTF-32 boundaries based on UTF-8 character byte lengths. */ 91 #define UCONV_U8_ONE_BYTE (0x7fU) 92 #define UCONV_U8_TWO_BYTES (0x7ffU) 93 #define UCONV_U8_THREE_BYTES (0xffffU) 94 #define UCONV_U8_FOUR_BYTES (0x10ffffU) 95 96 /* The common minimum and maximum values at the UTF-8 character bytes. */ 97 #define UCONV_U8_BYTE_MIN (0x80U) 98 #define UCONV_U8_BYTE_MAX (0xbfU) 99 100 /* 101 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of 102 * UTF-8 character bytes. 103 */ 104 #define UCONV_U8_BIT_SHIFT 6 105 #define UCONV_U8_BIT_MASK 0x3f 106 107 /* 108 * The following vector shows remaining bytes in a UTF-8 character. 109 * Index will be the first byte of the character. 110 */ 111 static const uchar_t remaining_bytes_tbl[0x100] = { 112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 125 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 126 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 127 128 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 130 131 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 132 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 133 134 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 135 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 136 }; 137 138 /* 139 * The following is a vector of bit-masks to get used bits in 140 * the first byte of a UTF-8 character. Index is remaining bytes at above of 141 * the character. 142 */ 143 #ifdef _KERNEL 144 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 145 #else 146 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 147 #endif /* _KERNEL */ 148 149 /* 150 * The following two vectors are to provide valid minimum and 151 * maximum values for the 2'nd byte of a multibyte UTF-8 character for 152 * better illegal sequence checking. The index value must be the value of 153 * the first byte of the UTF-8 character. 154 */ 155 static const uchar_t valid_min_2nd_byte[0x100] = { 156 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0, 158 0, 0, 0, 0, 0, 0, 0, 0, 159 0, 0, 0, 0, 0, 0, 0, 0, 160 0, 0, 0, 0, 0, 0, 0, 0, 161 0, 0, 0, 0, 0, 0, 0, 0, 162 0, 0, 0, 0, 0, 0, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164 0, 0, 0, 0, 0, 0, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 0, 166 0, 0, 0, 0, 0, 0, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 0, 0, 0, 0, 0, 0, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 180 181 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 182 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 183 184 /* C8 C9 CA CB CC CD CE CF */ 185 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 186 187 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 188 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 189 190 /* D8 D9 DA DB DC DD DE DF */ 191 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 192 193 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 194 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 195 196 /* E8 E9 EA EB EC ED EE EF */ 197 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 198 199 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 200 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 201 202 0, 0, 0, 0, 0, 0, 0, 0 203 }; 204 205 static const uchar_t valid_max_2nd_byte[0x100] = { 206 0, 0, 0, 0, 0, 0, 0, 0, 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 231 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 232 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 233 234 /* C8 C9 CA CB CC CD CE CF */ 235 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 236 237 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 238 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 239 240 /* D8 D9 DA DB DC DD DE DF */ 241 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 242 243 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 244 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 245 246 /* E8 E9 EA EB EC ED EE EF */ 247 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 248 249 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 250 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 251 252 0, 0, 0, 0, 0, 0, 0, 0 253 }; 254 255 256 static int 257 check_endian(int flag, int *in, int *out) 258 { 259 *in = flag & UCONV_IN_ENDIAN_MASKS; 260 261 /* You cannot have both. */ 262 if (*in == UCONV_IN_ENDIAN_MASKS) 263 return (EBADF); 264 265 if (*in == 0) 266 *in = UCONV_IN_NAT_ENDIAN; 267 268 *out = flag & UCONV_OUT_ENDIAN_MASKS; 269 270 /* You cannot have both. */ 271 if (*out == UCONV_OUT_ENDIAN_MASKS) 272 return (EBADF); 273 274 if (*out == 0) 275 *out = UCONV_OUT_NAT_ENDIAN; 276 277 return (0); 278 } 279 280 static boolean_t 281 check_bom16(const uint16_t *u16s, size_t u16l, int *in) 282 { 283 if (u16l > 0) { 284 if (*u16s == UCONV_BOM_NORMAL) { 285 *in = UCONV_IN_NAT_ENDIAN; 286 return (B_TRUE); 287 } 288 if (*u16s == UCONV_BOM_SWAPPED) { 289 *in = UCONV_IN_REV_ENDIAN; 290 return (B_TRUE); 291 } 292 } 293 294 return (B_FALSE); 295 } 296 297 static boolean_t 298 check_bom32(const uint32_t *u32s, size_t u32l, int *in) 299 { 300 if (u32l > 0) { 301 if (*u32s == UCONV_BOM_NORMAL) { 302 *in = UCONV_IN_NAT_ENDIAN; 303 return (B_TRUE); 304 } 305 if (*u32s == UCONV_BOM_SWAPPED_32) { 306 *in = UCONV_IN_REV_ENDIAN; 307 return (B_TRUE); 308 } 309 } 310 311 return (B_FALSE); 312 } 313 314 int 315 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, 316 uint32_t *u32s, size_t *utf32len, int flag) 317 { 318 int inendian; 319 int outendian; 320 size_t u16l; 321 size_t u32l; 322 uint32_t hi; 323 uint32_t lo; 324 boolean_t do_not_ignore_null; 325 326 /* 327 * Do preliminary validity checks on parameters and collect info on 328 * endians. 329 */ 330 if (u16s == NULL || utf16len == NULL) 331 return (EILSEQ); 332 333 if (u32s == NULL || utf32len == NULL) 334 return (E2BIG); 335 336 if (check_endian(flag, &inendian, &outendian) != 0) 337 return (EBADF); 338 339 /* 340 * Initialize input and output parameter buffer indices and 341 * temporary variables. 342 */ 343 u16l = u32l = 0; 344 hi = 0; 345 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 346 347 /* 348 * Check on the BOM at the beginning of the input buffer if required 349 * and if there is indeed one, process it. 350 */ 351 if ((flag & UCONV_IN_ACCEPT_BOM) && 352 check_bom16(u16s, *utf16len, &inendian)) 353 u16l++; 354 355 /* 356 * Reset inendian and outendian so that after this point, those can be 357 * used as condition values. 358 */ 359 inendian &= UCONV_IN_NAT_ENDIAN; 360 outendian &= UCONV_OUT_NAT_ENDIAN; 361 362 /* 363 * If there is something in the input buffer and if necessary and 364 * requested, save the BOM at the output buffer. 365 */ 366 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 367 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 368 UCONV_BOM_SWAPPED_32; 369 370 /* 371 * Do conversion; if encounter a surrogate pair, assemble high and 372 * low pair values to form a UTF-32 character. If a half of a pair 373 * exists alone, then, either it is an illegal (EILSEQ) or 374 * invalid (EINVAL) value. 375 */ 376 for (; u16l < *utf16len; u16l++) { 377 if (u16s[u16l] == 0 && do_not_ignore_null) 378 break; 379 380 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 381 382 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 383 if (hi) 384 return (EILSEQ); 385 hi = lo; 386 continue; 387 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 388 if (! hi) 389 return (EILSEQ); 390 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 391 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 392 + UCONV_U16_START; 393 hi = 0; 394 } else if (hi) { 395 return (EILSEQ); 396 } 397 398 if (u32l >= *utf32len) 399 return (E2BIG); 400 401 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); 402 } 403 404 /* 405 * If high half didn't see low half, then, it's most likely the input 406 * parameter is incomplete. 407 */ 408 if (hi) 409 return (EINVAL); 410 411 /* 412 * Save the number of consumed and saved characters. They do not 413 * include terminating NULL character (U+0000) at the end of 414 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and 415 * the input buffer length is big enough to include the terminating 416 * NULL character). 417 */ 418 *utf16len = u16l; 419 *utf32len = u32l; 420 421 return (0); 422 } 423 424 int 425 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, 426 uchar_t *u8s, size_t *utf8len, int flag) 427 { 428 int inendian; 429 int outendian; 430 size_t u16l; 431 size_t u8l; 432 uint32_t hi; 433 uint32_t lo; 434 boolean_t do_not_ignore_null; 435 436 if (u16s == NULL || utf16len == NULL) 437 return (EILSEQ); 438 439 if (u8s == NULL || utf8len == NULL) 440 return (E2BIG); 441 442 if (check_endian(flag, &inendian, &outendian) != 0) 443 return (EBADF); 444 445 u16l = u8l = 0; 446 hi = 0; 447 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 448 449 if ((flag & UCONV_IN_ACCEPT_BOM) && 450 check_bom16(u16s, *utf16len, &inendian)) 451 u16l++; 452 453 inendian &= UCONV_IN_NAT_ENDIAN; 454 455 for (; u16l < *utf16len; u16l++) { 456 if (u16s[u16l] == 0 && do_not_ignore_null) 457 break; 458 459 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 460 461 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 462 if (hi) 463 return (EILSEQ); 464 hi = lo; 465 continue; 466 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 467 if (! hi) 468 return (EILSEQ); 469 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 470 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 471 + UCONV_U16_START; 472 hi = 0; 473 } else if (hi) { 474 return (EILSEQ); 475 } 476 477 /* 478 * Now we convert a UTF-32 character into a UTF-8 character. 479 * Unicode coding space is between U+0000 and U+10FFFF; 480 * anything bigger is an illegal character. 481 */ 482 if (lo <= UCONV_U8_ONE_BYTE) { 483 if (u8l >= *utf8len) 484 return (E2BIG); 485 u8s[u8l++] = (uchar_t)lo; 486 } else if (lo <= UCONV_U8_TWO_BYTES) { 487 if ((u8l + 1) >= *utf8len) 488 return (E2BIG); 489 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 490 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 491 } else if (lo <= UCONV_U8_THREE_BYTES) { 492 if ((u8l + 2) >= *utf8len) 493 return (E2BIG); 494 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 495 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 496 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 497 } else if (lo <= UCONV_U8_FOUR_BYTES) { 498 if ((u8l + 3) >= *utf8len) 499 return (E2BIG); 500 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 501 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 502 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 503 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 504 } else { 505 return (EILSEQ); 506 } 507 } 508 509 if (hi) 510 return (EINVAL); 511 512 *utf16len = u16l; 513 *utf8len = u8l; 514 515 return (0); 516 } 517 518 int 519 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, 520 uint16_t *u16s, size_t *utf16len, int flag) 521 { 522 int inendian; 523 int outendian; 524 size_t u16l; 525 size_t u32l; 526 uint32_t hi; 527 uint32_t lo; 528 boolean_t do_not_ignore_null; 529 530 if (u32s == NULL || utf32len == NULL) 531 return (EILSEQ); 532 533 if (u16s == NULL || utf16len == NULL) 534 return (E2BIG); 535 536 if (check_endian(flag, &inendian, &outendian) != 0) 537 return (EBADF); 538 539 u16l = u32l = 0; 540 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 541 542 if ((flag & UCONV_IN_ACCEPT_BOM) && 543 check_bom32(u32s, *utf32len, &inendian)) 544 u32l++; 545 546 inendian &= UCONV_IN_NAT_ENDIAN; 547 outendian &= UCONV_OUT_NAT_ENDIAN; 548 549 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 550 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 551 UCONV_BOM_SWAPPED; 552 553 for (; u32l < *utf32len; u32l++) { 554 if (u32s[u32l] == 0 && do_not_ignore_null) 555 break; 556 557 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 558 559 /* 560 * Anything bigger than the Unicode coding space, i.e., 561 * Unicode scalar value bigger than U+10FFFF, is an illegal 562 * character. 563 */ 564 if (hi > UCONV_UNICODE_MAX) 565 return (EILSEQ); 566 567 /* 568 * Anything bigger than U+FFFF must be converted into 569 * a surrogate pair in UTF-16. 570 */ 571 if (hi >= UCONV_U16_START) { 572 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 573 UCONV_U16_LO_MIN; 574 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 575 UCONV_U16_HI_MIN; 576 577 if ((u16l + 1) >= *utf16len) 578 return (E2BIG); 579 580 if (outendian) { 581 u16s[u16l++] = (uint16_t)hi; 582 u16s[u16l++] = (uint16_t)lo; 583 } else { 584 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 585 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 586 } 587 } else { 588 if (u16l >= *utf16len) 589 return (E2BIG); 590 u16s[u16l++] = (outendian) ? (uint16_t)hi : 591 BSWAP_16(((uint16_t)hi)); 592 } 593 } 594 595 *utf16len = u16l; 596 *utf32len = u32l; 597 598 return (0); 599 } 600 601 int 602 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, 603 uchar_t *u8s, size_t *utf8len, int flag) 604 { 605 int inendian; 606 int outendian; 607 size_t u32l; 608 size_t u8l; 609 uint32_t lo; 610 boolean_t do_not_ignore_null; 611 612 if (u32s == NULL || utf32len == NULL) 613 return (EILSEQ); 614 615 if (u8s == NULL || utf8len == NULL) 616 return (E2BIG); 617 618 if (check_endian(flag, &inendian, &outendian) != 0) 619 return (EBADF); 620 621 u32l = u8l = 0; 622 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 623 624 if ((flag & UCONV_IN_ACCEPT_BOM) && 625 check_bom32(u32s, *utf32len, &inendian)) 626 u32l++; 627 628 inendian &= UCONV_IN_NAT_ENDIAN; 629 630 for (; u32l < *utf32len; u32l++) { 631 if (u32s[u32l] == 0 && do_not_ignore_null) 632 break; 633 634 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 635 636 if (lo <= UCONV_U8_ONE_BYTE) { 637 if (u8l >= *utf8len) 638 return (E2BIG); 639 u8s[u8l++] = (uchar_t)lo; 640 } else if (lo <= UCONV_U8_TWO_BYTES) { 641 if ((u8l + 1) >= *utf8len) 642 return (E2BIG); 643 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 644 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 645 } else if (lo <= UCONV_U8_THREE_BYTES) { 646 if ((u8l + 2) >= *utf8len) 647 return (E2BIG); 648 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 649 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 650 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 651 } else if (lo <= UCONV_U8_FOUR_BYTES) { 652 if ((u8l + 3) >= *utf8len) 653 return (E2BIG); 654 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 655 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 656 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 657 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 658 } else { 659 return (EILSEQ); 660 } 661 } 662 663 *utf32len = u32l; 664 *utf8len = u8l; 665 666 return (0); 667 } 668 669 int 670 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, 671 uint16_t *u16s, size_t *utf16len, int flag) 672 { 673 int inendian; 674 int outendian; 675 size_t u16l; 676 size_t u8l; 677 uint32_t hi; 678 uint32_t lo; 679 int remaining_bytes; 680 int first_b; 681 boolean_t do_not_ignore_null; 682 683 if (u8s == NULL || utf8len == NULL) 684 return (EILSEQ); 685 686 if (u16s == NULL || utf16len == NULL) 687 return (E2BIG); 688 689 if (check_endian(flag, &inendian, &outendian) != 0) 690 return (EBADF); 691 692 u16l = u8l = 0; 693 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 694 695 outendian &= UCONV_OUT_NAT_ENDIAN; 696 697 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 698 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 699 UCONV_BOM_SWAPPED; 700 701 for (; u8l < *utf8len; ) { 702 if (u8s[u8l] == 0 && do_not_ignore_null) 703 break; 704 705 /* 706 * Collect a UTF-8 character and convert it to a UTF-32 707 * character. In doing so, we screen out illegally formed 708 * UTF-8 characters and treat such as illegal characters. 709 * The algorithm at below also screens out anything bigger 710 * than the U+10FFFF. 711 * 712 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for 713 * more details on the illegal values of UTF-8 character 714 * bytes. 715 */ 716 hi = (uint32_t)u8s[u8l++]; 717 718 if (hi > UCONV_ASCII_MAX) { 719 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 720 return (EILSEQ); 721 722 first_b = hi; 723 hi = hi & u8_masks_tbl[remaining_bytes]; 724 725 for (; remaining_bytes > 0; remaining_bytes--) { 726 /* 727 * If we have no more bytes, the current 728 * UTF-8 character is incomplete. 729 */ 730 if (u8l >= *utf8len) 731 return (EINVAL); 732 733 lo = (uint32_t)u8s[u8l++]; 734 735 if (first_b) { 736 if (lo < valid_min_2nd_byte[first_b] || 737 lo > valid_max_2nd_byte[first_b]) 738 return (EILSEQ); 739 first_b = 0; 740 } else if (lo < UCONV_U8_BYTE_MIN || 741 lo > UCONV_U8_BYTE_MAX) { 742 return (EILSEQ); 743 } 744 hi = (hi << UCONV_U8_BIT_SHIFT) | 745 (lo & UCONV_U8_BIT_MASK); 746 } 747 } 748 749 if (hi >= UCONV_U16_START) { 750 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 751 UCONV_U16_LO_MIN; 752 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 753 UCONV_U16_HI_MIN; 754 755 if ((u16l + 1) >= *utf16len) 756 return (E2BIG); 757 758 if (outendian) { 759 u16s[u16l++] = (uint16_t)hi; 760 u16s[u16l++] = (uint16_t)lo; 761 } else { 762 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 763 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 764 } 765 } else { 766 if (u16l >= *utf16len) 767 return (E2BIG); 768 769 u16s[u16l++] = (outendian) ? (uint16_t)hi : 770 BSWAP_16(((uint16_t)hi)); 771 } 772 } 773 774 *utf16len = u16l; 775 *utf8len = u8l; 776 777 return (0); 778 } 779 780 int 781 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, 782 uint32_t *u32s, size_t *utf32len, int flag) 783 { 784 int inendian; 785 int outendian; 786 size_t u32l; 787 size_t u8l; 788 uint32_t hi; 789 uint32_t c; 790 int remaining_bytes; 791 int first_b; 792 boolean_t do_not_ignore_null; 793 794 if (u8s == NULL || utf8len == NULL) 795 return (EILSEQ); 796 797 if (u32s == NULL || utf32len == NULL) 798 return (E2BIG); 799 800 if (check_endian(flag, &inendian, &outendian) != 0) 801 return (EBADF); 802 803 u32l = u8l = 0; 804 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 805 806 outendian &= UCONV_OUT_NAT_ENDIAN; 807 808 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 809 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 810 UCONV_BOM_SWAPPED_32; 811 812 for (; u8l < *utf8len; ) { 813 if (u8s[u8l] == 0 && do_not_ignore_null) 814 break; 815 816 hi = (uint32_t)u8s[u8l++]; 817 818 if (hi > UCONV_ASCII_MAX) { 819 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 820 return (EILSEQ); 821 822 first_b = hi; 823 hi = hi & u8_masks_tbl[remaining_bytes]; 824 825 for (; remaining_bytes > 0; remaining_bytes--) { 826 if (u8l >= *utf8len) 827 return (EINVAL); 828 829 c = (uint32_t)u8s[u8l++]; 830 831 if (first_b) { 832 if (c < valid_min_2nd_byte[first_b] || 833 c > valid_max_2nd_byte[first_b]) 834 return (EILSEQ); 835 first_b = 0; 836 } else if (c < UCONV_U8_BYTE_MIN || 837 c > UCONV_U8_BYTE_MAX) { 838 return (EILSEQ); 839 } 840 hi = (hi << UCONV_U8_BIT_SHIFT) | 841 (c & UCONV_U8_BIT_MASK); 842 } 843 } 844 845 if (u32l >= *utf32len) 846 return (E2BIG); 847 848 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); 849 } 850 851 *utf32len = u32l; 852 *utf8len = u8l; 853 854 return (0); 855 } 856