1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. 28 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) 29 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), 30 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also 31 * the section 3C man pages. 32 * Interface stability: Committed 33 */ 34 35 #include <sys/types.h> 36 #ifdef _KERNEL 37 #include <sys/param.h> 38 #include <sys/sysmacros.h> 39 #include <sys/systm.h> 40 #include <sys/debug.h> 41 #include <sys/kmem.h> 42 #include <sys/sunddi.h> 43 #else 44 #include <sys/u8_textprep.h> 45 #endif /* _KERNEL */ 46 #include <sys/byteorder.h> 47 #include <sys/errno.h> 48 49 50 /* 51 * The max and min values of high and low surrogate pairs of UTF-16, 52 * UTF-16 bit shift value, bit mask, and starting value outside of BMP. 53 */ 54 #define UCONV_U16_HI_MIN (0xd800U) 55 #define UCONV_U16_HI_MAX (0xdbffU) 56 #define UCONV_U16_LO_MIN (0xdc00U) 57 #define UCONV_U16_LO_MAX (0xdfffU) 58 #define UCONV_U16_BIT_SHIFT (0x0400U) 59 #define UCONV_U16_BIT_MASK (0x0fffffU) 60 #define UCONV_U16_START (0x010000U) 61 62 /* The maximum value of Unicode coding space and ASCII coding space. */ 63 #define UCONV_UNICODE_MAX (0x10ffffU) 64 #define UCONV_ASCII_MAX (0x7fU) 65 66 /* The mask values for input and output endians. */ 67 #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) 68 #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) 69 70 /* Native and reversed endian macros. */ 71 #ifdef _BIG_ENDIAN 72 #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN 73 #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN 74 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN 75 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN 76 #else 77 #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN 78 #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN 79 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN 80 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN 81 #endif /* _BIG_ENDIAN */ 82 83 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ 84 #define UCONV_BOM_NORMAL (0xfeffU) 85 #define UCONV_BOM_SWAPPED (0xfffeU) 86 #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) 87 88 /* UTF-32 boundaries based on UTF-8 character byte lengths. */ 89 #define UCONV_U8_ONE_BYTE (0x7fU) 90 #define UCONV_U8_TWO_BYTES (0x7ffU) 91 #define UCONV_U8_THREE_BYTES (0xffffU) 92 #define UCONV_U8_FOUR_BYTES (0x10ffffU) 93 94 /* The common minimum and maximum values at the UTF-8 character bytes. */ 95 #define UCONV_U8_BYTE_MIN (0x80U) 96 #define UCONV_U8_BYTE_MAX (0xbfU) 97 98 /* 99 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of 100 * UTF-8 character bytes. 101 */ 102 #define UCONV_U8_BIT_SHIFT 6 103 #define UCONV_U8_BIT_MASK 0x3f 104 105 /* 106 * The following vector shows remaining bytes in a UTF-8 character. 107 * Index will be the first byte of the character. 108 */ 109 static const uchar_t remaining_bytes_tbl[0x100] = { 110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122 123 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 124 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 125 126 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 127 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 128 129 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 130 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 131 132 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 133 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 134 }; 135 136 /* 137 * The following is a vector of bit-masks to get used bits in 138 * the first byte of a UTF-8 character. Index is remaining bytes at above of 139 * the character. 140 */ 141 #ifdef _KERNEL 142 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 143 #else 144 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 145 #endif /* _KERNEL */ 146 147 /* 148 * The following two vectors are to provide valid minimum and 149 * maximum values for the 2'nd byte of a multibyte UTF-8 character for 150 * better illegal sequence checking. The index value must be the value of 151 * the first byte of the UTF-8 character. 152 */ 153 static const uchar_t valid_min_2nd_byte[0x100] = { 154 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0, 156 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 0, 0, 0, 0, 0, 158 0, 0, 0, 0, 0, 0, 0, 0, 159 0, 0, 0, 0, 0, 0, 0, 0, 160 0, 0, 0, 0, 0, 0, 0, 0, 161 0, 0, 0, 0, 0, 0, 0, 0, 162 0, 0, 0, 0, 0, 0, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 164 0, 0, 0, 0, 0, 0, 0, 0, 165 0, 0, 0, 0, 0, 0, 0, 0, 166 0, 0, 0, 0, 0, 0, 0, 0, 167 0, 0, 0, 0, 0, 0, 0, 0, 168 0, 0, 0, 0, 0, 0, 0, 0, 169 0, 0, 0, 0, 0, 0, 0, 0, 170 0, 0, 0, 0, 0, 0, 0, 0, 171 0, 0, 0, 0, 0, 0, 0, 0, 172 0, 0, 0, 0, 0, 0, 0, 0, 173 0, 0, 0, 0, 0, 0, 0, 0, 174 0, 0, 0, 0, 0, 0, 0, 0, 175 0, 0, 0, 0, 0, 0, 0, 0, 176 0, 0, 0, 0, 0, 0, 0, 0, 177 0, 0, 0, 0, 0, 0, 0, 0, 178 179 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 180 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 181 182 /* C8 C9 CA CB CC CD CE CF */ 183 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 184 185 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 186 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 187 188 /* D8 D9 DA DB DC DD DE DF */ 189 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 190 191 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 192 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 193 194 /* E8 E9 EA EB EC ED EE EF */ 195 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 196 197 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 198 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 199 200 0, 0, 0, 0, 0, 0, 0, 0 201 }; 202 203 static const uchar_t valid_max_2nd_byte[0x100] = { 204 0, 0, 0, 0, 0, 0, 0, 0, 205 0, 0, 0, 0, 0, 0, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 229 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 230 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 231 232 /* C8 C9 CA CB CC CD CE CF */ 233 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 234 235 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 236 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 237 238 /* D8 D9 DA DB DC DD DE DF */ 239 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 240 241 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 242 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 243 244 /* E8 E9 EA EB EC ED EE EF */ 245 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 246 247 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 248 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 249 250 0, 0, 0, 0, 0, 0, 0, 0 251 }; 252 253 254 static int 255 check_endian(int flag, int *in, int *out) 256 { 257 *in = flag & UCONV_IN_ENDIAN_MASKS; 258 259 /* You cannot have both. */ 260 if (*in == UCONV_IN_ENDIAN_MASKS) 261 return (EBADF); 262 263 if (*in == 0) 264 *in = UCONV_IN_NAT_ENDIAN; 265 266 *out = flag & UCONV_OUT_ENDIAN_MASKS; 267 268 /* You cannot have both. */ 269 if (*out == UCONV_OUT_ENDIAN_MASKS) 270 return (EBADF); 271 272 if (*out == 0) 273 *out = UCONV_OUT_NAT_ENDIAN; 274 275 return (0); 276 } 277 278 static boolean_t 279 check_bom16(const uint16_t *u16s, size_t u16l, int *in) 280 { 281 if (u16l > 0) { 282 if (*u16s == UCONV_BOM_NORMAL) { 283 *in = UCONV_IN_NAT_ENDIAN; 284 return (B_TRUE); 285 } 286 if (*u16s == UCONV_BOM_SWAPPED) { 287 *in = UCONV_IN_REV_ENDIAN; 288 return (B_TRUE); 289 } 290 } 291 292 return (B_FALSE); 293 } 294 295 static boolean_t 296 check_bom32(const uint32_t *u32s, size_t u32l, int *in) 297 { 298 if (u32l > 0) { 299 if (*u32s == UCONV_BOM_NORMAL) { 300 *in = UCONV_IN_NAT_ENDIAN; 301 return (B_TRUE); 302 } 303 if (*u32s == UCONV_BOM_SWAPPED_32) { 304 *in = UCONV_IN_REV_ENDIAN; 305 return (B_TRUE); 306 } 307 } 308 309 return (B_FALSE); 310 } 311 312 int 313 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, 314 uint32_t *u32s, size_t *utf32len, int flag) 315 { 316 int inendian; 317 int outendian; 318 size_t u16l; 319 size_t u32l; 320 uint32_t hi; 321 uint32_t lo; 322 boolean_t do_not_ignore_null; 323 324 /* 325 * Do preliminary validity checks on parameters and collect info on 326 * endians. 327 */ 328 if (u16s == NULL || utf16len == NULL) 329 return (EILSEQ); 330 331 if (u32s == NULL || utf32len == NULL) 332 return (E2BIG); 333 334 if (check_endian(flag, &inendian, &outendian) != 0) 335 return (EBADF); 336 337 /* 338 * Initialize input and output parameter buffer indices and 339 * temporary variables. 340 */ 341 u16l = u32l = 0; 342 hi = 0; 343 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 344 345 /* 346 * Check on the BOM at the beginning of the input buffer if required 347 * and if there is indeed one, process it. 348 */ 349 if ((flag & UCONV_IN_ACCEPT_BOM) && 350 check_bom16(u16s, *utf16len, &inendian)) 351 u16l++; 352 353 /* 354 * Reset inendian and outendian so that after this point, those can be 355 * used as condition values. 356 */ 357 inendian &= UCONV_IN_NAT_ENDIAN; 358 outendian &= UCONV_OUT_NAT_ENDIAN; 359 360 /* 361 * If there is something in the input buffer and if necessary and 362 * requested, save the BOM at the output buffer. 363 */ 364 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 365 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 366 UCONV_BOM_SWAPPED_32; 367 368 /* 369 * Do conversion; if encounter a surrogate pair, assemble high and 370 * low pair values to form a UTF-32 character. If a half of a pair 371 * exists alone, then, either it is an illegal (EILSEQ) or 372 * invalid (EINVAL) value. 373 */ 374 for (; u16l < *utf16len; u16l++) { 375 if (u16s[u16l] == 0 && do_not_ignore_null) 376 break; 377 378 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 379 380 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 381 if (hi) 382 return (EILSEQ); 383 hi = lo; 384 continue; 385 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 386 if (! hi) 387 return (EILSEQ); 388 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 389 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 390 + UCONV_U16_START; 391 hi = 0; 392 } else if (hi) { 393 return (EILSEQ); 394 } 395 396 if (u32l >= *utf32len) 397 return (E2BIG); 398 399 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); 400 } 401 402 /* 403 * If high half didn't see low half, then, it's most likely the input 404 * parameter is incomplete. 405 */ 406 if (hi) 407 return (EINVAL); 408 409 /* 410 * Save the number of consumed and saved characters. They do not 411 * include terminating NULL character (U+0000) at the end of 412 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and 413 * the input buffer length is big enough to include the terminating 414 * NULL character). 415 */ 416 *utf16len = u16l; 417 *utf32len = u32l; 418 419 return (0); 420 } 421 422 int 423 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, 424 uchar_t *u8s, size_t *utf8len, int flag) 425 { 426 int inendian; 427 int outendian; 428 size_t u16l; 429 size_t u8l; 430 uint32_t hi; 431 uint32_t lo; 432 boolean_t do_not_ignore_null; 433 434 if (u16s == NULL || utf16len == NULL) 435 return (EILSEQ); 436 437 if (u8s == NULL || utf8len == NULL) 438 return (E2BIG); 439 440 if (check_endian(flag, &inendian, &outendian) != 0) 441 return (EBADF); 442 443 u16l = u8l = 0; 444 hi = 0; 445 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 446 447 if ((flag & UCONV_IN_ACCEPT_BOM) && 448 check_bom16(u16s, *utf16len, &inendian)) 449 u16l++; 450 451 inendian &= UCONV_IN_NAT_ENDIAN; 452 453 for (; u16l < *utf16len; u16l++) { 454 if (u16s[u16l] == 0 && do_not_ignore_null) 455 break; 456 457 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); 458 459 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { 460 if (hi) 461 return (EILSEQ); 462 hi = lo; 463 continue; 464 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { 465 if (! hi) 466 return (EILSEQ); 467 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + 468 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) 469 + UCONV_U16_START; 470 hi = 0; 471 } else if (hi) { 472 return (EILSEQ); 473 } 474 475 /* 476 * Now we convert a UTF-32 character into a UTF-8 character. 477 * Unicode coding space is between U+0000 and U+10FFFF; 478 * anything bigger is an illegal character. 479 */ 480 if (lo <= UCONV_U8_ONE_BYTE) { 481 if (u8l >= *utf8len) 482 return (E2BIG); 483 u8s[u8l++] = (uchar_t)lo; 484 } else if (lo <= UCONV_U8_TWO_BYTES) { 485 if ((u8l + 1) >= *utf8len) 486 return (E2BIG); 487 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 488 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 489 } else if (lo <= UCONV_U8_THREE_BYTES) { 490 if ((u8l + 2) >= *utf8len) 491 return (E2BIG); 492 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 493 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 494 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 495 } else if (lo <= UCONV_U8_FOUR_BYTES) { 496 if ((u8l + 3) >= *utf8len) 497 return (E2BIG); 498 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 499 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 500 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 501 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 502 } else { 503 return (EILSEQ); 504 } 505 } 506 507 if (hi) 508 return (EINVAL); 509 510 *utf16len = u16l; 511 *utf8len = u8l; 512 513 return (0); 514 } 515 516 int 517 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, 518 uint16_t *u16s, size_t *utf16len, int flag) 519 { 520 int inendian; 521 int outendian; 522 size_t u16l; 523 size_t u32l; 524 uint32_t hi; 525 uint32_t lo; 526 boolean_t do_not_ignore_null; 527 528 if (u32s == NULL || utf32len == NULL) 529 return (EILSEQ); 530 531 if (u16s == NULL || utf16len == NULL) 532 return (E2BIG); 533 534 if (check_endian(flag, &inendian, &outendian) != 0) 535 return (EBADF); 536 537 u16l = u32l = 0; 538 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 539 540 if ((flag & UCONV_IN_ACCEPT_BOM) && 541 check_bom32(u32s, *utf32len, &inendian)) 542 u32l++; 543 544 inendian &= UCONV_IN_NAT_ENDIAN; 545 outendian &= UCONV_OUT_NAT_ENDIAN; 546 547 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 548 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 549 UCONV_BOM_SWAPPED; 550 551 for (; u32l < *utf32len; u32l++) { 552 if (u32s[u32l] == 0 && do_not_ignore_null) 553 break; 554 555 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 556 557 /* 558 * Anything bigger than the Unicode coding space, i.e., 559 * Unicode scalar value bigger than U+10FFFF, is an illegal 560 * character. 561 */ 562 if (hi > UCONV_UNICODE_MAX) 563 return (EILSEQ); 564 565 /* 566 * Anything bigger than U+FFFF must be converted into 567 * a surrogate pair in UTF-16. 568 */ 569 if (hi >= UCONV_U16_START) { 570 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 571 UCONV_U16_LO_MIN; 572 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 573 UCONV_U16_HI_MIN; 574 575 if ((u16l + 1) >= *utf16len) 576 return (E2BIG); 577 578 if (outendian) { 579 u16s[u16l++] = (uint16_t)hi; 580 u16s[u16l++] = (uint16_t)lo; 581 } else { 582 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 583 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 584 } 585 } else { 586 if (u16l >= *utf16len) 587 return (E2BIG); 588 u16s[u16l++] = (outendian) ? (uint16_t)hi : 589 BSWAP_16(((uint16_t)hi)); 590 } 591 } 592 593 *utf16len = u16l; 594 *utf32len = u32l; 595 596 return (0); 597 } 598 599 int 600 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, 601 uchar_t *u8s, size_t *utf8len, int flag) 602 { 603 int inendian; 604 int outendian; 605 size_t u32l; 606 size_t u8l; 607 uint32_t lo; 608 boolean_t do_not_ignore_null; 609 610 if (u32s == NULL || utf32len == NULL) 611 return (EILSEQ); 612 613 if (u8s == NULL || utf8len == NULL) 614 return (E2BIG); 615 616 if (check_endian(flag, &inendian, &outendian) != 0) 617 return (EBADF); 618 619 u32l = u8l = 0; 620 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 621 622 if ((flag & UCONV_IN_ACCEPT_BOM) && 623 check_bom32(u32s, *utf32len, &inendian)) 624 u32l++; 625 626 inendian &= UCONV_IN_NAT_ENDIAN; 627 628 for (; u32l < *utf32len; u32l++) { 629 if (u32s[u32l] == 0 && do_not_ignore_null) 630 break; 631 632 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); 633 634 if (lo <= UCONV_U8_ONE_BYTE) { 635 if (u8l >= *utf8len) 636 return (E2BIG); 637 u8s[u8l++] = (uchar_t)lo; 638 } else if (lo <= UCONV_U8_TWO_BYTES) { 639 if ((u8l + 1) >= *utf8len) 640 return (E2BIG); 641 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); 642 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); 643 } else if (lo <= UCONV_U8_THREE_BYTES) { 644 if ((u8l + 2) >= *utf8len) 645 return (E2BIG); 646 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); 647 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); 648 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); 649 } else if (lo <= UCONV_U8_FOUR_BYTES) { 650 if ((u8l + 3) >= *utf8len) 651 return (E2BIG); 652 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); 653 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); 654 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); 655 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); 656 } else { 657 return (EILSEQ); 658 } 659 } 660 661 *utf32len = u32l; 662 *utf8len = u8l; 663 664 return (0); 665 } 666 667 int 668 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, 669 uint16_t *u16s, size_t *utf16len, int flag) 670 { 671 int inendian; 672 int outendian; 673 size_t u16l; 674 size_t u8l; 675 uint32_t hi; 676 uint32_t lo; 677 int remaining_bytes; 678 int first_b; 679 boolean_t do_not_ignore_null; 680 681 if (u8s == NULL || utf8len == NULL) 682 return (EILSEQ); 683 684 if (u16s == NULL || utf16len == NULL) 685 return (E2BIG); 686 687 if (check_endian(flag, &inendian, &outendian) != 0) 688 return (EBADF); 689 690 u16l = u8l = 0; 691 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 692 693 outendian &= UCONV_OUT_NAT_ENDIAN; 694 695 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 696 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : 697 UCONV_BOM_SWAPPED; 698 699 for (; u8l < *utf8len; ) { 700 if (u8s[u8l] == 0 && do_not_ignore_null) 701 break; 702 703 /* 704 * Collect a UTF-8 character and convert it to a UTF-32 705 * character. In doing so, we screen out illegally formed 706 * UTF-8 characters and treat such as illegal characters. 707 * The algorithm at below also screens out anything bigger 708 * than the U+10FFFF. 709 * 710 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for 711 * more details on the illegal values of UTF-8 character 712 * bytes. 713 */ 714 hi = (uint32_t)u8s[u8l++]; 715 716 if (hi > UCONV_ASCII_MAX) { 717 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 718 return (EILSEQ); 719 720 first_b = hi; 721 hi = hi & u8_masks_tbl[remaining_bytes]; 722 723 for (; remaining_bytes > 0; remaining_bytes--) { 724 /* 725 * If we have no more bytes, the current 726 * UTF-8 character is incomplete. 727 */ 728 if (u8l >= *utf8len) 729 return (EINVAL); 730 731 lo = (uint32_t)u8s[u8l++]; 732 733 if (first_b) { 734 if (lo < valid_min_2nd_byte[first_b] || 735 lo > valid_max_2nd_byte[first_b]) 736 return (EILSEQ); 737 first_b = 0; 738 } else if (lo < UCONV_U8_BYTE_MIN || 739 lo > UCONV_U8_BYTE_MAX) { 740 return (EILSEQ); 741 } 742 hi = (hi << UCONV_U8_BIT_SHIFT) | 743 (lo & UCONV_U8_BIT_MASK); 744 } 745 } 746 747 if (hi >= UCONV_U16_START) { 748 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + 749 UCONV_U16_LO_MIN; 750 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + 751 UCONV_U16_HI_MIN; 752 753 if ((u16l + 1) >= *utf16len) 754 return (E2BIG); 755 756 if (outendian) { 757 u16s[u16l++] = (uint16_t)hi; 758 u16s[u16l++] = (uint16_t)lo; 759 } else { 760 u16s[u16l++] = BSWAP_16(((uint16_t)hi)); 761 u16s[u16l++] = BSWAP_16(((uint16_t)lo)); 762 } 763 } else { 764 if (u16l >= *utf16len) 765 return (E2BIG); 766 767 u16s[u16l++] = (outendian) ? (uint16_t)hi : 768 BSWAP_16(((uint16_t)hi)); 769 } 770 } 771 772 *utf16len = u16l; 773 *utf8len = u8l; 774 775 return (0); 776 } 777 778 int 779 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, 780 uint32_t *u32s, size_t *utf32len, int flag) 781 { 782 int inendian; 783 int outendian; 784 size_t u32l; 785 size_t u8l; 786 uint32_t hi; 787 uint32_t c; 788 int remaining_bytes; 789 int first_b; 790 boolean_t do_not_ignore_null; 791 792 if (u8s == NULL || utf8len == NULL) 793 return (EILSEQ); 794 795 if (u32s == NULL || utf32len == NULL) 796 return (E2BIG); 797 798 if (check_endian(flag, &inendian, &outendian) != 0) 799 return (EBADF); 800 801 u32l = u8l = 0; 802 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); 803 804 outendian &= UCONV_OUT_NAT_ENDIAN; 805 806 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) 807 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : 808 UCONV_BOM_SWAPPED_32; 809 810 for (; u8l < *utf8len; ) { 811 if (u8s[u8l] == 0 && do_not_ignore_null) 812 break; 813 814 hi = (uint32_t)u8s[u8l++]; 815 816 if (hi > UCONV_ASCII_MAX) { 817 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) 818 return (EILSEQ); 819 820 first_b = hi; 821 hi = hi & u8_masks_tbl[remaining_bytes]; 822 823 for (; remaining_bytes > 0; remaining_bytes--) { 824 if (u8l >= *utf8len) 825 return (EINVAL); 826 827 c = (uint32_t)u8s[u8l++]; 828 829 if (first_b) { 830 if (c < valid_min_2nd_byte[first_b] || 831 c > valid_max_2nd_byte[first_b]) 832 return (EILSEQ); 833 first_b = 0; 834 } else if (c < UCONV_U8_BYTE_MIN || 835 c > UCONV_U8_BYTE_MAX) { 836 return (EILSEQ); 837 } 838 hi = (hi << UCONV_U8_BIT_SHIFT) | 839 (c & UCONV_U8_BIT_MASK); 840 } 841 } 842 843 if (u32l >= *utf32len) 844 return (E2BIG); 845 846 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); 847 } 848 849 *utf32len = u32l; 850 *utf8len = u8l; 851 852 return (0); 853 } 854