1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms version 1.0 5 * of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * The functions in this file convert from the standard multibyte forms 18 * to the wide character forms used internally by libc. Unfortunately, 19 * this approach means that we need a method for each and every encoding. 20 */ 21 22 #include <stdlib.h> 23 #include <wchar.h> 24 #include <string.h> 25 #include <sys/types.h> 26 #include "localedef.h" 27 28 static int towide_none(wchar_t *, const char *, int); 29 static int towide_utf8(wchar_t *, const char *, int); 30 static int towide_big5(wchar_t *, const char *, int); 31 static int towide_gbk(wchar_t *, const char *, int); 32 static int towide_gb2312(wchar_t *, const char *, int); 33 static int towide_gb18030(wchar_t *, const char *, int); 34 static int towide_mskanji(wchar_t *, const char *, int); 35 static int towide_euccn(wchar_t *, const char *, int); 36 static int towide_eucjp(wchar_t *, const char *, int); 37 static int towide_euckr(wchar_t *, const char *, int); 38 static int towide_euctw(wchar_t *, const char *, int); 39 40 static int tomb_none(char *, wchar_t); 41 static int tomb_utf8(char *, wchar_t); 42 static int tomb_mbs(char *, wchar_t); 43 44 static int (*_towide)(wchar_t *, const char *, int) = towide_none; 45 static int (*_tomb)(char *, wchar_t) = tomb_none; 46 static const char *_encoding = "NONE"; 47 static int _nbits = 7; 48 49 /* 50 * Table of supported encodings. We only bother to list the multibyte 51 * encodings here, because single byte locales are handed by "NONE". 52 */ 53 static struct { 54 const char *name; 55 /* the name that the underlying libc implemenation uses */ 56 const char *cname; 57 /* the maximum number of bits required for priorities */ 58 int nbits; 59 int (*towide)(wchar_t *, const char *, int); 60 int (*tomb)(char *, wchar_t); 61 } mb_encodings[] = { 62 /* 63 * UTF8 values max out at 0x1fffff (although in theory there could 64 * be later extensions, but it won't happen.) This means we only need 65 * 21 bits to be able to encode the entire range of priorities. 66 */ 67 { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 68 { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 69 { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 70 { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 71 72 { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 73 { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 74 /* 75 * Becuase the 3-byte form of EUC-JP use the same leading byte, 76 * only 17 bits required to provide unique priorities. (The low 77 * bit of that first byte is set.) By setting this value low, 78 * we can get by with only 3 bytes in the strxfrm expansion. 79 */ 80 { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 81 { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 82 83 { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 84 { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 85 /* 86 * EUC-TW uses 2 bytes most of the time, but 4 bytes if the 87 * high order byte is 0x8E. However, with 4 byte encodings, 88 * the third byte will be A0-B0. So we only need to consider 89 * the lower order 24 bits for collation. 90 */ 91 { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 92 { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 93 94 { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 95 { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 96 { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs }, 97 { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 98 { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 99 100 { "BIG5", "BIG5", 16, towide_big5, tomb_mbs }, 101 { "big5", "BIG5", 16, towide_big5, tomb_mbs }, 102 { "Big5", "BIG5", 16, towide_big5, tomb_mbs }, 103 104 { "GBK", "GBK", 16, towide_gbk, tomb_mbs }, 105 106 /* 107 * GB18030 can get away with just 31 bits. This is because the 108 * high order bit is always set for 4 byte values, and the 109 * at least one of the other bits in that 4 byte value will 110 * be non-zero. 111 */ 112 { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs }, 113 114 /* 115 * This should probably be an aliase for euc-cn, or vice versa. 116 */ 117 { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs }, 118 119 { "ASCII", "ASCII", 7, towide_none, tomb_none }, 120 { "US-ASCII", "ASCII", 7, towide_none, tomb_none }, 121 { "646", "ASCII", 7, towide_none, tomb_none }, 122 123 { NULL, NULL }, 124 }; 125 126 static char * 127 show_mb(const char *mb) 128 { 129 static char buf[64]; 130 131 /* ASCII stuff we just print */ 132 if (isascii(*mb) && isgraph(*mb)) { 133 buf[0] = *mb; 134 buf[1] = 0; 135 return (buf); 136 } 137 buf[0] = 0; 138 while (*mb != 0) { 139 char scr[8]; 140 (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); 141 (void) strlcat(buf, scr, sizeof (buf)); 142 mb++; 143 } 144 return (buf); 145 } 146 147 static char *widemsg; 148 149 void 150 werr(const char *fmt, ...) 151 { 152 char *msg; 153 154 va_list va; 155 va_start(va, fmt); 156 (void) vasprintf(&msg, fmt, va); 157 va_end(va); 158 159 free(widemsg); 160 widemsg = msg; 161 } 162 163 /* 164 * This is used for 8-bit encodings. 165 */ 166 int 167 towide_none(wchar_t *c, const char *mb, int n) 168 { 169 if (mb_cur_max != 1) { 170 werr("invalid or unsupported multibyte locale"); 171 return (-1); 172 } 173 if (n < 1) { 174 werr("no character data"); 175 return (-1); 176 } 177 *c = (uint8_t)*mb; 178 return (1); 179 } 180 181 int 182 tomb_none(char *mb, wchar_t wc) 183 { 184 if (mb_cur_max != 1) { 185 werr("invalid or unsupported multibyte locale"); 186 return (-1); 187 } 188 *(uint8_t *)mb = (wc & 0xff); 189 mb[1] = 0; 190 return (1); 191 } 192 193 /* 194 * UTF-8 stores wide characters in UTF-32 form. 195 */ 196 int 197 towide_utf8(wchar_t *wc, const char *mb, int n) 198 { 199 wchar_t c; 200 int nb; 201 int lv; /* lowest legal value */ 202 int i; 203 const uint8_t *s = (const uint8_t *)mb; 204 205 if (n < 1) { 206 werr("no utf8 data"); 207 return (-1); 208 } 209 c = *s; 210 211 if ((c & 0x80) == 0) { 212 /* 7-bit ASCII */ 213 *wc = c; 214 return (1); 215 } else if ((c & 0xe0) == 0xc0) { 216 /* u80-u7ff - two bytes encoded */ 217 nb = 2; 218 lv = 0x80; 219 c &= ~0xe0; 220 } else if ((c & 0xf0) == 0xe0) { 221 /* u800-uffff - three bytes encoded */ 222 nb = 3; 223 lv = 0x800; 224 c &= ~0xf0; 225 } else if ((c & 0xf8) == 0xf0) { 226 /* u1000-u1fffff - four bytes encoded */ 227 nb = 4; 228 lv = 0x1000; 229 c &= ~0xf8; 230 } else { 231 /* 5 and 6 byte encodings are not legal unicode */ 232 werr("utf8 encoding too large (%s)", show_mb(mb)); 233 return (-1); 234 } 235 if (nb > n) { 236 werr("incomplete utf8 sequence (%s)", show_mb(mb)); 237 return (-1); 238 } 239 240 for (i = 1; i < nb; i++) { 241 if (((s[i]) & 0xc0) != 0x80) { 242 werr("illegal utf8 byte (%x)", s[i]); 243 return (-1); 244 } 245 c <<= 6; 246 c |= (s[i] & 0x3f); 247 } 248 249 if (c < lv) { 250 werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); 251 return (-1); 252 } 253 *wc = c; 254 return (nb); 255 } 256 257 int 258 tomb_utf8(char *mb, wchar_t wc) 259 { 260 uint8_t *s = (uint8_t *)mb; 261 uint8_t msk; 262 int cnt; 263 int i; 264 265 if (wc <= 0x7f) { 266 s[0] = wc & 0x7f; 267 s[1] = 0; 268 return (1); 269 } 270 if (wc <= 0x7ff) { 271 cnt = 2; 272 msk = 0xc0; 273 } else if (wc <= 0xffff) { 274 cnt = 3; 275 msk = 0xe0; 276 } else if (wc <= 0x1fffff) { 277 cnt = 4; 278 msk = 0xf0; 279 } else { 280 werr("illegal uf8 char (%x)", wc); 281 return (-1); 282 } 283 for (i = cnt - 1; i; i--) { 284 s[i] = (wc & 0x3f) | 0x80; 285 wc >>= 6; 286 } 287 s[0] = (msk) | wc; 288 s[cnt] = 0; 289 return (cnt); 290 } 291 292 /* 293 * Several encodings share a simplistic dual byte encoding. In these 294 * forms, they all indicate that a two byte sequence is to be used if 295 * the first byte has its high bit set. They all store this simple 296 * encoding as a 16-bit value, although a great many of the possible 297 * code points are not used in most character sets. This gives a possible 298 * set of just over 32,000 valid code points. 299 * 300 * 0x00 - 0x7f - 1 byte encoding 301 * 0x80 - 0x7fff - illegal 302 * 0x8000 - 0xffff - 2 byte encoding 303 */ 304 static int 305 towide_dbcs(wchar_t *wc, const char *mb, int n) 306 { 307 wchar_t c; 308 309 c = *(uint8_t *)mb; 310 311 if (n < 1) { 312 werr("no character data"); 313 return (-1); 314 } 315 if ((c & 0x80) == 0) { 316 /* 7-bit */ 317 *wc = c; 318 return (1); 319 } 320 if (n < 2) { 321 werr("incomplete character sequence (%s)", show_mb(mb)); 322 return (-1); 323 } 324 325 /* Store both bytes as a single 16-bit wide. */ 326 c <<= 8; 327 c |= (uint8_t)(mb[1]); 328 *wc = c; 329 return (2); 330 } 331 332 /* 333 * Most multibyte locales just convert the wide character to the multibyte 334 * form by stripping leading null bytes, and writing the 32-bit quantity 335 * in big-endian order. 336 */ 337 int 338 tomb_mbs(char *mb, wchar_t wc) 339 { 340 uint8_t *s = (uint8_t *)mb; 341 int n = 0, c; 342 343 if ((wc & 0xff000000U) != 0) { 344 n = 4; 345 } else if ((wc & 0x00ff0000U) != 0) { 346 n = 3; 347 } else if ((wc & 0x0000ff00U) != 0) { 348 n = 2; 349 } else { 350 n = 1; 351 } 352 c = n; 353 while (n) { 354 n--; 355 s[n] = wc & 0xff; 356 wc >>= 8; 357 } 358 /* ensure null termination */ 359 s[c] = 0; 360 return (c); 361 } 362 363 364 /* 365 * big5 is a simple dual byte character set. 366 */ 367 int 368 towide_big5(wchar_t *wc, const char *mb, int n) 369 { 370 return (towide_dbcs(wc, mb, n)); 371 } 372 373 /* 374 * GBK encodes wides in the same way that big5 does, the high order 375 * bit of the first byte indicates a double byte character. 376 */ 377 int 378 towide_gbk(wchar_t *wc, const char *mb, int n) 379 { 380 return (towide_dbcs(wc, mb, n)); 381 } 382 383 /* 384 * GB2312 is another DBCS. Its cleaner than others in that the second 385 * byte does not encode ASCII, but it supports characters. 386 */ 387 int 388 towide_gb2312(wchar_t *wc, const char *mb, int n) 389 { 390 return (towide_dbcs(wc, mb, n)); 391 } 392 393 /* 394 * GB18030. This encodes as 8, 16, or 32-bits. 395 * 7-bit values are in 1 byte, 4 byte sequences are used when 396 * the second byte encodes 0x30-39 and all other sequences are 2 bytes. 397 */ 398 int 399 towide_gb18030(wchar_t *wc, const char *mb, int n) 400 { 401 wchar_t c; 402 403 c = *(uint8_t *)mb; 404 405 if (n < 1) { 406 werr("no character data"); 407 return (-1); 408 } 409 if ((c & 0x80) == 0) { 410 /* 7-bit */ 411 *wc = c; 412 return (1); 413 } 414 if (n < 2) { 415 werr("incomplete character sequence (%s)", show_mb(mb)); 416 return (-1); 417 } 418 419 /* pull in the second byte */ 420 c <<= 8; 421 c |= (uint8_t)(mb[1]); 422 423 if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { 424 if (n < 4) { 425 werr("incomplete 4-byte character sequence (%s)", 426 show_mb(mb)); 427 return (-1); 428 } 429 c <<= 8; 430 c |= (uint8_t)(mb[2]); 431 c <<= 8; 432 c |= (uint8_t)(mb[3]); 433 *wc = c; 434 return (4); 435 } 436 437 *wc = c; 438 return (2); 439 } 440 441 /* 442 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it 443 * also has a range of single byte characters above 0x80. (0xa1-0xdf). 444 */ 445 int 446 towide_mskanji(wchar_t *wc, const char *mb, int n) 447 { 448 wchar_t c; 449 450 c = *(uint8_t *)mb; 451 452 if (n < 1) { 453 werr("no character data"); 454 return (-1); 455 } 456 if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { 457 /* 7-bit */ 458 *wc = c; 459 return (-1); 460 } 461 462 if (n < 2) { 463 werr("incomplete character sequence (%s)", show_mb(mb)); 464 return (-1); 465 } 466 467 /* Store both bytes as a single 16-bit wide. */ 468 c <<= 8; 469 c |= (uint8_t)(mb[1]); 470 *wc = c; 471 return (2); 472 } 473 474 /* 475 * EUC forms. EUC encodings are "variable". FreeBSD carries some additional 476 * variable data to encode these, but we're going to treat each as independent 477 * instead. Its the only way we can sensibly move forward. 478 * 479 * Note that the way in which the different EUC forms vary is how wide 480 * CS2 and CS3 are and what the first byte of them is. 481 */ 482 static int 483 towide_euc_impl(wchar_t *wc, const char *mb, int n, 484 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 485 { 486 int i; 487 int width; 488 wchar_t c; 489 490 c = *(uint8_t *)mb; 491 492 if (n < 1) { 493 werr("no character data"); 494 return (-1); 495 } 496 497 /* 498 * All variations of EUC encode 7-bit ASCII as one byte, and use 499 * additional bytes for more than that. 500 */ 501 if ((c & 0x80) == 0) { 502 /* 7-bit */ 503 *wc = c; 504 return (1); 505 } 506 507 /* 508 * All EUC variants reserve 0xa1-0xff to identify CS1, which 509 * is always two bytes wide. Note that unused CS will be zero, 510 * and that cannot be true because we know that the high order 511 * bit must be set. 512 */ 513 if (c >= 0xa1) { 514 width = 2; 515 } else if (c == cs2) { 516 width = cs2width; 517 } else if (c == cs3) { 518 width = cs3width; 519 } 520 521 if (n < width) { 522 werr("incomplete character sequence (%s)", show_mb(mb)); 523 return (-1); 524 } 525 526 for (i = 1; i < width; i++) { 527 /* pull in the next byte */ 528 c <<= 8; 529 c |= (uint8_t)(mb[i]); 530 } 531 532 *wc = c; 533 return (width); 534 } 535 536 /* 537 * EUC-CN encodes as follows: 538 * 539 * Code set 0 (ASCII): 0x21-0x7E 540 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 541 * Code set 2: unused 542 * Code set 3: unused 543 */ 544 int 545 towide_euccn(wchar_t *wc, const char *mb, int n) 546 { 547 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 548 } 549 550 /* 551 * EUC-JP encodes as follows: 552 * 553 * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E 554 * Code set 1 (JIS X 0208): 0xA1A1-0xFEFE 555 * Code set 2 (half-width katakana): 0x8EA1-0x8EDF 556 * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE 557 */ 558 int 559 towide_eucjp(wchar_t *wc, const char *mb, int n) 560 { 561 return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); 562 } 563 564 /* 565 * EUC-KR encodes as follows: 566 * 567 * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E 568 * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE 569 * Code set 2: unused 570 * Code set 3: unused 571 */ 572 int 573 towide_euckr(wchar_t *wc, const char *mb, int n) 574 { 575 return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); 576 } 577 578 /* 579 * EUC-TW encodes as follows: 580 * 581 * Code set 0 (ASCII): 0x21-0x7E 582 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 583 * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 584 * Code set 3: unused 585 */ 586 int 587 towide_euctw(wchar_t *wc, const char *mb, int n) 588 { 589 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 590 } 591 592 /* 593 * Public entry points. 594 */ 595 596 int 597 to_wide(wchar_t *wc, const char *mb) 598 { 599 /* this won't fail hard */ 600 return (_towide(wc, mb, strlen(mb) + 1)); 601 } 602 603 int 604 to_mb(char *mb, wchar_t wc) 605 { 606 int rv; 607 608 if ((rv = _tomb(mb, wc)) < 0) { 609 errf(widemsg); 610 free(widemsg); 611 widemsg = NULL; 612 } 613 return (rv); 614 } 615 616 char * 617 to_mb_string(const wchar_t *wcs) 618 { 619 char *mbs; 620 char *ptr; 621 int len; 622 623 mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); 624 if (mbs == NULL) { 625 errf("out of memory"); 626 return (NULL); 627 } 628 ptr = mbs; 629 while (*wcs) { 630 if ((len = to_mb(ptr, *wcs)) < 0) { 631 INTERR; 632 free(mbs); 633 return (NULL); 634 } 635 wcs++; 636 ptr += len; 637 } 638 *ptr = 0; 639 return (mbs); 640 } 641 642 void 643 set_wide_encoding(const char *encoding) 644 { 645 int i; 646 647 _towide = towide_none; 648 _tomb = tomb_none; 649 _encoding = "NONE"; 650 _nbits = 8; 651 652 for (i = 0; mb_encodings[i].name; i++) { 653 if (strcasecmp(encoding, mb_encodings[i].name) == 0) { 654 _towide = mb_encodings[i].towide; 655 _tomb = mb_encodings[i].tomb; 656 _encoding = mb_encodings[i].cname; 657 _nbits = mb_encodings[i].nbits; 658 break; 659 } 660 } 661 } 662 663 const char * 664 get_wide_encoding(void) 665 { 666 return (_encoding); 667 } 668 669 int 670 max_wide(void) 671 { 672 return ((int)((1U << _nbits) - 1)); 673 } 674