1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * The functions in this file convert from the standard multibyte forms 18 * to the wide character forms used internally by libc. Unfortunately, 19 * this approach means that we need a method for each and every encoding. 20 */ 21 22 #include <stdlib.h> 23 #include <wchar.h> 24 #include <string.h> 25 #include <sys/types.h> 26 #include "localedef.h" 27 28 static int towide_none(wchar_t *, const char *, int); 29 static int towide_utf8(wchar_t *, const char *, int); 30 static int towide_big5(wchar_t *, const char *, int); 31 static int towide_gbk(wchar_t *, const char *, int); 32 static int towide_gb2312(wchar_t *, const char *, int); 33 static int towide_gb18030(wchar_t *, const char *, int); 34 static int towide_mskanji(wchar_t *, const char *, int); 35 static int towide_euccn(wchar_t *, const char *, int); 36 static int towide_eucjp(wchar_t *, const char *, int); 37 static int towide_euckr(wchar_t *, const char *, int); 38 static int towide_euctw(wchar_t *, const char *, int); 39 40 static int tomb_none(char *, wchar_t); 41 static int tomb_utf8(char *, wchar_t); 42 static int tomb_mbs(char *, wchar_t); 43 44 static int (*_towide)(wchar_t *, const char *, int) = towide_none; 45 static int (*_tomb)(char *, wchar_t) = tomb_none; 46 static const char *_encoding = "NONE"; 47 static int _nbits = 7; 48 49 /* 50 * Table of supported encodings. We only bother to list the multibyte 51 * encodings here, because single byte locales are handed by "NONE". 52 */ 53 static struct { 54 const char *name; 55 /* the name that the underlying libc implemenation uses */ 56 const char *cname; 57 /* the maximum number of bits required for priorities */ 58 int nbits; 59 int (*towide)(wchar_t *, const char *, int); 60 int (*tomb)(char *, wchar_t); 61 } mb_encodings[] = { 62 /* 63 * UTF8 values max out at 0x1fffff (although in theory there could 64 * be later extensions, but it won't happen.) This means we only need 65 * 21 bits to be able to encode the entire range of priorities. 66 */ 67 { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 68 { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 69 { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 70 { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 71 72 { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 73 { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 74 /* 75 * Becuase the 3-byte form of EUC-JP use the same leading byte, 76 * only 17 bits required to provide unique priorities. (The low 77 * bit of that first byte is set.) By setting this value low, 78 * we can get by with only 3 bytes in the strxfrm expansion. 79 */ 80 { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 81 { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 82 83 { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 84 { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 85 /* 86 * EUC-TW uses 2 bytes most of the time, but 4 bytes if the 87 * high order byte is 0x8E. However, with 4 byte encodings, 88 * the third byte will be A0-B0. So we only need to consider 89 * the lower order 24 bits for collation. 90 */ 91 { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 92 { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 93 94 { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 95 { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 96 { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs }, 97 { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 98 { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 99 100 { "BIG5", "BIG5", 16, towide_big5, tomb_mbs }, 101 { "big5", "BIG5", 16, towide_big5, tomb_mbs }, 102 { "Big5", "BIG5", 16, towide_big5, tomb_mbs }, 103 104 { "GBK", "GBK", 16, towide_gbk, tomb_mbs }, 105 106 /* 107 * GB18030 can get away with just 31 bits. This is because the 108 * high order bit is always set for 4 byte values, and the 109 * at least one of the other bits in that 4 byte value will 110 * be non-zero. 111 */ 112 { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs }, 113 114 /* 115 * This should probably be an aliase for euc-cn, or vice versa. 116 */ 117 { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs }, 118 119 { NULL, NULL }, 120 }; 121 122 static char * 123 show_mb(const char *mb) 124 { 125 static char buf[64]; 126 127 /* ASCII stuff we just print */ 128 if (isascii(*mb) && isgraph(*mb)) { 129 buf[0] = *mb; 130 buf[1] = 0; 131 return (buf); 132 } 133 buf[0] = 0; 134 while (*mb != 0) { 135 char scr[8]; 136 (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); 137 (void) strlcat(buf, scr, sizeof (buf)); 138 mb++; 139 } 140 return (buf); 141 } 142 143 static char *widemsg; 144 145 void 146 werr(const char *fmt, ...) 147 { 148 char *msg; 149 150 va_list va; 151 va_start(va, fmt); 152 (void) vasprintf(&msg, fmt, va); 153 va_end(va); 154 155 free(widemsg); 156 widemsg = msg; 157 } 158 159 /* 160 * This is used for 8-bit encodings. 161 */ 162 int 163 towide_none(wchar_t *c, const char *mb, int n) 164 { 165 if (mb_cur_max != 1) { 166 werr("invalid or unsupported multibyte locale"); 167 return (-1); 168 } 169 if (n < 1) { 170 werr("no character data"); 171 return (-1); 172 } 173 *c = (uint8_t)*mb; 174 return (1); 175 } 176 177 int 178 tomb_none(char *mb, wchar_t wc) 179 { 180 if (mb_cur_max != 1) { 181 werr("invalid or unsupported multibyte locale"); 182 return (-1); 183 } 184 *(uint8_t *)mb = (wc & 0xff); 185 mb[1] = 0; 186 return (1); 187 } 188 189 /* 190 * UTF-8 stores wide characters in UTF-32 form. 191 */ 192 int 193 towide_utf8(wchar_t *wc, const char *mb, int n) 194 { 195 wchar_t c; 196 int nb; 197 int lv; /* lowest legal value */ 198 int i; 199 const uint8_t *s = (const uint8_t *)mb; 200 201 if (n < 1) { 202 werr("no utf8 data"); 203 return (-1); 204 } 205 c = *s; 206 207 if ((c & 0x80) == 0) { 208 /* 7-bit ASCII */ 209 *wc = c; 210 return (1); 211 } else if ((c & 0xe0) == 0xc0) { 212 /* u80-u7ff - two bytes encoded */ 213 nb = 2; 214 lv = 0x80; 215 c &= ~0xe0; 216 } else if ((c & 0xf0) == 0xe0) { 217 /* u800-uffff - three bytes encoded */ 218 nb = 3; 219 lv = 0x800; 220 c &= ~0xf0; 221 } else if ((c & 0xf8) == 0xf0) { 222 /* u1000-u1fffff - four bytes encoded */ 223 nb = 4; 224 lv = 0x1000; 225 c &= ~0xf8; 226 } else { 227 /* 5 and 6 byte encodings are not legal unicode */ 228 werr("utf8 encoding too large (%s)", show_mb(mb)); 229 return (-1); 230 } 231 if (nb > n) { 232 werr("incomplete utf8 sequence (%s)", show_mb(mb)); 233 return (-1); 234 } 235 236 for (i = 1; i < nb; i++) { 237 if (((s[i]) & 0xc0) != 0x80) { 238 werr("illegal utf8 byte (%x)", s[i]); 239 return (-1); 240 } 241 c <<= 6; 242 c |= (s[i] & 0x3f); 243 } 244 245 if (c < lv) { 246 werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); 247 return (-1); 248 } 249 *wc = c; 250 return (nb); 251 } 252 253 int 254 tomb_utf8(char *mb, wchar_t wc) 255 { 256 uint8_t *s = (uint8_t *)mb; 257 uint8_t msk; 258 int cnt; 259 int i; 260 261 if (wc <= 0x7f) { 262 s[0] = wc & 0x7f; 263 s[1] = 0; 264 return (1); 265 } 266 if (wc <= 0x7ff) { 267 cnt = 2; 268 msk = 0xc0; 269 } else if (wc <= 0xffff) { 270 cnt = 3; 271 msk = 0xe0; 272 } else if (wc <= 0x1fffff) { 273 cnt = 4; 274 msk = 0xf0; 275 } else { 276 werr("illegal uf8 char (%x)", wc); 277 return (-1); 278 } 279 for (i = cnt - 1; i; i--) { 280 s[i] = (wc & 0x3f) | 0x80; 281 wc >>= 6; 282 } 283 s[0] = (msk) | wc; 284 s[cnt] = 0; 285 return (cnt); 286 } 287 288 /* 289 * Several encodings share a simplistic dual byte encoding. In these 290 * forms, they all indicate that a two byte sequence is to be used if 291 * the first byte has its high bit set. They all store this simple 292 * encoding as a 16-bit value, although a great many of the possible 293 * code points are not used in most character sets. This gives a possible 294 * set of just over 32,000 valid code points. 295 * 296 * 0x00 - 0x7f - 1 byte encoding 297 * 0x80 - 0x7fff - illegal 298 * 0x8000 - 0xffff - 2 byte encoding 299 */ 300 static int 301 towide_dbcs(wchar_t *wc, const char *mb, int n) 302 { 303 wchar_t c; 304 305 c = *(uint8_t *)mb; 306 307 if (n < 1) { 308 werr("no character data"); 309 return (-1); 310 } 311 if ((c & 0x80) == 0) { 312 /* 7-bit */ 313 *wc = c; 314 return (1); 315 } 316 if (n < 2) { 317 werr("incomplete character sequence (%s)", show_mb(mb)); 318 return (-1); 319 } 320 321 /* Store both bytes as a single 16-bit wide. */ 322 c <<= 8; 323 c |= (uint8_t)(mb[1]); 324 *wc = c; 325 return (2); 326 } 327 328 /* 329 * Most multibyte locales just convert the wide character to the multibyte 330 * form by stripping leading null bytes, and writing the 32-bit quantity 331 * in big-endian order. 332 */ 333 int 334 tomb_mbs(char *mb, wchar_t wc) 335 { 336 uint8_t *s = (uint8_t *)mb; 337 int n = 0, c; 338 339 if ((wc & 0xff000000U) != 0) { 340 n = 4; 341 } else if ((wc & 0x00ff0000U) != 0) { 342 n = 3; 343 } else if ((wc & 0x0000ff00U) != 0) { 344 n = 2; 345 } else { 346 n = 1; 347 } 348 c = n; 349 while (n) { 350 n--; 351 s[n] = wc & 0xff; 352 wc >>= 8; 353 } 354 /* ensure null termination */ 355 s[c] = 0; 356 return (c); 357 } 358 359 360 /* 361 * big5 is a simple dual byte character set. 362 */ 363 int 364 towide_big5(wchar_t *wc, const char *mb, int n) 365 { 366 return (towide_dbcs(wc, mb, n)); 367 } 368 369 /* 370 * GBK encodes wides in the same way that big5 does, the high order 371 * bit of the first byte indicates a double byte character. 372 */ 373 int 374 towide_gbk(wchar_t *wc, const char *mb, int n) 375 { 376 return (towide_dbcs(wc, mb, n)); 377 } 378 379 /* 380 * GB2312 is another DBCS. Its cleaner than others in that the second 381 * byte does not encode ASCII, but it supports characters. 382 */ 383 int 384 towide_gb2312(wchar_t *wc, const char *mb, int n) 385 { 386 return (towide_dbcs(wc, mb, n)); 387 } 388 389 /* 390 * GB18030. This encodes as 8, 16, or 32-bits. 391 * 7-bit values are in 1 byte, 4 byte sequences are used when 392 * the second byte encodes 0x30-39 and all other sequences are 2 bytes. 393 */ 394 int 395 towide_gb18030(wchar_t *wc, const char *mb, int n) 396 { 397 wchar_t c; 398 399 c = *(uint8_t *)mb; 400 401 if (n < 1) { 402 werr("no character data"); 403 return (-1); 404 } 405 if ((c & 0x80) == 0) { 406 /* 7-bit */ 407 *wc = c; 408 return (1); 409 } 410 if (n < 2) { 411 werr("incomplete character sequence (%s)", show_mb(mb)); 412 return (-1); 413 } 414 415 /* pull in the second byte */ 416 c <<= 8; 417 c |= (uint8_t)(mb[1]); 418 419 if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { 420 if (n < 4) { 421 werr("incomplete 4-byte character sequence (%s)", 422 show_mb(mb)); 423 return (-1); 424 } 425 c <<= 8; 426 c |= (uint8_t)(mb[2]); 427 c <<= 8; 428 c |= (uint8_t)(mb[3]); 429 *wc = c; 430 return (4); 431 } 432 433 *wc = c; 434 return (2); 435 } 436 437 /* 438 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it 439 * also has a range of single byte characters above 0x80. (0xa1-0xdf). 440 */ 441 int 442 towide_mskanji(wchar_t *wc, const char *mb, int n) 443 { 444 wchar_t c; 445 446 c = *(uint8_t *)mb; 447 448 if (n < 1) { 449 werr("no character data"); 450 return (-1); 451 } 452 if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { 453 /* 7-bit */ 454 *wc = c; 455 return (-1); 456 } 457 458 if (n < 2) { 459 werr("incomplete character sequence (%s)", show_mb(mb)); 460 return (-1); 461 } 462 463 /* Store both bytes as a single 16-bit wide. */ 464 c <<= 8; 465 c |= (uint8_t)(mb[1]); 466 *wc = c; 467 return (2); 468 } 469 470 /* 471 * EUC forms. EUC encodings are "variable". FreeBSD carries some additional 472 * variable data to encode these, but we're going to treat each as independent 473 * instead. Its the only way we can sensibly move forward. 474 * 475 * Note that the way in which the different EUC forms vary is how wide 476 * CS2 and CS3 are and what the first byte of them is. 477 */ 478 static int 479 towide_euc_impl(wchar_t *wc, const char *mb, int n, 480 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 481 { 482 int i; 483 int width; 484 wchar_t c; 485 486 c = *(uint8_t *)mb; 487 488 if (n < 1) { 489 werr("no character data"); 490 return (-1); 491 } 492 493 /* 494 * All variations of EUC encode 7-bit ASCII as one byte, and use 495 * additional bytes for more than that. 496 */ 497 if ((c & 0x80) == 0) { 498 /* 7-bit */ 499 *wc = c; 500 return (1); 501 } 502 503 /* 504 * All EUC variants reserve 0xa1-0xff to identify CS1, which 505 * is always two bytes wide. Note that unused CS will be zero, 506 * and that cannot be true because we know that the high order 507 * bit must be set. 508 */ 509 if (c >= 0xa1) { 510 width = 2; 511 } else if (c == cs2) { 512 width = cs2width; 513 } else if (c == cs3) { 514 width = cs3width; 515 } 516 517 if (n < width) { 518 werr("incomplete character sequence (%s)", show_mb(mb)); 519 return (-1); 520 } 521 522 for (i = 1; i < width; i++) { 523 /* pull in the next byte */ 524 c <<= 8; 525 c |= (uint8_t)(mb[i]); 526 } 527 528 *wc = c; 529 return (width); 530 } 531 532 /* 533 * EUC-CN encodes as follows: 534 * 535 * Code set 0 (ASCII): 0x21-0x7E 536 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 537 * Code set 2: unused 538 * Code set 3: unused 539 */ 540 int 541 towide_euccn(wchar_t *wc, const char *mb, int n) 542 { 543 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 544 } 545 546 /* 547 * EUC-JP encodes as follows: 548 * 549 * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E 550 * Code set 1 (JIS X 0208): 0xA1A1-0xFEFE 551 * Code set 2 (half-width katakana): 0x8EA1-0x8EDF 552 * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE 553 */ 554 int 555 towide_eucjp(wchar_t *wc, const char *mb, int n) 556 { 557 return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); 558 } 559 560 /* 561 * EUC-KR encodes as follows: 562 * 563 * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E 564 * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE 565 * Code set 2: unused 566 * Code set 3: unused 567 */ 568 int 569 towide_euckr(wchar_t *wc, const char *mb, int n) 570 { 571 return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); 572 } 573 574 /* 575 * EUC-TW encodes as follows: 576 * 577 * Code set 0 (ASCII): 0x21-0x7E 578 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 579 * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 580 * Code set 3: unused 581 */ 582 int 583 towide_euctw(wchar_t *wc, const char *mb, int n) 584 { 585 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 586 } 587 588 /* 589 * Public entry points. 590 */ 591 592 int 593 to_wide(wchar_t *wc, const char *mb) 594 { 595 /* this won't fail hard */ 596 return (_towide(wc, mb, strlen(mb) + 1)); 597 } 598 599 int 600 to_mb(char *mb, wchar_t wc) 601 { 602 int rv; 603 604 if ((rv = _tomb(mb, wc)) < 0) { 605 errf(widemsg); 606 free(widemsg); 607 widemsg = NULL; 608 } 609 return (rv); 610 } 611 612 char * 613 to_mb_string(const wchar_t *wcs) 614 { 615 char *mbs; 616 char *ptr; 617 int len; 618 619 mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); 620 if (mbs == NULL) { 621 errf("out of memory"); 622 return (NULL); 623 } 624 ptr = mbs; 625 while (*wcs) { 626 if ((len = to_mb(ptr, *wcs)) < 0) { 627 INTERR; 628 free(mbs); 629 return (NULL); 630 } 631 wcs++; 632 ptr += len; 633 } 634 *ptr = 0; 635 return (mbs); 636 } 637 638 void 639 set_wide_encoding(const char *encoding) 640 { 641 int i; 642 643 _towide = towide_none; 644 _tomb = tomb_none; 645 _encoding = "NONE"; 646 _nbits = 8; 647 648 for (i = 0; mb_encodings[i].name; i++) { 649 if (strcasecmp(encoding, mb_encodings[i].name) == 0) { 650 _towide = mb_encodings[i].towide; 651 _tomb = mb_encodings[i].tomb; 652 _encoding = mb_encodings[i].cname; 653 _nbits = mb_encodings[i].nbits; 654 break; 655 } 656 } 657 } 658 659 const char * 660 get_wide_encoding(void) 661 { 662 return (_encoding); 663 } 664 665 int 666 max_wide(void) 667 { 668 return ((int)((1U << _nbits) - 1)); 669 } 670