1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms version 1.0 5 * of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * The functions in this file convert from the standard multibyte forms 18 * to the wide character forms used internally by libc. Unfortunately, 19 * this approach means that we need a method for each and every encoding. 20 */ 21 22 #include <stdlib.h> 23 #include <wchar.h> 24 #include <string.h> 25 #include <sys/types.h> 26 #include "localedef.h" 27 28 static int towide_none(wchar_t *, const char *, int); 29 static int towide_utf8(wchar_t *, const char *, int); 30 static int towide_big5(wchar_t *, const char *, int); 31 static int towide_gbk(wchar_t *, const char *, int); 32 static int towide_gb2312(wchar_t *, const char *, int); 33 static int towide_gb18030(wchar_t *, const char *, int); 34 static int towide_mskanji(wchar_t *, const char *, int); 35 static int towide_euccn(wchar_t *, const char *, int); 36 static int towide_eucjp(wchar_t *, const char *, int); 37 static int towide_euckr(wchar_t *, const char *, int); 38 static int towide_euctw(wchar_t *, const char *, int); 39 40 static int tomb_none(char *, wchar_t); 41 static int tomb_utf8(char *, wchar_t); 42 static int tomb_mbs(char *, wchar_t); 43 44 static int (*_towide)(wchar_t *, const char *, int) = towide_none; 45 static int (*_tomb)(char *, wchar_t) = tomb_none; 46 static const char *_encoding = "NONE"; 47 48 /* 49 * Table of supported encodings. We only bother to list the multibyte 50 * encodings here, because single byte locales are handed by "NONE". 51 */ 52 static struct { 53 const char *name; 54 /* the name that the underlying libc implemenation uses */ 55 const char *cname; 56 int (*towide)(wchar_t *, const char *, int); 57 int (*tomb)(char *, wchar_t); 58 } mb_encodings[] = { 59 { "UTF-8", "UTF-8", towide_utf8, tomb_utf8 }, 60 { "UTF8", "UTF-8", towide_utf8, tomb_utf8 }, 61 { "utf8", "UTF-8", towide_utf8, tomb_utf8 }, 62 { "utf-8", "UTF-8", towide_utf8, tomb_utf8 }, 63 64 { "EUC-CN", "EUC-CN", towide_euccn, tomb_mbs }, 65 { "eucCN", "EUC-CN", towide_euccn, tomb_mbs }, 66 67 { "EUC-JP", "EUC-JP", towide_eucjp, tomb_mbs }, 68 { "eucJP", "EUC-JP", towide_eucjp, tomb_mbs }, 69 70 { "EUC-KR", "EUC-KR", towide_euckr, tomb_mbs }, 71 { "eucKR", "EUC-KR", towide_euckr, tomb_mbs }, 72 73 { "EUC-TW", "EUC-TW", towide_euctw, tomb_mbs }, 74 { "eucTW", "EUC-TW", towide_euctw, tomb_mbs }, 75 76 { "MS_Kanji", "MSKanji", towide_mskanji, tomb_mbs }, 77 { "MSKanji", "MSKanji", towide_mskanji, tomb_mbs }, 78 { "PCK", "MSKanji", towide_mskanji, tomb_mbs }, 79 { "SJIS", "MSKanji", towide_mskanji, tomb_mbs }, 80 { "Shift_JIS", "MSKanji", towide_mskanji, tomb_mbs }, 81 82 { "BIG5", "BIG5", towide_big5, tomb_mbs }, 83 { "big5", "BIG5", towide_big5, tomb_mbs }, 84 { "Big5", "BIG5", towide_big5, tomb_mbs }, 85 86 { "GBK", "GBK", towide_gbk, tomb_mbs }, 87 88 { "GB18030", "GB18030", towide_gb18030, tomb_mbs }, 89 90 { "GB2312", "GB2312", towide_gb2312, tomb_mbs }, 91 92 { "ASCII", "ASCII", towide_none, tomb_none }, 93 { "US-ASCII", "ASCII", towide_none, tomb_none }, 94 { "646", "ASCII", towide_none, tomb_none }, 95 96 { NULL, NULL }, 97 }; 98 99 static char * 100 show_mb(const char *mb) 101 { 102 static char buf[64]; 103 104 /* ASCII stuff we just print */ 105 if (isascii(*mb) && isgraph(*mb)) { 106 buf[0] = *mb; 107 buf[1] = 0; 108 return (buf); 109 } 110 buf[0] = 0; 111 while (*mb != 0) { 112 char scr[8]; 113 (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); 114 (void) strlcat(buf, scr, sizeof (buf)); 115 mb++; 116 } 117 return (buf); 118 } 119 120 static char *widemsg; 121 122 void 123 werr(const char *fmt, ...) 124 { 125 char *msg; 126 127 va_list va; 128 va_start(va, fmt); 129 (void) vasprintf(&msg, fmt, va); 130 va_end(va); 131 132 free(widemsg); 133 widemsg = msg; 134 } 135 136 /* 137 * This is used for 8-bit encodings. 138 */ 139 int 140 towide_none(wchar_t *c, const char *mb, int n) 141 { 142 if (mb_cur_max != 1) { 143 werr("invalid or unsupported multibyte locale"); 144 return (-1); 145 } 146 if (n < 1) { 147 werr("no character data"); 148 return (-1); 149 } 150 *c = (uint8_t)*mb; 151 return (1); 152 } 153 154 int 155 tomb_none(char *mb, wchar_t wc) 156 { 157 if (mb_cur_max != 1) { 158 werr("invalid or unsupported multibyte locale"); 159 return (-1); 160 } 161 *(uint8_t *)mb = (wc & 0xff); 162 mb[1] = 0; 163 return (1); 164 } 165 166 /* 167 * UTF-8 stores wide characters in UTF-32 form. 168 */ 169 int 170 towide_utf8(wchar_t *wc, const char *mb, int n) 171 { 172 wchar_t c; 173 int nb; 174 int lv; /* lowest legal value */ 175 int i; 176 const uint8_t *s = (const uint8_t *)mb; 177 178 if (n < 1) { 179 werr("no utf8 data"); 180 return (-1); 181 } 182 c = *s; 183 184 if ((c & 0x80) == 0) { 185 /* 7-bit ASCII */ 186 *wc = c; 187 return (1); 188 } else if ((c & 0xe0) == 0xc0) { 189 /* u80-u7ff - two bytes encoded */ 190 nb = 2; 191 lv = 0x80; 192 c &= ~0xe0; 193 } else if ((c & 0xf0) == 0xe0) { 194 /* u800-uffff - three bytes encoded */ 195 nb = 3; 196 lv = 0x800; 197 c &= ~0xf0; 198 } else if ((c & 0xf8) == 0xf0) { 199 /* u1000-u1fffff - four bytes encoded */ 200 nb = 4; 201 lv = 0x1000; 202 c &= ~0xf8; 203 } else { 204 /* 5 and 6 byte encodings are not legal unicode */ 205 werr("utf8 encoding too large (%s)", show_mb(mb)); 206 return (-1); 207 } 208 if (nb > n) { 209 werr("incomplete utf8 sequence (%s)", show_mb(mb)); 210 return (-1); 211 } 212 213 for (i = 1; i < nb; i++) { 214 if (((s[i]) & 0xc0) != 0x80) { 215 werr("illegal utf8 byte (%x)", s[i]); 216 return (-1); 217 } 218 c <<= 6; 219 c |= (s[i] & 0x3f); 220 } 221 222 if (c < lv) { 223 werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); 224 return (-1); 225 } 226 *wc = c; 227 return (nb); 228 } 229 230 int 231 tomb_utf8(char *mb, wchar_t wc) 232 { 233 uint8_t *s = (uint8_t *)mb; 234 uint8_t msk; 235 int cnt; 236 int i; 237 238 if (wc <= 0x7f) { 239 s[0] = wc & 0x7f; 240 s[1] = 0; 241 return (1); 242 } 243 if (wc <= 0x7ff) { 244 cnt = 2; 245 msk = 0xc0; 246 } else if (wc <= 0xffff) { 247 cnt = 3; 248 msk = 0xe0; 249 } else if (wc <= 0x1fffff) { 250 cnt = 4; 251 msk = 0xf0; 252 } else { 253 werr("illegal uf8 char (%x)", wc); 254 return (-1); 255 } 256 for (i = cnt - 1; i; i--) { 257 s[i] = (wc & 0x3f) | 0x80; 258 wc >>= 6; 259 } 260 s[0] = (msk) | wc; 261 s[cnt] = 0; 262 return (cnt); 263 } 264 265 /* 266 * Several encodings share a simplistic dual byte encoding. In these 267 * forms, they all indicate that a two byte sequence is to be used if 268 * the first byte has its high bit set. They all store this simple 269 * encoding as a 16-bit value, although a great many of the possible 270 * code points are not used in most character sets. This gives a possible 271 * set of just over 32,000 valid code points. 272 * 273 * 0x00 - 0x7f - 1 byte encoding 274 * 0x80 - 0x7fff - illegal 275 * 0x8000 - 0xffff - 2 byte encoding 276 */ 277 static int 278 towide_dbcs(wchar_t *wc, const char *mb, int n) 279 { 280 wchar_t c; 281 282 c = *(uint8_t *)mb; 283 284 if (n < 1) { 285 werr("no character data"); 286 return (-1); 287 } 288 if ((c & 0x80) == 0) { 289 /* 7-bit */ 290 *wc = c; 291 return (1); 292 } 293 if (n < 2) { 294 werr("incomplete character sequence (%s)", show_mb(mb)); 295 return (-1); 296 } 297 298 /* Store both bytes as a single 16-bit wide. */ 299 c <<= 8; 300 c |= (uint8_t)(mb[1]); 301 *wc = c; 302 return (2); 303 } 304 305 /* 306 * Most multibyte locales just convert the wide character to the multibyte 307 * form by stripping leading null bytes, and writing the 32-bit quantity 308 * in big-endian order. 309 */ 310 int 311 tomb_mbs(char *mb, wchar_t wc) 312 { 313 uint8_t *s = (uint8_t *)mb; 314 int n = 0, c; 315 316 if ((wc & 0xff000000U) != 0) { 317 n = 4; 318 } else if ((wc & 0x00ff0000U) != 0) { 319 n = 3; 320 } else if ((wc & 0x0000ff00U) != 0) { 321 n = 2; 322 } else { 323 n = 1; 324 } 325 c = n; 326 while (n) { 327 n--; 328 s[n] = wc & 0xff; 329 wc >>= 8; 330 } 331 /* ensure null termination */ 332 s[c] = 0; 333 return (c); 334 } 335 336 337 /* 338 * big5 is a simple dual byte character set. 339 */ 340 int 341 towide_big5(wchar_t *wc, const char *mb, int n) 342 { 343 return (towide_dbcs(wc, mb, n)); 344 } 345 346 /* 347 * GBK encodes wides in the same way that big5 does, the high order 348 * bit of the first byte indicates a double byte character. 349 */ 350 int 351 towide_gbk(wchar_t *wc, const char *mb, int n) 352 { 353 return (towide_dbcs(wc, mb, n)); 354 } 355 356 /* 357 * GB2312 is another DBCS. Its cleaner than others in that the second 358 * byte does not encode ASCII, but it supports characters. 359 */ 360 int 361 towide_gb2312(wchar_t *wc, const char *mb, int n) 362 { 363 return (towide_dbcs(wc, mb, n)); 364 } 365 366 /* 367 * GB18030. This encodes as 8, 16, or 32-bits. 368 * 7-bit values are in 1 byte, 4 byte sequences are used when 369 * the second byte encodes 0x30-39 and all other sequences are 2 bytes. 370 */ 371 int 372 towide_gb18030(wchar_t *wc, const char *mb, int n) 373 { 374 wchar_t c; 375 376 c = *(uint8_t *)mb; 377 378 if (n < 1) { 379 werr("no character data"); 380 return (-1); 381 } 382 if ((c & 0x80) == 0) { 383 /* 7-bit */ 384 *wc = c; 385 return (1); 386 } 387 if (n < 2) { 388 werr("incomplete character sequence (%s)", show_mb(mb)); 389 return (-1); 390 } 391 392 /* pull in the second byte */ 393 c <<= 8; 394 c |= (uint8_t)(mb[1]); 395 396 if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { 397 if (n < 4) { 398 werr("incomplete 4-byte character sequence (%s)", 399 show_mb(mb)); 400 return (-1); 401 } 402 c <<= 8; 403 c |= (uint8_t)(mb[2]); 404 c <<= 8; 405 c |= (uint8_t)(mb[3]); 406 *wc = c; 407 return (4); 408 } 409 410 *wc = c; 411 return (2); 412 } 413 414 /* 415 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it 416 * also has a range of single byte characters above 0x80. (0xa1-0xdf). 417 */ 418 int 419 towide_mskanji(wchar_t *wc, const char *mb, int n) 420 { 421 wchar_t c; 422 423 c = *(uint8_t *)mb; 424 425 if (n < 1) { 426 werr("no character data"); 427 return (-1); 428 } 429 if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { 430 /* 7-bit */ 431 *wc = c; 432 return (-1); 433 } 434 435 if (n < 2) { 436 werr("incomplete character sequence (%s)", show_mb(mb)); 437 return (-1); 438 } 439 440 /* Store both bytes as a single 16-bit wide. */ 441 c <<= 8; 442 c |= (uint8_t)(mb[1]); 443 *wc = c; 444 return (2); 445 } 446 447 /* 448 * EUC forms. EUC encodings are "variable". FreeBSD carries some additional 449 * variable data to encode these, but we're going to treat each as independent 450 * instead. Its the only way we can sensibly move forward. 451 * 452 * Note that the way in which the different EUC forms vary is how wide 453 * CS2 and CS3 are and what the first byte of them is. 454 */ 455 static int 456 towide_euc_impl(wchar_t *wc, const char *mb, int n, 457 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 458 { 459 int i; 460 int width; 461 wchar_t c; 462 463 c = *(uint8_t *)mb; 464 465 if (n < 1) { 466 werr("no character data"); 467 return (-1); 468 } 469 470 /* 471 * All variations of EUC encode 7-bit ASCII as one byte, and use 472 * additional bytes for more than that. 473 */ 474 if ((c & 0x80) == 0) { 475 /* 7-bit */ 476 *wc = c; 477 return (1); 478 } 479 480 /* 481 * All EUC variants reserve 0xa1-0xff to identify CS1, which 482 * is always two bytes wide. Note that unused CS will be zero, 483 * and that cannot be true because we know that the high order 484 * bit must be set. 485 */ 486 if (c >= 0xa1) { 487 width = 2; 488 } else if (c == cs2) { 489 width = cs2width; 490 } else if (c == cs3) { 491 width = cs3width; 492 } 493 494 if (n < width) { 495 werr("incomplete character sequence (%s)", show_mb(mb)); 496 return (-1); 497 } 498 499 for (i = 1; i < width; i++) { 500 /* pull in the next byte */ 501 c <<= 8; 502 c |= (uint8_t)(mb[i]); 503 } 504 505 *wc = c; 506 return (width); 507 } 508 509 /* 510 * EUC-CN encodes as follows: 511 * 512 * Code set 0 (ASCII): 0x21-0x7E 513 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 514 * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 515 * Code set 3: unused 516 */ 517 int 518 towide_euccn(wchar_t *wc, const char *mb, int n) 519 { 520 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 521 } 522 523 /* 524 * EUC-JP encodes as follows: 525 * 526 * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E 527 * Code set 1 (JIS X 0208): 0xA1A1-0xFEFE 528 * Code set 2 (half-width katakana): 0x8EA1-0x8EDF 529 * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE 530 */ 531 int 532 towide_eucjp(wchar_t *wc, const char *mb, int n) 533 { 534 return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); 535 } 536 537 /* 538 * EUC-KR encodes as follows: 539 * 540 * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E 541 * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE 542 * Code set 2: unused 543 * Code set 3: unused 544 */ 545 int 546 towide_euckr(wchar_t *wc, const char *mb, int n) 547 { 548 return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); 549 } 550 551 /* 552 * EUC-TW encodes as follows: 553 * 554 * Code set 0 (ASCII): 0x21-0x7E 555 * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 556 * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 557 * Code set 3: unused 558 */ 559 int 560 towide_euctw(wchar_t *wc, const char *mb, int n) 561 { 562 return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 563 } 564 565 /* 566 * Public entry points. 567 */ 568 569 int 570 to_wide(wchar_t *wc, const char *mb) 571 { 572 /* this won't fail hard */ 573 return (_towide(wc, mb, strlen(mb) + 1)); 574 } 575 576 int 577 to_mb(char *mb, wchar_t wc) 578 { 579 int rv; 580 581 if ((rv = _tomb(mb, wc)) < 0) { 582 errf(widemsg); 583 free(widemsg); 584 widemsg = NULL; 585 } 586 return (rv); 587 } 588 589 char * 590 to_mb_string(const wchar_t *wcs) 591 { 592 char *mbs; 593 char *ptr; 594 int len; 595 596 mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); 597 if (mbs == NULL) { 598 errf("out of memory"); 599 return (NULL); 600 } 601 ptr = mbs; 602 while (*wcs) { 603 if ((len = to_mb(ptr, *wcs)) < 0) { 604 INTERR; 605 free(mbs); 606 return (NULL); 607 } 608 wcs++; 609 ptr += len; 610 } 611 *ptr = 0; 612 return (mbs); 613 } 614 615 void 616 set_wide_encoding(const char *encoding) 617 { 618 int i; 619 620 _towide = towide_none; 621 _tomb = tomb_none; 622 _encoding = "NONE"; 623 624 for (i = 0; mb_encodings[i].name; i++) { 625 if (strcasecmp(encoding, mb_encodings[i].name) == 0) { 626 _towide = mb_encodings[i].towide; 627 _tomb = mb_encodings[i].tomb; 628 _encoding = mb_encodings[i].cname; 629 } 630 } 631 } 632 633 const char * 634 get_wide_encoding(void) 635 { 636 return (_encoding); 637 } 638