1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #pragma ident "%Z%%M% %I% %E% SMI" 26 27 #include <errno.h> 28 #include <locale.h> 29 #include <langinfo.h> 30 #include <iconv.h> 31 #include <ctype.h> 32 #include <strings.h> 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include "includes.h" 37 #include "xmalloc.h" 38 #include "xlist.h" 39 40 #ifdef MIN 41 #undef MIN 42 #endif /* MIN */ 43 44 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 45 46 #define LOCALE_PATH "/usr/bin/locale" 47 48 /* two-char country code, '-' and two-char region code */ 49 #define LANGTAG_MAX 5 50 51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, 52 uint_t len, uint_t *outlen, int *err, uchar_t **err_str); 53 54 static int locale_cmp(const void *d1, const void *d2); 55 static char *g11n_locale2langtag(char *locale); 56 57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str); 58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str); 59 60 static char * 61 g11n_locale2langtag(char *locale) 62 { 63 char *langtag; 64 65 /* base cases */ 66 if (!locale || !*locale) 67 return (NULL); 68 69 if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0) 70 return ("i-default"); 71 72 /* punt for language codes which are not exactly 2 letters */ 73 if (strlen(locale) < 2 || 74 !isalpha(locale[0]) || 75 !isalpha(locale[1]) || 76 (locale[2] != '\0' && 77 locale[2] != '_' && 78 locale[2] != '.' && 79 locale[2] != '@')) 80 return (NULL); 81 82 83 /* we have a primary language sub-tag */ 84 langtag = (char *)xmalloc(LANGTAG_MAX + 1); 85 86 strncpy(langtag, locale, 2); 87 langtag[2] = '\0'; 88 89 /* do we have country sub-tag? For example: cs_CZ */ 90 if (locale[2] == '_') { 91 if (strlen(locale) < 5 || 92 !isalpha(locale[3]) || 93 !isalpha(locale[4]) || 94 (locale[5] != '\0' && (locale[5] != '.' && 95 locale[5] != '@'))) { 96 return (langtag); 97 } 98 99 /* example: create cs-CZ from cs_CZ */ 100 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2, 101 locale + 3) == 5) 102 return (langtag); 103 } 104 105 /* in all other cases we just use the primary language sub-tag */ 106 return (langtag); 107 } 108 109 uint_t 110 g11n_langtag_is_default(char *langtag) 111 { 112 return (strcmp(langtag, "i-default") == 0); 113 } 114 115 /* 116 * This lang tag / locale matching function works only for two-character 117 * language primary sub-tags and two-character country sub-tags. 118 */ 119 uint_t 120 g11n_langtag_matches_locale(char *langtag, char *locale) 121 { 122 /* match "i-default" to the process' current locale if possible */ 123 if (g11n_langtag_is_default(langtag)) { 124 if (strcasecmp(locale, "POSIX") == 0 || 125 strcasecmp(locale, "C") == 0) 126 return (1); 127 else 128 return (0); 129 } 130 131 /* 132 * locale must be at least 2 chars long and the lang part must be 133 * exactly two characters 134 */ 135 if (strlen(locale) < 2 || 136 (!isalpha(locale[0]) || !isalpha(locale[1]) || 137 (locale[2] != '\0' && locale[2] != '_' && 138 locale[2] != '.' && locale[2] != '@'))) 139 return (0); 140 141 /* same thing with the langtag */ 142 if (strlen(langtag) < 2 || 143 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 144 (langtag[2] != '\0' && langtag[2] != '-'))) 145 return (0); 146 147 /* primary language sub-tag and the locale's language part must match */ 148 if (strncasecmp(langtag, locale, 2) != 0) 149 return (0); 150 151 /* 152 * primary language sub-tag and the locale's language match, now 153 * fuzzy check country part 154 */ 155 156 /* neither langtag nor locale have more than one component */ 157 if (langtag[2] == '\0' && 158 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 159 return (2); 160 161 /* langtag has only one sub-tag... */ 162 if (langtag[2] == '\0') 163 return (1); 164 165 /* locale has no country code... */ 166 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 167 return (1); 168 169 /* langtag has more than one subtag and the locale has a country code */ 170 171 /* ignore second subtag if not two chars */ 172 if (strlen(langtag) < 5) 173 return (1); 174 175 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 176 (langtag[5] != '\0' && langtag[5] != '-')) 177 return (1); 178 179 /* ignore rest of locale if there is no two-character country part */ 180 if (strlen(locale) < 5) 181 return (1); 182 183 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 184 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 185 return (1); 186 187 /* if the country part matches, return 2 */ 188 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 189 return (2); 190 191 return (1); 192 } 193 194 char * 195 g11n_getlocale() 196 { 197 /* we have one text domain - always set it */ 198 (void) textdomain(TEXT_DOMAIN); 199 200 /* if the locale is not set, set it from the env vars */ 201 if (!setlocale(LC_MESSAGES, NULL)) 202 (void) setlocale(LC_MESSAGES, ""); 203 204 return (setlocale(LC_MESSAGES, NULL)); 205 } 206 207 void 208 g11n_setlocale(int category, const char *locale) 209 { 210 char *curr; 211 212 /* we have one text domain - always set it */ 213 (void) textdomain(TEXT_DOMAIN); 214 215 if (!locale) 216 return; 217 218 if (*locale && ((curr = setlocale(category, NULL))) && 219 strcmp(curr, locale) == 0) 220 return; 221 222 /* if <category> is bogus, setlocale() will do nothing */ 223 (void) setlocale(category, locale); 224 } 225 226 char ** 227 g11n_getlocales() 228 { 229 FILE *locale_out; 230 uint_t n_elems, list_size, long_line = 0; 231 char **list; 232 char locale[64]; /* 64 bytes is plenty for locale names */ 233 234 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) 235 return (NULL); 236 237 /* 238 * start with enough room for 65 locales - that's a lot fewer than 239 * all the locales available for installation, but a lot more than 240 * what most users will need and install 241 */ 242 n_elems = 0; 243 list_size = 192; 244 list = (char **) xmalloc(sizeof (char *) * (list_size + 1)); 245 memset(list, 0, sizeof (char *) * (list_size + 1)); 246 247 while (fgets(locale, sizeof (locale), locale_out)) { 248 /* skip long locale names (if any) */ 249 if (!strchr(locale, '\n')) { 250 long_line = 1; 251 continue; 252 } else if (long_line) { 253 long_line = 0; 254 continue; 255 } 256 257 if (strncmp(locale, "iso_8859", 8) == 0) 258 /* ignore locale names like "iso_8859-1" */ 259 continue; 260 261 if (n_elems == list_size) { 262 list_size *= 2; 263 list = (char **)xrealloc((void *) list, 264 (list_size + 1) * sizeof (char *)); 265 memset(&list[n_elems + 1], 0, 266 sizeof (char *) * (list_size - n_elems + 1)); 267 } 268 269 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 270 list[n_elems++] = xstrdup(locale); 271 } 272 273 list[n_elems] = NULL; 274 (void) pclose(locale_out); 275 276 qsort(list, n_elems - 1, sizeof (char *), locale_cmp); 277 return (list); 278 } 279 280 char * 281 g11n_getlangs() 282 { 283 char *locale; 284 285 if (getenv("SSH_LANGS")) 286 return (xstrdup(getenv("SSH_LANGS"))); 287 288 locale = g11n_getlocale(); 289 290 if (!locale || !*locale) 291 return (xstrdup("i-default")); 292 293 return (g11n_locale2langtag(locale)); 294 } 295 296 char * 297 g11n_locales2langs(char **locale_set) 298 { 299 char **p, **r, **q; 300 char *langtag; 301 int locales, skip; 302 303 for (locales = 0, p = locale_set; p && *p; p++) 304 locales++; 305 306 r = (char **)xmalloc((locales + 1) * sizeof (char *)); 307 memset(r, 0, (locales + 1) * sizeof (char *)); 308 309 for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) { 310 skip = 0; 311 if ((langtag = g11n_locale2langtag(*p)) == NULL) 312 continue; 313 for (q = r; (q - r) < locales; q++) { 314 if (!*q) 315 break; 316 if (*q && strcmp(*q, langtag) == 0) 317 skip = 1; 318 } 319 if (!skip) 320 *(q++) = langtag; 321 *q = NULL; 322 } 323 324 return (xjoin(r, ',')); 325 } 326 327 static int 328 sortcmp(const void *d1, const void *d2) 329 { 330 char *s1 = *(char **)d1; 331 char *s2 = *(char **)d2; 332 333 return (strcmp(s1, s2)); 334 } 335 336 int 337 g11n_langtag_match(char *langtag1, char *langtag2) 338 { 339 int len1, len2; 340 char c1, c2; 341 342 len1 = (strchr(langtag1, '-')) ? 343 (strchr(langtag1, '-') - langtag1) 344 : strlen(langtag1); 345 346 len2 = (strchr(langtag2, '-')) ? 347 (strchr(langtag2, '-') - langtag2) 348 : strlen(langtag2); 349 350 /* no match */ 351 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 352 return (0); 353 354 c1 = *(langtag1 + len1); 355 c2 = *(langtag2 + len2); 356 357 /* no country sub-tags - exact match */ 358 if (c1 == '\0' && c2 == '\0') 359 return (2); 360 361 /* one langtag has a country sub-tag, the other doesn't */ 362 if (c1 == '\0' || c2 == '\0') 363 return (1); 364 365 /* can't happen - both langtags have a country sub-tag */ 366 if (c1 != '-' || c2 != '-') 367 return (1); 368 369 /* compare country subtags */ 370 langtag1 = langtag1 + len1 + 1; 371 langtag2 = langtag2 + len2 + 1; 372 373 len1 = (strchr(langtag1, '-')) ? 374 (strchr(langtag1, '-') - langtag1) : strlen(langtag1); 375 376 len2 = (strchr(langtag2, '-')) ? 377 (strchr(langtag2, '-') - langtag2) : strlen(langtag2); 378 379 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 380 return (1); 381 382 /* country tags matched - exact match */ 383 return (2); 384 } 385 386 char * 387 g11n_langtag_set_intersect(char *set1, char *set2) 388 { 389 char **list1, **list2, **list3, **p, **q, **r; 390 char *set3, *lang_subtag; 391 uint_t n1, n2, n3; 392 uint_t do_append; 393 394 list1 = xsplit(set1, ','); 395 list2 = xsplit(set2, ','); 396 397 for (n1 = 0, p = list1; p && *p; p++, n1++) 398 ; 399 for (n2 = 0, p = list2; p && *p; p++, n2++) 400 ; 401 402 list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1)); 403 *list3 = NULL; 404 405 /* 406 * we must not sort the user langtags - sorting or not the server's 407 * should not affect the outcome 408 */ 409 qsort(list2, n2, sizeof (char *), sortcmp); 410 411 for (n3 = 0, p = list1; p && *p; p++) { 412 do_append = 0; 413 for (q = list2; q && *q; q++) { 414 if (g11n_langtag_match(*p, *q) != 2) continue; 415 /* append element */ 416 for (r = list3; (r - list3) <= (n1 + n2); r++) { 417 do_append = 1; 418 if (!*r) 419 break; 420 if (strcmp(*p, *r) == 0) { 421 do_append = 0; 422 break; 423 } 424 } 425 if (do_append && n3 <= (n1 + n2)) { 426 list3[n3++] = xstrdup(*p); 427 list3[n3] = NULL; 428 } 429 } 430 } 431 432 for (p = list1; p && *p; p++) { 433 do_append = 0; 434 for (q = list2; q && *q; q++) { 435 if (g11n_langtag_match(*p, *q) != 1) 436 continue; 437 438 /* append element */ 439 lang_subtag = xstrdup(*p); 440 if (strchr(lang_subtag, '-')) 441 *(strchr(lang_subtag, '-')) = '\0'; 442 for (r = list3; (r - list3) <= (n1 + n2); r++) { 443 do_append = 1; 444 if (!*r) 445 break; 446 if (strcmp(lang_subtag, *r) == 0) { 447 do_append = 0; 448 break; 449 } 450 } 451 if (do_append && n3 <= (n1 + n2)) { 452 list3[n3++] = lang_subtag; 453 list3[n3] = NULL; 454 } else 455 xfree(lang_subtag); 456 } 457 } 458 459 set3 = xjoin(list3, ','); 460 xfree_split_list(list1); 461 xfree_split_list(list2); 462 xfree_split_list(list3); 463 464 return (set3); 465 } 466 467 char * 468 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 469 { 470 char *list, *result; 471 char **xlist; 472 473 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 474 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 475 476 if (!list) 477 return (NULL); 478 479 xlist = xsplit(list, ','); 480 481 xfree(list); 482 483 if (!xlist || !*xlist) 484 return (NULL); 485 486 result = xstrdup(*xlist); 487 xfree_split_list(xlist); 488 489 return (result); 490 } 491 492 /* 493 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 494 * a stright strcmp() 495 */ 496 static int 497 locale_cmp(const void *d1, const void *d2) 498 { 499 char *dot_ptr; 500 char *s1 = *(char **)d1; 501 char *s2 = *(char **)d2; 502 int s1_is_utf8 = 0; 503 int s2_is_utf8 = 0; 504 505 /* check if s1 is a UTF-8 locale */ 506 if (((dot_ptr = strchr((char *)s1, '.')) != NULL) && 507 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 508 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 509 s1_is_utf8++; 510 } 511 512 /* check if s2 is a UTF-8 locale */ 513 if (((dot_ptr = strchr((char *)s2, '.')) != NULL) && 514 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 515 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 516 s2_is_utf8++; 517 } 518 519 /* prefer UTF-8 locales */ 520 if (s1_is_utf8 && !s2_is_utf8) 521 return (-1); 522 523 if (s2_is_utf8 && !s1_is_utf8) 524 return (1); 525 526 /* prefer any locale over the default locales */ 527 if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 || 528 strcmp(s1, "common") == 0) { 529 if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 && 530 strcmp(s2, "common") != 0) 531 return (1); 532 } 533 534 if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 || 535 strcmp(s2, "common") == 0) { 536 if (strcmp(s1, "C") != 0 && 537 strcmp(s1, "POSIX") != 0 && 538 strcmp(s1, "common") != 0) 539 return (-1); 540 } 541 542 return (strcmp(s1, s2)); 543 } 544 545 546 char ** 547 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set) 548 { 549 char **langtag_list, **result, **p, **q, **r; 550 char *s; 551 uint_t do_append, n_langtags, n_locales, n_results, max_results; 552 553 /* count lang tags and locales */ 554 for (n_locales = 0, p = locale_set; p && *p; p++) 555 n_locales++; 556 557 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 558 /* count the number of langtags */ 559 for (; s = strchr(s, ','); s++, n_langtags++) 560 ; 561 562 qsort(locale_set, n_locales, sizeof (char *), locale_cmp); 563 564 langtag_list = xsplit(langtag_set, ','); 565 for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++) 566 ; 567 568 max_results = MIN(n_locales, n_langtags) * 2; 569 result = (char **) xmalloc(sizeof (char *) * (max_results + 1)); 570 *result = NULL; 571 n_results = 0; 572 573 /* more specific matches first */ 574 for (p = langtag_list; p && *p; p++) { 575 do_append = 0; 576 for (q = locale_set; q && *q; q++) { 577 if (g11n_langtag_matches_locale(*p, *q) == 2) { 578 do_append = 1; 579 for (r = result; (r - result) <= 580 MIN(n_locales, n_langtags); r++) { 581 if (!*r) 582 break; 583 if (strcmp(*q, *r) == 0) { 584 do_append = 0; 585 break; 586 } 587 } 588 if (do_append && n_results < max_results) { 589 result[n_results++] = xstrdup(*q); 590 result[n_results] = NULL; 591 } 592 break; 593 } 594 } 595 } 596 597 for (p = langtag_list; p && *p; p++) { 598 do_append = 0; 599 for (q = locale_set; q && *q; q++) { 600 if (g11n_langtag_matches_locale(*p, *q) == 1) { 601 do_append = 1; 602 for (r = result; (r - result) <= 603 MIN(n_locales, n_langtags); r++) { 604 if (!*r) 605 break; 606 if (strcmp(*q, *r) == 0) { 607 do_append = 0; 608 break; 609 } 610 } 611 if (do_append && n_results < max_results) { 612 result[n_results++] = xstrdup(*q); 613 result[n_results] = NULL; 614 } 615 break; 616 } 617 } 618 } 619 620 xfree_split_list(langtag_list); 621 622 return (result); 623 } 624 625 char * 626 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 627 { 628 char **results, *result = NULL; 629 630 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 631 srvr_locales ? srvr_locales : g11n_getlocales())) == NULL) 632 return (NULL); 633 634 if (*results != NULL) 635 result = xstrdup(*results); 636 637 xfree_split_list(results); 638 639 return (result); 640 } 641 642 643 /* 644 * Functions for validating ASCII and UTF-8 strings 645 * 646 * The error_str parameter is an optional pointer to a char variable 647 * where to store a string suitable for use with error() or fatal() or 648 * friends. 649 * 650 * The return value is 0 if success, EILSEQ or EINVAL. 651 * 652 */ 653 uint_t 654 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str) 655 { 656 uchar_t *p; 657 658 for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++) 659 ; 660 661 if (len && ((p - (uchar_t *)str) != len)) 662 return (EILSEQ); 663 664 return (0); 665 } 666 667 uint_t 668 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str) 669 { 670 uchar_t *p; 671 uint_t c, l; 672 673 if (len == 0) 674 len = strlen((const char *)str); 675 676 for (p = (uchar_t *)str; p && (p - str < len) && *p; ) { 677 /* 8-bit chars begin a UTF-8 sequence */ 678 if (*p & 0x80) { 679 /* get sequence length and sanity check first byte */ 680 if (*p < 0xc0) 681 return (EILSEQ); 682 else if (*p < 0xe0) 683 l = 2; 684 else if (*p < 0xf0) 685 l = 3; 686 else if (*p < 0xf8) 687 l = 4; 688 else if (*p < 0xfc) 689 l = 5; 690 else if (*p < 0xfe) 691 l = 6; 692 else 693 return (EILSEQ); 694 695 if ((p + l - str) >= len) 696 return (EILSEQ); 697 698 /* overlong detection - build codepoint */ 699 c = *p & 0x3f; 700 /* shift c bits from first byte */ 701 c = c << (6 * (l - 1)); 702 703 if (l > 1) { 704 if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80)) 705 c = c | ((*(p + 1) & 0x3f) << 706 (6 * (l - 2))); 707 else 708 return (EILSEQ); 709 710 if (c < 0x80) 711 return (EILSEQ); 712 } 713 714 if (l > 2) { 715 if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80)) 716 c = c | ((*(p + 2) & 0x3f) << 717 (6 * (l - 3))); 718 else 719 return (EILSEQ); 720 721 if (c < 0x800) 722 return (EILSEQ); 723 } 724 725 if (l > 3) { 726 if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80)) 727 c = c | ((*(p + 3) & 0x3f) << 728 (6 * (l - 4))); 729 else 730 return (EILSEQ); 731 732 if (c < 0x10000) 733 return (EILSEQ); 734 } 735 736 if (l > 4) { 737 if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80)) 738 c = c | ((*(p + 4) & 0x3f) << 739 (6 * (l - 5))); 740 else 741 return (EILSEQ); 742 743 if (c < 0x200000) 744 return (EILSEQ); 745 } 746 747 if (l > 5) { 748 if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80)) 749 c = c | (*(p + 5) & 0x3f); 750 else 751 return (EILSEQ); 752 753 if (c < 0x4000000) 754 return (EILSEQ); 755 } 756 757 /* 758 * check for UTF-16 surrogates ifs other illegal 759 * UTF-8 * points 760 */ 761 if (((c <= 0xdfff) && (c >= 0xd800)) || 762 (c == 0xfffe) || (c == 0xffff)) 763 return (EILSEQ); 764 p += l; 765 } 766 /* 7-bit chars are fine */ 767 else 768 p++; 769 } 770 return (0); 771 } 772 773 /* 774 * Functions for converting to ASCII or UTF-8 from the local codeset 775 * Functions for converting from ASCII or UTF-8 to the local codeset 776 * 777 * The error_str parameter is an optional pointer to a char variable 778 * where to store a string suitable for use with error() or fatal() or 779 * friends. 780 * 781 * The err parameter is an optional pointer to an integer where 0 782 * (success) or EILSEQ or EINVAL will be stored (failure). 783 * 784 * These functions return NULL if the conversion fails. 785 * 786 */ 787 uchar_t * 788 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str) 789 { 790 static uint_t initialized = 0; 791 static uint_t do_convert = 0; 792 iconv_t cd; 793 int err; 794 795 if (!initialized) { 796 /* 797 * iconv_open() fails if the to/from codesets are the 798 * same, and there are aliases of codesets to boot... 799 */ 800 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 801 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 802 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 803 initialized = 1; 804 do_convert = 0; 805 } else { 806 cd = iconv_open(nl_langinfo(CODESET), "646"); 807 if (cd == (iconv_t)-1) { 808 if (err_ptr) 809 *err_ptr = errno; 810 if (error_str) 811 *error_str = (uchar_t *)"Cannot " 812 "convert ASCII strings to the local" 813 " codeset"; 814 } 815 initialized = 1; 816 do_convert = 1; 817 } 818 } 819 820 if (!do_convert) { 821 if ((err = g11n_validate_ascii(str, 0, error_str))) { 822 if (err_ptr) 823 *err_ptr = err; 824 return (NULL); 825 } else 826 return ((uchar_t *)xstrdup(str)); 827 } 828 829 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 830 } 831 832 uchar_t * 833 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 834 { 835 static uint_t initialized = 0; 836 static uint_t do_convert = 0; 837 iconv_t cd; 838 int err; 839 840 if (!initialized) { 841 /* 842 * iconv_open() fails if the to/from codesets are the 843 * same, and there are aliases of codesets to boot... 844 */ 845 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 846 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 847 initialized = 1; 848 do_convert = 0; 849 } else { 850 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 851 if (cd == (iconv_t)-1) { 852 if (err_ptr) 853 *err_ptr = errno; 854 if (error_str) 855 *error_str = (uchar_t *)"Cannot " 856 "convert UTF-8 strings to the " 857 "local codeset"; 858 } 859 initialized = 1; 860 do_convert = 1; 861 } 862 } 863 864 if (!do_convert) { 865 if ((err = g11n_validate_utf8(str, 0, error_str))) { 866 if (err_ptr) 867 *err_ptr = err; 868 return (NULL); 869 } else 870 return ((uchar_t *)xstrdup((char *)str)); 871 } 872 873 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 874 } 875 876 char * 877 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str) 878 { 879 static uint_t initialized = 0; 880 static uint_t do_convert = 0; 881 iconv_t cd; 882 883 if (!initialized) { 884 /* 885 * iconv_open() fails if the to/from codesets are the 886 * same, and there are aliases of codesets to boot... 887 */ 888 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 889 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 890 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 891 initialized = 1; 892 do_convert = 0; 893 } else { 894 cd = iconv_open("646", nl_langinfo(CODESET)); 895 if (cd == (iconv_t)-1) { 896 if (err_ptr) 897 *err_ptr = errno; 898 if (error_str) 899 *error_str = (uchar_t *)"Cannot " 900 "convert UTF-8 strings to the " 901 "local codeset"; 902 } 903 initialized = 1; 904 do_convert = 1; 905 } 906 } 907 908 if (!do_convert) 909 return (xstrdup((char *)str)); 910 911 return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 912 } 913 914 uchar_t * 915 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 916 { 917 static uint_t initialized = 0; 918 static uint_t do_convert = 0; 919 iconv_t cd; 920 921 if (!initialized) { 922 /* 923 * iconv_open() fails if the to/from codesets are the 924 * same, and there are aliases of codesets to boot... 925 */ 926 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 927 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 928 initialized = 1; 929 do_convert = 0; 930 } else { 931 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 932 if (cd == (iconv_t)-1) { 933 if (err_ptr) 934 *err_ptr = errno; 935 if (error_str) 936 *error_str = (uchar_t *)"Cannot " 937 "convert UTF-8 strings to the " 938 "local codeset"; 939 } 940 initialized = 1; 941 do_convert = 1; 942 } 943 } 944 945 if (!do_convert) 946 return ((uchar_t *)xstrdup((char *)str)); 947 948 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 949 } 950 951 952 /* 953 * Wrapper around iconv() 954 * 955 * The caller is responsible for freeing the result and for handling 956 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 957 */ 958 static uchar_t * 959 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len, 960 uint_t *outlen, int *err, uchar_t **err_str) 961 { 962 size_t inbytesleft, outbytesleft, converted_size; 963 char *outbuf; 964 uchar_t *converted; 965 const char *inbuf; 966 uint_t mul = 0; 967 968 if (!buf || !(*(char *)buf)) 969 return (NULL); 970 971 if (len == 0) 972 len = strlen(buf); 973 974 /* reset conversion descriptor */ 975 /* XXX Do we need initial shift sequences for UTF-8??? */ 976 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 977 inbuf = (const char *) buf; 978 979 if (mul_ptr) 980 mul = *mul_ptr; 981 982 converted_size = (len << mul); 983 outbuf = (char *)xmalloc(converted_size + 1); /* for null */ 984 converted = (uchar_t *)outbuf; 985 outbytesleft = len; 986 987 do { 988 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 989 (size_t)-1) { 990 if (errno == E2BIG) { 991 /* UTF-8 codepoints are at most 8 bytes long */ 992 if (mul > 2) { 993 if (err_str) 994 *err_str = (uchar_t *) 995 "Conversion to UTF-8 failed" 996 " due to preposterous space" 997 " requirements"; 998 if (err) 999 *err = EILSEQ; 1000 return (NULL); 1001 } 1002 1003 /* 1004 * re-alloc output and ensure that the outbuf 1005 * and outbytesleft values are adjusted 1006 */ 1007 converted = xrealloc(converted, 1008 converted_size << 1 + 1); 1009 outbuf = (char *)converted + converted_size - 1010 outbytesleft; 1011 converted_size = (len << ++(mul)); 1012 outbytesleft = converted_size - outbytesleft; 1013 } else { 1014 /* 1015 * let the caller deal with iconv() errors, 1016 * probably by calling fatal(); xfree() does 1017 * not set errno 1018 */ 1019 if (err) 1020 *err = errno; 1021 xfree(converted); 1022 return (NULL); 1023 } 1024 } 1025 } while (inbytesleft); 1026 1027 *outbuf = '\0'; /* ensure null-termination */ 1028 if (outlen) 1029 *outlen = converted_size - outbytesleft; 1030 if (mul_ptr) 1031 *mul_ptr = mul; 1032 1033 return (converted); 1034 } 1035