1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #pragma ident "%Z%%M% %I% %E% SMI" 26 27 #include <errno.h> 28 #include <locale.h> 29 #include <langinfo.h> 30 #include <iconv.h> 31 #include <ctype.h> 32 #include <strings.h> 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include "includes.h" 37 #include "xmalloc.h" 38 #include "xlist.h" 39 40 #ifdef MIN 41 #undef MIN 42 #endif /* MIN */ 43 44 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 45 46 #define LOCALE_PATH "/usr/bin/locale" 47 48 /* two-char country code, '-' and two-char region code */ 49 #define LANGTAG_MAX 5 50 51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, 52 uint_t len, uint_t *outlen, int *err, uchar_t **err_str); 53 54 static int locale_cmp(const void *d1, const void *d2); 55 static char *g11n_locale2langtag(char *locale); 56 57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str); 58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str); 59 60 static char * 61 g11n_locale2langtag(char *locale) 62 { 63 char *langtag; 64 65 /* base cases */ 66 if (!locale || !*locale) 67 return (NULL); 68 69 if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0) 70 return ("i-default"); 71 72 /* punt for language codes which are not exactly 2 letters */ 73 if (strlen(locale) < 2 || 74 !isalpha(locale[0]) || 75 !isalpha(locale[1]) || 76 (locale[2] != '\0' && 77 locale[2] != '_' && 78 locale[2] != '.' && 79 locale[2] != '@')) 80 return (NULL); 81 82 83 /* we have a primary language sub-tag */ 84 langtag = (char *)xmalloc(LANGTAG_MAX + 1); 85 86 strncpy(langtag, locale, 2); 87 langtag[2] = '\0'; 88 89 /* do we have country sub-tag? For example: cs_CZ */ 90 if (locale[2] == '_') { 91 if (strlen(locale) < 5 || 92 !isalpha(locale[3]) || 93 !isalpha(locale[4]) || 94 (locale[5] != '\0' && (locale[5] != '.' && 95 locale[5] != '@'))) { 96 return (langtag); 97 } 98 99 /* example: create cs-CZ from cs_CZ */ 100 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2, 101 locale + 3) == 5) 102 return (langtag); 103 } 104 105 /* in all other cases we just use the primary language sub-tag */ 106 return (langtag); 107 } 108 109 uint_t 110 g11n_langtag_is_default(char *langtag) 111 { 112 return (strcmp(langtag, "i-default") == 0); 113 } 114 115 /* 116 * This lang tag / locale matching function works only for two-character 117 * language primary sub-tags and two-character country sub-tags. 118 */ 119 uint_t 120 g11n_langtag_matches_locale(char *langtag, char *locale) 121 { 122 /* match "i-default" to the process' current locale if possible */ 123 if (g11n_langtag_is_default(langtag)) { 124 if (strcasecmp(locale, "POSIX") == 0 || 125 strcasecmp(locale, "C") == 0) 126 return (1); 127 else 128 return (0); 129 } 130 131 /* 132 * locale must be at least 2 chars long and the lang part must be 133 * exactly two characters 134 */ 135 if (strlen(locale) < 2 || 136 (!isalpha(locale[0]) || !isalpha(locale[1]) || 137 (locale[2] != '\0' && locale[2] != '_' && 138 locale[2] != '.' && locale[2] != '@'))) 139 return (0); 140 141 /* same thing with the langtag */ 142 if (strlen(langtag) < 2 || 143 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 144 (langtag[2] != '\0' && langtag[2] != '-'))) 145 return (0); 146 147 /* primary language sub-tag and the locale's language part must match */ 148 if (strncasecmp(langtag, locale, 2) != 0) 149 return (0); 150 151 /* 152 * primary language sub-tag and the locale's language match, now 153 * fuzzy check country part 154 */ 155 156 /* neither langtag nor locale have more than one component */ 157 if (langtag[2] == '\0' && 158 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 159 return (2); 160 161 /* langtag has only one sub-tag... */ 162 if (langtag[2] == '\0') 163 return (1); 164 165 /* locale has no country code... */ 166 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 167 return (1); 168 169 /* langtag has more than one subtag and the locale has a country code */ 170 171 /* ignore second subtag if not two chars */ 172 if (strlen(langtag) < 5) 173 return (1); 174 175 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 176 (langtag[5] != '\0' && langtag[5] != '-')) 177 return (1); 178 179 /* ignore rest of locale if there is no two-character country part */ 180 if (strlen(locale) < 5) 181 return (1); 182 183 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 184 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 185 return (1); 186 187 /* if the country part matches, return 2 */ 188 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 189 return (2); 190 191 return (1); 192 } 193 194 char * 195 g11n_getlocale() 196 { 197 /* we have one text domain - always set it */ 198 (void) textdomain(TEXT_DOMAIN); 199 200 /* if the locale is not set, set it from the env vars */ 201 if (!setlocale(LC_MESSAGES, NULL)) 202 (void) setlocale(LC_MESSAGES, ""); 203 204 return (setlocale(LC_MESSAGES, NULL)); 205 } 206 207 void 208 g11n_setlocale(int category, const char *locale) 209 { 210 char *curr; 211 212 /* we have one text domain - always set it */ 213 (void) textdomain(TEXT_DOMAIN); 214 215 if (!locale) 216 return; 217 218 if (*locale && ((curr = setlocale(category, NULL))) && 219 strcmp(curr, locale) == 0) 220 return; 221 222 /* if <category> is bogus, setlocale() will do nothing */ 223 (void) setlocale(category, locale); 224 } 225 226 char ** 227 g11n_getlocales() 228 { 229 FILE *locale_out; 230 uint_t n_elems, list_size, long_line = 0; 231 char **list; 232 char locale[64]; /* 64 bytes is plenty for locale names */ 233 234 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) 235 return (NULL); 236 237 /* 238 * start with enough room for 65 locales - that's a lot fewer than 239 * all the locales available for installation, but a lot more than 240 * what most users will need and install 241 */ 242 n_elems = 0; 243 list_size = 192; 244 list = (char **) xmalloc(sizeof (char *) * (list_size + 1)); 245 memset(list, 0, sizeof (char *) * (list_size + 1)); 246 247 while (fgets(locale, sizeof (locale), locale_out)) { 248 /* skip long locale names (if any) */ 249 if (!strchr(locale, '\n')) { 250 long_line = 1; 251 continue; 252 } else if (long_line) { 253 long_line = 0; 254 continue; 255 } 256 257 if (strncmp(locale, "iso_8859", 8) == 0) 258 /* ignore locale names like "iso_8859-1" */ 259 continue; 260 261 if (n_elems == list_size) { 262 list_size *= 2; 263 list = (char **)xrealloc((void *) list, 264 (list_size + 1) * sizeof (char *)); 265 memset(&list[n_elems + 1], 0, 266 sizeof (char *) * (list_size - n_elems + 1)); 267 } 268 269 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 270 list[n_elems++] = xstrdup(locale); 271 } 272 273 if (n_elems == 0) 274 return (NULL); 275 276 list[n_elems] = NULL; 277 (void) pclose(locale_out); 278 279 qsort(list, n_elems - 1, sizeof (char *), locale_cmp); 280 return (list); 281 } 282 283 char * 284 g11n_getlangs() 285 { 286 char *locale; 287 288 if (getenv("SSH_LANGS")) 289 return (xstrdup(getenv("SSH_LANGS"))); 290 291 locale = g11n_getlocale(); 292 293 if (!locale || !*locale) 294 return (xstrdup("i-default")); 295 296 return (g11n_locale2langtag(locale)); 297 } 298 299 char * 300 g11n_locales2langs(char **locale_set) 301 { 302 char **p, **r, **q; 303 char *langtag; 304 int locales, skip; 305 306 for (locales = 0, p = locale_set; p && *p; p++) 307 locales++; 308 309 r = (char **)xmalloc((locales + 1) * sizeof (char *)); 310 memset(r, 0, (locales + 1) * sizeof (char *)); 311 312 for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) { 313 skip = 0; 314 if ((langtag = g11n_locale2langtag(*p)) == NULL) 315 continue; 316 for (q = r; (q - r) < locales; q++) { 317 if (!*q) 318 break; 319 if (*q && strcmp(*q, langtag) == 0) 320 skip = 1; 321 } 322 if (!skip) 323 *(q++) = langtag; 324 *q = NULL; 325 } 326 327 return (xjoin(r, ',')); 328 } 329 330 static int 331 sortcmp(const void *d1, const void *d2) 332 { 333 char *s1 = *(char **)d1; 334 char *s2 = *(char **)d2; 335 336 return (strcmp(s1, s2)); 337 } 338 339 int 340 g11n_langtag_match(char *langtag1, char *langtag2) 341 { 342 int len1, len2; 343 char c1, c2; 344 345 len1 = (strchr(langtag1, '-')) ? 346 (strchr(langtag1, '-') - langtag1) 347 : strlen(langtag1); 348 349 len2 = (strchr(langtag2, '-')) ? 350 (strchr(langtag2, '-') - langtag2) 351 : strlen(langtag2); 352 353 /* no match */ 354 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 355 return (0); 356 357 c1 = *(langtag1 + len1); 358 c2 = *(langtag2 + len2); 359 360 /* no country sub-tags - exact match */ 361 if (c1 == '\0' && c2 == '\0') 362 return (2); 363 364 /* one langtag has a country sub-tag, the other doesn't */ 365 if (c1 == '\0' || c2 == '\0') 366 return (1); 367 368 /* can't happen - both langtags have a country sub-tag */ 369 if (c1 != '-' || c2 != '-') 370 return (1); 371 372 /* compare country subtags */ 373 langtag1 = langtag1 + len1 + 1; 374 langtag2 = langtag2 + len2 + 1; 375 376 len1 = (strchr(langtag1, '-')) ? 377 (strchr(langtag1, '-') - langtag1) : strlen(langtag1); 378 379 len2 = (strchr(langtag2, '-')) ? 380 (strchr(langtag2, '-') - langtag2) : strlen(langtag2); 381 382 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 383 return (1); 384 385 /* country tags matched - exact match */ 386 return (2); 387 } 388 389 char * 390 g11n_langtag_set_intersect(char *set1, char *set2) 391 { 392 char **list1, **list2, **list3, **p, **q, **r; 393 char *set3, *lang_subtag; 394 uint_t n1, n2, n3; 395 uint_t do_append; 396 397 list1 = xsplit(set1, ','); 398 list2 = xsplit(set2, ','); 399 400 for (n1 = 0, p = list1; p && *p; p++, n1++) 401 ; 402 for (n2 = 0, p = list2; p && *p; p++, n2++) 403 ; 404 405 list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1)); 406 *list3 = NULL; 407 408 /* 409 * we must not sort the user langtags - sorting or not the server's 410 * should not affect the outcome 411 */ 412 qsort(list2, n2, sizeof (char *), sortcmp); 413 414 for (n3 = 0, p = list1; p && *p; p++) { 415 do_append = 0; 416 for (q = list2; q && *q; q++) { 417 if (g11n_langtag_match(*p, *q) != 2) continue; 418 /* append element */ 419 for (r = list3; (r - list3) <= (n1 + n2); r++) { 420 do_append = 1; 421 if (!*r) 422 break; 423 if (strcmp(*p, *r) == 0) { 424 do_append = 0; 425 break; 426 } 427 } 428 if (do_append && n3 <= (n1 + n2)) { 429 list3[n3++] = xstrdup(*p); 430 list3[n3] = NULL; 431 } 432 } 433 } 434 435 for (p = list1; p && *p; p++) { 436 do_append = 0; 437 for (q = list2; q && *q; q++) { 438 if (g11n_langtag_match(*p, *q) != 1) 439 continue; 440 441 /* append element */ 442 lang_subtag = xstrdup(*p); 443 if (strchr(lang_subtag, '-')) 444 *(strchr(lang_subtag, '-')) = '\0'; 445 for (r = list3; (r - list3) <= (n1 + n2); r++) { 446 do_append = 1; 447 if (!*r) 448 break; 449 if (strcmp(lang_subtag, *r) == 0) { 450 do_append = 0; 451 break; 452 } 453 } 454 if (do_append && n3 <= (n1 + n2)) { 455 list3[n3++] = lang_subtag; 456 list3[n3] = NULL; 457 } else 458 xfree(lang_subtag); 459 } 460 } 461 462 set3 = xjoin(list3, ','); 463 xfree_split_list(list1); 464 xfree_split_list(list2); 465 xfree_split_list(list3); 466 467 return (set3); 468 } 469 470 char * 471 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 472 { 473 char *list, *result; 474 char **xlist; 475 476 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 477 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 478 479 if (!list) 480 return (NULL); 481 482 xlist = xsplit(list, ','); 483 484 xfree(list); 485 486 if (!xlist || !*xlist) 487 return (NULL); 488 489 result = xstrdup(*xlist); 490 xfree_split_list(xlist); 491 492 return (result); 493 } 494 495 /* 496 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 497 * a stright strcmp() 498 */ 499 static int 500 locale_cmp(const void *d1, const void *d2) 501 { 502 char *dot_ptr; 503 char *s1 = *(char **)d1; 504 char *s2 = *(char **)d2; 505 int s1_is_utf8 = 0; 506 int s2_is_utf8 = 0; 507 508 /* check if s1 is a UTF-8 locale */ 509 if (((dot_ptr = strchr((char *)s1, '.')) != NULL) && 510 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 511 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 512 s1_is_utf8++; 513 } 514 515 /* check if s2 is a UTF-8 locale */ 516 if (((dot_ptr = strchr((char *)s2, '.')) != NULL) && 517 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 518 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 519 s2_is_utf8++; 520 } 521 522 /* prefer UTF-8 locales */ 523 if (s1_is_utf8 && !s2_is_utf8) 524 return (-1); 525 526 if (s2_is_utf8 && !s1_is_utf8) 527 return (1); 528 529 /* prefer any locale over the default locales */ 530 if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 || 531 strcmp(s1, "common") == 0) { 532 if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 && 533 strcmp(s2, "common") != 0) 534 return (1); 535 } 536 537 if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 || 538 strcmp(s2, "common") == 0) { 539 if (strcmp(s1, "C") != 0 && 540 strcmp(s1, "POSIX") != 0 && 541 strcmp(s1, "common") != 0) 542 return (-1); 543 } 544 545 return (strcmp(s1, s2)); 546 } 547 548 549 char ** 550 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set) 551 { 552 char **langtag_list, **result, **p, **q, **r; 553 char *s; 554 uint_t do_append, n_langtags, n_locales, n_results, max_results; 555 556 /* count lang tags and locales */ 557 for (n_locales = 0, p = locale_set; p && *p; p++) 558 n_locales++; 559 560 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 561 /* count the number of langtags */ 562 for (; s = strchr(s, ','); s++, n_langtags++) 563 ; 564 565 qsort(locale_set, n_locales, sizeof (char *), locale_cmp); 566 567 langtag_list = xsplit(langtag_set, ','); 568 for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++) 569 ; 570 571 max_results = MIN(n_locales, n_langtags) * 2; 572 result = (char **) xmalloc(sizeof (char *) * (max_results + 1)); 573 *result = NULL; 574 n_results = 0; 575 576 /* more specific matches first */ 577 for (p = langtag_list; p && *p; p++) { 578 do_append = 0; 579 for (q = locale_set; q && *q; q++) { 580 if (g11n_langtag_matches_locale(*p, *q) == 2) { 581 do_append = 1; 582 for (r = result; (r - result) <= 583 MIN(n_locales, n_langtags); r++) { 584 if (!*r) 585 break; 586 if (strcmp(*q, *r) == 0) { 587 do_append = 0; 588 break; 589 } 590 } 591 if (do_append && n_results < max_results) { 592 result[n_results++] = xstrdup(*q); 593 result[n_results] = NULL; 594 } 595 break; 596 } 597 } 598 } 599 600 for (p = langtag_list; p && *p; p++) { 601 do_append = 0; 602 for (q = locale_set; q && *q; q++) { 603 if (g11n_langtag_matches_locale(*p, *q) == 1) { 604 do_append = 1; 605 for (r = result; (r - result) <= 606 MIN(n_locales, n_langtags); r++) { 607 if (!*r) 608 break; 609 if (strcmp(*q, *r) == 0) { 610 do_append = 0; 611 break; 612 } 613 } 614 if (do_append && n_results < max_results) { 615 result[n_results++] = xstrdup(*q); 616 result[n_results] = NULL; 617 } 618 break; 619 } 620 } 621 } 622 623 xfree_split_list(langtag_list); 624 625 return (result); 626 } 627 628 char * 629 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 630 { 631 char **results, *result = NULL; 632 633 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 634 srvr_locales ? srvr_locales : g11n_getlocales())) == NULL) 635 return (NULL); 636 637 if (*results != NULL) 638 result = xstrdup(*results); 639 640 xfree_split_list(results); 641 642 return (result); 643 } 644 645 646 /* 647 * Functions for validating ASCII and UTF-8 strings 648 * 649 * The error_str parameter is an optional pointer to a char variable 650 * where to store a string suitable for use with error() or fatal() or 651 * friends. 652 * 653 * The return value is 0 if success, EILSEQ or EINVAL. 654 * 655 */ 656 uint_t 657 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str) 658 { 659 uchar_t *p; 660 661 for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++) 662 ; 663 664 if (len && ((p - (uchar_t *)str) != len)) 665 return (EILSEQ); 666 667 return (0); 668 } 669 670 uint_t 671 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str) 672 { 673 uchar_t *p; 674 uint_t c, l; 675 676 if (len == 0) 677 len = strlen((const char *)str); 678 679 for (p = (uchar_t *)str; p && (p - str < len) && *p; ) { 680 /* 8-bit chars begin a UTF-8 sequence */ 681 if (*p & 0x80) { 682 /* get sequence length and sanity check first byte */ 683 if (*p < 0xc0) 684 return (EILSEQ); 685 else if (*p < 0xe0) 686 l = 2; 687 else if (*p < 0xf0) 688 l = 3; 689 else if (*p < 0xf8) 690 l = 4; 691 else if (*p < 0xfc) 692 l = 5; 693 else if (*p < 0xfe) 694 l = 6; 695 else 696 return (EILSEQ); 697 698 if ((p + l - str) >= len) 699 return (EILSEQ); 700 701 /* overlong detection - build codepoint */ 702 c = *p & 0x3f; 703 /* shift c bits from first byte */ 704 c = c << (6 * (l - 1)); 705 706 if (l > 1) { 707 if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80)) 708 c = c | ((*(p + 1) & 0x3f) << 709 (6 * (l - 2))); 710 else 711 return (EILSEQ); 712 713 if (c < 0x80) 714 return (EILSEQ); 715 } 716 717 if (l > 2) { 718 if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80)) 719 c = c | ((*(p + 2) & 0x3f) << 720 (6 * (l - 3))); 721 else 722 return (EILSEQ); 723 724 if (c < 0x800) 725 return (EILSEQ); 726 } 727 728 if (l > 3) { 729 if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80)) 730 c = c | ((*(p + 3) & 0x3f) << 731 (6 * (l - 4))); 732 else 733 return (EILSEQ); 734 735 if (c < 0x10000) 736 return (EILSEQ); 737 } 738 739 if (l > 4) { 740 if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80)) 741 c = c | ((*(p + 4) & 0x3f) << 742 (6 * (l - 5))); 743 else 744 return (EILSEQ); 745 746 if (c < 0x200000) 747 return (EILSEQ); 748 } 749 750 if (l > 5) { 751 if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80)) 752 c = c | (*(p + 5) & 0x3f); 753 else 754 return (EILSEQ); 755 756 if (c < 0x4000000) 757 return (EILSEQ); 758 } 759 760 /* 761 * check for UTF-16 surrogates ifs other illegal 762 * UTF-8 * points 763 */ 764 if (((c <= 0xdfff) && (c >= 0xd800)) || 765 (c == 0xfffe) || (c == 0xffff)) 766 return (EILSEQ); 767 p += l; 768 } 769 /* 7-bit chars are fine */ 770 else 771 p++; 772 } 773 return (0); 774 } 775 776 /* 777 * Functions for converting to ASCII or UTF-8 from the local codeset 778 * Functions for converting from ASCII or UTF-8 to the local codeset 779 * 780 * The error_str parameter is an optional pointer to a char variable 781 * where to store a string suitable for use with error() or fatal() or 782 * friends. 783 * 784 * The err parameter is an optional pointer to an integer where 0 785 * (success) or EILSEQ or EINVAL will be stored (failure). 786 * 787 * These functions return NULL if the conversion fails. 788 * 789 */ 790 uchar_t * 791 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str) 792 { 793 static uint_t initialized = 0; 794 static uint_t do_convert = 0; 795 iconv_t cd; 796 int err; 797 798 if (!initialized) { 799 /* 800 * iconv_open() fails if the to/from codesets are the 801 * same, and there are aliases of codesets to boot... 802 */ 803 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 804 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 805 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 806 initialized = 1; 807 do_convert = 0; 808 } else { 809 cd = iconv_open(nl_langinfo(CODESET), "646"); 810 if (cd == (iconv_t)-1) { 811 if (err_ptr) 812 *err_ptr = errno; 813 if (error_str) 814 *error_str = (uchar_t *)"Cannot " 815 "convert ASCII strings to the local" 816 " codeset"; 817 } 818 initialized = 1; 819 do_convert = 1; 820 } 821 } 822 823 if (!do_convert) { 824 if ((err = g11n_validate_ascii(str, 0, error_str))) { 825 if (err_ptr) 826 *err_ptr = err; 827 return (NULL); 828 } else 829 return ((uchar_t *)xstrdup(str)); 830 } 831 832 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 833 } 834 835 uchar_t * 836 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 837 { 838 static uint_t initialized = 0; 839 static uint_t do_convert = 0; 840 iconv_t cd; 841 int err; 842 843 if (!initialized) { 844 /* 845 * iconv_open() fails if the to/from codesets are the 846 * same, and there are aliases of codesets to boot... 847 */ 848 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 849 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 850 initialized = 1; 851 do_convert = 0; 852 } else { 853 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 854 if (cd == (iconv_t)-1) { 855 if (err_ptr) 856 *err_ptr = errno; 857 if (error_str) 858 *error_str = (uchar_t *)"Cannot " 859 "convert UTF-8 strings to the " 860 "local codeset"; 861 } 862 initialized = 1; 863 do_convert = 1; 864 } 865 } 866 867 if (!do_convert) { 868 if ((err = g11n_validate_utf8(str, 0, error_str))) { 869 if (err_ptr) 870 *err_ptr = err; 871 return (NULL); 872 } else 873 return ((uchar_t *)xstrdup((char *)str)); 874 } 875 876 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 877 } 878 879 char * 880 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str) 881 { 882 static uint_t initialized = 0; 883 static uint_t do_convert = 0; 884 iconv_t cd; 885 886 if (!initialized) { 887 /* 888 * iconv_open() fails if the to/from codesets are the 889 * same, and there are aliases of codesets to boot... 890 */ 891 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 892 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 893 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 894 initialized = 1; 895 do_convert = 0; 896 } else { 897 cd = iconv_open("646", nl_langinfo(CODESET)); 898 if (cd == (iconv_t)-1) { 899 if (err_ptr) 900 *err_ptr = errno; 901 if (error_str) 902 *error_str = (uchar_t *)"Cannot " 903 "convert UTF-8 strings to the " 904 "local codeset"; 905 } 906 initialized = 1; 907 do_convert = 1; 908 } 909 } 910 911 if (!do_convert) 912 return (xstrdup((char *)str)); 913 914 return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 915 } 916 917 uchar_t * 918 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 919 { 920 static uint_t initialized = 0; 921 static uint_t do_convert = 0; 922 iconv_t cd; 923 924 if (!initialized) { 925 /* 926 * iconv_open() fails if the to/from codesets are the 927 * same, and there are aliases of codesets to boot... 928 */ 929 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 930 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 931 initialized = 1; 932 do_convert = 0; 933 } else { 934 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 935 if (cd == (iconv_t)-1) { 936 if (err_ptr) 937 *err_ptr = errno; 938 if (error_str) 939 *error_str = (uchar_t *)"Cannot " 940 "convert UTF-8 strings to the " 941 "local codeset"; 942 } 943 initialized = 1; 944 do_convert = 1; 945 } 946 } 947 948 if (!do_convert) 949 return ((uchar_t *)xstrdup((char *)str)); 950 951 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 952 } 953 954 955 /* 956 * Wrapper around iconv() 957 * 958 * The caller is responsible for freeing the result and for handling 959 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 960 */ 961 static uchar_t * 962 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len, 963 uint_t *outlen, int *err, uchar_t **err_str) 964 { 965 size_t inbytesleft, outbytesleft, converted_size; 966 char *outbuf; 967 uchar_t *converted; 968 const char *inbuf; 969 uint_t mul = 0; 970 971 if (!buf || !(*(char *)buf)) 972 return (NULL); 973 974 if (len == 0) 975 len = strlen(buf); 976 977 /* reset conversion descriptor */ 978 /* XXX Do we need initial shift sequences for UTF-8??? */ 979 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 980 inbuf = (const char *) buf; 981 982 if (mul_ptr) 983 mul = *mul_ptr; 984 985 converted_size = (len << mul); 986 outbuf = (char *)xmalloc(converted_size + 1); /* for null */ 987 converted = (uchar_t *)outbuf; 988 outbytesleft = len; 989 990 do { 991 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 992 (size_t)-1) { 993 if (errno == E2BIG) { 994 /* UTF-8 codepoints are at most 8 bytes long */ 995 if (mul > 2) { 996 if (err_str) 997 *err_str = (uchar_t *) 998 "Conversion to UTF-8 failed" 999 " due to preposterous space" 1000 " requirements"; 1001 if (err) 1002 *err = EILSEQ; 1003 return (NULL); 1004 } 1005 1006 /* 1007 * re-alloc output and ensure that the outbuf 1008 * and outbytesleft values are adjusted 1009 */ 1010 converted = xrealloc(converted, 1011 converted_size << 1 + 1); 1012 outbuf = (char *)converted + converted_size - 1013 outbytesleft; 1014 converted_size = (len << ++(mul)); 1015 outbytesleft = converted_size - outbytesleft; 1016 } else { 1017 /* 1018 * let the caller deal with iconv() errors, 1019 * probably by calling fatal(); xfree() does 1020 * not set errno 1021 */ 1022 if (err) 1023 *err = errno; 1024 xfree(converted); 1025 return (NULL); 1026 } 1027 } 1028 } while (inbytesleft); 1029 1030 *outbuf = '\0'; /* ensure null-termination */ 1031 if (outlen) 1032 *outlen = converted_size - outbytesleft; 1033 if (mul_ptr) 1034 *mul_ptr = mul; 1035 1036 return (converted); 1037 } 1038