1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #pragma ident "%Z%%M% %I% %E% SMI" 26 27 #include <errno.h> 28 #include <locale.h> 29 #include <langinfo.h> 30 #include <iconv.h> 31 #include <ctype.h> 32 #include <strings.h> 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include "includes.h" 37 #include "xmalloc.h" 38 #include "xlist.h" 39 40 #ifdef MIN 41 #undef MIN 42 #endif /* MIN */ 43 44 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 45 46 #define LOCALE_PATH "/usr/bin/locale" 47 48 /* two-char country code, '-' and two-char region code */ 49 #define LANGTAG_MAX 5 50 51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, 52 uint_t len, uint_t *outlen, int *err, uchar_t **err_str); 53 54 static int locale_cmp(const void *d1, const void *d2); 55 static char *g11n_locale2langtag(char *locale); 56 57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str); 58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str); 59 60 /* 61 * Convert locale string name into a language tag. The caller is responsible for 62 * freeing the memory allocated for the result. 63 */ 64 static char * 65 g11n_locale2langtag(char *locale) 66 { 67 char *langtag; 68 69 /* base cases */ 70 if (!locale || !*locale) 71 return (NULL); 72 73 if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0) 74 return (xstrdup("i-default")); 75 76 /* punt for language codes which are not exactly 2 letters */ 77 if (strlen(locale) < 2 || 78 !isalpha(locale[0]) || 79 !isalpha(locale[1]) || 80 (locale[2] != '\0' && 81 locale[2] != '_' && 82 locale[2] != '.' && 83 locale[2] != '@')) 84 return (NULL); 85 86 87 /* we have a primary language sub-tag */ 88 langtag = (char *)xmalloc(LANGTAG_MAX + 1); 89 90 strncpy(langtag, locale, 2); 91 langtag[2] = '\0'; 92 93 /* do we have country sub-tag? For example: cs_CZ */ 94 if (locale[2] == '_') { 95 if (strlen(locale) < 5 || 96 !isalpha(locale[3]) || 97 !isalpha(locale[4]) || 98 (locale[5] != '\0' && (locale[5] != '.' && 99 locale[5] != '@'))) { 100 return (langtag); 101 } 102 103 /* example: create cs-CZ from cs_CZ */ 104 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2, 105 locale + 3) == 5) 106 return (langtag); 107 } 108 109 /* in all other cases we just use the primary language sub-tag */ 110 return (langtag); 111 } 112 113 uint_t 114 g11n_langtag_is_default(char *langtag) 115 { 116 return (strcmp(langtag, "i-default") == 0); 117 } 118 119 /* 120 * This lang tag / locale matching function works only for two-character 121 * language primary sub-tags and two-character country sub-tags. 122 */ 123 uint_t 124 g11n_langtag_matches_locale(char *langtag, char *locale) 125 { 126 /* match "i-default" to the process' current locale if possible */ 127 if (g11n_langtag_is_default(langtag)) { 128 if (strcasecmp(locale, "POSIX") == 0 || 129 strcasecmp(locale, "C") == 0) 130 return (1); 131 else 132 return (0); 133 } 134 135 /* 136 * locale must be at least 2 chars long and the lang part must be 137 * exactly two characters 138 */ 139 if (strlen(locale) < 2 || 140 (!isalpha(locale[0]) || !isalpha(locale[1]) || 141 (locale[2] != '\0' && locale[2] != '_' && 142 locale[2] != '.' && locale[2] != '@'))) 143 return (0); 144 145 /* same thing with the langtag */ 146 if (strlen(langtag) < 2 || 147 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 148 (langtag[2] != '\0' && langtag[2] != '-'))) 149 return (0); 150 151 /* primary language sub-tag and the locale's language part must match */ 152 if (strncasecmp(langtag, locale, 2) != 0) 153 return (0); 154 155 /* 156 * primary language sub-tag and the locale's language match, now 157 * fuzzy check country part 158 */ 159 160 /* neither langtag nor locale have more than one component */ 161 if (langtag[2] == '\0' && 162 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 163 return (2); 164 165 /* langtag has only one sub-tag... */ 166 if (langtag[2] == '\0') 167 return (1); 168 169 /* locale has no country code... */ 170 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 171 return (1); 172 173 /* langtag has more than one subtag and the locale has a country code */ 174 175 /* ignore second subtag if not two chars */ 176 if (strlen(langtag) < 5) 177 return (1); 178 179 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 180 (langtag[5] != '\0' && langtag[5] != '-')) 181 return (1); 182 183 /* ignore rest of locale if there is no two-character country part */ 184 if (strlen(locale) < 5) 185 return (1); 186 187 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 188 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 189 return (1); 190 191 /* if the country part matches, return 2 */ 192 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 193 return (2); 194 195 return (1); 196 } 197 198 char * 199 g11n_getlocale() 200 { 201 /* we have one text domain - always set it */ 202 (void) textdomain(TEXT_DOMAIN); 203 204 /* if the locale is not set, set it from the env vars */ 205 if (!setlocale(LC_MESSAGES, NULL)) 206 (void) setlocale(LC_MESSAGES, ""); 207 208 return (setlocale(LC_MESSAGES, NULL)); 209 } 210 211 void 212 g11n_setlocale(int category, const char *locale) 213 { 214 char *curr; 215 216 /* we have one text domain - always set it */ 217 (void) textdomain(TEXT_DOMAIN); 218 219 if (!locale) 220 return; 221 222 if (*locale && ((curr = setlocale(category, NULL))) && 223 strcmp(curr, locale) == 0) 224 return; 225 226 /* if <category> is bogus, setlocale() will do nothing */ 227 (void) setlocale(category, locale); 228 } 229 230 char ** 231 g11n_getlocales() 232 { 233 FILE *locale_out; 234 uint_t n_elems, list_size, long_line = 0; 235 char **list; 236 char locale[64]; /* 64 bytes is plenty for locale names */ 237 238 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) 239 return (NULL); 240 241 /* 242 * start with enough room for 65 locales - that's a lot fewer than 243 * all the locales available for installation, but a lot more than 244 * what most users will need and install 245 */ 246 n_elems = 0; 247 list_size = 192; 248 list = (char **) xmalloc(sizeof (char *) * (list_size + 1)); 249 memset(list, 0, sizeof (char *) * (list_size + 1)); 250 251 while (fgets(locale, sizeof (locale), locale_out)) { 252 /* skip long locale names (if any) */ 253 if (!strchr(locale, '\n')) { 254 long_line = 1; 255 continue; 256 } else if (long_line) { 257 long_line = 0; 258 continue; 259 } 260 261 if (strncmp(locale, "iso_8859", 8) == 0) 262 /* ignore locale names like "iso_8859-1" */ 263 continue; 264 265 if (n_elems == list_size) { 266 list_size *= 2; 267 list = (char **)xrealloc((void *) list, 268 (list_size + 1) * sizeof (char *)); 269 memset(&list[n_elems + 1], 0, 270 sizeof (char *) * (list_size - n_elems + 1)); 271 } 272 273 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 274 list[n_elems++] = xstrdup(locale); 275 } 276 277 if (n_elems == 0) { 278 xfree(list); 279 return (NULL); 280 } 281 282 list[n_elems] = NULL; 283 (void) pclose(locale_out); 284 285 qsort(list, n_elems - 1, sizeof (char *), locale_cmp); 286 return (list); 287 } 288 289 char * 290 g11n_getlangs() 291 { 292 char *locale; 293 294 if (getenv("SSH_LANGS")) 295 return (xstrdup(getenv("SSH_LANGS"))); 296 297 locale = g11n_getlocale(); 298 299 if (!locale || !*locale) 300 return (xstrdup("i-default")); 301 302 return (g11n_locale2langtag(locale)); 303 } 304 305 char * 306 g11n_locales2langs(char **locale_set) 307 { 308 char **p, **r, **q; 309 char *langtag, *langs; 310 int locales, skip; 311 312 for (locales = 0, p = locale_set; p && *p; p++) 313 locales++; 314 315 r = (char **)xmalloc((locales + 1) * sizeof (char *)); 316 memset(r, 0, (locales + 1) * sizeof (char *)); 317 318 for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) { 319 skip = 0; 320 if ((langtag = g11n_locale2langtag(*p)) == NULL) 321 continue; 322 for (q = r; (q - r) < locales; q++) { 323 if (!*q) 324 break; 325 if (*q && strcmp(*q, langtag) == 0) 326 skip = 1; 327 } 328 if (!skip) 329 *(q++) = langtag; 330 else 331 xfree(langtag); 332 *q = NULL; 333 } 334 335 langs = xjoin(r, ','); 336 g11n_freelist(r); 337 338 return (langs); 339 } 340 341 static int 342 sortcmp(const void *d1, const void *d2) 343 { 344 char *s1 = *(char **)d1; 345 char *s2 = *(char **)d2; 346 347 return (strcmp(s1, s2)); 348 } 349 350 int 351 g11n_langtag_match(char *langtag1, char *langtag2) 352 { 353 int len1, len2; 354 char c1, c2; 355 356 len1 = (strchr(langtag1, '-')) ? 357 (strchr(langtag1, '-') - langtag1) 358 : strlen(langtag1); 359 360 len2 = (strchr(langtag2, '-')) ? 361 (strchr(langtag2, '-') - langtag2) 362 : strlen(langtag2); 363 364 /* no match */ 365 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 366 return (0); 367 368 c1 = *(langtag1 + len1); 369 c2 = *(langtag2 + len2); 370 371 /* no country sub-tags - exact match */ 372 if (c1 == '\0' && c2 == '\0') 373 return (2); 374 375 /* one langtag has a country sub-tag, the other doesn't */ 376 if (c1 == '\0' || c2 == '\0') 377 return (1); 378 379 /* can't happen - both langtags have a country sub-tag */ 380 if (c1 != '-' || c2 != '-') 381 return (1); 382 383 /* compare country subtags */ 384 langtag1 = langtag1 + len1 + 1; 385 langtag2 = langtag2 + len2 + 1; 386 387 len1 = (strchr(langtag1, '-')) ? 388 (strchr(langtag1, '-') - langtag1) : strlen(langtag1); 389 390 len2 = (strchr(langtag2, '-')) ? 391 (strchr(langtag2, '-') - langtag2) : strlen(langtag2); 392 393 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 394 return (1); 395 396 /* country tags matched - exact match */ 397 return (2); 398 } 399 400 char * 401 g11n_langtag_set_intersect(char *set1, char *set2) 402 { 403 char **list1, **list2, **list3, **p, **q, **r; 404 char *set3, *lang_subtag; 405 uint_t n1, n2, n3; 406 uint_t do_append; 407 408 list1 = xsplit(set1, ','); 409 list2 = xsplit(set2, ','); 410 411 for (n1 = 0, p = list1; p && *p; p++, n1++) 412 ; 413 for (n2 = 0, p = list2; p && *p; p++, n2++) 414 ; 415 416 list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1)); 417 *list3 = NULL; 418 419 /* 420 * we must not sort the user langtags - sorting or not the server's 421 * should not affect the outcome 422 */ 423 qsort(list2, n2, sizeof (char *), sortcmp); 424 425 for (n3 = 0, p = list1; p && *p; p++) { 426 do_append = 0; 427 for (q = list2; q && *q; q++) { 428 if (g11n_langtag_match(*p, *q) != 2) continue; 429 /* append element */ 430 for (r = list3; (r - list3) <= (n1 + n2); r++) { 431 do_append = 1; 432 if (!*r) 433 break; 434 if (strcmp(*p, *r) == 0) { 435 do_append = 0; 436 break; 437 } 438 } 439 if (do_append && n3 <= (n1 + n2)) { 440 list3[n3++] = xstrdup(*p); 441 list3[n3] = NULL; 442 } 443 } 444 } 445 446 for (p = list1; p && *p; p++) { 447 do_append = 0; 448 for (q = list2; q && *q; q++) { 449 if (g11n_langtag_match(*p, *q) != 1) 450 continue; 451 452 /* append element */ 453 lang_subtag = xstrdup(*p); 454 if (strchr(lang_subtag, '-')) 455 *(strchr(lang_subtag, '-')) = '\0'; 456 for (r = list3; (r - list3) <= (n1 + n2); r++) { 457 do_append = 1; 458 if (!*r) 459 break; 460 if (strcmp(lang_subtag, *r) == 0) { 461 do_append = 0; 462 break; 463 } 464 } 465 if (do_append && n3 <= (n1 + n2)) { 466 list3[n3++] = lang_subtag; 467 list3[n3] = NULL; 468 } else 469 xfree(lang_subtag); 470 } 471 } 472 473 set3 = xjoin(list3, ','); 474 xfree_split_list(list1); 475 xfree_split_list(list2); 476 xfree_split_list(list3); 477 478 return (set3); 479 } 480 481 char * 482 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 483 { 484 char *list, *result; 485 char **xlist; 486 487 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 488 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 489 490 if (!list) 491 return (NULL); 492 493 xlist = xsplit(list, ','); 494 495 xfree(list); 496 497 if (!xlist || !*xlist) 498 return (NULL); 499 500 result = xstrdup(*xlist); 501 xfree_split_list(xlist); 502 503 return (result); 504 } 505 506 /* 507 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 508 * a stright strcmp() 509 */ 510 static int 511 locale_cmp(const void *d1, const void *d2) 512 { 513 char *dot_ptr; 514 char *s1 = *(char **)d1; 515 char *s2 = *(char **)d2; 516 int s1_is_utf8 = 0; 517 int s2_is_utf8 = 0; 518 519 /* check if s1 is a UTF-8 locale */ 520 if (((dot_ptr = strchr((char *)s1, '.')) != NULL) && 521 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 522 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 523 s1_is_utf8++; 524 } 525 526 /* check if s2 is a UTF-8 locale */ 527 if (((dot_ptr = strchr((char *)s2, '.')) != NULL) && 528 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 529 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 530 s2_is_utf8++; 531 } 532 533 /* prefer UTF-8 locales */ 534 if (s1_is_utf8 && !s2_is_utf8) 535 return (-1); 536 537 if (s2_is_utf8 && !s1_is_utf8) 538 return (1); 539 540 /* prefer any locale over the default locales */ 541 if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 || 542 strcmp(s1, "common") == 0) { 543 if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 && 544 strcmp(s2, "common") != 0) 545 return (1); 546 } 547 548 if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 || 549 strcmp(s2, "common") == 0) { 550 if (strcmp(s1, "C") != 0 && 551 strcmp(s1, "POSIX") != 0 && 552 strcmp(s1, "common") != 0) 553 return (-1); 554 } 555 556 return (strcmp(s1, s2)); 557 } 558 559 560 char ** 561 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set) 562 { 563 char **langtag_list, **result, **p, **q, **r; 564 char *s; 565 uint_t do_append, n_langtags, n_locales, n_results, max_results; 566 567 /* count lang tags and locales */ 568 for (n_locales = 0, p = locale_set; p && *p; p++) 569 n_locales++; 570 571 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 572 /* count the number of langtags */ 573 for (; s = strchr(s, ','); s++, n_langtags++) 574 ; 575 576 qsort(locale_set, n_locales, sizeof (char *), locale_cmp); 577 578 langtag_list = xsplit(langtag_set, ','); 579 for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++) 580 ; 581 582 max_results = MIN(n_locales, n_langtags) * 2; 583 result = (char **) xmalloc(sizeof (char *) * (max_results + 1)); 584 *result = NULL; 585 n_results = 0; 586 587 /* more specific matches first */ 588 for (p = langtag_list; p && *p; p++) { 589 do_append = 0; 590 for (q = locale_set; q && *q; q++) { 591 if (g11n_langtag_matches_locale(*p, *q) == 2) { 592 do_append = 1; 593 for (r = result; (r - result) <= 594 MIN(n_locales, n_langtags); r++) { 595 if (!*r) 596 break; 597 if (strcmp(*q, *r) == 0) { 598 do_append = 0; 599 break; 600 } 601 } 602 if (do_append && n_results < max_results) { 603 result[n_results++] = xstrdup(*q); 604 result[n_results] = NULL; 605 } 606 break; 607 } 608 } 609 } 610 611 for (p = langtag_list; p && *p; p++) { 612 do_append = 0; 613 for (q = locale_set; q && *q; q++) { 614 if (g11n_langtag_matches_locale(*p, *q) == 1) { 615 do_append = 1; 616 for (r = result; (r - result) <= 617 MIN(n_locales, n_langtags); r++) { 618 if (!*r) 619 break; 620 if (strcmp(*q, *r) == 0) { 621 do_append = 0; 622 break; 623 } 624 } 625 if (do_append && n_results < max_results) { 626 result[n_results++] = xstrdup(*q); 627 result[n_results] = NULL; 628 } 629 break; 630 } 631 } 632 } 633 634 xfree_split_list(langtag_list); 635 636 return (result); 637 } 638 639 char * 640 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 641 { 642 char **results, **locales, *result = NULL; 643 644 if (srvr_locales == NULL) 645 locales = g11n_getlocales(); 646 else 647 locales = srvr_locales; 648 649 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 650 locales)) == NULL) 651 goto err; 652 653 if (*results != NULL) 654 result = xstrdup(*results); 655 656 xfree_split_list(results); 657 658 err: 659 if (locales != srvr_locales) 660 g11n_freelist(locales); 661 return (result); 662 } 663 664 665 /* 666 * Functions for validating ASCII and UTF-8 strings 667 * 668 * The error_str parameter is an optional pointer to a char variable 669 * where to store a string suitable for use with error() or fatal() or 670 * friends. 671 * 672 * The return value is 0 if success, EILSEQ or EINVAL. 673 * 674 */ 675 uint_t 676 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str) 677 { 678 uchar_t *p; 679 680 for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++) 681 ; 682 683 if (len && ((p - (uchar_t *)str) != len)) 684 return (EILSEQ); 685 686 return (0); 687 } 688 689 uint_t 690 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str) 691 { 692 uchar_t *p; 693 uint_t c, l; 694 695 if (len == 0) 696 len = strlen((const char *)str); 697 698 for (p = (uchar_t *)str; p && (p - str < len) && *p; ) { 699 /* 8-bit chars begin a UTF-8 sequence */ 700 if (*p & 0x80) { 701 /* get sequence length and sanity check first byte */ 702 if (*p < 0xc0) 703 return (EILSEQ); 704 else if (*p < 0xe0) 705 l = 2; 706 else if (*p < 0xf0) 707 l = 3; 708 else if (*p < 0xf8) 709 l = 4; 710 else if (*p < 0xfc) 711 l = 5; 712 else if (*p < 0xfe) 713 l = 6; 714 else 715 return (EILSEQ); 716 717 if ((p + l - str) >= len) 718 return (EILSEQ); 719 720 /* overlong detection - build codepoint */ 721 c = *p & 0x3f; 722 /* shift c bits from first byte */ 723 c = c << (6 * (l - 1)); 724 725 if (l > 1) { 726 if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80)) 727 c = c | ((*(p + 1) & 0x3f) << 728 (6 * (l - 2))); 729 else 730 return (EILSEQ); 731 732 if (c < 0x80) 733 return (EILSEQ); 734 } 735 736 if (l > 2) { 737 if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80)) 738 c = c | ((*(p + 2) & 0x3f) << 739 (6 * (l - 3))); 740 else 741 return (EILSEQ); 742 743 if (c < 0x800) 744 return (EILSEQ); 745 } 746 747 if (l > 3) { 748 if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80)) 749 c = c | ((*(p + 3) & 0x3f) << 750 (6 * (l - 4))); 751 else 752 return (EILSEQ); 753 754 if (c < 0x10000) 755 return (EILSEQ); 756 } 757 758 if (l > 4) { 759 if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80)) 760 c = c | ((*(p + 4) & 0x3f) << 761 (6 * (l - 5))); 762 else 763 return (EILSEQ); 764 765 if (c < 0x200000) 766 return (EILSEQ); 767 } 768 769 if (l > 5) { 770 if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80)) 771 c = c | (*(p + 5) & 0x3f); 772 else 773 return (EILSEQ); 774 775 if (c < 0x4000000) 776 return (EILSEQ); 777 } 778 779 /* 780 * check for UTF-16 surrogates ifs other illegal 781 * UTF-8 * points 782 */ 783 if (((c <= 0xdfff) && (c >= 0xd800)) || 784 (c == 0xfffe) || (c == 0xffff)) 785 return (EILSEQ); 786 p += l; 787 } 788 /* 7-bit chars are fine */ 789 else 790 p++; 791 } 792 return (0); 793 } 794 795 /* 796 * Functions for converting to ASCII or UTF-8 from the local codeset 797 * Functions for converting from ASCII or UTF-8 to the local codeset 798 * 799 * The error_str parameter is an optional pointer to a char variable 800 * where to store a string suitable for use with error() or fatal() or 801 * friends. 802 * 803 * The err parameter is an optional pointer to an integer where 0 804 * (success) or EILSEQ or EINVAL will be stored (failure). 805 * 806 * These functions return NULL if the conversion fails. 807 * 808 */ 809 uchar_t * 810 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str) 811 { 812 static uint_t initialized = 0; 813 static uint_t do_convert = 0; 814 iconv_t cd; 815 int err; 816 817 if (!initialized) { 818 /* 819 * iconv_open() fails if the to/from codesets are the 820 * same, and there are aliases of codesets to boot... 821 */ 822 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 823 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 824 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 825 initialized = 1; 826 do_convert = 0; 827 } else { 828 cd = iconv_open(nl_langinfo(CODESET), "646"); 829 if (cd == (iconv_t)-1) { 830 if (err_ptr) 831 *err_ptr = errno; 832 if (error_str) 833 *error_str = (uchar_t *)"Cannot " 834 "convert ASCII strings to the local" 835 " codeset"; 836 } 837 initialized = 1; 838 do_convert = 1; 839 } 840 } 841 842 if (!do_convert) { 843 if ((err = g11n_validate_ascii(str, 0, error_str))) { 844 if (err_ptr) 845 *err_ptr = err; 846 return (NULL); 847 } else 848 return ((uchar_t *)xstrdup(str)); 849 } 850 851 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 852 } 853 854 uchar_t * 855 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 856 { 857 static uint_t initialized = 0; 858 static uint_t do_convert = 0; 859 iconv_t cd; 860 int err; 861 862 if (!initialized) { 863 /* 864 * iconv_open() fails if the to/from codesets are the 865 * same, and there are aliases of codesets to boot... 866 */ 867 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 868 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 869 initialized = 1; 870 do_convert = 0; 871 } else { 872 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 873 if (cd == (iconv_t)-1) { 874 if (err_ptr) 875 *err_ptr = errno; 876 if (error_str) 877 *error_str = (uchar_t *)"Cannot " 878 "convert UTF-8 strings to the " 879 "local codeset"; 880 } 881 initialized = 1; 882 do_convert = 1; 883 } 884 } 885 886 if (!do_convert) { 887 if ((err = g11n_validate_utf8(str, 0, error_str))) { 888 if (err_ptr) 889 *err_ptr = err; 890 return (NULL); 891 } else 892 return ((uchar_t *)xstrdup((char *)str)); 893 } 894 895 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 896 } 897 898 char * 899 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str) 900 { 901 static uint_t initialized = 0; 902 static uint_t do_convert = 0; 903 iconv_t cd; 904 905 if (!initialized) { 906 /* 907 * iconv_open() fails if the to/from codesets are the 908 * same, and there are aliases of codesets to boot... 909 */ 910 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 911 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 912 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 913 initialized = 1; 914 do_convert = 0; 915 } else { 916 cd = iconv_open("646", nl_langinfo(CODESET)); 917 if (cd == (iconv_t)-1) { 918 if (err_ptr) 919 *err_ptr = errno; 920 if (error_str) 921 *error_str = (uchar_t *)"Cannot " 922 "convert UTF-8 strings to the " 923 "local codeset"; 924 } 925 initialized = 1; 926 do_convert = 1; 927 } 928 } 929 930 if (!do_convert) 931 return (xstrdup((char *)str)); 932 933 return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 934 } 935 936 uchar_t * 937 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 938 { 939 static uint_t initialized = 0; 940 static uint_t do_convert = 0; 941 iconv_t cd; 942 943 if (!initialized) { 944 /* 945 * iconv_open() fails if the to/from codesets are the 946 * same, and there are aliases of codesets to boot... 947 */ 948 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 949 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 950 initialized = 1; 951 do_convert = 0; 952 } else { 953 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 954 if (cd == (iconv_t)-1) { 955 if (err_ptr) 956 *err_ptr = errno; 957 if (error_str) 958 *error_str = (uchar_t *)"Cannot " 959 "convert UTF-8 strings to the " 960 "local codeset"; 961 } 962 initialized = 1; 963 do_convert = 1; 964 } 965 } 966 967 if (!do_convert) 968 return ((uchar_t *)xstrdup((char *)str)); 969 970 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 971 } 972 973 974 /* 975 * Wrapper around iconv() 976 * 977 * The caller is responsible for freeing the result and for handling 978 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 979 */ 980 static uchar_t * 981 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len, 982 uint_t *outlen, int *err, uchar_t **err_str) 983 { 984 size_t inbytesleft, outbytesleft, converted_size; 985 char *outbuf; 986 uchar_t *converted; 987 const char *inbuf; 988 uint_t mul = 0; 989 990 if (!buf || !(*(char *)buf)) 991 return (NULL); 992 993 if (len == 0) 994 len = strlen(buf); 995 996 /* reset conversion descriptor */ 997 /* XXX Do we need initial shift sequences for UTF-8??? */ 998 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 999 inbuf = (const char *) buf; 1000 1001 if (mul_ptr) 1002 mul = *mul_ptr; 1003 1004 converted_size = (len << mul); 1005 outbuf = (char *)xmalloc(converted_size + 1); /* for null */ 1006 converted = (uchar_t *)outbuf; 1007 outbytesleft = len; 1008 1009 do { 1010 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 1011 (size_t)-1) { 1012 if (errno == E2BIG) { 1013 /* UTF-8 codepoints are at most 8 bytes long */ 1014 if (mul > 2) { 1015 if (err_str) 1016 *err_str = (uchar_t *) 1017 "Conversion to UTF-8 failed" 1018 " due to preposterous space" 1019 " requirements"; 1020 if (err) 1021 *err = EILSEQ; 1022 return (NULL); 1023 } 1024 1025 /* 1026 * re-alloc output and ensure that the outbuf 1027 * and outbytesleft values are adjusted 1028 */ 1029 converted = xrealloc(converted, 1030 converted_size << 1 + 1); 1031 outbuf = (char *)converted + converted_size - 1032 outbytesleft; 1033 converted_size = (len << ++(mul)); 1034 outbytesleft = converted_size - outbytesleft; 1035 } else { 1036 /* 1037 * let the caller deal with iconv() errors, 1038 * probably by calling fatal(); xfree() does 1039 * not set errno 1040 */ 1041 if (err) 1042 *err = errno; 1043 xfree(converted); 1044 return (NULL); 1045 } 1046 } 1047 } while (inbytesleft); 1048 1049 *outbuf = '\0'; /* ensure null-termination */ 1050 if (outlen) 1051 *outlen = converted_size - outbytesleft; 1052 if (mul_ptr) 1053 *mul_ptr = mul; 1054 1055 return (converted); 1056 } 1057 1058 /* 1059 * Free all strings in the list and then free the list itself. We know that the 1060 * list ends with a NULL pointer. 1061 */ 1062 void 1063 g11n_freelist(char **list) 1064 { 1065 int i = 0; 1066 1067 while (list[i] != NULL) { 1068 xfree(list[i]); 1069 i++; 1070 } 1071 1072 xfree(list); 1073 } 1074