1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #pragma ident "%Z%%M% %I% %E% SMI" 26 27 #include <errno.h> 28 #include <locale.h> 29 #include <langinfo.h> 30 #include <iconv.h> 31 #include <ctype.h> 32 #include <strings.h> 33 #include <string.h> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include "includes.h" 37 #include "xmalloc.h" 38 #include "xlist.h" 39 40 #ifdef MIN 41 #undef MIN 42 #endif /* MIN */ 43 44 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 45 46 #define LOCALE_PATH "/usr/bin/locale" 47 48 /* two-char country code, '-' and two-char region code */ 49 #define LANGTAG_MAX 5 50 51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, 52 uint_t len, uint_t *outlen, int *err, uchar_t **err_str); 53 54 static int locale_cmp(const void *d1, const void *d2); 55 static char *g11n_locale2langtag(char *locale); 56 57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str); 58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str); 59 60 /* 61 * Convert locale string name into a language tag. The caller is responsible for 62 * freeing the memory allocated for the result. 63 */ 64 static char * 65 g11n_locale2langtag(char *locale) 66 { 67 char *langtag; 68 69 /* base cases */ 70 if (!locale || !*locale) 71 return (NULL); 72 73 if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0) 74 return (xstrdup("i-default")); 75 76 /* punt for language codes which are not exactly 2 letters */ 77 if (strlen(locale) < 2 || 78 !isalpha(locale[0]) || 79 !isalpha(locale[1]) || 80 (locale[2] != '\0' && 81 locale[2] != '_' && 82 locale[2] != '.' && 83 locale[2] != '@')) 84 return (NULL); 85 86 87 /* we have a primary language sub-tag */ 88 langtag = (char *)xmalloc(LANGTAG_MAX + 1); 89 90 strncpy(langtag, locale, 2); 91 langtag[2] = '\0'; 92 93 /* do we have country sub-tag? For example: cs_CZ */ 94 if (locale[2] == '_') { 95 if (strlen(locale) < 5 || 96 !isalpha(locale[3]) || 97 !isalpha(locale[4]) || 98 (locale[5] != '\0' && (locale[5] != '.' && 99 locale[5] != '@'))) { 100 return (langtag); 101 } 102 103 /* example: create cs-CZ from cs_CZ */ 104 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2, 105 locale + 3) == 5) 106 return (langtag); 107 } 108 109 /* in all other cases we just use the primary language sub-tag */ 110 return (langtag); 111 } 112 113 uint_t 114 g11n_langtag_is_default(char *langtag) 115 { 116 return (strcmp(langtag, "i-default") == 0); 117 } 118 119 /* 120 * This lang tag / locale matching function works only for two-character 121 * language primary sub-tags and two-character country sub-tags. 122 */ 123 uint_t 124 g11n_langtag_matches_locale(char *langtag, char *locale) 125 { 126 /* match "i-default" to the process' current locale if possible */ 127 if (g11n_langtag_is_default(langtag)) { 128 if (strcasecmp(locale, "POSIX") == 0 || 129 strcasecmp(locale, "C") == 0) 130 return (1); 131 else 132 return (0); 133 } 134 135 /* 136 * locale must be at least 2 chars long and the lang part must be 137 * exactly two characters 138 */ 139 if (strlen(locale) < 2 || 140 (!isalpha(locale[0]) || !isalpha(locale[1]) || 141 (locale[2] != '\0' && locale[2] != '_' && 142 locale[2] != '.' && locale[2] != '@'))) 143 return (0); 144 145 /* same thing with the langtag */ 146 if (strlen(langtag) < 2 || 147 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 148 (langtag[2] != '\0' && langtag[2] != '-'))) 149 return (0); 150 151 /* primary language sub-tag and the locale's language part must match */ 152 if (strncasecmp(langtag, locale, 2) != 0) 153 return (0); 154 155 /* 156 * primary language sub-tag and the locale's language match, now 157 * fuzzy check country part 158 */ 159 160 /* neither langtag nor locale have more than one component */ 161 if (langtag[2] == '\0' && 162 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 163 return (2); 164 165 /* langtag has only one sub-tag... */ 166 if (langtag[2] == '\0') 167 return (1); 168 169 /* locale has no country code... */ 170 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 171 return (1); 172 173 /* langtag has more than one subtag and the locale has a country code */ 174 175 /* ignore second subtag if not two chars */ 176 if (strlen(langtag) < 5) 177 return (1); 178 179 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 180 (langtag[5] != '\0' && langtag[5] != '-')) 181 return (1); 182 183 /* ignore rest of locale if there is no two-character country part */ 184 if (strlen(locale) < 5) 185 return (1); 186 187 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 188 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 189 return (1); 190 191 /* if the country part matches, return 2 */ 192 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 193 return (2); 194 195 return (1); 196 } 197 198 char * 199 g11n_getlocale() 200 { 201 /* we have one text domain - always set it */ 202 (void) textdomain(TEXT_DOMAIN); 203 204 /* if the locale is not set, set it from the env vars */ 205 if (!setlocale(LC_MESSAGES, NULL)) 206 (void) setlocale(LC_MESSAGES, ""); 207 208 return (setlocale(LC_MESSAGES, NULL)); 209 } 210 211 void 212 g11n_setlocale(int category, const char *locale) 213 { 214 char *curr; 215 216 /* we have one text domain - always set it */ 217 (void) textdomain(TEXT_DOMAIN); 218 219 if (!locale) 220 return; 221 222 if (*locale && ((curr = setlocale(category, NULL))) && 223 strcmp(curr, locale) == 0) 224 return; 225 226 /* if <category> is bogus, setlocale() will do nothing */ 227 (void) setlocale(category, locale); 228 } 229 230 char ** 231 g11n_getlocales() 232 { 233 FILE *locale_out; 234 uint_t n_elems, list_size, long_line = 0; 235 char **list; 236 char locale[64]; /* 64 bytes is plenty for locale names */ 237 238 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) 239 return (NULL); 240 241 /* 242 * start with enough room for 65 locales - that's a lot fewer than 243 * all the locales available for installation, but a lot more than 244 * what most users will need and install 245 */ 246 n_elems = 0; 247 list_size = 192; 248 list = (char **) xmalloc(sizeof (char *) * (list_size + 1)); 249 memset(list, 0, sizeof (char *) * (list_size + 1)); 250 251 while (fgets(locale, sizeof (locale), locale_out)) { 252 /* skip long locale names (if any) */ 253 if (!strchr(locale, '\n')) { 254 long_line = 1; 255 continue; 256 } else if (long_line) { 257 long_line = 0; 258 continue; 259 } 260 261 if (strncmp(locale, "iso_8859", 8) == 0) 262 /* ignore locale names like "iso_8859-1" */ 263 continue; 264 265 if (n_elems == list_size) { 266 list_size *= 2; 267 list = (char **)xrealloc((void *) list, 268 (list_size + 1) * sizeof (char *)); 269 memset(&list[n_elems + 1], 0, 270 sizeof (char *) * (list_size - n_elems + 1)); 271 } 272 273 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 274 list[n_elems++] = xstrdup(locale); 275 } 276 277 (void) pclose(locale_out); 278 279 if (n_elems == 0) { 280 xfree(list); 281 return (NULL); 282 } 283 284 list[n_elems] = NULL; 285 286 qsort(list, n_elems - 1, sizeof (char *), locale_cmp); 287 return (list); 288 } 289 290 char * 291 g11n_getlangs() 292 { 293 char *locale; 294 295 if (getenv("SSH_LANGS")) 296 return (xstrdup(getenv("SSH_LANGS"))); 297 298 locale = g11n_getlocale(); 299 300 if (!locale || !*locale) 301 return (xstrdup("i-default")); 302 303 return (g11n_locale2langtag(locale)); 304 } 305 306 char * 307 g11n_locales2langs(char **locale_set) 308 { 309 char **p, **r, **q; 310 char *langtag, *langs; 311 int locales, skip; 312 313 for (locales = 0, p = locale_set; p && *p; p++) 314 locales++; 315 316 r = (char **)xmalloc((locales + 1) * sizeof (char *)); 317 memset(r, 0, (locales + 1) * sizeof (char *)); 318 319 for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) { 320 skip = 0; 321 if ((langtag = g11n_locale2langtag(*p)) == NULL) 322 continue; 323 for (q = r; (q - r) < locales; q++) { 324 if (!*q) 325 break; 326 if (*q && strcmp(*q, langtag) == 0) 327 skip = 1; 328 } 329 if (!skip) 330 *(q++) = langtag; 331 else 332 xfree(langtag); 333 *q = NULL; 334 } 335 336 langs = xjoin(r, ','); 337 g11n_freelist(r); 338 339 return (langs); 340 } 341 342 static int 343 sortcmp(const void *d1, const void *d2) 344 { 345 char *s1 = *(char **)d1; 346 char *s2 = *(char **)d2; 347 348 return (strcmp(s1, s2)); 349 } 350 351 int 352 g11n_langtag_match(char *langtag1, char *langtag2) 353 { 354 int len1, len2; 355 char c1, c2; 356 357 len1 = (strchr(langtag1, '-')) ? 358 (strchr(langtag1, '-') - langtag1) 359 : strlen(langtag1); 360 361 len2 = (strchr(langtag2, '-')) ? 362 (strchr(langtag2, '-') - langtag2) 363 : strlen(langtag2); 364 365 /* no match */ 366 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 367 return (0); 368 369 c1 = *(langtag1 + len1); 370 c2 = *(langtag2 + len2); 371 372 /* no country sub-tags - exact match */ 373 if (c1 == '\0' && c2 == '\0') 374 return (2); 375 376 /* one langtag has a country sub-tag, the other doesn't */ 377 if (c1 == '\0' || c2 == '\0') 378 return (1); 379 380 /* can't happen - both langtags have a country sub-tag */ 381 if (c1 != '-' || c2 != '-') 382 return (1); 383 384 /* compare country subtags */ 385 langtag1 = langtag1 + len1 + 1; 386 langtag2 = langtag2 + len2 + 1; 387 388 len1 = (strchr(langtag1, '-')) ? 389 (strchr(langtag1, '-') - langtag1) : strlen(langtag1); 390 391 len2 = (strchr(langtag2, '-')) ? 392 (strchr(langtag2, '-') - langtag2) : strlen(langtag2); 393 394 if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0) 395 return (1); 396 397 /* country tags matched - exact match */ 398 return (2); 399 } 400 401 char * 402 g11n_langtag_set_intersect(char *set1, char *set2) 403 { 404 char **list1, **list2, **list3, **p, **q, **r; 405 char *set3, *lang_subtag; 406 uint_t n1, n2, n3; 407 uint_t do_append; 408 409 list1 = xsplit(set1, ','); 410 list2 = xsplit(set2, ','); 411 412 for (n1 = 0, p = list1; p && *p; p++, n1++) 413 ; 414 for (n2 = 0, p = list2; p && *p; p++, n2++) 415 ; 416 417 list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1)); 418 *list3 = NULL; 419 420 /* 421 * we must not sort the user langtags - sorting or not the server's 422 * should not affect the outcome 423 */ 424 qsort(list2, n2, sizeof (char *), sortcmp); 425 426 for (n3 = 0, p = list1; p && *p; p++) { 427 do_append = 0; 428 for (q = list2; q && *q; q++) { 429 if (g11n_langtag_match(*p, *q) != 2) continue; 430 /* append element */ 431 for (r = list3; (r - list3) <= (n1 + n2); r++) { 432 do_append = 1; 433 if (!*r) 434 break; 435 if (strcmp(*p, *r) == 0) { 436 do_append = 0; 437 break; 438 } 439 } 440 if (do_append && n3 <= (n1 + n2)) { 441 list3[n3++] = xstrdup(*p); 442 list3[n3] = NULL; 443 } 444 } 445 } 446 447 for (p = list1; p && *p; p++) { 448 do_append = 0; 449 for (q = list2; q && *q; q++) { 450 if (g11n_langtag_match(*p, *q) != 1) 451 continue; 452 453 /* append element */ 454 lang_subtag = xstrdup(*p); 455 if (strchr(lang_subtag, '-')) 456 *(strchr(lang_subtag, '-')) = '\0'; 457 for (r = list3; (r - list3) <= (n1 + n2); r++) { 458 do_append = 1; 459 if (!*r) 460 break; 461 if (strcmp(lang_subtag, *r) == 0) { 462 do_append = 0; 463 break; 464 } 465 } 466 if (do_append && n3 <= (n1 + n2)) { 467 list3[n3++] = lang_subtag; 468 list3[n3] = NULL; 469 } else 470 xfree(lang_subtag); 471 } 472 } 473 474 set3 = xjoin(list3, ','); 475 xfree_split_list(list1); 476 xfree_split_list(list2); 477 xfree_split_list(list3); 478 479 return (set3); 480 } 481 482 char * 483 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 484 { 485 char *list, *result; 486 char **xlist; 487 488 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 489 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 490 491 if (!list) 492 return (NULL); 493 494 xlist = xsplit(list, ','); 495 496 xfree(list); 497 498 if (!xlist || !*xlist) 499 return (NULL); 500 501 result = xstrdup(*xlist); 502 xfree_split_list(xlist); 503 504 return (result); 505 } 506 507 /* 508 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 509 * a stright strcmp() 510 */ 511 static int 512 locale_cmp(const void *d1, const void *d2) 513 { 514 char *dot_ptr; 515 char *s1 = *(char **)d1; 516 char *s2 = *(char **)d2; 517 int s1_is_utf8 = 0; 518 int s2_is_utf8 = 0; 519 520 /* check if s1 is a UTF-8 locale */ 521 if (((dot_ptr = strchr((char *)s1, '.')) != NULL) && 522 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 523 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 524 s1_is_utf8++; 525 } 526 527 /* check if s2 is a UTF-8 locale */ 528 if (((dot_ptr = strchr((char *)s2, '.')) != NULL) && 529 (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) && 530 (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) { 531 s2_is_utf8++; 532 } 533 534 /* prefer UTF-8 locales */ 535 if (s1_is_utf8 && !s2_is_utf8) 536 return (-1); 537 538 if (s2_is_utf8 && !s1_is_utf8) 539 return (1); 540 541 /* prefer any locale over the default locales */ 542 if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 || 543 strcmp(s1, "common") == 0) { 544 if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 && 545 strcmp(s2, "common") != 0) 546 return (1); 547 } 548 549 if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 || 550 strcmp(s2, "common") == 0) { 551 if (strcmp(s1, "C") != 0 && 552 strcmp(s1, "POSIX") != 0 && 553 strcmp(s1, "common") != 0) 554 return (-1); 555 } 556 557 return (strcmp(s1, s2)); 558 } 559 560 561 char ** 562 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set) 563 { 564 char **langtag_list, **result, **p, **q, **r; 565 char *s; 566 uint_t do_append, n_langtags, n_locales, n_results, max_results; 567 568 /* count lang tags and locales */ 569 for (n_locales = 0, p = locale_set; p && *p; p++) 570 n_locales++; 571 572 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 573 /* count the number of langtags */ 574 for (; s = strchr(s, ','); s++, n_langtags++) 575 ; 576 577 qsort(locale_set, n_locales, sizeof (char *), locale_cmp); 578 579 langtag_list = xsplit(langtag_set, ','); 580 for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++) 581 ; 582 583 max_results = MIN(n_locales, n_langtags) * 2; 584 result = (char **) xmalloc(sizeof (char *) * (max_results + 1)); 585 *result = NULL; 586 n_results = 0; 587 588 /* more specific matches first */ 589 for (p = langtag_list; p && *p; p++) { 590 do_append = 0; 591 for (q = locale_set; q && *q; q++) { 592 if (g11n_langtag_matches_locale(*p, *q) == 2) { 593 do_append = 1; 594 for (r = result; (r - result) <= 595 MIN(n_locales, n_langtags); r++) { 596 if (!*r) 597 break; 598 if (strcmp(*q, *r) == 0) { 599 do_append = 0; 600 break; 601 } 602 } 603 if (do_append && n_results < max_results) { 604 result[n_results++] = xstrdup(*q); 605 result[n_results] = NULL; 606 } 607 break; 608 } 609 } 610 } 611 612 for (p = langtag_list; p && *p; p++) { 613 do_append = 0; 614 for (q = locale_set; q && *q; q++) { 615 if (g11n_langtag_matches_locale(*p, *q) == 1) { 616 do_append = 1; 617 for (r = result; (r - result) <= 618 MIN(n_locales, n_langtags); r++) { 619 if (!*r) 620 break; 621 if (strcmp(*q, *r) == 0) { 622 do_append = 0; 623 break; 624 } 625 } 626 if (do_append && n_results < max_results) { 627 result[n_results++] = xstrdup(*q); 628 result[n_results] = NULL; 629 } 630 break; 631 } 632 } 633 } 634 635 xfree_split_list(langtag_list); 636 637 return (result); 638 } 639 640 char * 641 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 642 { 643 char **results, **locales, *result = NULL; 644 645 if (srvr_locales == NULL) 646 locales = g11n_getlocales(); 647 else 648 locales = srvr_locales; 649 650 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 651 locales)) == NULL) 652 goto err; 653 654 if (*results != NULL) 655 result = xstrdup(*results); 656 657 xfree_split_list(results); 658 659 err: 660 if (locales != srvr_locales) 661 g11n_freelist(locales); 662 return (result); 663 } 664 665 666 /* 667 * Functions for validating ASCII and UTF-8 strings 668 * 669 * The error_str parameter is an optional pointer to a char variable 670 * where to store a string suitable for use with error() or fatal() or 671 * friends. 672 * 673 * The return value is 0 if success, EILSEQ or EINVAL. 674 * 675 */ 676 uint_t 677 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str) 678 { 679 uchar_t *p; 680 681 for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++) 682 ; 683 684 if (len && ((p - (uchar_t *)str) != len)) 685 return (EILSEQ); 686 687 return (0); 688 } 689 690 uint_t 691 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str) 692 { 693 uchar_t *p; 694 uint_t c, l; 695 696 if (len == 0) 697 len = strlen((const char *)str); 698 699 for (p = (uchar_t *)str; p && (p - str < len) && *p; ) { 700 /* 8-bit chars begin a UTF-8 sequence */ 701 if (*p & 0x80) { 702 /* get sequence length and sanity check first byte */ 703 if (*p < 0xc0) 704 return (EILSEQ); 705 else if (*p < 0xe0) 706 l = 2; 707 else if (*p < 0xf0) 708 l = 3; 709 else if (*p < 0xf8) 710 l = 4; 711 else if (*p < 0xfc) 712 l = 5; 713 else if (*p < 0xfe) 714 l = 6; 715 else 716 return (EILSEQ); 717 718 if ((p + l - str) >= len) 719 return (EILSEQ); 720 721 /* overlong detection - build codepoint */ 722 c = *p & 0x3f; 723 /* shift c bits from first byte */ 724 c = c << (6 * (l - 1)); 725 726 if (l > 1) { 727 if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80)) 728 c = c | ((*(p + 1) & 0x3f) << 729 (6 * (l - 2))); 730 else 731 return (EILSEQ); 732 733 if (c < 0x80) 734 return (EILSEQ); 735 } 736 737 if (l > 2) { 738 if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80)) 739 c = c | ((*(p + 2) & 0x3f) << 740 (6 * (l - 3))); 741 else 742 return (EILSEQ); 743 744 if (c < 0x800) 745 return (EILSEQ); 746 } 747 748 if (l > 3) { 749 if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80)) 750 c = c | ((*(p + 3) & 0x3f) << 751 (6 * (l - 4))); 752 else 753 return (EILSEQ); 754 755 if (c < 0x10000) 756 return (EILSEQ); 757 } 758 759 if (l > 4) { 760 if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80)) 761 c = c | ((*(p + 4) & 0x3f) << 762 (6 * (l - 5))); 763 else 764 return (EILSEQ); 765 766 if (c < 0x200000) 767 return (EILSEQ); 768 } 769 770 if (l > 5) { 771 if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80)) 772 c = c | (*(p + 5) & 0x3f); 773 else 774 return (EILSEQ); 775 776 if (c < 0x4000000) 777 return (EILSEQ); 778 } 779 780 /* 781 * check for UTF-16 surrogates ifs other illegal 782 * UTF-8 * points 783 */ 784 if (((c <= 0xdfff) && (c >= 0xd800)) || 785 (c == 0xfffe) || (c == 0xffff)) 786 return (EILSEQ); 787 p += l; 788 } 789 /* 7-bit chars are fine */ 790 else 791 p++; 792 } 793 return (0); 794 } 795 796 /* 797 * Functions for converting to ASCII or UTF-8 from the local codeset 798 * Functions for converting from ASCII or UTF-8 to the local codeset 799 * 800 * The error_str parameter is an optional pointer to a char variable 801 * where to store a string suitable for use with error() or fatal() or 802 * friends. 803 * 804 * The err parameter is an optional pointer to an integer where 0 805 * (success) or EILSEQ or EINVAL will be stored (failure). 806 * 807 * These functions return NULL if the conversion fails. 808 * 809 */ 810 uchar_t * 811 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str) 812 { 813 static uint_t initialized = 0; 814 static uint_t do_convert = 0; 815 iconv_t cd; 816 int err; 817 818 if (!initialized) { 819 /* 820 * iconv_open() fails if the to/from codesets are the 821 * same, and there are aliases of codesets to boot... 822 */ 823 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 824 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 825 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 826 initialized = 1; 827 do_convert = 0; 828 } else { 829 cd = iconv_open(nl_langinfo(CODESET), "646"); 830 if (cd == (iconv_t)-1) { 831 if (err_ptr) 832 *err_ptr = errno; 833 if (error_str) 834 *error_str = (uchar_t *)"Cannot " 835 "convert ASCII strings to the local" 836 " codeset"; 837 } 838 initialized = 1; 839 do_convert = 1; 840 } 841 } 842 843 if (!do_convert) { 844 if ((err = g11n_validate_ascii(str, 0, error_str))) { 845 if (err_ptr) 846 *err_ptr = err; 847 return (NULL); 848 } else 849 return ((uchar_t *)xstrdup(str)); 850 } 851 852 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 853 } 854 855 uchar_t * 856 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 857 { 858 static uint_t initialized = 0; 859 static uint_t do_convert = 0; 860 iconv_t cd; 861 int err; 862 863 if (!initialized) { 864 /* 865 * iconv_open() fails if the to/from codesets are the 866 * same, and there are aliases of codesets to boot... 867 */ 868 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 869 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 870 initialized = 1; 871 do_convert = 0; 872 } else { 873 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 874 if (cd == (iconv_t)-1) { 875 if (err_ptr) 876 *err_ptr = errno; 877 if (error_str) 878 *error_str = (uchar_t *)"Cannot " 879 "convert UTF-8 strings to the " 880 "local codeset"; 881 } 882 initialized = 1; 883 do_convert = 1; 884 } 885 } 886 887 if (!do_convert) { 888 if ((err = g11n_validate_utf8(str, 0, error_str))) { 889 if (err_ptr) 890 *err_ptr = err; 891 return (NULL); 892 } else 893 return ((uchar_t *)xstrdup((char *)str)); 894 } 895 896 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 897 } 898 899 char * 900 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str) 901 { 902 static uint_t initialized = 0; 903 static uint_t do_convert = 0; 904 iconv_t cd; 905 906 if (!initialized) { 907 /* 908 * iconv_open() fails if the to/from codesets are the 909 * same, and there are aliases of codesets to boot... 910 */ 911 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 912 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 913 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 914 initialized = 1; 915 do_convert = 0; 916 } else { 917 cd = iconv_open("646", nl_langinfo(CODESET)); 918 if (cd == (iconv_t)-1) { 919 if (err_ptr) 920 *err_ptr = errno; 921 if (error_str) 922 *error_str = (uchar_t *)"Cannot " 923 "convert UTF-8 strings to the " 924 "local codeset"; 925 } 926 initialized = 1; 927 do_convert = 1; 928 } 929 } 930 931 if (!do_convert) 932 return (xstrdup((char *)str)); 933 934 return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 935 } 936 937 uchar_t * 938 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str) 939 { 940 static uint_t initialized = 0; 941 static uint_t do_convert = 0; 942 iconv_t cd; 943 944 if (!initialized) { 945 /* 946 * iconv_open() fails if the to/from codesets are the 947 * same, and there are aliases of codesets to boot... 948 */ 949 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 950 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 951 initialized = 1; 952 do_convert = 0; 953 } else { 954 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 955 if (cd == (iconv_t)-1) { 956 if (err_ptr) 957 *err_ptr = errno; 958 if (error_str) 959 *error_str = (uchar_t *)"Cannot " 960 "convert UTF-8 strings to the " 961 "local codeset"; 962 } 963 initialized = 1; 964 do_convert = 1; 965 } 966 } 967 968 if (!do_convert) 969 return ((uchar_t *)xstrdup((char *)str)); 970 971 return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str)); 972 } 973 974 975 /* 976 * Wrapper around iconv() 977 * 978 * The caller is responsible for freeing the result and for handling 979 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 980 */ 981 static uchar_t * 982 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len, 983 uint_t *outlen, int *err, uchar_t **err_str) 984 { 985 size_t inbytesleft, outbytesleft, converted_size; 986 char *outbuf; 987 uchar_t *converted; 988 const char *inbuf; 989 uint_t mul = 0; 990 991 if (!buf || !(*(char *)buf)) 992 return (NULL); 993 994 if (len == 0) 995 len = strlen(buf); 996 997 /* reset conversion descriptor */ 998 /* XXX Do we need initial shift sequences for UTF-8??? */ 999 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 1000 inbuf = (const char *) buf; 1001 1002 if (mul_ptr) 1003 mul = *mul_ptr; 1004 1005 converted_size = (len << mul); 1006 outbuf = (char *)xmalloc(converted_size + 1); /* for null */ 1007 converted = (uchar_t *)outbuf; 1008 outbytesleft = len; 1009 1010 do { 1011 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 1012 (size_t)-1) { 1013 if (errno == E2BIG) { 1014 /* UTF-8 codepoints are at most 8 bytes long */ 1015 if (mul > 2) { 1016 if (err_str) 1017 *err_str = (uchar_t *) 1018 "Conversion to UTF-8 failed" 1019 " due to preposterous space" 1020 " requirements"; 1021 if (err) 1022 *err = EILSEQ; 1023 return (NULL); 1024 } 1025 1026 /* 1027 * re-alloc output and ensure that the outbuf 1028 * and outbytesleft values are adjusted 1029 */ 1030 converted = xrealloc(converted, 1031 converted_size << 1 + 1); 1032 outbuf = (char *)converted + converted_size - 1033 outbytesleft; 1034 converted_size = (len << ++(mul)); 1035 outbytesleft = converted_size - outbytesleft; 1036 } else { 1037 /* 1038 * let the caller deal with iconv() errors, 1039 * probably by calling fatal(); xfree() does 1040 * not set errno 1041 */ 1042 if (err) 1043 *err = errno; 1044 xfree(converted); 1045 return (NULL); 1046 } 1047 } 1048 } while (inbytesleft); 1049 1050 *outbuf = '\0'; /* ensure null-termination */ 1051 if (outlen) 1052 *outlen = converted_size - outbytesleft; 1053 if (mul_ptr) 1054 *mul_ptr = mul; 1055 1056 return (converted); 1057 } 1058 1059 /* 1060 * Free all strings in the list and then free the list itself. We know that the 1061 * list ends with a NULL pointer. 1062 */ 1063 void 1064 g11n_freelist(char **list) 1065 { 1066 int i = 0; 1067 1068 while (list[i] != NULL) { 1069 xfree(list[i]); 1070 i++; 1071 } 1072 1073 xfree(list); 1074 } 1075