1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 * 22 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <errno.h> 29 #include <locale.h> 30 #include <langinfo.h> 31 #include <iconv.h> 32 #include <ctype.h> 33 #include <strings.h> 34 #include <string.h> 35 #include <stdio.h> 36 #include <stdlib.h> 37 #include "includes.h" 38 #include "xmalloc.h" 39 #include "xlist.h" 40 41 #ifdef MIN 42 #undef MIN 43 #endif /* MIN */ 44 45 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 46 47 #define LOCALE_PATH "/usr/bin/locale" 48 49 #define LANGTAG_MAX 5 /* two-char country code, '-' and two-char region code */ 50 51 static u_char * do_iconv(iconv_t cd, u_int *mul_ptr, 52 const void *buf, u_int len, 53 u_int *outlen, int *err, 54 u_char **err_str); 55 56 static int locale_cmp(const void *d1, const void *d2); 57 static char *g11n_locale2langtag(char *locale); 58 59 u_int 60 g11n_validate_ascii(const char *str, u_int len, u_char **error_str); 61 62 u_int 63 g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str); 64 65 static 66 char * 67 g11n_locale2langtag(char *locale) 68 { 69 char *langtag; 70 71 /* base cases */ 72 if (!locale || !*locale) return NULL; 73 74 if (strcmp(locale, "POSIX") == 0 || 75 strcmp(locale, "C") == 0) return "i-default"; 76 77 /* Punt for language codes which are not exactly 2 letters */ 78 if (strlen(locale) < 2 || 79 !isalpha(locale[0]) || 80 !isalpha(locale[1]) || 81 (locale[2] != '\0' && 82 locale[2] != '_' && 83 locale[2] != '.' && 84 locale[2] != '@')) 85 return NULL; 86 87 88 /* We have a primary language sub-tag */ 89 langtag = (char *) xmalloc(LANGTAG_MAX + 1); 90 91 strncpy(langtag, locale, 2); 92 langtag[2] = '\0'; 93 94 /* Do we have country sub-tag? */ 95 if (locale[2] == '_') { 96 if (strlen(locale) < 5 || 97 !isalpha(locale[3]) || 98 !isalpha(locale[4]) || 99 (locale[5] != '\0' && (locale[5] != '.' && locale[5] != '@'))) { 100 return langtag; 101 } 102 103 /* yes, we do */ 104 /* if (snprintf(langtag, 6, "%s-%s,%s", lang_subtag, 105 country_subtag, langtag) == 8) */ 106 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 107 2, locale+3) == 5) 108 return langtag; 109 } 110 111 /* In all other cases we just use the primary language sub-tag */ 112 return langtag; 113 } 114 115 u_int 116 g11n_langtag_is_default(char *langtag) 117 { 118 return (strcmp(langtag, "i-default") == 0); 119 } 120 121 /* 122 * This lang tag / locale matching function works only for two-character 123 * language primary sub-tags and two-character country sub-tags. 124 */ 125 u_int 126 g11n_langtag_matches_locale(char *langtag, char *locale) 127 { 128 /* Match "i-default" to the process' current locale if possible */ 129 if (g11n_langtag_is_default(langtag)) { 130 if (strcasecmp(locale, "POSIX") == 0 || 131 strcasecmp(locale, "C") == 0) 132 return 1; 133 else 134 return 0; 135 } 136 137 /* locale must be at least 2 chars long and the lang part must be 138 * exactly two characters */ 139 if (strlen(locale) < 2 || 140 (!isalpha(locale[0]) || !isalpha(locale[1]) || 141 (locale[2] != '\0' && locale[2] != '_' && locale[2] != '.' && locale[2] != '@'))) 142 return 0; 143 144 /* same thing with the langtag */ 145 if (strlen(langtag) < 2 || 146 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 147 (langtag[2] != '\0' && langtag[2] != '-'))) 148 return 0; 149 150 /* primary language sub-tag and the locale's language part must match */ 151 if (strncasecmp(langtag, locale, 2) != 0) 152 return 0; 153 154 /* primary language sub-tag and the locale's language match, now 155 * fuzzy check country part */ 156 157 /* neither langtag nor locale have more than one component */ 158 if (langtag[2] == '\0' && 159 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 160 return 2; 161 162 /* langtag has only one sub-tag... */ 163 if (langtag[2] == '\0') 164 return 1; 165 166 /* locale has no country code... */ 167 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 168 return 1; 169 170 /* langtag has more than one subtag and the locale has a country code */ 171 172 /* ignore second subtag if not two chars */ 173 if (strlen(langtag) < 5) 174 return 1; 175 176 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 177 (langtag[5] != '\0' && langtag[5] != '-')) 178 return 1; 179 180 /* ignore rest of locale if there is no two-character country part */ 181 if (strlen(locale) < 5) 182 return 1; 183 184 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 185 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 186 return 1; 187 188 /* if the country part matches, return 2 */ 189 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 190 return 2; 191 192 return 1; 193 } 194 195 char * 196 g11n_getlocale() 197 { 198 /* We have one text domain - always set it */ 199 (void) textdomain(TEXT_DOMAIN); 200 201 /* If the locale is not set, set it from the env vars */ 202 if (!setlocale(LC_CTYPE, NULL)) 203 (void) setlocale(LC_CTYPE, ""); 204 205 return setlocale(LC_CTYPE, NULL); 206 } 207 208 void 209 g11n_setlocale(int category, const char *locale) 210 { 211 char *curr; 212 213 /* We have one text domain - always set it */ 214 (void) textdomain(TEXT_DOMAIN); 215 216 if (!locale) 217 return; 218 219 if (*locale && ((curr = setlocale(category, NULL))) && 220 strcmp(curr, locale) == 0) 221 return; 222 223 /* 224 * If <category> is bogus, setlocale() will do nothing and will 225 * return NULL. 226 */ 227 if (!setlocale(category, locale)) 228 return; 229 230 /* If setting the locale from the environment, then we're done */ 231 if (!*locale) 232 return; 233 234 /* 235 * If setting a locale from the <locale> argument, then set the 236 * related env vars. 237 */ 238 switch (category) { 239 case LC_ALL: 240 setenv("LANG", locale, 1); 241 setenv("LC_ALL", locale, 1); 242 break; 243 case LC_CTYPE: 244 setenv("LC_CTYPE", locale, 1); 245 break; 246 case LC_NUMERIC: 247 setenv("LC_NUMERIC", locale, 1); 248 break; 249 case LC_TIME: 250 setenv("LC_TIME", locale, 1); 251 break; 252 case LC_COLLATE: 253 setenv("LC_COLLATE", locale, 1); 254 break; 255 case LC_MONETARY: 256 setenv("LC_MONETARY", locale, 1); 257 break; 258 case LC_MESSAGES: 259 setenv("LC_MESSAGES", locale, 1); 260 break; 261 } 262 return; 263 } 264 265 char ** 266 g11n_getlocales() 267 { 268 FILE *locale_out; 269 u_int n_elems, list_size, long_line = 0; 270 char **list; 271 char locale[64]; /* 64 bytes is plenty for locale names */ 272 273 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) { 274 return NULL; 275 } 276 277 /* 278 * Start with enough room for 65 locales - that's a lot fewer than 279 * all the locales available for installation, but a lot more than 280 * what most users will need and install 281 */ 282 n_elems=0; 283 list_size=192; 284 list = (char **) xmalloc(sizeof(char *) * (list_size + 1)); 285 memset(list, 0, sizeof(char *) * (list_size + 1)); 286 287 while (fgets(locale, sizeof(locale), locale_out)) { 288 /* skip long locale names (if any) */ 289 if (!strchr(locale, '\n')) { 290 long_line = 1; 291 continue; 292 } 293 else if (long_line) { 294 long_line = 0; 295 continue; 296 } 297 if (strncmp(locale, "iso_8859", 8) == 0) 298 continue; /* ignore locale names like "iso_8859-1" */ 299 300 if (n_elems == list_size) { 301 list_size *= 2; 302 list = (char **) xrealloc((void *) list, (list_size + 1) * sizeof(char *)); 303 memset(&list[n_elems+1], 0, sizeof(char *) * (list_size - n_elems + 1)); 304 } 305 306 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 307 308 list[n_elems++] = xstrdup(locale); 309 } 310 list[n_elems] = NULL; 311 (void) pclose(locale_out); 312 313 qsort(list, n_elems - 1, sizeof(char *), locale_cmp); 314 return list; 315 } 316 317 char * 318 g11n_getlangs() 319 { 320 char *locale; 321 322 if (getenv("SSH_LANGS")) 323 return xstrdup(getenv("SSH_LANGS")); 324 325 locale = g11n_getlocale(); 326 327 if (!locale || !*locale) 328 return xstrdup("i-default"); 329 330 return g11n_locale2langtag(locale); 331 } 332 333 char * 334 g11n_locales2langs(char **locale_set) 335 { 336 char **p, **r, **q; 337 char *langtag; 338 int locales, skip; 339 340 for (locales = 0, p = locale_set ; p && *p ; p++) 341 locales++; 342 343 r = (char **) xmalloc((locales + 1) * sizeof(char *)); 344 memset(r, 0, (locales + 1) * sizeof(char *)); 345 346 for (p = locale_set ; p && *p && ((p - locale_set) <= locales); p++) { 347 skip = 0; 348 if ((langtag = g11n_locale2langtag(*p)) == NULL) 349 continue; 350 for (q = r ; (q - r) < locales ; q++) { 351 if (!*q) break; 352 if (*q && strcmp(*q, langtag) == 0) 353 skip = 1; 354 } 355 if (!skip) 356 *(q++) = langtag; 357 *q = NULL; 358 } 359 return xjoin(r, ','); 360 } 361 362 static 363 int 364 sortcmp(const void *d1, const void *d2) 365 { 366 char *s1 = *(char **)d1; 367 char *s2 = *(char **)d2; 368 369 return strcmp(s1, s2); 370 } 371 372 int 373 g11n_langtag_match(char *langtag1, char *langtag2) 374 { 375 int len1, len2; 376 char c1, c2; 377 378 len1 = (strchr(langtag1, '-')) ? 379 (strchr(langtag1, '-') - langtag1) 380 : strlen(langtag1); 381 382 len2 = (strchr(langtag2, '-')) ? 383 (strchr(langtag2, '-') - langtag2) 384 : strlen(langtag2); 385 386 /* no match */ 387 if (len1 != len2 || 388 strncmp(langtag1, langtag2, len1) != 0) 389 return 0; 390 391 c1 = *(langtag1 + len1); 392 c2 = *(langtag2 + len2); 393 394 /* no country sub-tags - exact match */ 395 if (c1 == '\0' && c2 == '\0') 396 return 2; 397 398 /* one langtag has a country sub-tag, the other doesn't */ 399 if (c1 == '\0' || c2 == '\0') 400 return 1; 401 402 /* can't happen - both langtags have a country sub-tag */ 403 if (c1 != '-' || c2 != '-') 404 return 1; 405 406 /* compare country subtags */ 407 langtag1 = langtag1 + len1 + 1; 408 langtag2 = langtag2 + len2 + 1; 409 410 len1 = (strchr(langtag1, '-')) ? 411 (strchr(langtag1, '-') - langtag1) 412 : strlen(langtag1); 413 414 len2 = (strchr(langtag2, '-')) ? 415 (strchr(langtag2, '-') - langtag2) 416 : strlen(langtag2); 417 418 if (len1 != len2 || 419 strncmp(langtag1, langtag2, len1) != 0) 420 return 1; 421 422 /* country tags matched - exact match */ 423 return 2; 424 } 425 426 char * 427 g11n_langtag_set_intersect(char *set1, char *set2) 428 { 429 char **list1, **list2, **list3, **p, **q, **r; 430 char *set3, *lang_subtag; 431 u_int n1, n2, n3; 432 u_int do_append; 433 434 list1 = xsplit(set1, ','); 435 list2 = xsplit(set2, ','); 436 for (n1 = 0, p = list1 ; p && *p ; p++, n1++) ; 437 for (n2 = 0, p = list2 ; p && *p ; p++, n2++) ; 438 439 list3 = (char **) xmalloc(sizeof(char *) * (n1 + n2 + 1)); 440 *list3 = NULL; 441 442 /* we must not sort the user langtags - sorting or not the server's 443 * should not affect the outcome 444 */ 445 qsort(list2, n2, sizeof(char *), sortcmp); 446 447 for (n3 = 0, p = list1 ; p && *p ; p++) { 448 do_append = 0; 449 for (q = list2 ; q && *q ; q++) { 450 if (g11n_langtag_match(*p, *q) != 2) continue; 451 /* append element */ 452 for (r = list3; (r - list3) <= (n1 + n2) ; r++) { 453 do_append = 1; 454 if (!*r) break; 455 if (strcmp(*p, *r) == 0) { 456 do_append = 0; 457 break; 458 } 459 } 460 if (do_append && n3 <= (n1 + n2)) { 461 list3[n3++] = xstrdup(*p); 462 list3[n3] = NULL; 463 } 464 } 465 } 466 467 for (p = list1 ; p && *p ; p++) { 468 do_append = 0; 469 for (q = list2 ; q && *q ; q++) { 470 if (g11n_langtag_match(*p, *q) != 1) continue; 471 /* append element */ 472 lang_subtag = xstrdup(*p); 473 if (strchr(lang_subtag, '-')) 474 *(strchr(lang_subtag, '-')) = '\0'; 475 for (r = list3; (r - list3) <= (n1 + n2) ; r++) { 476 do_append = 1; 477 if (!*r) break; 478 if (strcmp(lang_subtag, *r) == 0) { 479 do_append = 0; 480 break; 481 } 482 } 483 if (do_append && n3 <= (n1 + n2)) { 484 list3[n3++] = lang_subtag; 485 list3[n3] = NULL; 486 } 487 else 488 xfree(lang_subtag); 489 } 490 } 491 492 set3 = xjoin(list3, ','); 493 xfree_split_list(list1); 494 xfree_split_list(list2); 495 xfree_split_list(list3); 496 497 return set3; 498 } 499 500 char * 501 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 502 { 503 char *list, *result; 504 char **xlist; 505 506 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 507 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 508 509 if (!list) 510 return NULL; 511 512 xlist = xsplit(list, ','); 513 514 xfree(list); 515 516 if (!xlist || !*xlist) 517 return NULL; 518 519 result = xstrdup(*xlist); 520 521 xfree_split_list(xlist); 522 523 return result; 524 } 525 526 /* 527 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 528 * a stright strcmp() 529 */ 530 static 531 int 532 locale_cmp(const void *d1, const void *d2) 533 { 534 char *dot_ptr; 535 char *s1 = *(char **)d1; 536 char *s2 = *(char **)d2; 537 int s1_is_utf8 = 0; 538 int s2_is_utf8 = 0; 539 540 /* check if s1 is a UTF-8 locale */ 541 if (((dot_ptr = strchr((char *) s1, '.')) != NULL) && (*dot_ptr != '\0') && 542 (strncmp(dot_ptr+1, "UTF-8", 5) == 0) && 543 (*(dot_ptr+6) == '\0' || *(dot_ptr+6) == '@')) { 544 s1_is_utf8++; 545 } 546 /* check if s2 is a UTF-8 locale */ 547 if (((dot_ptr = strchr((char *) s2, '.')) != NULL) && (*dot_ptr != '\0') && 548 (strncmp(dot_ptr+1, "UTF-8", 5) == 0) && 549 (*(dot_ptr+6) == '\0' || *(dot_ptr+6) == '@')) { 550 s2_is_utf8++; 551 } 552 553 /* prefer UTF-8 locales */ 554 if (s1_is_utf8 && !s2_is_utf8) 555 return -1; 556 557 if (s2_is_utf8 && !s1_is_utf8) 558 return 1; 559 560 /* prefer any locale over the default locales */ 561 if (strcmp(s1, "C") == 0 || 562 strcmp(s1, "POSIX") == 0 || 563 strcmp(s1, "common") == 0) 564 if (strcmp(s2, "C") != 0 && 565 strcmp(s2, "POSIX") != 0 && 566 strcmp(s2, "common") != 0) 567 return 1; 568 569 if (strcmp(s2, "C") == 0 || 570 strcmp(s2, "POSIX") == 0 || 571 strcmp(s2, "common") == 0) 572 if (strcmp(s1, "C") != 0 && 573 strcmp(s1, "POSIX") != 0 && 574 strcmp(s1, "common") != 0) 575 return -1; 576 577 return strcmp(s1, s2); 578 } 579 580 581 char ** 582 g11n_langtag_set_locale_set_intersect(char *langtag_set, 583 char **locale_set) 584 { 585 char **langtag_list, **result, **p, **q, **r; 586 char *s; 587 u_int do_append, n_langtags, n_locales, n_results, max_results; 588 589 /* Count lang tags and locales */ 590 for (n_locales = 0, p = locale_set ; p && *p ; p++) n_locales++; 591 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 592 for ( ; s = strchr(s, ',') ; s++, n_langtags++) ; 593 /* 594 while ((s = strchr(s, ','))) { 595 n_langtags++; 596 s++; 597 } 598 */ 599 600 qsort(locale_set, n_locales, sizeof(char *), locale_cmp); 601 602 langtag_list = xsplit(langtag_set, ','); 603 for ( n_langtags = 0, p = langtag_list ; p && *p ; p++, n_langtags++); 604 605 max_results = MIN(n_locales, n_langtags) * 2; 606 result = (char **) xmalloc(sizeof(char *) * (max_results + 1)); 607 *result = NULL; 608 n_results = 0; 609 610 /* More specific matches first */ 611 for (p = langtag_list ; p && *p ; p++) { 612 do_append = 0; 613 for (q = locale_set ; q && *q ; q++) { 614 if (g11n_langtag_matches_locale(*p, *q) == 2) { 615 do_append = 1; 616 for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) { 617 if (!*r) break; 618 if (strcmp(*q, *r) == 0) { 619 do_append = 0; 620 break; 621 } 622 } 623 if (do_append && n_results < max_results) { 624 result[n_results++] = xstrdup(*q); 625 result[n_results] = NULL; 626 } 627 break; 628 } 629 } 630 } 631 632 for (p = langtag_list ; p && *p ; p++) { 633 do_append = 0; 634 for (q = locale_set ; q && *q ; q++) { 635 if (g11n_langtag_matches_locale(*p, *q) == 1) { 636 do_append = 1; 637 for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) { 638 if (!*r) break; 639 if (strcmp(*q, *r) == 0) { 640 do_append = 0; 641 break; 642 } 643 } 644 if (do_append && n_results < max_results) { 645 result[n_results++] = xstrdup(*q); 646 result[n_results] = NULL; 647 } 648 break; 649 } 650 } 651 } 652 xfree_split_list(langtag_list); 653 654 return result; 655 } 656 657 char * 658 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 659 { 660 char **results, *result = NULL; 661 662 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 663 srvr_locales ? srvr_locales : g11n_getlocales())) == NULL) 664 return NULL; 665 666 if (*results != NULL) 667 result = xstrdup(*results); 668 669 xfree_split_list(results); 670 671 return result; 672 } 673 674 675 /* 676 * Functions for validating ASCII and UTF-8 strings 677 * 678 * The error_str parameter is an optional pointer to a char variable 679 * where to store a string suitable for use with error() or fatal() or 680 * friends. 681 * 682 * The return value is 0 if success, EILSEQ or EINVAL. 683 * 684 */ 685 686 u_int 687 g11n_validate_ascii(const char *str, u_int len, u_char **error_str) 688 { 689 u_char *p; 690 691 for (p = (u_char *) str ; p && *p && (!(*p & 0x80)) ; p++) ; 692 693 if (len && ((p - (u_char *) str) != len)) { 694 return EILSEQ; 695 } 696 return 0; 697 } 698 699 u_int 700 g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str) 701 { 702 u_char *p; 703 u_int c, l; 704 705 if (len == 0) len = strlen((const char *)str); 706 707 for (p = (u_char *) str ; p && (p - str < len) && *p ; ) { 708 /* 8-bit chars begin a UTF-8 sequence */ 709 if (*p & 0x80) { 710 /* Get sequence length and sanity check first byte */ 711 if (*p < 0xc0) 712 return EILSEQ; 713 else if (*p < 0xe0) 714 l=2; 715 else if (*p < 0xf0) 716 l=3; 717 else if (*p < 0xf8) 718 l=4; 719 else if (*p < 0xfc) 720 l=5; 721 else if (*p < 0xfe) 722 l=6; 723 else 724 return EILSEQ; 725 726 if ((p + l - str) >= len) 727 return EILSEQ; 728 729 /* overlong detection - build codepoint */ 730 c = *p & 0x3f; 731 c = c << (6 * (l-1)); /* shift c bits from first byte */ 732 733 if (l > 1) { 734 if (*(p+1) && ((*(p+1) & 0xc0) == 0x80)) 735 c = c | ((*(p+1) & 0x3f) << (6 * (l-2))); 736 else 737 return EILSEQ; 738 if (c < 0x80) 739 return EILSEQ; 740 } 741 if (l > 2) { 742 if (*(p+2) && ((*(p+2) & 0xc0) == 0x80)) 743 c = c | ((*(p+2) & 0x3f) << (6 * (l-3))); 744 else 745 return EILSEQ; 746 if (c < 0x800) 747 return EILSEQ; 748 } 749 if (l > 3) { 750 if (*(p+3) && ((*(p+3) & 0xc0) == 0x80)) 751 c = c | ((*(p+3) & 0x3f) << (6 * (l-4))); 752 else 753 return EILSEQ; 754 if (c < 0x10000) 755 return EILSEQ; 756 } 757 if (l > 4) { 758 if (*(p+4) && ((*(p+4) & 0xc0) == 0x80)) 759 c = c | ((*(p+4) & 0x3f) << (6 * (l-5))); 760 else 761 return EILSEQ; 762 if (c < 0x200000) 763 return EILSEQ; 764 } 765 if (l > 5) { 766 if (*(p+5) && ((*(p+5) & 0xc0) == 0x80)) 767 c = c | (*(p+5) & 0x3f) ; 768 else 769 return EILSEQ; 770 if (c < 0x4000000) 771 return EILSEQ; 772 } 773 774 /* Check for UTF-16 surrogates ifs other illegal UTF-8 * points */ 775 if (((c <= 0xdfff) && (c >= 0xd800)) || 776 (c == 0xfffe) || (c == 0xffff)) 777 return EILSEQ; 778 p += l; 779 } 780 /* 7-bit chars are fine */ 781 else 782 p++; 783 } 784 return 0; 785 } 786 787 /* 788 * Functions for converting to ASCII or UTF-8 from the local codeset 789 * Functions for converting from ASCII or UTF-8 to the local codeset 790 * 791 * The error_str parameter is an optional pointer to a char variable 792 * where to store a string suitable for use with error() or fatal() or 793 * friends. 794 * 795 * The err parameter is an optional pointer to an integer where 0 796 * (success) or EILSEQ or EINVAL will be stored (failure). 797 * 798 * These functions return NULL if the conversion fails. 799 * 800 */ 801 802 u_char * 803 g11n_convert_from_ascii(const char *str, int *err_ptr, u_char **error_str) 804 { 805 static u_int initialized = 0; 806 static u_int do_convert = 0; 807 iconv_t cd; 808 int err; 809 810 if (!initialized) { 811 /* 812 * iconv_open() fails if the to/from codesets are the 813 * same, and there are aliases of codesets to boot... 814 */ 815 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 816 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 817 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 818 initialized = 1; 819 do_convert = 0; 820 } 821 else { 822 cd = iconv_open(nl_langinfo(CODESET), "646"); 823 if (cd == (iconv_t) -1) { 824 if (err_ptr) *err_ptr = errno; 825 if (error_str) *error_str = (u_char *) 826 "Cannot convert ASCII strings to the local codeset"; 827 } 828 initialized = 1; 829 do_convert = 1; 830 } 831 } 832 833 if (!do_convert) { 834 if ((err = g11n_validate_ascii(str, 0, error_str))) { 835 if (err_ptr) *err_ptr = err; 836 return NULL; 837 } 838 else 839 return (u_char *) xstrdup(str); 840 } 841 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 842 } 843 844 u_char * 845 g11n_convert_from_utf8(const u_char *str, int *err_ptr, u_char **error_str) 846 { 847 static u_int initialized = 0; 848 static u_int do_convert = 0; 849 iconv_t cd; 850 int err; 851 852 if (!initialized) { 853 /* 854 * iconv_open() fails if the to/from codesets are the 855 * same, and there are aliases of codesets to boot... 856 */ 857 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 858 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 859 initialized = 1; 860 do_convert = 0; 861 } 862 else { 863 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 864 if (cd == (iconv_t) -1) { 865 if (err_ptr) *err_ptr = errno; 866 if (error_str) *error_str = (u_char *) 867 "Cannot convert UTF-8 strings to the local codeset"; 868 } 869 initialized = 1; 870 do_convert = 1; 871 } 872 } 873 874 if (!do_convert) { 875 if ((err = g11n_validate_utf8(str, 0, error_str))) { 876 if (err_ptr) *err_ptr = err; 877 return NULL; 878 } 879 else 880 return (u_char *) xstrdup((char *) str); 881 } 882 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 883 } 884 885 char * 886 g11n_convert_to_ascii(const u_char *str, int *err_ptr, u_char **error_str) 887 { 888 static u_int initialized = 0; 889 static u_int do_convert = 0; 890 iconv_t cd; 891 892 if (!initialized) { 893 /* 894 * iconv_open() fails if the to/from codesets are the 895 * same, and there are aliases of codesets to boot... 896 */ 897 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 898 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 899 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 900 initialized = 1; 901 do_convert = 0; 902 } 903 else { 904 cd = iconv_open("646", nl_langinfo(CODESET)); 905 if (cd == (iconv_t) -1) { 906 if (err_ptr) *err_ptr = errno; 907 if (error_str) *error_str = (u_char *) 908 "Cannot convert UTF-8 strings to the local codeset"; 909 } 910 initialized = 1; 911 do_convert = 1; 912 } 913 } 914 915 if (!do_convert) 916 return xstrdup((char *) str); 917 return (char *) do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 918 } 919 920 u_char * 921 g11n_convert_to_utf8(const u_char *str, int *err_ptr, u_char **error_str) 922 { 923 static u_int initialized = 0; 924 static u_int do_convert = 0; 925 iconv_t cd; 926 927 if (!initialized) { 928 /* 929 * iconv_open() fails if the to/from codesets are the 930 * same, and there are aliases of codesets to boot... 931 */ 932 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 933 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 934 initialized = 1; 935 do_convert = 0; 936 } 937 else { 938 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 939 if (cd == (iconv_t) -1) { 940 if (err_ptr) *err_ptr = errno; 941 if (error_str) *error_str = (u_char *) 942 "Cannot convert UTF-8 strings to the local codeset"; 943 } 944 initialized = 1; 945 do_convert = 1; 946 } 947 } 948 949 if (!do_convert) 950 return (u_char *) xstrdup((char *) str); 951 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 952 } 953 954 955 /* 956 * Wrapper around iconv() 957 * 958 * The caller is responsible for freeing the result and for handling 959 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 960 */ 961 962 static 963 u_char * 964 do_iconv(iconv_t cd, u_int *mul_ptr, 965 const void *buf, u_int len, 966 u_int *outlen, int *err, 967 u_char **err_str) 968 { 969 size_t inbytesleft, outbytesleft, converted_size; 970 char *outbuf; 971 u_char *converted; 972 const char *inbuf; 973 u_int mul = 0; 974 975 if (!buf || !(*(char *)buf)) return NULL; 976 if (len == 0) len = strlen(buf); 977 /* reset conversion descriptor */ 978 /* XXX Do we need initial shift sequences for UTF-8??? */ 979 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 980 inbuf = (const char *) buf; 981 if (mul_ptr) mul = *mul_ptr; 982 converted_size = (len << mul); 983 outbuf = (char *) xmalloc(converted_size + 1); /* for null */ 984 converted = (u_char *) outbuf; 985 outbytesleft = len; 986 do { 987 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 988 (size_t) -1) { 989 if (errno == E2BIG) { 990 /* UTF-8 codepoints are at most 8 bytes long. */ 991 if (mul > 2) { 992 if (err_str) 993 *err_str = (u_char *) "Conversion to UTF-8 failed due to" 994 "preposterous space requirements"; 995 if (err) 996 *err = EILSEQ; 997 return NULL; 998 } 999 1000 /* 1001 * Re-alloc output and ensure that the outbuf 1002 * and outbytesleft values are adjusted. 1003 */ 1004 converted = xrealloc(converted, converted_size << 1 + 1); 1005 outbuf = (char *) converted + converted_size - outbytesleft; 1006 converted_size = (len << ++(mul)); 1007 outbytesleft = converted_size - outbytesleft; 1008 } 1009 else { 1010 /* 1011 * Let the caller deal with iconv() errors, probably by 1012 * calling fatal(); xfree() does not set errno. 1013 */ 1014 if (err) *err = errno; 1015 xfree(converted); 1016 return NULL; 1017 } 1018 } 1019 } while (inbytesleft); 1020 *outbuf = '\0'; /* ensure null-termination */ 1021 if (outlen) *outlen = converted_size - outbytesleft; 1022 if (mul_ptr) *mul_ptr = mul; 1023 return converted; 1024 } 1025