1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 * 22 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <errno.h> 29 #include <locale.h> 30 #include <langinfo.h> 31 #include <iconv.h> 32 #include <ctype.h> 33 #include <strings.h> 34 #include <string.h> 35 #include <stdio.h> 36 #include <stdlib.h> 37 #include "includes.h" 38 #include "xmalloc.h" 39 #include "xlist.h" 40 41 #ifdef MIN 42 #undef MIN 43 #endif /* MIN */ 44 45 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 46 47 #define LOCALE_PATH "/usr/bin/locale" 48 49 #define LANGTAG_MAX 5 /* two-char country code, '-' and two-char region code */ 50 51 static u_char * do_iconv(iconv_t cd, u_int *mul_ptr, 52 const void *buf, u_int len, 53 u_int *outlen, int *err, 54 u_char **err_str); 55 56 static int locale_cmp(const void *d1, const void *d2); 57 static char *g11n_locale2langtag(char *locale); 58 59 u_int 60 g11n_validate_ascii(const char *str, u_int len, u_char **error_str); 61 62 u_int 63 g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str); 64 65 static 66 char * 67 g11n_locale2langtag(char *locale) 68 { 69 char *langtag; 70 71 /* base cases */ 72 if (!locale || !*locale) return NULL; 73 74 if (strcmp(locale, "POSIX") == 0 || 75 strcmp(locale, "C") == 0) return "i-default"; 76 77 /* Punt for language codes which are not exactly 2 letters */ 78 if (strlen(locale) < 2 || 79 !isalpha(locale[0]) || 80 !isalpha(locale[1]) || 81 (locale[2] != '\0' && 82 locale[2] != '_' && 83 locale[2] != '.' && 84 locale[2] != '@')) 85 return NULL; 86 87 88 /* We have a primary language sub-tag */ 89 langtag = (char *) xmalloc(LANGTAG_MAX + 1); 90 91 strncpy(langtag, locale, 2); 92 langtag[2] = '\0'; 93 94 /* Do we have country sub-tag? */ 95 if (locale[2] == '_') { 96 if (strlen(locale) < 5 || 97 !isalpha(locale[3]) || 98 !isalpha(locale[4]) || 99 (locale[5] != '\0' && (locale[5] != '.' && locale[5] != '@'))) { 100 return langtag; 101 } 102 103 /* yes, we do */ 104 /* if (snprintf(langtag, 6, "%s-%s,%s", lang_subtag, 105 country_subtag, langtag) == 8) */ 106 if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 107 2, locale+3) == 5) 108 return langtag; 109 } 110 111 /* In all other cases we just use the primary language sub-tag */ 112 return langtag; 113 } 114 115 u_int 116 g11n_langtag_is_default(char *langtag) 117 { 118 return (strcmp(langtag, "i-default") == 0); 119 } 120 121 /* 122 * This lang tag / locale matching function works only for two-character 123 * language primary sub-tags and two-character country sub-tags. 124 */ 125 u_int 126 g11n_langtag_matches_locale(char *langtag, char *locale) 127 { 128 /* Match "i-default" to the process' current locale if possible */ 129 if (g11n_langtag_is_default(langtag)) { 130 if (strcasecmp(locale, "POSIX") == 0 || 131 strcasecmp(locale, "C") == 0) 132 return 1; 133 else 134 return 0; 135 } 136 137 /* locale must be at least 2 chars long and the lang part must be 138 * exactly two characters */ 139 if (strlen(locale) < 2 || 140 (!isalpha(locale[0]) || !isalpha(locale[1]) || 141 (locale[2] != '\0' && locale[2] != '_' && locale[2] != '.' && locale[2] != '@'))) 142 return 0; 143 144 /* same thing with the langtag */ 145 if (strlen(langtag) < 2 || 146 (!isalpha(langtag[0]) || !isalpha(langtag[1]) || 147 (langtag[2] != '\0' && langtag[2] != '-'))) 148 return 0; 149 150 /* primary language sub-tag and the locale's language part must match */ 151 if (strncasecmp(langtag, locale, 2) != 0) 152 return 0; 153 154 /* primary language sub-tag and the locale's language match, now 155 * fuzzy check country part */ 156 157 /* neither langtag nor locale have more than one component */ 158 if (langtag[2] == '\0' && 159 (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')) 160 return 2; 161 162 /* langtag has only one sub-tag... */ 163 if (langtag[2] == '\0') 164 return 1; 165 166 /* locale has no country code... */ 167 if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@') 168 return 1; 169 170 /* langtag has more than one subtag and the locale has a country code */ 171 172 /* ignore second subtag if not two chars */ 173 if (strlen(langtag) < 5) 174 return 1; 175 176 if (!isalpha(langtag[3]) || !isalpha(langtag[4]) || 177 (langtag[5] != '\0' && langtag[5] != '-')) 178 return 1; 179 180 /* ignore rest of locale if there is no two-character country part */ 181 if (strlen(locale) < 5) 182 return 1; 183 184 if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) || 185 (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@')) 186 return 1; 187 188 /* if the country part matches, return 2 */ 189 if (strncasecmp(&langtag[3], &locale[3], 2) == 0) 190 return 2; 191 192 return 1; 193 } 194 195 char * 196 g11n_getlocale() 197 { 198 /* We have one text domain - always set it */ 199 (void) textdomain(TEXT_DOMAIN); 200 201 /* If the locale is not set, set it from the env vars */ 202 if (!setlocale(LC_CTYPE, NULL)) 203 (void) setlocale(LC_CTYPE, ""); 204 205 return setlocale(LC_CTYPE, NULL); 206 } 207 208 void 209 g11n_setlocale(int category, const char *locale) 210 { 211 char *curr; 212 213 /* We have one text domain - always set it */ 214 (void) textdomain(TEXT_DOMAIN); 215 216 if (!locale) 217 return; 218 219 if (*locale && ((curr = setlocale(category, NULL))) && 220 strcmp(curr, locale) == 0) 221 return; 222 223 /* 224 * If <category> is bogus, setlocale() will do nothing and will 225 * return NULL. 226 */ 227 if (!setlocale(category, locale)) 228 return; 229 230 /* If setting the locale from the environment, then we're done */ 231 if (!*locale) 232 return; 233 234 /* 235 * If setting a locale from the <locale> argument, then set the 236 * related env vars. 237 */ 238 switch (category) { 239 case LC_ALL: 240 /* 241 * We must not set LC_ALL environment variable here because if we 242 * did it would later override any other LC_* variables that were 243 * requested from the other side. 244 */ 245 setenv("LANG", locale, 1); 246 break; 247 case LC_CTYPE: 248 setenv("LC_CTYPE", locale, 1); 249 break; 250 case LC_NUMERIC: 251 setenv("LC_NUMERIC", locale, 1); 252 break; 253 case LC_TIME: 254 setenv("LC_TIME", locale, 1); 255 break; 256 case LC_COLLATE: 257 setenv("LC_COLLATE", locale, 1); 258 break; 259 case LC_MONETARY: 260 setenv("LC_MONETARY", locale, 1); 261 break; 262 case LC_MESSAGES: 263 setenv("LC_MESSAGES", locale, 1); 264 break; 265 } 266 return; 267 } 268 269 char ** 270 g11n_getlocales() 271 { 272 FILE *locale_out; 273 u_int n_elems, list_size, long_line = 0; 274 char **list; 275 char locale[64]; /* 64 bytes is plenty for locale names */ 276 277 if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) { 278 return NULL; 279 } 280 281 /* 282 * Start with enough room for 65 locales - that's a lot fewer than 283 * all the locales available for installation, but a lot more than 284 * what most users will need and install 285 */ 286 n_elems=0; 287 list_size=192; 288 list = (char **) xmalloc(sizeof(char *) * (list_size + 1)); 289 memset(list, 0, sizeof(char *) * (list_size + 1)); 290 291 while (fgets(locale, sizeof(locale), locale_out)) { 292 /* skip long locale names (if any) */ 293 if (!strchr(locale, '\n')) { 294 long_line = 1; 295 continue; 296 } 297 else if (long_line) { 298 long_line = 0; 299 continue; 300 } 301 if (strncmp(locale, "iso_8859", 8) == 0) 302 continue; /* ignore locale names like "iso_8859-1" */ 303 304 if (n_elems == list_size) { 305 list_size *= 2; 306 list = (char **) xrealloc((void *) list, (list_size + 1) * sizeof(char *)); 307 memset(&list[n_elems+1], 0, sizeof(char *) * (list_size - n_elems + 1)); 308 } 309 310 *(strchr(locale, '\n')) = '\0'; /* remove the trailing \n */ 311 312 list[n_elems++] = xstrdup(locale); 313 } 314 list[n_elems] = NULL; 315 (void) pclose(locale_out); 316 317 qsort(list, n_elems - 1, sizeof(char *), locale_cmp); 318 return list; 319 } 320 321 char * 322 g11n_getlangs() 323 { 324 char *locale; 325 326 if (getenv("SSH_LANGS")) 327 return xstrdup(getenv("SSH_LANGS")); 328 329 locale = g11n_getlocale(); 330 331 if (!locale || !*locale) 332 return xstrdup("i-default"); 333 334 return g11n_locale2langtag(locale); 335 } 336 337 char * 338 g11n_locales2langs(char **locale_set) 339 { 340 char **p, **r, **q; 341 char *langtag; 342 int locales, skip; 343 344 for (locales = 0, p = locale_set ; p && *p ; p++) 345 locales++; 346 347 r = (char **) xmalloc((locales + 1) * sizeof(char *)); 348 memset(r, 0, (locales + 1) * sizeof(char *)); 349 350 for (p = locale_set ; p && *p && ((p - locale_set) <= locales); p++) { 351 skip = 0; 352 if ((langtag = g11n_locale2langtag(*p)) == NULL) 353 continue; 354 for (q = r ; (q - r) < locales ; q++) { 355 if (!*q) break; 356 if (*q && strcmp(*q, langtag) == 0) 357 skip = 1; 358 } 359 if (!skip) 360 *(q++) = langtag; 361 *q = NULL; 362 } 363 return xjoin(r, ','); 364 } 365 366 static 367 int 368 sortcmp(const void *d1, const void *d2) 369 { 370 char *s1 = *(char **)d1; 371 char *s2 = *(char **)d2; 372 373 return strcmp(s1, s2); 374 } 375 376 int 377 g11n_langtag_match(char *langtag1, char *langtag2) 378 { 379 int len1, len2; 380 char c1, c2; 381 382 len1 = (strchr(langtag1, '-')) ? 383 (strchr(langtag1, '-') - langtag1) 384 : strlen(langtag1); 385 386 len2 = (strchr(langtag2, '-')) ? 387 (strchr(langtag2, '-') - langtag2) 388 : strlen(langtag2); 389 390 /* no match */ 391 if (len1 != len2 || 392 strncmp(langtag1, langtag2, len1) != 0) 393 return 0; 394 395 c1 = *(langtag1 + len1); 396 c2 = *(langtag2 + len2); 397 398 /* no country sub-tags - exact match */ 399 if (c1 == '\0' && c2 == '\0') 400 return 2; 401 402 /* one langtag has a country sub-tag, the other doesn't */ 403 if (c1 == '\0' || c2 == '\0') 404 return 1; 405 406 /* can't happen - both langtags have a country sub-tag */ 407 if (c1 != '-' || c2 != '-') 408 return 1; 409 410 /* compare country subtags */ 411 langtag1 = langtag1 + len1 + 1; 412 langtag2 = langtag2 + len2 + 1; 413 414 len1 = (strchr(langtag1, '-')) ? 415 (strchr(langtag1, '-') - langtag1) 416 : strlen(langtag1); 417 418 len2 = (strchr(langtag2, '-')) ? 419 (strchr(langtag2, '-') - langtag2) 420 : strlen(langtag2); 421 422 if (len1 != len2 || 423 strncmp(langtag1, langtag2, len1) != 0) 424 return 1; 425 426 /* country tags matched - exact match */ 427 return 2; 428 } 429 430 char * 431 g11n_langtag_set_intersect(char *set1, char *set2) 432 { 433 char **list1, **list2, **list3, **p, **q, **r; 434 char *set3, *lang_subtag; 435 u_int n1, n2, n3; 436 u_int do_append; 437 438 list1 = xsplit(set1, ','); 439 list2 = xsplit(set2, ','); 440 for (n1 = 0, p = list1 ; p && *p ; p++, n1++) ; 441 for (n2 = 0, p = list2 ; p && *p ; p++, n2++) ; 442 443 list3 = (char **) xmalloc(sizeof(char *) * (n1 + n2 + 1)); 444 *list3 = NULL; 445 446 /* we must not sort the user langtags - sorting or not the server's 447 * should not affect the outcome 448 */ 449 qsort(list2, n2, sizeof(char *), sortcmp); 450 451 for (n3 = 0, p = list1 ; p && *p ; p++) { 452 do_append = 0; 453 for (q = list2 ; q && *q ; q++) { 454 if (g11n_langtag_match(*p, *q) != 2) continue; 455 /* append element */ 456 for (r = list3; (r - list3) <= (n1 + n2) ; r++) { 457 do_append = 1; 458 if (!*r) break; 459 if (strcmp(*p, *r) == 0) { 460 do_append = 0; 461 break; 462 } 463 } 464 if (do_append && n3 <= (n1 + n2)) { 465 list3[n3++] = xstrdup(*p); 466 list3[n3] = NULL; 467 } 468 } 469 } 470 471 for (p = list1 ; p && *p ; p++) { 472 do_append = 0; 473 for (q = list2 ; q && *q ; q++) { 474 if (g11n_langtag_match(*p, *q) != 1) continue; 475 /* append element */ 476 lang_subtag = xstrdup(*p); 477 if (strchr(lang_subtag, '-')) 478 *(strchr(lang_subtag, '-')) = '\0'; 479 for (r = list3; (r - list3) <= (n1 + n2) ; r++) { 480 do_append = 1; 481 if (!*r) break; 482 if (strcmp(lang_subtag, *r) == 0) { 483 do_append = 0; 484 break; 485 } 486 } 487 if (do_append && n3 <= (n1 + n2)) { 488 list3[n3++] = lang_subtag; 489 list3[n3] = NULL; 490 } 491 else 492 xfree(lang_subtag); 493 } 494 } 495 496 set3 = xjoin(list3, ','); 497 xfree_split_list(list1); 498 xfree_split_list(list2); 499 xfree_split_list(list3); 500 501 return set3; 502 } 503 504 char * 505 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags) 506 { 507 char *list, *result; 508 char **xlist; 509 510 /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */ 511 list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags); 512 513 if (!list) 514 return NULL; 515 516 xlist = xsplit(list, ','); 517 518 xfree(list); 519 520 if (!xlist || !*xlist) 521 return NULL; 522 523 result = xstrdup(*xlist); 524 525 xfree_split_list(xlist); 526 527 return result; 528 } 529 530 /* 531 * Compare locales, preferring UTF-8 codesets to others, otherwise doing 532 * a stright strcmp() 533 */ 534 static 535 int 536 locale_cmp(const void *d1, const void *d2) 537 { 538 char *dot_ptr; 539 char *s1 = *(char **)d1; 540 char *s2 = *(char **)d2; 541 int s1_is_utf8 = 0; 542 int s2_is_utf8 = 0; 543 544 /* check if s1 is a UTF-8 locale */ 545 if (((dot_ptr = strchr((char *) s1, '.')) != NULL) && (*dot_ptr != '\0') && 546 (strncmp(dot_ptr+1, "UTF-8", 5) == 0) && 547 (*(dot_ptr+6) == '\0' || *(dot_ptr+6) == '@')) { 548 s1_is_utf8++; 549 } 550 /* check if s2 is a UTF-8 locale */ 551 if (((dot_ptr = strchr((char *) s2, '.')) != NULL) && (*dot_ptr != '\0') && 552 (strncmp(dot_ptr+1, "UTF-8", 5) == 0) && 553 (*(dot_ptr+6) == '\0' || *(dot_ptr+6) == '@')) { 554 s2_is_utf8++; 555 } 556 557 /* prefer UTF-8 locales */ 558 if (s1_is_utf8 && !s2_is_utf8) 559 return -1; 560 561 if (s2_is_utf8 && !s1_is_utf8) 562 return 1; 563 564 /* prefer any locale over the default locales */ 565 if (strcmp(s1, "C") == 0 || 566 strcmp(s1, "POSIX") == 0 || 567 strcmp(s1, "common") == 0) 568 if (strcmp(s2, "C") != 0 && 569 strcmp(s2, "POSIX") != 0 && 570 strcmp(s2, "common") != 0) 571 return 1; 572 573 if (strcmp(s2, "C") == 0 || 574 strcmp(s2, "POSIX") == 0 || 575 strcmp(s2, "common") == 0) 576 if (strcmp(s1, "C") != 0 && 577 strcmp(s1, "POSIX") != 0 && 578 strcmp(s1, "common") != 0) 579 return -1; 580 581 return strcmp(s1, s2); 582 } 583 584 585 char ** 586 g11n_langtag_set_locale_set_intersect(char *langtag_set, 587 char **locale_set) 588 { 589 char **langtag_list, **result, **p, **q, **r; 590 char *s; 591 u_int do_append, n_langtags, n_locales, n_results, max_results; 592 593 /* Count lang tags and locales */ 594 for (n_locales = 0, p = locale_set ; p && *p ; p++) n_locales++; 595 n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0; 596 for ( ; s = strchr(s, ',') ; s++, n_langtags++) ; 597 /* 598 while ((s = strchr(s, ','))) { 599 n_langtags++; 600 s++; 601 } 602 */ 603 604 qsort(locale_set, n_locales, sizeof(char *), locale_cmp); 605 606 langtag_list = xsplit(langtag_set, ','); 607 for ( n_langtags = 0, p = langtag_list ; p && *p ; p++, n_langtags++); 608 609 max_results = MIN(n_locales, n_langtags) * 2; 610 result = (char **) xmalloc(sizeof(char *) * (max_results + 1)); 611 *result = NULL; 612 n_results = 0; 613 614 /* More specific matches first */ 615 for (p = langtag_list ; p && *p ; p++) { 616 do_append = 0; 617 for (q = locale_set ; q && *q ; q++) { 618 if (g11n_langtag_matches_locale(*p, *q) == 2) { 619 do_append = 1; 620 for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) { 621 if (!*r) break; 622 if (strcmp(*q, *r) == 0) { 623 do_append = 0; 624 break; 625 } 626 } 627 if (do_append && n_results < max_results) { 628 result[n_results++] = xstrdup(*q); 629 result[n_results] = NULL; 630 } 631 break; 632 } 633 } 634 } 635 636 for (p = langtag_list ; p && *p ; p++) { 637 do_append = 0; 638 for (q = locale_set ; q && *q ; q++) { 639 if (g11n_langtag_matches_locale(*p, *q) == 1) { 640 do_append = 1; 641 for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) { 642 if (!*r) break; 643 if (strcmp(*q, *r) == 0) { 644 do_append = 0; 645 break; 646 } 647 } 648 if (do_append && n_results < max_results) { 649 result[n_results++] = xstrdup(*q); 650 result[n_results] = NULL; 651 } 652 break; 653 } 654 } 655 } 656 xfree_split_list(langtag_list); 657 658 return result; 659 } 660 661 char * 662 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales) 663 { 664 char **results, *result = NULL; 665 666 if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags, 667 srvr_locales ? srvr_locales : g11n_getlocales())) == NULL) 668 return NULL; 669 670 if (*results != NULL) 671 result = xstrdup(*results); 672 673 xfree_split_list(results); 674 675 return result; 676 } 677 678 679 /* 680 * Functions for validating ASCII and UTF-8 strings 681 * 682 * The error_str parameter is an optional pointer to a char variable 683 * where to store a string suitable for use with error() or fatal() or 684 * friends. 685 * 686 * The return value is 0 if success, EILSEQ or EINVAL. 687 * 688 */ 689 690 u_int 691 g11n_validate_ascii(const char *str, u_int len, u_char **error_str) 692 { 693 u_char *p; 694 695 for (p = (u_char *) str ; p && *p && (!(*p & 0x80)) ; p++) ; 696 697 if (len && ((p - (u_char *) str) != len)) { 698 return EILSEQ; 699 } 700 return 0; 701 } 702 703 u_int 704 g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str) 705 { 706 u_char *p; 707 u_int c, l; 708 709 if (len == 0) len = strlen((const char *)str); 710 711 for (p = (u_char *) str ; p && (p - str < len) && *p ; ) { 712 /* 8-bit chars begin a UTF-8 sequence */ 713 if (*p & 0x80) { 714 /* Get sequence length and sanity check first byte */ 715 if (*p < 0xc0) 716 return EILSEQ; 717 else if (*p < 0xe0) 718 l=2; 719 else if (*p < 0xf0) 720 l=3; 721 else if (*p < 0xf8) 722 l=4; 723 else if (*p < 0xfc) 724 l=5; 725 else if (*p < 0xfe) 726 l=6; 727 else 728 return EILSEQ; 729 730 if ((p + l - str) >= len) 731 return EILSEQ; 732 733 /* overlong detection - build codepoint */ 734 c = *p & 0x3f; 735 c = c << (6 * (l-1)); /* shift c bits from first byte */ 736 737 if (l > 1) { 738 if (*(p+1) && ((*(p+1) & 0xc0) == 0x80)) 739 c = c | ((*(p+1) & 0x3f) << (6 * (l-2))); 740 else 741 return EILSEQ; 742 if (c < 0x80) 743 return EILSEQ; 744 } 745 if (l > 2) { 746 if (*(p+2) && ((*(p+2) & 0xc0) == 0x80)) 747 c = c | ((*(p+2) & 0x3f) << (6 * (l-3))); 748 else 749 return EILSEQ; 750 if (c < 0x800) 751 return EILSEQ; 752 } 753 if (l > 3) { 754 if (*(p+3) && ((*(p+3) & 0xc0) == 0x80)) 755 c = c | ((*(p+3) & 0x3f) << (6 * (l-4))); 756 else 757 return EILSEQ; 758 if (c < 0x10000) 759 return EILSEQ; 760 } 761 if (l > 4) { 762 if (*(p+4) && ((*(p+4) & 0xc0) == 0x80)) 763 c = c | ((*(p+4) & 0x3f) << (6 * (l-5))); 764 else 765 return EILSEQ; 766 if (c < 0x200000) 767 return EILSEQ; 768 } 769 if (l > 5) { 770 if (*(p+5) && ((*(p+5) & 0xc0) == 0x80)) 771 c = c | (*(p+5) & 0x3f) ; 772 else 773 return EILSEQ; 774 if (c < 0x4000000) 775 return EILSEQ; 776 } 777 778 /* Check for UTF-16 surrogates ifs other illegal UTF-8 * points */ 779 if (((c <= 0xdfff) && (c >= 0xd800)) || 780 (c == 0xfffe) || (c == 0xffff)) 781 return EILSEQ; 782 p += l; 783 } 784 /* 7-bit chars are fine */ 785 else 786 p++; 787 } 788 return 0; 789 } 790 791 /* 792 * Functions for converting to ASCII or UTF-8 from the local codeset 793 * Functions for converting from ASCII or UTF-8 to the local codeset 794 * 795 * The error_str parameter is an optional pointer to a char variable 796 * where to store a string suitable for use with error() or fatal() or 797 * friends. 798 * 799 * The err parameter is an optional pointer to an integer where 0 800 * (success) or EILSEQ or EINVAL will be stored (failure). 801 * 802 * These functions return NULL if the conversion fails. 803 * 804 */ 805 806 u_char * 807 g11n_convert_from_ascii(const char *str, int *err_ptr, u_char **error_str) 808 { 809 static u_int initialized = 0; 810 static u_int do_convert = 0; 811 iconv_t cd; 812 int err; 813 814 if (!initialized) { 815 /* 816 * iconv_open() fails if the to/from codesets are the 817 * same, and there are aliases of codesets to boot... 818 */ 819 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 820 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 821 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 822 initialized = 1; 823 do_convert = 0; 824 } 825 else { 826 cd = iconv_open(nl_langinfo(CODESET), "646"); 827 if (cd == (iconv_t) -1) { 828 if (err_ptr) *err_ptr = errno; 829 if (error_str) *error_str = (u_char *) 830 "Cannot convert ASCII strings to the local codeset"; 831 } 832 initialized = 1; 833 do_convert = 1; 834 } 835 } 836 837 if (!do_convert) { 838 if ((err = g11n_validate_ascii(str, 0, error_str))) { 839 if (err_ptr) *err_ptr = err; 840 return NULL; 841 } 842 else 843 return (u_char *) xstrdup(str); 844 } 845 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 846 } 847 848 u_char * 849 g11n_convert_from_utf8(const u_char *str, int *err_ptr, u_char **error_str) 850 { 851 static u_int initialized = 0; 852 static u_int do_convert = 0; 853 iconv_t cd; 854 int err; 855 856 if (!initialized) { 857 /* 858 * iconv_open() fails if the to/from codesets are the 859 * same, and there are aliases of codesets to boot... 860 */ 861 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 862 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 863 initialized = 1; 864 do_convert = 0; 865 } 866 else { 867 cd = iconv_open(nl_langinfo(CODESET), "UTF-8"); 868 if (cd == (iconv_t) -1) { 869 if (err_ptr) *err_ptr = errno; 870 if (error_str) *error_str = (u_char *) 871 "Cannot convert UTF-8 strings to the local codeset"; 872 } 873 initialized = 1; 874 do_convert = 1; 875 } 876 } 877 878 if (!do_convert) { 879 if ((err = g11n_validate_utf8(str, 0, error_str))) { 880 if (err_ptr) *err_ptr = err; 881 return NULL; 882 } 883 else 884 return (u_char *) xstrdup((char *) str); 885 } 886 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 887 } 888 889 char * 890 g11n_convert_to_ascii(const u_char *str, int *err_ptr, u_char **error_str) 891 { 892 static u_int initialized = 0; 893 static u_int do_convert = 0; 894 iconv_t cd; 895 896 if (!initialized) { 897 /* 898 * iconv_open() fails if the to/from codesets are the 899 * same, and there are aliases of codesets to boot... 900 */ 901 if (strcmp("646", nl_langinfo(CODESET)) == 0 || 902 strcmp("ASCII", nl_langinfo(CODESET)) == 0 || 903 strcmp("US-ASCII", nl_langinfo(CODESET)) == 0) { 904 initialized = 1; 905 do_convert = 0; 906 } 907 else { 908 cd = iconv_open("646", nl_langinfo(CODESET)); 909 if (cd == (iconv_t) -1) { 910 if (err_ptr) *err_ptr = errno; 911 if (error_str) *error_str = (u_char *) 912 "Cannot convert UTF-8 strings to the local codeset"; 913 } 914 initialized = 1; 915 do_convert = 1; 916 } 917 } 918 919 if (!do_convert) 920 return xstrdup((char *) str); 921 return (char *) do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 922 } 923 924 u_char * 925 g11n_convert_to_utf8(const u_char *str, int *err_ptr, u_char **error_str) 926 { 927 static u_int initialized = 0; 928 static u_int do_convert = 0; 929 iconv_t cd; 930 931 if (!initialized) { 932 /* 933 * iconv_open() fails if the to/from codesets are the 934 * same, and there are aliases of codesets to boot... 935 */ 936 if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 || 937 strcmp("UTF8", nl_langinfo(CODESET)) == 0) { 938 initialized = 1; 939 do_convert = 0; 940 } 941 else { 942 cd = iconv_open("UTF-8", nl_langinfo(CODESET)); 943 if (cd == (iconv_t) -1) { 944 if (err_ptr) *err_ptr = errno; 945 if (error_str) *error_str = (u_char *) 946 "Cannot convert UTF-8 strings to the local codeset"; 947 } 948 initialized = 1; 949 do_convert = 1; 950 } 951 } 952 953 if (!do_convert) 954 return (u_char *) xstrdup((char *) str); 955 return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str); 956 } 957 958 959 /* 960 * Wrapper around iconv() 961 * 962 * The caller is responsible for freeing the result and for handling 963 * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF). 964 */ 965 966 static 967 u_char * 968 do_iconv(iconv_t cd, u_int *mul_ptr, 969 const void *buf, u_int len, 970 u_int *outlen, int *err, 971 u_char **err_str) 972 { 973 size_t inbytesleft, outbytesleft, converted_size; 974 char *outbuf; 975 u_char *converted; 976 const char *inbuf; 977 u_int mul = 0; 978 979 if (!buf || !(*(char *)buf)) return NULL; 980 if (len == 0) len = strlen(buf); 981 /* reset conversion descriptor */ 982 /* XXX Do we need initial shift sequences for UTF-8??? */ 983 (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft); 984 inbuf = (const char *) buf; 985 if (mul_ptr) mul = *mul_ptr; 986 converted_size = (len << mul); 987 outbuf = (char *) xmalloc(converted_size + 1); /* for null */ 988 converted = (u_char *) outbuf; 989 outbytesleft = len; 990 do { 991 if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == 992 (size_t) -1) { 993 if (errno == E2BIG) { 994 /* UTF-8 codepoints are at most 8 bytes long. */ 995 if (mul > 2) { 996 if (err_str) 997 *err_str = (u_char *) "Conversion to UTF-8 failed due to" 998 "preposterous space requirements"; 999 if (err) 1000 *err = EILSEQ; 1001 return NULL; 1002 } 1003 1004 /* 1005 * Re-alloc output and ensure that the outbuf 1006 * and outbytesleft values are adjusted. 1007 */ 1008 converted = xrealloc(converted, converted_size << 1 + 1); 1009 outbuf = (char *) converted + converted_size - outbytesleft; 1010 converted_size = (len << ++(mul)); 1011 outbytesleft = converted_size - outbytesleft; 1012 } 1013 else { 1014 /* 1015 * Let the caller deal with iconv() errors, probably by 1016 * calling fatal(); xfree() does not set errno. 1017 */ 1018 if (err) *err = errno; 1019 xfree(converted); 1020 return NULL; 1021 } 1022 } 1023 } while (inbytesleft); 1024 *outbuf = '\0'; /* ensure null-termination */ 1025 if (outlen) *outlen = converted_size - outbytesleft; 1026 if (mul_ptr) *mul_ptr = mul; 1027 return converted; 1028 } 1029