xref: /titanic_44/usr/src/cmd/ssh/libssh/common/g11n.c (revision b06cdb87d254343cca2e66a21fd421617c3a0b7b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  *
22  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <errno.h>
29 #include <locale.h>
30 #include <langinfo.h>
31 #include <iconv.h>
32 #include <ctype.h>
33 #include <strings.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include "includes.h"
38 #include "xmalloc.h"
39 #include "xlist.h"
40 
/* Always use the local MIN(), whatever an included header may have defined. */
#undef MIN
#define MIN(x, y)   (((x) < (y)) ? (x) : (y))

/* Command whose "-a" output lists the locales available on this system. */
#define LOCALE_PATH "/usr/bin/locale"

/* Two-char language code, '-' and two-char country code, e.g. "en-US". */
#define LANGTAG_MAX 5

/* Forward declarations for helpers that are private to this file. */
static u_char *do_iconv(iconv_t cd, u_int *mul_ptr, const void *buf,
    u_int len, u_int *outlen, int *err, u_char **err_str);
static int locale_cmp(const void *d1, const void *d2);
static char *g11n_locale2langtag(char *locale);

/* String validators; both are defined further down in this file. */
u_int g11n_validate_ascii(const char *str, u_int len, u_char **error_str);
u_int g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str);
64 
65 static
66 char *
67 g11n_locale2langtag(char *locale)
68 {
69     char *langtag;
70 
71     /* base cases */
72     if (!locale || !*locale) return NULL;
73 
74     if (strcmp(locale, "POSIX") == 0 ||
75 	strcmp(locale, "C") == 0) return "i-default";
76 
77     /* Punt for language codes which are not exactly 2 letters */
78     if (strlen(locale) < 2 ||
79 	!isalpha(locale[0]) ||
80 	!isalpha(locale[1]) ||
81 	(locale[2] != '\0' &&
82 	locale[2] != '_' &&
83 	locale[2] != '.' &&
84 	locale[2] != '@'))
85 	return NULL;
86 
87 
88     /* We have a primary language sub-tag */
89     langtag = (char *) xmalloc(LANGTAG_MAX + 1);
90 
91     strncpy(langtag, locale, 2);
92     langtag[2] = '\0';
93 
94     /* Do we have country sub-tag? */
95     if (locale[2] == '_') {
96 	if (strlen(locale) < 5 ||
97 	    !isalpha(locale[3]) ||
98 	    !isalpha(locale[4]) ||
99 	    (locale[5] != '\0' && (locale[5] != '.' && locale[5] != '@'))) {
100 	    return langtag;
101 	}
102 
103 	/* yes, we do */
104 	/* if (snprintf(langtag, 6, "%s-%s,%s", lang_subtag,
105 		     country_subtag, langtag) == 8) */
106 	if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale,
107 		     2, locale+3) == 5)
108 	    return langtag;
109     }
110 
111     /* In all other cases we just use the primary language sub-tag */
112     return langtag;
113 }
114 
/*
 * Return 1 if <langtag> is exactly the default language tag "i-default"
 * (case-sensitive comparison), 0 otherwise.
 */
u_int
g11n_langtag_is_default(char *langtag)
{
    static const char default_tag[] = "i-default";

    if (strcmp(langtag, default_tag) != 0)
        return 0;

    return 1;
}
120 
/*
 * Match a language tag against a locale name.
 *
 * This lang tag / locale matching function works only for two-character
 * language primary sub-tags and two-character country sub-tags.
 *
 * Returns:
 *   0 - no match (different language, or either argument malformed);
 *   1 - the languages match, but the country parts are absent on one
 *       side, malformed, or different; also "i-default" against the
 *       "C"/"POSIX" locales;
 *   2 - exact match: same language and either no country on both sides
 *       or the same country on both sides.
 *
 * Neither argument may be NULL.
 */
u_int
g11n_langtag_matches_locale(char *langtag, char *locale)
{
    /* <ctype.h> functions require values representable as unsigned char */
    const unsigned char *lt = (const unsigned char *) langtag;
    const unsigned char *lc = (const unsigned char *) locale;
    int locale_has_country;

    /* Match "i-default" to the process' current locale if possible */
    if (g11n_langtag_is_default(langtag)) {
        if (strcasecmp(locale, "POSIX") == 0 ||
            strcasecmp(locale, "C") == 0)
            return 1;
        return 0;
    }

    /*
     * locale must be at least 2 chars long and the lang part must be
     * exactly two characters
     */
    if (strlen(locale) < 2 || !isalpha(lc[0]) || !isalpha(lc[1]) ||
        (lc[2] != '\0' && lc[2] != '_' && lc[2] != '.' && lc[2] != '@'))
        return 0;

    /* same thing with the langtag */
    if (strlen(langtag) < 2 || !isalpha(lt[0]) || !isalpha(lt[1]) ||
        (lt[2] != '\0' && lt[2] != '-'))
        return 0;

    /* primary language sub-tag and the locale's language part must match */
    if (strncasecmp(langtag, locale, 2) != 0)
        return 0;

    /*
     * The languages match; now fuzzy check the country part.  After the
     * checks above locale[2] is one of '\0', '_', '.', '@', so only '_'
     * can introduce a country code.
     */
    locale_has_country = (lc[2] == '_');

    /* langtag has only one sub-tag: exact only if the locale has no country */
    if (lt[2] == '\0')
        return locale_has_country ? 1 : 2;

    /* locale has no country code... */
    if (!locale_has_country)
        return 1;

    /* langtag has more than one subtag and the locale has a country code */

    /* ignore second subtag if not two chars */
    if (strlen(langtag) < 5)
        return 1;

    if (!isalpha(lt[3]) || !isalpha(lt[4]) ||
        (lt[5] != '\0' && lt[5] != '-'))
        return 1;

    /* ignore rest of locale if there is no two-character country part */
    if (strlen(locale) < 5)
        return 1;

    if (!isalpha(lc[3]) || !isalpha(lc[4]) ||
        (lc[5] != '\0' && lc[5] != '.' && lc[5] != '@'))
        return 1;

    /* if the country part matches, return 2 */
    if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
        return 2;

    return 1;
}
194 
195 char *
196 g11n_getlocale()
197 {
198     /* We have one text domain - always set it */
199     (void) textdomain(TEXT_DOMAIN);
200 
201     /* If the locale is not set, set it from the env vars */
202     if (!setlocale(LC_CTYPE, NULL))
203 	(void) setlocale(LC_CTYPE, "");
204 
205     return setlocale(LC_CTYPE, NULL);
206 }
207 
208 void
209 g11n_setlocale(int category, const char *locale)
210 {
211     char *curr;
212 
213     /* We have one text domain - always set it */
214     (void) textdomain(TEXT_DOMAIN);
215 
216     if (!locale)
217 	return;
218 
219     if (*locale && ((curr = setlocale(category, NULL))) &&
220 	strcmp(curr, locale) == 0)
221 	return;
222 
223     /*
224      * If <category> is bogus, setlocale() will do nothing and will
225      * return NULL.
226      */
227     if (!setlocale(category, locale))
228 	return;
229 
230     /* If setting the locale from the environment, then we're done */
231     if (!*locale)
232 	return;
233 
234     /*
235      * If setting a locale from the <locale> argument, then set the
236      * related env vars.
237      */
238     switch (category) {
239     case LC_ALL:
240 	/*
241 	 * We must not set LC_ALL environment variable here because if we
242 	 * did it would later override any other LC_* variables that were
243 	 * requested from the other side.
244 	 */
245 	setenv("LANG", locale, 1);
246 	break;
247     case LC_CTYPE:
248 	setenv("LC_CTYPE", locale, 1);
249 	break;
250     case LC_NUMERIC:
251 	setenv("LC_NUMERIC", locale, 1);
252 	break;
253     case LC_TIME:
254 	setenv("LC_TIME", locale, 1);
255 	break;
256     case LC_COLLATE:
257 	setenv("LC_COLLATE", locale, 1);
258 	break;
259     case LC_MONETARY:
260 	setenv("LC_MONETARY", locale, 1);
261 	break;
262     case LC_MESSAGES:
263 	setenv("LC_MESSAGES", locale, 1);
264 	break;
265     }
266     return;
267 }
268 
269 char **
270 g11n_getlocales()
271 {
272     FILE *locale_out;
273     u_int n_elems, list_size, long_line = 0;
274     char **list;
275     char locale[64];	/* 64 bytes is plenty for locale names */
276 
277     if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) {
278 	return NULL;
279     }
280 
281     /*
282      * Start with enough room for 65 locales - that's a lot fewer than
283      * all the locales available for installation, but a lot more than
284      * what most users will need and install
285      */
286     n_elems=0;
287     list_size=192;
288     list = (char **) xmalloc(sizeof(char *) * (list_size + 1));
289     memset(list, 0, sizeof(char *) * (list_size + 1));
290 
291     while (fgets(locale, sizeof(locale), locale_out)) {
292 	/* skip long locale names (if any) */
293 	if (!strchr(locale, '\n')) {
294 	    long_line = 1;
295 	    continue;
296 	}
297 	else if (long_line) {
298 	    long_line = 0;
299 	    continue;
300 	}
301 	if (strncmp(locale, "iso_8859", 8) == 0)
302 	    continue;		    /* ignore locale names like "iso_8859-1" */
303 
304 	if (n_elems == list_size) {
305 	    list_size *= 2;
306 	    list = (char **) xrealloc((void *) list, (list_size + 1) * sizeof(char *));
307 	    memset(&list[n_elems+1], 0, sizeof(char *) * (list_size - n_elems + 1));
308 	}
309 
310 	*(strchr(locale, '\n')) = '\0';      /* remove the trailing \n */
311 
312 	list[n_elems++] = xstrdup(locale);
313     }
314     list[n_elems] = NULL;
315     (void) pclose(locale_out);
316 
317     qsort(list, n_elems - 1, sizeof(char *), locale_cmp);
318     return list;
319 }
320 
/*
 * Return the language tag(s) this process prefers:
 *   - a copy of $SSH_LANGS when that variable is set;
 *   - otherwise "i-default" (copied) if no current locale is known;
 *   - otherwise whatever g11n_locale2langtag() makes of the current
 *     locale, which may be NULL.
 */
char *
g11n_getlangs()
{
    char *env_langs;
    char *cur_locale;

    env_langs = getenv("SSH_LANGS");
    if (env_langs != NULL)
        return xstrdup(env_langs);

    cur_locale = g11n_getlocale();
    if (cur_locale == NULL || *cur_locale == '\0')
        return xstrdup("i-default");

    return g11n_locale2langtag(cur_locale);
}
336 
/*
 * Build a comma-separated list of language tags from a NULL-terminated
 * array of locale names.  Locales that g11n_locale2langtag() cannot
 * convert are skipped, and each distinct tag appears once, in order of
 * first appearance.  The list is produced by xjoin().
 */
char *
g11n_locales2langs(char **locale_set)
{
    char **tags;
    char *langtag;
    int n_locales = 0, n_tags = 0;
    int i, k, seen;

    /* Count the locales; this bounds the number of distinct tags */
    while (locale_set != NULL && locale_set[n_locales] != NULL)
        n_locales++;

    tags = (char **) xmalloc((n_locales + 1) * sizeof(char *));
    memset(tags, 0, (n_locales + 1) * sizeof(char *));

    for (i = 0; i < n_locales; i++) {
        langtag = g11n_locale2langtag(locale_set[i]);
        if (langtag == NULL)
            continue;

        /* Has this tag been collected already? */
        seen = 0;
        for (k = 0; k < n_tags; k++) {
            if (strcmp(tags[k], langtag) == 0) {
                seen = 1;
                break;
            }
        }

        if (!seen)
            tags[n_tags++] = langtag;
        tags[n_tags] = NULL;
    }

    return xjoin(tags, ',');
}
365 
/*
 * qsort() comparator for arrays of C strings: each argument points at a
 * (char *) element, and the strings are ordered with strcmp().
 */
static
int
sortcmp(const void *d1, const void *d2)
{
    const char *const *lhs = (const char *const *) d1;
    const char *const *rhs = (const char *const *) d2;

    return strcmp(*lhs, *rhs);
}
375 
/* Length of the leading sub-tag of <tag>, i.e. up to the first '-' or NUL. */
static int
langtag_subtag_len(const char *tag)
{
    const char *dash = strchr(tag, '-');

    if (dash != NULL)
        return (int) (dash - tag);
    return (int) strlen(tag);
}

/*
 * Compare two language tags (case-sensitively) by their first two
 * sub-tags.
 *
 * Returns:
 *   0 - the primary (language) sub-tags differ;
 *   1 - the languages match, but only one tag has a second sub-tag, or
 *       the second sub-tags differ;
 *   2 - the languages match and either neither tag has a second sub-tag
 *       or the second sub-tags are equal.
 *
 * Any sub-tags after the second are ignored.
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
    const char *rest1, *rest2;
    int len1, len2;

    len1 = langtag_subtag_len(langtag1);
    len2 = langtag_subtag_len(langtag2);

    /* no match */
    if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0)
        return 0;

    /* Each of these now points at either the terminating NUL or a '-' */
    rest1 = langtag1 + len1;
    rest2 = langtag2 + len2;

    /* no country sub-tags - exact match */
    if (*rest1 == '\0' && *rest2 == '\0')
        return 2;

    /* one langtag has a country sub-tag, the other doesn't */
    if (*rest1 == '\0' || *rest2 == '\0')
        return 1;

    /* both langtags have a country sub-tag: step over the '-' and compare */
    rest1++;
    rest2++;

    len1 = langtag_subtag_len(rest1);
    len2 = langtag_subtag_len(rest2);

    if (len1 == len2 && strncmp(rest1, rest2, len1) == 0)
        return 2;	/* country tags matched - exact match */

    return 1;
}
429 
430 char *
431 g11n_langtag_set_intersect(char *set1, char *set2)
432 {
433     char **list1, **list2, **list3, **p, **q, **r;
434     char *set3, *lang_subtag;
435     u_int n1, n2, n3;
436     u_int do_append;
437 
438     list1 = xsplit(set1, ',');
439     list2 = xsplit(set2, ',');
440     for (n1 = 0, p = list1 ; p && *p ; p++, n1++) ;
441     for (n2 = 0, p = list2 ; p && *p ; p++, n2++) ;
442 
443     list3 = (char **) xmalloc(sizeof(char *) * (n1 + n2 + 1));
444     *list3 = NULL;
445 
446     /* we must not sort the user langtags - sorting or not the server's
447      * should not affect the outcome
448      */
449     qsort(list2, n2, sizeof(char *), sortcmp);
450 
451     for (n3 = 0, p = list1 ; p && *p ; p++) {
452 	do_append = 0;
453 	for (q = list2 ; q && *q ; q++) {
454 	    if (g11n_langtag_match(*p, *q) != 2) continue;
455 	    /* append element */
456 	    for (r = list3; (r - list3) <= (n1 + n2) ; r++) {
457 		do_append = 1;
458 		if (!*r) break;
459 		if (strcmp(*p, *r) == 0) {
460 		    do_append = 0;
461 		    break;
462 		}
463 	    }
464 	    if (do_append && n3 <= (n1 + n2)) {
465 		list3[n3++] = xstrdup(*p);
466 		list3[n3] = NULL;
467 	    }
468 	}
469     }
470 
471     for (p = list1 ; p && *p ; p++) {
472 	do_append = 0;
473 	for (q = list2 ; q && *q ; q++) {
474 	    if (g11n_langtag_match(*p, *q) != 1) continue;
475 	    /* append element */
476 	    lang_subtag = xstrdup(*p);
477 	    if (strchr(lang_subtag, '-'))
478 		*(strchr(lang_subtag, '-')) = '\0';
479 	    for (r = list3; (r - list3) <= (n1 + n2) ; r++) {
480 		do_append = 1;
481 		if (!*r) break;
482 		if (strcmp(lang_subtag, *r) == 0) {
483 		    do_append = 0;
484 		    break;
485 		}
486 	    }
487 	    if (do_append && n3 <= (n1 + n2)) {
488 		list3[n3++] = lang_subtag;
489 		list3[n3] = NULL;
490 	    }
491 	    else
492 		xfree(lang_subtag);
493 	}
494     }
495 
496     set3 = xjoin(list3, ',');
497     xfree_split_list(list1);
498     xfree_split_list(list2);
499     xfree_split_list(list3);
500 
501     return set3;
502 }
503 
/*
 * Pick the language tag a client should use: the first element of the
 * intersection of the client's and the server's comma-separated tag
 * sets, as computed by g11n_langtag_set_intersect().
 *
 * Returns a copy (xstrdup) of that tag, or NULL if the intersection is
 * empty.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
    char *list, *result = NULL;
    char **xlist;

    /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
    list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

    if (list == NULL)
        return NULL;

    xlist = xsplit(list, ',');

    xfree(list);

    if (xlist == NULL)
        return NULL;

    if (*xlist != NULL)
        result = xstrdup(*xlist);

    /* Release the split list on every path, including the empty one */
    xfree_split_list(xlist);

    return result;
}
529 
/*
 * Return 1 if <name> has a codeset part that is exactly "UTF-8", i.e.
 * the first '.' is followed by "UTF-8" and then the end of the name or
 * an '@' modifier.
 */
static int
locale_has_utf8_codeset(const char *name)
{
    const char *dot = strchr(name, '.');

    if (dot == NULL)
        return 0;
    if (strncmp(dot + 1, "UTF-8", 5) != 0)
        return 0;

    return (dot[6] == '\0' || dot[6] == '@');
}

/* Return 1 if <name> is one of the default locales "C", "POSIX", "common". */
static int
locale_is_builtin_default(const char *name)
{
    return (strcmp(name, "C") == 0 ||
        strcmp(name, "POSIX") == 0 ||
        strcmp(name, "common") == 0);
}

/*
 * qsort() comparator for arrays of locale names.  UTF-8 locales sort
 * before all others, the default locales ("C", "POSIX", "common") sort
 * after all others, and ties are broken with a straight strcmp().
 */
static
int
locale_cmp(const void *d1, const void *d2)
{
    const char *s1 = *(char *const *) d1;
    const char *s2 = *(char *const *) d2;
    int utf8_1 = locale_has_utf8_codeset(s1);
    int utf8_2 = locale_has_utf8_codeset(s2);
    int dflt_1, dflt_2;

    /* prefer UTF-8 locales */
    if (utf8_1 != utf8_2)
        return utf8_1 ? -1 : 1;

    /* prefer any locale over the default locales */
    dflt_1 = locale_is_builtin_default(s1);
    dflt_2 = locale_is_builtin_default(s2);
    if (dflt_1 != dflt_2)
        return dflt_1 ? 1 : -1;

    return strcmp(s1, s2);
}
583 
584 
585 char **
586 g11n_langtag_set_locale_set_intersect(char *langtag_set,
587 				      char **locale_set)
588 {
589     char **langtag_list, **result, **p, **q, **r;
590     char *s;
591     u_int do_append, n_langtags, n_locales, n_results, max_results;
592 
593     /* Count lang tags and locales */
594     for (n_locales = 0, p = locale_set ; p && *p ; p++) n_locales++;
595     n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
596     for ( ; s = strchr(s, ',') ; s++, n_langtags++) ;
597     /*
598     while ((s = strchr(s, ','))) {
599 	n_langtags++;
600 	s++;
601     }
602      */
603 
604     qsort(locale_set, n_locales, sizeof(char *), locale_cmp);
605 
606     langtag_list = xsplit(langtag_set, ',');
607     for ( n_langtags = 0, p = langtag_list ; p && *p ; p++, n_langtags++);
608 
609     max_results = MIN(n_locales, n_langtags) * 2;
610     result = (char **) xmalloc(sizeof(char *) * (max_results + 1));
611     *result = NULL;
612     n_results = 0;
613 
614     /* More specific matches first */
615     for (p = langtag_list ; p && *p ; p++) {
616 	do_append = 0;
617 	for (q = locale_set ; q && *q ; q++) {
618 	    if (g11n_langtag_matches_locale(*p, *q) == 2) {
619 		do_append = 1;
620 		for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) {
621 		    if (!*r) break;
622 		    if (strcmp(*q, *r) == 0) {
623 			do_append = 0;
624 			break;
625 		    }
626 		}
627 		if (do_append && n_results < max_results) {
628 		    result[n_results++] = xstrdup(*q);
629 		    result[n_results] = NULL;
630 		}
631 		break;
632 	    }
633 	}
634     }
635 
636     for (p = langtag_list ; p && *p ; p++) {
637 	do_append = 0;
638 	for (q = locale_set ; q && *q ; q++) {
639 	    if (g11n_langtag_matches_locale(*p, *q) == 1) {
640 		do_append = 1;
641 		for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) {
642 		    if (!*r) break;
643 		    if (strcmp(*q, *r) == 0) {
644 			do_append = 0;
645 			break;
646 		    }
647 		}
648 		if (do_append && n_results < max_results) {
649 		    result[n_results++] = xstrdup(*q);
650 		    result[n_results] = NULL;
651 		}
652 		break;
653 	    }
654 	}
655     }
656     xfree_split_list(langtag_list);
657 
658     return result;
659 }
660 
/*
 * Pick the server locale that best matches the client's comma-separated
 * language tags: the first entry produced by
 * g11n_langtag_set_locale_set_intersect().
 *
 * If <srvr_locales> is NULL the list of installed locales is obtained
 * from g11n_getlocales() and released again before returning.  A caller
 * supplied list is sorted in place by the intersect function but is
 * otherwise left to the caller.
 *
 * Returns a copy (xstrdup) of the chosen locale name, or NULL if there
 * is no match.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
    char **locales, **results, *result = NULL;
    int own_locales = (srvr_locales == NULL);

    locales = own_locales ? g11n_getlocales() : srvr_locales;

    results = g11n_langtag_set_locale_set_intersect(clnt_langtags, locales);
    if (results != NULL) {
        if (*results != NULL)
            result = xstrdup(*results);
        xfree_split_list(results);
    }

    /*
     * The list built by g11n_getlocales() has the same layout as
     * <results> (an allocated NULL-terminated array of allocated
     * strings), so it is released the same way.
     */
    if (own_locales && locales != NULL)
        xfree_split_list(locales);

    return result;
}
677 
678 
679 /*
680  * Functions for validating ASCII and UTF-8 strings
681  *
682  * The error_str parameter is an optional pointer to a char variable
683  * where to store a string suitable for use with error() or fatal() or
684  * friends.
685  *
686  * The return value is 0 if success, EILSEQ or EINVAL.
687  *
688  */
689 
/*
 * Check that <str> holds only 7-bit ASCII characters.
 *
 * <len> is the number of bytes to check; 0 means "up to the terminating
 * NUL".  At most <len> bytes are read.  A NUL byte inside the first
 * <len> bytes counts as an error, as does a NULL <str> with a non-zero
 * <len>.
 *
 * Returns 0 on success or EILSEQ on failure; on failure *error_str (if
 * error_str is not NULL) is pointed at a static message.
 */
u_int
g11n_validate_ascii(const char *str, u_int len, u_char **error_str)
{
    const u_char *bytes = (const u_char *) str;
    u_int i;

    if (str == NULL) {
        if (len == 0)
            return 0;
        if (error_str != NULL)
            *error_str = (u_char *) "Missing ASCII string";
        return EILSEQ;
    }

    /* Without an explicit length the whole string must be validated */
    if (len == 0)
        len = strlen(str);

    for (i = 0; i < len; i++) {
        if (bytes[i] == '\0' || (bytes[i] & 0x80) != 0) {
            if (error_str != NULL)
                *error_str = (u_char *)
                    "Invalid character in ASCII string";
            return EILSEQ;
        }
    }

    return 0;
}
702 
/*
 * Check that <str> is well-formed UTF-8.
 *
 * <len> is the number of bytes to check; 0 means "up to the terminating
 * NUL".  Checking also stops, successfully, at the first NUL byte.  A
 * NULL <str> is accepted as an empty string.
 *
 * Rejected: stray continuation bytes, the lead bytes 0xfe/0xff,
 * truncated sequences, overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and the code points U+FFFE and U+FFFF.  As before,
 * five- and six-byte sequences are still accepted when they are not
 * overlong.
 *
 * Returns 0 on success or EILSEQ on failure; on failure *error_str (if
 * error_str is not NULL) is pointed at a static message.
 */
u_int
g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str)
{
    /* Smallest code point that legitimately needs a sequence of length l */
    static const u_int min_codepoint[] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const u_char *p, *end;
    u_int c, l, i;
    int bad = 0;

    if (str == NULL)
        return 0;

    if (len == 0)
        len = strlen((const char *) str);
    end = str + len;

    for (p = str; p < end && *p != '\0'; p += l) {
        /* 7-bit chars are fine */
        if ((*p & 0x80) == 0) {
            l = 1;
            continue;
        }

        /* 8-bit chars begin a UTF-8 sequence: get its length */
        if (*p < 0xc0) {
            bad = 1;		/* continuation byte without a lead byte */
            break;
        } else if (*p < 0xe0) {
            l = 2;
        } else if (*p < 0xf0) {
            l = 3;
        } else if (*p < 0xf8) {
            l = 4;
        } else if (*p < 0xfc) {
            l = 5;
        } else if (*p < 0xfe) {
            l = 6;
        } else {
            bad = 1;		/* 0xfe and 0xff never appear in UTF-8 */
            break;
        }

        /* The sequence may end exactly at <end>, but not beyond it */
        if ((u_int) (end - p) < l) {
            bad = 1;
            break;
        }

        /*
         * Build the code point.  The lead byte of an l-byte sequence
         * carries (7 - l) payload bits; masking with anything wider
         * would mix the length marker into the value.
         */
        c = *p & (0x7f >> l);
        for (i = 1; i < l; i++) {
            if ((p[i] & 0xc0) != 0x80) {
                bad = 1;
                break;
            }
            c = (c << 6) | (p[i] & 0x3f);
        }
        if (bad)
            break;

        /* overlong detection */
        if (c < min_codepoint[l]) {
            bad = 1;
            break;
        }

        /* Check for UTF-16 surrogates and other illegal code points */
        if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff) {
            bad = 1;
            break;
        }
    }

    if (bad) {
        if (error_str != NULL)
            *error_str = (u_char *) "Invalid UTF-8 sequence";
        return EILSEQ;
    }

    return 0;
}
790 
791 /*
792  * Functions for converting to ASCII or UTF-8 from the local codeset
793  * Functions for converting from ASCII or UTF-8 to the local codeset
794  *
795  * The error_str parameter is an optional pointer to a char variable
796  * where to store a string suitable for use with error() or fatal() or
797  * friends.
798  *
799  * The err parameter is an optional pointer to an integer where 0
800  * (success) or EILSEQ or EINVAL will be stored (failure).
801  *
802  * These functions return NULL if the conversion fails.
803  *
804  */
805 
/*
 * Convert an ASCII string to the codeset of the current locale.
 *
 * The decision whether a conversion is needed, and the iconv descriptor
 * used for it, are established on the first call and kept for the life
 * of the process.  When the local codeset is itself ASCII the string is
 * only validated with g11n_validate_ascii() and copied.
 *
 * Returns a newly allocated string, or NULL on failure; on failure
 * *err_ptr and *error_str are filled in when those pointers are not
 * NULL.
 */
u_char *
g11n_convert_from_ascii(const char *str, int *err_ptr, u_char **error_str)
{
    static u_int initialized = 0;
    static u_int do_convert = 0;
    static u_int cd_ok = 0;
    static int open_errno = 0;
    /* must be static: it is opened once and reused by later calls */
    static iconv_t cd;
    char *codeset;
    int err;

    if (!initialized) {
        /*
         * iconv_open() fails if the to/from codesets are the
         * same, and there are aliases of codesets to boot...
         */
        codeset = nl_langinfo(CODESET);
        if (strcmp("646", codeset) == 0 ||
            strcmp("ASCII", codeset) == 0 ||
            strcmp("US-ASCII", codeset) == 0) {
            do_convert = 0;
        } else {
            do_convert = 1;
            cd = iconv_open(codeset, "646");
            if (cd == (iconv_t) -1)
                open_errno = errno;
            else
                cd_ok = 1;
        }
        initialized = 1;
    }

    if (!do_convert) {
        if ((err = g11n_validate_ascii(str, 0, error_str))) {
            if (err_ptr)
                *err_ptr = err;
            return NULL;
        }
        return (u_char *) xstrdup(str);
    }

    /* Never hand an invalid descriptor to iconv() */
    if (!cd_ok) {
        if (err_ptr)
            *err_ptr = open_errno;
        if (error_str)
            *error_str = (u_char *)
                "Cannot convert ASCII strings to the local codeset";
        return NULL;
    }

    return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
}
847 
/*
 * Convert a UTF-8 string to the codeset of the current locale.
 *
 * The decision whether a conversion is needed, and the iconv descriptor
 * used for it, are established on the first call and kept for the life
 * of the process.  When the local codeset is itself UTF-8 the string is
 * only validated with g11n_validate_utf8() and copied.
 *
 * Returns a newly allocated string, or NULL on failure; on failure
 * *err_ptr and *error_str are filled in when those pointers are not
 * NULL.
 */
u_char *
g11n_convert_from_utf8(const u_char *str, int *err_ptr, u_char **error_str)
{
    static u_int initialized = 0;
    static u_int do_convert = 0;
    static u_int cd_ok = 0;
    static int open_errno = 0;
    /* must be static: it is opened once and reused by later calls */
    static iconv_t cd;
    char *codeset;
    int err;

    if (!initialized) {
        /*
         * iconv_open() fails if the to/from codesets are the
         * same, and there are aliases of codesets to boot...
         */
        codeset = nl_langinfo(CODESET);
        if (strcmp("UTF-8", codeset) == 0 ||
            strcmp("UTF8", codeset) == 0) {
            do_convert = 0;
        } else {
            do_convert = 1;
            cd = iconv_open(codeset, "UTF-8");
            if (cd == (iconv_t) -1)
                open_errno = errno;
            else
                cd_ok = 1;
        }
        initialized = 1;
    }

    if (!do_convert) {
        if ((err = g11n_validate_utf8(str, 0, error_str))) {
            if (err_ptr)
                *err_ptr = err;
            return NULL;
        }
        return (u_char *) xstrdup((char *) str);
    }

    /* Never hand an invalid descriptor to iconv() */
    if (!cd_ok) {
        if (err_ptr)
            *err_ptr = open_errno;
        if (error_str)
            *error_str = (u_char *)
                "Cannot convert UTF-8 strings to the local codeset";
        return NULL;
    }

    return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
}
888 
/*
 * Convert a string in the codeset of the current locale to ASCII.
 *
 * The decision whether a conversion is needed, and the iconv descriptor
 * used for it, are established on the first call and kept for the life
 * of the process.  When the local codeset is itself ASCII the string is
 * copied as is.
 *
 * Returns a newly allocated string, or NULL on failure; on failure
 * *err_ptr and *error_str are filled in when those pointers are not
 * NULL.
 */
char *
g11n_convert_to_ascii(const u_char *str, int *err_ptr, u_char **error_str)
{
    static u_int initialized = 0;
    static u_int do_convert = 0;
    static u_int cd_ok = 0;
    static int open_errno = 0;
    /* must be static: it is opened once and reused by later calls */
    static iconv_t cd;
    char *codeset;

    if (!initialized) {
        /*
         * iconv_open() fails if the to/from codesets are the
         * same, and there are aliases of codesets to boot...
         */
        codeset = nl_langinfo(CODESET);
        if (strcmp("646", codeset) == 0 ||
            strcmp("ASCII", codeset) == 0 ||
            strcmp("US-ASCII", codeset) == 0) {
            do_convert = 0;
        } else {
            do_convert = 1;
            cd = iconv_open("646", codeset);
            if (cd == (iconv_t) -1)
                open_errno = errno;
            else
                cd_ok = 1;
        }
        initialized = 1;
    }

    if (!do_convert)
        return xstrdup((char *) str);

    /* Never hand an invalid descriptor to iconv() */
    if (!cd_ok) {
        if (err_ptr)
            *err_ptr = open_errno;
        if (error_str)
            *error_str = (u_char *)
                "Cannot convert local codeset strings to ASCII";
        return NULL;
    }

    return (char *) do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
}
923 
/*
 * Convert a string in the codeset of the current locale to UTF-8.
 *
 * The decision whether a conversion is needed, and the iconv descriptor
 * used for it, are established on the first call and kept for the life
 * of the process.  When the local codeset is itself UTF-8 the string is
 * copied as is.
 *
 * Returns a newly allocated string, or NULL on failure; on failure
 * *err_ptr and *error_str are filled in when those pointers are not
 * NULL.
 */
u_char *
g11n_convert_to_utf8(const u_char *str, int *err_ptr, u_char **error_str)
{
    static u_int initialized = 0;
    static u_int do_convert = 0;
    static u_int cd_ok = 0;
    static int open_errno = 0;
    /* must be static: it is opened once and reused by later calls */
    static iconv_t cd;
    char *codeset;

    if (!initialized) {
        /*
         * iconv_open() fails if the to/from codesets are the
         * same, and there are aliases of codesets to boot...
         */
        codeset = nl_langinfo(CODESET);
        if (strcmp("UTF-8", codeset) == 0 ||
            strcmp("UTF8", codeset) == 0) {
            do_convert = 0;
        } else {
            do_convert = 1;
            cd = iconv_open("UTF-8", codeset);
            if (cd == (iconv_t) -1)
                open_errno = errno;
            else
                cd_ok = 1;
        }
        initialized = 1;
    }

    if (!do_convert)
        return (u_char *) xstrdup((char *) str);

    /* Never hand an invalid descriptor to iconv() */
    if (!cd_ok) {
        if (err_ptr)
            *err_ptr = open_errno;
        if (error_str)
            *error_str = (u_char *)
                "Cannot convert local codeset strings to UTF-8";
        return NULL;
    }

    return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
}
957 
958 
/*
 * Wrapper around iconv()
 *
 * Converts <len> bytes at <buf> (or, if <len> is 0, the NUL-terminated
 * string at <buf>) using conversion descriptor <cd>.
 *
 * The output buffer starts at (len << mul) bytes, where mul is *mul_ptr
 * if mul_ptr is given and 0 otherwise, and is doubled each time iconv()
 * reports E2BIG, up to a factor of 8.  The final factor is stored back
 * through mul_ptr and the number of output bytes through outlen, when
 * those pointers are not NULL.
 *
 * Returns a newly allocated, NUL-terminated result that the caller must
 * free.  Returns NULL if <buf> is NULL or empty (without touching *err),
 * or if the conversion fails, in which case *err (if given) is set to
 * the iconv() errno value (e.g., EILSEQ, EINVAL, EBADF) or to EILSEQ
 * when the output would not fit in eight times the input size.
 */

static
u_char *
do_iconv(iconv_t cd, u_int *mul_ptr,
	 const void *buf, u_int len,
	 u_int *outlen, int *err,
	 u_char **err_str)
{
    size_t inbytesleft, outbytesleft, converted_size, used;
    char *outbuf;
    u_char *converted;
    const char *inbuf;
    u_int mul = 0;

    if (!buf || !(*(char *)buf))
        return NULL;
    if (len == 0)
        len = strlen(buf);

    /*
     * Reset the conversion descriptor to its initial shift state.  All
     * buffer arguments are NULL so that nothing is written anywhere.
     */
    (void) iconv(cd, NULL, NULL, NULL, NULL);

    inbuf = (const char *) buf;
    inbytesleft = len;
    if (mul_ptr)
        mul = *mul_ptr;
    converted_size = ((size_t) len << mul);
    converted = (u_char *) xmalloc(converted_size + 1); /* for null */
    outbuf = (char *) converted;
    outbytesleft = converted_size;

    while (inbytesleft > 0) {
        if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) !=
            (size_t) -1)
            continue;

        if (errno != E2BIG) {
            /*
             * Let the caller deal with iconv() errors, probably by
             * calling fatal(); xfree() does not set errno.
             */
            if (err)
                *err = errno;
            xfree(converted);
            return NULL;
        }

        /* Output buffer full: give up once it is 8x the input size */
        if (mul > 2) {
            if (err_str)
                *err_str = (u_char *) "Conversion to UTF-8 failed due to "
                    "preposterous space requirements";
            if (err)
                *err = EILSEQ;
            xfree(converted);
            return NULL;
        }

        /*
         * Re-alloc output and ensure that the outbuf and outbytesleft
         * values are adjusted: the bytes already produced stay where
         * they are and everything after them is free space.
         */
        used = converted_size - outbytesleft;
        mul++;
        converted_size = ((size_t) len << mul);
        converted = xrealloc(converted, converted_size + 1);
        outbuf = (char *) converted + used;
        outbytesleft = converted_size - used;
    }

    *outbuf = '\0'; /* ensure null-termination */
    if (outlen)
        *outlen = converted_size - outbytesleft;
    if (mul_ptr)
        *mul_ptr = mul;
    return converted;
}
1029