xref: /titanic_44/usr/src/cmd/ssh/libssh/common/g11n.c (revision 98579b20de8e05c5117968705a18979f8b75b863)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 #include <errno.h>
28 #include <locale.h>
29 #include <langinfo.h>
30 #include <iconv.h>
31 #include <ctype.h>
32 #include <strings.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include "includes.h"
37 #include "xmalloc.h"
38 #include "xlist.h"
39 
/* Drop any MIN() inherited from system headers and use a local definition */
#undef MIN
#define MIN(x, y)		    (((x) < (y)) ? (x) : (y))

/* Command used to enumerate the locales installed on this system */
#define LOCALE_PATH "/usr/bin/locale"

/* Longest generated langtag: two-char language code, '-', two-char country code */
#define LANGTAG_MAX 5
49 
50 static u_char * do_iconv(iconv_t cd, u_int *mul_ptr,
51 		       const void *buf, u_int len,
52 		       u_int *outlen, int *err,
53 		       u_char **err_str);
54 
55 static int locale_cmp(const void *d1, const void *d2);
56 static char *g11n_locale2langtag(char *locale);
57 
58 u_int
59 g11n_validate_ascii(const char *str, u_int len, u_char **error_str);
60 
61 u_int
62 g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str);
63 
64 static
65 char *
66 g11n_locale2langtag(char *locale)
67 {
68     char *langtag;
69 
70     /* base cases */
71     if (!locale || !*locale) return NULL;
72 
73     if (strcmp(locale, "POSIX") == 0 ||
74 	strcmp(locale, "C") == 0) return "i-default";
75 
76     /* Punt for language codes which are not exactly 2 letters */
77     if (strlen(locale) < 2 ||
78 	!isalpha(locale[0]) ||
79 	!isalpha(locale[1]) ||
80 	(locale[2] != '\0' &&
81 	locale[2] != '_' &&
82 	locale[2] != '.' &&
83 	locale[2] != '@'))
84 	return NULL;
85 
86 
87     /* We have a primary language sub-tag */
88     langtag = (char *) xmalloc(LANGTAG_MAX + 1);
89 
90     strncpy(langtag, locale, 2);
91     langtag[2] = '\0';
92 
93     /* Do we have country sub-tag? */
94     if (locale[2] == '_') {
95 	if (strlen(locale) < 5 ||
96 	    !isalpha(locale[3]) ||
97 	    !isalpha(locale[4]) ||
98 	    (locale[5] != '\0' && (locale[5] != '.' && locale[5] != '@'))) {
99 	    return langtag;
100 	}
101 
102 	/* yes, we do */
103 	/* if (snprintf(langtag, 6, "%s-%s,%s", lang_subtag,
104 		     country_subtag, langtag) == 8) */
105 	if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale,
106 		     2, locale+3) == 5)
107 	    return langtag;
108     }
109 
110     /* In all other cases we just use the primary language sub-tag */
111     return langtag;
112 }
113 
/*
 * Return 1 if langtag is exactly the default language tag "i-default"
 * (case-sensitive comparison), 0 otherwise.  langtag must not be NULL.
 */
u_int
g11n_langtag_is_default(char *langtag)
{
    if (strcmp(langtag, "i-default") != 0)
	return 0;

    return 1;
}
119 
120 /*
121  * This lang tag / locale matching function works only for two-character
122  * language primary sub-tags and two-character country sub-tags.
123  */
124 u_int
125 g11n_langtag_matches_locale(char *langtag, char *locale)
126 {
127     /* Match "i-default" to the process' current locale if possible */
128     if (g11n_langtag_is_default(langtag)) {
129 	if (strcasecmp(locale, "POSIX") == 0 ||
130 	    strcasecmp(locale, "C") == 0)
131 	    return 1;
132 	else
133 	    return 0;
134     }
135 
136     /* locale must be at least 2 chars long and the lang part must be
137      * exactly two characters */
138     if (strlen(locale) < 2 ||
139 	(!isalpha(locale[0]) || !isalpha(locale[1]) ||
140 	(locale[2] != '\0' && locale[2] != '_' && locale[2] != '.' && locale[2] != '@')))
141 	return 0;
142 
143     /* same thing with the langtag */
144     if (strlen(langtag) < 2 ||
145 	(!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
146 	(langtag[2] != '\0' && langtag[2] != '-')))
147 	return 0;
148 
149     /* primary language sub-tag and the locale's language part must match */
150     if (strncasecmp(langtag, locale, 2) != 0)
151 	return 0;
152 
153     /* primary language sub-tag and the locale's language match, now
154      * fuzzy check country part */
155 
156     /* neither langtag nor locale have more than one component */
157     if (langtag[2] == '\0' &&
158         (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
159 	return 2;
160 
161     /* langtag has only one sub-tag... */
162     if (langtag[2] == '\0')
163 	return 1;
164 
165     /* locale has no country code... */
166     if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
167 	return 1;
168 
169     /* langtag has more than one subtag and the locale has a country code */
170 
171     /* ignore second subtag if not two chars */
172     if (strlen(langtag) < 5)
173 	return 1;
174 
175     if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
176 	(langtag[5] != '\0' && langtag[5] != '-'))
177 	return 1;
178 
179     /* ignore rest of locale if there is no two-character country part */
180     if (strlen(locale) < 5)
181 	return 1;
182 
183     if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
184 	(locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
185 	return 1;
186 
187     /* if the country part matches, return 2 */
188     if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
189 	return 2;
190 
191     return 1;
192 }
193 
194 char *
195 g11n_getlocale()
196 {
197     /* We have one text domain - always set it */
198     (void) textdomain(TEXT_DOMAIN);
199 
200     /* If the locale is not set, set it from the env vars */
201     if (!setlocale(LC_MESSAGES, NULL))
202 	(void) setlocale(LC_MESSAGES, "");
203 
204     return setlocale(LC_MESSAGES, NULL);
205 }
206 
207 void
208 g11n_setlocale(int category, const char *locale)
209 {
210     char *curr;
211 
212     /* We have one text domain - always set it */
213     (void) textdomain(TEXT_DOMAIN);
214 
215     if (!locale)
216 	return;
217 
218     if (*locale && ((curr = setlocale(category, NULL))) &&
219 	strcmp(curr, locale) == 0)
220 	return;
221 
222     /*
223      * If <category> is bogus, setlocale() will do nothing.
224      */
225     (void) setlocale(category, locale);
226 
227     return;
228 }
229 
230 char **
231 g11n_getlocales()
232 {
233     FILE *locale_out;
234     u_int n_elems, list_size, long_line = 0;
235     char **list;
236     char locale[64];	/* 64 bytes is plenty for locale names */
237 
238     if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL) {
239 	return NULL;
240     }
241 
242     /*
243      * Start with enough room for 65 locales - that's a lot fewer than
244      * all the locales available for installation, but a lot more than
245      * what most users will need and install
246      */
247     n_elems=0;
248     list_size=192;
249     list = (char **) xmalloc(sizeof(char *) * (list_size + 1));
250     memset(list, 0, sizeof(char *) * (list_size + 1));
251 
252     while (fgets(locale, sizeof(locale), locale_out)) {
253 	/* skip long locale names (if any) */
254 	if (!strchr(locale, '\n')) {
255 	    long_line = 1;
256 	    continue;
257 	}
258 	else if (long_line) {
259 	    long_line = 0;
260 	    continue;
261 	}
262 	if (strncmp(locale, "iso_8859", 8) == 0)
263 	    continue;		    /* ignore locale names like "iso_8859-1" */
264 
265 	if (n_elems == list_size) {
266 	    list_size *= 2;
267 	    list = (char **) xrealloc((void *) list, (list_size + 1) * sizeof(char *));
268 	    memset(&list[n_elems+1], 0, sizeof(char *) * (list_size - n_elems + 1));
269 	}
270 
271 	*(strchr(locale, '\n')) = '\0';      /* remove the trailing \n */
272 
273 	list[n_elems++] = xstrdup(locale);
274     }
275     list[n_elems] = NULL;
276     (void) pclose(locale_out);
277 
278     qsort(list, n_elems - 1, sizeof(char *), locale_cmp);
279     return list;
280 }
281 
/*
 * Return the language tag(s) this process wants to use, as a
 * heap-allocated string.
 *
 * The SSH_LANGS environment variable, if set, is returned verbatim.
 * Otherwise the current locale is converted to a language tag; an unset,
 * "C" or "POSIX" locale yields "i-default".  NULL is returned when the
 * locale name cannot be converted to a language tag.
 */
char *
g11n_getlangs()
{
    char *langs;
    char *locale;

    if ((langs = getenv("SSH_LANGS")) != NULL)
	return xstrdup(langs);

    locale = g11n_getlocale();

    /*
     * Handle the default locales here so that the caller always gets
     * heap memory for "i-default", like on every other path.
     */
    if (!locale || !*locale ||
	strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0)
	return xstrdup("i-default");

    return g11n_locale2langtag(locale);
}
297 
/*
 * Convert a NULL-terminated list of locale names into a comma-separated
 * string of unique language tags, in order of first appearance.
 * Locales that cannot be converted to a language tag are skipped.
 *
 * The result comes from xjoin(); the temporary tag list is released
 * before returning.
 */
char *
g11n_locales2langs(char **locale_set)
{
    char **p, **r, **q;
    char *langtag, *joined;
    int locales, skip;

    for (locales = 0, p = locale_set ; p && *p ; p++)
	locales++;

    /* at most one tag per locale, plus the NULL terminator */
    r = (char **) xmalloc((locales + 1) * sizeof(char *));
    memset(r, 0, (locales + 1) * sizeof(char *));

    for (p = locale_set ; p && *p ; p++) {
	if ((langtag = g11n_locale2langtag(*p)) == NULL)
	    continue;

	/* r always has a NULL slot left, so this scan is bounded */
	skip = 0;
	for (q = r ; *q ; q++) {
	    if (strcmp(*q, langtag) == 0) {
		skip = 1;
		break;
	    }
	}

	if (!skip) {
	    *q = langtag;	/* q is the first free slot */
	    continue;
	}

	/*
	 * Duplicate: drop it.  The default tag may be a string
	 * constant rather than heap memory, so it is never freed.
	 */
	if (!g11n_langtag_is_default(langtag))
	    xfree(langtag);
    }

    joined = xjoin(r, ',');

    /* release the work list (same caveat for the default tag) */
    for (q = r ; *q ; q++) {
	if (!g11n_langtag_is_default(*q))
	    xfree(*q);
    }
    xfree(r);

    return joined;
}
326 
/*
 * qsort() comparator for arrays of C strings: each argument points at
 * an array element (a char *), and the strings are ordered by strcmp().
 */
static
int
sortcmp(const void *d1, const void *d2)
{
    char * const *lhs = (char * const *) d1;
    char * const *rhs = (char * const *) d2;

    return strcmp(*lhs, *rhs);
}
336 
/* Length of the leading sub-tag of tag, i.e. up to the first '-' or NUL */
static
size_t
langtag_subtag_len(const char *tag)
{
    const char *end = tag;

    while (*end != '\0' && *end != '-')
	end++;

    return (size_t) (end - tag);
}

/*
 * Compare two language tags (case-sensitively) on their first two
 * sub-tags; any further sub-tags are ignored.
 *
 * Returns:
 *   0	the primary sub-tags differ
 *   1	the primary sub-tags are equal, but only one tag has a second
 *	sub-tag or the second sub-tags differ
 *   2	the primary sub-tags are equal and the second sub-tags are
 *	either both absent or equal
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
    size_t n1 = langtag_subtag_len(langtag1);
    size_t n2 = langtag_subtag_len(langtag2);
    int more1, more2;

    /* primary sub-tags must be identical */
    if (n1 != n2 || strncmp(langtag1, langtag2, n1) != 0)
	return 0;

    /* each tag now continues with either '-' or the terminating NUL */
    more1 = (langtag1[n1] != '\0');
    more2 = (langtag2[n2] != '\0');

    if (!more1 && !more2)
	return 2;	/* no second sub-tags - exact match */

    if (!more1 || !more2)
	return 1;	/* only one tag has a second sub-tag */

    /* step over the '-' and compare the second sub-tags */
    langtag1 += n1 + 1;
    langtag2 += n2 + 1;
    n1 = langtag_subtag_len(langtag1);
    n2 = langtag_subtag_len(langtag2);

    if (n1 == n2 && strncmp(langtag1, langtag2, n1) == 0)
	return 2;

    return 1;
}
390 
391 char *
392 g11n_langtag_set_intersect(char *set1, char *set2)
393 {
394     char **list1, **list2, **list3, **p, **q, **r;
395     char *set3, *lang_subtag;
396     u_int n1, n2, n3;
397     u_int do_append;
398 
399     list1 = xsplit(set1, ',');
400     list2 = xsplit(set2, ',');
401     for (n1 = 0, p = list1 ; p && *p ; p++, n1++) ;
402     for (n2 = 0, p = list2 ; p && *p ; p++, n2++) ;
403 
404     list3 = (char **) xmalloc(sizeof(char *) * (n1 + n2 + 1));
405     *list3 = NULL;
406 
407     /* we must not sort the user langtags - sorting or not the server's
408      * should not affect the outcome
409      */
410     qsort(list2, n2, sizeof(char *), sortcmp);
411 
412     for (n3 = 0, p = list1 ; p && *p ; p++) {
413 	do_append = 0;
414 	for (q = list2 ; q && *q ; q++) {
415 	    if (g11n_langtag_match(*p, *q) != 2) continue;
416 	    /* append element */
417 	    for (r = list3; (r - list3) <= (n1 + n2) ; r++) {
418 		do_append = 1;
419 		if (!*r) break;
420 		if (strcmp(*p, *r) == 0) {
421 		    do_append = 0;
422 		    break;
423 		}
424 	    }
425 	    if (do_append && n3 <= (n1 + n2)) {
426 		list3[n3++] = xstrdup(*p);
427 		list3[n3] = NULL;
428 	    }
429 	}
430     }
431 
432     for (p = list1 ; p && *p ; p++) {
433 	do_append = 0;
434 	for (q = list2 ; q && *q ; q++) {
435 	    if (g11n_langtag_match(*p, *q) != 1) continue;
436 	    /* append element */
437 	    lang_subtag = xstrdup(*p);
438 	    if (strchr(lang_subtag, '-'))
439 		*(strchr(lang_subtag, '-')) = '\0';
440 	    for (r = list3; (r - list3) <= (n1 + n2) ; r++) {
441 		do_append = 1;
442 		if (!*r) break;
443 		if (strcmp(lang_subtag, *r) == 0) {
444 		    do_append = 0;
445 		    break;
446 		}
447 	    }
448 	    if (do_append && n3 <= (n1 + n2)) {
449 		list3[n3++] = lang_subtag;
450 		list3[n3] = NULL;
451 	    }
452 	    else
453 		xfree(lang_subtag);
454 	}
455     }
456 
457     set3 = xjoin(list3, ',');
458     xfree_split_list(list1);
459     xfree_split_list(list2);
460     xfree_split_list(list3);
461 
462     return set3;
463 }
464 
/*
 * Pick the language tag a client should use: the first element of the
 * intersection of the client's and the server's language tag sets.
 *
 * Returns a heap-allocated copy of that tag, or NULL if the
 * intersection is empty.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
    char *list, *result = NULL;
    char **xlist;

    /* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
    list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

    if (!list)
	return NULL;

    xlist = xsplit(list, ',');

    xfree(list);

    if (!xlist)
	return NULL;

    if (*xlist)
	result = xstrdup(*xlist);

    /* release the split list on the empty-intersection path as well */
    xfree_split_list(xlist);

    return result;
}
490 
/*
 * Return 1 if the locale name has a ".UTF-8" codeset that is followed by
 * the end of the name or by an '@' modifier, 0 otherwise.
 */
static
int
locale_is_utf8(const char *name)
{
    const char *dot = strchr(name, '.');

    if (dot == NULL || strncmp(dot + 1, "UTF-8", 5) != 0)
	return 0;

    /* dot[6] is the character right after "UTF-8" */
    return (dot[6] == '\0' || dot[6] == '@');
}

/* Return 1 for the default locale names "C", "POSIX" and "common" */
static
int
locale_is_default(const char *name)
{
    return (strcmp(name, "C") == 0 ||
	strcmp(name, "POSIX") == 0 ||
	strcmp(name, "common") == 0);
}

/*
 * qsort() comparator for arrays of locale names.
 *
 * UTF-8 locales sort before all others; after that, the default
 * locales ("C", "POSIX", "common") sort after every other locale;
 * remaining ties are broken with a straight strcmp().
 */
static
int
locale_cmp(const void *d1, const void *d2)
{
    const char *s1 = *(char * const *) d1;
    const char *s2 = *(char * const *) d2;
    int utf8_1 = locale_is_utf8(s1);
    int utf8_2 = locale_is_utf8(s2);
    int dflt_1, dflt_2;

    /* prefer UTF-8 locales */
    if (utf8_1 != utf8_2)
	return utf8_1 ? -1 : 1;

    /* prefer any locale over the default locales */
    dflt_1 = locale_is_default(s1);
    dflt_2 = locale_is_default(s2);
    if (dflt_1 != dflt_2)
	return dflt_1 ? 1 : -1;

    return strcmp(s1, s2);
}
544 
545 
546 char **
547 g11n_langtag_set_locale_set_intersect(char *langtag_set,
548 				      char **locale_set)
549 {
550     char **langtag_list, **result, **p, **q, **r;
551     char *s;
552     u_int do_append, n_langtags, n_locales, n_results, max_results;
553 
554     /* Count lang tags and locales */
555     for (n_locales = 0, p = locale_set ; p && *p ; p++) n_locales++;
556     n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
557     for ( ; s = strchr(s, ',') ; s++, n_langtags++) ;
558     /*
559     while ((s = strchr(s, ','))) {
560 	n_langtags++;
561 	s++;
562     }
563      */
564 
565     qsort(locale_set, n_locales, sizeof(char *), locale_cmp);
566 
567     langtag_list = xsplit(langtag_set, ',');
568     for ( n_langtags = 0, p = langtag_list ; p && *p ; p++, n_langtags++);
569 
570     max_results = MIN(n_locales, n_langtags) * 2;
571     result = (char **) xmalloc(sizeof(char *) * (max_results + 1));
572     *result = NULL;
573     n_results = 0;
574 
575     /* More specific matches first */
576     for (p = langtag_list ; p && *p ; p++) {
577 	do_append = 0;
578 	for (q = locale_set ; q && *q ; q++) {
579 	    if (g11n_langtag_matches_locale(*p, *q) == 2) {
580 		do_append = 1;
581 		for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) {
582 		    if (!*r) break;
583 		    if (strcmp(*q, *r) == 0) {
584 			do_append = 0;
585 			break;
586 		    }
587 		}
588 		if (do_append && n_results < max_results) {
589 		    result[n_results++] = xstrdup(*q);
590 		    result[n_results] = NULL;
591 		}
592 		break;
593 	    }
594 	}
595     }
596 
597     for (p = langtag_list ; p && *p ; p++) {
598 	do_append = 0;
599 	for (q = locale_set ; q && *q ; q++) {
600 	    if (g11n_langtag_matches_locale(*p, *q) == 1) {
601 		do_append = 1;
602 		for (r = result ; (r - result) <= MIN(n_locales, n_langtags) ; r++) {
603 		    if (!*r) break;
604 		    if (strcmp(*q, *r) == 0) {
605 			do_append = 0;
606 			break;
607 		    }
608 		}
609 		if (do_append && n_results < max_results) {
610 		    result[n_results++] = xstrdup(*q);
611 		    result[n_results] = NULL;
612 		}
613 		break;
614 	    }
615 	}
616     }
617     xfree_split_list(langtag_list);
618 
619     return result;
620 }
621 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * srvr_locales is the NULL-terminated list of candidate locales; if it
 * is NULL, the locales installed on the system are used instead.  The
 * caller's list is sorted in place by the matching code.
 *
 * Returns a heap-allocated copy of the best matching locale name, or
 * NULL if no locale matches.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
    char **results;
    char **own_locales = NULL;
    char *result = NULL;

    /* remember a list fetched here so that it can be released below */
    if (srvr_locales == NULL)
	srvr_locales = own_locales = g11n_getlocales();

    results = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	srvr_locales);

    if (results != NULL) {
	if (*results != NULL)
	    result = xstrdup(*results);
	xfree_split_list(results);
    }

    if (own_locales != NULL)
	xfree_split_list(own_locales);

    return result;
}
638 
639 
640 /*
641  * Functions for validating ASCII and UTF-8 strings
642  *
643  * The error_str parameter is an optional pointer to a char variable
644  * where to store a string suitable for use with error() or fatal() or
645  * friends.
646  *
647  * The return value is 0 if success, EILSEQ or EINVAL.
648  *
649  */
650 
/*
 * Check that a string consists of 7-bit ASCII characters only.
 *
 * str		the string to check
 * len		number of bytes to check, or 0 to check up to the
 *		terminating NUL
 * error_str	optional; on failure it is pointed at a static message
 *
 * Returns 0 if the string is valid, EILSEQ otherwise.  When len is
 * non-zero, a NUL byte inside the first len bytes is also rejected, and
 * no byte beyond len is examined.  A NULL str is valid only if len is 0.
 */
u_int
g11n_validate_ascii(const char *str, u_int len, u_char **error_str)
{
    const u_char *p = (const u_char *) str;
    u_int i;

    if (p == NULL) {
	if (len == 0)
	    return 0;
	goto invalid;
    }

    for (i = 0 ; ; i++) {
	/* counted string: all len bytes were fine */
	if (len != 0 && i == len)
	    break;

	if (p[i] == '\0') {
	    /* NUL ends an uncounted string, but cuts a counted one short */
	    if (len == 0)
		break;
	    goto invalid;
	}

	if (p[i] & 0x80)
	    goto invalid;
    }
    return 0;

invalid:
    if (error_str)
	*error_str = (u_char *) "String is not valid ASCII";
    return EILSEQ;
}
663 
/*
 * Check that a string is well-formed UTF-8.
 *
 * str		the string to check
 * len		number of bytes to check, or 0 to use strlen(str)
 * error_str	optional; on failure it is pointed at a static message
 *
 * Returns 0 if the string is valid, EILSEQ otherwise.  Checking stops
 * at the first NUL byte or after len bytes, whichever comes first.  A
 * NULL str is treated as valid.
 *
 * Rejected: stray continuation bytes, the lead bytes 0xfe and 0xff,
 * sequences that are truncated or lack continuation bytes, overlong
 * encodings, UTF-16 surrogates (U+D800..U+DFFF), U+FFFE and U+FFFF.
 * The historical 5- and 6-byte forms are still accepted.
 */
u_int
g11n_validate_utf8(const u_char *str, u_int len, u_char **error_str)
{
    /* smallest code point that legitimately needs an l-byte sequence */
    static const u_int min_codepoint[7] = {
	0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const u_char *p, *end;
    u_int c, l, i;

    if (str == NULL)
	return 0;

    if (len == 0)
	len = strlen((const char *) str);
    end = str + len;

    for (p = str ; p < end && *p ; ) {
	/* 7-bit chars are fine */
	if (!(*p & 0x80)) {
	    p++;
	    continue;
	}

	/*
	 * 8-bit chars begin a UTF-8 sequence.  The lead byte gives the
	 * sequence length; the number of payload bits it carries
	 * shrinks as the sequence gets longer.
	 */
	if (*p < 0xc0)
	    goto invalid;	/* continuation byte without a lead byte */
	else if (*p < 0xe0) {
	    l = 2;
	    c = *p & 0x1f;
	} else if (*p < 0xf0) {
	    l = 3;
	    c = *p & 0x0f;
	} else if (*p < 0xf8) {
	    l = 4;
	    c = *p & 0x07;
	} else if (*p < 0xfc) {
	    l = 5;
	    c = *p & 0x03;
	} else if (*p < 0xfe) {
	    l = 6;
	    c = *p & 0x01;
	} else
	    goto invalid;

	/* the sequence may end exactly at the end of the string */
	if ((u_int) (end - p) < l)
	    goto invalid;

	/* build the code point from the continuation bytes */
	for (i = 1 ; i < l ; i++) {
	    if ((p[i] & 0xc0) != 0x80)
		goto invalid;
	    c = (c << 6) | (p[i] & 0x3f);
	}

	/* overlong detection */
	if (c < min_codepoint[l])
	    goto invalid;

	/* UTF-16 surrogates and other illegal code points */
	if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff)
	    goto invalid;

	p += l;
    }
    return 0;

invalid:
    if (error_str)
	*error_str = (u_char *) "String is not valid UTF-8";
    return EILSEQ;
}
751 
752 /*
753  * Functions for converting to ASCII or UTF-8 from the local codeset
754  * Functions for converting from ASCII or UTF-8 to the local codeset
755  *
756  * The error_str parameter is an optional pointer to a char variable
757  * where to store a string suitable for use with error() or fatal() or
758  * friends.
759  *
760  * The err parameter is an optional pointer to an integer where 0
761  * (success) or EILSEQ or EINVAL will be stored (failure).
762  *
763  * These functions return NULL if the conversion fails.
764  *
765  */
766 
767 u_char *
768 g11n_convert_from_ascii(const char *str, int *err_ptr, u_char **error_str)
769 {
770     static u_int initialized = 0;
771     static u_int do_convert = 0;
772     iconv_t cd;
773     int err;
774 
775     if (!initialized) {
776 	/*
777 	 * iconv_open() fails if the to/from codesets are the
778 	 * same, and there are aliases of codesets to boot...
779 	 */
780 	if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
781 	    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
782 	    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
783 	    initialized = 1;
784 	    do_convert = 0;
785 	}
786 	else {
787 	    cd = iconv_open(nl_langinfo(CODESET), "646");
788 	    if (cd == (iconv_t) -1) {
789 		if (err_ptr) *err_ptr = errno;
790 		if (error_str) *error_str = (u_char *)
791 		    "Cannot convert ASCII strings to the local codeset";
792 	    }
793 	    initialized = 1;
794 	    do_convert = 1;
795 	}
796     }
797 
798     if (!do_convert) {
799 	if ((err = g11n_validate_ascii(str, 0, error_str))) {
800 	    if (err_ptr) *err_ptr = err;
801 	    return NULL;
802 	}
803 	else
804 	    return (u_char *) xstrdup(str);
805     }
806     return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
807 }
808 
809 u_char *
810 g11n_convert_from_utf8(const u_char *str, int *err_ptr, u_char **error_str)
811 {
812     static u_int initialized = 0;
813     static u_int do_convert = 0;
814     iconv_t cd;
815     int err;
816 
817     if (!initialized) {
818 	/*
819 	 * iconv_open() fails if the to/from codesets are the
820 	 * same, and there are aliases of codesets to boot...
821 	 */
822 	if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
823 	    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
824 	    initialized = 1;
825 	    do_convert = 0;
826 	}
827 	else {
828 	    cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
829 	    if (cd == (iconv_t) -1) {
830 		if (err_ptr) *err_ptr = errno;
831 		if (error_str) *error_str = (u_char *)
832 		    "Cannot convert UTF-8 strings to the local codeset";
833 	    }
834 	    initialized = 1;
835 	    do_convert = 1;
836 	}
837     }
838 
839     if (!do_convert) {
840 	if ((err = g11n_validate_utf8(str, 0, error_str))) {
841 	    if (err_ptr) *err_ptr = err;
842 	    return NULL;
843 	}
844 	else
845 	    return (u_char *) xstrdup((char *) str);
846     }
847     return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
848 }
849 
850 char *
851 g11n_convert_to_ascii(const u_char *str, int *err_ptr, u_char **error_str)
852 {
853     static u_int initialized = 0;
854     static u_int do_convert = 0;
855     iconv_t cd;
856 
857     if (!initialized) {
858 	/*
859 	 * iconv_open() fails if the to/from codesets are the
860 	 * same, and there are aliases of codesets to boot...
861 	 */
862 	if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
863 	    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
864 	    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
865 	    initialized = 1;
866 	    do_convert = 0;
867 	}
868 	else {
869 	    cd = iconv_open("646", nl_langinfo(CODESET));
870 	    if (cd == (iconv_t) -1) {
871 		if (err_ptr) *err_ptr = errno;
872 		if (error_str) *error_str = (u_char *)
873 		    "Cannot convert UTF-8 strings to the local codeset";
874 	    }
875 	    initialized = 1;
876 	    do_convert = 1;
877 	}
878     }
879 
880     if (!do_convert)
881 	return xstrdup((char *) str);
882     return (char *) do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
883 }
884 
885 u_char *
886 g11n_convert_to_utf8(const u_char *str, int *err_ptr, u_char **error_str)
887 {
888     static u_int initialized = 0;
889     static u_int do_convert = 0;
890     iconv_t cd;
891 
892     if (!initialized) {
893 	/*
894 	 * iconv_open() fails if the to/from codesets are the
895 	 * same, and there are aliases of codesets to boot...
896 	 */
897 	if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
898 	    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
899 	    initialized = 1;
900 	    do_convert = 0;
901 	}
902 	else {
903 	    cd = iconv_open("UTF-8", nl_langinfo(CODESET));
904 	    if (cd == (iconv_t) -1) {
905 		if (err_ptr) *err_ptr = errno;
906 		if (error_str) *error_str = (u_char *)
907 		    "Cannot convert UTF-8 strings to the local codeset";
908 	    }
909 	    initialized = 1;
910 	    do_convert = 1;
911 	}
912     }
913 
914     if (!do_convert)
915 	return (u_char *) xstrdup((char *) str);
916     return do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str);
917 }
918 
919 
920 /*
921  * Wrapper around iconv()
922  *
923  * The caller is responsible for freeing the result and for handling
924  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
925  */
926 
927 static
928 u_char *
929 do_iconv(iconv_t cd, u_int *mul_ptr,
930 	 const void *buf, u_int len,
931 	 u_int *outlen, int *err,
932 	 u_char **err_str)
933 {
934     size_t inbytesleft, outbytesleft, converted_size;
935     char *outbuf;
936     u_char *converted;
937     const char *inbuf;
938     u_int mul = 0;
939 
940     if (!buf || !(*(char *)buf)) return NULL;
941     if (len == 0) len = strlen(buf);
942     /* reset conversion descriptor */
943     /* XXX Do we need initial shift sequences for UTF-8??? */
944     (void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
945     inbuf = (const char *) buf;
946     if (mul_ptr) mul = *mul_ptr;
947     converted_size = (len << mul);
948     outbuf = (char *) xmalloc(converted_size + 1); /* for null */
949     converted = (u_char *) outbuf;
950     outbytesleft = len;
951     do {
952 	if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
953 		(size_t) -1) {
954 	    if (errno == E2BIG) {
955 		/* UTF-8 codepoints are at most 8 bytes long. */
956 		if (mul > 2) {
957 		    if (err_str)
958 			*err_str = (u_char *) "Conversion to UTF-8 failed due to"
959 				  "preposterous space requirements";
960 		    if (err)
961 			*err = EILSEQ;
962 		    return NULL;
963 		}
964 
965 		/*
966 		 * Re-alloc output and ensure that the outbuf
967 		 * and outbytesleft values are adjusted.
968 		 */
969 		converted = xrealloc(converted, converted_size << 1 + 1);
970 		outbuf = (char *) converted + converted_size - outbytesleft;
971 		converted_size = (len << ++(mul));
972 		outbytesleft = converted_size - outbytesleft;
973 	    }
974 	    else {
975 		/*
976 		 * Let the caller deal with iconv() errors, probably by
977 		 * calling fatal(); xfree() does not set errno.
978 		 */
979 		if (err) *err = errno;
980 		xfree(converted);
981 		return NULL;
982 	    }
983 	}
984     } while (inbytesleft);
985     *outbuf = '\0'; /* ensure null-termination */
986     if (outlen) *outlen = converted_size - outbytesleft;
987     if (mul_ptr) *mul_ptr = mul;
988     return converted;
989 }
990