xref: /titanic_50/usr/src/cmd/ssh/libssh/common/g11n.c (revision 8883f1c270cc8e33c18dd088e744840092b47bbb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 #include <errno.h>
28 #include <locale.h>
29 #include <langinfo.h>
30 #include <iconv.h>
31 #include <ctype.h>
32 #include <strings.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include "includes.h"
37 #include "xmalloc.h"
38 #include "xlist.h"
39 
/*
 * Use our own MIN() no matter whether an included header already
 * supplied one; #undef of a name that is not defined is harmless.
 * Note that both arguments may be evaluated twice.
 */
#undef	MIN
#define	MIN(x, y)	((x) < (y) ? (x) : (y))

/* command run (with "-a") by g11n_getlocales() to list the locales */
#define	LOCALE_PATH	"/usr/bin/locale"

/*
 * Longest language tag built from a locale name: two-letter language
 * code, '-' and two-letter country code (terminating NUL not counted).
 */
#define	LANGTAG_MAX	5
50 
51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
52     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
53 
54 static int locale_cmp(const void *d1, const void *d2);
55 static char *g11n_locale2langtag(char *locale);
56 
57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
59 
60 /*
61  * Convert locale string name into a language tag. The caller is responsible for
62  * freeing the memory allocated for the result.
63  */
64 static char *
65 g11n_locale2langtag(char *locale)
66 {
67 	char *langtag;
68 
69 	/* base cases */
70 	if (!locale || !*locale)
71 		return (NULL);
72 
73 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
74 		return (xstrdup("i-default"));
75 
76 	/* punt for language codes which are not exactly 2 letters */
77 	if (strlen(locale) < 2 ||
78 	    !isalpha(locale[0]) ||
79 	    !isalpha(locale[1]) ||
80 	    (locale[2] != '\0' &&
81 	    locale[2] != '_' &&
82 	    locale[2] != '.' &&
83 	    locale[2] != '@'))
84 		return (NULL);
85 
86 
87 	/* we have a primary language sub-tag */
88 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
89 
90 	strncpy(langtag, locale, 2);
91 	langtag[2] = '\0';
92 
93 	/* do we have country sub-tag? For example: cs_CZ */
94 	if (locale[2] == '_') {
95 		if (strlen(locale) < 5 ||
96 		    !isalpha(locale[3]) ||
97 		    !isalpha(locale[4]) ||
98 		    (locale[5] != '\0' && (locale[5] != '.' &&
99 		    locale[5] != '@'))) {
100 			return (langtag);
101 		}
102 
103 		/* example: create cs-CZ from cs_CZ */
104 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
105 		    locale + 3) == 5)
106 			return (langtag);
107 	}
108 
109 	/* in all other cases we just use the primary language sub-tag */
110 	return (langtag);
111 }
112 
113 uint_t
114 g11n_langtag_is_default(char *langtag)
115 {
116 	return (strcmp(langtag, "i-default") == 0);
117 }
118 
119 /*
120  * This lang tag / locale matching function works only for two-character
121  * language primary sub-tags and two-character country sub-tags.
122  */
123 uint_t
124 g11n_langtag_matches_locale(char *langtag, char *locale)
125 {
126 	/* match "i-default" to the process' current locale if possible */
127 	if (g11n_langtag_is_default(langtag)) {
128 		if (strcasecmp(locale, "POSIX") == 0 ||
129 		    strcasecmp(locale, "C") == 0)
130 			return (1);
131 		else
132 			return (0);
133 	}
134 
135 	/*
136 	 * locale must be at least 2 chars long and the lang part must be
137 	 * exactly two characters
138 	 */
139 	if (strlen(locale) < 2 ||
140 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
141 	    (locale[2] != '\0' && locale[2] != '_' &&
142 	    locale[2] != '.' && locale[2] != '@')))
143 		return (0);
144 
145 	/* same thing with the langtag */
146 	if (strlen(langtag) < 2 ||
147 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
148 	    (langtag[2] != '\0' && langtag[2] != '-')))
149 		return (0);
150 
151 	/* primary language sub-tag and the locale's language part must match */
152 	if (strncasecmp(langtag, locale, 2) != 0)
153 		return (0);
154 
155 	/*
156 	 * primary language sub-tag and the locale's language match, now
157 	 * fuzzy check country part
158 	 */
159 
160 	/* neither langtag nor locale have more than one component */
161 	if (langtag[2] == '\0' &&
162 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
163 		return (2);
164 
165 	/* langtag has only one sub-tag... */
166 	if (langtag[2] == '\0')
167 		return (1);
168 
169 	/* locale has no country code... */
170 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
171 		return (1);
172 
173 	/* langtag has more than one subtag and the locale has a country code */
174 
175 	/* ignore second subtag if not two chars */
176 	if (strlen(langtag) < 5)
177 		return (1);
178 
179 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
180 	    (langtag[5] != '\0' && langtag[5] != '-'))
181 		return (1);
182 
183 	/* ignore rest of locale if there is no two-character country part */
184 	if (strlen(locale) < 5)
185 		return (1);
186 
187 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
188 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
189 		return (1);
190 
191 	/* if the country part matches, return 2 */
192 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
193 		return (2);
194 
195 	return (1);
196 }
197 
198 char *
199 g11n_getlocale()
200 {
201 	/* we have one text domain - always set it */
202 	(void) textdomain(TEXT_DOMAIN);
203 
204 	/* if the locale is not set, set it from the env vars */
205 	if (!setlocale(LC_MESSAGES, NULL))
206 		(void) setlocale(LC_MESSAGES, "");
207 
208 	return (setlocale(LC_MESSAGES, NULL));
209 }
210 
211 void
212 g11n_setlocale(int category, const char *locale)
213 {
214 	char *curr;
215 
216 	/* we have one text domain - always set it */
217 	(void) textdomain(TEXT_DOMAIN);
218 
219 	if (!locale)
220 		return;
221 
222 	if (*locale && ((curr = setlocale(category, NULL))) &&
223 	    strcmp(curr, locale) == 0)
224 		return;
225 
226 	/* if <category> is bogus, setlocale() will do nothing */
227 	(void) setlocale(category, locale);
228 }
229 
230 char **
231 g11n_getlocales()
232 {
233 	FILE *locale_out;
234 	uint_t n_elems, list_size, long_line = 0;
235 	char **list;
236 	char locale[64];	/* 64 bytes is plenty for locale names */
237 
238 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
239 		return (NULL);
240 
241 	/*
242 	 * start with enough room for 65 locales - that's a lot fewer than
243 	 * all the locales available for installation, but a lot more than
244 	 * what most users will need and install
245 	 */
246 	n_elems = 0;
247 	list_size = 192;
248 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
249 	memset(list, 0, sizeof (char *) * (list_size + 1));
250 
251 	while (fgets(locale, sizeof (locale), locale_out)) {
252 		/* skip long locale names (if any) */
253 		if (!strchr(locale, '\n')) {
254 			long_line = 1;
255 			continue;
256 		} else if (long_line) {
257 			long_line = 0;
258 			continue;
259 		}
260 
261 		if (strncmp(locale, "iso_8859", 8) == 0)
262 			/* ignore locale names like "iso_8859-1" */
263 			continue;
264 
265 		if (n_elems == list_size) {
266 			list_size *= 2;
267 			list = (char **)xrealloc((void *) list,
268 			    (list_size + 1) * sizeof (char *));
269 			memset(&list[n_elems + 1], 0,
270 			    sizeof (char *) * (list_size - n_elems + 1));
271 		}
272 
273 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
274 		list[n_elems++] = xstrdup(locale);
275 	}
276 
277 	(void) pclose(locale_out);
278 
279 	if (n_elems == 0) {
280 		xfree(list);
281 		return (NULL);
282 	}
283 
284 	list[n_elems] = NULL;
285 
286 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
287 	return (list);
288 }
289 
/*
 * Return the language tag(s) to use, as an allocated string.
 *
 * The SSH_LANGS environment variable wins if set. Otherwise the current
 * LC_MESSAGES locale is converted into a language tag, with "i-default"
 * standing in for a missing or empty locale name. The result is NULL
 * when the locale name cannot be converted into a language tag.
 */
char *
g11n_getlangs()
{
	char *from_env, *locale;

	from_env = getenv("SSH_LANGS");
	if (from_env != NULL)
		return (xstrdup(from_env));

	locale = g11n_getlocale();
	if (locale == NULL || *locale == '\0')
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(locale));
}
305 
/*
 * Turn a NULL-terminated array of locale names into a comma-separated
 * list of language tags.
 *
 * Locales that cannot be converted are skipped and every language tag
 * is listed once, in order of first appearance. The joined string is
 * produced by xjoin() and returned to the caller; locale_set itself is
 * not modified. A NULL locale_set is treated as an empty set.
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **tags;
	char *tag, *langs;
	int n_locales = 0, n_tags = 0;
	int i, j, seen;

	if (locale_set != NULL) {
		while (locale_set[n_locales] != NULL)
			n_locales++;
	}

	/* zero-filled, so the array is NULL-terminated at all times */
	tags = (char **)xmalloc((n_locales + 1) * sizeof (char *));
	memset(tags, 0, (n_locales + 1) * sizeof (char *));

	for (i = 0; i < n_locales; i++) {
		tag = g11n_locale2langtag(locale_set[i]);
		if (tag == NULL)
			continue;

		seen = 0;
		for (j = 0; j < n_tags && !seen; j++)
			seen = (strcmp(tags[j], tag) == 0);

		if (seen)
			xfree(tag);
		else
			tags[n_tags++] = tag;
	}

	langs = xjoin(tags, ',');
	g11n_freelist(tags);

	return (langs);
}
341 
/*
 * qsort() comparison function for arrays of strings: d1 and d2 point at
 * array elements (char *), and the strings they refer to are ordered
 * with strcmp().
 */
static int
sortcmp(const void *d1, const void *d2)
{
	char *const *lhs = d1;
	char *const *rhs = d2;

	return (strcmp(*lhs, *rhs));
}
350 
/*
 * Compare two language tags by their first two '-'-separated sub-tags
 * (language and country). The comparison is case-sensitive.
 *
 * Returns:
 *	0	the primary (language) sub-tags differ
 *	1	the languages match, but only one tag has a second
 *		sub-tag or the second sub-tags differ
 *	2	the languages match and either neither tag has a second
 *		sub-tag or the second sub-tags match too
 *
 * Sub-tags after the second one are ignored.
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	size_t n1, n2;

	/* length of the primary sub-tag: up to the first '-' or the end */
	n1 = strcspn(langtag1, "-");
	n2 = strcspn(langtag2, "-");

	/* no match */
	if (n1 != n2 || strncmp(langtag1, langtag2, n1) != 0)
		return (0);

	/* from here on each tag continues with either '-' or NUL */

	/* no country sub-tags - exact match */
	if (langtag1[n1] == '\0' && langtag2[n2] == '\0')
		return (2);

	/* one langtag has a country sub-tag, the other doesn't */
	if (langtag1[n1] == '\0' || langtag2[n2] == '\0')
		return (1);

	/* step over the '-' and compare the country sub-tags */
	langtag1 += n1 + 1;
	langtag2 += n2 + 1;

	n1 = strcspn(langtag1, "-");
	n2 = strcspn(langtag2, "-");

	if (n1 == n2 && strncmp(langtag1, langtag2, n1) == 0)
		return (2);	/* country tags matched - exact match */

	return (1);
}
400 
401 char *
402 g11n_langtag_set_intersect(char *set1, char *set2)
403 {
404 	char **list1, **list2, **list3, **p, **q, **r;
405 	char *set3, *lang_subtag;
406 	uint_t n1, n2, n3;
407 	uint_t do_append;
408 
409 	list1 = xsplit(set1, ',');
410 	list2 = xsplit(set2, ',');
411 
412 	for (n1 = 0, p = list1; p && *p; p++, n1++)
413 		;
414 	for (n2 = 0, p = list2; p && *p; p++, n2++)
415 		;
416 
417 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
418 	*list3 = NULL;
419 
420 	/*
421 	 * we must not sort the user langtags - sorting or not the server's
422 	 * should not affect the outcome
423 	 */
424 	qsort(list2, n2, sizeof (char *), sortcmp);
425 
426 	for (n3 = 0, p = list1; p && *p; p++) {
427 		do_append = 0;
428 		for (q = list2; q && *q; q++) {
429 			if (g11n_langtag_match(*p, *q) != 2) continue;
430 			/* append element */
431 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
432 				do_append = 1;
433 				if (!*r)
434 					break;
435 				if (strcmp(*p, *r) == 0) {
436 					do_append = 0;
437 					break;
438 				}
439 			}
440 			if (do_append && n3 <= (n1 + n2)) {
441 				list3[n3++] = xstrdup(*p);
442 				list3[n3] = NULL;
443 			}
444 		}
445 	}
446 
447 	for (p = list1; p && *p; p++) {
448 		do_append = 0;
449 		for (q = list2; q && *q; q++) {
450 			if (g11n_langtag_match(*p, *q) != 1)
451 				continue;
452 
453 			/* append element */
454 			lang_subtag = xstrdup(*p);
455 			if (strchr(lang_subtag, '-'))
456 				*(strchr(lang_subtag, '-')) = '\0';
457 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
458 				do_append = 1;
459 				if (!*r)
460 					break;
461 				if (strcmp(lang_subtag, *r) == 0) {
462 					do_append = 0;
463 					break;
464 				}
465 			}
466 			if (do_append && n3 <= (n1 + n2)) {
467 				list3[n3++] = lang_subtag;
468 				list3[n3] = NULL;
469 			} else
470 				xfree(lang_subtag);
471 		}
472 	}
473 
474 	set3 = xjoin(list3, ',');
475 	xfree_split_list(list1);
476 	xfree_split_list(list2);
477 	xfree_split_list(list3);
478 
479 	return (set3);
480 }
481 
/*
 * Pick the language tag a client should use: the first entry of the
 * intersection of the client's and the server's comma-separated language
 * tag sets (see g11n_langtag_set_intersect()).
 *
 * Returns an allocated copy of that tag, which the caller must free, or
 * NULL if the two sets have nothing in common.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (!list)
		return (NULL);

	xlist = xsplit(list, ',');

	xfree(list);

	if (!xlist)
		return (NULL);

	if (*xlist)
		result = xstrdup(*xlist);

	/* release the split list even when it turned out to be empty */
	xfree_split_list(xlist);

	return (result);
}
506 
/*
 * Does the locale name carry the "UTF-8" codeset, i.e. does the text
 * after its first '.' read "UTF-8" followed by the end of the name or by
 * a '@' modifier? Returns 1 or 0.
 */
static int
locale_cmp_is_utf8(const char *name)
{
	const char *dot = strchr(name, '.');

	if (dot == NULL || strncmp(dot + 1, "UTF-8", 5) != 0)
		return (0);

	return (dot[6] == '\0' || dot[6] == '@');
}

/* Is this one of the default locale names "C", "POSIX" or "common"? */
static int
locale_cmp_is_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() comparison function for arrays of locale names.
 *
 * UTF-8 locales sort before all others; among the rest, the default
 * locales ("C", "POSIX", "common") sort after everything else. Ties are
 * broken with a straight strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	const char *s1 = *(char *const *)d1;
	const char *s2 = *(char *const *)d2;
	int s1_utf8 = locale_cmp_is_utf8(s1);
	int s2_utf8 = locale_cmp_is_utf8(s2);
	int s1_default, s2_default;

	/* prefer UTF-8 locales */
	if (s1_utf8 != s2_utf8)
		return (s1_utf8 ? -1 : 1);

	/* prefer any locale over the default locales */
	s1_default = locale_cmp_is_default(s1);
	s2_default = locale_cmp_is_default(s2);
	if (s1_default != s2_default)
		return (s1_default ? 1 : -1);

	return (strcmp(s1, s2));
}
559 
560 
561 char **
562 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
563 {
564 	char **langtag_list, **result, **p, **q, **r;
565 	char *s;
566 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
567 
568 	/* count lang tags and locales */
569 	for (n_locales = 0, p = locale_set; p && *p; p++)
570 		n_locales++;
571 
572 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
573 	/* count the number of langtags */
574 	for (; s = strchr(s, ','); s++, n_langtags++)
575 		;
576 
577 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
578 
579 	langtag_list = xsplit(langtag_set, ',');
580 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
581 		;
582 
583 	max_results = MIN(n_locales, n_langtags) * 2;
584 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
585 	*result = NULL;
586 	n_results = 0;
587 
588 	/* more specific matches first */
589 	for (p = langtag_list; p && *p; p++) {
590 		do_append = 0;
591 		for (q = locale_set; q && *q; q++) {
592 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
593 				do_append = 1;
594 				for (r = result; (r - result) <=
595 				    MIN(n_locales, n_langtags); r++) {
596 					if (!*r)
597 						break;
598 					if (strcmp(*q, *r) == 0) {
599 						do_append = 0;
600 						break;
601 					}
602 				}
603 				if (do_append && n_results < max_results) {
604 					result[n_results++] = xstrdup(*q);
605 					result[n_results] = NULL;
606 				}
607 				break;
608 			}
609 		}
610 	}
611 
612 	for (p = langtag_list; p && *p; p++) {
613 		do_append = 0;
614 		for (q = locale_set; q && *q; q++) {
615 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
616 				do_append = 1;
617 				for (r = result; (r - result) <=
618 				    MIN(n_locales, n_langtags); r++) {
619 					if (!*r)
620 						break;
621 					if (strcmp(*q, *r) == 0) {
622 						do_append = 0;
623 						break;
624 					}
625 				}
626 				if (do_append && n_results < max_results) {
627 					result[n_results++] = xstrdup(*q);
628 					result[n_results] = NULL;
629 				}
630 				break;
631 			}
632 		}
633 	}
634 
635 	xfree_split_list(langtag_list);
636 
637 	return (result);
638 }
639 
/*
 * Pick the server locale that best suits the client's comma-separated
 * language tags: the first entry produced by
 * g11n_langtag_set_locale_set_intersect().
 *
 * srvr_locales is the set of candidate locales; when it is NULL the
 * list from g11n_getlocales() is used (and released again) instead.
 * Returns an allocated copy of the chosen locale name, or NULL if no
 * locale matches.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **candidates, **matches;
	char *best = NULL;

	candidates = (srvr_locales != NULL) ? srvr_locales : g11n_getlocales();

	matches = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    candidates);
	if (matches != NULL) {
		if (matches[0] != NULL)
			best = xstrdup(matches[0]);
		xfree_split_list(matches);
	}

	/* only free a list we fetched ourselves */
	if (candidates != srvr_locales)
		g11n_freelist(candidates);

	return (best);
}
664 
665 
666 /*
667  * Functions for validating ASCII and UTF-8 strings
668  *
669  * The error_str parameter is an optional pointer to a char variable
670  * where to store a string suitable for use with error() or fatal() or
671  * friends.
672  *
673  * The return value is 0 if success, EILSEQ or EINVAL.
674  *
675  */
676 uint_t
677 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
678 {
679 	uchar_t *p;
680 
681 	for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++)
682 		;
683 
684 	if (len && ((p - (uchar_t *)str) != len))
685 		return (EILSEQ);
686 
687 	return (0);
688 }
689 
690 uint_t
691 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
692 {
693 	uchar_t *p;
694 	uint_t c, l;
695 
696 	if (len == 0)
697 		len = strlen((const char *)str);
698 
699 	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
700 		/* 8-bit chars begin a UTF-8 sequence */
701 		if (*p & 0x80) {
702 			/* get sequence length and sanity check first byte */
703 			if (*p < 0xc0)
704 				return (EILSEQ);
705 			else if (*p < 0xe0)
706 				l = 2;
707 			else if (*p < 0xf0)
708 				l = 3;
709 			else if (*p < 0xf8)
710 				l = 4;
711 			else if (*p < 0xfc)
712 				l = 5;
713 			else if (*p < 0xfe)
714 				l = 6;
715 			else
716 				return (EILSEQ);
717 
718 			if ((p + l - str) >= len)
719 				return (EILSEQ);
720 
721 			/* overlong detection - build codepoint */
722 			c = *p & 0x3f;
723 			/* shift c bits from first byte */
724 			c = c << (6 * (l - 1));
725 
726 			if (l > 1) {
727 				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
728 					c = c | ((*(p + 1) & 0x3f) <<
729 					    (6 * (l - 2)));
730 				else
731 					return (EILSEQ);
732 
733 				if (c < 0x80)
734 					return (EILSEQ);
735 			}
736 
737 			if (l > 2) {
738 				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
739 					c = c | ((*(p + 2) & 0x3f) <<
740 					    (6 * (l - 3)));
741 				else
742 					return (EILSEQ);
743 
744 				if (c < 0x800)
745 					return (EILSEQ);
746 			}
747 
748 			if (l > 3) {
749 				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
750 					c = c | ((*(p + 3) & 0x3f) <<
751 					    (6 * (l - 4)));
752 				else
753 					return (EILSEQ);
754 
755 				if (c < 0x10000)
756 					return (EILSEQ);
757 			}
758 
759 			if (l > 4) {
760 				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
761 					c = c | ((*(p + 4) & 0x3f) <<
762 					    (6 * (l - 5)));
763 				else
764 					return (EILSEQ);
765 
766 				if (c < 0x200000)
767 					return (EILSEQ);
768 			}
769 
770 			if (l > 5) {
771 				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
772 					c = c | (*(p + 5) & 0x3f);
773 				else
774 					return (EILSEQ);
775 
776 				if (c < 0x4000000)
777 					return (EILSEQ);
778 			}
779 
780 			/*
781 			 * check for UTF-16 surrogates ifs other illegal
782 			 * UTF-8 * points
783 			 */
784 			if (((c <= 0xdfff) && (c >= 0xd800)) ||
785 			    (c == 0xfffe) || (c == 0xffff))
786 				return (EILSEQ);
787 			p += l;
788 		}
789 		/* 7-bit chars are fine */
790 		else
791 			p++;
792 	}
793 	return (0);
794 }
795 
796 /*
797  * Functions for converting to ASCII or UTF-8 from the local codeset
798  * Functions for converting from ASCII or UTF-8 to the local codeset
799  *
800  * The error_str parameter is an optional pointer to a char variable
801  * where to store a string suitable for use with error() or fatal() or
802  * friends.
803  *
804  * The err parameter is an optional pointer to an integer where 0
805  * (success) or EILSEQ or EINVAL will be stored (failure).
806  *
807  * These functions return NULL if the conversion fails.
808  *
809  */
810 uchar_t *
811 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
812 {
813 	static uint_t initialized = 0;
814 	static uint_t do_convert = 0;
815 	iconv_t cd;
816 	int err;
817 
818 	if (!initialized) {
819 		/*
820 		 * iconv_open() fails if the to/from codesets are the
821 		 * same, and there are aliases of codesets to boot...
822 		 */
823 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
824 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
825 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
826 			initialized = 1;
827 			do_convert = 0;
828 		} else {
829 			cd = iconv_open(nl_langinfo(CODESET), "646");
830 			if (cd == (iconv_t)-1) {
831 				if (err_ptr)
832 					*err_ptr = errno;
833 				if (error_str)
834 					*error_str = (uchar_t *)"Cannot "
835 					    "convert ASCII strings to the local"
836 					    " codeset";
837 			}
838 			initialized = 1;
839 			do_convert = 1;
840 		}
841 	}
842 
843 	if (!do_convert) {
844 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
845 			if (err_ptr)
846 				*err_ptr = err;
847 			return (NULL);
848 		} else
849 			return ((uchar_t *)xstrdup(str));
850 	}
851 
852 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
853 }
854 
855 uchar_t *
856 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
857 {
858 	static uint_t initialized = 0;
859 	static uint_t do_convert = 0;
860 	iconv_t cd;
861 	int err;
862 
863 	if (!initialized) {
864 		/*
865 		 * iconv_open() fails if the to/from codesets are the
866 		 * same, and there are aliases of codesets to boot...
867 		 */
868 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
869 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
870 			initialized = 1;
871 			do_convert = 0;
872 		} else {
873 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
874 			if (cd == (iconv_t)-1) {
875 				if (err_ptr)
876 					*err_ptr = errno;
877 				if (error_str)
878 					*error_str = (uchar_t *)"Cannot "
879 					    "convert UTF-8 strings to the "
880 					    "local codeset";
881 			}
882 			initialized = 1;
883 			do_convert = 1;
884 		}
885 	}
886 
887 	if (!do_convert) {
888 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
889 			if (err_ptr)
890 				*err_ptr = err;
891 			return (NULL);
892 		} else
893 			return ((uchar_t *)xstrdup((char *)str));
894 	}
895 
896 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
897 }
898 
899 char *
900 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
901 {
902 	static uint_t initialized = 0;
903 	static uint_t do_convert = 0;
904 	iconv_t cd;
905 
906 	if (!initialized) {
907 		/*
908 		 * iconv_open() fails if the to/from codesets are the
909 		 * same, and there are aliases of codesets to boot...
910 		 */
911 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
912 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
913 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
914 			initialized = 1;
915 			do_convert = 0;
916 		} else {
917 			cd = iconv_open("646", nl_langinfo(CODESET));
918 			if (cd == (iconv_t)-1) {
919 				if (err_ptr)
920 					*err_ptr = errno;
921 				if (error_str)
922 					*error_str = (uchar_t *)"Cannot "
923 					    "convert UTF-8 strings to the "
924 					    "local codeset";
925 			}
926 			initialized = 1;
927 			do_convert = 1;
928 		}
929 	}
930 
931 	if (!do_convert)
932 		return (xstrdup((char *)str));
933 
934 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
935 }
936 
937 uchar_t *
938 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
939 {
940 	static uint_t initialized = 0;
941 	static uint_t do_convert = 0;
942 	iconv_t cd;
943 
944 	if (!initialized) {
945 		/*
946 		 * iconv_open() fails if the to/from codesets are the
947 		 * same, and there are aliases of codesets to boot...
948 		 */
949 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
950 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
951 			initialized = 1;
952 			do_convert = 0;
953 		} else {
954 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
955 			if (cd == (iconv_t)-1) {
956 				if (err_ptr)
957 					*err_ptr = errno;
958 				if (error_str)
959 					*error_str = (uchar_t *)"Cannot "
960 					    "convert UTF-8 strings to the "
961 					    "local codeset";
962 			}
963 			initialized = 1;
964 			do_convert = 1;
965 		}
966 	}
967 
968 	if (!do_convert)
969 		return ((uchar_t *)xstrdup((char *)str));
970 
971 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
972 }
973 
974 
975 /*
976  * Wrapper around iconv()
977  *
978  * The caller is responsible for freeing the result and for handling
979  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
980  */
981 static uchar_t *
982 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
983     uint_t *outlen, int *err, uchar_t **err_str)
984 {
985 	size_t inbytesleft, outbytesleft, converted_size;
986 	char *outbuf;
987 	uchar_t *converted;
988 	const char *inbuf;
989 	uint_t mul = 0;
990 
991 	if (!buf || !(*(char *)buf))
992 		return (NULL);
993 
994 	if (len == 0)
995 		len = strlen(buf);
996 
997 	/* reset conversion descriptor */
998 	/* XXX Do we need initial shift sequences for UTF-8??? */
999 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
1000 	inbuf = (const char *) buf;
1001 
1002 	if (mul_ptr)
1003 		mul = *mul_ptr;
1004 
1005 	converted_size = (len << mul);
1006 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
1007 	converted = (uchar_t *)outbuf;
1008 	outbytesleft = len;
1009 
1010 	do {
1011 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
1012 		    (size_t)-1) {
1013 			if (errno == E2BIG) {
1014 				/* UTF-8 codepoints are at most 8 bytes long */
1015 				if (mul > 2) {
1016 					if (err_str)
1017 						*err_str = (uchar_t *)
1018 						    "Conversion to UTF-8 failed"
1019 						    " due to preposterous space"
1020 						    " requirements";
1021 					if (err)
1022 						*err = EILSEQ;
1023 					return (NULL);
1024 				}
1025 
1026 				/*
1027 				 * re-alloc output and ensure that the outbuf
1028 				 * and outbytesleft values are adjusted
1029 				 */
1030 				converted = xrealloc(converted,
1031 				    converted_size << 1 + 1);
1032 				outbuf = (char *)converted + converted_size -
1033 				    outbytesleft;
1034 				converted_size = (len << ++(mul));
1035 				outbytesleft = converted_size - outbytesleft;
1036 			} else {
1037 				/*
1038 				 * let the caller deal with iconv() errors,
1039 				 * probably by calling fatal(); xfree() does
1040 				 * not set errno
1041 				 */
1042 				if (err)
1043 					*err = errno;
1044 				xfree(converted);
1045 				return (NULL);
1046 			}
1047 		}
1048 	} while (inbytesleft);
1049 
1050 	*outbuf = '\0'; /* ensure null-termination */
1051 	if (outlen)
1052 		*outlen = converted_size - outbytesleft;
1053 	if (mul_ptr)
1054 		*mul_ptr = mul;
1055 
1056 	return (converted);
1057 }
1058 
/*
 * Free all strings in the list and then free the list itself. The list
 * must end with a NULL pointer. A NULL list is accepted and ignored.
 */
void
g11n_freelist(char **list)
{
	char **entry;

	if (list == NULL)
		return;

	for (entry = list; *entry != NULL; entry++)
		xfree(*entry);

	xfree(list);
}
1075