xref: /titanic_41/usr/src/cmd/ssh/libssh/common/g11n.c (revision 9404882939d18ddd3c94a5bd3da7a0449c195a5d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 #include <errno.h>
28 #include <locale.h>
29 #include <langinfo.h>
30 #include <iconv.h>
31 #include <ctype.h>
32 #include <strings.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include "includes.h"
37 #include "xmalloc.h"
38 #include "xlist.h"
39 
40 #ifdef MIN
41 #undef MIN
42 #endif /* MIN */
43 
44 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
45 
46 #define	LOCALE_PATH	"/usr/bin/locale"
47 
/* two-char language code, '-' and two-char country code */
49 #define	LANGTAG_MAX	5
50 
51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
52     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
53 
54 static int locale_cmp(const void *d1, const void *d2);
55 static char *g11n_locale2langtag(char *locale);
56 
57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
59 
60 static char *
61 g11n_locale2langtag(char *locale)
62 {
63 	char *langtag;
64 
65 	/* base cases */
66 	if (!locale || !*locale)
67 		return (NULL);
68 
69 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
70 		return ("i-default");
71 
72 	/* punt for language codes which are not exactly 2 letters */
73 	if (strlen(locale) < 2 ||
74 	    !isalpha(locale[0]) ||
75 	    !isalpha(locale[1]) ||
76 	    (locale[2] != '\0' &&
77 	    locale[2] != '_' &&
78 	    locale[2] != '.' &&
79 	    locale[2] != '@'))
80 		return (NULL);
81 
82 
83 	/* we have a primary language sub-tag */
84 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
85 
86 	strncpy(langtag, locale, 2);
87 	langtag[2] = '\0';
88 
89 	/* do we have country sub-tag? For example: cs_CZ */
90 	if (locale[2] == '_') {
91 		if (strlen(locale) < 5 ||
92 		    !isalpha(locale[3]) ||
93 		    !isalpha(locale[4]) ||
94 		    (locale[5] != '\0' && (locale[5] != '.' &&
95 		    locale[5] != '@'))) {
96 			return (langtag);
97 		}
98 
99 		/* example: create cs-CZ from cs_CZ */
100 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
101 		    locale + 3) == 5)
102 			return (langtag);
103 	}
104 
105 	/* in all other cases we just use the primary language sub-tag */
106 	return (langtag);
107 }
108 
109 uint_t
110 g11n_langtag_is_default(char *langtag)
111 {
112 	return (strcmp(langtag, "i-default") == 0);
113 }
114 
115 /*
116  * This lang tag / locale matching function works only for two-character
117  * language primary sub-tags and two-character country sub-tags.
118  */
119 uint_t
120 g11n_langtag_matches_locale(char *langtag, char *locale)
121 {
122 	/* match "i-default" to the process' current locale if possible */
123 	if (g11n_langtag_is_default(langtag)) {
124 		if (strcasecmp(locale, "POSIX") == 0 ||
125 		    strcasecmp(locale, "C") == 0)
126 			return (1);
127 		else
128 			return (0);
129 	}
130 
131 	/*
132 	 * locale must be at least 2 chars long and the lang part must be
133 	 * exactly two characters
134 	 */
135 	if (strlen(locale) < 2 ||
136 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
137 	    (locale[2] != '\0' && locale[2] != '_' &&
138 	    locale[2] != '.' && locale[2] != '@')))
139 		return (0);
140 
141 	/* same thing with the langtag */
142 	if (strlen(langtag) < 2 ||
143 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
144 	    (langtag[2] != '\0' && langtag[2] != '-')))
145 		return (0);
146 
147 	/* primary language sub-tag and the locale's language part must match */
148 	if (strncasecmp(langtag, locale, 2) != 0)
149 		return (0);
150 
151 	/*
152 	 * primary language sub-tag and the locale's language match, now
153 	 * fuzzy check country part
154 	 */
155 
156 	/* neither langtag nor locale have more than one component */
157 	if (langtag[2] == '\0' &&
158 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
159 		return (2);
160 
161 	/* langtag has only one sub-tag... */
162 	if (langtag[2] == '\0')
163 		return (1);
164 
165 	/* locale has no country code... */
166 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
167 		return (1);
168 
169 	/* langtag has more than one subtag and the locale has a country code */
170 
171 	/* ignore second subtag if not two chars */
172 	if (strlen(langtag) < 5)
173 		return (1);
174 
175 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
176 	    (langtag[5] != '\0' && langtag[5] != '-'))
177 		return (1);
178 
179 	/* ignore rest of locale if there is no two-character country part */
180 	if (strlen(locale) < 5)
181 		return (1);
182 
183 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
184 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
185 		return (1);
186 
187 	/* if the country part matches, return 2 */
188 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
189 		return (2);
190 
191 	return (1);
192 }
193 
194 char *
195 g11n_getlocale()
196 {
197 	/* we have one text domain - always set it */
198 	(void) textdomain(TEXT_DOMAIN);
199 
200 	/* if the locale is not set, set it from the env vars */
201 	if (!setlocale(LC_MESSAGES, NULL))
202 		(void) setlocale(LC_MESSAGES, "");
203 
204 	return (setlocale(LC_MESSAGES, NULL));
205 }
206 
207 void
208 g11n_setlocale(int category, const char *locale)
209 {
210 	char *curr;
211 
212 	/* we have one text domain - always set it */
213 	(void) textdomain(TEXT_DOMAIN);
214 
215 	if (!locale)
216 		return;
217 
218 	if (*locale && ((curr = setlocale(category, NULL))) &&
219 	    strcmp(curr, locale) == 0)
220 		return;
221 
222 	/* if <category> is bogus, setlocale() will do nothing */
223 	(void) setlocale(category, locale);
224 }
225 
226 char **
227 g11n_getlocales()
228 {
229 	FILE *locale_out;
230 	uint_t n_elems, list_size, long_line = 0;
231 	char **list;
232 	char locale[64];	/* 64 bytes is plenty for locale names */
233 
234 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
235 		return (NULL);
236 
237 	/*
238 	 * start with enough room for 65 locales - that's a lot fewer than
239 	 * all the locales available for installation, but a lot more than
240 	 * what most users will need and install
241 	 */
242 	n_elems = 0;
243 	list_size = 192;
244 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
245 	memset(list, 0, sizeof (char *) * (list_size + 1));
246 
247 	while (fgets(locale, sizeof (locale), locale_out)) {
248 		/* skip long locale names (if any) */
249 		if (!strchr(locale, '\n')) {
250 			long_line = 1;
251 			continue;
252 		} else if (long_line) {
253 			long_line = 0;
254 			continue;
255 		}
256 
257 		if (strncmp(locale, "iso_8859", 8) == 0)
258 			/* ignore locale names like "iso_8859-1" */
259 			continue;
260 
261 		if (n_elems == list_size) {
262 			list_size *= 2;
263 			list = (char **)xrealloc((void *) list,
264 			    (list_size + 1) * sizeof (char *));
265 			memset(&list[n_elems + 1], 0,
266 			    sizeof (char *) * (list_size - n_elems + 1));
267 		}
268 
269 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
270 		list[n_elems++] = xstrdup(locale);
271 	}
272 
273 	if (n_elems == 0)
274 		return (NULL);
275 
276 	list[n_elems] = NULL;
277 	(void) pclose(locale_out);
278 
279 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
280 	return (list);
281 }
282 
/*
 * Return the language tag(s) to use for this process.
 *
 * A copy of the SSH_LANGS environment variable wins if it is set.
 * Otherwise the current locale is converted with g11n_locale2langtag();
 * when no locale name is available the result is a copy of "i-default".
 * May return NULL if the locale cannot be converted to a language tag.
 */
char *
g11n_getlangs()
{
	char *env_langs;
	char *locale;

	env_langs = getenv("SSH_LANGS");
	if (env_langs != NULL)
		return (xstrdup(env_langs));

	locale = g11n_getlocale();
	if (locale == NULL || *locale == '\0')
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(locale));
}
298 
/*
 * Convert a NULL-terminated list of locale names into a comma-separated
 * list of unique language tags, in order of first appearance.  Locales
 * that g11n_locale2langtag() cannot convert are skipped.
 *
 * Returns whatever xjoin() produces for the collected tags.
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **p, **q, **r;
	char *langtag, *langs;
	int locales, n_tags;

	for (locales = 0, p = locale_set; p && *p; p++)
		locales++;

	/* zero-filled, so r is a NULL-terminated list at all times */
	r = (char **)xmalloc((locales + 1) * sizeof (char *));
	memset(r, 0, (locales + 1) * sizeof (char *));

	n_tags = 0;
	for (p = locale_set; p && *p; p++) {
		if ((langtag = g11n_locale2langtag(*p)) == NULL)
			continue;

		/* look for an earlier occurrence of the same tag */
		for (q = r; *q != NULL; q++) {
			if (strcmp(*q, langtag) == 0)
				break;
		}

		if (*q == NULL && n_tags < locales)
			r[n_tags++] = langtag;
	}

	langs = xjoin(r, ',');

	/*
	 * Release the scratch array.
	 * NOTE(review): the tags themselves (kept or rejected as duplicates)
	 * are not freed here; that is only safe if g11n_locale2langtag()
	 * always returns heap storage - confirm before adding xfree() calls.
	 */
	xfree(r);

	return (langs);
}
329 
/*
 * qsort() comparator for arrays of C strings: plain strcmp() ordering.
 * Each argument points at a (char *) array element.
 */
static int
sortcmp(const void *d1, const void *d2)
{
	char * const *lhs = d1;
	char * const *rhs = d2;

	return (strcmp(*lhs, *rhs));
}
338 
/*
 * Compare two language tags sub-tag by sub-tag (case-sensitive).
 *
 * Returns:
 *	0 - the primary (language) sub-tags differ
 *	1 - same language, but only one tag has a second sub-tag, or the
 *	    second sub-tags differ
 *	2 - same language and either neither tag has a second sub-tag or
 *	    the second sub-tags are equal (anything after the second
 *	    sub-tag is ignored)
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	size_t span1, span2;

	/* length of each primary sub-tag: up to the first '-' or the end */
	span1 = strcspn(langtag1, "-");
	span2 = strcspn(langtag2, "-");

	/* no match */
	if (span1 != span2 || strncmp(langtag1, langtag2, span1) != 0)
		return (0);

	/* no country sub-tags - exact match */
	if (langtag1[span1] == '\0' && langtag2[span2] == '\0')
		return (2);

	/* one langtag has a country sub-tag, the other doesn't */
	if (langtag1[span1] == '\0' || langtag2[span2] == '\0')
		return (1);

	/* both tags continue after a '-': compare country subtags */
	langtag1 += span1 + 1;
	langtag2 += span2 + 1;

	span1 = strcspn(langtag1, "-");
	span2 = strcspn(langtag2, "-");

	/* country tags matched - exact match */
	if (span1 == span2 && strncmp(langtag1, langtag2, span1) == 0)
		return (2);

	return (1);
}
388 
389 char *
390 g11n_langtag_set_intersect(char *set1, char *set2)
391 {
392 	char **list1, **list2, **list3, **p, **q, **r;
393 	char *set3, *lang_subtag;
394 	uint_t n1, n2, n3;
395 	uint_t do_append;
396 
397 	list1 = xsplit(set1, ',');
398 	list2 = xsplit(set2, ',');
399 
400 	for (n1 = 0, p = list1; p && *p; p++, n1++)
401 		;
402 	for (n2 = 0, p = list2; p && *p; p++, n2++)
403 		;
404 
405 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
406 	*list3 = NULL;
407 
408 	/*
409 	 * we must not sort the user langtags - sorting or not the server's
410 	 * should not affect the outcome
411 	 */
412 	qsort(list2, n2, sizeof (char *), sortcmp);
413 
414 	for (n3 = 0, p = list1; p && *p; p++) {
415 		do_append = 0;
416 		for (q = list2; q && *q; q++) {
417 			if (g11n_langtag_match(*p, *q) != 2) continue;
418 			/* append element */
419 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
420 				do_append = 1;
421 				if (!*r)
422 					break;
423 				if (strcmp(*p, *r) == 0) {
424 					do_append = 0;
425 					break;
426 				}
427 			}
428 			if (do_append && n3 <= (n1 + n2)) {
429 				list3[n3++] = xstrdup(*p);
430 				list3[n3] = NULL;
431 			}
432 		}
433 	}
434 
435 	for (p = list1; p && *p; p++) {
436 		do_append = 0;
437 		for (q = list2; q && *q; q++) {
438 			if (g11n_langtag_match(*p, *q) != 1)
439 				continue;
440 
441 			/* append element */
442 			lang_subtag = xstrdup(*p);
443 			if (strchr(lang_subtag, '-'))
444 				*(strchr(lang_subtag, '-')) = '\0';
445 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
446 				do_append = 1;
447 				if (!*r)
448 					break;
449 				if (strcmp(lang_subtag, *r) == 0) {
450 					do_append = 0;
451 					break;
452 				}
453 			}
454 			if (do_append && n3 <= (n1 + n2)) {
455 				list3[n3++] = lang_subtag;
456 				list3[n3] = NULL;
457 			} else
458 				xfree(lang_subtag);
459 		}
460 	}
461 
462 	set3 = xjoin(list3, ',');
463 	xfree_split_list(list1);
464 	xfree_split_list(list2);
465 	xfree_split_list(list3);
466 
467 	return (set3);
468 }
469 
/*
 * Pick the client's preferred language tag among those the server also
 * supports: the first element of the intersection of the two
 * comma-separated sets.
 *
 * Returns an xstrdup()ed tag, or NULL if the intersection is empty.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (!list)
		return (NULL);

	xlist = xsplit(list, ',');

	xfree(list);

	if (xlist == NULL)
		return (NULL);

	if (*xlist != NULL)
		result = xstrdup(*xlist);

	/* release the split list on the empty path as well */
	xfree_split_list(xlist);

	return (result);
}
494 
/*
 * Returns 1 if the locale name has a codeset part that is exactly
 * "UTF-8" (i.e. ".UTF-8" followed by end of string or '@'), else 0.
 */
static int
locale_is_utf8(const char *name)
{
	const char *dot = strchr(name, '.');

	if (dot == NULL)
		return (0);

	if (strncmp(dot + 1, "UTF-8", 5) != 0)
		return (0);

	return (dot[6] == '\0' || dot[6] == '@');
}

/* Returns 1 for the default locale names "C", "POSIX" and "common". */
static int
locale_is_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() comparator for locale names.  UTF-8 locales sort first, the
 * default locales ("C", "POSIX", "common") sort last, and everything
 * else falls back to a straight strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	const char *s1 = *(char * const *)d1;
	const char *s2 = *(char * const *)d2;
	int utf8_1 = locale_is_utf8(s1);
	int utf8_2 = locale_is_utf8(s2);
	int dflt_1, dflt_2;

	/* prefer UTF-8 locales */
	if (utf8_1 != utf8_2)
		return (utf8_1 ? -1 : 1);

	/* prefer any locale over the default locales */
	dflt_1 = locale_is_default(s1);
	dflt_2 = locale_is_default(s2);
	if (dflt_1 != dflt_2)
		return (dflt_1 ? 1 : -1);

	return (strcmp(s1, s2));
}
547 
548 
549 char **
550 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
551 {
552 	char **langtag_list, **result, **p, **q, **r;
553 	char *s;
554 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
555 
556 	/* count lang tags and locales */
557 	for (n_locales = 0, p = locale_set; p && *p; p++)
558 		n_locales++;
559 
560 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
561 	/* count the number of langtags */
562 	for (; s = strchr(s, ','); s++, n_langtags++)
563 		;
564 
565 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
566 
567 	langtag_list = xsplit(langtag_set, ',');
568 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
569 		;
570 
571 	max_results = MIN(n_locales, n_langtags) * 2;
572 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
573 	*result = NULL;
574 	n_results = 0;
575 
576 	/* more specific matches first */
577 	for (p = langtag_list; p && *p; p++) {
578 		do_append = 0;
579 		for (q = locale_set; q && *q; q++) {
580 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
581 				do_append = 1;
582 				for (r = result; (r - result) <=
583 				    MIN(n_locales, n_langtags); r++) {
584 					if (!*r)
585 						break;
586 					if (strcmp(*q, *r) == 0) {
587 						do_append = 0;
588 						break;
589 					}
590 				}
591 				if (do_append && n_results < max_results) {
592 					result[n_results++] = xstrdup(*q);
593 					result[n_results] = NULL;
594 				}
595 				break;
596 			}
597 		}
598 	}
599 
600 	for (p = langtag_list; p && *p; p++) {
601 		do_append = 0;
602 		for (q = locale_set; q && *q; q++) {
603 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
604 				do_append = 1;
605 				for (r = result; (r - result) <=
606 				    MIN(n_locales, n_langtags); r++) {
607 					if (!*r)
608 						break;
609 					if (strcmp(*q, *r) == 0) {
610 						do_append = 0;
611 						break;
612 					}
613 				}
614 				if (do_append && n_results < max_results) {
615 					result[n_results++] = xstrdup(*q);
616 					result[n_results] = NULL;
617 				}
618 				break;
619 			}
620 		}
621 	}
622 
623 	xfree_split_list(langtag_list);
624 
625 	return (result);
626 }
627 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * srvr_locales is the NULL-terminated list of candidate locales; when it
 * is NULL the list is obtained from g11n_getlocales() and released again
 * before returning.
 *
 * Returns an xstrdup()ed locale name, or NULL if nothing matches.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **locales, **own_locales = NULL;
	char **results, *result = NULL;

	if (srvr_locales != NULL)
		locales = srvr_locales;
	else
		locales = own_locales = g11n_getlocales();

	results = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    locales);

	if (results != NULL) {
		if (*results != NULL)
			result = xstrdup(*results);
		xfree_split_list(results);
	}

	/* the list we fetched ourselves is ours to free */
	if (own_locales != NULL)
		xfree_split_list(own_locales);

	return (result);
}
644 
645 
646 /*
647  * Functions for validating ASCII and UTF-8 strings
648  *
649  * The error_str parameter is an optional pointer to a char variable
650  * where to store a string suitable for use with error() or fatal() or
651  * friends.
652  *
653  * The return value is 0 if success, EILSEQ or EINVAL.
654  *
655  */
656 uint_t
657 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
658 {
659 	uchar_t *p;
660 
661 	for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++)
662 		;
663 
664 	if (len && ((p - (uchar_t *)str) != len))
665 		return (EILSEQ);
666 
667 	return (0);
668 }
669 
670 uint_t
671 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
672 {
673 	uchar_t *p;
674 	uint_t c, l;
675 
676 	if (len == 0)
677 		len = strlen((const char *)str);
678 
679 	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
680 		/* 8-bit chars begin a UTF-8 sequence */
681 		if (*p & 0x80) {
682 			/* get sequence length and sanity check first byte */
683 			if (*p < 0xc0)
684 				return (EILSEQ);
685 			else if (*p < 0xe0)
686 				l = 2;
687 			else if (*p < 0xf0)
688 				l = 3;
689 			else if (*p < 0xf8)
690 				l = 4;
691 			else if (*p < 0xfc)
692 				l = 5;
693 			else if (*p < 0xfe)
694 				l = 6;
695 			else
696 				return (EILSEQ);
697 
698 			if ((p + l - str) >= len)
699 				return (EILSEQ);
700 
701 			/* overlong detection - build codepoint */
702 			c = *p & 0x3f;
703 			/* shift c bits from first byte */
704 			c = c << (6 * (l - 1));
705 
706 			if (l > 1) {
707 				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
708 					c = c | ((*(p + 1) & 0x3f) <<
709 					    (6 * (l - 2)));
710 				else
711 					return (EILSEQ);
712 
713 				if (c < 0x80)
714 					return (EILSEQ);
715 			}
716 
717 			if (l > 2) {
718 				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
719 					c = c | ((*(p + 2) & 0x3f) <<
720 					    (6 * (l - 3)));
721 				else
722 					return (EILSEQ);
723 
724 				if (c < 0x800)
725 					return (EILSEQ);
726 			}
727 
728 			if (l > 3) {
729 				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
730 					c = c | ((*(p + 3) & 0x3f) <<
731 					    (6 * (l - 4)));
732 				else
733 					return (EILSEQ);
734 
735 				if (c < 0x10000)
736 					return (EILSEQ);
737 			}
738 
739 			if (l > 4) {
740 				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
741 					c = c | ((*(p + 4) & 0x3f) <<
742 					    (6 * (l - 5)));
743 				else
744 					return (EILSEQ);
745 
746 				if (c < 0x200000)
747 					return (EILSEQ);
748 			}
749 
750 			if (l > 5) {
751 				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
752 					c = c | (*(p + 5) & 0x3f);
753 				else
754 					return (EILSEQ);
755 
756 				if (c < 0x4000000)
757 					return (EILSEQ);
758 			}
759 
760 			/*
761 			 * check for UTF-16 surrogates ifs other illegal
762 			 * UTF-8 * points
763 			 */
764 			if (((c <= 0xdfff) && (c >= 0xd800)) ||
765 			    (c == 0xfffe) || (c == 0xffff))
766 				return (EILSEQ);
767 			p += l;
768 		}
769 		/* 7-bit chars are fine */
770 		else
771 			p++;
772 	}
773 	return (0);
774 }
775 
776 /*
777  * Functions for converting to ASCII or UTF-8 from the local codeset
778  * Functions for converting from ASCII or UTF-8 to the local codeset
779  *
780  * The error_str parameter is an optional pointer to a char variable
781  * where to store a string suitable for use with error() or fatal() or
782  * friends.
783  *
784  * The err parameter is an optional pointer to an integer where 0
785  * (success) or EILSEQ or EINVAL will be stored (failure).
786  *
787  * These functions return NULL if the conversion fails.
788  *
789  */
790 uchar_t *
791 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
792 {
793 	static uint_t initialized = 0;
794 	static uint_t do_convert = 0;
795 	iconv_t cd;
796 	int err;
797 
798 	if (!initialized) {
799 		/*
800 		 * iconv_open() fails if the to/from codesets are the
801 		 * same, and there are aliases of codesets to boot...
802 		 */
803 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
804 			strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
805 			strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
806 			initialized = 1;
807 			do_convert = 0;
808 		} else {
809 			cd = iconv_open(nl_langinfo(CODESET), "646");
810 			if (cd == (iconv_t)-1) {
811 				if (err_ptr)
812 					*err_ptr = errno;
813 				if (error_str)
814 					*error_str = (uchar_t *)"Cannot "
815 					    "convert ASCII strings to the local"
816 					    " codeset";
817 			}
818 			initialized = 1;
819 			do_convert = 1;
820 		}
821 	}
822 
823 	if (!do_convert) {
824 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
825 			if (err_ptr)
826 				*err_ptr = err;
827 			return (NULL);
828 		} else
829 			return ((uchar_t *)xstrdup(str));
830 	}
831 
832 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
833 }
834 
835 uchar_t *
836 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
837 {
838 	static uint_t initialized = 0;
839 	static uint_t do_convert = 0;
840 	iconv_t cd;
841 	int err;
842 
843 	if (!initialized) {
844 		/*
845 		 * iconv_open() fails if the to/from codesets are the
846 		 * same, and there are aliases of codesets to boot...
847 		 */
848 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
849 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
850 			initialized = 1;
851 			do_convert = 0;
852 		} else {
853 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
854 			if (cd == (iconv_t)-1) {
855 				if (err_ptr)
856 					*err_ptr = errno;
857 				if (error_str)
858 					*error_str = (uchar_t *)"Cannot "
859 					    "convert UTF-8 strings to the "
860 					    "local codeset";
861 			}
862 			initialized = 1;
863 			do_convert = 1;
864 		}
865 	}
866 
867 	if (!do_convert) {
868 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
869 			if (err_ptr)
870 				*err_ptr = err;
871 			return (NULL);
872 		} else
873 			return ((uchar_t *)xstrdup((char *)str));
874 	}
875 
876 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
877 }
878 
879 char *
880 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
881 {
882 	static uint_t initialized = 0;
883 	static uint_t do_convert = 0;
884 	iconv_t cd;
885 
886 	if (!initialized) {
887 		/*
888 		 * iconv_open() fails if the to/from codesets are the
889 		 * same, and there are aliases of codesets to boot...
890 		 */
891 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
892 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
893 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
894 			initialized = 1;
895 			do_convert = 0;
896 		} else {
897 			cd = iconv_open("646", nl_langinfo(CODESET));
898 			if (cd == (iconv_t)-1) {
899 				if (err_ptr)
900 					*err_ptr = errno;
901 				if (error_str)
902 					*error_str = (uchar_t *)"Cannot "
903 					    "convert UTF-8 strings to the "
904 					    "local codeset";
905 			}
906 			initialized = 1;
907 			do_convert = 1;
908 		}
909 	}
910 
911 	if (!do_convert)
912 		return (xstrdup((char *)str));
913 
914 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
915 }
916 
917 uchar_t *
918 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
919 {
920 	static uint_t initialized = 0;
921 	static uint_t do_convert = 0;
922 	iconv_t cd;
923 
924 	if (!initialized) {
925 		/*
926 		 * iconv_open() fails if the to/from codesets are the
927 		 * same, and there are aliases of codesets to boot...
928 		 */
929 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
930 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
931 			initialized = 1;
932 			do_convert = 0;
933 		} else {
934 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
935 			if (cd == (iconv_t)-1) {
936 				if (err_ptr)
937 					*err_ptr = errno;
938 				if (error_str)
939 					*error_str = (uchar_t *)"Cannot "
940 					    "convert UTF-8 strings to the "
941 					    "local codeset";
942 			}
943 			initialized = 1;
944 			do_convert = 1;
945 		}
946 	}
947 
948 	if (!do_convert)
949 		return ((uchar_t *)xstrdup((char *)str));
950 
951 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
952 }
953 
954 
955 /*
956  * Wrapper around iconv()
957  *
958  * The caller is responsible for freeing the result and for handling
959  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
960  */
961 static uchar_t *
962 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
963     uint_t *outlen, int *err, uchar_t **err_str)
964 {
965 	size_t inbytesleft, outbytesleft, converted_size;
966 	char *outbuf;
967 	uchar_t *converted;
968 	const char *inbuf;
969 	uint_t mul = 0;
970 
971 	if (!buf || !(*(char *)buf))
972 		return (NULL);
973 
974 	if (len == 0)
975 		len = strlen(buf);
976 
977 	/* reset conversion descriptor */
978 	/* XXX Do we need initial shift sequences for UTF-8??? */
979 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
980 	inbuf = (const char *) buf;
981 
982 	if (mul_ptr)
983 		mul = *mul_ptr;
984 
985 	converted_size = (len << mul);
986 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
987 	converted = (uchar_t *)outbuf;
988 	outbytesleft = len;
989 
990 	do {
991 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
992 		    (size_t)-1) {
993 			if (errno == E2BIG) {
994 				/* UTF-8 codepoints are at most 8 bytes long */
995 				if (mul > 2) {
996 					if (err_str)
997 						*err_str = (uchar_t *)
998 						    "Conversion to UTF-8 failed"
999 						    " due to preposterous space"
1000 						    " requirements";
1001 					if (err)
1002 						*err = EILSEQ;
1003 					return (NULL);
1004 				}
1005 
1006 				/*
1007 				 * re-alloc output and ensure that the outbuf
1008 				 * and outbytesleft values are adjusted
1009 				 */
1010 				converted = xrealloc(converted,
1011 				    converted_size << 1 + 1);
1012 				outbuf = (char *)converted + converted_size -
1013 				    outbytesleft;
1014 				converted_size = (len << ++(mul));
1015 				outbytesleft = converted_size - outbytesleft;
1016 			} else {
1017 				/*
1018 				 * let the caller deal with iconv() errors,
1019 				 * probably by calling fatal(); xfree() does
1020 				 * not set errno
1021 				 */
1022 				if (err)
1023 					*err = errno;
1024 				xfree(converted);
1025 				return (NULL);
1026 			}
1027 		}
1028 	} while (inbytesleft);
1029 
1030 	*outbuf = '\0'; /* ensure null-termination */
1031 	if (outlen)
1032 		*outlen = converted_size - outbytesleft;
1033 	if (mul_ptr)
1034 		*mul_ptr = mul;
1035 
1036 	return (converted);
1037 }
1038