xref: /titanic_50/usr/src/cmd/ssh/libssh/common/g11n.c (revision 551bc2a66868b5cb5be6b70ab9f55515e77a39a9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
26 
27 #include <errno.h>
28 #include <locale.h>
29 #include <langinfo.h>
30 #include <iconv.h>
31 #include <ctype.h>
32 #include <strings.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include "includes.h"
37 #include "xmalloc.h"
38 #include "xlist.h"
39 
40 #ifdef MIN
41 #undef MIN
42 #endif /* MIN */
43 
44 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
45 
46 #define	LOCALE_PATH	"/usr/bin/locale"
47 
48 /* two-char country code, '-' and two-char region code */
49 #define	LANGTAG_MAX	5
50 
51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
52     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
53 
54 static int locale_cmp(const void *d1, const void *d2);
55 static char *g11n_locale2langtag(char *locale);
56 
57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
59 
60 /*
61  * Convert locale string name into a language tag. The caller is responsible for
62  * freeing the memory allocated for the result.
63  */
64 static char *
65 g11n_locale2langtag(char *locale)
66 {
67 	char *langtag;
68 
69 	/* base cases */
70 	if (!locale || !*locale)
71 		return (NULL);
72 
73 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
74 		return (xstrdup("i-default"));
75 
76 	/* punt for language codes which are not exactly 2 letters */
77 	if (strlen(locale) < 2 ||
78 	    !isalpha(locale[0]) ||
79 	    !isalpha(locale[1]) ||
80 	    (locale[2] != '\0' &&
81 	    locale[2] != '_' &&
82 	    locale[2] != '.' &&
83 	    locale[2] != '@'))
84 		return (NULL);
85 
86 
87 	/* we have a primary language sub-tag */
88 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
89 
90 	strncpy(langtag, locale, 2);
91 	langtag[2] = '\0';
92 
93 	/* do we have country sub-tag? For example: cs_CZ */
94 	if (locale[2] == '_') {
95 		if (strlen(locale) < 5 ||
96 		    !isalpha(locale[3]) ||
97 		    !isalpha(locale[4]) ||
98 		    (locale[5] != '\0' && (locale[5] != '.' &&
99 		    locale[5] != '@'))) {
100 			return (langtag);
101 		}
102 
103 		/* example: create cs-CZ from cs_CZ */
104 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
105 		    locale + 3) == 5)
106 			return (langtag);
107 	}
108 
109 	/* in all other cases we just use the primary language sub-tag */
110 	return (langtag);
111 }
112 
113 uint_t
114 g11n_langtag_is_default(char *langtag)
115 {
116 	return (strcmp(langtag, "i-default") == 0);
117 }
118 
119 /*
120  * This lang tag / locale matching function works only for two-character
121  * language primary sub-tags and two-character country sub-tags.
122  */
123 uint_t
124 g11n_langtag_matches_locale(char *langtag, char *locale)
125 {
126 	/* match "i-default" to the process' current locale if possible */
127 	if (g11n_langtag_is_default(langtag)) {
128 		if (strcasecmp(locale, "POSIX") == 0 ||
129 		    strcasecmp(locale, "C") == 0)
130 			return (1);
131 		else
132 			return (0);
133 	}
134 
135 	/*
136 	 * locale must be at least 2 chars long and the lang part must be
137 	 * exactly two characters
138 	 */
139 	if (strlen(locale) < 2 ||
140 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
141 	    (locale[2] != '\0' && locale[2] != '_' &&
142 	    locale[2] != '.' && locale[2] != '@')))
143 		return (0);
144 
145 	/* same thing with the langtag */
146 	if (strlen(langtag) < 2 ||
147 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
148 	    (langtag[2] != '\0' && langtag[2] != '-')))
149 		return (0);
150 
151 	/* primary language sub-tag and the locale's language part must match */
152 	if (strncasecmp(langtag, locale, 2) != 0)
153 		return (0);
154 
155 	/*
156 	 * primary language sub-tag and the locale's language match, now
157 	 * fuzzy check country part
158 	 */
159 
160 	/* neither langtag nor locale have more than one component */
161 	if (langtag[2] == '\0' &&
162 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
163 		return (2);
164 
165 	/* langtag has only one sub-tag... */
166 	if (langtag[2] == '\0')
167 		return (1);
168 
169 	/* locale has no country code... */
170 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
171 		return (1);
172 
173 	/* langtag has more than one subtag and the locale has a country code */
174 
175 	/* ignore second subtag if not two chars */
176 	if (strlen(langtag) < 5)
177 		return (1);
178 
179 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
180 	    (langtag[5] != '\0' && langtag[5] != '-'))
181 		return (1);
182 
183 	/* ignore rest of locale if there is no two-character country part */
184 	if (strlen(locale) < 5)
185 		return (1);
186 
187 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
188 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
189 		return (1);
190 
191 	/* if the country part matches, return 2 */
192 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
193 		return (2);
194 
195 	return (1);
196 }
197 
198 char *
199 g11n_getlocale()
200 {
201 	/* we have one text domain - always set it */
202 	(void) textdomain(TEXT_DOMAIN);
203 
204 	/* if the locale is not set, set it from the env vars */
205 	if (!setlocale(LC_MESSAGES, NULL))
206 		(void) setlocale(LC_MESSAGES, "");
207 
208 	return (setlocale(LC_MESSAGES, NULL));
209 }
210 
211 void
212 g11n_setlocale(int category, const char *locale)
213 {
214 	char *curr;
215 
216 	/* we have one text domain - always set it */
217 	(void) textdomain(TEXT_DOMAIN);
218 
219 	if (!locale)
220 		return;
221 
222 	if (*locale && ((curr = setlocale(category, NULL))) &&
223 	    strcmp(curr, locale) == 0)
224 		return;
225 
226 	/* if <category> is bogus, setlocale() will do nothing */
227 	(void) setlocale(category, locale);
228 }
229 
230 char **
231 g11n_getlocales()
232 {
233 	FILE *locale_out;
234 	uint_t n_elems, list_size, long_line = 0;
235 	char **list;
236 	char locale[64];	/* 64 bytes is plenty for locale names */
237 
238 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
239 		return (NULL);
240 
241 	/*
242 	 * start with enough room for 65 locales - that's a lot fewer than
243 	 * all the locales available for installation, but a lot more than
244 	 * what most users will need and install
245 	 */
246 	n_elems = 0;
247 	list_size = 192;
248 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
249 	memset(list, 0, sizeof (char *) * (list_size + 1));
250 
251 	while (fgets(locale, sizeof (locale), locale_out)) {
252 		/* skip long locale names (if any) */
253 		if (!strchr(locale, '\n')) {
254 			long_line = 1;
255 			continue;
256 		} else if (long_line) {
257 			long_line = 0;
258 			continue;
259 		}
260 
261 		if (strncmp(locale, "iso_8859", 8) == 0)
262 			/* ignore locale names like "iso_8859-1" */
263 			continue;
264 
265 		if (n_elems == list_size) {
266 			list_size *= 2;
267 			list = (char **)xrealloc((void *) list,
268 			    (list_size + 1) * sizeof (char *));
269 			memset(&list[n_elems + 1], 0,
270 			    sizeof (char *) * (list_size - n_elems + 1));
271 		}
272 
273 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
274 		list[n_elems++] = xstrdup(locale);
275 	}
276 
277 	if (n_elems == 0) {
278 		xfree(list);
279 		return (NULL);
280 	}
281 
282 	list[n_elems] = NULL;
283 	(void) pclose(locale_out);
284 
285 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
286 	return (list);
287 }
288 
/*
 * Return a newly allocated language tag list for this process: a copy of the
 * SSH_LANGS environment variable when it is set, otherwise the language tag
 * derived from the current LC_MESSAGES locale ("i-default" when no locale
 * name is available).  The result comes from g11n_locale2langtag() in the
 * last case and may therefore be NULL.
 */
char *
g11n_getlangs()
{
	char *env_langs;
	char *current;

	env_langs = getenv("SSH_LANGS");
	if (env_langs != NULL)
		return (xstrdup(env_langs));

	current = g11n_getlocale();
	if (current == NULL || *current == '\0')
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(current));
}
304 
/*
 * Convert a NULL-terminated array of locale names into a comma-separated
 * list of language tags.  Locales that g11n_locale2langtag() cannot convert
 * are skipped and each language tag is listed only once, in order of first
 * appearance.  The string is built by xjoin().
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **tags;
	char *tag, *joined;
	int count, i, slot, duplicate;

	count = 0;
	if (locale_set != NULL) {
		while (locale_set[count] != NULL)
			count++;
	}

	/* one slot per locale plus the NULL terminator */
	tags = (char **)xmalloc((count + 1) * sizeof (char *));
	(void) memset(tags, 0, (count + 1) * sizeof (char *));

	for (i = 0; i < count; i++) {
		tag = g11n_locale2langtag(locale_set[i]);
		if (tag == NULL)
			continue;

		/* walk the tags collected so far, stopping at the first gap */
		duplicate = 0;
		for (slot = 0; slot < count && tags[slot] != NULL; slot++) {
			if (strcmp(tags[slot], tag) == 0)
				duplicate = 1;
		}

		if (duplicate)
			xfree(tag);
		else
			tags[slot++] = tag;

		/* keep the list NULL-terminated */
		tags[slot] = NULL;
	}

	joined = xjoin(tags, ',');
	g11n_freelist(tags);

	return (joined);
}
340 
/*
 * qsort() comparison callback for arrays of C strings: d1 and d2 point at
 * the array elements (char *) and are ordered with strcmp().
 */
static int
sortcmp(const void *d1, const void *d2)
{
	return (strcmp(*(char **)d1, *(char **)d2));
}
349 
/*
 * Compare two language tags by their first two '-'-separated sub-tags
 * (case-sensitively).
 *
 * Returns:
 *	0 - the primary sub-tags differ
 *	1 - the primary sub-tags are equal, but only one tag has a second
 *	    sub-tag or the second sub-tags differ
 *	2 - the primary sub-tags are equal and the second sub-tags are
 *	    either both absent or equal
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	size_t n1, n2;

	/* length of each primary sub-tag: everything up to the first '-' */
	n1 = strcspn(langtag1, "-");
	n2 = strcspn(langtag2, "-");

	/* no match */
	if (n1 != n2 || strncmp(langtag1, langtag2, n1) != 0)
		return (0);

	/*
	 * The character after a primary sub-tag is either '\0' or '-', so
	 * "not '\0'" means that a second sub-tag follows.
	 */

	/* no country sub-tags - exact match */
	if (langtag1[n1] == '\0' && langtag2[n2] == '\0')
		return (2);

	/* one langtag has a country sub-tag, the other doesn't */
	if (langtag1[n1] == '\0' || langtag2[n2] == '\0')
		return (1);

	/* compare country subtags */
	langtag1 += n1 + 1;
	langtag2 += n2 + 1;
	n1 = strcspn(langtag1, "-");
	n2 = strcspn(langtag2, "-");

	/* country tags matched - exact match */
	if (n1 == n2 && strncmp(langtag1, langtag2, n1) == 0)
		return (2);

	return (1);
}
399 
400 char *
401 g11n_langtag_set_intersect(char *set1, char *set2)
402 {
403 	char **list1, **list2, **list3, **p, **q, **r;
404 	char *set3, *lang_subtag;
405 	uint_t n1, n2, n3;
406 	uint_t do_append;
407 
408 	list1 = xsplit(set1, ',');
409 	list2 = xsplit(set2, ',');
410 
411 	for (n1 = 0, p = list1; p && *p; p++, n1++)
412 		;
413 	for (n2 = 0, p = list2; p && *p; p++, n2++)
414 		;
415 
416 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
417 	*list3 = NULL;
418 
419 	/*
420 	 * we must not sort the user langtags - sorting or not the server's
421 	 * should not affect the outcome
422 	 */
423 	qsort(list2, n2, sizeof (char *), sortcmp);
424 
425 	for (n3 = 0, p = list1; p && *p; p++) {
426 		do_append = 0;
427 		for (q = list2; q && *q; q++) {
428 			if (g11n_langtag_match(*p, *q) != 2) continue;
429 			/* append element */
430 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
431 				do_append = 1;
432 				if (!*r)
433 					break;
434 				if (strcmp(*p, *r) == 0) {
435 					do_append = 0;
436 					break;
437 				}
438 			}
439 			if (do_append && n3 <= (n1 + n2)) {
440 				list3[n3++] = xstrdup(*p);
441 				list3[n3] = NULL;
442 			}
443 		}
444 	}
445 
446 	for (p = list1; p && *p; p++) {
447 		do_append = 0;
448 		for (q = list2; q && *q; q++) {
449 			if (g11n_langtag_match(*p, *q) != 1)
450 				continue;
451 
452 			/* append element */
453 			lang_subtag = xstrdup(*p);
454 			if (strchr(lang_subtag, '-'))
455 				*(strchr(lang_subtag, '-')) = '\0';
456 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
457 				do_append = 1;
458 				if (!*r)
459 					break;
460 				if (strcmp(lang_subtag, *r) == 0) {
461 					do_append = 0;
462 					break;
463 				}
464 			}
465 			if (do_append && n3 <= (n1 + n2)) {
466 				list3[n3++] = lang_subtag;
467 				list3[n3] = NULL;
468 			} else
469 				xfree(lang_subtag);
470 		}
471 	}
472 
473 	set3 = xjoin(list3, ',');
474 	xfree_split_list(list1);
475 	xfree_split_list(list2);
476 	xfree_split_list(list3);
477 
478 	return (set3);
479 }
480 
/*
 * Pick the language tag the client should use: the first entry of the
 * intersection of the client's and the server's language tag sets, as
 * computed by g11n_langtag_set_intersect().
 *
 * Returns a newly allocated copy of that tag, or NULL if there is none.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (!list)
		return (NULL);

	xlist = xsplit(list, ',');

	xfree(list);

	if (xlist == NULL)
		return (NULL);

	if (*xlist != NULL)
		result = xstrdup(*xlist);

	/* release the split list even when it turned out to be empty */
	xfree_split_list(xlist);

	return (result);
}
505 
/*
 * Return non-zero if the locale name has a codeset part (the text after the
 * first '.') that is exactly "UTF-8", optionally followed by an '@'
 * modifier.
 */
static int
locale_is_utf8(const char *name)
{
	const char *codeset = strchr(name, '.');

	if (codeset == NULL)
		return (0);

	codeset++;	/* step over the '.' */
	if (strncmp(codeset, "UTF-8", 5) != 0)
		return (0);

	return (codeset[5] == '\0' || codeset[5] == '@');
}

/*
 * Return non-zero if the name is one of the default locales "C", "POSIX"
 * or "common".
 */
static int
locale_is_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() callback that compares locale names: UTF-8 locales sort before
 * all others, the default locales ("C", "POSIX", "common") sort after any
 * other locale, and ties are broken with a straight strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	char *s1 = *(char **)d1;
	char *s2 = *(char **)d2;
	int utf8_1 = locale_is_utf8(s1);
	int utf8_2 = locale_is_utf8(s2);
	int dflt_1, dflt_2;

	/* prefer UTF-8 locales */
	if (utf8_1 != utf8_2)
		return (utf8_1 ? -1 : 1);

	/* prefer any locale over the default locales */
	dflt_1 = locale_is_default(s1);
	dflt_2 = locale_is_default(s2);
	if (dflt_1 != dflt_2)
		return (dflt_1 ? 1 : -1);

	return (strcmp(s1, s2));
}
558 
559 
560 char **
561 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
562 {
563 	char **langtag_list, **result, **p, **q, **r;
564 	char *s;
565 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
566 
567 	/* count lang tags and locales */
568 	for (n_locales = 0, p = locale_set; p && *p; p++)
569 		n_locales++;
570 
571 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
572 	/* count the number of langtags */
573 	for (; s = strchr(s, ','); s++, n_langtags++)
574 		;
575 
576 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
577 
578 	langtag_list = xsplit(langtag_set, ',');
579 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
580 		;
581 
582 	max_results = MIN(n_locales, n_langtags) * 2;
583 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
584 	*result = NULL;
585 	n_results = 0;
586 
587 	/* more specific matches first */
588 	for (p = langtag_list; p && *p; p++) {
589 		do_append = 0;
590 		for (q = locale_set; q && *q; q++) {
591 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
592 				do_append = 1;
593 				for (r = result; (r - result) <=
594 				    MIN(n_locales, n_langtags); r++) {
595 					if (!*r)
596 						break;
597 					if (strcmp(*q, *r) == 0) {
598 						do_append = 0;
599 						break;
600 					}
601 				}
602 				if (do_append && n_results < max_results) {
603 					result[n_results++] = xstrdup(*q);
604 					result[n_results] = NULL;
605 				}
606 				break;
607 			}
608 		}
609 	}
610 
611 	for (p = langtag_list; p && *p; p++) {
612 		do_append = 0;
613 		for (q = locale_set; q && *q; q++) {
614 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
615 				do_append = 1;
616 				for (r = result; (r - result) <=
617 				    MIN(n_locales, n_langtags); r++) {
618 					if (!*r)
619 						break;
620 					if (strcmp(*q, *r) == 0) {
621 						do_append = 0;
622 						break;
623 					}
624 				}
625 				if (do_append && n_results < max_results) {
626 					result[n_results++] = xstrdup(*q);
627 					result[n_results] = NULL;
628 				}
629 				break;
630 			}
631 		}
632 	}
633 
634 	xfree_split_list(langtag_list);
635 
636 	return (result);
637 }
638 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * When srvr_locales is NULL the candidates come from g11n_getlocales() and
 * are freed again before returning; otherwise the caller's array is used.
 * Returns a newly allocated copy of the first entry produced by
 * g11n_langtag_set_locale_set_intersect(), or NULL if there is none.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **candidates, **matches;
	char *best = NULL;

	candidates = (srvr_locales != NULL) ? srvr_locales : g11n_getlocales();

	matches = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    candidates);
	if (matches != NULL) {
		if (matches[0] != NULL)
			best = xstrdup(matches[0]);
		xfree_split_list(matches);
	}

	/* only free the candidate list if it was obtained here */
	if (candidates != srvr_locales)
		g11n_freelist(candidates);

	return (best);
}
663 
664 
665 /*
666  * Functions for validating ASCII and UTF-8 strings
667  *
668  * The error_str parameter is an optional pointer to a char variable
669  * where to store a string suitable for use with error() or fatal() or
670  * friends.
671  *
672  * The return value is 0 if success, EILSEQ or EINVAL.
673  *
674  */
675 uint_t
676 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
677 {
678 	uchar_t *p;
679 
680 	for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++)
681 		;
682 
683 	if (len && ((p - (uchar_t *)str) != len))
684 		return (EILSEQ);
685 
686 	return (0);
687 }
688 
689 uint_t
690 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
691 {
692 	uchar_t *p;
693 	uint_t c, l;
694 
695 	if (len == 0)
696 		len = strlen((const char *)str);
697 
698 	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
699 		/* 8-bit chars begin a UTF-8 sequence */
700 		if (*p & 0x80) {
701 			/* get sequence length and sanity check first byte */
702 			if (*p < 0xc0)
703 				return (EILSEQ);
704 			else if (*p < 0xe0)
705 				l = 2;
706 			else if (*p < 0xf0)
707 				l = 3;
708 			else if (*p < 0xf8)
709 				l = 4;
710 			else if (*p < 0xfc)
711 				l = 5;
712 			else if (*p < 0xfe)
713 				l = 6;
714 			else
715 				return (EILSEQ);
716 
717 			if ((p + l - str) >= len)
718 				return (EILSEQ);
719 
720 			/* overlong detection - build codepoint */
721 			c = *p & 0x3f;
722 			/* shift c bits from first byte */
723 			c = c << (6 * (l - 1));
724 
725 			if (l > 1) {
726 				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
727 					c = c | ((*(p + 1) & 0x3f) <<
728 					    (6 * (l - 2)));
729 				else
730 					return (EILSEQ);
731 
732 				if (c < 0x80)
733 					return (EILSEQ);
734 			}
735 
736 			if (l > 2) {
737 				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
738 					c = c | ((*(p + 2) & 0x3f) <<
739 					    (6 * (l - 3)));
740 				else
741 					return (EILSEQ);
742 
743 				if (c < 0x800)
744 					return (EILSEQ);
745 			}
746 
747 			if (l > 3) {
748 				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
749 					c = c | ((*(p + 3) & 0x3f) <<
750 					    (6 * (l - 4)));
751 				else
752 					return (EILSEQ);
753 
754 				if (c < 0x10000)
755 					return (EILSEQ);
756 			}
757 
758 			if (l > 4) {
759 				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
760 					c = c | ((*(p + 4) & 0x3f) <<
761 					    (6 * (l - 5)));
762 				else
763 					return (EILSEQ);
764 
765 				if (c < 0x200000)
766 					return (EILSEQ);
767 			}
768 
769 			if (l > 5) {
770 				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
771 					c = c | (*(p + 5) & 0x3f);
772 				else
773 					return (EILSEQ);
774 
775 				if (c < 0x4000000)
776 					return (EILSEQ);
777 			}
778 
779 			/*
780 			 * check for UTF-16 surrogates ifs other illegal
781 			 * UTF-8 * points
782 			 */
783 			if (((c <= 0xdfff) && (c >= 0xd800)) ||
784 			    (c == 0xfffe) || (c == 0xffff))
785 				return (EILSEQ);
786 			p += l;
787 		}
788 		/* 7-bit chars are fine */
789 		else
790 			p++;
791 	}
792 	return (0);
793 }
794 
795 /*
796  * Functions for converting to ASCII or UTF-8 from the local codeset
797  * Functions for converting from ASCII or UTF-8 to the local codeset
798  *
799  * The error_str parameter is an optional pointer to a char variable
800  * where to store a string suitable for use with error() or fatal() or
801  * friends.
802  *
803  * The err parameter is an optional pointer to an integer where 0
804  * (success) or EILSEQ or EINVAL will be stored (failure).
805  *
806  * These functions return NULL if the conversion fails.
807  *
808  */
809 uchar_t *
810 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
811 {
812 	static uint_t initialized = 0;
813 	static uint_t do_convert = 0;
814 	iconv_t cd;
815 	int err;
816 
817 	if (!initialized) {
818 		/*
819 		 * iconv_open() fails if the to/from codesets are the
820 		 * same, and there are aliases of codesets to boot...
821 		 */
822 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
823 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
824 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
825 			initialized = 1;
826 			do_convert = 0;
827 		} else {
828 			cd = iconv_open(nl_langinfo(CODESET), "646");
829 			if (cd == (iconv_t)-1) {
830 				if (err_ptr)
831 					*err_ptr = errno;
832 				if (error_str)
833 					*error_str = (uchar_t *)"Cannot "
834 					    "convert ASCII strings to the local"
835 					    " codeset";
836 			}
837 			initialized = 1;
838 			do_convert = 1;
839 		}
840 	}
841 
842 	if (!do_convert) {
843 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
844 			if (err_ptr)
845 				*err_ptr = err;
846 			return (NULL);
847 		} else
848 			return ((uchar_t *)xstrdup(str));
849 	}
850 
851 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
852 }
853 
854 uchar_t *
855 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
856 {
857 	static uint_t initialized = 0;
858 	static uint_t do_convert = 0;
859 	iconv_t cd;
860 	int err;
861 
862 	if (!initialized) {
863 		/*
864 		 * iconv_open() fails if the to/from codesets are the
865 		 * same, and there are aliases of codesets to boot...
866 		 */
867 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
868 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
869 			initialized = 1;
870 			do_convert = 0;
871 		} else {
872 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
873 			if (cd == (iconv_t)-1) {
874 				if (err_ptr)
875 					*err_ptr = errno;
876 				if (error_str)
877 					*error_str = (uchar_t *)"Cannot "
878 					    "convert UTF-8 strings to the "
879 					    "local codeset";
880 			}
881 			initialized = 1;
882 			do_convert = 1;
883 		}
884 	}
885 
886 	if (!do_convert) {
887 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
888 			if (err_ptr)
889 				*err_ptr = err;
890 			return (NULL);
891 		} else
892 			return ((uchar_t *)xstrdup((char *)str));
893 	}
894 
895 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
896 }
897 
898 char *
899 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
900 {
901 	static uint_t initialized = 0;
902 	static uint_t do_convert = 0;
903 	iconv_t cd;
904 
905 	if (!initialized) {
906 		/*
907 		 * iconv_open() fails if the to/from codesets are the
908 		 * same, and there are aliases of codesets to boot...
909 		 */
910 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
911 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
912 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
913 			initialized = 1;
914 			do_convert = 0;
915 		} else {
916 			cd = iconv_open("646", nl_langinfo(CODESET));
917 			if (cd == (iconv_t)-1) {
918 				if (err_ptr)
919 					*err_ptr = errno;
920 				if (error_str)
921 					*error_str = (uchar_t *)"Cannot "
922 					    "convert UTF-8 strings to the "
923 					    "local codeset";
924 			}
925 			initialized = 1;
926 			do_convert = 1;
927 		}
928 	}
929 
930 	if (!do_convert)
931 		return (xstrdup((char *)str));
932 
933 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
934 }
935 
936 uchar_t *
937 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
938 {
939 	static uint_t initialized = 0;
940 	static uint_t do_convert = 0;
941 	iconv_t cd;
942 
943 	if (!initialized) {
944 		/*
945 		 * iconv_open() fails if the to/from codesets are the
946 		 * same, and there are aliases of codesets to boot...
947 		 */
948 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
949 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
950 			initialized = 1;
951 			do_convert = 0;
952 		} else {
953 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
954 			if (cd == (iconv_t)-1) {
955 				if (err_ptr)
956 					*err_ptr = errno;
957 				if (error_str)
958 					*error_str = (uchar_t *)"Cannot "
959 					    "convert UTF-8 strings to the "
960 					    "local codeset";
961 			}
962 			initialized = 1;
963 			do_convert = 1;
964 		}
965 	}
966 
967 	if (!do_convert)
968 		return ((uchar_t *)xstrdup((char *)str));
969 
970 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
971 }
972 
973 
974 /*
975  * Wrapper around iconv()
976  *
977  * The caller is responsible for freeing the result and for handling
978  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
979  */
980 static uchar_t *
981 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
982     uint_t *outlen, int *err, uchar_t **err_str)
983 {
984 	size_t inbytesleft, outbytesleft, converted_size;
985 	char *outbuf;
986 	uchar_t *converted;
987 	const char *inbuf;
988 	uint_t mul = 0;
989 
990 	if (!buf || !(*(char *)buf))
991 		return (NULL);
992 
993 	if (len == 0)
994 		len = strlen(buf);
995 
996 	/* reset conversion descriptor */
997 	/* XXX Do we need initial shift sequences for UTF-8??? */
998 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
999 	inbuf = (const char *) buf;
1000 
1001 	if (mul_ptr)
1002 		mul = *mul_ptr;
1003 
1004 	converted_size = (len << mul);
1005 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
1006 	converted = (uchar_t *)outbuf;
1007 	outbytesleft = len;
1008 
1009 	do {
1010 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
1011 		    (size_t)-1) {
1012 			if (errno == E2BIG) {
1013 				/* UTF-8 codepoints are at most 8 bytes long */
1014 				if (mul > 2) {
1015 					if (err_str)
1016 						*err_str = (uchar_t *)
1017 						    "Conversion to UTF-8 failed"
1018 						    " due to preposterous space"
1019 						    " requirements";
1020 					if (err)
1021 						*err = EILSEQ;
1022 					return (NULL);
1023 				}
1024 
1025 				/*
1026 				 * re-alloc output and ensure that the outbuf
1027 				 * and outbytesleft values are adjusted
1028 				 */
1029 				converted = xrealloc(converted,
1030 				    converted_size << 1 + 1);
1031 				outbuf = (char *)converted + converted_size -
1032 				    outbytesleft;
1033 				converted_size = (len << ++(mul));
1034 				outbytesleft = converted_size - outbytesleft;
1035 			} else {
1036 				/*
1037 				 * let the caller deal with iconv() errors,
1038 				 * probably by calling fatal(); xfree() does
1039 				 * not set errno
1040 				 */
1041 				if (err)
1042 					*err = errno;
1043 				xfree(converted);
1044 				return (NULL);
1045 			}
1046 		}
1047 	} while (inbytesleft);
1048 
1049 	*outbuf = '\0'; /* ensure null-termination */
1050 	if (outlen)
1051 		*outlen = converted_size - outbytesleft;
1052 	if (mul_ptr)
1053 		*mul_ptr = mul;
1054 
1055 	return (converted);
1056 }
1057 
/*
 * Free all strings in the list and then free the list itself. The list
 * must end with a NULL pointer. A NULL list is ignored.
 */
void
g11n_freelist(char **list)
{
	char **p;

	if (list == NULL)
		return;

	for (p = list; *p != NULL; p++)
		xfree(*p);

	xfree(list);
}
1074