xref: /titanic_41/usr/src/cmd/ssh/libssh/common/g11n.c (revision b9aa66a73c9016cf5c71fe80efe90ce9f2ca5c73)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #include <errno.h>
26 #include <locale.h>
27 #include <langinfo.h>
28 #include <iconv.h>
29 #include <ctype.h>
30 #include <wctype.h>
31 #include <strings.h>
32 #include <string.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include "includes.h"
36 #include "xmalloc.h"
37 #include "xlist.h"
38 #include "compat.h"
39 #include "log.h"
40 
41 #ifdef MIN
42 #undef MIN
43 #endif /* MIN */
44 
45 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
46 
47 #define	LOCALE_PATH	"/usr/bin/locale"
48 
/* two-char language code, '-' and two-char country code */
50 #define	LANGTAG_MAX	5
51 
52 static int locale_cmp(const void *d1, const void *d2);
53 static char *g11n_locale2langtag(char *locale);
54 
55 static char *do_iconv(iconv_t cd, const char *s, uint_t *lenp, char **err_str);
56 
/*
 * native_codeset records the codeset of the default system locale.
 * It is used to convert the contents of files (e.g. /etc/issue), which are
 * expected to be in the codeset of the default system locale.
 */
62 static char *native_codeset;
63 
64 /*
65  * Convert locale string name into a language tag. The caller is responsible for
66  * freeing the memory allocated for the result.
67  */
68 static char *
g11n_locale2langtag(char * locale)69 g11n_locale2langtag(char *locale)
70 {
71 	char *langtag;
72 
73 	/* base cases */
74 	if (!locale || !*locale)
75 		return (NULL);
76 
77 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
78 		return (xstrdup("i-default"));
79 
80 	/* punt for language codes which are not exactly 2 letters */
81 	if (strlen(locale) < 2 ||
82 	    !isalpha(locale[0]) ||
83 	    !isalpha(locale[1]) ||
84 	    (locale[2] != '\0' &&
85 	    locale[2] != '_' &&
86 	    locale[2] != '.' &&
87 	    locale[2] != '@'))
88 		return (NULL);
89 
90 
91 	/* we have a primary language sub-tag */
92 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
93 
94 	strncpy(langtag, locale, 2);
95 	langtag[2] = '\0';
96 
97 	/* do we have country sub-tag? For example: cs_CZ */
98 	if (locale[2] == '_') {
99 		if (strlen(locale) < 5 ||
100 		    !isalpha(locale[3]) ||
101 		    !isalpha(locale[4]) ||
102 		    (locale[5] != '\0' && (locale[5] != '.' &&
103 		    locale[5] != '@'))) {
104 			return (langtag);
105 		}
106 
107 		/* example: create cs-CZ from cs_CZ */
108 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
109 		    locale + 3) == 5)
110 			return (langtag);
111 	}
112 
113 	/* in all other cases we just use the primary language sub-tag */
114 	return (langtag);
115 }
116 
/*
 * Return 1 if langtag is exactly the string "i-default" (the comparison is
 * case-sensitive), 0 otherwise.  langtag must not be NULL.
 */
uint_t
g11n_langtag_is_default(char *langtag)
{
	static const char default_tag[] = "i-default";

	if (strcmp(langtag, default_tag) != 0)
		return (0);

	return (1);
}
122 
123 /*
124  * This lang tag / locale matching function works only for two-character
125  * language primary sub-tags and two-character country sub-tags.
126  */
127 uint_t
g11n_langtag_matches_locale(char * langtag,char * locale)128 g11n_langtag_matches_locale(char *langtag, char *locale)
129 {
130 	/* match "i-default" to the process' current locale if possible */
131 	if (g11n_langtag_is_default(langtag)) {
132 		if (strcasecmp(locale, "POSIX") == 0 ||
133 		    strcasecmp(locale, "C") == 0)
134 			return (1);
135 		else
136 			return (0);
137 	}
138 
139 	/*
140 	 * locale must be at least 2 chars long and the lang part must be
141 	 * exactly two characters
142 	 */
143 	if (strlen(locale) < 2 ||
144 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
145 	    (locale[2] != '\0' && locale[2] != '_' &&
146 	    locale[2] != '.' && locale[2] != '@')))
147 		return (0);
148 
149 	/* same thing with the langtag */
150 	if (strlen(langtag) < 2 ||
151 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
152 	    (langtag[2] != '\0' && langtag[2] != '-')))
153 		return (0);
154 
155 	/* primary language sub-tag and the locale's language part must match */
156 	if (strncasecmp(langtag, locale, 2) != 0)
157 		return (0);
158 
159 	/*
160 	 * primary language sub-tag and the locale's language match, now
161 	 * fuzzy check country part
162 	 */
163 
164 	/* neither langtag nor locale have more than one component */
165 	if (langtag[2] == '\0' &&
166 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
167 		return (2);
168 
169 	/* langtag has only one sub-tag... */
170 	if (langtag[2] == '\0')
171 		return (1);
172 
173 	/* locale has no country code... */
174 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
175 		return (1);
176 
177 	/* langtag has more than one subtag and the locale has a country code */
178 
179 	/* ignore second subtag if not two chars */
180 	if (strlen(langtag) < 5)
181 		return (1);
182 
183 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
184 	    (langtag[5] != '\0' && langtag[5] != '-'))
185 		return (1);
186 
187 	/* ignore rest of locale if there is no two-character country part */
188 	if (strlen(locale) < 5)
189 		return (1);
190 
191 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
192 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
193 		return (1);
194 
195 	/* if the country part matches, return 2 */
196 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
197 		return (2);
198 
199 	return (1);
200 }
201 
202 char *
g11n_getlocale()203 g11n_getlocale()
204 {
205 	/* we have one text domain - always set it */
206 	(void) textdomain(TEXT_DOMAIN);
207 
208 	/* if the locale is not set, set it from the env vars */
209 	if (!setlocale(LC_MESSAGES, NULL))
210 		(void) setlocale(LC_MESSAGES, "");
211 
212 	return (setlocale(LC_MESSAGES, NULL));
213 }
214 
215 void
g11n_setlocale(int category,const char * locale)216 g11n_setlocale(int category, const char *locale)
217 {
218 	char *curr;
219 
220 	if (native_codeset == NULL) {
221 		/* set default locale, and record current codeset */
222 		(void) setlocale(LC_ALL, "");
223 		curr = nl_langinfo(CODESET);
224 		native_codeset = xstrdup(curr);
225 	}
226 
227 	/* we have one text domain - always set it */
228 	(void) textdomain(TEXT_DOMAIN);
229 
230 	if (!locale)
231 		return;
232 
233 	if (*locale && ((curr = setlocale(category, NULL))) &&
234 	    strcmp(curr, locale) == 0)
235 		return;
236 
237 	/* if <category> is bogus, setlocale() will do nothing */
238 	(void) setlocale(category, locale);
239 }
240 
241 char **
g11n_getlocales()242 g11n_getlocales()
243 {
244 	FILE *locale_out;
245 	uint_t n_elems, list_size, long_line = 0;
246 	char **list;
247 	char locale[64];	/* 64 bytes is plenty for locale names */
248 
249 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
250 		return (NULL);
251 
252 	/*
253 	 * start with enough room for 65 locales - that's a lot fewer than
254 	 * all the locales available for installation, but a lot more than
255 	 * what most users will need and install
256 	 */
257 	n_elems = 0;
258 	list_size = 192;
259 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
260 	memset(list, 0, sizeof (char *) * (list_size + 1));
261 
262 	while (fgets(locale, sizeof (locale), locale_out)) {
263 		/* skip long locale names (if any) */
264 		if (!strchr(locale, '\n')) {
265 			long_line = 1;
266 			continue;
267 		} else if (long_line) {
268 			long_line = 0;
269 			continue;
270 		}
271 
272 		if (strncmp(locale, "iso_8859", 8) == 0)
273 			/* ignore locale names like "iso_8859-1" */
274 			continue;
275 
276 		if (n_elems == list_size) {
277 			list_size *= 2;
278 			list = (char **)xrealloc((void *) list,
279 			    (list_size + 1) * sizeof (char *));
280 			memset(&list[n_elems + 1], 0,
281 			    sizeof (char *) * (list_size - n_elems + 1));
282 		}
283 
284 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
285 		list[n_elems++] = xstrdup(locale);
286 	}
287 
288 	(void) pclose(locale_out);
289 
290 	if (n_elems == 0) {
291 		xfree(list);
292 		return (NULL);
293 	}
294 
295 	list[n_elems] = NULL;
296 
297 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
298 	return (list);
299 }
300 
/*
 * Return the language tag(s) to use for this process.
 *
 * The SSH_LANGS environment variable, when set, is returned verbatim (as a
 * copy).  Otherwise the current message locale is converted to a language
 * tag; an unset or empty locale yields "i-default".  The result may be NULL
 * when the locale cannot be converted; a non-NULL result is allocated.
 */
char *
g11n_getlangs()
{
	char *from_env;
	char *current;

	from_env = getenv("SSH_LANGS");
	if (from_env != NULL)
		return (xstrdup(from_env));

	current = g11n_getlocale();
	if (current == NULL || *current == '\0')
		return (xstrdup("i-default"));

	return (g11n_locale2langtag(current));
}
316 
/*
 * Convert a NULL-terminated list of locale names into a single string of
 * language tags joined with ','.
 *
 * Locales that g11n_locale2langtag() cannot convert are dropped, and each
 * language tag appears only once, in order of first appearance.  The
 * joined string is produced by xjoin().
 */
char *
g11n_locales2langs(char **locale_set)
{
	char **tags, **slot;
	char *tag, *joined;
	int count, idx, duplicate;

	count = 0;
	if (locale_set != NULL) {
		while (locale_set[count] != NULL)
			count++;
	}

	/* one slot per locale plus the terminator, all initially NULL */
	tags = (char **)xmalloc((count + 1) * sizeof (char *));
	memset(tags, 0, (count + 1) * sizeof (char *));

	for (idx = 0; idx < count; idx++) {
		tag = g11n_locale2langtag(locale_set[idx]);
		if (tag == NULL)
			continue;

		/* walk to the first free slot, noting any equal tag on the way */
		duplicate = 0;
		slot = tags;
		while ((slot - tags) < count && *slot != NULL) {
			if (strcmp(*slot, tag) == 0)
				duplicate = 1;
			slot++;
		}

		if (duplicate)
			xfree(tag);
		else
			*(slot++) = tag;
		*slot = NULL;
	}

	joined = xjoin(tags, ',');
	g11n_freelist(tags);

	return (joined);
}
352 
/*
 * qsort() comparison callback for arrays of C strings: each argument
 * points at a (char *) element and the strings are ordered by strcmp().
 */
static int
sortcmp(const void *d1, const void *d2)
{
	char * const *left = d1;
	char * const *right = d2;

	return (strcmp(*left, *right));
}
361 
/*
 * Compare two language tags by their first two '-'-separated sub-tags.
 * The comparison is case-sensitive.
 *
 * Returns:
 *	0 - the primary sub-tags differ
 *	1 - the primary sub-tags are equal, but only one tag has a second
 *	    sub-tag or the second sub-tags differ
 *	2 - the primary sub-tags are equal and the second sub-tags are
 *	    either both absent or equal
 *
 * Sub-tags after the second one are ignored.
 */
int
g11n_langtag_match(char *langtag1, char *langtag2)
{
	int n1, n2;

	/* length of the primary sub-tag: up to the first '-' or the end */
	n1 = (int)strcspn(langtag1, "-");
	n2 = (int)strcspn(langtag2, "-");

	/* no match */
	if (n1 != n2 || strncmp(langtag1, langtag2, n1) != 0)
		return (0);

	/*
	 * The character ending each primary sub-tag is either '\0' or '-'.
	 * If at least one tag ends here, it is an exact match only when
	 * both do.
	 */
	if (langtag1[n1] == '\0' || langtag2[n2] == '\0')
		return ((langtag1[n1] == langtag2[n2]) ? 2 : 1);

	/* both tags continue after a '-': compare the second sub-tags */
	langtag1 += n1 + 1;
	langtag2 += n2 + 1;

	n1 = (int)strcspn(langtag1, "-");
	n2 = (int)strcspn(langtag2, "-");

	if (n1 == n2 && strncmp(langtag1, langtag2, n1) == 0)
		return (2);

	return (1);
}
411 
412 char *
g11n_langtag_set_intersect(char * set1,char * set2)413 g11n_langtag_set_intersect(char *set1, char *set2)
414 {
415 	char **list1, **list2, **list3, **p, **q, **r;
416 	char *set3, *lang_subtag;
417 	uint_t n1, n2, n3;
418 	uint_t do_append;
419 
420 	list1 = xsplit(set1, ',');
421 	list2 = xsplit(set2, ',');
422 
423 	for (n1 = 0, p = list1; p && *p; p++, n1++)
424 		;
425 	for (n2 = 0, p = list2; p && *p; p++, n2++)
426 		;
427 
428 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
429 	*list3 = NULL;
430 
431 	/*
432 	 * we must not sort the user langtags - sorting or not the server's
433 	 * should not affect the outcome
434 	 */
435 	qsort(list2, n2, sizeof (char *), sortcmp);
436 
437 	for (n3 = 0, p = list1; p && *p; p++) {
438 		do_append = 0;
439 		for (q = list2; q && *q; q++) {
440 			if (g11n_langtag_match(*p, *q) != 2) continue;
441 			/* append element */
442 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
443 				do_append = 1;
444 				if (!*r)
445 					break;
446 				if (strcmp(*p, *r) == 0) {
447 					do_append = 0;
448 					break;
449 				}
450 			}
451 			if (do_append && n3 <= (n1 + n2)) {
452 				list3[n3++] = xstrdup(*p);
453 				list3[n3] = NULL;
454 			}
455 		}
456 	}
457 
458 	for (p = list1; p && *p; p++) {
459 		do_append = 0;
460 		for (q = list2; q && *q; q++) {
461 			if (g11n_langtag_match(*p, *q) != 1)
462 				continue;
463 
464 			/* append element */
465 			lang_subtag = xstrdup(*p);
466 			if (strchr(lang_subtag, '-'))
467 				*(strchr(lang_subtag, '-')) = '\0';
468 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
469 				do_append = 1;
470 				if (!*r)
471 					break;
472 				if (strcmp(lang_subtag, *r) == 0) {
473 					do_append = 0;
474 					break;
475 				}
476 			}
477 			if (do_append && n3 <= (n1 + n2)) {
478 				list3[n3++] = lang_subtag;
479 				list3[n3] = NULL;
480 			} else
481 				xfree(lang_subtag);
482 		}
483 	}
484 
485 	set3 = xjoin(list3, ',');
486 	xfree_split_list(list1);
487 	xfree_split_list(list2);
488 	xfree_split_list(list3);
489 
490 	return (set3);
491 }
492 
/*
 * Pick the language tag the client should use: the first entry of the
 * intersection of the client's and the server's language tag sets.
 *
 * Returns an allocated copy of that tag, or NULL when the intersection is
 * empty or unavailable.
 */
char *
g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
{
	char *list, *result = NULL;
	char **xlist;

	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);

	if (list == NULL)
		return (NULL);

	xlist = xsplit(list, ',');

	xfree(list);

	if (xlist == NULL)
		return (NULL);

	if (*xlist != NULL)
		result = xstrdup(*xlist);

	/*
	 * Release the split list on every path, including the one where it
	 * is empty; returning early there would leak the array.
	 */
	xfree_split_list(xlist);

	return (result);
}
517 
/*
 * Return non-zero if the codeset part of a locale name (the text after the
 * first '.') is exactly "UTF-8", optionally followed by an '@' modifier.
 */
static int
locale_is_utf8(const char *name)
{
	const char *dot = strchr(name, '.');

	if (dot == NULL || strncmp(dot + 1, "UTF-8", 5) != 0)
		return (0);

	return (dot[6] == '\0' || dot[6] == '@');
}

/*
 * Return non-zero if name is one of the default locales "C", "POSIX" or
 * "common".
 */
static int
locale_is_default(const char *name)
{
	return (strcmp(name, "C") == 0 || strcmp(name, "POSIX") == 0 ||
	    strcmp(name, "common") == 0);
}

/*
 * qsort() comparison callback for arrays of locale names.
 *
 * UTF-8 locales sort before all others, and the default locales ("C",
 * "POSIX", "common") sort after every other locale; within those groups
 * the order is a straight strcmp().
 */
static int
locale_cmp(const void *d1, const void *d2)
{
	char *s1 = *(char **)d1;
	char *s2 = *(char **)d2;
	int s1_is_utf8 = locale_is_utf8(s1);
	int s2_is_utf8 = locale_is_utf8(s2);
	int s1_is_default, s2_is_default;

	/* prefer UTF-8 locales */
	if (s1_is_utf8 != s2_is_utf8)
		return (s1_is_utf8 ? -1 : 1);

	/* prefer any locale over the default locales */
	s1_is_default = locale_is_default(s1);
	s2_is_default = locale_is_default(s2);
	if (s1_is_default != s2_is_default)
		return (s1_is_default ? 1 : -1);

	return (strcmp(s1, s2));
}
570 
571 
572 char **
g11n_langtag_set_locale_set_intersect(char * langtag_set,char ** locale_set)573 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
574 {
575 	char **langtag_list, **result, **p, **q, **r;
576 	char *s;
577 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
578 
579 	if (locale_set == NULL)
580 		return (NULL);
581 
582 	/* count lang tags and locales */
583 	for (n_locales = 0, p = locale_set; p && *p; p++)
584 		n_locales++;
585 
586 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
587 	/* count the number of langtags */
588 	for (; s = strchr(s, ','); s++, n_langtags++)
589 		;
590 
591 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
592 
593 	langtag_list = xsplit(langtag_set, ',');
594 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
595 		;
596 
597 	max_results = MIN(n_locales, n_langtags) * 2;
598 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
599 	*result = NULL;
600 	n_results = 0;
601 
602 	/* more specific matches first */
603 	for (p = langtag_list; p && *p; p++) {
604 		do_append = 0;
605 		for (q = locale_set; q && *q; q++) {
606 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
607 				do_append = 1;
608 				for (r = result; (r - result) <=
609 				    MIN(n_locales, n_langtags); r++) {
610 					if (!*r)
611 						break;
612 					if (strcmp(*q, *r) == 0) {
613 						do_append = 0;
614 						break;
615 					}
616 				}
617 				if (do_append && n_results < max_results) {
618 					result[n_results++] = xstrdup(*q);
619 					result[n_results] = NULL;
620 				}
621 				break;
622 			}
623 		}
624 	}
625 
626 	for (p = langtag_list; p && *p; p++) {
627 		do_append = 0;
628 		for (q = locale_set; q && *q; q++) {
629 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
630 				do_append = 1;
631 				for (r = result; (r - result) <=
632 				    MIN(n_locales, n_langtags); r++) {
633 					if (!*r)
634 						break;
635 					if (strcmp(*q, *r) == 0) {
636 						do_append = 0;
637 						break;
638 					}
639 				}
640 				if (do_append && n_results < max_results) {
641 					result[n_results++] = xstrdup(*q);
642 					result[n_results] = NULL;
643 				}
644 				break;
645 			}
646 		}
647 	}
648 
649 	xfree_split_list(langtag_list);
650 
651 	return (result);
652 }
653 
/*
 * Pick the server locale that best matches the client's language tags.
 *
 * When srvr_locales is NULL the list of locales is obtained from
 * g11n_getlocales() and released again before returning; a list supplied
 * by the caller is never freed here.
 *
 * Returns an allocated copy of the first matching locale, or NULL if there
 * is none.
 */
char *
g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
{
	char **candidates, **matches;
	char *best = NULL;

	if (srvr_locales != NULL)
		candidates = srvr_locales;
	else
		candidates = g11n_getlocales();

	matches = g11n_langtag_set_locale_set_intersect(clnt_langtags,
	    candidates);
	if (matches != NULL) {
		/* the first entry is the preferred one */
		if (*matches != NULL)
			best = xstrdup(*matches);

		xfree_split_list(matches);
	}

	/* only release a list that was fetched here */
	if (candidates != NULL && candidates != srvr_locales)
		g11n_freelist(candidates);

	return (best);
}
678 
679 /*
680  * Functions for converting to UTF-8 from the local codeset and
681  * converting from UTF-8 to the local codeset.
682  *
683  * The error_str parameter is an pointer to a char variable where to
684  * store a string suitable for use with error() or fatal() or friends.
685  * It is also used for an error indicator when NULL is returned.
686  *
687  * If conversion isn't necessary, *error_str is set to NULL, and
688  * NULL is returned.
689  * If conversion error occured, *error_str points to an error message,
690  * and NULL is returned.
691  */
692 char *
g11n_convert_from_utf8(const char * str,uint_t * lenp,char ** error_str)693 g11n_convert_from_utf8(const char *str, uint_t *lenp, char **error_str)
694 {
695 	static char *last_codeset;
696 	static iconv_t cd = (iconv_t)-1;
697 	char	*codeset;
698 
699 	*error_str = NULL;
700 
701 	codeset = nl_langinfo(CODESET);
702 
703 	if (strcmp(codeset, "UTF-8") == 0)
704 		return (NULL);
705 
706 	if (last_codeset == NULL || strcmp(codeset, last_codeset) != 0) {
707 		if (last_codeset != NULL) {
708 			xfree(last_codeset);
709 			last_codeset = NULL;
710 		}
711 		if (cd != (iconv_t)-1)
712 			(void) iconv_close(cd);
713 
714 		if ((cd = iconv_open(codeset, "UTF-8")) == (iconv_t)-1) {
715 			*error_str = gettext("Cannot convert UTF-8 "
716 			    "strings to the local codeset");
717 			return (NULL);
718 		}
719 		last_codeset = xstrdup(codeset);
720 	}
721 	return (do_iconv(cd, str, lenp, error_str));
722 }
723 
724 char *
g11n_convert_to_utf8(const char * str,uint_t * lenp,int native,char ** error_str)725 g11n_convert_to_utf8(const char *str, uint_t *lenp,
726     int native, char **error_str)
727 {
728 	static char *last_codeset;
729 	static iconv_t cd = (iconv_t)-1;
730 	char	*codeset;
731 
732 	*error_str = NULL;
733 
734 	if (native)
735 		codeset = native_codeset;
736 	else
737 		codeset = nl_langinfo(CODESET);
738 
739 	if (strcmp(codeset, "UTF-8") == 0)
740 		return (NULL);
741 
742 	if (last_codeset == NULL || strcmp(codeset, last_codeset) != 0) {
743 		if (last_codeset != NULL) {
744 			xfree(last_codeset);
745 			last_codeset = NULL;
746 		}
747 		if (cd != (iconv_t)-1)
748 			(void) iconv_close(cd);
749 
750 		if ((cd = iconv_open("UTF-8", codeset)) == (iconv_t)-1) {
751 			*error_str = gettext("Cannot convert the "
752 			    "local codeset strings to UTF-8");
753 			return (NULL);
754 		}
755 		last_codeset = xstrdup(codeset);
756 	}
757 	return (do_iconv(cd, str, lenp, error_str));
758 }
759 
760 /*
761  * Wrapper around iconv()
762  *
763  * The caller is responsible for freeing the result. NULL is returned when
764  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
765  * The caller must ensure that the input string isn't NULL pointer.
766  */
767 static char *
do_iconv(iconv_t cd,const char * str,uint_t * lenp,char ** err_str)768 do_iconv(iconv_t cd, const char *str, uint_t *lenp, char **err_str)
769 {
770 	int	ilen, olen;
771 	size_t	ileft, oleft;
772 	char	*ostr, *optr;
773 	const char *istr;
774 
775 	ilen = *lenp;
776 	olen = ilen + 1;
777 
778 	ostr = NULL;
779 	for (;;) {
780 		olen *= 2;
781 		oleft = olen;
782 		ostr = optr = xrealloc(ostr, olen);
783 		istr = (const char *)str;
784 		if ((ileft = ilen) == 0)
785 			break;
786 
787 		if (iconv(cd, &istr, &ileft, &optr, &oleft) != (size_t)-1) {
788 			/* success: generate reset sequence */
789 			if (iconv(cd, NULL, NULL,
790 			    &optr, &oleft) == (size_t)-1 && errno == E2BIG) {
791 				continue;
792 			}
793 			break;
794 		}
795 		/* failed */
796 		if (errno != E2BIG) {
797 			oleft = olen;
798 			(void) iconv(cd, NULL, NULL, &ostr, &oleft);
799 			xfree(ostr);
800 			*err_str = gettext("Codeset conversion failed");
801 			return (NULL);
802 		}
803 	}
804 	olen = optr - ostr;
805 	optr = xmalloc(olen + 1);
806 	(void) memcpy(optr, ostr, olen);
807 	xfree(ostr);
808 
809 	optr[olen] = '\0';
810 	*lenp = olen;
811 
812 	return (optr);
813 }
814 
/*
 * A filter for output strings.  Every character that is not printable in
 * the current locale, other than newline, carriage return and tab, is
 * replaced by the octal escapes ("\ooo") of its bytes.  Bytes that cannot
 * be decoded as a character are handled one at a time.
 *
 * The buffer passed in is resized with xrealloc() to hold the filtered
 * text, and the resulting pointer is returned.
 */
char *
g11n_filter_string(char *s)
{
	int	max_bytes = MB_CUR_MAX;
	int	nbytes, total;
	char	*in;
	wchar_t	wc;
	char	*escaped, *out;

	/* all character may be converted into the form of \ooo */
	escaped = out = xmalloc(strlen(s) * 4 + 1);

	for (in = s; *in != '\0'; ) {
		nbytes = mbtowc(&wc, in, max_bytes);
		if (nbytes <= 0) {
			/* undecodable: treat the single byte as a character */
			nbytes = 1;
			wc = (unsigned char)*in;
		}

		if (iswprint(wc) ||
		    wc == L'\n' || wc == L'\r' || wc == L'\t') {
			/* safe to show: copy the bytes through unchanged */
			for (; nbytes != 0; nbytes--)
				*out++ = *in++;
			continue;
		}

		/*
		 * control chars which need to be replaced
		 * with safe character sequence.
		 */
		for (; nbytes != 0; nbytes--) {
			out += sprintf(out, "\\%03o",
			    (unsigned char)*in++);
		}
	}
	*out = '\0';

	/* move the filtered text, terminator included, into the caller's buffer */
	total = out - escaped + 1;
	out = xrealloc(s, total);
	(void) memcpy(out, escaped, total);
	xfree(escaped);
	return (out);
}
862 
/*
 * Once a langtag has been negotiated, the server needs to map it to a
 * system locale. That is done based on the locales supported on the server
 * side. We know (for the locales supported on Solaris) what a langtag is
 * mapped to. However, from the client's point of view, there is no way to
 * know exactly which locale (encoding) will be used.
 *
 * With the bug fix for SSH_BUG_STRING_ENCODING, it is guaranteed that
 * UTF-8 characters always come over the wire, so this is no longer a
 * problem as long as both sides have the fix. However, if the server side
 * doesn't have the fix, the client can't safely perform the code conversion
 * since the incoming character encoding is unknown.
 *
 * To alleviate this situation, we take an empirical approach to derive the
 * encoding from the langtag.
 *
 * If the langtag has a subtag, we can directly map the langtag to a UTF-8
 * locale (e.g. en-US can be mapped to en_US.UTF-8), with a few exceptions.
 * Certain xx_YY locales don't support UTF-8 encoding (probably due to lack
 * of L10N support ..). Those are:
 *
 * 	no_NO, no_NY, sr_SP, sr_YU
 *
 * They all use ISO8859-X encoding.
 *
 * As for the plain "xx" langtags, some of them can be mapped to "xx.UTF-8",
 * but others cannot, so "xx" itself has to be used as the locale name.
 * Those locales are:
 *
 * ar, ca, cs, da, et, fi, he, hu, ja, lt, lv, nl, no, pt, sh, th, tr
 *
 * Their encodings vary. They could be ISO8859-X or EUC or something else.
 * So we don't perform code conversion for these langtags.
 *
 * The table below is NULL-terminated and is searched by
 * g11n_test_langtag() with an exact, case-sensitive comparison.
 */
static const char *non_utf8_langtag[] = {
	"no-NO", "no-NY", "sr-SP", "sr-YU",
	"ar", "ca", "cs", "da", "et", "fi", "he", "hu", "ja",
	"lt", "lv", "nl", "no", "pt", "sh", "th", "tr", NULL};
901 
902 void
g11n_test_langtag(const char * lang,int server)903 g11n_test_langtag(const char *lang, int server)
904 {
905 	const char	**lp;
906 
907 	if (datafellows & SSH_BUG_LOCALES_NOT_LANGTAGS) {
908 		/*
909 		 * We negotiated with real locale name (not lang tag).
910 		 * We shouldn't expect UTF-8, thus shouldn't do code
911 		 * conversion.
912 		 */
913 		datafellows |= SSH_BUG_STRING_ENCODING;
914 		return;
915 	}
916 
917 	if (datafellows & SSH_BUG_STRING_ENCODING) {
918 		if (server) {
919 			/*
920 			 * Whatever bug exists in the client side, server
921 			 * side has nothing to do, since server has no way
922 			 * to know what actual encoding is used on the client
923 			 * side. For example, even if we negotiated with
924 			 * en_US, client locale could be en_US.ISO8859-X or
925 			 * en_US.UTF-8.
926 			 */
927 			return;
928 		}
929 		/*
930 		 * We are on the client side. We'll check with known
931 		 * locales to see if non-UTF8 characters could come in.
932 		 */
933 		for (lp = non_utf8_langtag; *lp != NULL; lp++) {
934 			if (strcmp(lang, *lp) == 0)
935 				break;
936 		}
937 		if (*lp == NULL) {
938 			debug2("Server is expected to use UTF-8 locale");
939 			datafellows &= ~SSH_BUG_STRING_ENCODING;
940 		} else {
941 			/*
942 			 * Server is expected to use non-UTF8 encoding.
943 			 */
944 			debug2("Enforcing no code conversion: %s", lang);
945 		}
946 	}
947 }
948 
/*
 * Free all strings in the list and then free the list itself. We know that the
 * list ends with a NULL pointer.  A NULL list is accepted and ignored, so
 * callers can release a list that was never built without a separate check.
 */
void
g11n_freelist(char **list)
{
	int i;

	if (list == NULL)
		return;

	for (i = 0; list[i] != NULL; i++)
		xfree(list[i]);

	xfree(list);
}
965