xref: /illumos-gate/usr/src/lib/libc/port/locale/localeimpl.c (revision 45ede40b2394db7967e59f19288fae9b62efd4aa)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
14  */
15 
16 /*
17  * This file implements the 2008 newlocale and friends handling.
18  */
19 
20 #ifndef	_LCONV_C99
21 #define	_LCONV_C99
22 #endif
23 
24 #include "lint.h"
25 #include <atomic.h>
26 #include <locale.h>
27 #include <sys/types.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 #include <string.h>
31 #include "libc.h"
32 #include "mtlib.h"
33 #include "tsd.h"
34 #include "localeimpl.h"
35 #include "lctype.h"
36 
37 /*
38  * Big Theory of Locales:
39  *
40  * (It is recommended that readers familiarize themselves with the POSIX
41  * 2008 (XPG Issue 7) specifications for locales, first.)
42  *
43  * Historically, we had a bunch of global variables that stored locale
44  * data.  While this worked well, it limited applications to a single locale
45  * at a time.  This doesn't work well in certain server applications.
46  *
47  * In Issue 7, X/Open introduced the concept of a locale_t object, along with
48  * versions of functions that can take this object as a parameter, along
49  * with functions to clone and manipulate these locale objects.  The new
50  * functions are named with a _l() suffix.
51  *
52  * Additionally, uselocale() is introduced, which can change the locale
53  * of a single thread.  However, setlocale() can still be used to change
54  * the global locale.
55  *
56  * In our implementation, we use libc's TSD to store the locale data that
57  * was previously global.  We still have global data because some applications
58  * have had those global objects compiled into them.  (Such applications will
59  * be unable to benefit from uselocale(), btw.)  The legacy routines are
60  * reimplemented as wrappers that use the appropriate locale object by
61  * calling uselocale().  uselocale() when passed a NULL pointer returns the
62  * thread-specific locale object if one is present, or the global locale
63  * object otherwise.  Note that once the TSD data is set, the only way
64  * to revert to the global locale is to pass the global locale LC_GLOBAL_LOCALE
65  * to uselocale().
66  *
67  * We are careful to minimize performance impact of multiple calls to
68  * uselocale() or setlocale() by using a cache of locale data whenever possible.
69  * As a consequence of this, applications that iterate over all possible
70  * locales will burn through a lot of virtual memory, but we find such
71  * applications rare.  (locale -a might be an exception, but it is short lived.)
72  *
73  * Category data is never released (although enclosing locale objects might be),
74  * in order to guarantee thread-safety.  Calling freelocale() on an object
75  * while it is in use by another thread is a programmer error (use-after-free)
76  * and we don't bother to note it further.
77  *
78  * Locale objects (global locales) established by setlocale() are also
79  * never freed (for MT safety), but we will save previous locale objects
80  * and reuse them when we can.
81  */
82 
83 typedef struct locdata *(*loadfn_t)(const char *);
84 
85 static const loadfn_t loaders[LC_ALL] = {
86 	__lc_ctype_load,
87 	__lc_numeric_load,
88 	__lc_time_load,
89 	__lc_collate_load,
90 	__lc_monetary_load,
91 	__lc_messages_load,
92 };
93 
94 extern struct lc_monetary lc_monetary_posix;
95 extern struct lc_numeric lc_numeric_posix;
96 extern struct lc_messages lc_messages_posix;
97 extern struct lc_time lc_time_posix;
98 extern struct lc_ctype lc_ctype_posix;
99 extern struct lc_collate lc_collate_posix;
100 extern struct _RuneLocale _DefaultRuneLocale;
101 
102 static struct _locale posix_locale = {
103 	/* locdata */
104 	.locdata = {
105 		&__posix_ctype_locdata,
106 		&__posix_numeric_locdata,
107 		&__posix_time_locdata,
108 		&__posix_collate_locdata,
109 		&__posix_monetary_locdata,
110 		&__posix_messages_locdata,
111 	},
112 	.locname = "C",
113 	.ctype = &lc_ctype_posix,
114 	.numeric = &lc_numeric_posix,
115 	.collate = &lc_collate_posix,
116 	.monetary = &lc_monetary_posix,
117 	.messages = &lc_messages_posix,
118 	.time = &lc_time_posix,
119 	.runelocale = &_DefaultRuneLocale,
120 };
121 
122 locale_t ___global_locale = &posix_locale;
123 
124 locale_t
125 __global_locale(void)
126 {
127 	return (___global_locale);
128 }
129 
130 /*
131  * Locale data for hybrid C.UTF-8 locale having all the characteristics of
132  * default C/POSIX locale, except for LC_CTYPE data which is retrieved from
133  * cache/file as for other UTF-8 locales.
134  */
135 static struct locdata cutf_locdata[LC_ALL] = {
136 	{ "C.UTF-8", NULL }, /* unused */
137 	{ "C.UTF-8", &lc_numeric_posix },
138 	{ "C.UTF-8", &lc_time_posix },
139 	{ "C.UTF-8", &lc_collate_posix },
140 	{ "C.UTF-8", &lc_monetary_posix },
141 	{ "C.UTF-8", &lc_messages_posix },
142 };
143 
/*
 * Environment variable names consulted by get_locale_env(), indexed by
 * LC_* category number (LC_ALL is the last entry).  Note that this
 * ordering was modified for Solaris.  See <iso/locale_iso.h>.
 *
 * The table is sized by NUM_CATS rather than a repeated literal so the
 * two cannot drift apart, and it is const because nothing writes to it.
 */
#define	NUM_CATS	7
static const char *const categories[NUM_CATS] = {
	"LC_CTYPE",
	"LC_NUMERIC",
	"LC_TIME",
	"LC_COLLATE",
	"LC_MONETARY",
	"LC_MESSAGES",
	"LC_ALL",
};
158 
159 /*
160  * Prototypes.
161  */
162 static const char *get_locale_env(int);
163 static struct locdata *locdata_get(int, const char *);
164 static struct locdata *locdata_get_cache(int, const char *);
165 static locale_t mklocname(locale_t);
166 
167 /*
168  * Some utility routines.
169  */
170 
171 struct locdata *
172 __locdata_alloc(const char *name, size_t memsz)
173 {
174 	struct locdata *ldata;
175 
176 	if ((ldata = lmalloc(sizeof (*ldata))) == NULL) {
177 		return (NULL);
178 	}
179 	if ((ldata->l_data[0] = libc_malloc(memsz)) == NULL) {
180 		lfree(ldata, sizeof (*ldata));
181 		errno = ENOMEM;
182 		return (NULL);
183 	}
184 	(void) strlcpy(ldata->l_lname, name, sizeof (ldata->l_lname));
185 
186 	return (ldata);
187 }
188 
189 /*
190  * Normally we never free locale data truly, but if we failed to load it
191  * for some reason, this routine is used to cleanup the partial mess.
192  */
193 void
194 __locdata_free(struct locdata *ldata)
195 {
196 	for (int i = 0; i < NLOCDATA; i++)
197 		libc_free(ldata->l_data[i]);
198 	if (ldata->l_map != NULL && ldata->l_map_len)
199 		(void) munmap(ldata->l_map, ldata->l_map_len);
200 	lfree(ldata, sizeof (*ldata));
201 }
202 
203 /*
204  * It turns out that for performance reasons we would really like to
205  * cache the most recently referenced locale data to avoid wasteful
206  * loading from files.
207  */
208 
209 static struct locdata *cache_data[LC_ALL];
210 static struct locdata *cat_data[LC_ALL];
211 static mutex_t cache_lock = DEFAULTMUTEX;
212 
213 /*
214  * Returns the cached data if the locale name is the same.  If not,
215  * returns NULL (cache miss).  The locdata is returned with a hold on
216  * it, taken on behalf of the caller.  The caller should drop the hold
217  * when it is finished.
218  */
219 static struct locdata *
220 locdata_get_cache(int category, const char *locname)
221 {
222 	struct locdata *loc;
223 
224 	if (category < 0 || category >= LC_ALL)
225 		return (NULL);
226 
227 	/* Try cache first. */
228 	lmutex_lock(&cache_lock);
229 	loc = cache_data[category];
230 
231 	if ((loc != NULL) && (strcmp(loc->l_lname, locname) == 0)) {
232 		lmutex_unlock(&cache_lock);
233 		return (loc);
234 	}
235 
236 	/*
237 	 * Failing that try previously loaded locales (linear search) --
238 	 * this could be optimized to a hash, but its unlikely that a single
239 	 * application will ever need to work with more than a few locales.
240 	 */
241 	for (loc = cat_data[category]; loc != NULL; loc = loc->l_next) {
242 		if (strcmp(locname, loc->l_lname) == 0) {
243 			break;
244 		}
245 	}
246 
247 	/*
248 	 * Finally, if we still don't have one, try loading the locale
249 	 * data from the actual on-disk data.
250 	 *
251 	 * We drop the lock (libc wants to ensure no internal locks
252 	 * are held when we call other routines required to read from
253 	 * files, allocate memory, etc.)  There is a small race here,
254 	 * but the consequences of the race are benign -- if multiple
255 	 * threads hit this at precisely the same point, we could
256 	 * wind up with duplicates of the locale data in the cache.
257 	 *
258 	 * This wastes the memory for an extra copy of the locale
259 	 * data, but there is no further harm beyond that.  Its not
260 	 * worth the effort to recode this to something "safe"
261 	 * (which would require rescanning the list, etc.), given
262 	 * that this race will probably never actually occur.
263 	 */
264 	if (loc == NULL) {
265 		lmutex_unlock(&cache_lock);
266 		loc = (*loaders[category])(locname);
267 		lmutex_lock(&cache_lock);
268 		if (loc != NULL)
269 			(void) strlcpy(loc->l_lname, locname,
270 			    sizeof (loc->l_lname));
271 	}
272 
273 	/*
274 	 * Assuming we got one, update the cache, and stick us on the list
275 	 * of loaded locale data.  We insert into the head (more recent
276 	 * use is likely to win.)
277 	 */
278 	if (loc != NULL) {
279 		cache_data[category] = loc;
280 		if (!loc->l_cached) {
281 			loc->l_cached = 1;
282 			loc->l_next = cat_data[category];
283 			cat_data[category] = loc;
284 		}
285 	}
286 
287 	lmutex_unlock(&cache_lock);
288 	return (loc);
289 }
290 
/*
 * Charmap aliases, mostly found in Linux.  When a locale name ends in
 * .alias, locdata_get() substitutes .name for it.  The table is
 * terminated by an entry whose alias is NULL.
 */
static const struct {
	const char *alias;
	const char *name;
} cmalias[] = {
	{ .alias = "utf8", .name = "UTF-8" },
	{ .alias = "iso88591", .name = "ISO8859-1" },
	{ .alias = "iso885915", .name = "ISO8859-15" },
	{ .alias = "gb18030", .name = "GB18030" },
	{ .alias = "koi8r", .name = "KOI8-R" },
	{ .alias = NULL, .name = NULL }
};
303 
304 /*
305  * Routine to get the locdata for a given category and locale.
306  * This includes retrieving it from cache, retrieving it from
307  * a file, etc.
308  */
309 static struct locdata *
310 locdata_get(int category, const char *locname)
311 {
312 	char scratch[ENCODING_LEN + 1];
313 	char scratch2[ENCODING_LEN + 1];
314 	char *slash, *cm;
315 	int cnt;
316 	int len;
317 	int i;
318 
319 	if (locname == NULL || *locname == 0) {
320 		locname = get_locale_env(category);
321 	}
322 
323 	/*
324 	 * Extract the locale name for the category if it is a composite
325 	 * locale.
326 	 */
327 	if ((slash = strchr(locname, '/')) != NULL) {
328 		for (cnt = category; cnt && slash != NULL; cnt--) {
329 			locname = slash + 1;
330 			slash = strchr(locname, '/');
331 		}
332 		if (slash) {
333 			len = slash - locname + 1;
334 			if (len >= sizeof (scratch)) {
335 				len = sizeof (scratch);
336 			}
337 		} else {
338 			len = sizeof (scratch);
339 		}
340 		(void) strlcpy(scratch, locname, len);
341 		locname = scratch;
342 	}
343 
344 	if ((strcmp(locname, "C") == 0) || (strcmp(locname, "POSIX") == 0))
345 		return (posix_locale.locdata[category]);
346 
347 	/* Handle charmap aliases */
348 	for (i = 0; cmalias[i].alias != NULL; i++) {
349 		if ((cm = strstr(locname, cmalias[i].alias)) != NULL &&
350 		    strlen(cm) == strlen(cmalias[i].alias)) {
351 			len = cm - locname + 1;
352 			if (len + strlen(cmalias[i].name) >= sizeof (scratch2))
353 				break;
354 			(void) strlcpy(scratch2, locname, len);
355 			(void) strlcat(scratch2, cmalias[i].name,
356 			    sizeof (scratch2));
357 			locname = scratch2;
358 			break;
359 		}
360 	}
361 
362 	if ((strcmp(locname, "C.UTF-8") == 0) && (category != LC_CTYPE))
363 		return (&cutf_locdata[category]);
364 
365 	return (locdata_get_cache(category, locname));
366 }
367 
368 /* tsd destructor */
369 static void
370 freelocptr(void *arg)
371 {
372 	locale_t *locptr = arg;
373 	if (*locptr != NULL)
374 		freelocale(*locptr);
375 }
376 
377 static const char *
378 get_locale_env(int category)
379 {
380 	const char *env;
381 
382 	/* 1. check LC_ALL. */
383 	env = getenv(categories[LC_ALL]);
384 
385 	/* 2. check LC_* */
386 	if (env == NULL || *env == '\0')
387 		env = getenv(categories[category]);
388 
389 	/* 3. check LANG */
390 	if (env == NULL || *env == '\0')
391 		env = getenv("LANG");
392 
393 	/* 4. if none is set, fall to "C" */
394 	if (env == NULL || *env == '\0')
395 		env = "C";
396 
397 	return (env);
398 }
399 
400 
401 /*
402  * This routine is exposed via the MB_CUR_MAX macro.  Note that legacy
403  * code will continue to use _ctype[520], but we prefer this function as
404  * it is the only way to get thread-specific information.
405  */
406 unsigned char
407 __mb_cur_max_l(locale_t loc)
408 {
409 	return (loc->ctype->lc_max_mblen);
410 }
411 
412 unsigned char
413 __mb_cur_max(void)
414 {
415 	return (__mb_cur_max_l(uselocale(NULL)));
416 }
417 
418 /*
419  * Public interfaces.
420  */
421 
422 locale_t
423 duplocale(locale_t src)
424 {
425 	locale_t	loc;
426 	int		i;
427 
428 	loc = lmalloc(sizeof (*loc));
429 	if (loc == NULL) {
430 		return (NULL);
431 	}
432 	if (src == NULL) {
433 		/* illumos extension: POSIX says LC_GLOBAL_LOCALE here */
434 		src = ___global_locale;
435 	}
436 	for (i = 0; i < LC_ALL; i++) {
437 		loc->locdata[i] = src->locdata[i];
438 		loc->loaded[i] = 0;
439 	}
440 	loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
441 	loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
442 	loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
443 	loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
444 	loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
445 	loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
446 	loc->time = loc->locdata[LC_TIME]->l_data[0];
447 	return (loc);
448 }
449 
450 void
451 freelocale(locale_t loc)
452 {
453 	/*
454 	 * We take extra care never to free a saved locale created by
455 	 * setlocale().  This shouldn't be strictly necessary, but a little
456 	 * extra safety doesn't hurt here.
457 	 */
458 	if ((loc != NULL) && (loc != &posix_locale) && (!loc->on_list))
459 		lfree(loc, sizeof (*loc));
460 }
461 
462 locale_t
463 newlocale(int catmask, const char *locname, locale_t base)
464 {
465 	locale_t loc;
466 	int i, e;
467 
468 	if (catmask & ~(LC_ALL_MASK)) {
469 		errno = EINVAL;
470 		return (NULL);
471 	}
472 
473 	/*
474 	 * Technically passing LC_GLOBAL_LOCALE here is illegal,
475 	 * but we allow it.
476 	 */
477 	if (base == NULL || base == ___global_locale) {
478 		loc = duplocale(___global_locale);
479 	} else {
480 		loc = duplocale(base);
481 	}
482 	if (loc == NULL) {
483 		return (NULL);
484 	}
485 
486 	for (i = 0; i < LC_ALL; i++) {
487 		struct locdata *ldata;
488 		loc->loaded[i] = 0;
489 		if (((1 << i) & catmask) == 0) {
490 			/* Default to base locale if not overriding */
491 			continue;
492 		}
493 		ldata = locdata_get(i, locname);
494 		if (ldata == NULL) {
495 			e = errno;
496 			freelocale(loc);
497 			errno = e;
498 			return (NULL);
499 		}
500 		loc->locdata[i] = ldata;
501 	}
502 	loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
503 	loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
504 	loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
505 	loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
506 	loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
507 	loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
508 	loc->time = loc->locdata[LC_TIME]->l_data[0];
509 	freelocale(base);
510 
511 	return (mklocname(loc));
512 }
513 
514 locale_t
515 uselocale(locale_t loc)
516 {
517 	locale_t lastloc = ___global_locale;
518 	locale_t *locptr;
519 
520 	locptr = tsdalloc(_T_SETLOCALE, sizeof (locale_t), freelocptr);
521 	/* Should never occur */
522 	if (locptr == NULL) {
523 		errno = EINVAL;
524 		return (NULL);
525 	}
526 
527 	if (*locptr != NULL)
528 		lastloc = *locptr;
529 
530 	/* Argument loc is NULL if we are just querying. */
531 	if (loc != NULL) {
532 		/*
533 		 * Set it to LC_GLOBAL_LOCAL to return to using
534 		 * the global locale (setlocale).
535 		 */
536 		if (loc == ___global_locale) {
537 			*locptr = NULL;
538 		} else {
539 			/* No validation of the provided locale at present */
540 			*locptr = loc;
541 		}
542 	}
543 
544 	/*
545 	 * The caller is responsible for freeing, of course it would be
546 	 * gross error to call freelocale() on a locale object that is still
547 	 * in use.
548 	 */
549 	return (lastloc);
550 }
551 
552 static locale_t
553 mklocname(locale_t loc)
554 {
555 	int composite = 0;
556 
557 	/* Look to see if any category is different */
558 	for (int i = 1; i < LC_ALL; ++i) {
559 		if (strcmp(loc->locdata[0]->l_lname,
560 		    loc->locdata[i]->l_lname) != 0) {
561 			composite = 1;
562 			break;
563 		}
564 	}
565 
566 	if (composite) {
567 		/*
568 		 * Note ordering of these follows the numeric order,
569 		 * if the order is changed, then setlocale() will need
570 		 * to be changed as well.
571 		 */
572 		(void) snprintf(loc->locname, sizeof (loc->locname),
573 		    "%s/%s/%s/%s/%s/%s",
574 		    loc->locdata[LC_CTYPE]->l_lname,
575 		    loc->locdata[LC_NUMERIC]->l_lname,
576 		    loc->locdata[LC_TIME]->l_lname,
577 		    loc->locdata[LC_COLLATE]->l_lname,
578 		    loc->locdata[LC_MONETARY]->l_lname,
579 		    loc->locdata[LC_MESSAGES]->l_lname);
580 	} else {
581 		(void) strlcpy(loc->locname, loc->locdata[LC_CTYPE]->l_lname,
582 		    sizeof (loc->locname));
583 	}
584 	return (loc);
585 }
586