xref: /illumos-gate/usr/src/lib/libc/port/locale/localeimpl.c (revision 004345e48064ccd168d15f66eba2031c6090ccee)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
14  * Copyright 2025 Oxide Computer Company
15  */
16 
17 /*
18  * This file implements the 2008 newlocale and friends handling.
19  */
20 
21 #ifndef	_LCONV_C99
22 #define	_LCONV_C99
23 #endif
24 
25 #include "lint.h"
26 #include <atomic.h>
27 #include <locale.h>
28 #include <sys/types.h>
29 #include <sys/mman.h>
30 #include <errno.h>
31 #include <string.h>
32 #include "libc.h"
33 #include "mtlib.h"
34 #include "tsd.h"
35 #include "localeimpl.h"
36 #include "lctype.h"
37 
38 /*
39  * Big Theory of Locales:
40  *
41  * (It is recommended that readers familiarize themselves with the POSIX
42  * 2008 (XPG Issue 7) specifications for locales, first.)
43  *
44  * Historically, we had a bunch of global variables that stored locale
45  * data.  While this worked well, it limited applications to a single locale
46  * at a time.  This doesn't work well in certain server applications.
47  *
 * In Issue 7, X/Open introduced the concept of a locale_t object, along with
 * versions of functions that can take this object as a parameter, and
 * functions to clone and manipulate these locale objects.  The new
 * functions are named with a _l() suffix.
52  *
 * Additionally, uselocale() is introduced, which can change the locale
 * of a single thread.  However, setlocale() can still be used to change
 * the global locale.
56  *
57  * In our implementation, we use libc's TSD to store the locale data that
58  * was previously global.  We still have global data because some applications
59  * have had those global objects compiled into them.  (Such applications will
60  * be unable to benefit from uselocale(), btw.)  The legacy routines are
61  * reimplemented as wrappers that use the appropriate locale object by
62  * calling uselocale().  uselocale() when passed a NULL pointer returns the
63  * thread-specific locale object if one is present, or the global locale
64  * object otherwise.  Note that once the TSD data is set, the only way
65  * to revert to the global locale is to pass the global locale LC_GLOBAL_LOCALE
66  * to uselocale().
67  *
68  * We are careful to minimize performance impact of multiple calls to
69  * uselocale() or setlocale() by using a cache of locale data whenever possible.
70  * As a consequence of this, applications that iterate over all possible
71  * locales will burn through a lot of virtual memory, but we find such
72  * applications rare.  (locale -a might be an exception, but it is short lived.)
73  *
74  * Category data is never released (although enclosing locale objects might be),
75  * in order to guarantee thread-safety.  Calling freelocale() on an object
76  * while it is in use by another thread is a programmer error (use-after-free)
77  * and we don't bother to note it further.
78  *
79  * Locale objects (global locales) established by setlocale() are also
80  * never freed (for MT safety), but we will save previous locale objects
81  * and reuse them when we can.
82  */
83 
84 typedef struct locdata *(*loadfn_t)(const char *);
85 
86 static const loadfn_t loaders[LC_ALL] = {
87 	__lc_ctype_load,
88 	__lc_numeric_load,
89 	__lc_time_load,
90 	__lc_collate_load,
91 	__lc_monetary_load,
92 	__lc_messages_load,
93 };
94 
95 extern struct lc_monetary lc_monetary_posix;
96 extern struct lc_numeric lc_numeric_posix;
97 extern struct lc_messages lc_messages_posix;
98 extern struct lc_time lc_time_posix;
99 extern struct lc_ctype lc_ctype_posix;
100 extern struct lc_collate lc_collate_posix;
101 extern struct _RuneLocale _DefaultRuneLocale;
102 
103 static struct _locale posix_locale = {
104 	/* locdata */
105 	.locdata = {
106 		&__posix_ctype_locdata,
107 		&__posix_numeric_locdata,
108 		&__posix_time_locdata,
109 		&__posix_collate_locdata,
110 		&__posix_monetary_locdata,
111 		&__posix_messages_locdata,
112 	},
113 	.locname = "C",
114 	.ctype = &lc_ctype_posix,
115 	.numeric = &lc_numeric_posix,
116 	.collate = &lc_collate_posix,
117 	.monetary = &lc_monetary_posix,
118 	.messages = &lc_messages_posix,
119 	.time = &lc_time_posix,
120 	.runelocale = &_DefaultRuneLocale,
121 };
122 
123 locale_t ___global_locale = &posix_locale;
124 
125 locale_t
__global_locale(void)126 __global_locale(void)
127 {
128 	return (___global_locale);
129 }
130 
131 /*
132  * Locale data for hybrid C.UTF-8 locale having all the characteristics of
133  * default C/POSIX locale, except for LC_CTYPE data which is retrieved from
134  * cache/file as for other UTF-8 locales.
135  */
136 static struct locdata cutf_locdata[LC_ALL] = {
137 	{ "C.UTF-8", NULL }, /* unused */
138 	{ "C.UTF-8", &lc_numeric_posix },
139 	{ "C.UTF-8", &lc_time_posix },
140 	{ "C.UTF-8", &lc_collate_posix },
141 	{ "C.UTF-8", &lc_monetary_posix },
142 	{ "C.UTF-8", &lc_messages_posix },
143 };
144 
/*
 * Category names for getenv().  Note that this was modified
 * for Solaris.  See <iso/locale_iso.h>.
 *
 * The table is indexed by category number; the final entry is LC_ALL.
 * Neither the table nor the strings are ever modified, so both are const,
 * and the length comes from NUM_CATS rather than a repeated literal.
 */
#define	NUM_CATS	7
static const char *const categories[NUM_CATS] = {
	"LC_CTYPE",
	"LC_NUMERIC",
	"LC_TIME",
	"LC_COLLATE",
	"LC_MONETARY",
	"LC_MESSAGES",
	"LC_ALL",
};
159 
160 /*
161  * Prototypes.
162  */
163 static const char *get_locale_env(int);
164 static struct locdata *locdata_get(int, const char *);
165 static struct locdata *locdata_get_cache(int, const char *);
166 static locale_t mklocname(locale_t);
167 
168 /*
169  * Some utility routines.
170  */
171 
172 struct locdata *
__locdata_alloc(const char * name,size_t memsz)173 __locdata_alloc(const char *name, size_t memsz)
174 {
175 	struct locdata *ldata;
176 
177 	if ((ldata = lmalloc(sizeof (*ldata))) == NULL) {
178 		return (NULL);
179 	}
180 	if ((ldata->l_data[0] = libc_malloc(memsz)) == NULL) {
181 		lfree(ldata, sizeof (*ldata));
182 		errno = ENOMEM;
183 		return (NULL);
184 	}
185 	(void) strlcpy(ldata->l_lname, name, sizeof (ldata->l_lname));
186 
187 	return (ldata);
188 }
189 
190 /*
191  * Normally we never free locale data truly, but if we failed to load it
192  * for some reason, this routine is used to cleanup the partial mess.
193  */
194 void
__locdata_free(struct locdata * ldata)195 __locdata_free(struct locdata *ldata)
196 {
197 	for (int i = 0; i < NLOCDATA; i++)
198 		libc_free(ldata->l_data[i]);
199 	if (ldata->l_map != NULL && ldata->l_map_len)
200 		(void) munmap(ldata->l_map, ldata->l_map_len);
201 	lfree(ldata, sizeof (*ldata));
202 }
203 
204 /*
205  * It turns out that for performance reasons we would really like to
206  * cache the most recently referenced locale data to avoid wasteful
207  * loading from files.
208  */
209 
210 static struct locdata *cache_data[LC_ALL];
211 static struct locdata *cat_data[LC_ALL];
212 static mutex_t cache_lock = DEFAULTMUTEX;
213 
214 /*
215  * Returns the cached data if the locale name is the same.  If not,
216  * returns NULL (cache miss).  The locdata is returned with a hold on
217  * it, taken on behalf of the caller.  The caller should drop the hold
218  * when it is finished.
219  */
220 static struct locdata *
locdata_get_cache(int category,const char * locname)221 locdata_get_cache(int category, const char *locname)
222 {
223 	struct locdata *loc;
224 
225 	if (category < 0 || category >= LC_ALL)
226 		return (NULL);
227 
228 	/* Try cache first. */
229 	lmutex_lock(&cache_lock);
230 	loc = cache_data[category];
231 
232 	if ((loc != NULL) && (strcmp(loc->l_lname, locname) == 0)) {
233 		lmutex_unlock(&cache_lock);
234 		return (loc);
235 	}
236 
237 	/*
238 	 * Failing that try previously loaded locales (linear search) --
239 	 * this could be optimized to a hash, but its unlikely that a single
240 	 * application will ever need to work with more than a few locales.
241 	 */
242 	for (loc = cat_data[category]; loc != NULL; loc = loc->l_next) {
243 		if (strcmp(locname, loc->l_lname) == 0) {
244 			break;
245 		}
246 	}
247 
248 	/*
249 	 * Finally, if we still don't have one, try loading the locale
250 	 * data from the actual on-disk data.
251 	 *
252 	 * We drop the lock (libc wants to ensure no internal locks
253 	 * are held when we call other routines required to read from
254 	 * files, allocate memory, etc.)  There is a small race here,
255 	 * but the consequences of the race are benign -- if multiple
256 	 * threads hit this at precisely the same point, we could
257 	 * wind up with duplicates of the locale data in the cache.
258 	 *
259 	 * This wastes the memory for an extra copy of the locale
260 	 * data, but there is no further harm beyond that.  Its not
261 	 * worth the effort to recode this to something "safe"
262 	 * (which would require rescanning the list, etc.), given
263 	 * that this race will probably never actually occur.
264 	 */
265 	if (loc == NULL) {
266 		lmutex_unlock(&cache_lock);
267 		loc = (*loaders[category])(locname);
268 		lmutex_lock(&cache_lock);
269 		if (loc != NULL)
270 			(void) strlcpy(loc->l_lname, locname,
271 			    sizeof (loc->l_lname));
272 	}
273 
274 	/*
275 	 * Assuming we got one, update the cache, and stick us on the list
276 	 * of loaded locale data.  We insert into the head (more recent
277 	 * use is likely to win.)
278 	 */
279 	if (loc != NULL) {
280 		cache_data[category] = loc;
281 		if (!loc->l_cached) {
282 			loc->l_cached = 1;
283 			loc->l_next = cat_data[category];
284 			cat_data[category] = loc;
285 		}
286 	}
287 
288 	lmutex_unlock(&cache_lock);
289 	return (loc);
290 }
291 
/*
 * Charmap aliases, mostly found on Linux.  Each entry maps an alternate
 * spelling of a charmap (alias) to the spelling used here (name).  The
 * table is terminated by an entry whose alias is NULL.
 */
static const struct {
	const char *alias;
	const char *name;
} cmalias[] = {
	{ .alias = "utf8",	.name = "UTF-8" },
	{ .alias = "iso88591",	.name = "ISO8859-1" },
	{ .alias = "iso885915",	.name = "ISO8859-15" },
	{ .alias = "gb18030",	.name = "GB18030" },
	{ .alias = "koi8r",	.name = "KOI8-R" },
	{ .alias = NULL,	.name = NULL }
};
304 
305 /*
306  * Routine to get the locdata for a given category and locale.
307  * This includes retrieving it from cache, retrieving it from
308  * a file, etc.
309  */
310 static struct locdata *
locdata_get(int category,const char * locname)311 locdata_get(int category, const char *locname)
312 {
313 	char scratch[ENCODING_LEN + 1];
314 	char scratch2[ENCODING_LEN + 1];
315 	char *slash, *cm;
316 	int cnt;
317 	int len;
318 	int i;
319 
320 	if (locname == NULL || *locname == 0) {
321 		locname = get_locale_env(category);
322 	}
323 
324 	/*
325 	 * Extract the locale name for the category if it is a composite
326 	 * locale.
327 	 */
328 	if ((slash = strchr(locname, '/')) != NULL) {
329 		for (cnt = category; cnt && slash != NULL; cnt--) {
330 			locname = slash + 1;
331 			slash = strchr(locname, '/');
332 		}
333 		if (slash) {
334 			len = slash - locname + 1;
335 			if (len >= sizeof (scratch)) {
336 				len = sizeof (scratch);
337 			}
338 		} else {
339 			len = sizeof (scratch);
340 		}
341 		(void) strlcpy(scratch, locname, len);
342 		locname = scratch;
343 	}
344 
345 	if ((strcmp(locname, "C") == 0) || (strcmp(locname, "POSIX") == 0))
346 		return (posix_locale.locdata[category]);
347 
348 	/* Handle charmap aliases */
349 	for (i = 0; cmalias[i].alias != NULL; i++) {
350 		if ((cm = strstr(locname, cmalias[i].alias)) != NULL &&
351 		    strlen(cm) == strlen(cmalias[i].alias)) {
352 			len = cm - locname + 1;
353 			if (len + strlen(cmalias[i].name) >= sizeof (scratch2))
354 				break;
355 			(void) strlcpy(scratch2, locname, len);
356 			(void) strlcat(scratch2, cmalias[i].name,
357 			    sizeof (scratch2));
358 			locname = scratch2;
359 			break;
360 		}
361 	}
362 
363 	if ((strcmp(locname, "C.UTF-8") == 0) && (category != LC_CTYPE))
364 		return (&cutf_locdata[category]);
365 
366 	return (locdata_get_cache(category, locname));
367 }
368 
369 /* tsd destructor */
370 static void
freelocptr(void * arg)371 freelocptr(void *arg)
372 {
373 	locale_t *locptr = arg;
374 	if (*locptr != NULL)
375 		freelocale(*locptr);
376 }
377 
378 static const char *
get_locale_env(int category)379 get_locale_env(int category)
380 {
381 	const char *env;
382 
383 	/* 1. check LC_ALL. */
384 	env = getenv(categories[LC_ALL]);
385 
386 	/* 2. check LC_* */
387 	if (env == NULL || *env == '\0')
388 		env = getenv(categories[category]);
389 
390 	/* 3. check LANG */
391 	if (env == NULL || *env == '\0')
392 		env = getenv("LANG");
393 
394 	/* 4. if none is set, fall to "C" */
395 	if (env == NULL || *env == '\0')
396 		env = "C";
397 
398 	return (env);
399 }
400 
401 
402 /*
403  * This routine is exposed via the MB_CUR_MAX macro.  Note that legacy
404  * code will continue to use _ctype[520], but we prefer this function as
405  * it is the only way to get thread-specific information.
406  */
407 unsigned char
__mb_cur_max_l(locale_t loc)408 __mb_cur_max_l(locale_t loc)
409 {
410 	return (loc->ctype->lc_max_mblen);
411 }
412 
413 unsigned char
__mb_cur_max(void)414 __mb_cur_max(void)
415 {
416 	return (__mb_cur_max_l(uselocale(NULL)));
417 }
418 
419 /*
420  * Public interfaces.
421  */
422 
423 locale_t
duplocale(locale_t src)424 duplocale(locale_t src)
425 {
426 	locale_t	loc;
427 	int		i;
428 
429 	loc = lmalloc(sizeof (*loc));
430 	if (loc == NULL) {
431 		return (NULL);
432 	}
433 	if (src == NULL) {
434 		/* illumos extension: POSIX says LC_GLOBAL_LOCALE here */
435 		src = ___global_locale;
436 	}
437 	for (i = 0; i < LC_ALL; i++) {
438 		loc->locdata[i] = src->locdata[i];
439 		loc->loaded[i] = 0;
440 	}
441 	loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
442 	loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
443 	loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
444 	loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
445 	loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
446 	loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
447 	loc->time = loc->locdata[LC_TIME]->l_data[0];
448 	return (loc);
449 }
450 
451 void
freelocale(locale_t loc)452 freelocale(locale_t loc)
453 {
454 	/*
455 	 * We take extra care never to free a saved locale created by
456 	 * setlocale().  This shouldn't be strictly necessary, but a little
457 	 * extra safety doesn't hurt here.
458 	 */
459 	if ((loc != NULL) && (loc != &posix_locale) && (!loc->on_list))
460 		lfree(loc, sizeof (*loc));
461 }
462 
463 locale_t
newlocale(int catmask,const char * locname,locale_t base)464 newlocale(int catmask, const char *locname, locale_t base)
465 {
466 	locale_t loc;
467 	int i, e;
468 
469 	if (catmask & ~(LC_ALL_MASK)) {
470 		errno = EINVAL;
471 		return (NULL);
472 	}
473 
474 	/*
475 	 * Technically passing LC_GLOBAL_LOCALE here is illegal,
476 	 * but we allow it.
477 	 */
478 	if (base == NULL || base == ___global_locale) {
479 		loc = duplocale(___global_locale);
480 	} else {
481 		loc = duplocale(base);
482 	}
483 	if (loc == NULL) {
484 		return (NULL);
485 	}
486 
487 	for (i = 0; i < LC_ALL; i++) {
488 		struct locdata *ldata;
489 		loc->loaded[i] = 0;
490 		if (((1 << i) & catmask) == 0) {
491 			/* Default to base locale if not overriding */
492 			continue;
493 		}
494 		ldata = locdata_get(i, locname);
495 		if (ldata == NULL) {
496 			e = errno;
497 			freelocale(loc);
498 			errno = e;
499 			return (NULL);
500 		}
501 		loc->locdata[i] = ldata;
502 	}
503 	loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
504 	loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
505 	loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
506 	loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
507 	loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
508 	loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
509 	loc->time = loc->locdata[LC_TIME]->l_data[0];
510 	freelocale(base);
511 
512 	return (mklocname(loc));
513 }
514 
515 locale_t
uselocale(locale_t loc)516 uselocale(locale_t loc)
517 {
518 	locale_t lastloc = ___global_locale;
519 	locale_t *locptr;
520 
521 	locptr = tsdalloc(_T_SETLOCALE, sizeof (locale_t), freelocptr);
522 	/* Should never occur */
523 	if (locptr == NULL) {
524 		errno = EINVAL;
525 		return (NULL);
526 	}
527 
528 	if (*locptr != NULL)
529 		lastloc = *locptr;
530 
531 	/* Argument loc is NULL if we are just querying. */
532 	if (loc != NULL) {
533 		/*
534 		 * Set it to LC_GLOBAL_LOCAL to return to using
535 		 * the global locale (setlocale).
536 		 */
537 		if (loc == ___global_locale) {
538 			*locptr = NULL;
539 		} else {
540 			/* No validation of the provided locale at present */
541 			*locptr = loc;
542 		}
543 	}
544 
545 	/*
546 	 * The caller is responsible for freeing, of course it would be
547 	 * gross error to call freelocale() on a locale object that is still
548 	 * in use.
549 	 */
550 	return (lastloc);
551 }
552 
553 static locale_t
mklocname(locale_t loc)554 mklocname(locale_t loc)
555 {
556 	int composite = 0;
557 
558 	/* Look to see if any category is different */
559 	for (int i = 1; i < LC_ALL; ++i) {
560 		if (strcmp(loc->locdata[0]->l_lname,
561 		    loc->locdata[i]->l_lname) != 0) {
562 			composite = 1;
563 			break;
564 		}
565 	}
566 
567 	if (composite) {
568 		/*
569 		 * Note ordering of these follows the numeric order,
570 		 * if the order is changed, then setlocale() will need
571 		 * to be changed as well.
572 		 */
573 		(void) snprintf(loc->locname, sizeof (loc->locname),
574 		    "%s/%s/%s/%s/%s/%s",
575 		    loc->locdata[LC_CTYPE]->l_lname,
576 		    loc->locdata[LC_NUMERIC]->l_lname,
577 		    loc->locdata[LC_TIME]->l_lname,
578 		    loc->locdata[LC_COLLATE]->l_lname,
579 		    loc->locdata[LC_MONETARY]->l_lname,
580 		    loc->locdata[LC_MESSAGES]->l_lname);
581 	} else {
582 		(void) strlcpy(loc->locname, loc->locdata[LC_CTYPE]->l_lname,
583 		    sizeof (loc->locname));
584 	}
585 	return (loc);
586 }
587 
588 /*
589  * POSIX has several lifetime requirements that vary on the type of locale.
590  *
591  * If the locale is LC_GLOBAL_LOCALE, the returned string is required to live
592  * beyond the locale's use as the global locale. The specification suggests that
593  * this use a thread-local buffer and cautions that it may disappear when the
594  * thread terminates or another LC_GLOBAL_LOCALE call is made. In our case,
595  * because we will never free a locale that is set with setlocale() (see
596  * port/locale/setlocale.c), we can simply return the name of the locale
597  * directly.
598  *
599  * If the locale is any other locale, it is allowed to be invalidated by a call
600  * to uselocale() or newlocale().
601  *
602  * In both of these cases this means that we can simply return the string from
603  * the current object. POSIX importantly states that the application is not
604  * allowed to assume the name will stay the same across invocations and
605  * therefore it cannot be relied upon for serialization. However, it will work
606  * with setlocale() again.
607  */
608 const char *
getlocalename_l(int category,locale_t loc)609 getlocalename_l(int category, locale_t loc)
610 {
611 	if (loc == NULL) {
612 		return (NULL);
613 	}
614 
615 	switch (category) {
616 	case LC_CTYPE:
617 	case LC_NUMERIC:
618 	case LC_TIME:
619 	case LC_COLLATE:
620 	case LC_MONETARY:
621 	case LC_MESSAGES:
622 		return (loc->locdata[category]->l_lname);
623 	case LC_ALL:
624 		return (loc->locname);
625 	default:
626 		/*
627 		 * POSIX does not define any errors here so we can't indicate
628 		 * anything via errno or similar.
629 		 */
630 		return (NULL);
631 	}
632 }
633