1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
14 * Copyright 2025 Oxide Computer Company
15 */
16
17 /*
18 * This file implements the 2008 newlocale and friends handling.
19 */
20
21 #ifndef _LCONV_C99
22 #define _LCONV_C99
23 #endif
24
25 #include "lint.h"
26 #include <atomic.h>
27 #include <locale.h>
28 #include <sys/types.h>
29 #include <sys/mman.h>
30 #include <errno.h>
31 #include <string.h>
32 #include "libc.h"
33 #include "mtlib.h"
34 #include "tsd.h"
35 #include "localeimpl.h"
36 #include "lctype.h"
37
38 /*
39 * Big Theory of Locales:
40 *
41 * (It is recommended that readers familiarize themselves with the POSIX
42 * 2008 (XPG Issue 7) specifications for locales, first.)
43 *
44 * Historically, we had a bunch of global variables that stored locale
45 * data. While this worked well, it limited applications to a single locale
46 * at a time. This doesn't work well in certain server applications.
47 *
48 * Issue 7, X/Open introduced the concept of a locale_t object, along with
49 * versions of functions that can take this object as a parameter, along
50 * with functions to clone and manipulate these locale objects. The new
51 * functions are named with a _l() suffix.
52 *
 * Additionally uselocale() is introduced which can change the locale of
 * a single thread.  However, setlocale() can still be used to change
55 * the global locale.
56 *
57 * In our implementation, we use libc's TSD to store the locale data that
58 * was previously global. We still have global data because some applications
59 * have had those global objects compiled into them. (Such applications will
60 * be unable to benefit from uselocale(), btw.) The legacy routines are
61 * reimplemented as wrappers that use the appropriate locale object by
62 * calling uselocale(). uselocale() when passed a NULL pointer returns the
63 * thread-specific locale object if one is present, or the global locale
64 * object otherwise. Note that once the TSD data is set, the only way
65 * to revert to the global locale is to pass the global locale LC_GLOBAL_LOCALE
66 * to uselocale().
67 *
68 * We are careful to minimize performance impact of multiple calls to
69 * uselocale() or setlocale() by using a cache of locale data whenever possible.
70 * As a consequence of this, applications that iterate over all possible
71 * locales will burn through a lot of virtual memory, but we find such
72 * applications rare. (locale -a might be an exception, but it is short lived.)
73 *
74 * Category data is never released (although enclosing locale objects might be),
75 * in order to guarantee thread-safety. Calling freelocale() on an object
76 * while it is in use by another thread is a programmer error (use-after-free)
77 * and we don't bother to note it further.
78 *
79 * Locale objects (global locales) established by setlocale() are also
80 * never freed (for MT safety), but we will save previous locale objects
81 * and reuse them when we can.
82 */
83
84 typedef struct locdata *(*loadfn_t)(const char *);
85
86 static const loadfn_t loaders[LC_ALL] = {
87 __lc_ctype_load,
88 __lc_numeric_load,
89 __lc_time_load,
90 __lc_collate_load,
91 __lc_monetary_load,
92 __lc_messages_load,
93 };
94
/*
 * Built-in C/POSIX category data, defined in other translation units.
 * These objects back the static locale tables defined in this file.
 */
extern struct lc_monetary lc_monetary_posix;
extern struct lc_numeric lc_numeric_posix;
extern struct lc_messages lc_messages_posix;
extern struct lc_time lc_time_posix;
extern struct lc_ctype lc_ctype_posix;
extern struct lc_collate lc_collate_posix;
extern struct _RuneLocale _DefaultRuneLocale;
102
103 static struct _locale posix_locale = {
104 /* locdata */
105 .locdata = {
106 &__posix_ctype_locdata,
107 &__posix_numeric_locdata,
108 &__posix_time_locdata,
109 &__posix_collate_locdata,
110 &__posix_monetary_locdata,
111 &__posix_messages_locdata,
112 },
113 .locname = "C",
114 .ctype = &lc_ctype_posix,
115 .numeric = &lc_numeric_posix,
116 .collate = &lc_collate_posix,
117 .monetary = &lc_monetary_posix,
118 .messages = &lc_messages_posix,
119 .time = &lc_time_posix,
120 .runelocale = &_DefaultRuneLocale,
121 };
122
/*
 * The global locale object.  It initially points at the static "C" locale.
 */
locale_t ___global_locale = &posix_locale;

/*
 * Accessor that returns the current value of ___global_locale.
 */
locale_t
__global_locale(void)
{
	return (___global_locale);
}
130
131 /*
132 * Locale data for hybrid C.UTF-8 locale having all the characteristics of
133 * default C/POSIX locale, except for LC_CTYPE data which is retrieved from
134 * cache/file as for other UTF-8 locales.
135 */
136 static struct locdata cutf_locdata[LC_ALL] = {
137 { "C.UTF-8", NULL }, /* unused */
138 { "C.UTF-8", &lc_numeric_posix },
139 { "C.UTF-8", &lc_time_posix },
140 { "C.UTF-8", &lc_collate_posix },
141 { "C.UTF-8", &lc_monetary_posix },
142 { "C.UTF-8", &lc_messages_posix },
143 };
144
/*
 * Environment variable names passed to getenv(), indexed by LC_* category
 * number, with LC_ALL as the final entry.  Note that this ordering was
 * modified for Solaris.  See <iso/locale_iso.h>.
 *
 * The table is read-only, and it is sized by NUM_CATS so that the constant
 * and the array cannot drift apart.
 */
#define	NUM_CATS	7
static const char *const categories[NUM_CATS] = {
	"LC_CTYPE",
	"LC_NUMERIC",
	"LC_TIME",
	"LC_COLLATE",
	"LC_MONETARY",
	"LC_MESSAGES",
	"LC_ALL",
};
159
/*
 * Prototypes.
 *
 * Forward declarations of the file-local helpers defined later in this file.
 */
static const char *get_locale_env(int);
static struct locdata *locdata_get(int, const char *);
static struct locdata *locdata_get_cache(int, const char *);
static locale_t mklocname(locale_t);
167
168 /*
169 * Some utility routines.
170 */
171
172 struct locdata *
__locdata_alloc(const char * name,size_t memsz)173 __locdata_alloc(const char *name, size_t memsz)
174 {
175 struct locdata *ldata;
176
177 if ((ldata = lmalloc(sizeof (*ldata))) == NULL) {
178 return (NULL);
179 }
180 if ((ldata->l_data[0] = libc_malloc(memsz)) == NULL) {
181 lfree(ldata, sizeof (*ldata));
182 errno = ENOMEM;
183 return (NULL);
184 }
185 (void) strlcpy(ldata->l_lname, name, sizeof (ldata->l_lname));
186
187 return (ldata);
188 }
189
190 /*
191 * Normally we never free locale data truly, but if we failed to load it
192 * for some reason, this routine is used to cleanup the partial mess.
193 */
194 void
__locdata_free(struct locdata * ldata)195 __locdata_free(struct locdata *ldata)
196 {
197 for (int i = 0; i < NLOCDATA; i++)
198 libc_free(ldata->l_data[i]);
199 if (ldata->l_map != NULL && ldata->l_map_len)
200 (void) munmap(ldata->l_map, ldata->l_map_len);
201 lfree(ldata, sizeof (*ldata));
202 }
203
/*
 * It turns out that for performance reasons we would really like to
 * cache the most recently referenced locale data to avoid wasteful
 * loading from files.
 *
 * cache_data[] holds, per category, the locdata most recently returned by
 * locdata_get_cache().  cat_data[] holds, per category, the head of a linked
 * list (chained through l_next) of every locdata entered into the cache.
 * locdata_get_cache() accesses both arrays while holding cache_lock.
 */

static struct locdata *cache_data[LC_ALL];
static struct locdata *cat_data[LC_ALL];
static mutex_t cache_lock = DEFAULTMUTEX;
213
214 /*
215 * Returns the cached data if the locale name is the same. If not,
216 * returns NULL (cache miss). The locdata is returned with a hold on
217 * it, taken on behalf of the caller. The caller should drop the hold
218 * when it is finished.
219 */
220 static struct locdata *
locdata_get_cache(int category,const char * locname)221 locdata_get_cache(int category, const char *locname)
222 {
223 struct locdata *loc;
224
225 if (category < 0 || category >= LC_ALL)
226 return (NULL);
227
228 /* Try cache first. */
229 lmutex_lock(&cache_lock);
230 loc = cache_data[category];
231
232 if ((loc != NULL) && (strcmp(loc->l_lname, locname) == 0)) {
233 lmutex_unlock(&cache_lock);
234 return (loc);
235 }
236
237 /*
238 * Failing that try previously loaded locales (linear search) --
239 * this could be optimized to a hash, but its unlikely that a single
240 * application will ever need to work with more than a few locales.
241 */
242 for (loc = cat_data[category]; loc != NULL; loc = loc->l_next) {
243 if (strcmp(locname, loc->l_lname) == 0) {
244 break;
245 }
246 }
247
248 /*
249 * Finally, if we still don't have one, try loading the locale
250 * data from the actual on-disk data.
251 *
252 * We drop the lock (libc wants to ensure no internal locks
253 * are held when we call other routines required to read from
254 * files, allocate memory, etc.) There is a small race here,
255 * but the consequences of the race are benign -- if multiple
256 * threads hit this at precisely the same point, we could
257 * wind up with duplicates of the locale data in the cache.
258 *
259 * This wastes the memory for an extra copy of the locale
260 * data, but there is no further harm beyond that. Its not
261 * worth the effort to recode this to something "safe"
262 * (which would require rescanning the list, etc.), given
263 * that this race will probably never actually occur.
264 */
265 if (loc == NULL) {
266 lmutex_unlock(&cache_lock);
267 loc = (*loaders[category])(locname);
268 lmutex_lock(&cache_lock);
269 if (loc != NULL)
270 (void) strlcpy(loc->l_lname, locname,
271 sizeof (loc->l_lname));
272 }
273
274 /*
275 * Assuming we got one, update the cache, and stick us on the list
276 * of loaded locale data. We insert into the head (more recent
277 * use is likely to win.)
278 */
279 if (loc != NULL) {
280 cache_data[category] = loc;
281 if (!loc->l_cached) {
282 loc->l_cached = 1;
283 loc->l_next = cat_data[category];
284 cat_data[category] = loc;
285 }
286 }
287
288 lmutex_unlock(&cache_lock);
289 return (loc);
290 }
291
/*
 * Charmap aliases, mostly found in Linux.  locdata_get() rewrites a locale
 * name that ends in `alias' so that it ends in `name' instead.  The table is
 * terminated by an entry whose alias is NULL.
 */
static const struct {
	const char *alias;	/* spelling to look for at the end of a name */
	const char *name;	/* spelling substituted for the alias */
} cmalias[] = {
	{ "utf8", "UTF-8" },
	{ "iso88591", "ISO8859-1" },
	{ "iso885915", "ISO8859-15" },
	{ "gb18030", "GB18030" },
	{ "koi8r", "KOI8-R" },
	{ NULL, NULL }
};
304
305 /*
306 * Routine to get the locdata for a given category and locale.
307 * This includes retrieving it from cache, retrieving it from
308 * a file, etc.
309 */
310 static struct locdata *
locdata_get(int category,const char * locname)311 locdata_get(int category, const char *locname)
312 {
313 char scratch[ENCODING_LEN + 1];
314 char scratch2[ENCODING_LEN + 1];
315 char *slash, *cm;
316 int cnt;
317 int len;
318 int i;
319
320 if (locname == NULL || *locname == 0) {
321 locname = get_locale_env(category);
322 }
323
324 /*
325 * Extract the locale name for the category if it is a composite
326 * locale.
327 */
328 if ((slash = strchr(locname, '/')) != NULL) {
329 for (cnt = category; cnt && slash != NULL; cnt--) {
330 locname = slash + 1;
331 slash = strchr(locname, '/');
332 }
333 if (slash) {
334 len = slash - locname + 1;
335 if (len >= sizeof (scratch)) {
336 len = sizeof (scratch);
337 }
338 } else {
339 len = sizeof (scratch);
340 }
341 (void) strlcpy(scratch, locname, len);
342 locname = scratch;
343 }
344
345 if ((strcmp(locname, "C") == 0) || (strcmp(locname, "POSIX") == 0))
346 return (posix_locale.locdata[category]);
347
348 /* Handle charmap aliases */
349 for (i = 0; cmalias[i].alias != NULL; i++) {
350 if ((cm = strstr(locname, cmalias[i].alias)) != NULL &&
351 strlen(cm) == strlen(cmalias[i].alias)) {
352 len = cm - locname + 1;
353 if (len + strlen(cmalias[i].name) >= sizeof (scratch2))
354 break;
355 (void) strlcpy(scratch2, locname, len);
356 (void) strlcat(scratch2, cmalias[i].name,
357 sizeof (scratch2));
358 locname = scratch2;
359 break;
360 }
361 }
362
363 if ((strcmp(locname, "C.UTF-8") == 0) && (category != LC_CTYPE))
364 return (&cutf_locdata[category]);
365
366 return (locdata_get_cache(category, locname));
367 }
368
369 /* tsd destructor */
370 static void
freelocptr(void * arg)371 freelocptr(void *arg)
372 {
373 locale_t *locptr = arg;
374 if (*locptr != NULL)
375 freelocale(*locptr);
376 }
377
378 static const char *
get_locale_env(int category)379 get_locale_env(int category)
380 {
381 const char *env;
382
383 /* 1. check LC_ALL. */
384 env = getenv(categories[LC_ALL]);
385
386 /* 2. check LC_* */
387 if (env == NULL || *env == '\0')
388 env = getenv(categories[category]);
389
390 /* 3. check LANG */
391 if (env == NULL || *env == '\0')
392 env = getenv("LANG");
393
394 /* 4. if none is set, fall to "C" */
395 if (env == NULL || *env == '\0')
396 env = "C";
397
398 return (env);
399 }
400
401
/*
 * This routine is exposed via the MB_CUR_MAX macro.  Note that legacy
 * code will continue to use _ctype[520], but we prefer this function as
 * it is the only way to get thread-specific information.
 *
 * Returns the lc_max_mblen value of loc's ctype data.  loc is dereferenced
 * without a NULL check, so the caller must pass a valid locale object.
 */
unsigned char
__mb_cur_max_l(locale_t loc)
{
	return (loc->ctype->lc_max_mblen);
}
412
/*
 * Same as __mb_cur_max_l(), applied to the locale reported by
 * uselocale(NULL).
 *
 * NOTE(review): uselocale() returns NULL if tsdalloc() fails, and that
 * result would be passed on unchecked here.
 */
unsigned char
__mb_cur_max(void)
{
	return (__mb_cur_max_l(uselocale(NULL)));
}
418
419 /*
420 * Public interfaces.
421 */
422
423 locale_t
duplocale(locale_t src)424 duplocale(locale_t src)
425 {
426 locale_t loc;
427 int i;
428
429 loc = lmalloc(sizeof (*loc));
430 if (loc == NULL) {
431 return (NULL);
432 }
433 if (src == NULL) {
434 /* illumos extension: POSIX says LC_GLOBAL_LOCALE here */
435 src = ___global_locale;
436 }
437 for (i = 0; i < LC_ALL; i++) {
438 loc->locdata[i] = src->locdata[i];
439 loc->loaded[i] = 0;
440 }
441 loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
442 loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
443 loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
444 loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
445 loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
446 loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
447 loc->time = loc->locdata[LC_TIME]->l_data[0];
448 return (loc);
449 }
450
451 void
freelocale(locale_t loc)452 freelocale(locale_t loc)
453 {
454 /*
455 * We take extra care never to free a saved locale created by
456 * setlocale(). This shouldn't be strictly necessary, but a little
457 * extra safety doesn't hurt here.
458 */
459 if ((loc != NULL) && (loc != &posix_locale) && (!loc->on_list))
460 lfree(loc, sizeof (*loc));
461 }
462
463 locale_t
newlocale(int catmask,const char * locname,locale_t base)464 newlocale(int catmask, const char *locname, locale_t base)
465 {
466 locale_t loc;
467 int i, e;
468
469 if (catmask & ~(LC_ALL_MASK)) {
470 errno = EINVAL;
471 return (NULL);
472 }
473
474 /*
475 * Technically passing LC_GLOBAL_LOCALE here is illegal,
476 * but we allow it.
477 */
478 if (base == NULL || base == ___global_locale) {
479 loc = duplocale(___global_locale);
480 } else {
481 loc = duplocale(base);
482 }
483 if (loc == NULL) {
484 return (NULL);
485 }
486
487 for (i = 0; i < LC_ALL; i++) {
488 struct locdata *ldata;
489 loc->loaded[i] = 0;
490 if (((1 << i) & catmask) == 0) {
491 /* Default to base locale if not overriding */
492 continue;
493 }
494 ldata = locdata_get(i, locname);
495 if (ldata == NULL) {
496 e = errno;
497 freelocale(loc);
498 errno = e;
499 return (NULL);
500 }
501 loc->locdata[i] = ldata;
502 }
503 loc->collate = loc->locdata[LC_COLLATE]->l_data[0];
504 loc->ctype = loc->locdata[LC_CTYPE]->l_data[0];
505 loc->runelocale = loc->locdata[LC_CTYPE]->l_data[1];
506 loc->messages = loc->locdata[LC_MESSAGES]->l_data[0];
507 loc->monetary = loc->locdata[LC_MONETARY]->l_data[0];
508 loc->numeric = loc->locdata[LC_NUMERIC]->l_data[0];
509 loc->time = loc->locdata[LC_TIME]->l_data[0];
510 freelocale(base);
511
512 return (mklocname(loc));
513 }
514
515 locale_t
uselocale(locale_t loc)516 uselocale(locale_t loc)
517 {
518 locale_t lastloc = ___global_locale;
519 locale_t *locptr;
520
521 locptr = tsdalloc(_T_SETLOCALE, sizeof (locale_t), freelocptr);
522 /* Should never occur */
523 if (locptr == NULL) {
524 errno = EINVAL;
525 return (NULL);
526 }
527
528 if (*locptr != NULL)
529 lastloc = *locptr;
530
531 /* Argument loc is NULL if we are just querying. */
532 if (loc != NULL) {
533 /*
534 * Set it to LC_GLOBAL_LOCAL to return to using
535 * the global locale (setlocale).
536 */
537 if (loc == ___global_locale) {
538 *locptr = NULL;
539 } else {
540 /* No validation of the provided locale at present */
541 *locptr = loc;
542 }
543 }
544
545 /*
546 * The caller is responsible for freeing, of course it would be
547 * gross error to call freelocale() on a locale object that is still
548 * in use.
549 */
550 return (lastloc);
551 }
552
553 static locale_t
mklocname(locale_t loc)554 mklocname(locale_t loc)
555 {
556 int composite = 0;
557
558 /* Look to see if any category is different */
559 for (int i = 1; i < LC_ALL; ++i) {
560 if (strcmp(loc->locdata[0]->l_lname,
561 loc->locdata[i]->l_lname) != 0) {
562 composite = 1;
563 break;
564 }
565 }
566
567 if (composite) {
568 /*
569 * Note ordering of these follows the numeric order,
570 * if the order is changed, then setlocale() will need
571 * to be changed as well.
572 */
573 (void) snprintf(loc->locname, sizeof (loc->locname),
574 "%s/%s/%s/%s/%s/%s",
575 loc->locdata[LC_CTYPE]->l_lname,
576 loc->locdata[LC_NUMERIC]->l_lname,
577 loc->locdata[LC_TIME]->l_lname,
578 loc->locdata[LC_COLLATE]->l_lname,
579 loc->locdata[LC_MONETARY]->l_lname,
580 loc->locdata[LC_MESSAGES]->l_lname);
581 } else {
582 (void) strlcpy(loc->locname, loc->locdata[LC_CTYPE]->l_lname,
583 sizeof (loc->locname));
584 }
585 return (loc);
586 }
587
588 /*
589 * POSIX has several lifetime requirements that vary on the type of locale.
590 *
591 * If the locale is LC_GLOBAL_LOCALE, the returned string is required to live
592 * beyond the locale's use as the global locale. The specification suggests that
593 * this use a thread-local buffer and cautions that it may disappear when the
594 * thread terminates or another LC_GLOBAL_LOCALE call is made. In our case,
595 * because we will never free a locale that is set with setlocale() (see
596 * port/locale/setlocale.c), we can simply return the name of the locale
597 * directly.
598 *
599 * If the locale is any other locale, it is allowed to be invalidated by a call
600 * to uselocale() or newlocale().
601 *
602 * In both of these cases this means that we can simply return the string from
603 * the current object. POSIX importantly states that the application is not
604 * allowed to assume the name will stay the same across invocations and
605 * therefore it cannot be relied upon for serialization. However, it will work
606 * with setlocale() again.
607 */
608 const char *
getlocalename_l(int category,locale_t loc)609 getlocalename_l(int category, locale_t loc)
610 {
611 if (loc == NULL) {
612 return (NULL);
613 }
614
615 switch (category) {
616 case LC_CTYPE:
617 case LC_NUMERIC:
618 case LC_TIME:
619 case LC_COLLATE:
620 case LC_MONETARY:
621 case LC_MESSAGES:
622 return (loc->locdata[category]->l_lname);
623 case LC_ALL:
624 return (loc->locname);
625 default:
626 /*
627 * POSIX does not define any errors here so we can't indicate
628 * anything via errno or similar.
629 */
630 return (NULL);
631 }
632 }
633