xref: /freebsd/lib/libc/locale/collate.c (revision 2e620256bd76c449c835c604e404483437743011)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *		at Electronni Visti IA, Kiev, Ukraine.
8  *			All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  *
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38 
39 #include <sys/cdefs.h>
40 #include "namespace.h"
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55 
56 #include "collate.h"
57 #include "setlocale.h"
58 #include "ldpart.h"
59 #include "libc_private.h"
60 
61 struct xlocale_collate __xlocale_global_collate = {
62 	{{0}, "C"}, 1, 0, 0, 0
63 };
64 
65 struct xlocale_collate __xlocale_C_collate = {
66 	{{0}, "C"}, 1, 0, 0, 0
67 };
68 
69 struct xlocale_collate __xlocale_POSIX_collate = {
70 	{{0}, "POSIX"}, 1, 0, 0, 0
71 };
72 
73 struct xlocale_collate __xlocale_CUTF8_collate = {
74 	{{0}, "C.UTF-8"}, 1, 0, 0, 0
75 };
76 
77 static int
78 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
79 
80 static void
81 destruct_collate(void *t)
82 {
83 	struct xlocale_collate *table = t;
84 	if (table->map && (table->maplen > 0)) {
85 		(void) munmap(table->map, table->maplen);
86 	}
87 	free(t);
88 }
89 
90 void *
91 __collate_load(const char *encoding, __unused locale_t unused)
92 {
93 	if (strcmp(encoding, "C") == 0)
94 		return (&__xlocale_C_collate);
95 	else if (strcmp(encoding, "POSIX") == 0)
96 		return (&__xlocale_POSIX_collate);
97 	else if (strcmp(encoding, "C.UTF-8") == 0)
98 		return (&__xlocale_CUTF8_collate);
99 
100 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate),
101 	    1);
102 	if (table == NULL)
103 		return (NULL);
104 	table->header.header.destructor = destruct_collate;
105 
106 	/*
107 	 * FIXME: Make sure that _LDP_CACHE is never returned.  We
108 	 * should be doing the caching outside of this section.
109 	 */
110 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
111 		xlocale_release(table);
112 		return (NULL);
113 	}
114 	return (table);
115 }
116 
117 /**
118  * Load the collation tables for the specified encoding into the global table.
119  */
120 int
121 __collate_load_tables(const char *encoding)
122 {
123 
124 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
125 }
126 
127 static int
128 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
129 {
130 	int i, chains, z;
131 	char *buf;
132 	char *TMP;
133 	char *map;
134 	collate_info_t *info;
135 	struct stat sbuf;
136 	int fd;
137 
138 	table->__collate_load_error = 1;
139 
140 	/* 'encoding' must be already checked. */
141 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
142 	    strncmp(encoding, "C.", 2) == 0) {
143 		return (_LDP_CACHE);
144 	}
145 
146 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
147 		return (_LDP_ERROR);
148 
149 	if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) {
150 		free(buf);
151 		return (_LDP_ERROR);
152 	}
153 	free(buf);
154 	if (_fstat(fd, &sbuf) < 0) {
155 		(void) _close(fd);
156 		return (_LDP_ERROR);
157 	}
158 	if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN +
159 			    XLOCALE_DEF_VERSION_LEN +
160 			    sizeof (*info))) {
161 		(void) _close(fd);
162 		errno = EINVAL;
163 		return (_LDP_ERROR);
164 	}
165 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
166 	(void) _close(fd);
167 	if ((TMP = map) == MAP_FAILED) {
168 		return (_LDP_ERROR);
169 	}
170 
171 	if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) {
172 		(void) munmap(map, sbuf.st_size);
173 		errno = EINVAL;
174 		return (_LDP_ERROR);
175 	}
176 	TMP += COLLATE_FMT_VERSION_LEN;
177 	strlcat(table->header.version, TMP, sizeof (table->header.version));
178 	TMP += XLOCALE_DEF_VERSION_LEN;
179 
180 	info = (void *)TMP;
181 	TMP += sizeof (*info);
182 
183 	if ((info->directive_count < 1) ||
184 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
185 	    ((chains = info->chain_count) < 0)) {
186 		(void) munmap(map, sbuf.st_size);
187 		errno = EINVAL;
188 		return (_LDP_ERROR);
189 	}
190 
191 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
192 	    (sizeof (collate_chain_t) * chains) +
193 	    (sizeof (collate_large_t) * info->large_count);
194 	for (z = 0; z < info->directive_count; z++) {
195 		i += sizeof (collate_subst_t) * info->subst_count[z];
196 	}
197 	if (i != (sbuf.st_size - (TMP - map))) {
198 		(void) munmap(map, sbuf.st_size);
199 		errno = EINVAL;
200 		return (_LDP_ERROR);
201 	}
202 
203 	if (table->map && (table->maplen > 0)) {
204 		(void) munmap(table->map, table->maplen);
205 	}
206 	table->map = map;
207 	table->maplen = sbuf.st_size;
208 	table->info = info;
209 	table->char_pri_table = (void *)TMP;
210 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
211 
212 	for (z = 0; z < info->directive_count; z++) {
213 		if (info->subst_count[z] > 0) {
214 			table->subst_table[z] = (void *)TMP;
215 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
216 		} else {
217 			table->subst_table[z] = NULL;
218 		}
219 	}
220 
221 	if (chains > 0) {
222 		table->chain_pri_table = (void *)TMP;
223 		TMP += chains * sizeof (collate_chain_t);
224 	} else
225 		table->chain_pri_table = NULL;
226 	if (info->large_count > 0)
227 		table->large_pri_table = (void *)TMP;
228 	else
229 		table->large_pri_table = NULL;
230 
231 	table->__collate_load_error = 0;
232 	return (_LDP_LOADED);
233 }
234 
235 static const int32_t *
236 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
237 {
238 	const collate_subst_t *p;
239 	int n = table->info->subst_count[pass];
240 
241 	if (n == 0)
242 		return (NULL);
243 
244 	if (pass >= table->info->directive_count)
245 		return (NULL);
246 
247 	if (!(key & COLLATE_SUBST_PRIORITY))
248 		return (NULL);
249 
250 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
251 	assert(p->key == key);
252 
253 	return (p->pri);
254 }
255 
256 static collate_chain_t *
257 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
258 {
259 	int low = 0;
260 	int high = table->info->chain_count - 1;
261 	int next, compar, l;
262 	collate_chain_t *p;
263 	collate_chain_t *tab = table->chain_pri_table;
264 
265 	if (high < 0)
266 		return (NULL);
267 
268 	while (low <= high) {
269 		next = (low + high) / 2;
270 		p = tab + next;
271 		compar = *key - *p->str;
272 		if (compar == 0) {
273 			l = wcsnlen(p->str, COLLATE_STR_LEN);
274 			compar = wcsncmp(key, p->str, l);
275 			if (compar == 0) {
276 				*len = l;
277 				return (p);
278 			}
279 		}
280 		if (compar > 0)
281 			low = next + 1;
282 		else
283 			high = next - 1;
284 	}
285 	return (NULL);
286 }
287 
288 static collate_large_t *
289 largesearch(struct xlocale_collate *table, const wchar_t key)
290 {
291 	int low = 0;
292 	int high = table->info->large_count - 1;
293 	int next, compar;
294 	collate_large_t *p;
295 	collate_large_t *tab = table->large_pri_table;
296 
297 	if (high < 0)
298 		return (NULL);
299 
300 	while (low <= high) {
301 		next = (low + high) / 2;
302 		p = tab + next;
303 		compar = key - p->val;
304 		if (compar == 0)
305 			return (p);
306 		if (compar > 0)
307 			low = next + 1;
308 		else
309 			high = next - 1;
310 	}
311 	return (NULL);
312 }
313 
314 void
315 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
316     int *pri, int which, const int **state)
317 {
318 	collate_chain_t *p2;
319 	collate_large_t *match;
320 	int p, l;
321 	const int *sptr;
322 
323 	/*
324 	 * If this is the "last" pass for the UNDEFINED, then
325 	 * we just return the priority itself.
326 	 */
327 	if (which >= table->info->directive_count) {
328 		*pri = *t;
329 		*len = 1;
330 		*state = NULL;
331 		return;
332 	}
333 
334 	/*
335 	 * If we have remaining substitution data from a previous
336 	 * call, consume it first.
337 	 */
338 	if ((sptr = *state) != NULL) {
339 		*pri = *sptr;
340 		sptr++;
341 		if ((sptr == *state) || (sptr == NULL))
342 			*state = NULL;
343 		else
344 			*state = sptr;
345 		*len = 0;
346 		return;
347 	}
348 
349 	/* No active substitutions */
350 	*len = 1;
351 
352 	/*
353 	 * Check for composites such as diphthongs that collate as a
354 	 * single element (aka chains or collating-elements).
355 	 */
356 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
357 	    ((p = p2->pri[which]) >= 0)) {
358 
359 		*len = l;
360 		*pri = p;
361 
362 	} else if (*t <= UCHAR_MAX) {
363 
364 		/*
365 		 * Character is a small (8-bit) character.
366 		 * We just look these up directly for speed.
367 		 */
368 		*pri = table->char_pri_table[*t].pri[which];
369 
370 	} else if ((table->info->large_count > 0) &&
371 	    ((match = largesearch(table, *t)) != NULL)) {
372 
373 		/*
374 		 * Character was found in the extended table.
375 		 */
376 		*pri = match->pri.pri[which];
377 
378 	} else {
379 		/*
380 		 * Character lacks a specific definition.
381 		 */
382 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
383 			/* Mask off sign bit to prevent ordering confusion. */
384 			*pri = (*t & COLLATE_MAX_PRIORITY);
385 		} else {
386 			*pri = table->info->undef_pri[which];
387 		}
388 		/* No substitutions for undefined characters! */
389 		return;
390 	}
391 
392 	/*
393 	 * Try substituting (expanding) the character.  We are
394 	 * currently doing this *after* the chain compression.  I
395 	 * think it should not matter, but this way might be slightly
396 	 * faster.
397 	 *
398 	 * We do this after the priority search, as this will help us
399 	 * to identify a single key value.  In order for this to work,
400 	 * its important that the priority assigned to a given element
401 	 * to be substituted be unique for that level.  The localedef
402 	 * code ensures this for us.
403 	 */
404 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
405 		if ((*pri = *sptr) > 0) {
406 			sptr++;
407 			*state = *sptr ? sptr : NULL;
408 		}
409 	}
410 
411 }
412 
413 /*
414  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
415  * NOT NULL terminate.  That is left to the caller.
416  */
417 size_t
418 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
419     size_t room)
420 {
421 	int		pri;
422 	int		len;
423 	const wchar_t	*t;
424 	wchar_t		*tr = NULL;
425 	int		direc;
426 	int		pass;
427 	const int32_t 	*state;
428 	size_t		want = 0;
429 	size_t		need = 0;
430 	int		ndir = table->info->directive_count;
431 
432 	assert(src);
433 
434 	for (pass = 0; pass <= ndir; pass++) {
435 
436 		state = NULL;
437 
438 		if (pass != 0) {
439 			/* insert level separator from the previous pass */
440 			if (room) {
441 				*xf++ = 1;
442 				room--;
443 			}
444 			want++;
445 		}
446 
447 		/* special pass for undefined */
448 		if (pass == ndir) {
449 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
450 		} else {
451 			direc = table->info->directive[pass];
452 		}
453 
454 		t = src;
455 
456 		if (direc & DIRECTIVE_BACKWARD) {
457 			wchar_t *bp, *fp, c;
458 			free(tr);
459 			if ((tr = wcsdup(t)) == NULL) {
460 				errno = ENOMEM;
461 				goto fail;
462 			}
463 			bp = tr;
464 			fp = tr + wcslen(tr) - 1;
465 			while (bp < fp) {
466 				c = *bp;
467 				*bp++ = *fp;
468 				*fp-- = c;
469 			}
470 			t = (const wchar_t *)tr;
471 		}
472 
473 		if (direc & DIRECTIVE_POSITION) {
474 			while (*t || state) {
475 				_collate_lookup(table, t, &len, &pri, pass, &state);
476 				t += len;
477 				if (pri <= 0) {
478 					if (pri < 0) {
479 						errno = EINVAL;
480 						goto fail;
481 					}
482 					state = NULL;
483 					pri = COLLATE_MAX_PRIORITY;
484 				}
485 				if (room) {
486 					*xf++ = pri;
487 					room--;
488 				}
489 				want++;
490 				need = want;
491 			}
492 		} else {
493 			while (*t || state) {
494 				_collate_lookup(table, t, &len, &pri, pass, &state);
495 				t += len;
496 				if (pri <= 0) {
497 					if (pri < 0) {
498 						errno = EINVAL;
499 						goto fail;
500 					}
501 					state = NULL;
502 					continue;
503 				}
504 				if (room) {
505 					*xf++ = pri;
506 					room--;
507 				}
508 				want++;
509 				need = want;
510 			}
511 		}
512 	}
513 	free(tr);
514 	return (need);
515 
516 fail:
517 	free(tr);
518 	return ((size_t)(-1));
519 }
520 
521 /*
522  * In the non-POSIX case, we transform each character into a string of
523  * characters representing the character's priority.  Since char is usually
524  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
525  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
526  * bits per byte.
527  *
528  * It turns out that we sometimes have real priorities that are
529  * 31-bits wide.  (But: be careful using priorities where the high
530  * order bit is set -- i.e. the priority is negative.  The sort order
531  * may be surprising!)
532  *
533  * TODO: This would be a good area to optimize somewhat.  It turns out
534  * that real prioririties *except for the last UNDEFINED pass* are generally
535  * very small.  We need the localedef code to precalculate the max
536  * priority for us, and ideally also give us a mask, and then we could
537  * severely limit what we expand to.
538  */
539 #define	XFRM_BYTES	6
540 #define	XFRM_OFFSET	('0')	/* make all printable characters */
541 #define	XFRM_SHIFT	6
542 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
543 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
544 
545 static int
546 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
547 {
548 	/* we use unsigned to ensure zero fill on right shift */
549 	uint32_t val = (uint32_t)table->info->pri_count[pass];
550 	int nc = 0;
551 
552 	while (val) {
553 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
554 		pri >>= XFRM_SHIFT;
555 		val >>= XFRM_SHIFT;
556 		p++;
557 		nc++;
558 	}
559 	return (nc);
560 }
561 
562 size_t
563 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
564     size_t room)
565 {
566 	int		pri;
567 	int		len;
568 	const wchar_t	*t;
569 	wchar_t		*tr = NULL;
570 	int		direc;
571 	int		pass;
572 	const int32_t 	*state;
573 	size_t		want = 0;
574 	size_t		need = 0;
575 	int		b;
576 	uint8_t		buf[XFRM_BYTES];
577 	int		ndir = table->info->directive_count;
578 
579 	assert(src);
580 
581 	for (pass = 0; pass <= ndir; pass++) {
582 
583 		state = NULL;
584 
585 		if (pass != 0) {
586 			/* insert level separator from the previous pass */
587 			if (room) {
588 				*xf++ = XFRM_SEP;
589 				room--;
590 			}
591 			want++;
592 		}
593 
594 		/* special pass for undefined */
595 		if (pass == ndir) {
596 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
597 		} else {
598 			direc = table->info->directive[pass];
599 		}
600 
601 		t = src;
602 
603 		if (direc & DIRECTIVE_BACKWARD) {
604 			wchar_t *bp, *fp, c;
605 			free(tr);
606 			if ((tr = wcsdup(t)) == NULL) {
607 				errno = ENOMEM;
608 				goto fail;
609 			}
610 			bp = tr;
611 			fp = tr + wcslen(tr) - 1;
612 			while (bp < fp) {
613 				c = *bp;
614 				*bp++ = *fp;
615 				*fp-- = c;
616 			}
617 			t = (const wchar_t *)tr;
618 		}
619 
620 		if (direc & DIRECTIVE_POSITION) {
621 			while (*t || state) {
622 
623 				_collate_lookup(table, t, &len, &pri, pass, &state);
624 				t += len;
625 				if (pri <= 0) {
626 					if (pri < 0) {
627 						errno = EINVAL;
628 						goto fail;
629 					}
630 					state = NULL;
631 					pri = COLLATE_MAX_PRIORITY;
632 				}
633 
634 				b = xfrm(table, buf, pri, pass);
635 				want += b;
636 				if (room) {
637 					while (b) {
638 						b--;
639 						if (room) {
640 							*xf++ = buf[b];
641 							room--;
642 						}
643 					}
644 				}
645 				need = want;
646 			}
647 		} else {
648 			while (*t || state) {
649 				_collate_lookup(table, t, &len, &pri, pass, &state);
650 				t += len;
651 				if (pri <= 0) {
652 					if (pri < 0) {
653 						errno = EINVAL;
654 						goto fail;
655 					}
656 					state = NULL;
657 					continue;
658 				}
659 
660 				b = xfrm(table, buf, pri, pass);
661 				want += b;
662 				if (room) {
663 
664 					while (b) {
665 						b--;
666 						if (room) {
667 							*xf++ = buf[b];
668 							room--;
669 						}
670 					}
671 				}
672 				need = want;
673 			}
674 		}
675 	}
676 	free(tr);
677 	return (need);
678 
679 fail:
680 	free(tr);
681 	return ((size_t)(-1));
682 }
683 
684 /*
685  * __collate_equiv_value returns the primary collation value for the given
686  * collating symbol specified by str and len.  Zero or negative is returned
687  * if the collating symbol was not found.  This function is used by bracket
688  * code in the TRE regex library.
689  */
690 int
691 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
692 {
693 	int32_t e;
694 
695 	if (len < 1 || len >= COLLATE_STR_LEN)
696 		return (-1);
697 
698 	FIX_LOCALE(locale);
699 	struct xlocale_collate *table =
700 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
701 
702 	if (table->__collate_load_error)
703 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
704 
705 	if (len == 1) {
706 		e = -1;
707 		if (*str <= UCHAR_MAX)
708 			e = table->char_pri_table[*str].pri[0];
709 		else if (table->info->large_count > 0) {
710 			collate_large_t *match_large;
711 			match_large = largesearch(table, *str);
712 			if (match_large)
713 				e = match_large->pri.pri[0];
714 		}
715 		if (e == 0)
716 			return (1);
717 		return (e > 0 ? e : 0);
718 	}
719 	if (table->info->chain_count > 0) {
720 		wchar_t name[COLLATE_STR_LEN];
721 		collate_chain_t *match_chain;
722 		int clen;
723 
724 		wcsncpy (name, str, len);
725 		name[len] = 0;
726 		match_chain = chainsearch(table, name, &clen);
727 		if (match_chain) {
728 			e = match_chain->pri[0];
729 			if (e == 0)
730 				return (1);
731 			return (e < 0 ? -e : e);
732 		}
733 	}
734 	return (0);
735 }
736