xref: /freebsd/lib/libc/locale/collate.c (revision bdafb02fcb88389fd1ab684cfe734cb429d35618)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *		at Electronni Visti IA, Kiev, Ukraine.
8  *			All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  * All rights reserved.
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "namespace.h"
43 
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47 
48 #include <assert.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include "un-namespace.h"
57 
58 #include "endian.h"
59 #include "collate.h"
60 #include "setlocale.h"
61 #include "ldpart.h"
62 #include "libc_private.h"
63 
64 struct xlocale_collate __xlocale_global_collate = {
65 	{{0}, "C"}, 1, 0, 0, 0
66 };
67 
68 struct xlocale_collate __xlocale_C_collate = {
69 	{{0}, "C"}, 1, 0, 0, 0
70 };
71 
72 static int
73 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
74 
75 static void
76 destruct_collate(void *t)
77 {
78 	struct xlocale_collate *table = t;
79 	if (table->map && (table->maplen > 0)) {
80 		(void) munmap(table->map, table->maplen);
81 	}
82 	free(t);
83 }
84 
85 void *
86 __collate_load(const char *encoding, __unused locale_t unused)
87 {
88 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
89 		return &__xlocale_C_collate;
90 	}
91 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
92 	table->header.header.destructor = destruct_collate;
93 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
94 	// the caching outside of this section
95 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
96 		xlocale_release(table);
97 		return NULL;
98 	}
99 	return table;
100 }
101 
102 /**
103  * Load the collation tables for the specified encoding into the global table.
104  */
105 int
106 __collate_load_tables(const char *encoding)
107 {
108 
109 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
110 }
111 
112 int
113 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
114 {
115 	int i, chains, z;
116 	char *buf;
117 	char *TMP;
118 	char *map;
119 	collate_info_t *info;
120 	struct stat sbuf;
121 	int fd;
122 
123 	table->__collate_load_error = 1;
124 
125 	/* 'encoding' must be already checked. */
126 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
127 		return (_LDP_CACHE);
128 	}
129 
130 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
131 		return (_LDP_ERROR);
132 
133 	if ((fd = _open(buf, O_RDONLY)) < 0) {
134 		free(buf);
135 		return (_LDP_ERROR);
136 	}
137 	free(buf);
138 	if (_fstat(fd, &sbuf) < 0) {
139 		(void) _close(fd);
140 		return (_LDP_ERROR);
141 	}
142 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
143 		(void) _close(fd);
144 		errno = EINVAL;
145 		return (_LDP_ERROR);
146 	}
147 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
148 	(void) _close(fd);
149 	if ((TMP = map) == NULL) {
150 		return (_LDP_ERROR);
151 	}
152 
153 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
154 		(void) munmap(map, sbuf.st_size);
155 		errno = EINVAL;
156 		return (_LDP_ERROR);
157 	}
158 	TMP += COLLATE_STR_LEN;
159 
160 	info = (void *)TMP;
161 	TMP += sizeof (*info);
162 
163 	if ((info->directive_count < 1) ||
164 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
165 	    ((chains = BSWAP(info->chain_count)) < 0)) {
166 		(void) munmap(map, sbuf.st_size);
167 		errno = EINVAL;
168 		return (_LDP_ERROR);
169 	}
170 
171 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
172 	    (sizeof (collate_chain_t) * chains) +
173 	    (sizeof (collate_large_t) * BSWAP(info->large_count));
174 	for (z = 0; z < info->directive_count; z++) {
175 		i += sizeof (collate_subst_t) * BSWAP(info->subst_count[z]);
176 	}
177 	if (i != (sbuf.st_size - (TMP - map))) {
178 		(void) munmap(map, sbuf.st_size);
179 		errno = EINVAL;
180 		return (_LDP_ERROR);
181 	}
182 
183 	table->info = info;
184 	table->char_pri_table = (void *)TMP;
185 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
186 
187 	for (z = 0; z < info->directive_count; z++) {
188 		if (BSWAP(info->subst_count[z]) > 0) {
189 			table->subst_table[z] = (void *)TMP;
190 			TMP += BSWAP(info->subst_count[z]) * sizeof (collate_subst_t);
191 		} else {
192 			table->subst_table[z] = NULL;
193 		}
194 	}
195 
196 	if (chains > 0) {
197 		table->chain_pri_table = (void *)TMP;
198 		TMP += chains * sizeof (collate_chain_t);
199 	} else
200 		table->chain_pri_table = NULL;
201 	if (BSWAP(info->large_count) > 0)
202 		table->large_pri_table = (void *)TMP;
203 	else
204 		table->large_pri_table = NULL;
205 
206 	table->__collate_load_error = 0;
207 	return (_LDP_LOADED);
208 }
209 
210 static const int32_t *
211 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
212 {
213 	const collate_subst_t *p;
214 	int n = BSWAP(table->info->subst_count[pass]);
215 
216 	if (n == 0)
217 		return (NULL);
218 
219 	if (pass >= table->info->directive_count)
220 		return (NULL);
221 
222 	if (!(key & COLLATE_SUBST_PRIORITY))
223 		return (NULL);
224 
225 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
226 	assert(BSWAP(p->key) == key);
227 
228 	return (p->pri);
229 }
230 
231 static collate_chain_t *
232 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
233 {
234 	int low = 0;
235 	int high = BSWAP(table->info->chain_count) - 1;
236 	int next, compar, l;
237 	collate_chain_t *p;
238 	collate_chain_t *tab = table->chain_pri_table;
239 
240 	if (high < 0)
241 		return (NULL);
242 
243 	while (low <= high) {
244 		next = (low + high) / 2;
245 		p = tab + next;
246 		compar = *key - le16toh(*p->str);
247 		if (compar == 0) {
248 			l = wcsnlen(p->str, COLLATE_STR_LEN);
249 			compar = wcsncmp(key, p->str, l);
250 			if (compar == 0) {
251 				*len = l;
252 				return (p);
253 			}
254 		}
255 		if (compar > 0)
256 			low = next + 1;
257 		else
258 			high = next - 1;
259 	}
260 	return (NULL);
261 }
262 
263 static collate_large_t *
264 largesearch(struct xlocale_collate *table, const wchar_t key)
265 {
266 	int low = 0;
267 	int high = BSWAP(table->info->large_count) - 1;
268 	int next, compar;
269 	collate_large_t *p;
270 	collate_large_t *tab = table->large_pri_table;
271 
272 	if (high < 0)
273 		return (NULL);
274 
275 	while (low <= high) {
276 		next = (low + high) / 2;
277 		p = tab + next;
278 		compar = key - BSWAP(p->val);
279 		if (compar == 0)
280 			return (p);
281 		if (compar > 0)
282 			low = next + 1;
283 		else
284 			high = next - 1;
285 	}
286 	return (NULL);
287 }
288 
289 void
290 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
291     int *pri, int which, const int **state)
292 {
293 	collate_chain_t *p2;
294 	collate_large_t *match;
295 	int p, l;
296 	const int *sptr;
297 
298 	/*
299 	 * If this is the "last" pass for the UNDEFINED, then
300 	 * we just return the priority itself.
301 	 */
302 	if (which >= table->info->directive_count) {
303 		*pri = *t;
304 		*len = 1;
305 		*state = NULL;
306 		return;
307 	}
308 
309 	/*
310 	 * If we have remaining substitution data from a previous
311 	 * call, consume it first.
312 	 */
313 	if ((sptr = *state) != NULL) {
314 		*pri = *sptr;
315 		sptr++;
316 		if ((sptr == *state) || (sptr == NULL))
317 			*state = NULL;
318 		else
319 			*state = sptr;
320 		*len = 0;
321 		return;
322 	}
323 
324 	/* No active substitutions */
325 	*len = 1;
326 
327 	/*
328 	 * Check for composites such as diphthongs that collate as a
329 	 * single element (aka chains or collating-elements).
330 	 */
331 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
332 	    ((p = p2->pri[which]) >= 0)) {
333 
334 		*len = l;
335 		*pri = p;
336 
337 	} else if (*t <= UCHAR_MAX) {
338 
339 		/*
340 		 * Character is a small (8-bit) character.
341 		 * We just look these up directly for speed.
342 		 */
343 		*pri = BSWAP(table->char_pri_table[*t].pri[which]);
344 
345 	} else if ((BSWAP(table->info->large_count) > 0) &&
346 	    ((match = largesearch(table, *t)) != NULL)) {
347 
348 		/*
349 		 * Character was found in the extended table.
350 		 */
351 		*pri = BSWAP(match->pri.pri[which]);
352 
353 	} else {
354 		/*
355 		 * Character lacks a specific definition.
356 		 */
357 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
358 			/* Mask off sign bit to prevent ordering confusion. */
359 			*pri = (*t & COLLATE_MAX_PRIORITY);
360 		} else {
361 			*pri = BSWAP(table->info->undef_pri[which]);
362 		}
363 		/* No substitutions for undefined characters! */
364 		return;
365 	}
366 
367 	/*
368 	 * Try substituting (expanding) the character.  We are
369 	 * currently doing this *after* the chain compression.  I
370 	 * think it should not matter, but this way might be slightly
371 	 * faster.
372 	 *
373 	 * We do this after the priority search, as this will help us
374 	 * to identify a single key value.  In order for this to work,
375 	 * its important that the priority assigned to a given element
376 	 * to be substituted be unique for that level.  The localedef
377 	 * code ensures this for us.
378 	 */
379 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
380 		if ((*pri = BSWAP(*sptr)) > 0) {
381 			sptr++;
382 			*state = BSWAP(*sptr) ? sptr : NULL;
383 		}
384 	}
385 
386 }
387 
388 /*
389  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
390  * NOT NULL terminate.  That is left to the caller.
391  */
392 size_t
393 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
394     size_t room)
395 {
396 	int		pri;
397 	int		len;
398 	const wchar_t	*t;
399 	wchar_t		*tr = NULL;
400 	int		direc;
401 	int		pass;
402 	const int32_t 	*state;
403 	size_t		want = 0;
404 	size_t		need = 0;
405 	int		ndir = table->info->directive_count;
406 
407 	assert(src);
408 
409 	for (pass = 0; pass <= ndir; pass++) {
410 
411 		state = NULL;
412 
413 		if (pass != 0) {
414 			/* insert level separator from the previous pass */
415 			if (room) {
416 				*xf++ = 1;
417 				room--;
418 			}
419 			want++;
420 		}
421 
422 		/* special pass for undefined */
423 		if (pass == ndir) {
424 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
425 		} else {
426 			direc = table->info->directive[pass];
427 		}
428 
429 		t = src;
430 
431 		if (direc & DIRECTIVE_BACKWARD) {
432 			wchar_t *bp, *fp, c;
433 			free(tr);
434 			if ((tr = wcsdup(t)) == NULL) {
435 				errno = ENOMEM;
436 				goto fail;
437 			}
438 			bp = tr;
439 			fp = tr + wcslen(tr) - 1;
440 			while (bp < fp) {
441 				c = *bp;
442 				*bp++ = *fp;
443 				*fp-- = c;
444 			}
445 			t = (const wchar_t *)tr;
446 		}
447 
448 		if (direc & DIRECTIVE_POSITION) {
449 			while (*t || state) {
450 				_collate_lookup(table, t, &len, &pri, pass, &state);
451 				t += len;
452 				if (pri <= 0) {
453 					if (pri < 0) {
454 						errno = EINVAL;
455 						goto fail;
456 					}
457 					state = NULL;
458 					pri = COLLATE_MAX_PRIORITY;
459 				}
460 				if (room) {
461 					*xf++ = pri;
462 					room--;
463 				}
464 				want++;
465 				need = want;
466 			}
467 		} else {
468 			while (*t || state) {
469 				_collate_lookup(table, t, &len, &pri, pass, &state);
470 				t += len;
471 				if (pri <= 0) {
472 					if (pri < 0) {
473 						errno = EINVAL;
474 						goto fail;
475 					}
476 					state = NULL;
477 					continue;
478 				}
479 				if (room) {
480 					*xf++ = pri;
481 					room--;
482 				}
483 				want++;
484 				need = want;
485 			}
486 		}
487 	}
488 	free(tr);
489 	return (need);
490 
491 fail:
492 	free(tr);
493 	return ((size_t)(-1));
494 }
495 
496 /*
497  * In the non-POSIX case, we transform each character into a string of
498  * characters representing the character's priority.  Since char is usually
499  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
500  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
501  * bits per byte.
502  *
503  * It turns out that we sometimes have real priorities that are
504  * 31-bits wide.  (But: be careful using priorities where the high
505  * order bit is set -- i.e. the priority is negative.  The sort order
506  * may be surprising!)
507  *
508  * TODO: This would be a good area to optimize somewhat.  It turns out
509  * that real prioririties *except for the last UNDEFINED pass* are generally
510  * very small.  We need the localedef code to precalculate the max
511  * priority for us, and ideally also give us a mask, and then we could
512  * severely limit what we expand to.
513  */
514 #define	XFRM_BYTES	6
515 #define	XFRM_OFFSET	('0')	/* make all printable characters */
516 #define	XFRM_SHIFT	6
517 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
518 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
519 
520 static int
521 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
522 {
523 	/* we use unsigned to ensure zero fill on right shift */
524 	uint32_t val = BSWAP((uint32_t)table->info->pri_count[pass]);
525 	int nc = 0;
526 
527 	while (val) {
528 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
529 		pri >>= XFRM_SHIFT;
530 		val >>= XFRM_SHIFT;
531 		p++;
532 		nc++;
533 	}
534 	return (nc);
535 }
536 
537 size_t
538 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
539     size_t room)
540 {
541 	int		pri;
542 	int		len;
543 	const wchar_t	*t;
544 	wchar_t		*tr = NULL;
545 	int		direc;
546 	int		pass;
547 	const int32_t 	*state;
548 	size_t		want = 0;
549 	size_t		need = 0;
550 	int		b;
551 	uint8_t		buf[XFRM_BYTES];
552 	int		ndir = table->info->directive_count;
553 
554 	assert(src);
555 
556 	for (pass = 0; pass <= ndir; pass++) {
557 
558 		state = NULL;
559 
560 		if (pass != 0) {
561 			/* insert level separator from the previous pass */
562 			if (room) {
563 				*xf++ = XFRM_SEP;
564 				room--;
565 			}
566 			want++;
567 		}
568 
569 		/* special pass for undefined */
570 		if (pass == ndir) {
571 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
572 		} else {
573 			direc = table->info->directive[pass];
574 		}
575 
576 		t = src;
577 
578 		if (direc & DIRECTIVE_BACKWARD) {
579 			wchar_t *bp, *fp, c;
580 			free(tr);
581 			if ((tr = wcsdup(t)) == NULL) {
582 				errno = ENOMEM;
583 				goto fail;
584 			}
585 			bp = tr;
586 			fp = tr + wcslen(tr) - 1;
587 			while (bp < fp) {
588 				c = *bp;
589 				*bp++ = *fp;
590 				*fp-- = c;
591 			}
592 			t = (const wchar_t *)tr;
593 		}
594 
595 		if (direc & DIRECTIVE_POSITION) {
596 			while (*t || state) {
597 
598 				_collate_lookup(table, t, &len, &pri, pass, &state);
599 				t += len;
600 				if (pri <= 0) {
601 					if (pri < 0) {
602 						errno = EINVAL;
603 						goto fail;
604 					}
605 					state = NULL;
606 					pri = COLLATE_MAX_PRIORITY;
607 				}
608 
609 				b = xfrm(table, buf, pri, pass);
610 				want += b;
611 				if (room) {
612 					while (b) {
613 						b--;
614 						if (room) {
615 							*xf++ = buf[b];
616 							room--;
617 						}
618 					}
619 				}
620 				need = want;
621 			}
622 		} else {
623 			while (*t || state) {
624 				_collate_lookup(table, t, &len, &pri, pass, &state);
625 				t += len;
626 				if (pri <= 0) {
627 					if (pri < 0) {
628 						errno = EINVAL;
629 						goto fail;
630 					}
631 					state = NULL;
632 					continue;
633 				}
634 
635 				b = xfrm(table, buf, pri, pass);
636 				want += b;
637 				if (room) {
638 
639 					while (b) {
640 						b--;
641 						if (room) {
642 							*xf++ = buf[b];
643 							room--;
644 						}
645 					}
646 				}
647 				need = want;
648 			}
649 		}
650 	}
651 	free(tr);
652 	return (need);
653 
654 fail:
655 	free(tr);
656 	return ((size_t)(-1));
657 }
658 
659 /*
660  * __collate_equiv_value returns the primary collation value for the given
661  * collating symbol specified by str and len.  Zero or negative is returned
662  * if the collating symbol was not found.  This function is used by bracket
663  * code in the TRE regex library.
664  */
665 int
666 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
667 {
668 	int32_t e;
669 
670 	if (len < 1 || len >= COLLATE_STR_LEN)
671 		return (-1);
672 
673 	FIX_LOCALE(locale);
674 	struct xlocale_collate *table =
675 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
676 
677 	if (table->__collate_load_error)
678 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
679 
680 	if (len == 1) {
681 		e = -1;
682 		if (*str <= UCHAR_MAX)
683 			e = table->char_pri_table[*str].pri[0];
684 		else if (BSWAP(table->info->large_count) > 0) {
685 			collate_large_t *match_large;
686 			match_large = largesearch(table, *str);
687 			if (match_large)
688 				e = match_large->pri.pri[0];
689 		}
690 		if (e == 0)
691 			return (1);
692 		return (e > 0 ? e : 0);
693 	}
694 	if (BSWAP(table->info->chain_count) > 0) {
695 		wchar_t name[COLLATE_STR_LEN];
696 		collate_chain_t *match_chain;
697 		int clen;
698 
699 		wcsncpy (name, str, len);
700 		name[len] = 0;
701 		match_chain = chainsearch(table, name, &clen);
702 		if (match_chain) {
703 			e = match_chain->pri[0];
704 			if (e == 0)
705 				return (1);
706 			return (e < 0 ? -e : e);
707 		}
708 	}
709 	return (0);
710 }
711