xref: /freebsd/lib/libc/locale/collate.c (revision 4f0c9b76cf75724ef0b9c59bb8c182be24361d7c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *		at Electronni Visti IA, Kiev, Ukraine.
8  *			All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  *
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "namespace.h"
43 
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47 
48 #include <assert.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include "un-namespace.h"
57 
58 #include "collate.h"
59 #include "setlocale.h"
60 #include "ldpart.h"
61 #include "libc_private.h"
62 
63 struct xlocale_collate __xlocale_global_collate = {
64 	{{0}, "C"}, 1, 0, 0, 0
65 };
66 
67 struct xlocale_collate __xlocale_C_collate = {
68 	{{0}, "C"}, 1, 0, 0, 0
69 };
70 
71 static int
72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
73 
74 static void
75 destruct_collate(void *t)
76 {
77 	struct xlocale_collate *table = t;
78 	if (table->map && (table->maplen > 0)) {
79 		(void) munmap(table->map, table->maplen);
80 	}
81 	free(t);
82 }
83 
84 void *
85 __collate_load(const char *encoding, __unused locale_t unused)
86 {
87 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
88 	    strncmp(encoding, "C.", 2) == 0) {
89 		return (&__xlocale_C_collate);
90 	}
91 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate),
92 	    1);
93 	if (table == NULL)
94 		return (NULL);
95 	table->header.header.destructor = destruct_collate;
96 
97 	/*
98 	 * FIXME: Make sure that _LDP_CACHE is never returned.  We
99 	 * should be doing the caching outside of this section.
100 	 */
101 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
102 		xlocale_release(table);
103 		return (NULL);
104 	}
105 	return (table);
106 }
107 
108 /**
109  * Load the collation tables for the specified encoding into the global table.
110  */
111 int
112 __collate_load_tables(const char *encoding)
113 {
114 
115 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
116 }
117 
118 static int
119 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
120 {
121 	int i, chains, z;
122 	char *buf;
123 	char *TMP;
124 	char *map;
125 	collate_info_t *info;
126 	struct stat sbuf;
127 	int fd;
128 
129 	table->__collate_load_error = 1;
130 
131 	/* 'encoding' must be already checked. */
132 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
133 	    strncmp(encoding, "C.", 2) == 0) {
134 		return (_LDP_CACHE);
135 	}
136 
137 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
138 		return (_LDP_ERROR);
139 
140 	if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) {
141 		free(buf);
142 		return (_LDP_ERROR);
143 	}
144 	free(buf);
145 	if (_fstat(fd, &sbuf) < 0) {
146 		(void) _close(fd);
147 		return (_LDP_ERROR);
148 	}
149 	if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN +
150 			    XLOCALE_DEF_VERSION_LEN +
151 			    sizeof (info))) {
152 		(void) _close(fd);
153 		errno = EINVAL;
154 		return (_LDP_ERROR);
155 	}
156 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
157 	(void) _close(fd);
158 	if ((TMP = map) == MAP_FAILED) {
159 		return (_LDP_ERROR);
160 	}
161 
162 	if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) {
163 		(void) munmap(map, sbuf.st_size);
164 		errno = EINVAL;
165 		return (_LDP_ERROR);
166 	}
167 	TMP += COLLATE_FMT_VERSION_LEN;
168 	strlcat(table->header.version, TMP, sizeof (table->header.version));
169 	TMP += XLOCALE_DEF_VERSION_LEN;
170 
171 	info = (void *)TMP;
172 	TMP += sizeof (*info);
173 
174 	if ((info->directive_count < 1) ||
175 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
176 	    ((chains = info->chain_count) < 0)) {
177 		(void) munmap(map, sbuf.st_size);
178 		errno = EINVAL;
179 		return (_LDP_ERROR);
180 	}
181 
182 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
183 	    (sizeof (collate_chain_t) * chains) +
184 	    (sizeof (collate_large_t) * info->large_count);
185 	for (z = 0; z < info->directive_count; z++) {
186 		i += sizeof (collate_subst_t) * info->subst_count[z];
187 	}
188 	if (i != (sbuf.st_size - (TMP - map))) {
189 		(void) munmap(map, sbuf.st_size);
190 		errno = EINVAL;
191 		return (_LDP_ERROR);
192 	}
193 
194 	if (table->map && (table->maplen > 0)) {
195 		(void) munmap(table->map, table->maplen);
196 	}
197 	table->map = map;
198 	table->maplen = sbuf.st_size;
199 	table->info = info;
200 	table->char_pri_table = (void *)TMP;
201 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
202 
203 	for (z = 0; z < info->directive_count; z++) {
204 		if (info->subst_count[z] > 0) {
205 			table->subst_table[z] = (void *)TMP;
206 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
207 		} else {
208 			table->subst_table[z] = NULL;
209 		}
210 	}
211 
212 	if (chains > 0) {
213 		table->chain_pri_table = (void *)TMP;
214 		TMP += chains * sizeof (collate_chain_t);
215 	} else
216 		table->chain_pri_table = NULL;
217 	if (info->large_count > 0)
218 		table->large_pri_table = (void *)TMP;
219 	else
220 		table->large_pri_table = NULL;
221 
222 	table->__collate_load_error = 0;
223 	return (_LDP_LOADED);
224 }
225 
226 static const int32_t *
227 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
228 {
229 	const collate_subst_t *p;
230 	int n = table->info->subst_count[pass];
231 
232 	if (n == 0)
233 		return (NULL);
234 
235 	if (pass >= table->info->directive_count)
236 		return (NULL);
237 
238 	if (!(key & COLLATE_SUBST_PRIORITY))
239 		return (NULL);
240 
241 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
242 	assert(p->key == key);
243 
244 	return (p->pri);
245 }
246 
247 static collate_chain_t *
248 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
249 {
250 	int low = 0;
251 	int high = table->info->chain_count - 1;
252 	int next, compar, l;
253 	collate_chain_t *p;
254 	collate_chain_t *tab = table->chain_pri_table;
255 
256 	if (high < 0)
257 		return (NULL);
258 
259 	while (low <= high) {
260 		next = (low + high) / 2;
261 		p = tab + next;
262 		compar = *key - *p->str;
263 		if (compar == 0) {
264 			l = wcsnlen(p->str, COLLATE_STR_LEN);
265 			compar = wcsncmp(key, p->str, l);
266 			if (compar == 0) {
267 				*len = l;
268 				return (p);
269 			}
270 		}
271 		if (compar > 0)
272 			low = next + 1;
273 		else
274 			high = next - 1;
275 	}
276 	return (NULL);
277 }
278 
279 static collate_large_t *
280 largesearch(struct xlocale_collate *table, const wchar_t key)
281 {
282 	int low = 0;
283 	int high = table->info->large_count - 1;
284 	int next, compar;
285 	collate_large_t *p;
286 	collate_large_t *tab = table->large_pri_table;
287 
288 	if (high < 0)
289 		return (NULL);
290 
291 	while (low <= high) {
292 		next = (low + high) / 2;
293 		p = tab + next;
294 		compar = key - p->val;
295 		if (compar == 0)
296 			return (p);
297 		if (compar > 0)
298 			low = next + 1;
299 		else
300 			high = next - 1;
301 	}
302 	return (NULL);
303 }
304 
305 void
306 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
307     int *pri, int which, const int **state)
308 {
309 	collate_chain_t *p2;
310 	collate_large_t *match;
311 	int p, l;
312 	const int *sptr;
313 
314 	/*
315 	 * If this is the "last" pass for the UNDEFINED, then
316 	 * we just return the priority itself.
317 	 */
318 	if (which >= table->info->directive_count) {
319 		*pri = *t;
320 		*len = 1;
321 		*state = NULL;
322 		return;
323 	}
324 
325 	/*
326 	 * If we have remaining substitution data from a previous
327 	 * call, consume it first.
328 	 */
329 	if ((sptr = *state) != NULL) {
330 		*pri = *sptr;
331 		sptr++;
332 		if ((sptr == *state) || (sptr == NULL))
333 			*state = NULL;
334 		else
335 			*state = sptr;
336 		*len = 0;
337 		return;
338 	}
339 
340 	/* No active substitutions */
341 	*len = 1;
342 
343 	/*
344 	 * Check for composites such as diphthongs that collate as a
345 	 * single element (aka chains or collating-elements).
346 	 */
347 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
348 	    ((p = p2->pri[which]) >= 0)) {
349 
350 		*len = l;
351 		*pri = p;
352 
353 	} else if (*t <= UCHAR_MAX) {
354 
355 		/*
356 		 * Character is a small (8-bit) character.
357 		 * We just look these up directly for speed.
358 		 */
359 		*pri = table->char_pri_table[*t].pri[which];
360 
361 	} else if ((table->info->large_count > 0) &&
362 	    ((match = largesearch(table, *t)) != NULL)) {
363 
364 		/*
365 		 * Character was found in the extended table.
366 		 */
367 		*pri = match->pri.pri[which];
368 
369 	} else {
370 		/*
371 		 * Character lacks a specific definition.
372 		 */
373 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
374 			/* Mask off sign bit to prevent ordering confusion. */
375 			*pri = (*t & COLLATE_MAX_PRIORITY);
376 		} else {
377 			*pri = table->info->undef_pri[which];
378 		}
379 		/* No substitutions for undefined characters! */
380 		return;
381 	}
382 
383 	/*
384 	 * Try substituting (expanding) the character.  We are
385 	 * currently doing this *after* the chain compression.  I
386 	 * think it should not matter, but this way might be slightly
387 	 * faster.
388 	 *
389 	 * We do this after the priority search, as this will help us
390 	 * to identify a single key value.  In order for this to work,
391 	 * its important that the priority assigned to a given element
392 	 * to be substituted be unique for that level.  The localedef
393 	 * code ensures this for us.
394 	 */
395 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
396 		if ((*pri = *sptr) > 0) {
397 			sptr++;
398 			*state = *sptr ? sptr : NULL;
399 		}
400 	}
401 
402 }
403 
404 /*
405  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
406  * NOT NULL terminate.  That is left to the caller.
407  */
408 size_t
409 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
410     size_t room)
411 {
412 	int		pri;
413 	int		len;
414 	const wchar_t	*t;
415 	wchar_t		*tr = NULL;
416 	int		direc;
417 	int		pass;
418 	const int32_t 	*state;
419 	size_t		want = 0;
420 	size_t		need = 0;
421 	int		ndir = table->info->directive_count;
422 
423 	assert(src);
424 
425 	for (pass = 0; pass <= ndir; pass++) {
426 
427 		state = NULL;
428 
429 		if (pass != 0) {
430 			/* insert level separator from the previous pass */
431 			if (room) {
432 				*xf++ = 1;
433 				room--;
434 			}
435 			want++;
436 		}
437 
438 		/* special pass for undefined */
439 		if (pass == ndir) {
440 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
441 		} else {
442 			direc = table->info->directive[pass];
443 		}
444 
445 		t = src;
446 
447 		if (direc & DIRECTIVE_BACKWARD) {
448 			wchar_t *bp, *fp, c;
449 			free(tr);
450 			if ((tr = wcsdup(t)) == NULL) {
451 				errno = ENOMEM;
452 				goto fail;
453 			}
454 			bp = tr;
455 			fp = tr + wcslen(tr) - 1;
456 			while (bp < fp) {
457 				c = *bp;
458 				*bp++ = *fp;
459 				*fp-- = c;
460 			}
461 			t = (const wchar_t *)tr;
462 		}
463 
464 		if (direc & DIRECTIVE_POSITION) {
465 			while (*t || state) {
466 				_collate_lookup(table, t, &len, &pri, pass, &state);
467 				t += len;
468 				if (pri <= 0) {
469 					if (pri < 0) {
470 						errno = EINVAL;
471 						goto fail;
472 					}
473 					state = NULL;
474 					pri = COLLATE_MAX_PRIORITY;
475 				}
476 				if (room) {
477 					*xf++ = pri;
478 					room--;
479 				}
480 				want++;
481 				need = want;
482 			}
483 		} else {
484 			while (*t || state) {
485 				_collate_lookup(table, t, &len, &pri, pass, &state);
486 				t += len;
487 				if (pri <= 0) {
488 					if (pri < 0) {
489 						errno = EINVAL;
490 						goto fail;
491 					}
492 					state = NULL;
493 					continue;
494 				}
495 				if (room) {
496 					*xf++ = pri;
497 					room--;
498 				}
499 				want++;
500 				need = want;
501 			}
502 		}
503 	}
504 	free(tr);
505 	return (need);
506 
507 fail:
508 	free(tr);
509 	return ((size_t)(-1));
510 }
511 
512 /*
513  * In the non-POSIX case, we transform each character into a string of
514  * characters representing the character's priority.  Since char is usually
515  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
516  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
517  * bits per byte.
518  *
519  * It turns out that we sometimes have real priorities that are
520  * 31-bits wide.  (But: be careful using priorities where the high
521  * order bit is set -- i.e. the priority is negative.  The sort order
522  * may be surprising!)
523  *
524  * TODO: This would be a good area to optimize somewhat.  It turns out
525  * that real prioririties *except for the last UNDEFINED pass* are generally
526  * very small.  We need the localedef code to precalculate the max
527  * priority for us, and ideally also give us a mask, and then we could
528  * severely limit what we expand to.
529  */
530 #define	XFRM_BYTES	6
531 #define	XFRM_OFFSET	('0')	/* make all printable characters */
532 #define	XFRM_SHIFT	6
533 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
534 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
535 
536 static int
537 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
538 {
539 	/* we use unsigned to ensure zero fill on right shift */
540 	uint32_t val = (uint32_t)table->info->pri_count[pass];
541 	int nc = 0;
542 
543 	while (val) {
544 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
545 		pri >>= XFRM_SHIFT;
546 		val >>= XFRM_SHIFT;
547 		p++;
548 		nc++;
549 	}
550 	return (nc);
551 }
552 
553 size_t
554 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
555     size_t room)
556 {
557 	int		pri;
558 	int		len;
559 	const wchar_t	*t;
560 	wchar_t		*tr = NULL;
561 	int		direc;
562 	int		pass;
563 	const int32_t 	*state;
564 	size_t		want = 0;
565 	size_t		need = 0;
566 	int		b;
567 	uint8_t		buf[XFRM_BYTES];
568 	int		ndir = table->info->directive_count;
569 
570 	assert(src);
571 
572 	for (pass = 0; pass <= ndir; pass++) {
573 
574 		state = NULL;
575 
576 		if (pass != 0) {
577 			/* insert level separator from the previous pass */
578 			if (room) {
579 				*xf++ = XFRM_SEP;
580 				room--;
581 			}
582 			want++;
583 		}
584 
585 		/* special pass for undefined */
586 		if (pass == ndir) {
587 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
588 		} else {
589 			direc = table->info->directive[pass];
590 		}
591 
592 		t = src;
593 
594 		if (direc & DIRECTIVE_BACKWARD) {
595 			wchar_t *bp, *fp, c;
596 			free(tr);
597 			if ((tr = wcsdup(t)) == NULL) {
598 				errno = ENOMEM;
599 				goto fail;
600 			}
601 			bp = tr;
602 			fp = tr + wcslen(tr) - 1;
603 			while (bp < fp) {
604 				c = *bp;
605 				*bp++ = *fp;
606 				*fp-- = c;
607 			}
608 			t = (const wchar_t *)tr;
609 		}
610 
611 		if (direc & DIRECTIVE_POSITION) {
612 			while (*t || state) {
613 
614 				_collate_lookup(table, t, &len, &pri, pass, &state);
615 				t += len;
616 				if (pri <= 0) {
617 					if (pri < 0) {
618 						errno = EINVAL;
619 						goto fail;
620 					}
621 					state = NULL;
622 					pri = COLLATE_MAX_PRIORITY;
623 				}
624 
625 				b = xfrm(table, buf, pri, pass);
626 				want += b;
627 				if (room) {
628 					while (b) {
629 						b--;
630 						if (room) {
631 							*xf++ = buf[b];
632 							room--;
633 						}
634 					}
635 				}
636 				need = want;
637 			}
638 		} else {
639 			while (*t || state) {
640 				_collate_lookup(table, t, &len, &pri, pass, &state);
641 				t += len;
642 				if (pri <= 0) {
643 					if (pri < 0) {
644 						errno = EINVAL;
645 						goto fail;
646 					}
647 					state = NULL;
648 					continue;
649 				}
650 
651 				b = xfrm(table, buf, pri, pass);
652 				want += b;
653 				if (room) {
654 
655 					while (b) {
656 						b--;
657 						if (room) {
658 							*xf++ = buf[b];
659 							room--;
660 						}
661 					}
662 				}
663 				need = want;
664 			}
665 		}
666 	}
667 	free(tr);
668 	return (need);
669 
670 fail:
671 	free(tr);
672 	return ((size_t)(-1));
673 }
674 
675 /*
676  * __collate_equiv_value returns the primary collation value for the given
677  * collating symbol specified by str and len.  Zero or negative is returned
678  * if the collating symbol was not found.  This function is used by bracket
679  * code in the TRE regex library.
680  */
681 int
682 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
683 {
684 	int32_t e;
685 
686 	if (len < 1 || len >= COLLATE_STR_LEN)
687 		return (-1);
688 
689 	FIX_LOCALE(locale);
690 	struct xlocale_collate *table =
691 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
692 
693 	if (table->__collate_load_error)
694 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
695 
696 	if (len == 1) {
697 		e = -1;
698 		if (*str <= UCHAR_MAX)
699 			e = table->char_pri_table[*str].pri[0];
700 		else if (table->info->large_count > 0) {
701 			collate_large_t *match_large;
702 			match_large = largesearch(table, *str);
703 			if (match_large)
704 				e = match_large->pri.pri[0];
705 		}
706 		if (e == 0)
707 			return (1);
708 		return (e > 0 ? e : 0);
709 	}
710 	if (table->info->chain_count > 0) {
711 		wchar_t name[COLLATE_STR_LEN];
712 		collate_chain_t *match_chain;
713 		int clen;
714 
715 		wcsncpy (name, str, len);
716 		name[len] = 0;
717 		match_chain = chainsearch(table, name, &clen);
718 		if (match_chain) {
719 			e = match_chain->pri[0];
720 			if (e == 0)
721 				return (1);
722 			return (e < 0 ? -e : e);
723 		}
724 	}
725 	return (0);
726 }
727