xref: /freebsd/lib/libc/locale/collate.c (revision cddbc3b40812213ff00041f79174cac0be360a2a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *		at Electronni Visti IA, Kiev, Ukraine.
8  *			All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  * All rights reserved.
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "namespace.h"
43 
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47 
48 #include <assert.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include "un-namespace.h"
57 
58 #include "collate.h"
59 #include "setlocale.h"
60 #include "ldpart.h"
61 #include "libc_private.h"
62 
63 struct xlocale_collate __xlocale_global_collate = {
64 	{{0}, "C"}, 1, 0, 0, 0
65 };
66 
67 struct xlocale_collate __xlocale_C_collate = {
68 	{{0}, "C"}, 1, 0, 0, 0
69 };
70 
71 static int
72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
73 
74 static void
75 destruct_collate(void *t)
76 {
77 	struct xlocale_collate *table = t;
78 	if (table->map && (table->maplen > 0)) {
79 		(void) munmap(table->map, table->maplen);
80 	}
81 	free(t);
82 }
83 
84 void *
85 __collate_load(const char *encoding, __unused locale_t unused)
86 {
87 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
88 	    strncmp(encoding, "C.", 2) == 0) {
89 		return &__xlocale_C_collate;
90 	}
91 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
92 	table->header.header.destructor = destruct_collate;
93 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
94 	// the caching outside of this section
95 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
96 		xlocale_release(table);
97 		return NULL;
98 	}
99 	return table;
100 }
101 
102 /**
103  * Load the collation tables for the specified encoding into the global table.
104  */
105 int
106 __collate_load_tables(const char *encoding)
107 {
108 
109 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
110 }
111 
112 int
113 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
114 {
115 	int i, chains, z;
116 	char *buf;
117 	char *TMP;
118 	char *map;
119 	collate_info_t *info;
120 	struct stat sbuf;
121 	int fd;
122 
123 	table->__collate_load_error = 1;
124 
125 	/* 'encoding' must be already checked. */
126 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
127 	    strncmp(encoding, "C.", 2) == 0) {
128 		return (_LDP_CACHE);
129 	}
130 
131 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
132 		return (_LDP_ERROR);
133 
134 	if ((fd = _open(buf, O_RDONLY)) < 0) {
135 		free(buf);
136 		return (_LDP_ERROR);
137 	}
138 	free(buf);
139 	if (_fstat(fd, &sbuf) < 0) {
140 		(void) _close(fd);
141 		return (_LDP_ERROR);
142 	}
143 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
144 		(void) _close(fd);
145 		errno = EINVAL;
146 		return (_LDP_ERROR);
147 	}
148 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
149 	(void) _close(fd);
150 	if ((TMP = map) == NULL) {
151 		return (_LDP_ERROR);
152 	}
153 
154 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
155 		(void) munmap(map, sbuf.st_size);
156 		errno = EINVAL;
157 		return (_LDP_ERROR);
158 	}
159 	TMP += COLLATE_STR_LEN;
160 
161 	info = (void *)TMP;
162 	TMP += sizeof (*info);
163 
164 	if ((info->directive_count < 1) ||
165 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
166 	    ((chains = info->chain_count) < 0)) {
167 		(void) munmap(map, sbuf.st_size);
168 		errno = EINVAL;
169 		return (_LDP_ERROR);
170 	}
171 
172 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
173 	    (sizeof (collate_chain_t) * chains) +
174 	    (sizeof (collate_large_t) * info->large_count);
175 	for (z = 0; z < info->directive_count; z++) {
176 		i += sizeof (collate_subst_t) * info->subst_count[z];
177 	}
178 	if (i != (sbuf.st_size - (TMP - map))) {
179 		(void) munmap(map, sbuf.st_size);
180 		errno = EINVAL;
181 		return (_LDP_ERROR);
182 	}
183 
184 	table->info = info;
185 	table->char_pri_table = (void *)TMP;
186 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
187 
188 	for (z = 0; z < info->directive_count; z++) {
189 		if (info->subst_count[z] > 0) {
190 			table->subst_table[z] = (void *)TMP;
191 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
192 		} else {
193 			table->subst_table[z] = NULL;
194 		}
195 	}
196 
197 	if (chains > 0) {
198 		table->chain_pri_table = (void *)TMP;
199 		TMP += chains * sizeof (collate_chain_t);
200 	} else
201 		table->chain_pri_table = NULL;
202 	if (info->large_count > 0)
203 		table->large_pri_table = (void *)TMP;
204 	else
205 		table->large_pri_table = NULL;
206 
207 	table->__collate_load_error = 0;
208 	return (_LDP_LOADED);
209 }
210 
211 static const int32_t *
212 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
213 {
214 	const collate_subst_t *p;
215 	int n = table->info->subst_count[pass];
216 
217 	if (n == 0)
218 		return (NULL);
219 
220 	if (pass >= table->info->directive_count)
221 		return (NULL);
222 
223 	if (!(key & COLLATE_SUBST_PRIORITY))
224 		return (NULL);
225 
226 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
227 	assert(p->key == key);
228 
229 	return (p->pri);
230 }
231 
232 static collate_chain_t *
233 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
234 {
235 	int low = 0;
236 	int high = table->info->chain_count - 1;
237 	int next, compar, l;
238 	collate_chain_t *p;
239 	collate_chain_t *tab = table->chain_pri_table;
240 
241 	if (high < 0)
242 		return (NULL);
243 
244 	while (low <= high) {
245 		next = (low + high) / 2;
246 		p = tab + next;
247 		compar = *key - *p->str;
248 		if (compar == 0) {
249 			l = wcsnlen(p->str, COLLATE_STR_LEN);
250 			compar = wcsncmp(key, p->str, l);
251 			if (compar == 0) {
252 				*len = l;
253 				return (p);
254 			}
255 		}
256 		if (compar > 0)
257 			low = next + 1;
258 		else
259 			high = next - 1;
260 	}
261 	return (NULL);
262 }
263 
264 static collate_large_t *
265 largesearch(struct xlocale_collate *table, const wchar_t key)
266 {
267 	int low = 0;
268 	int high = table->info->large_count - 1;
269 	int next, compar;
270 	collate_large_t *p;
271 	collate_large_t *tab = table->large_pri_table;
272 
273 	if (high < 0)
274 		return (NULL);
275 
276 	while (low <= high) {
277 		next = (low + high) / 2;
278 		p = tab + next;
279 		compar = key - p->val;
280 		if (compar == 0)
281 			return (p);
282 		if (compar > 0)
283 			low = next + 1;
284 		else
285 			high = next - 1;
286 	}
287 	return (NULL);
288 }
289 
290 void
291 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
292     int *pri, int which, const int **state)
293 {
294 	collate_chain_t *p2;
295 	collate_large_t *match;
296 	int p, l;
297 	const int *sptr;
298 
299 	/*
300 	 * If this is the "last" pass for the UNDEFINED, then
301 	 * we just return the priority itself.
302 	 */
303 	if (which >= table->info->directive_count) {
304 		*pri = *t;
305 		*len = 1;
306 		*state = NULL;
307 		return;
308 	}
309 
310 	/*
311 	 * If we have remaining substitution data from a previous
312 	 * call, consume it first.
313 	 */
314 	if ((sptr = *state) != NULL) {
315 		*pri = *sptr;
316 		sptr++;
317 		if ((sptr == *state) || (sptr == NULL))
318 			*state = NULL;
319 		else
320 			*state = sptr;
321 		*len = 0;
322 		return;
323 	}
324 
325 	/* No active substitutions */
326 	*len = 1;
327 
328 	/*
329 	 * Check for composites such as diphthongs that collate as a
330 	 * single element (aka chains or collating-elements).
331 	 */
332 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
333 	    ((p = p2->pri[which]) >= 0)) {
334 
335 		*len = l;
336 		*pri = p;
337 
338 	} else if (*t <= UCHAR_MAX) {
339 
340 		/*
341 		 * Character is a small (8-bit) character.
342 		 * We just look these up directly for speed.
343 		 */
344 		*pri = table->char_pri_table[*t].pri[which];
345 
346 	} else if ((table->info->large_count > 0) &&
347 	    ((match = largesearch(table, *t)) != NULL)) {
348 
349 		/*
350 		 * Character was found in the extended table.
351 		 */
352 		*pri = match->pri.pri[which];
353 
354 	} else {
355 		/*
356 		 * Character lacks a specific definition.
357 		 */
358 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
359 			/* Mask off sign bit to prevent ordering confusion. */
360 			*pri = (*t & COLLATE_MAX_PRIORITY);
361 		} else {
362 			*pri = table->info->undef_pri[which];
363 		}
364 		/* No substitutions for undefined characters! */
365 		return;
366 	}
367 
368 	/*
369 	 * Try substituting (expanding) the character.  We are
370 	 * currently doing this *after* the chain compression.  I
371 	 * think it should not matter, but this way might be slightly
372 	 * faster.
373 	 *
374 	 * We do this after the priority search, as this will help us
375 	 * to identify a single key value.  In order for this to work,
376 	 * its important that the priority assigned to a given element
377 	 * to be substituted be unique for that level.  The localedef
378 	 * code ensures this for us.
379 	 */
380 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
381 		if ((*pri = *sptr) > 0) {
382 			sptr++;
383 			*state = *sptr ? sptr : NULL;
384 		}
385 	}
386 
387 }
388 
389 /*
390  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
391  * NOT NULL terminate.  That is left to the caller.
392  */
393 size_t
394 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
395     size_t room)
396 {
397 	int		pri;
398 	int		len;
399 	const wchar_t	*t;
400 	wchar_t		*tr = NULL;
401 	int		direc;
402 	int		pass;
403 	const int32_t 	*state;
404 	size_t		want = 0;
405 	size_t		need = 0;
406 	int		ndir = table->info->directive_count;
407 
408 	assert(src);
409 
410 	for (pass = 0; pass <= ndir; pass++) {
411 
412 		state = NULL;
413 
414 		if (pass != 0) {
415 			/* insert level separator from the previous pass */
416 			if (room) {
417 				*xf++ = 1;
418 				room--;
419 			}
420 			want++;
421 		}
422 
423 		/* special pass for undefined */
424 		if (pass == ndir) {
425 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
426 		} else {
427 			direc = table->info->directive[pass];
428 		}
429 
430 		t = src;
431 
432 		if (direc & DIRECTIVE_BACKWARD) {
433 			wchar_t *bp, *fp, c;
434 			free(tr);
435 			if ((tr = wcsdup(t)) == NULL) {
436 				errno = ENOMEM;
437 				goto fail;
438 			}
439 			bp = tr;
440 			fp = tr + wcslen(tr) - 1;
441 			while (bp < fp) {
442 				c = *bp;
443 				*bp++ = *fp;
444 				*fp-- = c;
445 			}
446 			t = (const wchar_t *)tr;
447 		}
448 
449 		if (direc & DIRECTIVE_POSITION) {
450 			while (*t || state) {
451 				_collate_lookup(table, t, &len, &pri, pass, &state);
452 				t += len;
453 				if (pri <= 0) {
454 					if (pri < 0) {
455 						errno = EINVAL;
456 						goto fail;
457 					}
458 					state = NULL;
459 					pri = COLLATE_MAX_PRIORITY;
460 				}
461 				if (room) {
462 					*xf++ = pri;
463 					room--;
464 				}
465 				want++;
466 				need = want;
467 			}
468 		} else {
469 			while (*t || state) {
470 				_collate_lookup(table, t, &len, &pri, pass, &state);
471 				t += len;
472 				if (pri <= 0) {
473 					if (pri < 0) {
474 						errno = EINVAL;
475 						goto fail;
476 					}
477 					state = NULL;
478 					continue;
479 				}
480 				if (room) {
481 					*xf++ = pri;
482 					room--;
483 				}
484 				want++;
485 				need = want;
486 			}
487 		}
488 	}
489 	free(tr);
490 	return (need);
491 
492 fail:
493 	free(tr);
494 	return ((size_t)(-1));
495 }
496 
497 /*
498  * In the non-POSIX case, we transform each character into a string of
499  * characters representing the character's priority.  Since char is usually
500  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
501  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
502  * bits per byte.
503  *
504  * It turns out that we sometimes have real priorities that are
505  * 31-bits wide.  (But: be careful using priorities where the high
506  * order bit is set -- i.e. the priority is negative.  The sort order
507  * may be surprising!)
508  *
509  * TODO: This would be a good area to optimize somewhat.  It turns out
510  * that real prioririties *except for the last UNDEFINED pass* are generally
511  * very small.  We need the localedef code to precalculate the max
512  * priority for us, and ideally also give us a mask, and then we could
513  * severely limit what we expand to.
514  */
515 #define	XFRM_BYTES	6
516 #define	XFRM_OFFSET	('0')	/* make all printable characters */
517 #define	XFRM_SHIFT	6
518 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
519 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
520 
521 static int
522 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
523 {
524 	/* we use unsigned to ensure zero fill on right shift */
525 	uint32_t val = (uint32_t)table->info->pri_count[pass];
526 	int nc = 0;
527 
528 	while (val) {
529 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
530 		pri >>= XFRM_SHIFT;
531 		val >>= XFRM_SHIFT;
532 		p++;
533 		nc++;
534 	}
535 	return (nc);
536 }
537 
538 size_t
539 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
540     size_t room)
541 {
542 	int		pri;
543 	int		len;
544 	const wchar_t	*t;
545 	wchar_t		*tr = NULL;
546 	int		direc;
547 	int		pass;
548 	const int32_t 	*state;
549 	size_t		want = 0;
550 	size_t		need = 0;
551 	int		b;
552 	uint8_t		buf[XFRM_BYTES];
553 	int		ndir = table->info->directive_count;
554 
555 	assert(src);
556 
557 	for (pass = 0; pass <= ndir; pass++) {
558 
559 		state = NULL;
560 
561 		if (pass != 0) {
562 			/* insert level separator from the previous pass */
563 			if (room) {
564 				*xf++ = XFRM_SEP;
565 				room--;
566 			}
567 			want++;
568 		}
569 
570 		/* special pass for undefined */
571 		if (pass == ndir) {
572 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
573 		} else {
574 			direc = table->info->directive[pass];
575 		}
576 
577 		t = src;
578 
579 		if (direc & DIRECTIVE_BACKWARD) {
580 			wchar_t *bp, *fp, c;
581 			free(tr);
582 			if ((tr = wcsdup(t)) == NULL) {
583 				errno = ENOMEM;
584 				goto fail;
585 			}
586 			bp = tr;
587 			fp = tr + wcslen(tr) - 1;
588 			while (bp < fp) {
589 				c = *bp;
590 				*bp++ = *fp;
591 				*fp-- = c;
592 			}
593 			t = (const wchar_t *)tr;
594 		}
595 
596 		if (direc & DIRECTIVE_POSITION) {
597 			while (*t || state) {
598 
599 				_collate_lookup(table, t, &len, &pri, pass, &state);
600 				t += len;
601 				if (pri <= 0) {
602 					if (pri < 0) {
603 						errno = EINVAL;
604 						goto fail;
605 					}
606 					state = NULL;
607 					pri = COLLATE_MAX_PRIORITY;
608 				}
609 
610 				b = xfrm(table, buf, pri, pass);
611 				want += b;
612 				if (room) {
613 					while (b) {
614 						b--;
615 						if (room) {
616 							*xf++ = buf[b];
617 							room--;
618 						}
619 					}
620 				}
621 				need = want;
622 			}
623 		} else {
624 			while (*t || state) {
625 				_collate_lookup(table, t, &len, &pri, pass, &state);
626 				t += len;
627 				if (pri <= 0) {
628 					if (pri < 0) {
629 						errno = EINVAL;
630 						goto fail;
631 					}
632 					state = NULL;
633 					continue;
634 				}
635 
636 				b = xfrm(table, buf, pri, pass);
637 				want += b;
638 				if (room) {
639 
640 					while (b) {
641 						b--;
642 						if (room) {
643 							*xf++ = buf[b];
644 							room--;
645 						}
646 					}
647 				}
648 				need = want;
649 			}
650 		}
651 	}
652 	free(tr);
653 	return (need);
654 
655 fail:
656 	free(tr);
657 	return ((size_t)(-1));
658 }
659 
660 /*
661  * __collate_equiv_value returns the primary collation value for the given
662  * collating symbol specified by str and len.  Zero or negative is returned
663  * if the collating symbol was not found.  This function is used by bracket
664  * code in the TRE regex library.
665  */
666 int
667 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
668 {
669 	int32_t e;
670 
671 	if (len < 1 || len >= COLLATE_STR_LEN)
672 		return (-1);
673 
674 	FIX_LOCALE(locale);
675 	struct xlocale_collate *table =
676 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
677 
678 	if (table->__collate_load_error)
679 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
680 
681 	if (len == 1) {
682 		e = -1;
683 		if (*str <= UCHAR_MAX)
684 			e = table->char_pri_table[*str].pri[0];
685 		else if (table->info->large_count > 0) {
686 			collate_large_t *match_large;
687 			match_large = largesearch(table, *str);
688 			if (match_large)
689 				e = match_large->pri.pri[0];
690 		}
691 		if (e == 0)
692 			return (1);
693 		return (e > 0 ? e : 0);
694 	}
695 	if (table->info->chain_count > 0) {
696 		wchar_t name[COLLATE_STR_LEN];
697 		collate_chain_t *match_chain;
698 		int clen;
699 
700 		wcsncpy (name, str, len);
701 		name[len] = 0;
702 		match_chain = chainsearch(table, name, &clen);
703 		if (match_chain) {
704 			e = match_chain->pri[0];
705 			if (e == 0)
706 				return (1);
707 			return (e < 0 ? -e : e);
708 		}
709 	}
710 	return (0);
711 }
712