xref: /freebsd/lib/libc/locale/collate.c (revision 69718b786d3943ea9a99eeeb5f5f6162f11c78b7)
1 /*-
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Copyright (c) 2011 The FreeBSD Foundation
9  * All rights reserved.
10  * Portions of this software were developed by David Chisnall
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Adapted to xlocale by John Marino <draco@marino.st>
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "namespace.h"
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55 
56 #include "endian.h"
57 #include "collate.h"
58 #include "setlocale.h"
59 #include "ldpart.h"
60 #include "libc_private.h"
61 
62 struct xlocale_collate __xlocale_global_collate = {
63 	{{0}, "C"}, 1, 0, 0, 0
64 };
65 
66 struct xlocale_collate __xlocale_C_collate = {
67 	{{0}, "C"}, 1, 0, 0, 0
68 };
69 
70 static int
71 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
72 
73 static void
74 destruct_collate(void *t)
75 {
76 	struct xlocale_collate *table = t;
77 	if (table->map && (table->maplen > 0)) {
78 		(void) munmap(table->map, table->maplen);
79 	}
80 	free(t);
81 }
82 
83 void *
84 __collate_load(const char *encoding, __unused locale_t unused)
85 {
86 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
87 		return &__xlocale_C_collate;
88 	}
89 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
90 	table->header.header.destructor = destruct_collate;
91 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
92 	// the caching outside of this section
93 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
94 		xlocale_release(table);
95 		return NULL;
96 	}
97 	return table;
98 }
99 
100 /**
101  * Load the collation tables for the specified encoding into the global table.
102  */
103 int
104 __collate_load_tables(const char *encoding)
105 {
106 
107 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
108 }
109 
110 int
111 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
112 {
113 	int i, chains, z;
114 	char *buf;
115 	char *TMP;
116 	char *map;
117 	collate_info_t *info;
118 	struct stat sbuf;
119 	int fd;
120 
121 	table->__collate_load_error = 1;
122 
123 	/* 'encoding' must be already checked. */
124 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
125 		return (_LDP_CACHE);
126 	}
127 
128 	asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding);
129 	if (buf == NULL)
130 		return (_LDP_ERROR);
131 
132 	if ((fd = _open(buf, O_RDONLY)) < 0) {
133 		free(buf);
134 		return (_LDP_ERROR);
135 	}
136 	free(buf);
137 	if (_fstat(fd, &sbuf) < 0) {
138 		(void) _close(fd);
139 		return (_LDP_ERROR);
140 	}
141 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
142 		(void) _close(fd);
143 		errno = EINVAL;
144 		return (_LDP_ERROR);
145 	}
146 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
147 	(void) _close(fd);
148 	if ((TMP = map) == NULL) {
149 		return (_LDP_ERROR);
150 	}
151 
152 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
153 		(void) munmap(map, sbuf.st_size);
154 		errno = EINVAL;
155 		return (_LDP_ERROR);
156 	}
157 	TMP += COLLATE_STR_LEN;
158 
159 	info = (void *)TMP;
160 	TMP += sizeof (*info);
161 
162 	if ((info->directive_count < 1) ||
163 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
164 	    ((chains = BSWAP(info->chain_count)) < 0)) {
165 		(void) munmap(map, sbuf.st_size);
166 		errno = EINVAL;
167 		return (_LDP_ERROR);
168 	}
169 
170 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
171 	    (sizeof (collate_chain_t) * chains) +
172 	    (sizeof (collate_large_t) * BSWAP(info->large_count));
173 	for (z = 0; z < info->directive_count; z++) {
174 		i += sizeof (collate_subst_t) * BSWAP(info->subst_count[z]);
175 	}
176 	if (i != (sbuf.st_size - (TMP - map))) {
177 		(void) munmap(map, sbuf.st_size);
178 		errno = EINVAL;
179 		return (_LDP_ERROR);
180 	}
181 
182 	table->info = info;
183 	table->char_pri_table = (void *)TMP;
184 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
185 
186 	for (z = 0; z < info->directive_count; z++) {
187 		if (BSWAP(info->subst_count[z]) > 0) {
188 			table->subst_table[z] = (void *)TMP;
189 			TMP += BSWAP(info->subst_count[z]) * sizeof (collate_subst_t);
190 		} else {
191 			table->subst_table[z] = NULL;
192 		}
193 	}
194 
195 	if (chains > 0) {
196 		table->chain_pri_table = (void *)TMP;
197 		TMP += chains * sizeof (collate_chain_t);
198 	} else
199 		table->chain_pri_table = NULL;
200 	if (BSWAP(info->large_count) > 0)
201 		table->large_pri_table = (void *)TMP;
202 	else
203 		table->large_pri_table = NULL;
204 
205 	table->__collate_load_error = 0;
206 	return (_LDP_LOADED);
207 }
208 
209 static const int32_t *
210 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
211 {
212 	const collate_subst_t *p;
213 	int n = BSWAP(table->info->subst_count[pass]);
214 
215 	if (n == 0)
216 		return (NULL);
217 
218 	if (pass >= table->info->directive_count)
219 		return (NULL);
220 
221 	if (!(key & COLLATE_SUBST_PRIORITY))
222 		return (NULL);
223 
224 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
225 	assert(BSWAP(p->key) == key);
226 
227 	return (p->pri);
228 }
229 
230 static collate_chain_t *
231 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
232 {
233 	int low = 0;
234 	int high = BSWAP(table->info->chain_count) - 1;
235 	int next, compar, l;
236 	collate_chain_t *p;
237 	collate_chain_t *tab = table->chain_pri_table;
238 
239 	if (high < 0)
240 		return (NULL);
241 
242 	while (low <= high) {
243 		next = (low + high) / 2;
244 		p = tab + next;
245 		compar = *key - le16toh(*p->str);
246 		if (compar == 0) {
247 			l = wcsnlen(p->str, COLLATE_STR_LEN);
248 			compar = wcsncmp(key, p->str, l);
249 			if (compar == 0) {
250 				*len = l;
251 				return (p);
252 			}
253 		}
254 		if (compar > 0)
255 			low = next + 1;
256 		else
257 			high = next - 1;
258 	}
259 	return (NULL);
260 }
261 
262 static collate_large_t *
263 largesearch(struct xlocale_collate *table, const wchar_t key)
264 {
265 	int low = 0;
266 	int high = BSWAP(table->info->large_count) - 1;
267 	int next, compar;
268 	collate_large_t *p;
269 	collate_large_t *tab = table->large_pri_table;
270 
271 	if (high < 0)
272 		return (NULL);
273 
274 	while (low <= high) {
275 		next = (low + high) / 2;
276 		p = tab + next;
277 		compar = key - BSWAP(p->val);
278 		if (compar == 0)
279 			return (p);
280 		if (compar > 0)
281 			low = next + 1;
282 		else
283 			high = next - 1;
284 	}
285 	return (NULL);
286 }
287 
288 void
289 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
290     int *pri, int which, const int **state)
291 {
292 	collate_chain_t *p2;
293 	collate_large_t *match;
294 	int p, l;
295 	const int *sptr;
296 
297 	/*
298 	 * If this is the "last" pass for the UNDEFINED, then
299 	 * we just return the priority itself.
300 	 */
301 	if (which >= table->info->directive_count) {
302 		*pri = *t;
303 		*len = 1;
304 		*state = NULL;
305 		return;
306 	}
307 
308 	/*
309 	 * If we have remaining substitution data from a previous
310 	 * call, consume it first.
311 	 */
312 	if ((sptr = *state) != NULL) {
313 		*pri = *sptr;
314 		sptr++;
315 		if ((sptr == *state) || (sptr == NULL))
316 			*state = NULL;
317 		else
318 			*state = sptr;
319 		*len = 0;
320 		return;
321 	}
322 
323 	/* No active substitutions */
324 	*len = 1;
325 
326 	/*
327 	 * Check for composites such as diphthongs that collate as a
328 	 * single element (aka chains or collating-elements).
329 	 */
330 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
331 	    ((p = p2->pri[which]) >= 0)) {
332 
333 		*len = l;
334 		*pri = p;
335 
336 	} else if (*t <= UCHAR_MAX) {
337 
338 		/*
339 		 * Character is a small (8-bit) character.
340 		 * We just look these up directly for speed.
341 		 */
342 		*pri = BSWAP(table->char_pri_table[*t].pri[which]);
343 
344 	} else if ((BSWAP(table->info->large_count) > 0) &&
345 	    ((match = largesearch(table, *t)) != NULL)) {
346 
347 		/*
348 		 * Character was found in the extended table.
349 		 */
350 		*pri = BSWAP(match->pri.pri[which]);
351 
352 	} else {
353 		/*
354 		 * Character lacks a specific definition.
355 		 */
356 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
357 			/* Mask off sign bit to prevent ordering confusion. */
358 			*pri = (*t & COLLATE_MAX_PRIORITY);
359 		} else {
360 			*pri = BSWAP(table->info->undef_pri[which]);
361 		}
362 		/* No substitutions for undefined characters! */
363 		return;
364 	}
365 
366 	/*
367 	 * Try substituting (expanding) the character.  We are
368 	 * currently doing this *after* the chain compression.  I
369 	 * think it should not matter, but this way might be slightly
370 	 * faster.
371 	 *
372 	 * We do this after the priority search, as this will help us
373 	 * to identify a single key value.  In order for this to work,
374 	 * its important that the priority assigned to a given element
375 	 * to be substituted be unique for that level.  The localedef
376 	 * code ensures this for us.
377 	 */
378 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
379 		if ((*pri = BSWAP(*sptr)) > 0) {
380 			sptr++;
381 			*state = BSWAP(*sptr) ? sptr : NULL;
382 		}
383 	}
384 
385 }
386 
387 /*
388  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
389  * NOT NULL terminate.  That is left to the caller.
390  */
391 size_t
392 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
393     size_t room)
394 {
395 	int		pri;
396 	int		len;
397 	const wchar_t	*t;
398 	wchar_t		*tr = NULL;
399 	int		direc;
400 	int		pass;
401 	const int32_t 	*state;
402 	size_t		want = 0;
403 	size_t		need = 0;
404 	int		ndir = table->info->directive_count;
405 
406 	assert(src);
407 
408 	for (pass = 0; pass <= ndir; pass++) {
409 
410 		state = NULL;
411 
412 		if (pass != 0) {
413 			/* insert level separator from the previous pass */
414 			if (room) {
415 				*xf++ = 1;
416 				room--;
417 			}
418 			want++;
419 		}
420 
421 		/* special pass for undefined */
422 		if (pass == ndir) {
423 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
424 		} else {
425 			direc = table->info->directive[pass];
426 		}
427 
428 		t = src;
429 
430 		if (direc & DIRECTIVE_BACKWARD) {
431 			wchar_t *bp, *fp, c;
432 			free(tr);
433 			if ((tr = wcsdup(t)) == NULL) {
434 				errno = ENOMEM;
435 				goto fail;
436 			}
437 			bp = tr;
438 			fp = tr + wcslen(tr) - 1;
439 			while (bp < fp) {
440 				c = *bp;
441 				*bp++ = *fp;
442 				*fp-- = c;
443 			}
444 			t = (const wchar_t *)tr;
445 		}
446 
447 		if (direc & DIRECTIVE_POSITION) {
448 			while (*t || state) {
449 				_collate_lookup(table, t, &len, &pri, pass, &state);
450 				t += len;
451 				if (pri <= 0) {
452 					if (pri < 0) {
453 						errno = EINVAL;
454 						goto fail;
455 					}
456 					state = NULL;
457 					pri = COLLATE_MAX_PRIORITY;
458 				}
459 				if (room) {
460 					*xf++ = pri;
461 					room--;
462 				}
463 				want++;
464 				need = want;
465 			}
466 		} else {
467 			while (*t || state) {
468 				_collate_lookup(table, t, &len, &pri, pass, &state);
469 				t += len;
470 				if (pri <= 0) {
471 					if (pri < 0) {
472 						errno = EINVAL;
473 						goto fail;
474 					}
475 					state = NULL;
476 					continue;
477 				}
478 				if (room) {
479 					*xf++ = pri;
480 					room--;
481 				}
482 				want++;
483 				need = want;
484 			}
485 		}
486 	}
487 	free(tr);
488 	return (need);
489 
490 fail:
491 	free(tr);
492 	return ((size_t)(-1));
493 }
494 
495 /*
496  * In the non-POSIX case, we transform each character into a string of
497  * characters representing the character's priority.  Since char is usually
498  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
499  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
500  * bits per byte.
501  *
502  * It turns out that we sometimes have real priorities that are
503  * 31-bits wide.  (But: be careful using priorities where the high
504  * order bit is set -- i.e. the priority is negative.  The sort order
505  * may be surprising!)
506  *
507  * TODO: This would be a good area to optimize somewhat.  It turns out
508  * that real prioririties *except for the last UNDEFINED pass* are generally
509  * very small.  We need the localedef code to precalculate the max
510  * priority for us, and ideally also give us a mask, and then we could
511  * severely limit what we expand to.
512  */
513 #define	XFRM_BYTES	6
514 #define	XFRM_OFFSET	('0')	/* make all printable characters */
515 #define	XFRM_SHIFT	6
516 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
517 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
518 
519 static int
520 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
521 {
522 	/* we use unsigned to ensure zero fill on right shift */
523 	uint32_t val = BSWAP((uint32_t)table->info->pri_count[pass]);
524 	int nc = 0;
525 
526 	while (val) {
527 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
528 		pri >>= XFRM_SHIFT;
529 		val >>= XFRM_SHIFT;
530 		p++;
531 		nc++;
532 	}
533 	return (nc);
534 }
535 
536 size_t
537 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
538     size_t room)
539 {
540 	int		pri;
541 	int		len;
542 	const wchar_t	*t;
543 	wchar_t		*tr = NULL;
544 	int		direc;
545 	int		pass;
546 	const int32_t 	*state;
547 	size_t		want = 0;
548 	size_t		need = 0;
549 	int		b;
550 	uint8_t		buf[XFRM_BYTES];
551 	int		ndir = table->info->directive_count;
552 
553 	assert(src);
554 
555 	for (pass = 0; pass <= ndir; pass++) {
556 
557 		state = NULL;
558 
559 		if (pass != 0) {
560 			/* insert level separator from the previous pass */
561 			if (room) {
562 				*xf++ = XFRM_SEP;
563 				room--;
564 			}
565 			want++;
566 		}
567 
568 		/* special pass for undefined */
569 		if (pass == ndir) {
570 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
571 		} else {
572 			direc = table->info->directive[pass];
573 		}
574 
575 		t = src;
576 
577 		if (direc & DIRECTIVE_BACKWARD) {
578 			wchar_t *bp, *fp, c;
579 			free(tr);
580 			if ((tr = wcsdup(t)) == NULL) {
581 				errno = ENOMEM;
582 				goto fail;
583 			}
584 			bp = tr;
585 			fp = tr + wcslen(tr) - 1;
586 			while (bp < fp) {
587 				c = *bp;
588 				*bp++ = *fp;
589 				*fp-- = c;
590 			}
591 			t = (const wchar_t *)tr;
592 		}
593 
594 		if (direc & DIRECTIVE_POSITION) {
595 			while (*t || state) {
596 
597 				_collate_lookup(table, t, &len, &pri, pass, &state);
598 				t += len;
599 				if (pri <= 0) {
600 					if (pri < 0) {
601 						errno = EINVAL;
602 						goto fail;
603 					}
604 					state = NULL;
605 					pri = COLLATE_MAX_PRIORITY;
606 				}
607 
608 				b = xfrm(table, buf, pri, pass);
609 				want += b;
610 				if (room) {
611 					while (b) {
612 						b--;
613 						if (room) {
614 							*xf++ = buf[b];
615 							room--;
616 						}
617 					}
618 				}
619 				need = want;
620 			}
621 		} else {
622 			while (*t || state) {
623 				_collate_lookup(table, t, &len, &pri, pass, &state);
624 				t += len;
625 				if (pri <= 0) {
626 					if (pri < 0) {
627 						errno = EINVAL;
628 						goto fail;
629 					}
630 					state = NULL;
631 					continue;
632 				}
633 
634 				b = xfrm(table, buf, pri, pass);
635 				want += b;
636 				if (room) {
637 
638 					while (b) {
639 						b--;
640 						if (room) {
641 							*xf++ = buf[b];
642 							room--;
643 						}
644 					}
645 				}
646 				need = want;
647 			}
648 		}
649 	}
650 	free(tr);
651 	return (need);
652 
653 fail:
654 	free(tr);
655 	return ((size_t)(-1));
656 }
657 
658 /*
659  * __collate_equiv_value returns the primary collation value for the given
660  * collating symbol specified by str and len.  Zero or negative is returned
661  * if the collating symbol was not found.  This function is used by bracket
662  * code in the TRE regex library.
663  */
664 int
665 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
666 {
667 	int32_t e;
668 
669 	if (len < 1 || len >= COLLATE_STR_LEN)
670 		return (-1);
671 
672 	FIX_LOCALE(locale);
673 	struct xlocale_collate *table =
674 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
675 
676 	if (table->__collate_load_error)
677 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
678 
679 	if (len == 1) {
680 		e = -1;
681 		if (*str <= UCHAR_MAX)
682 			e = table->char_pri_table[*str].pri[0];
683 		else if (BSWAP(table->info->large_count) > 0) {
684 			collate_large_t *match_large;
685 			match_large = largesearch(table, *str);
686 			if (match_large)
687 				e = match_large->pri.pri[0];
688 		}
689 		if (e == 0)
690 			return (1);
691 		return (e > 0 ? e : 0);
692 	}
693 	if (BSWAP(table->info->chain_count) > 0) {
694 		wchar_t name[COLLATE_STR_LEN];
695 		collate_chain_t *match_chain;
696 		int clen;
697 
698 		wcsncpy (name, str, len);
699 		name[len] = 0;
700 		match_chain = chainsearch(table, name, &clen);
701 		if (match_chain) {
702 			e = match_chain->pri[0];
703 			if (e == 0)
704 				return (1);
705 			return (e < 0 ? -e : e);
706 		}
707 	}
708 	return (0);
709 }
710