xref: /freebsd/lib/libc/locale/collate.c (revision ea825d02749f382c3f7e17f28247f20a48733eab)
1 /*-
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Copyright (c) 2011 The FreeBSD Foundation
9  * All rights reserved.
10  * Portions of this software were developed by David Chisnall
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Adapted to xlocale by John Marino <draco@marino.st>
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "namespace.h"
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55 
56 #include "endian.h"
57 #include "collate.h"
58 #include "setlocale.h"
59 #include "ldpart.h"
60 #include "libc_private.h"
61 
62 struct xlocale_collate __xlocale_global_collate = {
63 	{{0}, "C"}, 1, 0, 0, 0
64 };
65 
66 struct xlocale_collate __xlocale_C_collate = {
67 	{{0}, "C"}, 1, 0, 0, 0
68 };
69 
70 static int
71 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
72 
73 static void
74 destruct_collate(void *t)
75 {
76 	struct xlocale_collate *table = t;
77 	if (table->map && (table->maplen > 0)) {
78 		(void) munmap(table->map, table->maplen);
79 	}
80 	free(t);
81 }
82 
83 void *
84 __collate_load(const char *encoding, __unused locale_t unused)
85 {
86 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
87 		return &__xlocale_C_collate;
88 	}
89 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
90 	table->header.header.destructor = destruct_collate;
91 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
92 	// the caching outside of this section
93 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
94 		xlocale_release(table);
95 		return NULL;
96 	}
97 	return table;
98 }
99 
100 /**
101  * Load the collation tables for the specified encoding into the global table.
102  */
103 int
104 __collate_load_tables(const char *encoding)
105 {
106 
107 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
108 }
109 
110 int
111 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
112 {
113 	int i, chains, z;
114 	char *buf;
115 	char *TMP;
116 	char *map;
117 	collate_info_t *info;
118 	struct stat sbuf;
119 	int fd;
120 
121 	table->__collate_load_error = 1;
122 
123 	/* 'encoding' must be already checked. */
124 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
125 		return (_LDP_CACHE);
126 	}
127 
128 	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
129 		return (_LDP_ERROR);
130 
131 	if ((fd = _open(buf, O_RDONLY)) < 0) {
132 		free(buf);
133 		return (_LDP_ERROR);
134 	}
135 	free(buf);
136 	if (_fstat(fd, &sbuf) < 0) {
137 		(void) _close(fd);
138 		return (_LDP_ERROR);
139 	}
140 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
141 		(void) _close(fd);
142 		errno = EINVAL;
143 		return (_LDP_ERROR);
144 	}
145 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
146 	(void) _close(fd);
147 	if ((TMP = map) == NULL) {
148 		return (_LDP_ERROR);
149 	}
150 
151 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
152 		(void) munmap(map, sbuf.st_size);
153 		errno = EINVAL;
154 		return (_LDP_ERROR);
155 	}
156 	TMP += COLLATE_STR_LEN;
157 
158 	info = (void *)TMP;
159 	TMP += sizeof (*info);
160 
161 	if ((info->directive_count < 1) ||
162 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
163 	    ((chains = BSWAP(info->chain_count)) < 0)) {
164 		(void) munmap(map, sbuf.st_size);
165 		errno = EINVAL;
166 		return (_LDP_ERROR);
167 	}
168 
169 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
170 	    (sizeof (collate_chain_t) * chains) +
171 	    (sizeof (collate_large_t) * BSWAP(info->large_count));
172 	for (z = 0; z < info->directive_count; z++) {
173 		i += sizeof (collate_subst_t) * BSWAP(info->subst_count[z]);
174 	}
175 	if (i != (sbuf.st_size - (TMP - map))) {
176 		(void) munmap(map, sbuf.st_size);
177 		errno = EINVAL;
178 		return (_LDP_ERROR);
179 	}
180 
181 	table->info = info;
182 	table->char_pri_table = (void *)TMP;
183 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
184 
185 	for (z = 0; z < info->directive_count; z++) {
186 		if (BSWAP(info->subst_count[z]) > 0) {
187 			table->subst_table[z] = (void *)TMP;
188 			TMP += BSWAP(info->subst_count[z]) * sizeof (collate_subst_t);
189 		} else {
190 			table->subst_table[z] = NULL;
191 		}
192 	}
193 
194 	if (chains > 0) {
195 		table->chain_pri_table = (void *)TMP;
196 		TMP += chains * sizeof (collate_chain_t);
197 	} else
198 		table->chain_pri_table = NULL;
199 	if (BSWAP(info->large_count) > 0)
200 		table->large_pri_table = (void *)TMP;
201 	else
202 		table->large_pri_table = NULL;
203 
204 	table->__collate_load_error = 0;
205 	return (_LDP_LOADED);
206 }
207 
208 static const int32_t *
209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
210 {
211 	const collate_subst_t *p;
212 	int n = BSWAP(table->info->subst_count[pass]);
213 
214 	if (n == 0)
215 		return (NULL);
216 
217 	if (pass >= table->info->directive_count)
218 		return (NULL);
219 
220 	if (!(key & COLLATE_SUBST_PRIORITY))
221 		return (NULL);
222 
223 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
224 	assert(BSWAP(p->key) == key);
225 
226 	return (p->pri);
227 }
228 
229 static collate_chain_t *
230 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
231 {
232 	int low = 0;
233 	int high = BSWAP(table->info->chain_count) - 1;
234 	int next, compar, l;
235 	collate_chain_t *p;
236 	collate_chain_t *tab = table->chain_pri_table;
237 
238 	if (high < 0)
239 		return (NULL);
240 
241 	while (low <= high) {
242 		next = (low + high) / 2;
243 		p = tab + next;
244 		compar = *key - le16toh(*p->str);
245 		if (compar == 0) {
246 			l = wcsnlen(p->str, COLLATE_STR_LEN);
247 			compar = wcsncmp(key, p->str, l);
248 			if (compar == 0) {
249 				*len = l;
250 				return (p);
251 			}
252 		}
253 		if (compar > 0)
254 			low = next + 1;
255 		else
256 			high = next - 1;
257 	}
258 	return (NULL);
259 }
260 
261 static collate_large_t *
262 largesearch(struct xlocale_collate *table, const wchar_t key)
263 {
264 	int low = 0;
265 	int high = BSWAP(table->info->large_count) - 1;
266 	int next, compar;
267 	collate_large_t *p;
268 	collate_large_t *tab = table->large_pri_table;
269 
270 	if (high < 0)
271 		return (NULL);
272 
273 	while (low <= high) {
274 		next = (low + high) / 2;
275 		p = tab + next;
276 		compar = key - BSWAP(p->val);
277 		if (compar == 0)
278 			return (p);
279 		if (compar > 0)
280 			low = next + 1;
281 		else
282 			high = next - 1;
283 	}
284 	return (NULL);
285 }
286 
287 void
288 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
289     int *pri, int which, const int **state)
290 {
291 	collate_chain_t *p2;
292 	collate_large_t *match;
293 	int p, l;
294 	const int *sptr;
295 
296 	/*
297 	 * If this is the "last" pass for the UNDEFINED, then
298 	 * we just return the priority itself.
299 	 */
300 	if (which >= table->info->directive_count) {
301 		*pri = *t;
302 		*len = 1;
303 		*state = NULL;
304 		return;
305 	}
306 
307 	/*
308 	 * If we have remaining substitution data from a previous
309 	 * call, consume it first.
310 	 */
311 	if ((sptr = *state) != NULL) {
312 		*pri = *sptr;
313 		sptr++;
314 		if ((sptr == *state) || (sptr == NULL))
315 			*state = NULL;
316 		else
317 			*state = sptr;
318 		*len = 0;
319 		return;
320 	}
321 
322 	/* No active substitutions */
323 	*len = 1;
324 
325 	/*
326 	 * Check for composites such as diphthongs that collate as a
327 	 * single element (aka chains or collating-elements).
328 	 */
329 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
330 	    ((p = p2->pri[which]) >= 0)) {
331 
332 		*len = l;
333 		*pri = p;
334 
335 	} else if (*t <= UCHAR_MAX) {
336 
337 		/*
338 		 * Character is a small (8-bit) character.
339 		 * We just look these up directly for speed.
340 		 */
341 		*pri = BSWAP(table->char_pri_table[*t].pri[which]);
342 
343 	} else if ((BSWAP(table->info->large_count) > 0) &&
344 	    ((match = largesearch(table, *t)) != NULL)) {
345 
346 		/*
347 		 * Character was found in the extended table.
348 		 */
349 		*pri = BSWAP(match->pri.pri[which]);
350 
351 	} else {
352 		/*
353 		 * Character lacks a specific definition.
354 		 */
355 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
356 			/* Mask off sign bit to prevent ordering confusion. */
357 			*pri = (*t & COLLATE_MAX_PRIORITY);
358 		} else {
359 			*pri = BSWAP(table->info->undef_pri[which]);
360 		}
361 		/* No substitutions for undefined characters! */
362 		return;
363 	}
364 
365 	/*
366 	 * Try substituting (expanding) the character.  We are
367 	 * currently doing this *after* the chain compression.  I
368 	 * think it should not matter, but this way might be slightly
369 	 * faster.
370 	 *
371 	 * We do this after the priority search, as this will help us
372 	 * to identify a single key value.  In order for this to work,
373 	 * its important that the priority assigned to a given element
374 	 * to be substituted be unique for that level.  The localedef
375 	 * code ensures this for us.
376 	 */
377 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
378 		if ((*pri = BSWAP(*sptr)) > 0) {
379 			sptr++;
380 			*state = BSWAP(*sptr) ? sptr : NULL;
381 		}
382 	}
383 
384 }
385 
386 /*
387  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
388  * NOT NULL terminate.  That is left to the caller.
389  */
390 size_t
391 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
392     size_t room)
393 {
394 	int		pri;
395 	int		len;
396 	const wchar_t	*t;
397 	wchar_t		*tr = NULL;
398 	int		direc;
399 	int		pass;
400 	const int32_t 	*state;
401 	size_t		want = 0;
402 	size_t		need = 0;
403 	int		ndir = table->info->directive_count;
404 
405 	assert(src);
406 
407 	for (pass = 0; pass <= ndir; pass++) {
408 
409 		state = NULL;
410 
411 		if (pass != 0) {
412 			/* insert level separator from the previous pass */
413 			if (room) {
414 				*xf++ = 1;
415 				room--;
416 			}
417 			want++;
418 		}
419 
420 		/* special pass for undefined */
421 		if (pass == ndir) {
422 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
423 		} else {
424 			direc = table->info->directive[pass];
425 		}
426 
427 		t = src;
428 
429 		if (direc & DIRECTIVE_BACKWARD) {
430 			wchar_t *bp, *fp, c;
431 			free(tr);
432 			if ((tr = wcsdup(t)) == NULL) {
433 				errno = ENOMEM;
434 				goto fail;
435 			}
436 			bp = tr;
437 			fp = tr + wcslen(tr) - 1;
438 			while (bp < fp) {
439 				c = *bp;
440 				*bp++ = *fp;
441 				*fp-- = c;
442 			}
443 			t = (const wchar_t *)tr;
444 		}
445 
446 		if (direc & DIRECTIVE_POSITION) {
447 			while (*t || state) {
448 				_collate_lookup(table, t, &len, &pri, pass, &state);
449 				t += len;
450 				if (pri <= 0) {
451 					if (pri < 0) {
452 						errno = EINVAL;
453 						goto fail;
454 					}
455 					state = NULL;
456 					pri = COLLATE_MAX_PRIORITY;
457 				}
458 				if (room) {
459 					*xf++ = pri;
460 					room--;
461 				}
462 				want++;
463 				need = want;
464 			}
465 		} else {
466 			while (*t || state) {
467 				_collate_lookup(table, t, &len, &pri, pass, &state);
468 				t += len;
469 				if (pri <= 0) {
470 					if (pri < 0) {
471 						errno = EINVAL;
472 						goto fail;
473 					}
474 					state = NULL;
475 					continue;
476 				}
477 				if (room) {
478 					*xf++ = pri;
479 					room--;
480 				}
481 				want++;
482 				need = want;
483 			}
484 		}
485 	}
486 	free(tr);
487 	return (need);
488 
489 fail:
490 	free(tr);
491 	return ((size_t)(-1));
492 }
493 
494 /*
495  * In the non-POSIX case, we transform each character into a string of
496  * characters representing the character's priority.  Since char is usually
497  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
498  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
499  * bits per byte.
500  *
501  * It turns out that we sometimes have real priorities that are
502  * 31-bits wide.  (But: be careful using priorities where the high
503  * order bit is set -- i.e. the priority is negative.  The sort order
504  * may be surprising!)
505  *
506  * TODO: This would be a good area to optimize somewhat.  It turns out
507  * that real prioririties *except for the last UNDEFINED pass* are generally
508  * very small.  We need the localedef code to precalculate the max
509  * priority for us, and ideally also give us a mask, and then we could
510  * severely limit what we expand to.
511  */
512 #define	XFRM_BYTES	6
513 #define	XFRM_OFFSET	('0')	/* make all printable characters */
514 #define	XFRM_SHIFT	6
515 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
516 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
517 
518 static int
519 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
520 {
521 	/* we use unsigned to ensure zero fill on right shift */
522 	uint32_t val = BSWAP((uint32_t)table->info->pri_count[pass]);
523 	int nc = 0;
524 
525 	while (val) {
526 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
527 		pri >>= XFRM_SHIFT;
528 		val >>= XFRM_SHIFT;
529 		p++;
530 		nc++;
531 	}
532 	return (nc);
533 }
534 
535 size_t
536 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
537     size_t room)
538 {
539 	int		pri;
540 	int		len;
541 	const wchar_t	*t;
542 	wchar_t		*tr = NULL;
543 	int		direc;
544 	int		pass;
545 	const int32_t 	*state;
546 	size_t		want = 0;
547 	size_t		need = 0;
548 	int		b;
549 	uint8_t		buf[XFRM_BYTES];
550 	int		ndir = table->info->directive_count;
551 
552 	assert(src);
553 
554 	for (pass = 0; pass <= ndir; pass++) {
555 
556 		state = NULL;
557 
558 		if (pass != 0) {
559 			/* insert level separator from the previous pass */
560 			if (room) {
561 				*xf++ = XFRM_SEP;
562 				room--;
563 			}
564 			want++;
565 		}
566 
567 		/* special pass for undefined */
568 		if (pass == ndir) {
569 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
570 		} else {
571 			direc = table->info->directive[pass];
572 		}
573 
574 		t = src;
575 
576 		if (direc & DIRECTIVE_BACKWARD) {
577 			wchar_t *bp, *fp, c;
578 			free(tr);
579 			if ((tr = wcsdup(t)) == NULL) {
580 				errno = ENOMEM;
581 				goto fail;
582 			}
583 			bp = tr;
584 			fp = tr + wcslen(tr) - 1;
585 			while (bp < fp) {
586 				c = *bp;
587 				*bp++ = *fp;
588 				*fp-- = c;
589 			}
590 			t = (const wchar_t *)tr;
591 		}
592 
593 		if (direc & DIRECTIVE_POSITION) {
594 			while (*t || state) {
595 
596 				_collate_lookup(table, t, &len, &pri, pass, &state);
597 				t += len;
598 				if (pri <= 0) {
599 					if (pri < 0) {
600 						errno = EINVAL;
601 						goto fail;
602 					}
603 					state = NULL;
604 					pri = COLLATE_MAX_PRIORITY;
605 				}
606 
607 				b = xfrm(table, buf, pri, pass);
608 				want += b;
609 				if (room) {
610 					while (b) {
611 						b--;
612 						if (room) {
613 							*xf++ = buf[b];
614 							room--;
615 						}
616 					}
617 				}
618 				need = want;
619 			}
620 		} else {
621 			while (*t || state) {
622 				_collate_lookup(table, t, &len, &pri, pass, &state);
623 				t += len;
624 				if (pri <= 0) {
625 					if (pri < 0) {
626 						errno = EINVAL;
627 						goto fail;
628 					}
629 					state = NULL;
630 					continue;
631 				}
632 
633 				b = xfrm(table, buf, pri, pass);
634 				want += b;
635 				if (room) {
636 
637 					while (b) {
638 						b--;
639 						if (room) {
640 							*xf++ = buf[b];
641 							room--;
642 						}
643 					}
644 				}
645 				need = want;
646 			}
647 		}
648 	}
649 	free(tr);
650 	return (need);
651 
652 fail:
653 	free(tr);
654 	return ((size_t)(-1));
655 }
656 
657 /*
658  * __collate_equiv_value returns the primary collation value for the given
659  * collating symbol specified by str and len.  Zero or negative is returned
660  * if the collating symbol was not found.  This function is used by bracket
661  * code in the TRE regex library.
662  */
663 int
664 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
665 {
666 	int32_t e;
667 
668 	if (len < 1 || len >= COLLATE_STR_LEN)
669 		return (-1);
670 
671 	FIX_LOCALE(locale);
672 	struct xlocale_collate *table =
673 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
674 
675 	if (table->__collate_load_error)
676 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
677 
678 	if (len == 1) {
679 		e = -1;
680 		if (*str <= UCHAR_MAX)
681 			e = table->char_pri_table[*str].pri[0];
682 		else if (BSWAP(table->info->large_count) > 0) {
683 			collate_large_t *match_large;
684 			match_large = largesearch(table, *str);
685 			if (match_large)
686 				e = match_large->pri.pri[0];
687 		}
688 		if (e == 0)
689 			return (1);
690 		return (e > 0 ? e : 0);
691 	}
692 	if (BSWAP(table->info->chain_count) > 0) {
693 		wchar_t name[COLLATE_STR_LEN];
694 		collate_chain_t *match_chain;
695 		int clen;
696 
697 		wcsncpy (name, str, len);
698 		name[len] = 0;
699 		match_chain = chainsearch(table, name, &clen);
700 		if (match_chain) {
701 			e = match_chain->pri[0];
702 			if (e == 0)
703 				return (1);
704 			return (e < 0 ? -e : e);
705 		}
706 	}
707 	return (0);
708 }
709