xref: /freebsd/lib/libc/locale/collate.c (revision eb9da1ada8b6b2c74378a5c17029ec5a7fb199e6)
1 /*-
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Copyright (c) 2011 The FreeBSD Foundation
9  * All rights reserved.
10  * Portions of this software were developed by David Chisnall
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Adapted to xlocale by John Marino <draco@marino.st>
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "namespace.h"
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55 
56 #include "collate.h"
57 #include "setlocale.h"
58 #include "ldpart.h"
59 #include "libc_private.h"
60 
61 struct xlocale_collate __xlocale_global_collate = {
62 	{{0}, "C"}, 1, 0, 0, 0
63 };
64 
65 struct xlocale_collate __xlocale_C_collate = {
66 	{{0}, "C"}, 1, 0, 0, 0
67 };
68 
69 static int
70 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
71 
72 static void
73 destruct_collate(void *t)
74 {
75 	struct xlocale_collate *table = t;
76 	if (table->map && (table->maplen > 0)) {
77 		(void) munmap(table->map, table->maplen);
78 	}
79 	free(t);
80 }
81 
82 void *
83 __collate_load(const char *encoding, __unused locale_t unused)
84 {
85 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
86 		return &__xlocale_C_collate;
87 	}
88 	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
89 	table->header.header.destructor = destruct_collate;
90 	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
91 	// the caching outside of this section
92 	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
93 		xlocale_release(table);
94 		return NULL;
95 	}
96 	return table;
97 }
98 
99 /**
100  * Load the collation tables for the specified encoding into the global table.
101  */
102 int
103 __collate_load_tables(const char *encoding)
104 {
105 
106 	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
107 }
108 
109 int
110 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
111 {
112 	int i, chains, z;
113 	char *buf;
114 	char *TMP;
115 	char *map;
116 	collate_info_t *info;
117 	struct stat sbuf;
118 	int fd;
119 
120 	table->__collate_load_error = 1;
121 
122 	/* 'encoding' must be already checked. */
123 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
124 		return (_LDP_CACHE);
125 	}
126 
127 	asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding);
128 	if (buf == NULL)
129 		return (_LDP_ERROR);
130 
131 	if ((fd = _open(buf, O_RDONLY)) < 0) {
132 		free(buf);
133 		return (_LDP_ERROR);
134 	}
135 	free(buf);
136 	if (_fstat(fd, &sbuf) < 0) {
137 		(void) _close(fd);
138 		return (_LDP_ERROR);
139 	}
140 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
141 		(void) _close(fd);
142 		errno = EINVAL;
143 		return (_LDP_ERROR);
144 	}
145 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
146 	(void) _close(fd);
147 	if ((TMP = map) == NULL) {
148 		return (_LDP_ERROR);
149 	}
150 
151 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
152 		(void) munmap(map, sbuf.st_size);
153 		errno = EINVAL;
154 		return (_LDP_ERROR);
155 	}
156 	TMP += COLLATE_STR_LEN;
157 
158 	info = (void *)TMP;
159 	TMP += sizeof (*info);
160 
161 	if ((info->directive_count < 1) ||
162 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
163 	    ((chains = info->chain_count) < 0)) {
164 		(void) munmap(map, sbuf.st_size);
165 		errno = EINVAL;
166 		return (_LDP_ERROR);
167 	}
168 
169 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
170 	    (sizeof (collate_chain_t) * chains) +
171 	    (sizeof (collate_large_t) * info->large_count);
172 	for (z = 0; z < info->directive_count; z++) {
173 		i += sizeof (collate_subst_t) * info->subst_count[z];
174 	}
175 	if (i != (sbuf.st_size - (TMP - map))) {
176 		(void) munmap(map, sbuf.st_size);
177 		errno = EINVAL;
178 		return (_LDP_ERROR);
179 	}
180 
181 	table->info = info;
182 	table->char_pri_table = (void *)TMP;
183 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
184 
185 	for (z = 0; z < info->directive_count; z++) {
186 		if (info->subst_count[z] > 0) {
187 			table->subst_table[z] = (void *)TMP;
188 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
189 		} else {
190 			table->subst_table[z] = NULL;
191 		}
192 	}
193 
194 	if (chains > 0) {
195 		table->chain_pri_table = (void *)TMP;
196 		TMP += chains * sizeof (collate_chain_t);
197 	} else
198 		table->chain_pri_table = NULL;
199 	if (info->large_count > 0)
200 		table->large_pri_table = (void *)TMP;
201 	else
202 		table->large_pri_table = NULL;
203 
204 	table->__collate_load_error = 0;
205 	return (_LDP_LOADED);
206 }
207 
208 static const int32_t *
209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
210 {
211 	const collate_subst_t *p;
212 	int n = table->info->subst_count[pass];
213 
214 	if (n == 0)
215 		return (NULL);
216 
217 	if (pass >= table->info->directive_count)
218 		return (NULL);
219 
220 	if (!(key & COLLATE_SUBST_PRIORITY))
221 		return (NULL);
222 
223 	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
224 	assert(p->key == key);
225 	return (p->pri);
226 }
227 
228 static collate_chain_t *
229 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
230 {
231 	int low = 0;
232 	int high = table->info->chain_count - 1;;
233 	int next, compar, l;
234 	collate_chain_t *p;
235 	collate_chain_t *tab = table->chain_pri_table;
236 
237 	if (high < 0)
238 		return (NULL);
239 
240 	while (low <= high) {
241 		next = (low + high) / 2;
242 		p = tab + next;
243 		compar = *key - *p->str;
244 		if (compar == 0) {
245 			l = wcsnlen(p->str, COLLATE_STR_LEN);
246 			compar = wcsncmp(key, p->str, l);
247 			if (compar == 0) {
248 				*len = l;
249 				return (p);
250 			}
251 		}
252 		if (compar > 0)
253 			low = next + 1;
254 		else
255 			high = next - 1;
256 	}
257 	return (NULL);
258 }
259 
260 static collate_large_t *
261 largesearch(struct xlocale_collate *table, const wchar_t key)
262 {
263 	int low = 0;
264 	int high = table->info->large_count - 1;
265 	int next, compar;
266 	collate_large_t *p;
267 	collate_large_t *tab = table->large_pri_table;
268 
269 	if (high < 0)
270 		return (NULL);
271 
272 	while (low <= high) {
273 		next = (low + high) / 2;
274 		p = tab + next;
275 		compar = key - p->val;
276 		if (compar == 0)
277 			return (p);
278 		if (compar > 0)
279 			low = next + 1;
280 		else
281 			high = next - 1;
282 	}
283 	return (NULL);
284 }
285 
286 void
287 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
288     int *pri, int which, const int **state)
289 {
290 	collate_chain_t *p2;
291 	collate_large_t *match;
292 	int p, l;
293 	const int *sptr;
294 
295 	/*
296 	 * If this is the "last" pass for the UNDEFINED, then
297 	 * we just return the priority itself.
298 	 */
299 	if (which >= table->info->directive_count) {
300 		*pri = *t;
301 		*len = 1;
302 		*state = NULL;
303 		return;
304 	}
305 
306 	/*
307 	 * If we have remaining substitution data from a previous
308 	 * call, consume it first.
309 	 */
310 	if ((sptr = *state) != NULL) {
311 		*pri = *sptr;
312 		sptr++;
313 		if ((sptr == *state) || (sptr == NULL))
314 			*state = NULL;
315 		else
316 			*state = sptr;
317 		*len = 0;
318 		return;
319 	}
320 
321 	/* No active substitutions */
322 	*len = 1;
323 
324 	/*
325 	 * Check for composites such as diphthongs that collate as a
326 	 * single element (aka chains or collating-elements).
327 	 */
328 	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
329 	    ((p = p2->pri[which]) >= 0)) {
330 
331 		*len = l;
332 		*pri = p;
333 
334 	} else if (*t <= UCHAR_MAX) {
335 
336 		/*
337 		 * Character is a small (8-bit) character.
338 		 * We just look these up directly for speed.
339 		 */
340 		*pri = table->char_pri_table[*t].pri[which];
341 
342 	} else if ((table->info->large_count > 0) &&
343 	    ((match = largesearch(table, *t)) != NULL)) {
344 
345 		/*
346 		 * Character was found in the extended table.
347 		 */
348 		*pri = match->pri.pri[which];
349 
350 	} else {
351 		/*
352 		 * Character lacks a specific definition.
353 		 */
354 		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
355 			/* Mask off sign bit to prevent ordering confusion. */
356 			*pri = (*t & COLLATE_MAX_PRIORITY);
357 		} else {
358 			*pri = table->info->undef_pri[which];
359 		}
360 		/* No substitutions for undefined characters! */
361 		return;
362 	}
363 
364 	/*
365 	 * Try substituting (expanding) the character.  We are
366 	 * currently doing this *after* the chain compression.  I
367 	 * think it should not matter, but this way might be slightly
368 	 * faster.
369 	 *
370 	 * We do this after the priority search, as this will help us
371 	 * to identify a single key value.  In order for this to work,
372 	 * its important that the priority assigned to a given element
373 	 * to be substituted be unique for that level.  The localedef
374 	 * code ensures this for us.
375 	 */
376 	if ((sptr = substsearch(table, *pri, which)) != NULL) {
377 		if ((*pri = *sptr) > 0) {
378 			sptr++;
379 			*state = *sptr ? sptr : NULL;
380 		}
381 	}
382 
383 }
384 
385 /*
386  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
387  * NOT NULL terminate.  That is left to the caller.
388  */
389 size_t
390 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
391     size_t room)
392 {
393 	int		pri;
394 	int		len;
395 	const wchar_t	*t;
396 	wchar_t		*tr = NULL;
397 	int		direc;
398 	int		pass;
399 	const int32_t 	*state;
400 	size_t		want = 0;
401 	size_t		need = 0;
402 	int		ndir = table->info->directive_count;
403 
404 	assert(src);
405 
406 	for (pass = 0; pass <= ndir; pass++) {
407 
408 		state = NULL;
409 
410 		if (pass != 0) {
411 			/* insert level separator from the previous pass */
412 			if (room) {
413 				*xf++ = 1;
414 				room--;
415 			}
416 			want++;
417 		}
418 
419 		/* special pass for undefined */
420 		if (pass == ndir) {
421 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
422 		} else {
423 			direc = table->info->directive[pass];
424 		}
425 
426 		t = src;
427 
428 		if (direc & DIRECTIVE_BACKWARD) {
429 			wchar_t *bp, *fp, c;
430 			free(tr);
431 			if ((tr = wcsdup(t)) == NULL) {
432 				errno = ENOMEM;
433 				goto fail;
434 			}
435 			bp = tr;
436 			fp = tr + wcslen(tr) - 1;
437 			while (bp < fp) {
438 				c = *bp;
439 				*bp++ = *fp;
440 				*fp-- = c;
441 			}
442 			t = (const wchar_t *)tr;
443 		}
444 
445 		if (direc & DIRECTIVE_POSITION) {
446 			while (*t || state) {
447 				_collate_lookup(table, t, &len, &pri, pass, &state);
448 				t += len;
449 				if (pri <= 0) {
450 					if (pri < 0) {
451 						errno = EINVAL;
452 						goto fail;
453 					}
454 					state = NULL;
455 					pri = COLLATE_MAX_PRIORITY;
456 				}
457 				if (room) {
458 					*xf++ = pri;
459 					room--;
460 				}
461 				want++;
462 				need = want;
463 			}
464 		} else {
465 			while (*t || state) {
466 				_collate_lookup(table, t, &len, &pri, pass, &state);
467 				t += len;
468 				if (pri <= 0) {
469 					if (pri < 0) {
470 						errno = EINVAL;
471 						goto fail;
472 					}
473 					state = NULL;
474 					continue;
475 				}
476 				if (room) {
477 					*xf++ = pri;
478 					room--;
479 				}
480 				want++;
481 				need = want;
482 			}
483 		}
484 	}
485 	free(tr);
486 	return (need);
487 
488 fail:
489 	free(tr);
490 	return ((size_t)(-1));
491 }
492 
493 /*
494  * In the non-POSIX case, we transform each character into a string of
495  * characters representing the character's priority.  Since char is usually
496  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
497  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
498  * bits per byte.
499  *
500  * It turns out that we sometimes have real priorities that are
501  * 31-bits wide.  (But: be careful using priorities where the high
502  * order bit is set -- i.e. the priority is negative.  The sort order
503  * may be surprising!)
504  *
505  * TODO: This would be a good area to optimize somewhat.  It turns out
506  * that real prioririties *except for the last UNDEFINED pass* are generally
507  * very small.  We need the localedef code to precalculate the max
508  * priority for us, and ideally also give us a mask, and then we could
509  * severely limit what we expand to.
510  */
511 #define	XFRM_BYTES	6
512 #define	XFRM_OFFSET	('0')	/* make all printable characters */
513 #define	XFRM_SHIFT	6
514 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
515 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
516 
517 static int
518 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
519 {
520 	/* we use unsigned to ensure zero fill on right shift */
521 	uint32_t val = (uint32_t)table->info->pri_count[pass];
522 	int nc = 0;
523 
524 	while (val) {
525 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
526 		pri >>= XFRM_SHIFT;
527 		val >>= XFRM_SHIFT;
528 		p++;
529 		nc++;
530 	}
531 	return (nc);
532 }
533 
534 size_t
535 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
536     size_t room)
537 {
538 	int		pri;
539 	int		len;
540 	const wchar_t	*t;
541 	wchar_t		*tr = NULL;
542 	int		direc;
543 	int		pass;
544 	const int32_t 	*state;
545 	size_t		want = 0;
546 	size_t		need = 0;
547 	int		b;
548 	uint8_t		buf[XFRM_BYTES];
549 	int		ndir = table->info->directive_count;
550 
551 	assert(src);
552 
553 	for (pass = 0; pass <= ndir; pass++) {
554 
555 		state = NULL;
556 
557 		if (pass != 0) {
558 			/* insert level separator from the previous pass */
559 			if (room) {
560 				*xf++ = XFRM_SEP;
561 				room--;
562 			}
563 			want++;
564 		}
565 
566 		/* special pass for undefined */
567 		if (pass == ndir) {
568 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
569 		} else {
570 			direc = table->info->directive[pass];
571 		}
572 
573 		t = src;
574 
575 		if (direc & DIRECTIVE_BACKWARD) {
576 			wchar_t *bp, *fp, c;
577 			free(tr);
578 			if ((tr = wcsdup(t)) == NULL) {
579 				errno = ENOMEM;
580 				goto fail;
581 			}
582 			bp = tr;
583 			fp = tr + wcslen(tr) - 1;
584 			while (bp < fp) {
585 				c = *bp;
586 				*bp++ = *fp;
587 				*fp-- = c;
588 			}
589 			t = (const wchar_t *)tr;
590 		}
591 
592 		if (direc & DIRECTIVE_POSITION) {
593 			while (*t || state) {
594 
595 				_collate_lookup(table, t, &len, &pri, pass, &state);
596 				t += len;
597 				if (pri <= 0) {
598 					if (pri < 0) {
599 						errno = EINVAL;
600 						goto fail;
601 					}
602 					state = NULL;
603 					pri = COLLATE_MAX_PRIORITY;
604 				}
605 
606 				b = xfrm(table, buf, pri, pass);
607 				want += b;
608 				if (room) {
609 					while (b) {
610 						b--;
611 						if (room) {
612 							*xf++ = buf[b];
613 							room--;
614 						}
615 					}
616 				}
617 				need = want;
618 			}
619 		} else {
620 			while (*t || state) {
621 				_collate_lookup(table, t, &len, &pri, pass, &state);
622 				t += len;
623 				if (pri <= 0) {
624 					if (pri < 0) {
625 						errno = EINVAL;
626 						goto fail;
627 					}
628 					state = NULL;
629 					continue;
630 				}
631 
632 				b = xfrm(table, buf, pri, pass);
633 				want += b;
634 				if (room) {
635 
636 					while (b) {
637 						b--;
638 						if (room) {
639 							*xf++ = buf[b];
640 							room--;
641 						}
642 					}
643 				}
644 				need = want;
645 			}
646 		}
647 	}
648 	free(tr);
649 	return (need);
650 
651 fail:
652 	free(tr);
653 	return ((size_t)(-1));
654 }
655 
656 /*
657  * __collate_equiv_value returns the primary collation value for the given
658  * collating symbol specified by str and len.  Zero or negative is returned
659  * if the collating symbol was not found.  This function is used by bracket
660  * code in the TRE regex library.
661  */
662 int
663 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
664 {
665 	int32_t e;
666 
667 	if (len < 1 || len >= COLLATE_STR_LEN)
668 		return (-1);
669 
670 	FIX_LOCALE(locale);
671 	struct xlocale_collate *table =
672 		(struct xlocale_collate*)locale->components[XLC_COLLATE];
673 
674 	if (table->__collate_load_error)
675 		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
676 
677 	if (len == 1) {
678 		e = -1;
679 		if (*str <= UCHAR_MAX)
680 			e = table->char_pri_table[*str].pri[0];
681 		else if (table->info->large_count > 0) {
682 			collate_large_t *match_large;
683 			match_large = largesearch(table, *str);
684 			if (match_large)
685 				e = match_large->pri.pri[0];
686 		}
687 		if (e == 0)
688 			return (1);
689 		return (e > 0 ? e : 0);
690 	}
691 	if (table->info->chain_count > 0) {
692 		wchar_t name[COLLATE_STR_LEN];
693 		collate_chain_t *match_chain;
694 		int clen;
695 
696 		wcsncpy (name, str, len);
697 		name[len] = 0;
698 		match_chain = chainsearch(table, name, &clen);
699 		if (match_chain) {
700 			e = match_chain->pri[0];
701 			if (e == 0)
702 				return (1);
703 			return (e < 0 ? -e : e);
704 		}
705 	}
706 	return (0);
707 }
708