xref: /illumos-gate/usr/src/lib/libc/port/locale/collate.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "lint.h"
31 #include "file64.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <wchar.h>
37 #include <errno.h>
38 #include <unistd.h>
39 #include <ctype.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <assert.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include "collate.h"
47 #include "localeimpl.h"
48 
49 /* Check file format vs libc runtime. (See collatefile.h) */
50 #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX
51 #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX"
52 #endif
53 
54 /*
55  * See the comments in usr/src/cmd/localedef/collate.c for further
56  * information.  It would also be very helpful to have a copy of the
57  * POSIX standard for collation (in the locale format manual page)
58  * handy (www.opengroup.org).
59  */
60 
61 /*
62  * POSIX uses empty tables and falls down to strcmp.
63  */
64 struct lc_collate lc_collate_posix = {
65 	.lc_is_posix = 1,
66 };
67 
68 struct locdata __posix_collate_locdata = {
69 	.l_lname = "C",
70 	.l_data = { &lc_collate_posix }
71 };
72 
73 
74 struct locdata *
75 __lc_collate_load(const char *locname)
76 {
77 	int i, chains, z;
78 	char buf[PATH_MAX];
79 	char *TMP;
80 	char *map;
81 	collate_info_t *info;
82 	struct stat sbuf;
83 	int fd;
84 	struct locdata *ldata;
85 	struct lc_collate *lcc;
86 
87 	/*
88 	 * Slurp the locale file into the cache.
89 	 */
90 
91 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
92 	    _PathLocale, locname);
93 
94 	if ((fd = open(buf, O_RDONLY)) < 0) {
95 		errno = EINVAL;
96 		return (NULL);
97 	}
98 	if (fstat(fd, &sbuf) < 0) {
99 		(void) close(fd);
100 		errno = EINVAL;
101 		return (NULL);
102 	}
103 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
104 		(void) close(fd);
105 		errno = EINVAL;
106 		return (NULL);
107 	}
108 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
109 	(void) close(fd);
110 	if ((TMP = map) == NULL) {
111 		errno = EINVAL;
112 		return (NULL);
113 	}
114 
115 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
116 		(void) munmap(map, sbuf.st_size);
117 		errno = EINVAL;
118 		return (NULL);
119 	}
120 	TMP += COLLATE_STR_LEN;
121 
122 	info = (void *)TMP;
123 	TMP += sizeof (*info);
124 
125 	if ((info->directive_count < 1) ||
126 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
127 	    ((chains = info->chain_count) < 0)) {
128 		(void) munmap(map, sbuf.st_size);
129 		errno = EINVAL;
130 		return (NULL);
131 	}
132 
133 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
134 	    (sizeof (collate_chain_t) * chains) +
135 	    (sizeof (collate_large_t) * info->large_count);
136 	for (z = 0; z < info->directive_count; z++) {
137 		i += sizeof (collate_subst_t) * info->subst_count[z];
138 	}
139 	if (i != (sbuf.st_size - (TMP - map))) {
140 		(void) munmap(map, sbuf.st_size);
141 		errno = EINVAL;
142 		return (NULL);
143 	}
144 
145 
146 	if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
147 		(void) munmap(map, sbuf.st_size);
148 		return (NULL);
149 	}
150 	lcc = ldata->l_data[0];
151 	ldata->l_map = map;
152 	ldata->l_map_len = sbuf.st_size;
153 
154 	lcc->lc_info = info;
155 	lcc->lc_directive_count = info->directive_count;
156 	lcc->lc_large_count = info->large_count;
157 
158 	for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
159 		lcc->lc_directive[z] = info->directive[z];
160 		lcc->lc_subst_count[z] = info->subst_count[z];
161 		lcc->lc_pri_count[z] = info->pri_count[z];
162 		lcc->lc_undef_pri[z] = info->undef_pri[z];
163 	}
164 
165 	lcc->lc_char_table = (void *)TMP;
166 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
167 
168 	for (z = 0; z < lcc->lc_directive_count; z++) {
169 		int count;
170 		if ((count = lcc->lc_subst_count[z]) > 0) {
171 			lcc->lc_subst_table[z] = (void *)TMP;
172 			TMP += count * sizeof (collate_subst_t);
173 		} else {
174 			lcc->lc_subst_table[z] = NULL;
175 		}
176 	}
177 
178 	if (chains > 0) {
179 		lcc->lc_chain_table = (void *)TMP;
180 		TMP += chains * sizeof (collate_chain_t);
181 	} else
182 		lcc->lc_chain_table = NULL;
183 	lcc->lc_chain_count = chains;
184 	if (lcc->lc_large_count > 0)
185 		lcc->lc_large_table = (void *)TMP;
186 	else
187 		lcc->lc_large_table = NULL;
188 
189 	return (ldata);
190 }
191 
192 static const int32_t *
193 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
194 {
195 	const collate_subst_t *p;
196 	int n = lcc->lc_subst_count[pass];
197 
198 	if (n == 0)
199 		return (NULL);
200 
201 	if (pass >= lcc->lc_directive_count)
202 		return (NULL);
203 
204 	if (!(key & COLLATE_SUBST_PRIORITY))
205 		return (NULL);
206 
207 	p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
208 	assert(p->key == key);
209 	return (p->pri);
210 }
211 
212 static collate_chain_t *
213 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
214 {
215 	int low = 0;
216 	int high = lcc->lc_info->chain_count - 1;
217 	int next, compar, l;
218 	collate_chain_t *p;
219 	collate_chain_t *tab = lcc->lc_chain_table;
220 
221 	if (high < 0)
222 		return (NULL);
223 
224 	while (low <= high) {
225 		next = (low + high) / 2;
226 		p = tab + next;
227 		compar = *key - *p->str;
228 		if (compar == 0) {
229 			l = wcsnlen(p->str, COLLATE_STR_LEN);
230 			compar = wcsncmp(key, p->str, l);
231 			if (compar == 0) {
232 				*len = l;
233 				return (p);
234 			}
235 		}
236 		if (compar > 0)
237 			low = next + 1;
238 		else
239 			high = next - 1;
240 	}
241 	return (NULL);
242 }
243 
244 static collate_large_t *
245 largesearch(const struct lc_collate *lcc, const wchar_t key)
246 {
247 	int low = 0;
248 	int high = lcc->lc_info->large_count - 1;
249 	int next, compar;
250 	collate_large_t *p;
251 	collate_large_t *tab = lcc->lc_large_table;
252 
253 	if (high < 0)
254 		return (NULL);
255 
256 	while (low <= high) {
257 		next = (low + high) / 2;
258 		p = tab + next;
259 		compar = key - p->val;
260 		if (compar == 0)
261 			return (p);
262 		if (compar > 0)
263 			low = next + 1;
264 		else
265 			high = next - 1;
266 	}
267 	return (NULL);
268 }
269 
270 void
271 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
272     int *len, int *pri, int which, const int **state)
273 {
274 	collate_chain_t *p2;
275 	collate_large_t *match;
276 	int p, l;
277 	const int *sptr;
278 
279 	/*
280 	 * If this is the "last" pass for the UNDEFINED, then
281 	 * we just return the priority itself.
282 	 */
283 	if (which >= lcc->lc_directive_count) {
284 		*pri = *t;
285 		*len = 1;
286 		*state = NULL;
287 		return;
288 	}
289 
290 	/*
291 	 * If we have remaining substitution data from a previous
292 	 * call, consume it first.
293 	 */
294 	if ((sptr = *state) != NULL) {
295 		*pri = *sptr;
296 		sptr++;
297 		if ((sptr == *state) || (sptr == NULL))
298 			*state = NULL;
299 		else
300 			*state = sptr;
301 		*len = 0;
302 		return;
303 	}
304 
305 	/* No active substitutions */
306 	*len = 1;
307 
308 	/*
309 	 * Check for composites such as dipthongs that collate as a
310 	 * single element (aka chains or collating-elements).
311 	 */
312 	if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
313 	    ((p = p2->pri[which]) >= 0)) {
314 
315 		*len = l;
316 		*pri = p;
317 
318 	} else if (*t <= UCHAR_MAX) {
319 
320 		/*
321 		 * Character is a small (8-bit) character.
322 		 * We just look these up directly for speed.
323 		 */
324 		*pri = lcc->lc_char_table[*t].pri[which];
325 
326 	} else if ((lcc->lc_info->large_count > 0) &&
327 	    ((match = largesearch(lcc, *t)) != NULL)) {
328 
329 		/*
330 		 * Character was found in the extended table.
331 		 */
332 		*pri = match->pri.pri[which];
333 
334 	} else {
335 		/*
336 		 * Character lacks a specific definition.
337 		 */
338 		if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
339 			/* Mask off sign bit to prevent ordering confusion. */
340 			*pri = (*t & COLLATE_MAX_PRIORITY);
341 		} else {
342 			*pri = lcc->lc_undef_pri[which];
343 		}
344 		/* No substitutions for undefined characters! */
345 		return;
346 	}
347 
348 	/*
349 	 * Try substituting (expanding) the character.  We are
350 	 * currently doing this *after* the chain compression.  I
351 	 * think it should not matter, but this way might be slightly
352 	 * faster.
353 	 *
354 	 * We do this after the priority search, as this will help us
355 	 * to identify a single key value.  In order for this to work,
356 	 * its important that the priority assigned to a given element
357 	 * to be substituted be unique for that level.  The localedef
358 	 * code ensures this for us.
359 	 */
360 	if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
361 		if ((*pri = *sptr) > 0) {
362 			sptr++;
363 			*state = *sptr ? sptr : NULL;
364 		}
365 	}
366 
367 }
368 
369 /*
370  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
371  * NOT NULL terminate.  That is left to the caller.
372  */
373 size_t
374 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
375     size_t room)
376 {
377 	int		pri;
378 	int		len;
379 	const wchar_t	*t;
380 	wchar_t		*tr = NULL;
381 	int		direc;
382 	int		pass;
383 	const int32_t 	*state;
384 	size_t		want = 0;
385 	size_t		need = 0;
386 	int		ndir = lcc->lc_directive_count;
387 
388 	assert(src);
389 
390 	for (pass = 0; pass <= ndir; pass++) {
391 
392 		state = NULL;
393 
394 		if (pass != 0) {
395 			/* insert level separator from the previous pass */
396 			if (room) {
397 				*xf++ = 1;
398 				room--;
399 			}
400 			want++;
401 		}
402 
403 		/* special pass for undefined */
404 		if (pass == ndir) {
405 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
406 		} else {
407 			direc = lcc->lc_directive[pass];
408 		}
409 
410 		t = src;
411 
412 		if (direc & DIRECTIVE_BACKWARD) {
413 			wchar_t *bp, *fp, c;
414 			if (tr)
415 				free(tr);
416 			if ((tr = wcsdup(t)) == NULL) {
417 				errno = ENOMEM;
418 				goto fail;
419 			}
420 			bp = tr;
421 			fp = tr + wcslen(tr) - 1;
422 			while (bp < fp) {
423 				c = *bp;
424 				*bp++ = *fp;
425 				*fp-- = c;
426 			}
427 			t = (const wchar_t *)tr;
428 		}
429 
430 		if (direc & DIRECTIVE_POSITION) {
431 			while (*t || state) {
432 				_collate_lookup(lcc, t, &len, &pri, pass,
433 				    &state);
434 				t += len;
435 				if (pri <= 0) {
436 					if (pri < 0) {
437 						errno = EINVAL;
438 						goto fail;
439 					}
440 					state = NULL;
441 					pri = COLLATE_MAX_PRIORITY;
442 				}
443 				if (room) {
444 					*xf++ = pri;
445 					room--;
446 				}
447 				want++;
448 				need = want;
449 			}
450 		} else {
451 			while (*t || state) {
452 				_collate_lookup(lcc, t, &len, &pri, pass,
453 				    &state);
454 				t += len;
455 				if (pri <= 0) {
456 					if (pri < 0) {
457 						errno = EINVAL;
458 						goto fail;
459 					}
460 					state = NULL;
461 					continue;
462 				}
463 				if (room) {
464 					*xf++ = pri;
465 					room--;
466 				}
467 				want++;
468 				need = want;
469 			}
470 		}
471 	}
472 
473 end:
474 	if (tr)
475 		free(tr);
476 	return (need);
477 
478 fail:
479 	if (tr)
480 		free(tr);
481 	return ((size_t)(-1));
482 }
483 
484 /*
485  * In the non-POSIX case, we transform each character into a string of
486  * characters representing the character's priority.  Since char is usually
487  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
488  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
489  * bits per byte.
490  *
491  * It turns out that we sometimes have real priorities that are
492  * 31-bits wide.  (But: be careful using priorities where the high
493  * order bit is set -- i.e. the priority is negative.  The sort order
494  * may be surprising!)
495  *
496  * TODO: This would be a good area to optimize somewhat.  It turns out
497  * that real prioririties *except for the last UNDEFINED pass* are generally
498  * very small.  We need the localedef code to precalculate the max
499  * priority for us, and ideally also give us a mask, and then we could
500  * severely limit what we expand to.
501  */
502 #define	XFRM_BYTES	6
503 #define	XFRM_OFFSET	('0')	/* make all printable characters */
504 #define	XFRM_SHIFT	6
505 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
506 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
507 
508 static int
509 xfrm(locale_t loc, unsigned char *p, int pri, int pass)
510 {
511 	/* we use unsigned to ensure zero fill on right shift */
512 	uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
513 	int nc = 0;
514 
515 	while (val) {
516 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
517 		pri >>= XFRM_SHIFT;
518 		val >>= XFRM_SHIFT;
519 		p++;
520 		nc++;
521 	}
522 	return (nc);
523 }
524 
525 size_t
526 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
527 {
528 	int		pri;
529 	int		len;
530 	const wchar_t	*t;
531 	wchar_t		*tr = NULL;
532 	int		direc;
533 	int		pass;
534 	const int32_t 	*state;
535 	size_t		want = 0;
536 	size_t		need = 0;
537 	int		b;
538 	uint8_t		buf[XFRM_BYTES];
539 	const struct lc_collate *lcc = loc->collate;
540 	int		ndir = lcc->lc_directive_count;
541 
542 	assert(src);
543 
544 	for (pass = 0; pass <= ndir; pass++) {
545 
546 		state = NULL;
547 
548 		if (pass != 0) {
549 			/* insert level separator from the previous pass */
550 			if (room) {
551 				*xf++ = XFRM_SEP;
552 				room--;
553 			}
554 			want++;
555 		}
556 
557 		/* special pass for undefined */
558 		if (pass == ndir) {
559 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
560 		} else {
561 			direc = lcc->lc_directive[pass];
562 		}
563 
564 		t = src;
565 
566 		if (direc & DIRECTIVE_BACKWARD) {
567 			wchar_t *bp, *fp, c;
568 			if (tr)
569 				free(tr);
570 			if ((tr = wcsdup(t)) == NULL) {
571 				errno = ENOMEM;
572 				goto fail;
573 			}
574 			bp = tr;
575 			fp = tr + wcslen(tr) - 1;
576 			while (bp < fp) {
577 				c = *bp;
578 				*bp++ = *fp;
579 				*fp-- = c;
580 			}
581 			t = (const wchar_t *)tr;
582 		}
583 
584 		if (direc & DIRECTIVE_POSITION) {
585 			while (*t || state) {
586 
587 				_collate_lookup(lcc, t, &len, &pri, pass,
588 				    &state);
589 				t += len;
590 				if (pri <= 0) {
591 					if (pri < 0) {
592 						errno = EINVAL;
593 						goto fail;
594 					}
595 					state = NULL;
596 					pri = COLLATE_MAX_PRIORITY;
597 				}
598 
599 				b = xfrm(loc, buf, pri, pass);
600 				want += b;
601 				if (room) {
602 					while (b) {
603 						b--;
604 						if (room) {
605 							*xf++ = buf[b];
606 							room--;
607 						}
608 					}
609 				}
610 				need = want;
611 			}
612 		} else {
613 			while (*t || state) {
614 				_collate_lookup(lcc, t, &len, &pri, pass,
615 				    &state);
616 				t += len;
617 				if (pri <= 0) {
618 					if (pri < 0) {
619 						errno = EINVAL;
620 						goto fail;
621 					}
622 					state = NULL;
623 					continue;
624 				}
625 
626 				b = xfrm(loc, buf, pri, pass);
627 				want += b;
628 				if (room) {
629 
630 					while (b) {
631 						b--;
632 						if (room) {
633 							*xf++ = buf[b];
634 							room--;
635 						}
636 					}
637 				}
638 				need = want;
639 			}
640 		}
641 	}
642 
643 end:
644 	if (tr)
645 		free(tr);
646 	return (need);
647 
648 fail:
649 	if (tr)
650 		free(tr);
651 	return ((size_t)(-1));
652 }
653