xref: /illumos-gate/usr/src/lib/libc/port/locale/collate.c (revision 3a005aada8ac0e291c13cbc488ba9ae1473f0a96)
1 /*
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "lint.h"
31 #include "file64.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <wchar.h>
37 #include <errno.h>
38 #include <unistd.h>
39 #include <ctype.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <assert.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include "collate.h"
47 #include "localeimpl.h"
48 
49 /* Check file format vs libc runtime. (See collatefile.h) */
50 #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX
51 #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX"
52 #endif
53 
54 /*
55  * See the comments in usr/src/cmd/localedef/collate.c for further
56  * information.  It would also be very helpful to have a copy of the
57  * POSIX standard for collation (in the locale format manual page)
58  * handy (www.opengroup.org).
59  */
60 
61 /*
62  * POSIX uses empty tables and falls down to strcmp.
63  */
64 struct lc_collate lc_collate_posix = {
65 	.lc_is_posix = 1,
66 };
67 
68 struct locdata __posix_collate_locdata = {
69 	.l_lname = "C",
70 	.l_data = { &lc_collate_posix }
71 };
72 
73 
74 struct locdata *
75 __lc_collate_load(const char *locname)
76 {
77 	int i, chains, z;
78 	char buf[PATH_MAX];
79 	char *TMP;
80 	char *map;
81 	collate_info_t *info;
82 	struct stat sbuf;
83 	int fd;
84 	struct locdata *ldata;
85 	struct lc_collate *lcc;
86 
87 	/*
88 	 * Slurp the locale file into the cache.
89 	 */
90 
91 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
92 	    _PathLocale, locname);
93 
94 	if ((fd = open(buf, O_RDONLY)) < 0) {
95 		errno = EINVAL;
96 		return (NULL);
97 	}
98 	if (fstat(fd, &sbuf) < 0) {
99 		(void) close(fd);
100 		errno = EINVAL;
101 		return (NULL);
102 	}
103 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
104 		(void) close(fd);
105 		errno = EINVAL;
106 		return (NULL);
107 	}
108 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
109 	(void) close(fd);
110 	if ((TMP = map) == NULL) {
111 		errno = EINVAL;
112 		return (NULL);
113 	}
114 
115 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
116 		(void) munmap(map, sbuf.st_size);
117 		errno = EINVAL;
118 		return (NULL);
119 	}
120 	TMP += COLLATE_STR_LEN;
121 
122 	info = (void *)TMP;
123 	TMP += sizeof (*info);
124 
125 	if ((info->directive_count < 1) ||
126 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
127 	    ((chains = info->chain_count) < 0)) {
128 		(void) munmap(map, sbuf.st_size);
129 		errno = EINVAL;
130 		return (NULL);
131 	}
132 
133 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
134 	    (sizeof (collate_chain_t) * chains) +
135 	    (sizeof (collate_large_t) * info->large_count);
136 	for (z = 0; z < info->directive_count; z++) {
137 		i += sizeof (collate_subst_t) * info->subst_count[z];
138 	}
139 	if (i != (sbuf.st_size - (TMP - map))) {
140 		(void) munmap(map, sbuf.st_size);
141 		errno = EINVAL;
142 		return (NULL);
143 	}
144 
145 
146 	if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
147 		(void) munmap(map, sbuf.st_size);
148 		return (NULL);
149 	}
150 	lcc = ldata->l_data[0];
151 	ldata->l_map = map;
152 	ldata->l_map_len = sbuf.st_size;
153 
154 	lcc->lc_info = info;
155 	lcc->lc_directive_count = info->directive_count;
156 	lcc->lc_large_count = info->large_count;
157 
158 	for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
159 		lcc->lc_directive[z] = info->directive[z];
160 		lcc->lc_subst_count[z] = info->subst_count[z];
161 		lcc->lc_pri_count[z] = info->pri_count[z];
162 		lcc->lc_undef_pri[z] = info->undef_pri[z];
163 	}
164 
165 	lcc->lc_char_table = (void *)TMP;
166 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
167 
168 	for (z = 0; z < lcc->lc_directive_count; z++) {
169 		int count;
170 		if ((count = lcc->lc_subst_count[z]) > 0) {
171 			lcc->lc_subst_table[z] = (void *)TMP;
172 			TMP += count * sizeof (collate_subst_t);
173 		} else {
174 			lcc->lc_subst_table[z] = NULL;
175 		}
176 	}
177 
178 	if (chains > 0) {
179 		lcc->lc_chain_table = (void *)TMP;
180 		TMP += chains * sizeof (collate_chain_t);
181 	} else
182 		lcc->lc_chain_table = NULL;
183 	lcc->lc_chain_count = chains;
184 	if (lcc->lc_large_count > 0)
185 		lcc->lc_large_table = (void *)TMP;
186 	else
187 		lcc->lc_large_table = NULL;
188 
189 	return (ldata);
190 }
191 
192 static const int32_t *
193 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
194 {
195 	const collate_subst_t *p;
196 	int n = lcc->lc_subst_count[pass];
197 
198 	if (n == 0)
199 		return (NULL);
200 
201 	if (pass >= lcc->lc_directive_count)
202 		return (NULL);
203 
204 	if (!(key & COLLATE_SUBST_PRIORITY))
205 		return (NULL);
206 
207 	p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
208 	assert(p->key == key);
209 	return (p->pri);
210 }
211 
212 /*
213  * Note: for performance reasons, we have expanded bsearch here.  This avoids
214  * function call overhead with each comparison.
215  */
216 
217 static collate_chain_t *
218 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
219 {
220 	int low;
221 	int high;
222 	int next, compar, l;
223 	collate_chain_t *p;
224 	collate_chain_t *tab;
225 
226 	if (lcc->lc_info->chain_count == 0)
227 		return (NULL);
228 
229 	low = 0;
230 	high = lcc->lc_info->chain_count - 1;
231 	tab = lcc->lc_chain_table;
232 
233 	while (low <= high) {
234 		next = (low + high) / 2;
235 		p = tab + next;
236 		compar = *key - *p->str;
237 		if (compar == 0) {
238 			l = wcsnlen(p->str, COLLATE_STR_LEN);
239 			compar = wcsncmp(key, p->str, l);
240 			if (compar == 0) {
241 				*len = l;
242 				return (p);
243 			}
244 		}
245 		if (compar > 0)
246 			low = next + 1;
247 		else
248 			high = next - 1;
249 	}
250 	return (NULL);
251 }
252 
253 static collate_large_t *
254 largesearch(const struct lc_collate *lcc, const wchar_t key)
255 {
256 	int low = 0;
257 	int high = lcc->lc_info->large_count - 1;
258 	int next, compar;
259 	collate_large_t *p;
260 	collate_large_t *tab = lcc->lc_large_table;
261 
262 	if (lcc->lc_info->large_count == 0)
263 		return (NULL);
264 
265 	while (low <= high) {
266 		next = (low + high) / 2;
267 		p = tab + next;
268 		compar = key - p->val;
269 		if (compar == 0)
270 			return (p);
271 		if (compar > 0)
272 			low = next + 1;
273 		else
274 			high = next - 1;
275 	}
276 	return (NULL);
277 }
278 
279 void
280 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
281     int *len, int *pri, int which, const int **state)
282 {
283 	collate_chain_t *p2;
284 	collate_large_t *match;
285 	int p, l;
286 	const int *sptr;
287 
288 	/*
289 	 * If this is the "last" pass for the UNDEFINED, then
290 	 * we just return the priority itself.
291 	 */
292 	if (which >= lcc->lc_directive_count) {
293 		*pri = *t;
294 		*len = 1;
295 		*state = NULL;
296 		return;
297 	}
298 
299 	/*
300 	 * If we have remaining substitution data from a previous
301 	 * call, consume it first.
302 	 */
303 	if ((sptr = *state) != NULL) {
304 		*pri = *sptr;
305 		sptr++;
306 		*state = *sptr ? sptr : NULL;
307 		*len = 0;
308 		return;
309 	}
310 
311 	/* No active substitutions */
312 	*len = 1;
313 
314 	/*
315 	 * Check for composites such as dipthongs that collate as a
316 	 * single element (aka chains or collating-elements).
317 	 */
318 	if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
319 	    ((p = p2->pri[which]) >= 0)) {
320 
321 		*len = l;
322 		*pri = p;
323 
324 	} else if (*t <= UCHAR_MAX) {
325 
326 		/*
327 		 * Character is a small (8-bit) character.
328 		 * We just look these up directly for speed.
329 		 */
330 		*pri = lcc->lc_char_table[*t].pri[which];
331 
332 	} else if ((lcc->lc_info->large_count > 0) &&
333 	    ((match = largesearch(lcc, *t)) != NULL)) {
334 
335 		/*
336 		 * Character was found in the extended table.
337 		 */
338 		*pri = match->pri.pri[which];
339 
340 	} else {
341 		/*
342 		 * Character lacks a specific definition.
343 		 */
344 		if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
345 			/* Mask off sign bit to prevent ordering confusion. */
346 			*pri = (*t & COLLATE_MAX_PRIORITY);
347 		} else {
348 			*pri = lcc->lc_undef_pri[which];
349 		}
350 		/* No substitutions for undefined characters! */
351 		return;
352 	}
353 
354 	/*
355 	 * Try substituting (expanding) the character.  We are
356 	 * currently doing this *after* the chain compression.  I
357 	 * think it should not matter, but this way might be slightly
358 	 * faster.
359 	 *
360 	 * We do this after the priority search, as this will help us
361 	 * to identify a single key value.  In order for this to work,
362 	 * its important that the priority assigned to a given element
363 	 * to be substituted be unique for that level.  The localedef
364 	 * code ensures this for us.
365 	 */
366 	if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
367 		if ((*pri = *sptr) != 0) {
368 			sptr++;
369 			*state = *sptr ? sptr : NULL;
370 		}
371 	}
372 
373 }
374 
375 /*
376  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
377  * NOT NULL terminate.  That is left to the caller.
378  */
379 size_t
380 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
381     size_t room)
382 {
383 	int		pri;
384 	int		len;
385 	const wchar_t	*t;
386 	wchar_t		*tr = NULL;
387 	int		direc;
388 	int		pass;
389 	const int32_t 	*state;
390 	size_t		want = 0;
391 	size_t		need = 0;
392 	int		ndir = lcc->lc_directive_count;
393 
394 	assert(src);
395 
396 	for (pass = 0; pass <= ndir; pass++) {
397 
398 		state = NULL;
399 
400 		if (pass != 0) {
401 			/* insert level separator from the previous pass */
402 			if (room) {
403 				*xf++ = 1;
404 				room--;
405 			}
406 			want++;
407 		}
408 
409 		/* special pass for undefined */
410 		if (pass == ndir) {
411 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
412 		} else {
413 			direc = lcc->lc_directive[pass];
414 		}
415 
416 		t = src;
417 
418 		if (direc & DIRECTIVE_BACKWARD) {
419 			wchar_t *bp, *fp, c;
420 			if (tr)
421 				free(tr);
422 			if ((tr = wcsdup(t)) == NULL) {
423 				errno = ENOMEM;
424 				goto fail;
425 			}
426 			bp = tr;
427 			fp = tr + wcslen(tr) - 1;
428 			while (bp < fp) {
429 				c = *bp;
430 				*bp++ = *fp;
431 				*fp-- = c;
432 			}
433 			t = (const wchar_t *)tr;
434 		}
435 
436 		if (direc & DIRECTIVE_POSITION) {
437 			while (*t || state) {
438 				_collate_lookup(lcc, t, &len, &pri, pass,
439 				    &state);
440 				t += len;
441 				if (pri <= 0) {
442 					if (pri < 0) {
443 						errno = EINVAL;
444 						goto fail;
445 					}
446 					pri = COLLATE_MAX_PRIORITY;
447 				}
448 				if (room) {
449 					*xf++ = pri;
450 					room--;
451 				}
452 				want++;
453 				need = want;
454 			}
455 		} else {
456 			while (*t || state) {
457 				_collate_lookup(lcc, t, &len, &pri, pass,
458 				    &state);
459 				t += len;
460 				if (pri <= 0) {
461 					if (pri < 0) {
462 						errno = EINVAL;
463 						goto fail;
464 					}
465 					continue;
466 				}
467 				if (room) {
468 					*xf++ = pri;
469 					room--;
470 				}
471 				want++;
472 				need = want;
473 			}
474 		}
475 	}
476 
477 end:
478 	if (tr)
479 		free(tr);
480 	return (need);
481 
482 fail:
483 	if (tr)
484 		free(tr);
485 	return ((size_t)(-1));
486 }
487 
488 /*
489  * In the non-POSIX case, we transform each character into a string of
490  * characters representing the character's priority.  Since char is usually
491  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
492  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
493  * bits per byte.
494  *
495  * It turns out that we sometimes have real priorities that are
496  * 31-bits wide.  (But: be careful using priorities where the high
497  * order bit is set -- i.e. the priority is negative.  The sort order
498  * may be surprising!)
499  *
500  * TODO: This would be a good area to optimize somewhat.  It turns out
501  * that real prioririties *except for the last UNDEFINED pass* are generally
502  * very small.  We need the localedef code to precalculate the max
503  * priority for us, and ideally also give us a mask, and then we could
504  * severely limit what we expand to.
505  */
506 #define	XFRM_BYTES	6
507 #define	XFRM_OFFSET	('0')	/* make all printable characters */
508 #define	XFRM_SHIFT	6
509 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
510 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
511 
512 static int
513 xfrm(locale_t loc, unsigned char *p, int pri, int pass)
514 {
515 	/* we use unsigned to ensure zero fill on right shift */
516 	uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
517 	int nc = 0;
518 
519 	while (val) {
520 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
521 		pri >>= XFRM_SHIFT;
522 		val >>= XFRM_SHIFT;
523 		p++;
524 		nc++;
525 	}
526 	return (nc);
527 }
528 
529 size_t
530 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
531 {
532 	int		pri;
533 	int		len;
534 	const wchar_t	*t;
535 	wchar_t		*tr = NULL;
536 	int		direc;
537 	int		pass;
538 	const int32_t 	*state;
539 	size_t		want = 0;
540 	size_t		need = 0;
541 	int		b;
542 	uint8_t		buf[XFRM_BYTES];
543 	const struct lc_collate *lcc = loc->collate;
544 	int		ndir = lcc->lc_directive_count;
545 
546 	assert(src);
547 
548 	for (pass = 0; pass <= ndir; pass++) {
549 
550 		state = NULL;
551 
552 		if (pass != 0) {
553 			/* insert level separator from the previous pass */
554 			if (room) {
555 				*xf++ = XFRM_SEP;
556 				room--;
557 			}
558 			want++;
559 		}
560 
561 		/* special pass for undefined */
562 		if (pass == ndir) {
563 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
564 		} else {
565 			direc = lcc->lc_directive[pass];
566 		}
567 
568 		t = src;
569 
570 		if (direc & DIRECTIVE_BACKWARD) {
571 			wchar_t *bp, *fp, c;
572 			if (tr)
573 				free(tr);
574 			if ((tr = wcsdup(t)) == NULL) {
575 				errno = ENOMEM;
576 				goto fail;
577 			}
578 			bp = tr;
579 			fp = tr + wcslen(tr) - 1;
580 			while (bp < fp) {
581 				c = *bp;
582 				*bp++ = *fp;
583 				*fp-- = c;
584 			}
585 			t = (const wchar_t *)tr;
586 		}
587 
588 		if (direc & DIRECTIVE_POSITION) {
589 			while (*t || state) {
590 
591 				_collate_lookup(lcc, t, &len, &pri, pass,
592 				    &state);
593 				t += len;
594 				if (pri <= 0) {
595 					if (pri < 0) {
596 						errno = EINVAL;
597 						goto fail;
598 					}
599 					pri = COLLATE_MAX_PRIORITY;
600 				}
601 
602 				b = xfrm(loc, buf, pri, pass);
603 				want += b;
604 				if (room) {
605 					while (b) {
606 						b--;
607 						if (room) {
608 							*xf++ = buf[b];
609 							room--;
610 						}
611 					}
612 				}
613 				need = want;
614 			}
615 		} else {
616 			while (*t || state) {
617 				_collate_lookup(lcc, t, &len, &pri, pass,
618 				    &state);
619 				t += len;
620 				if (pri <= 0) {
621 					if (pri < 0) {
622 						errno = EINVAL;
623 						goto fail;
624 					}
625 					continue;
626 				}
627 
628 				b = xfrm(loc, buf, pri, pass);
629 				want += b;
630 				if (room) {
631 
632 					while (b) {
633 						b--;
634 						if (room) {
635 							*xf++ = buf[b];
636 							room--;
637 						}
638 					}
639 				}
640 				need = want;
641 			}
642 		}
643 	}
644 
645 end:
646 	if (tr)
647 		free(tr);
648 	return (need);
649 
650 fail:
651 	if (tr)
652 		free(tr);
653 	return ((size_t)(-1));
654 }
655