xref: /illumos-gate/usr/src/lib/libc/port/locale/collate.c (revision bafd1f1462c49949e0251d74b4fbfa24d29bc79a)
1 /*
2  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5  *		at Electronni Visti IA, Kiev, Ukraine.
6  *			All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "lint.h"
31 #include "file64.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <wchar.h>
37 #include <errno.h>
38 #include <unistd.h>
39 #include <ctype.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <assert.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45 
46 #include "collate.h"
47 #include "setlocale.h"
48 
49 /*
50  * See the comments in usr/src/cmd/localedef/collate.c for further
51  * information.  It would also be very helpful to have a copy of the
52  * POSIX standard for collation (in the locale format manual page)
53  * handy (www.opengroup.org).
54  */
55 
56 /*
57  * POSIX uses empty tables and falls down to strcmp.
58  */
59 struct lc_collate lc_collate_posix = {
60 	.lc_is_posix = 1,
61 };
62 
63 struct locdata __posix_collate_locdata = {
64 	.l_lname = "C",
65 	.l_data = { &lc_collate_posix }
66 };
67 
68 
69 struct locdata *
70 __lc_collate_load(const char *locname)
71 {
72 	int i, chains, z;
73 	char buf[PATH_MAX];
74 	char *TMP;
75 	char *map;
76 	collate_info_t *info;
77 	struct stat sbuf;
78 	int fd;
79 	struct locdata *ldata;
80 	struct lc_collate *lcc;
81 
82 	/*
83 	 * Slurp the locale file into the cache.
84 	 */
85 
86 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
87 	    _PathLocale, locname);
88 
89 	if ((fd = open(buf, O_RDONLY)) < 0) {
90 		errno = EINVAL;
91 		return (NULL);
92 	}
93 	if (fstat(fd, &sbuf) < 0) {
94 		(void) close(fd);
95 		errno = EINVAL;
96 		return (NULL);
97 	}
98 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
99 		(void) close(fd);
100 		errno = EINVAL;
101 		return (NULL);
102 	}
103 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
104 	(void) close(fd);
105 	if ((TMP = map) == NULL) {
106 		errno = EINVAL;
107 		return (NULL);
108 	}
109 
110 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
111 		(void) munmap(map, sbuf.st_size);
112 		errno = EINVAL;
113 		return (NULL);
114 	}
115 	TMP += COLLATE_STR_LEN;
116 
117 	info = (void *)TMP;
118 	TMP += sizeof (*info);
119 
120 	if ((info->directive_count < 1) ||
121 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
122 	    ((chains = info->chain_count) < 0)) {
123 		(void) munmap(map, sbuf.st_size);
124 		errno = EINVAL;
125 		return (NULL);
126 	}
127 
128 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
129 	    (sizeof (collate_chain_t) * chains) +
130 	    (sizeof (collate_large_t) * info->large_count);
131 	for (z = 0; z < info->directive_count; z++) {
132 		i += sizeof (collate_subst_t) * info->subst_count[z];
133 	}
134 	if (i != (sbuf.st_size - (TMP - map))) {
135 		(void) munmap(map, sbuf.st_size);
136 		errno = EINVAL;
137 		return (NULL);
138 	}
139 
140 
141 	if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
142 		(void) munmap(map, sbuf.st_size);
143 		return (NULL);
144 	}
145 	lcc = ldata->l_data[0];
146 	ldata->l_map = map;
147 	ldata->l_map_len = sbuf.st_size;
148 
149 	lcc->lc_info = info;
150 	lcc->lc_directive_count = info->directive_count;
151 	lcc->lc_large_count = info->large_count;
152 
153 	for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
154 		lcc->lc_directive[z] = info->directive[z];
155 		lcc->lc_subst_count[z] = info->subst_count[z];
156 		lcc->lc_pri_count[z] = info->pri_count[z];
157 		lcc->lc_undef_pri[z] = info->undef_pri[z];
158 	}
159 
160 	lcc->lc_char_table = (void *)TMP;
161 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
162 
163 	for (z = 0; z < lcc->lc_directive_count; z++) {
164 		int count;
165 		if ((count = lcc->lc_subst_count[z]) > 0) {
166 			lcc->lc_subst_table[z] = (void *)TMP;
167 			TMP += count * sizeof (collate_subst_t);
168 		} else {
169 			lcc->lc_subst_table[z] = NULL;
170 		}
171 	}
172 
173 	if (chains > 0) {
174 		lcc->lc_chain_table = (void *)TMP;
175 		TMP += chains * sizeof (collate_chain_t);
176 	} else
177 		lcc->lc_chain_table = NULL;
178 	lcc->lc_chain_count = chains;
179 	if (lcc->lc_large_count > 0)
180 		lcc->lc_large_table = (void *)TMP;
181 	else
182 		lcc->lc_large_table = NULL;
183 
184 	return (ldata);
185 }
186 
187 static const int32_t *
188 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
189 {
190 	const collate_subst_t *p;
191 	int n = lcc->lc_subst_count[pass];
192 
193 	if (n == 0)
194 		return (NULL);
195 
196 	if (pass >= lcc->lc_directive_count)
197 		return (NULL);
198 
199 	if (!(key & COLLATE_SUBST_PRIORITY))
200 		return (NULL);
201 
202 	p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
203 	assert(p->key == key);
204 	return (p->pri);
205 }
206 
207 /*
208  * Note: for performance reasons, we have expanded bsearch here.  This avoids
209  * function call overhead with each comparison.
210  */
211 
212 static collate_chain_t *
213 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
214 {
215 	int low;
216 	int high;
217 	int next, compar, l;
218 	collate_chain_t *p;
219 	collate_chain_t *tab;
220 
221 	if (lcc->lc_info->chain_count == 0)
222 		return (NULL);
223 
224 	low = 0;
225 	high = lcc->lc_info->chain_count - 1;
226 	tab = lcc->lc_chain_table;
227 
228 	while (low <= high) {
229 		next = (low + high) / 2;
230 		p = tab + next;
231 		compar = *key - *p->str;
232 		if (compar == 0) {
233 			l = wcsnlen(p->str, COLLATE_STR_LEN);
234 			compar = wcsncmp(key, p->str, l);
235 			if (compar == 0) {
236 				*len = l;
237 				return (p);
238 			}
239 		}
240 		if (compar > 0)
241 			low = next + 1;
242 		else
243 			high = next - 1;
244 	}
245 	return (NULL);
246 }
247 
248 static collate_large_t *
249 largesearch(const struct lc_collate *lcc, const wchar_t key)
250 {
251 	int low = 0;
252 	int high = lcc->lc_info->large_count - 1;
253 	int next, compar;
254 	collate_large_t *p;
255 	collate_large_t *tab = lcc->lc_large_table;
256 
257 	if (lcc->lc_info->large_count == 0)
258 		return (NULL);
259 
260 	while (low <= high) {
261 		next = (low + high) / 2;
262 		p = tab + next;
263 		compar = key - p->val;
264 		if (compar == 0)
265 			return (p);
266 		if (compar > 0)
267 			low = next + 1;
268 		else
269 			high = next - 1;
270 	}
271 	return (NULL);
272 }
273 
274 void
275 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
276     int *len, int *pri, int which, const int **state)
277 {
278 	collate_chain_t *p2;
279 	collate_large_t *match;
280 	int p, l;
281 	const int *sptr;
282 
283 	/*
284 	 * If this is the "last" pass for the UNDEFINED, then
285 	 * we just return the priority itself.
286 	 */
287 	if (which >= lcc->lc_directive_count) {
288 		*pri = *t;
289 		*len = 1;
290 		*state = NULL;
291 		return;
292 	}
293 
294 	/*
295 	 * If we have remaining substitution data from a previous
296 	 * call, consume it first.
297 	 */
298 	if ((sptr = *state) != NULL) {
299 		*pri = *sptr;
300 		sptr++;
301 		*state = *sptr ? sptr : NULL;
302 		*len = 0;
303 		return;
304 	}
305 
306 	/* No active substitutions */
307 	*len = 1;
308 
309 	/*
310 	 * Check for composites such as dipthongs that collate as a
311 	 * single element (aka chains or collating-elements).
312 	 */
313 	if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
314 	    ((p = p2->pri[which]) >= 0)) {
315 
316 		*len = l;
317 		*pri = p;
318 
319 	} else if (*t <= UCHAR_MAX) {
320 
321 		/*
322 		 * Character is a small (8-bit) character.
323 		 * We just look these up directly for speed.
324 		 */
325 		*pri = lcc->lc_char_table[*t].pri[which];
326 
327 	} else if ((lcc->lc_info->large_count > 0) &&
328 	    ((match = largesearch(lcc, *t)) != NULL)) {
329 
330 		/*
331 		 * Character was found in the extended table.
332 		 */
333 		*pri = match->pri.pri[which];
334 
335 	} else {
336 		/*
337 		 * Character lacks a specific definition.
338 		 */
339 		if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
340 			/* Mask off sign bit to prevent ordering confusion. */
341 			*pri = (*t & COLLATE_MAX_PRIORITY);
342 		} else {
343 			*pri = lcc->lc_undef_pri[which];
344 		}
345 		/* No substitutions for undefined characters! */
346 		return;
347 	}
348 
349 	/*
350 	 * Try substituting (expanding) the character.  We are
351 	 * currently doing this *after* the chain compression.  I
352 	 * think it should not matter, but this way might be slightly
353 	 * faster.
354 	 *
355 	 * We do this after the priority search, as this will help us
356 	 * to identify a single key value.  In order for this to work,
357 	 * its important that the priority assigned to a given element
358 	 * to be substituted be unique for that level.  The localedef
359 	 * code ensures this for us.
360 	 */
361 	if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
362 		if ((*pri = *sptr) != 0) {
363 			sptr++;
364 			*state = *sptr ? sptr : NULL;
365 		}
366 	}
367 
368 }
369 
370 /*
371  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
372  * NOT NULL terminate.  That is left to the caller.
373  */
374 size_t
375 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
376     size_t room)
377 {
378 	int		pri;
379 	int		len;
380 	const wchar_t	*t;
381 	wchar_t		*tr = NULL;
382 	int		direc;
383 	int		pass;
384 	const int32_t 	*state;
385 	size_t		want = 0;
386 	size_t		need = 0;
387 	int		ndir = lcc->lc_directive_count;
388 
389 	assert(src);
390 
391 	for (pass = 0; pass <= ndir; pass++) {
392 
393 		state = NULL;
394 
395 		if (pass != 0) {
396 			/* insert level separator from the previous pass */
397 			if (room) {
398 				*xf++ = 1;
399 				room--;
400 			}
401 			want++;
402 		}
403 
404 		/* special pass for undefined */
405 		if (pass == ndir) {
406 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
407 		} else {
408 			direc = lcc->lc_directive[pass];
409 		}
410 
411 		t = src;
412 
413 		if (direc & DIRECTIVE_BACKWARD) {
414 			wchar_t *bp, *fp, c;
415 			if (tr)
416 				free(tr);
417 			if ((tr = wcsdup(t)) == NULL) {
418 				errno = ENOMEM;
419 				goto fail;
420 			}
421 			bp = tr;
422 			fp = tr + wcslen(tr) - 1;
423 			while (bp < fp) {
424 				c = *bp;
425 				*bp++ = *fp;
426 				*fp-- = c;
427 			}
428 			t = (const wchar_t *)tr;
429 		}
430 
431 		if (direc & DIRECTIVE_POSITION) {
432 			while (*t || state) {
433 				_collate_lookup(lcc, t, &len, &pri, pass,
434 				    &state);
435 				t += len;
436 				if (pri <= 0) {
437 					if (pri < 0) {
438 						errno = EINVAL;
439 						goto fail;
440 					}
441 					pri = COLLATE_MAX_PRIORITY;
442 				}
443 				if (room) {
444 					*xf++ = pri;
445 					room--;
446 				}
447 				want++;
448 				need = want;
449 			}
450 		} else {
451 			while (*t || state) {
452 				_collate_lookup(lcc, t, &len, &pri, pass,
453 				    &state);
454 				t += len;
455 				if (pri <= 0) {
456 					if (pri < 0) {
457 						errno = EINVAL;
458 						goto fail;
459 					}
460 					continue;
461 				}
462 				if (room) {
463 					*xf++ = pri;
464 					room--;
465 				}
466 				want++;
467 				need = want;
468 			}
469 		}
470 	}
471 
472 end:
473 	if (tr)
474 		free(tr);
475 	return (need);
476 
477 fail:
478 	if (tr)
479 		free(tr);
480 	return ((size_t)(-1));
481 }
482 
483 /*
484  * In the non-POSIX case, we transform each character into a string of
485  * characters representing the character's priority.  Since char is usually
486  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
487  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
488  * bits per byte.
489  *
490  * It turns out that we sometimes have real priorities that are
491  * 31-bits wide.  (But: be careful using priorities where the high
492  * order bit is set -- i.e. the priority is negative.  The sort order
493  * may be surprising!)
494  *
495  * TODO: This would be a good area to optimize somewhat.  It turns out
496  * that real prioririties *except for the last UNDEFINED pass* are generally
497  * very small.  We need the localedef code to precalculate the max
498  * priority for us, and ideally also give us a mask, and then we could
499  * severely limit what we expand to.
500  */
501 #define	XFRM_BYTES	6
502 #define	XFRM_OFFSET	('0')	/* make all printable characters */
503 #define	XFRM_SHIFT	6
504 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
505 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
506 
507 static int
508 xfrm(locale_t loc, unsigned char *p, int pri, int pass)
509 {
510 	/* we use unsigned to ensure zero fill on right shift */
511 	uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
512 	int nc = 0;
513 
514 	while (val) {
515 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
516 		pri >>= XFRM_SHIFT;
517 		val >>= XFRM_SHIFT;
518 		p++;
519 		nc++;
520 	}
521 	return (nc);
522 }
523 
524 size_t
525 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
526 {
527 	int		pri;
528 	int		len;
529 	const wchar_t	*t;
530 	wchar_t		*tr = NULL;
531 	int		direc;
532 	int		pass;
533 	const int32_t 	*state;
534 	size_t		want = 0;
535 	size_t		need = 0;
536 	int		b;
537 	uint8_t		buf[XFRM_BYTES];
538 	const struct lc_collate *lcc = loc->collate;
539 	int		ndir = lcc->lc_directive_count;
540 
541 	assert(src);
542 
543 	for (pass = 0; pass <= ndir; pass++) {
544 
545 		state = NULL;
546 
547 		if (pass != 0) {
548 			/* insert level separator from the previous pass */
549 			if (room) {
550 				*xf++ = XFRM_SEP;
551 				room--;
552 			}
553 			want++;
554 		}
555 
556 		/* special pass for undefined */
557 		if (pass == ndir) {
558 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 		} else {
560 			direc = lcc->lc_directive[pass];
561 		}
562 
563 		t = src;
564 
565 		if (direc & DIRECTIVE_BACKWARD) {
566 			wchar_t *bp, *fp, c;
567 			if (tr)
568 				free(tr);
569 			if ((tr = wcsdup(t)) == NULL) {
570 				errno = ENOMEM;
571 				goto fail;
572 			}
573 			bp = tr;
574 			fp = tr + wcslen(tr) - 1;
575 			while (bp < fp) {
576 				c = *bp;
577 				*bp++ = *fp;
578 				*fp-- = c;
579 			}
580 			t = (const wchar_t *)tr;
581 		}
582 
583 		if (direc & DIRECTIVE_POSITION) {
584 			while (*t || state) {
585 
586 				_collate_lookup(lcc, t, &len, &pri, pass,
587 				    &state);
588 				t += len;
589 				if (pri <= 0) {
590 					if (pri < 0) {
591 						errno = EINVAL;
592 						goto fail;
593 					}
594 					pri = COLLATE_MAX_PRIORITY;
595 				}
596 
597 				b = xfrm(loc, buf, pri, pass);
598 				want += b;
599 				if (room) {
600 					while (b) {
601 						b--;
602 						if (room) {
603 							*xf++ = buf[b];
604 							room--;
605 						}
606 					}
607 				}
608 				need = want;
609 			}
610 		} else {
611 			while (*t || state) {
612 				_collate_lookup(lcc, t, &len, &pri, pass,
613 				    &state);
614 				t += len;
615 				if (pri <= 0) {
616 					if (pri < 0) {
617 						errno = EINVAL;
618 						goto fail;
619 					}
620 					continue;
621 				}
622 
623 				b = xfrm(loc, buf, pri, pass);
624 				want += b;
625 				if (room) {
626 
627 					while (b) {
628 						b--;
629 						if (room) {
630 							*xf++ = buf[b];
631 							room--;
632 						}
633 					}
634 				}
635 				need = want;
636 			}
637 		}
638 	}
639 
640 end:
641 	if (tr)
642 		free(tr);
643 	return (need);
644 
645 fail:
646 	if (tr)
647 		free(tr);
648 	return ((size_t)(-1));
649 }
650