xref: /illumos-gate/usr/src/lib/libc/port/locale/collate.c (revision d0fccfcda73f8b52d101bd2b0f7885a766f7e354)
1 /*
2  * Copright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "lint.h"
30 #include "file64.h"
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include <errno.h>
37 #include <unistd.h>
38 #include <ctype.h>
39 #include <unistd.h>
40 #include <fcntl.h>
41 #include <assert.h>
42 #include <sys/stat.h>
43 #include <sys/mman.h>
44 
45 #include "collate.h"
46 #include "setlocale.h"
47 #include "ldpart.h"
48 
49 /*
50  * See the comments in usr/src/cmd/localedef/collate.c for further
51  * information.  It would also be very helpful to have a copy of the
52  * POSIX standard for collation (in the locale format manual page)
53  * handy (www.opengroup.org).
54  */
55 
56 static collate_subst_t		*subst_table[COLL_WEIGHTS_MAX];
57 static collate_char_t		*char_pri_table;
58 static collate_large_t		*large_pri_table;
59 static collate_chain_t		*chain_pri_table;
60 static char			*cache = NULL;
61 static size_t			cachesz;
62 static char			collate_encoding[ENCODING_LEN + 1];
63 
64 /* Exposed externally to other parts of libc. */
65 collate_info_t			*_collate_info;
66 int _collate_load_error = 1;
67 
68 
69 int
70 _collate_load_tables(const char *encoding)
71 {
72 	int i, chains, z;
73 	char buf[PATH_MAX];
74 	char *TMP;
75 	char *map;
76 	collate_info_t *info;
77 	struct stat sbuf;
78 	int fd;
79 
80 	/* 'encoding' must be already checked. */
81 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
82 		_collate_load_error = 1;
83 		return (_LDP_CACHE);
84 	}
85 
86 	/*
87 	 * If the locale name is the same as our cache, use the cache.
88 	 */
89 	if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
90 		_collate_load_error = 0;
91 		return (_LDP_CACHE);
92 	}
93 
94 	/*
95 	 * Slurp the locale file into the cache.
96 	 */
97 
98 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
99 	    _PathLocale, encoding);
100 
101 	if ((fd = open(buf, O_RDONLY)) < 0)
102 		return (_LDP_ERROR);
103 	if (fstat(fd, &sbuf) < 0) {
104 		(void) close(fd);
105 		return (_LDP_ERROR);
106 	}
107 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
108 		(void) close(fd);
109 		errno = EINVAL;
110 		return (_LDP_ERROR);
111 	}
112 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
113 	(void) close(fd);
114 	if ((TMP = map) == NULL) {
115 		return (_LDP_ERROR);
116 	}
117 
118 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
119 		(void) munmap(map, sbuf.st_size);
120 		errno = EINVAL;
121 		return (_LDP_ERROR);
122 	}
123 	TMP += COLLATE_STR_LEN;
124 
125 	info = (void *)TMP;
126 	TMP += sizeof (*info);
127 
128 	if ((info->directive_count < 1) ||
129 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
130 	    ((chains = info->chain_count) < 0)) {
131 		(void) munmap(map, sbuf.st_size);
132 		errno = EINVAL;
133 		return (_LDP_ERROR);
134 	}
135 
136 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
137 	    (sizeof (collate_chain_t) * chains) +
138 	    (sizeof (collate_large_t) * info->large_count);
139 	for (z = 0; z < (info->directive_count); z++) {
140 		i += sizeof (collate_subst_t) * info->subst_count[z];
141 	}
142 	if (i != (sbuf.st_size - (TMP - map))) {
143 		(void) munmap(map, sbuf.st_size);
144 		errno = EINVAL;
145 		return (_LDP_ERROR);
146 	}
147 
148 	char_pri_table = (void *)TMP;
149 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
150 
151 	for (z = 0; z < info->directive_count; z++) {
152 		if (info->subst_count[z] > 0) {
153 			subst_table[z] = (void *)TMP;
154 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
155 		} else {
156 			subst_table[z] = NULL;
157 		}
158 	}
159 
160 	if (chains > 0) {
161 		chain_pri_table = (void *)TMP;
162 		TMP += chains * sizeof (collate_chain_t);
163 	} else
164 		chain_pri_table = NULL;
165 	if (info->large_count > 0)
166 		large_pri_table = (void *)TMP;
167 	else
168 		large_pri_table = NULL;
169 
170 	(void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
171 	_collate_info = info;
172 
173 	if (cache)
174 		(void) munmap(cache, cachesz);
175 
176 	cache = map;
177 	cachesz = sbuf.st_size;
178 	_collate_load_error = 0;
179 
180 	return (_LDP_LOADED);
181 }
182 
183 static int32_t *
184 substsearch(const wchar_t key, int pass)
185 {
186 	collate_subst_t *p;
187 	int n = _collate_info->subst_count[pass];
188 
189 	if (n == 0)
190 		return (NULL);
191 
192 	if (pass >= _collate_info->directive_count)
193 		return (NULL);
194 
195 	if (!(key & COLLATE_SUBST_PRIORITY))
196 		return (NULL);
197 
198 	p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
199 	assert(p->key == key);
200 	return (p->pri);
201 }
202 
203 /*
204  * Note: for performance reasons, we have expanded bsearch here.  This avoids
205  * function call overhead with each comparison.
206  */
207 
208 static collate_chain_t *
209 chainsearch(const wchar_t *key, int *len)
210 {
211 	int low;
212 	int high;
213 	int next, compar, l;
214 	collate_chain_t *p;
215 	collate_chain_t *tab;
216 
217 	if (_collate_info->chain_count == 0)
218 		return (NULL);
219 
220 	low = 0;
221 	high = _collate_info->chain_count - 1;
222 	tab = chain_pri_table;
223 
224 	while (low <= high) {
225 		next = (low + high) / 2;
226 		p = tab + next;
227 		compar = *key - *p->str;
228 		if (compar == 0) {
229 			l = wcsnlen(p->str, COLLATE_STR_LEN);
230 			compar = wcsncmp(key, p->str, l);
231 			if (compar == 0) {
232 				*len = l;
233 				return (p);
234 			}
235 		}
236 		if (compar > 0)
237 			low = next + 1;
238 		else
239 			high = next - 1;
240 	}
241 	return (NULL);
242 }
243 
244 static collate_large_t *
245 largesearch(const wchar_t key)
246 {
247 	int low = 0;
248 	int high = _collate_info->large_count - 1;
249 	int next, compar;
250 	collate_large_t *p;
251 	collate_large_t *tab = large_pri_table;
252 
253 	if (_collate_info->large_count == 0)
254 		return (NULL);
255 
256 	while (low <= high) {
257 		next = (low + high) / 2;
258 		p = tab + next;
259 		compar = key - p->val;
260 		if (compar == 0)
261 			return (p);
262 		if (compar > 0)
263 			low = next + 1;
264 		else
265 			high = next - 1;
266 	}
267 	return (NULL);
268 }
269 
270 void
271 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
272 {
273 	collate_chain_t *p2;
274 	collate_large_t *match;
275 	collate_info_t *info = _collate_info;
276 	int p, l;
277 	int *sptr;
278 
279 	/*
280 	 * If this is the "last" pass for the UNDEFINED, then
281 	 * we just return the priority itself.
282 	 */
283 	if (which >= info->directive_count) {
284 		*pri = *t;
285 		*len = 1;
286 		*state = NULL;
287 		return;
288 	}
289 
290 	/*
291 	 * If we have remaining substitution data from a previous
292 	 * call, consume it first.
293 	 */
294 	if ((sptr = *state) != NULL) {
295 		*pri = *sptr;
296 		sptr++;
297 		*state = *sptr ? sptr : NULL;
298 		*len = 0;
299 		return;
300 	}
301 
302 	/* No active substitutions */
303 	*len = 1;
304 
305 	/*
306 	 * Check for composites such as dipthongs that collate as a
307 	 * single element (aka chains or collating-elements).
308 	 */
309 	if (((p2 = chainsearch(t, &l)) != NULL) &&
310 	    ((p = p2->pri[which]) >= 0)) {
311 
312 		*len = l;
313 		*pri = p;
314 
315 	} else if (*t <= UCHAR_MAX) {
316 
317 		/*
318 		 * Character is a small (8-bit) character.
319 		 * We just look these up directly for speed.
320 		 */
321 		*pri = char_pri_table[*t].pri[which];
322 
323 	} else if ((info->large_count > 0) &&
324 	    ((match = largesearch(*t)) != NULL)) {
325 
326 		/*
327 		 * Character was found in the extended table.
328 		 */
329 		*pri = match->pri.pri[which];
330 
331 	} else {
332 		/*
333 		 * Character lacks a specific definition.
334 		 */
335 		if (info->directive[which] & DIRECTIVE_UNDEFINED) {
336 			/* Mask off sign bit to prevent ordering confusion. */
337 			*pri = (*t & COLLATE_MAX_PRIORITY);
338 		} else {
339 			*pri = info->undef_pri[which];
340 		}
341 		/* No substitutions for undefined characters! */
342 		return;
343 	}
344 
345 	/*
346 	 * Try substituting (expanding) the character.  We are
347 	 * currently doing this *after* the chain compression.  I
348 	 * think it should not matter, but this way might be slightly
349 	 * faster.
350 	 *
351 	 * We do this after the priority search, as this will help us
352 	 * to identify a single key value.  In order for this to work,
353 	 * its important that the priority assigned to a given element
354 	 * to be substituted be unique for that level.  The localedef
355 	 * code ensures this for us.
356 	 */
357 	if ((sptr = substsearch(*pri, which)) != NULL) {
358 		if ((*pri = *sptr) != 0) {
359 			sptr++;
360 			*state = *sptr ? sptr : NULL;
361 		}
362 	}
363 
364 }
365 
366 /*
367  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
368  * NOT NULL terminate.  That is left to the caller.
369  */
370 size_t
371 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
372 {
373 	int		pri;
374 	int		len;
375 	const wchar_t	*t;
376 	wchar_t		*tr = NULL;
377 	int		direc;
378 	int		pass;
379 	int32_t 	*state;
380 	size_t		want = 0;
381 	size_t		need = 0;
382 
383 	assert(src);
384 
385 	for (pass = 0; pass <= _collate_info->directive_count; pass++) {
386 
387 		state = NULL;
388 
389 		if (pass != 0) {
390 			/* insert level separator from the previous pass */
391 			if (room) {
392 				*xf++ = 1;
393 				room--;
394 			}
395 			want++;
396 		}
397 
398 		/* special pass for undefined */
399 		if (pass == _collate_info->directive_count) {
400 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
401 		} else {
402 			direc = _collate_info->directive[pass];
403 		}
404 
405 		t = src;
406 
407 		if (direc & DIRECTIVE_BACKWARD) {
408 			wchar_t *bp, *fp, c;
409 			if (tr)
410 				free(tr);
411 			if ((tr = wcsdup(t)) == NULL) {
412 				errno = ENOMEM;
413 				goto fail;
414 			}
415 			bp = tr;
416 			fp = tr + wcslen(tr) - 1;
417 			while (bp < fp) {
418 				c = *bp;
419 				*bp++ = *fp;
420 				*fp-- = c;
421 			}
422 			t = (const wchar_t *)tr;
423 		}
424 
425 		if (direc & DIRECTIVE_POSITION) {
426 			while (*t || state) {
427 				_collate_lookup(t, &len, &pri, pass, &state);
428 				t += len;
429 				if (pri <= 0) {
430 					if (pri < 0) {
431 						errno = EINVAL;
432 						goto fail;
433 					}
434 					pri = COLLATE_MAX_PRIORITY;
435 				}
436 				if (room) {
437 					*xf++ = pri;
438 					room--;
439 				}
440 				want++;
441 				need = want;
442 			}
443 		} else {
444 			while (*t || state) {
445 				_collate_lookup(t, &len, &pri, pass, &state);
446 				t += len;
447 				if (pri <= 0) {
448 					if (pri < 0) {
449 						errno = EINVAL;
450 						goto fail;
451 					}
452 					continue;
453 				}
454 				if (room) {
455 					*xf++ = pri;
456 					room--;
457 				}
458 				want++;
459 				need = want;
460 			}
461 		}
462 	}
463 
464 end:
465 	if (tr)
466 		free(tr);
467 	return (need);
468 
469 fail:
470 	if (tr)
471 		free(tr);
472 	return ((size_t)(-1));
473 }
474 
475 /*
476  * In the non-POSIX case, we transform each character into a string of
477  * characters representing the character's priority.  Since char is usually
478  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
479  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
480  * bits per byte.
481  *
482  * It turns out that we sometimes have real priorities that are
483  * 31-bits wide.  (But: be careful using priorities where the high
484  * order bit is set -- i.e. the priority is negative.  The sort order
485  * may be surprising!)
486  *
487  * TODO: This would be a good area to optimize somewhat.  It turns out
488  * that real prioririties *except for the last UNDEFINED pass* are generally
489  * very small.  We need the localedef code to precalculate the max
490  * priority for us, and ideally also give us a mask, and then we could
491  * severely limit what we expand to.
492  */
493 #define	XFRM_BYTES	6
494 #define	XFRM_OFFSET	('0')	/* make all printable characters */
495 #define	XFRM_SHIFT	6
496 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
497 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
498 
499 static int
500 xfrm(unsigned char *p, int pri, int pass)
501 {
502 	/* we use unsigned to ensure zero fill on right shift */
503 	uint32_t val = (uint32_t)_collate_info->pri_count[pass];
504 	int nc = 0;
505 
506 	while (val) {
507 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
508 		pri >>= XFRM_SHIFT;
509 		val >>= XFRM_SHIFT;
510 		p++;
511 		nc++;
512 	}
513 	return (nc);
514 }
515 
516 size_t
517 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
518 {
519 	int		pri;
520 	int		len;
521 	const wchar_t	*t;
522 	wchar_t		*tr = NULL;
523 	int		direc;
524 	int		pass;
525 	int32_t 	*state;
526 	size_t		want = 0;
527 	size_t		need = 0;
528 	int		b;
529 	uint8_t		buf[XFRM_BYTES];
530 
531 	assert(src);
532 
533 	for (pass = 0; pass <= _collate_info->directive_count; pass++) {
534 
535 		state = NULL;
536 
537 		if (pass != 0) {
538 			/* insert level separator from the previous pass */
539 			if (room) {
540 				*xf++ = XFRM_SEP;
541 				room--;
542 			}
543 			want++;
544 		}
545 
546 		/* special pass for undefined */
547 		if (pass == _collate_info->directive_count) {
548 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
549 		} else {
550 			direc = _collate_info->directive[pass];
551 		}
552 
553 		t = src;
554 
555 		if (direc & DIRECTIVE_BACKWARD) {
556 			wchar_t *bp, *fp, c;
557 			if (tr)
558 				free(tr);
559 			if ((tr = wcsdup(t)) == NULL) {
560 				errno = ENOMEM;
561 				goto fail;
562 			}
563 			bp = tr;
564 			fp = tr + wcslen(tr) - 1;
565 			while (bp < fp) {
566 				c = *bp;
567 				*bp++ = *fp;
568 				*fp-- = c;
569 			}
570 			t = (const wchar_t *)tr;
571 		}
572 
573 		if (direc & DIRECTIVE_POSITION) {
574 			while (*t || state) {
575 
576 				_collate_lookup(t, &len, &pri, pass, &state);
577 				t += len;
578 				if (pri <= 0) {
579 					if (pri < 0) {
580 						errno = EINVAL;
581 						goto fail;
582 					}
583 					pri = COLLATE_MAX_PRIORITY;
584 				}
585 
586 				b = xfrm(buf, pri, pass);
587 				want += b;
588 				if (room) {
589 					while (b) {
590 						b--;
591 						if (room)
592 							*xf++ = buf[b];
593 						room--;
594 					}
595 				}
596 				need = want;
597 			}
598 		} else {
599 			while (*t || state) {
600 				_collate_lookup(t, &len, &pri, pass, &state);
601 				t += len;
602 				if (pri <= 0) {
603 					if (pri < 0) {
604 						errno = EINVAL;
605 						goto fail;
606 					}
607 					continue;
608 				}
609 
610 				b = xfrm(buf, pri, pass);
611 				want += b;
612 				if (room) {
613 
614 					while (b) {
615 						b--;
616 						if (room)
617 							*xf++ = buf[b];
618 						room--;
619 					}
620 				}
621 				need = want;
622 			}
623 		}
624 	}
625 
626 end:
627 	if (tr)
628 		free(tr);
629 	return (need);
630 
631 fail:
632 	if (tr)
633 		free(tr);
634 	return ((size_t)(-1));
635 }
636