xref: /titanic_50/usr/src/lib/libc/port/locale/collate.c (revision 643918928169438203a594dfc3fc1b3e9afe5ba3)
1 /*
2  * Copright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4  *		at Electronni Visti IA, Kiev, Ukraine.
5  *			All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "lint.h"
30 #include "file64.h"
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include <errno.h>
37 #include <unistd.h>
38 #include <ctype.h>
39 #include <unistd.h>
40 #include <fcntl.h>
41 #include <assert.h>
42 #include <sys/stat.h>
43 #include <sys/mman.h>
44 
45 #include "collate.h"
46 #include "setlocale.h"
47 #include "ldpart.h"
48 
49 static collate_subst_t			*subst_table[COLL_WEIGHTS_MAX];
50 static collate_char_pri_t		*char_pri_table;
51 static collate_large_pri_t		*large_pri_table;
52 static collate_chain_pri_t		*chain_pri_table;
53 static char				*cache = NULL;
54 static size_t				cachesz;
55 static char				collate_encoding[ENCODING_LEN + 1];
56 
57 /* Exposed externally to other parts of libc. */
58 collate_info_t				*_collate_info;
59 int _collate_load_error = 1;
60 
61 
62 int
63 _collate_load_tables(const char *encoding)
64 {
65 	int i, chains, z;
66 	char buf[PATH_MAX];
67 	char *TMP;
68 	char *map;
69 	collate_info_t *info;
70 	struct stat sbuf;
71 	int fd;
72 
73 	/* 'encoding' must be already checked. */
74 	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
75 		_collate_load_error = 1;
76 		return (_LDP_CACHE);
77 	}
78 
79 	/*
80 	 * If the locale name is the same as our cache, use the cache.
81 	 */
82 	if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) {
83 		_collate_load_error = 0;
84 		return (_LDP_CACHE);
85 	}
86 
87 	/*
88 	 * Slurp the locale file into the cache.
89 	 */
90 
91 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
92 	    _PathLocale, encoding);
93 
94 	if ((fd = open(buf, O_RDONLY)) < 0)
95 		return (_LDP_ERROR);
96 	if (fstat(fd, &sbuf) < 0) {
97 		(void) close(fd);
98 		return (_LDP_ERROR);
99 	}
100 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
101 		(void) close(fd);
102 		errno = EINVAL;
103 		return (_LDP_ERROR);
104 	}
105 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
106 	(void) close(fd);
107 	if ((TMP = map) == NULL) {
108 		return (_LDP_ERROR);
109 	}
110 
111 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
112 		(void) munmap(map, sbuf.st_size);
113 		errno = EINVAL;
114 		return (_LDP_ERROR);
115 	}
116 	TMP += COLLATE_STR_LEN;
117 
118 	info = (void *)TMP;
119 	TMP += sizeof (*info);
120 
121 	if ((info->directive_count < 1) ||
122 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
123 	    ((chains = info->chain_count) < 0)) {
124 		(void) munmap(map, sbuf.st_size);
125 		errno = EINVAL;
126 		return (_LDP_ERROR);
127 	}
128 
129 	i = (sizeof (collate_char_pri_t) * (UCHAR_MAX + 1)) +
130 	    (sizeof (collate_chain_pri_t) * chains) +
131 	    (sizeof (collate_large_pri_t) * info->large_pri_count);
132 	for (z = 0; z < (info->directive_count); z++) {
133 		i += sizeof (collate_subst_t) * info->subst_count[z];
134 	}
135 	if (i != (sbuf.st_size - (TMP - map))) {
136 		(void) munmap(map, sbuf.st_size);
137 		errno = EINVAL;
138 		return (_LDP_ERROR);
139 	}
140 
141 	char_pri_table = (void *)TMP;
142 	TMP += sizeof (collate_char_pri_t) * (UCHAR_MAX + 1);
143 
144 	for (z = 0; z < info->directive_count; z++) {
145 		if (info->subst_count[z] > 0) {
146 			subst_table[z] = (void *)TMP;
147 			TMP += info->subst_count[z] * sizeof (collate_subst_t);
148 		} else {
149 			subst_table[z] = NULL;
150 		}
151 	}
152 
153 	if (chains > 0) {
154 		chain_pri_table = (void *)TMP;
155 		TMP += chains * sizeof (collate_chain_pri_t);
156 	} else
157 		chain_pri_table = NULL;
158 	if (info->large_pri_count > 0)
159 		large_pri_table = (void *)TMP;
160 	else
161 		large_pri_table = NULL;
162 
163 	(void) strlcpy(collate_encoding, encoding, ENCODING_LEN);
164 	_collate_info = info;
165 
166 	if (cache)
167 		(void) munmap(cache, cachesz);
168 
169 	cache = map;
170 	cachesz = sbuf.st_size;
171 	_collate_load_error = 0;
172 
173 	return (_LDP_LOADED);
174 }
175 
176 /*
177  * Note: for performance reasons, we have expanded bsearch here.  This avoids
178  * function call overhead with each comparison.
179  */
180 
181 static int32_t *
182 substsearch(const wchar_t key, int pass)
183 {
184 	int low;
185 	int high;
186 	int next, compar;
187 	collate_subst_t *p;
188 	collate_subst_t *tab;
189 	int n = _collate_info->subst_count[pass];
190 
191 	if (n == 0)
192 		return (NULL);
193 
194 	tab = subst_table[pass];
195 	low = 0;
196 	high = n - 1;
197 	while (low <= high) {
198 		next = (low + high) / 2;
199 		p = tab + next;
200 		compar = key - p->key;
201 		if (compar == 0)
202 			return (p->pri);
203 		if (compar > 0)
204 			low = next + 1;
205 		else
206 			high = next - 1;
207 	}
208 	return (NULL);
209 }
210 
211 static collate_chain_pri_t *
212 chainsearch(const wchar_t *key, int *len)
213 {
214 	int low;
215 	int high;
216 	int next, compar, l;
217 	collate_chain_pri_t *p;
218 	collate_chain_pri_t *tab;
219 
220 	if (_collate_info->chain_count == 0)
221 		return (NULL);
222 
223 	low = 0;
224 	high = _collate_info->chain_count - 1;
225 	tab = chain_pri_table;
226 
227 	while (low <= high) {
228 		next = (low + high) / 2;
229 		p = tab + next;
230 		compar = *key - *p->str;
231 		if (compar == 0) {
232 			l = wcsnlen(p->str, COLLATE_STR_LEN);
233 			compar = wcsncmp(key, p->str, l);
234 			if (compar == 0) {
235 				*len = l;
236 				return (p);
237 			}
238 		}
239 		if (compar > 0)
240 			low = next + 1;
241 		else
242 			high = next - 1;
243 	}
244 	return (NULL);
245 }
246 
247 static collate_large_pri_t *
248 largesearch(const wchar_t key)
249 {
250 	int low = 0;
251 	int high = _collate_info->large_pri_count - 1;
252 	int next, compar;
253 	collate_large_pri_t *p;
254 	collate_large_pri_t *tab = large_pri_table;
255 
256 	if (_collate_info->large_pri_count == 0)
257 		return (NULL);
258 
259 	while (low <= high) {
260 		next = (low + high) / 2;
261 		p = tab + next;
262 		compar = key - p->val;
263 		if (compar == 0)
264 			return (p);
265 		if (compar > 0)
266 			low = next + 1;
267 		else
268 			high = next - 1;
269 	}
270 	return (NULL);
271 }
272 
273 void
274 _collate_lookup(const wchar_t *t, int *len, int *pri, int which,
275     int **state)
276 {
277 	collate_chain_pri_t *p2;
278 	collate_large_pri_t *match;
279 	int p, l;
280 	int *sptr;
281 
282 	/*
283 	 * If we have remaining substitution data from a previous
284 	 * call, consume it first.
285 	 */
286 	if ((sptr = *state) != NULL) {
287 		*pri = *sptr;
288 		sptr++;
289 		*state = *sptr ? sptr : NULL;
290 		*len = 0;
291 		return;
292 	}
293 
294 	/* No active substitutions */
295 	*len = 1;
296 
297 	/*
298 	 * Check for composites such as dipthongs that collate as a
299 	 * single element (aka chains or collating-elements).
300 	 */
301 	if (((p2 = chainsearch(t, &l)) != NULL) &&
302 	    ((p = p2->pri[which]) >= 0)) {
303 
304 		*len = l;
305 		*pri = p;
306 
307 	} else if (*t <= UCHAR_MAX) {
308 
309 		/*
310 		 * Character is a small (8-bit) character.
311 		 * We just look these up directly for speed.
312 		 */
313 		*pri = char_pri_table[*t].pri[which];
314 
315 	} else if ((_collate_info->large_pri_count > 0) &&
316 	    ((match = largesearch(*t)) != NULL)) {
317 
318 		/*
319 		 * Character was found in the extended table.
320 		 */
321 		*pri = match->pri.pri[which];
322 
323 	} else {
324 		/*
325 		 * Character lacks a specific definition.
326 		 */
327 		if (_collate_info->directive[which] & DIRECTIVE_UNDEFINED) {
328 			/* Mask off sign bit to prevent ordering confusion. */
329 			*pri = (*t & COLLATE_MAX_PRIORITY);
330 		} else {
331 			*pri = _collate_info->undef_pri[which];
332 		}
333 		/* No substitutions for undefined characters! */
334 		return;
335 	}
336 
337 	/*
338 	 * Try substituting (expanding) the character.  We are
339 	 * currently doing this *after* the chain compression.  I
340 	 * think it should not matter, but this way might be slightly
341 	 * faster.
342 	 *
343 	 * We do this after the priority search, as this will help us
344 	 * to identify a single key value.  In order for this to work,
345 	 * its important that the priority assigned to a given element
346 	 * to be substituted be unique for that level.  The localedef
347 	 * code ensures this for us.
348 	 */
349 	if ((*pri & COLLATE_SUBST_PRIORITY) &&
350 	    (sptr = substsearch(*pri, which)) != NULL) {
351 		if ((*pri = *sptr) != 0) {
352 			sptr++;
353 			*state = *sptr ? sptr : NULL;
354 		}
355 	}
356 
357 }
358 
359 /*
360  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
361  * NOT NULL terminate.  That is left to the caller.
362  */
363 size_t
364 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room)
365 {
366 	int		pri;
367 	int		len;
368 	const wchar_t	*t;
369 	wchar_t		*tr = NULL;
370 	int		direc;
371 	int		pass;
372 	int32_t 	*state;
373 	size_t		want = 0;
374 	size_t		need = 0;
375 
376 	assert(src);
377 
378 	for (pass = 0; pass <= _collate_info->directive_count; pass++) {
379 
380 		state = NULL;
381 
382 		if (pass != 0) {
383 			/* insert level separator from the previous pass */
384 			if (room) {
385 				*xf++ = 1;
386 				room--;
387 			}
388 			want++;
389 		}
390 
391 		/* special pass for undefined */
392 		if (pass == _collate_info->directive_count) {
393 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
394 		} else {
395 			direc = _collate_info->directive[pass];
396 		}
397 
398 		t = src;
399 
400 		if (direc & DIRECTIVE_BACKWARD) {
401 			wchar_t *bp, *fp, c;
402 			if (tr)
403 				free(tr);
404 			if ((tr = wcsdup(t)) == NULL) {
405 				errno = ENOMEM;
406 				goto fail;
407 			}
408 			bp = tr;
409 			fp = tr + wcslen(tr) - 1;
410 			while (bp < fp) {
411 				c = *bp;
412 				*bp++ = *fp;
413 				*fp-- = c;
414 			}
415 			t = (const wchar_t *)tr;
416 		}
417 
418 		if (direc & DIRECTIVE_POSITION) {
419 			while (*t || state) {
420 				_collate_lookup(t, &len, &pri, pass, &state);
421 				t += len;
422 				if (pri <= 0) {
423 					if (pri < 0) {
424 						errno = EINVAL;
425 						goto fail;
426 					}
427 					pri = COLLATE_MAX_PRIORITY;
428 				}
429 				if (room) {
430 					*xf++ = pri;
431 					room--;
432 				}
433 				want++;
434 				need = want;
435 			}
436 		} else {
437 			while (*t || state) {
438 				_collate_lookup(t, &len, &pri, pass, &state);
439 				t += len;
440 				if (pri <= 0) {
441 					if (pri < 0) {
442 						errno = EINVAL;
443 						goto fail;
444 					}
445 					continue;
446 				}
447 				if (room) {
448 					*xf++ = pri;
449 					room--;
450 				}
451 				want++;
452 				need = want;
453 			}
454 		}
455 	}
456 
457 end:
458 	if (tr)
459 		free(tr);
460 	return (need);
461 
462 fail:
463 	if (tr)
464 		free(tr);
465 	return ((size_t)(-1));
466 }
467 
468 /*
469  * In the non-POSIX case, we transform each character into a string of
470  * characters representing the character's priority.  Since char is usually
471  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
472  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
473  * bits per byte.
474  *
475  * It turns out that we sometimes have real priorities that are
476  * 31-bits wide.  (But: be careful using priorities where the high
477  * order bit is set -- i.e. the priority is negative.  The sort order
478  * may be surprising!)
479  *
480  * TODO: This would be a good area to optimize somewhat.  It turns out
481  * that real prioririties *except for the last UNDEFINED pass* are generally
482  * very small.  We need the localedef code to precalculate the max
483  * priority for us, and ideally also give us a mask, and then we could
484  * severely limit what we expand to.
485  */
486 #define	XFRM_BYTES	6
487 #define	XFRM_OFFSET	('0')	/* make all printable characters */
488 #define	XFRM_SHIFT	6
489 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
490 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
491 
492 static void
493 xfrm(unsigned char *p, int pri)
494 {
495 	int i;
496 	for (i = 0; i < XFRM_BYTES; i++) {
497 		p[i] = (pri & XFRM_MASK) + XFRM_OFFSET;
498 		pri >>= XFRM_SHIFT;
499 	}
500 }
501 
502 size_t
503 _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
504 {
505 	int		pri;
506 	int		len;
507 	const wchar_t	*t;
508 	wchar_t		*tr = NULL;
509 	int		direc;
510 	int		pass;
511 	int32_t 	*state;
512 	size_t		want = 0;
513 	size_t		need = 0;
514 	int		b;
515 	uint8_t		buf[XFRM_BYTES];
516 
517 	assert(src);
518 
519 	for (pass = 0; pass <= _collate_info->directive_count; pass++) {
520 
521 		state = NULL;
522 
523 		if (pass != 0) {
524 			/* insert level separator from the previous pass */
525 			if (room) {
526 				*xf++ = XFRM_SEP;
527 				room--;
528 			}
529 			want++;
530 		}
531 
532 		/* special pass for undefined */
533 		if (pass == _collate_info->directive_count) {
534 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
535 		} else {
536 			direc = _collate_info->directive[pass];
537 		}
538 
539 		t = src;
540 
541 		if (direc & DIRECTIVE_BACKWARD) {
542 			wchar_t *bp, *fp, c;
543 			if (tr)
544 				free(tr);
545 			if ((tr = wcsdup(t)) == NULL) {
546 				errno = ENOMEM;
547 				goto fail;
548 			}
549 			bp = tr;
550 			fp = tr + wcslen(tr) - 1;
551 			while (bp < fp) {
552 				c = *bp;
553 				*bp++ = *fp;
554 				*fp-- = c;
555 			}
556 			t = (const wchar_t *)tr;
557 		}
558 
559 		if (direc & DIRECTIVE_POSITION) {
560 			while (*t || state) {
561 
562 				_collate_lookup(t, &len, &pri, pass, &state);
563 				t += len;
564 				if (pri <= 0) {
565 					if (pri < 0) {
566 						errno = EINVAL;
567 						goto fail;
568 					}
569 					pri = COLLATE_MAX_PRIORITY;
570 				}
571 
572 				if (room) {
573 					b = XFRM_BYTES;
574 					xfrm(buf, pri);
575 
576 					while (b) {
577 						b--;
578 						if (room)
579 							*xf++ = buf[b];
580 						room--;
581 					}
582 				}
583 				want += XFRM_BYTES;
584 				need = want;
585 			}
586 		} else {
587 			while (*t || state) {
588 				_collate_lookup(t, &len, &pri, pass, &state);
589 				t += len;
590 				if (pri <= 0) {
591 					if (pri < 0) {
592 						errno = EINVAL;
593 						goto fail;
594 					}
595 					continue;
596 				}
597 				if (room) {
598 					b = XFRM_BYTES;
599 					xfrm(buf, pri);
600 
601 					while (b) {
602 						b--;
603 						if (room)
604 							*xf++ = buf[b];
605 						room--;
606 					}
607 				}
608 				want += XFRM_BYTES;
609 				need = want;
610 			}
611 		}
612 	}
613 
614 end:
615 	if (tr)
616 		free(tr);
617 	return (need);
618 
619 fail:
620 	if (tr)
621 		free(tr);
622 	return ((size_t)(-1));
623 }
624