1 /*
2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 * at Electronni Visti IA, Kiev, Ukraine.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "lint.h"
31 #include "file64.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <wchar.h>
37 #include <errno.h>
38 #include <unistd.h>
39 #include <ctype.h>
40 #include <unistd.h>
41 #include <fcntl.h>
42 #include <assert.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45
46 #include "collate.h"
47 #include "setlocale.h"
48
49 /*
50 * See the comments in usr/src/cmd/localedef/collate.c for further
51 * information. It would also be very helpful to have a copy of the
52 * POSIX standard for collation (in the locale format manual page)
53 * handy (www.opengroup.org).
54 */
55
56 /*
57 * POSIX uses empty tables and falls down to strcmp.
58 */
59 struct lc_collate lc_collate_posix = {
60 .lc_is_posix = 1,
61 };
62
63 struct locdata __posix_collate_locdata = {
64 .l_lname = "C",
65 .l_data = { &lc_collate_posix }
66 };
67
68
69 struct locdata *
__lc_collate_load(const char * locname)70 __lc_collate_load(const char *locname)
71 {
72 int i, chains, z;
73 char buf[PATH_MAX];
74 char *TMP;
75 char *map;
76 collate_info_t *info;
77 struct stat sbuf;
78 int fd;
79 struct locdata *ldata;
80 struct lc_collate *lcc;
81
82 /*
83 * Slurp the locale file into the cache.
84 */
85
86 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
87 _PathLocale, locname);
88
89 if ((fd = open(buf, O_RDONLY)) < 0) {
90 errno = EINVAL;
91 return (NULL);
92 }
93 if (fstat(fd, &sbuf) < 0) {
94 (void) close(fd);
95 errno = EINVAL;
96 return (NULL);
97 }
98 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
99 (void) close(fd);
100 errno = EINVAL;
101 return (NULL);
102 }
103 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
104 (void) close(fd);
105 if ((TMP = map) == NULL) {
106 errno = EINVAL;
107 return (NULL);
108 }
109
110 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
111 (void) munmap(map, sbuf.st_size);
112 errno = EINVAL;
113 return (NULL);
114 }
115 TMP += COLLATE_STR_LEN;
116
117 info = (void *)TMP;
118 TMP += sizeof (*info);
119
120 if ((info->directive_count < 1) ||
121 (info->directive_count >= COLL_WEIGHTS_MAX) ||
122 ((chains = info->chain_count) < 0)) {
123 (void) munmap(map, sbuf.st_size);
124 errno = EINVAL;
125 return (NULL);
126 }
127
128 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
129 (sizeof (collate_chain_t) * chains) +
130 (sizeof (collate_large_t) * info->large_count);
131 for (z = 0; z < info->directive_count; z++) {
132 i += sizeof (collate_subst_t) * info->subst_count[z];
133 }
134 if (i != (sbuf.st_size - (TMP - map))) {
135 (void) munmap(map, sbuf.st_size);
136 errno = EINVAL;
137 return (NULL);
138 }
139
140
141 if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
142 (void) munmap(map, sbuf.st_size);
143 return (NULL);
144 }
145 lcc = ldata->l_data[0];
146 ldata->l_map = map;
147 ldata->l_map_len = sbuf.st_size;
148
149 lcc->lc_info = info;
150 lcc->lc_directive_count = info->directive_count;
151 lcc->lc_large_count = info->large_count;
152
153 for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
154 lcc->lc_directive[z] = info->directive[z];
155 lcc->lc_subst_count[z] = info->subst_count[z];
156 lcc->lc_pri_count[z] = info->pri_count[z];
157 lcc->lc_undef_pri[z] = info->undef_pri[z];
158 }
159
160 lcc->lc_char_table = (void *)TMP;
161 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
162
163 for (z = 0; z < lcc->lc_directive_count; z++) {
164 int count;
165 if ((count = lcc->lc_subst_count[z]) > 0) {
166 lcc->lc_subst_table[z] = (void *)TMP;
167 TMP += count * sizeof (collate_subst_t);
168 } else {
169 lcc->lc_subst_table[z] = NULL;
170 }
171 }
172
173 if (chains > 0) {
174 lcc->lc_chain_table = (void *)TMP;
175 TMP += chains * sizeof (collate_chain_t);
176 } else
177 lcc->lc_chain_table = NULL;
178 lcc->lc_chain_count = chains;
179 if (lcc->lc_large_count > 0)
180 lcc->lc_large_table = (void *)TMP;
181 else
182 lcc->lc_large_table = NULL;
183
184 return (ldata);
185 }
186
187 static const int32_t *
substsearch(const struct lc_collate * lcc,const wchar_t key,int pass)188 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
189 {
190 const collate_subst_t *p;
191 int n = lcc->lc_subst_count[pass];
192
193 if (n == 0)
194 return (NULL);
195
196 if (pass >= lcc->lc_directive_count)
197 return (NULL);
198
199 if (!(key & COLLATE_SUBST_PRIORITY))
200 return (NULL);
201
202 p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
203 assert(p->key == key);
204 return (p->pri);
205 }
206
207 /*
208 * Note: for performance reasons, we have expanded bsearch here. This avoids
209 * function call overhead with each comparison.
210 */
211
212 static collate_chain_t *
chainsearch(const struct lc_collate * lcc,const wchar_t * key,int * len)213 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
214 {
215 int low;
216 int high;
217 int next, compar, l;
218 collate_chain_t *p;
219 collate_chain_t *tab;
220
221 if (lcc->lc_info->chain_count == 0)
222 return (NULL);
223
224 low = 0;
225 high = lcc->lc_info->chain_count - 1;
226 tab = lcc->lc_chain_table;
227
228 while (low <= high) {
229 next = (low + high) / 2;
230 p = tab + next;
231 compar = *key - *p->str;
232 if (compar == 0) {
233 l = wcsnlen(p->str, COLLATE_STR_LEN);
234 compar = wcsncmp(key, p->str, l);
235 if (compar == 0) {
236 *len = l;
237 return (p);
238 }
239 }
240 if (compar > 0)
241 low = next + 1;
242 else
243 high = next - 1;
244 }
245 return (NULL);
246 }
247
248 static collate_large_t *
largesearch(const struct lc_collate * lcc,const wchar_t key)249 largesearch(const struct lc_collate *lcc, const wchar_t key)
250 {
251 int low = 0;
252 int high = lcc->lc_info->large_count - 1;
253 int next, compar;
254 collate_large_t *p;
255 collate_large_t *tab = lcc->lc_large_table;
256
257 if (lcc->lc_info->large_count == 0)
258 return (NULL);
259
260 while (low <= high) {
261 next = (low + high) / 2;
262 p = tab + next;
263 compar = key - p->val;
264 if (compar == 0)
265 return (p);
266 if (compar > 0)
267 low = next + 1;
268 else
269 high = next - 1;
270 }
271 return (NULL);
272 }
273
274 void
_collate_lookup(const struct lc_collate * lcc,const wchar_t * t,int * len,int * pri,int which,const int ** state)275 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
276 int *len, int *pri, int which, const int **state)
277 {
278 collate_chain_t *p2;
279 collate_large_t *match;
280 int p, l;
281 const int *sptr;
282
283 /*
284 * If this is the "last" pass for the UNDEFINED, then
285 * we just return the priority itself.
286 */
287 if (which >= lcc->lc_directive_count) {
288 *pri = *t;
289 *len = 1;
290 *state = NULL;
291 return;
292 }
293
294 /*
295 * If we have remaining substitution data from a previous
296 * call, consume it first.
297 */
298 if ((sptr = *state) != NULL) {
299 *pri = *sptr;
300 sptr++;
301 *state = *sptr ? sptr : NULL;
302 *len = 0;
303 return;
304 }
305
306 /* No active substitutions */
307 *len = 1;
308
309 /*
310 * Check for composites such as dipthongs that collate as a
311 * single element (aka chains or collating-elements).
312 */
313 if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
314 ((p = p2->pri[which]) >= 0)) {
315
316 *len = l;
317 *pri = p;
318
319 } else if (*t <= UCHAR_MAX) {
320
321 /*
322 * Character is a small (8-bit) character.
323 * We just look these up directly for speed.
324 */
325 *pri = lcc->lc_char_table[*t].pri[which];
326
327 } else if ((lcc->lc_info->large_count > 0) &&
328 ((match = largesearch(lcc, *t)) != NULL)) {
329
330 /*
331 * Character was found in the extended table.
332 */
333 *pri = match->pri.pri[which];
334
335 } else {
336 /*
337 * Character lacks a specific definition.
338 */
339 if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
340 /* Mask off sign bit to prevent ordering confusion. */
341 *pri = (*t & COLLATE_MAX_PRIORITY);
342 } else {
343 *pri = lcc->lc_undef_pri[which];
344 }
345 /* No substitutions for undefined characters! */
346 return;
347 }
348
349 /*
350 * Try substituting (expanding) the character. We are
351 * currently doing this *after* the chain compression. I
352 * think it should not matter, but this way might be slightly
353 * faster.
354 *
355 * We do this after the priority search, as this will help us
356 * to identify a single key value. In order for this to work,
357 * its important that the priority assigned to a given element
358 * to be substituted be unique for that level. The localedef
359 * code ensures this for us.
360 */
361 if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
362 if ((*pri = *sptr) != 0) {
363 sptr++;
364 *state = *sptr ? sptr : NULL;
365 }
366 }
367
368 }
369
370 /*
371 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
372 * NOT NULL terminate. That is left to the caller.
373 */
374 size_t
_collate_wxfrm(const struct lc_collate * lcc,const wchar_t * src,wchar_t * xf,size_t room)375 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
376 size_t room)
377 {
378 int pri;
379 int len;
380 const wchar_t *t;
381 wchar_t *tr = NULL;
382 int direc;
383 int pass;
384 const int32_t *state;
385 size_t want = 0;
386 size_t need = 0;
387 int ndir = lcc->lc_directive_count;
388
389 assert(src);
390
391 for (pass = 0; pass <= ndir; pass++) {
392
393 state = NULL;
394
395 if (pass != 0) {
396 /* insert level separator from the previous pass */
397 if (room) {
398 *xf++ = 1;
399 room--;
400 }
401 want++;
402 }
403
404 /* special pass for undefined */
405 if (pass == ndir) {
406 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
407 } else {
408 direc = lcc->lc_directive[pass];
409 }
410
411 t = src;
412
413 if (direc & DIRECTIVE_BACKWARD) {
414 wchar_t *bp, *fp, c;
415 if (tr)
416 free(tr);
417 if ((tr = wcsdup(t)) == NULL) {
418 errno = ENOMEM;
419 goto fail;
420 }
421 bp = tr;
422 fp = tr + wcslen(tr) - 1;
423 while (bp < fp) {
424 c = *bp;
425 *bp++ = *fp;
426 *fp-- = c;
427 }
428 t = (const wchar_t *)tr;
429 }
430
431 if (direc & DIRECTIVE_POSITION) {
432 while (*t || state) {
433 _collate_lookup(lcc, t, &len, &pri, pass,
434 &state);
435 t += len;
436 if (pri <= 0) {
437 if (pri < 0) {
438 errno = EINVAL;
439 goto fail;
440 }
441 pri = COLLATE_MAX_PRIORITY;
442 }
443 if (room) {
444 *xf++ = pri;
445 room--;
446 }
447 want++;
448 need = want;
449 }
450 } else {
451 while (*t || state) {
452 _collate_lookup(lcc, t, &len, &pri, pass,
453 &state);
454 t += len;
455 if (pri <= 0) {
456 if (pri < 0) {
457 errno = EINVAL;
458 goto fail;
459 }
460 continue;
461 }
462 if (room) {
463 *xf++ = pri;
464 room--;
465 }
466 want++;
467 need = want;
468 }
469 }
470 }
471
472 end:
473 if (tr)
474 free(tr);
475 return (need);
476
477 fail:
478 if (tr)
479 free(tr);
480 return ((size_t)(-1));
481 }
482
483 /*
484 * In the non-POSIX case, we transform each character into a string of
485 * characters representing the character's priority. Since char is usually
486 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
487 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
488 * bits per byte.
489 *
490 * It turns out that we sometimes have real priorities that are
491 * 31-bits wide. (But: be careful using priorities where the high
492 * order bit is set -- i.e. the priority is negative. The sort order
493 * may be surprising!)
494 *
495 * TODO: This would be a good area to optimize somewhat. It turns out
496 * that real prioririties *except for the last UNDEFINED pass* are generally
497 * very small. We need the localedef code to precalculate the max
498 * priority for us, and ideally also give us a mask, and then we could
499 * severely limit what we expand to.
500 */
501 #define XFRM_BYTES 6
502 #define XFRM_OFFSET ('0') /* make all printable characters */
503 #define XFRM_SHIFT 6
504 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
505 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
506
507 static int
xfrm(locale_t loc,unsigned char * p,int pri,int pass)508 xfrm(locale_t loc, unsigned char *p, int pri, int pass)
509 {
510 /* we use unsigned to ensure zero fill on right shift */
511 uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
512 int nc = 0;
513
514 while (val) {
515 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
516 pri >>= XFRM_SHIFT;
517 val >>= XFRM_SHIFT;
518 p++;
519 nc++;
520 }
521 return (nc);
522 }
523
524 size_t
_collate_sxfrm(const wchar_t * src,char * xf,size_t room,locale_t loc)525 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
526 {
527 int pri;
528 int len;
529 const wchar_t *t;
530 wchar_t *tr = NULL;
531 int direc;
532 int pass;
533 const int32_t *state;
534 size_t want = 0;
535 size_t need = 0;
536 int b;
537 uint8_t buf[XFRM_BYTES];
538 const struct lc_collate *lcc = loc->collate;
539 int ndir = lcc->lc_directive_count;
540
541 assert(src);
542
543 for (pass = 0; pass <= ndir; pass++) {
544
545 state = NULL;
546
547 if (pass != 0) {
548 /* insert level separator from the previous pass */
549 if (room) {
550 *xf++ = XFRM_SEP;
551 room--;
552 }
553 want++;
554 }
555
556 /* special pass for undefined */
557 if (pass == ndir) {
558 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
559 } else {
560 direc = lcc->lc_directive[pass];
561 }
562
563 t = src;
564
565 if (direc & DIRECTIVE_BACKWARD) {
566 wchar_t *bp, *fp, c;
567 if (tr)
568 free(tr);
569 if ((tr = wcsdup(t)) == NULL) {
570 errno = ENOMEM;
571 goto fail;
572 }
573 bp = tr;
574 fp = tr + wcslen(tr) - 1;
575 while (bp < fp) {
576 c = *bp;
577 *bp++ = *fp;
578 *fp-- = c;
579 }
580 t = (const wchar_t *)tr;
581 }
582
583 if (direc & DIRECTIVE_POSITION) {
584 while (*t || state) {
585
586 _collate_lookup(lcc, t, &len, &pri, pass,
587 &state);
588 t += len;
589 if (pri <= 0) {
590 if (pri < 0) {
591 errno = EINVAL;
592 goto fail;
593 }
594 pri = COLLATE_MAX_PRIORITY;
595 }
596
597 b = xfrm(loc, buf, pri, pass);
598 want += b;
599 if (room) {
600 while (b) {
601 b--;
602 if (room) {
603 *xf++ = buf[b];
604 room--;
605 }
606 }
607 }
608 need = want;
609 }
610 } else {
611 while (*t || state) {
612 _collate_lookup(lcc, t, &len, &pri, pass,
613 &state);
614 t += len;
615 if (pri <= 0) {
616 if (pri < 0) {
617 errno = EINVAL;
618 goto fail;
619 }
620 continue;
621 }
622
623 b = xfrm(loc, buf, pri, pass);
624 want += b;
625 if (room) {
626
627 while (b) {
628 b--;
629 if (room) {
630 *xf++ = buf[b];
631 room--;
632 }
633 }
634 }
635 need = want;
636 }
637 }
638 }
639
640 end:
641 if (tr)
642 free(tr);
643 return (need);
644
645 fail:
646 if (tr)
647 free(tr);
648 return ((size_t)(-1));
649 }
650