14297a3b0SGarrett D'Amore /*
22d08521bSGarrett D'Amore * Copyright 2014 Garrett D'Amore <garrett@damore.org>
32d08521bSGarrett D'Amore * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
44297a3b0SGarrett D'Amore * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
54297a3b0SGarrett D'Amore * at Electronni Visti IA, Kiev, Ukraine.
64297a3b0SGarrett D'Amore * All rights reserved.
74297a3b0SGarrett D'Amore *
84297a3b0SGarrett D'Amore * Redistribution and use in source and binary forms, with or without
94297a3b0SGarrett D'Amore * modification, are permitted provided that the following conditions
104297a3b0SGarrett D'Amore * are met:
114297a3b0SGarrett D'Amore * 1. Redistributions of source code must retain the above copyright
124297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer.
134297a3b0SGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright
144297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer in the
154297a3b0SGarrett D'Amore * documentation and/or other materials provided with the distribution.
164297a3b0SGarrett D'Amore *
174297a3b0SGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
184297a3b0SGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
194297a3b0SGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
204297a3b0SGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
214297a3b0SGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
224297a3b0SGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
234297a3b0SGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
244297a3b0SGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
254297a3b0SGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
264297a3b0SGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
274297a3b0SGarrett D'Amore * SUCH DAMAGE.
284297a3b0SGarrett D'Amore */
294297a3b0SGarrett D'Amore
304297a3b0SGarrett D'Amore #include "lint.h"
314297a3b0SGarrett D'Amore #include "file64.h"
324297a3b0SGarrett D'Amore #include <stdio.h>
334297a3b0SGarrett D'Amore #include <stdlib.h>
346b5e5868SGarrett D'Amore #include <stddef.h>
354297a3b0SGarrett D'Amore #include <string.h>
366b5e5868SGarrett D'Amore #include <wchar.h>
374297a3b0SGarrett D'Amore #include <errno.h>
384297a3b0SGarrett D'Amore #include <unistd.h>
396b5e5868SGarrett D'Amore #include <ctype.h>
406b5e5868SGarrett D'Amore #include <unistd.h>
416b5e5868SGarrett D'Amore #include <fcntl.h>
426b5e5868SGarrett D'Amore #include <assert.h>
436b5e5868SGarrett D'Amore #include <sys/stat.h>
446b5e5868SGarrett D'Amore #include <sys/mman.h>
454297a3b0SGarrett D'Amore
464297a3b0SGarrett D'Amore #include "collate.h"
47*bc09504fSGordon Ross #include "localeimpl.h"
48*bc09504fSGordon Ross
49*bc09504fSGordon Ross /* Check file format vs libc runtime. (See collatefile.h) */
50*bc09504fSGordon Ross #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX
51*bc09504fSGordon Ross #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX"
52*bc09504fSGordon Ross #endif
534297a3b0SGarrett D'Amore
54723fee08SGarrett D'Amore /*
55723fee08SGarrett D'Amore * See the comments in usr/src/cmd/localedef/collate.c for further
56723fee08SGarrett D'Amore * information. It would also be very helpful to have a copy of the
57723fee08SGarrett D'Amore * POSIX standard for collation (in the locale format manual page)
58723fee08SGarrett D'Amore * handy (www.opengroup.org).
59723fee08SGarrett D'Amore */
60723fee08SGarrett D'Amore
612d08521bSGarrett D'Amore /*
622d08521bSGarrett D'Amore * POSIX uses empty tables and falls down to strcmp.
632d08521bSGarrett D'Amore */
642d08521bSGarrett D'Amore struct lc_collate lc_collate_posix = {
652d08521bSGarrett D'Amore .lc_is_posix = 1,
662d08521bSGarrett D'Amore };
674297a3b0SGarrett D'Amore
682d08521bSGarrett D'Amore struct locdata __posix_collate_locdata = {
692d08521bSGarrett D'Amore .l_lname = "C",
702d08521bSGarrett D'Amore .l_data = { &lc_collate_posix }
712d08521bSGarrett D'Amore };
726b5e5868SGarrett D'Amore
734297a3b0SGarrett D'Amore
742d08521bSGarrett D'Amore struct locdata *
__lc_collate_load(const char * locname)752d08521bSGarrett D'Amore __lc_collate_load(const char *locname)
764297a3b0SGarrett D'Amore {
776b5e5868SGarrett D'Amore int i, chains, z;
786b5e5868SGarrett D'Amore char buf[PATH_MAX];
796b5e5868SGarrett D'Amore char *TMP;
806b5e5868SGarrett D'Amore char *map;
816b5e5868SGarrett D'Amore collate_info_t *info;
826b5e5868SGarrett D'Amore struct stat sbuf;
836b5e5868SGarrett D'Amore int fd;
842d08521bSGarrett D'Amore struct locdata *ldata;
852d08521bSGarrett D'Amore struct lc_collate *lcc;
864297a3b0SGarrett D'Amore
874297a3b0SGarrett D'Amore /*
884297a3b0SGarrett D'Amore * Slurp the locale file into the cache.
894297a3b0SGarrett D'Amore */
904297a3b0SGarrett D'Amore
916da5aa94SGarrett D'Amore (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
922d08521bSGarrett D'Amore _PathLocale, locname);
936da5aa94SGarrett D'Amore
942d08521bSGarrett D'Amore if ((fd = open(buf, O_RDONLY)) < 0) {
952d08521bSGarrett D'Amore errno = EINVAL;
962d08521bSGarrett D'Amore return (NULL);
972d08521bSGarrett D'Amore }
986b5e5868SGarrett D'Amore if (fstat(fd, &sbuf) < 0) {
996b5e5868SGarrett D'Amore (void) close(fd);
1002d08521bSGarrett D'Amore errno = EINVAL;
1012d08521bSGarrett D'Amore return (NULL);
1026b5e5868SGarrett D'Amore }
1036b5e5868SGarrett D'Amore if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
1046b5e5868SGarrett D'Amore (void) close(fd);
1056b5e5868SGarrett D'Amore errno = EINVAL;
1062d08521bSGarrett D'Amore return (NULL);
1076b5e5868SGarrett D'Amore }
1086b5e5868SGarrett D'Amore map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
1096b5e5868SGarrett D'Amore (void) close(fd);
1106b5e5868SGarrett D'Amore if ((TMP = map) == NULL) {
1112d08521bSGarrett D'Amore errno = EINVAL;
1122d08521bSGarrett D'Amore return (NULL);
1136b5e5868SGarrett D'Amore }
1144297a3b0SGarrett D'Amore
1156b5e5868SGarrett D'Amore if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
1166b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size);
1174297a3b0SGarrett D'Amore errno = EINVAL;
1182d08521bSGarrett D'Amore return (NULL);
1194297a3b0SGarrett D'Amore }
1206b5e5868SGarrett D'Amore TMP += COLLATE_STR_LEN;
1216b5e5868SGarrett D'Amore
1226b5e5868SGarrett D'Amore info = (void *)TMP;
1236b5e5868SGarrett D'Amore TMP += sizeof (*info);
1246b5e5868SGarrett D'Amore
1256b5e5868SGarrett D'Amore if ((info->directive_count < 1) ||
1266b5e5868SGarrett D'Amore (info->directive_count >= COLL_WEIGHTS_MAX) ||
1276b5e5868SGarrett D'Amore ((chains = info->chain_count) < 0)) {
1286b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size);
1294297a3b0SGarrett D'Amore errno = EINVAL;
1302d08521bSGarrett D'Amore return (NULL);
1314297a3b0SGarrett D'Amore }
1326b5e5868SGarrett D'Amore
133723fee08SGarrett D'Amore i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
134723fee08SGarrett D'Amore (sizeof (collate_chain_t) * chains) +
135723fee08SGarrett D'Amore (sizeof (collate_large_t) * info->large_count);
1362d08521bSGarrett D'Amore for (z = 0; z < info->directive_count; z++) {
1376b5e5868SGarrett D'Amore i += sizeof (collate_subst_t) * info->subst_count[z];
1386b5e5868SGarrett D'Amore }
1396b5e5868SGarrett D'Amore if (i != (sbuf.st_size - (TMP - map))) {
1406b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size);
1416b5e5868SGarrett D'Amore errno = EINVAL;
1422d08521bSGarrett D'Amore return (NULL);
1436b5e5868SGarrett D'Amore }
1446b5e5868SGarrett D'Amore
1452d08521bSGarrett D'Amore
1462d08521bSGarrett D'Amore if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
1472d08521bSGarrett D'Amore (void) munmap(map, sbuf.st_size);
1482d08521bSGarrett D'Amore return (NULL);
1492d08521bSGarrett D'Amore }
1502d08521bSGarrett D'Amore lcc = ldata->l_data[0];
1512d08521bSGarrett D'Amore ldata->l_map = map;
1522d08521bSGarrett D'Amore ldata->l_map_len = sbuf.st_size;
1532d08521bSGarrett D'Amore
1542d08521bSGarrett D'Amore lcc->lc_info = info;
1552d08521bSGarrett D'Amore lcc->lc_directive_count = info->directive_count;
1562d08521bSGarrett D'Amore lcc->lc_large_count = info->large_count;
1572d08521bSGarrett D'Amore
1582d08521bSGarrett D'Amore for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
1592d08521bSGarrett D'Amore lcc->lc_directive[z] = info->directive[z];
1602d08521bSGarrett D'Amore lcc->lc_subst_count[z] = info->subst_count[z];
1612d08521bSGarrett D'Amore lcc->lc_pri_count[z] = info->pri_count[z];
1622d08521bSGarrett D'Amore lcc->lc_undef_pri[z] = info->undef_pri[z];
1632d08521bSGarrett D'Amore }
1642d08521bSGarrett D'Amore
1652d08521bSGarrett D'Amore lcc->lc_char_table = (void *)TMP;
166723fee08SGarrett D'Amore TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
1676b5e5868SGarrett D'Amore
1682d08521bSGarrett D'Amore for (z = 0; z < lcc->lc_directive_count; z++) {
1692d08521bSGarrett D'Amore int count;
1702d08521bSGarrett D'Amore if ((count = lcc->lc_subst_count[z]) > 0) {
1712d08521bSGarrett D'Amore lcc->lc_subst_table[z] = (void *)TMP;
1722d08521bSGarrett D'Amore TMP += count * sizeof (collate_subst_t);
1736b5e5868SGarrett D'Amore } else {
1742d08521bSGarrett D'Amore lcc->lc_subst_table[z] = NULL;
1756b5e5868SGarrett D'Amore }
1766b5e5868SGarrett D'Amore }
1776b5e5868SGarrett D'Amore
1786b5e5868SGarrett D'Amore if (chains > 0) {
1792d08521bSGarrett D'Amore lcc->lc_chain_table = (void *)TMP;
180723fee08SGarrett D'Amore TMP += chains * sizeof (collate_chain_t);
1814297a3b0SGarrett D'Amore } else
1822d08521bSGarrett D'Amore lcc->lc_chain_table = NULL;
1832d08521bSGarrett D'Amore lcc->lc_chain_count = chains;
1842d08521bSGarrett D'Amore if (lcc->lc_large_count > 0)
1852d08521bSGarrett D'Amore lcc->lc_large_table = (void *)TMP;
1866b5e5868SGarrett D'Amore else
1872d08521bSGarrett D'Amore lcc->lc_large_table = NULL;
1884297a3b0SGarrett D'Amore
1892d08521bSGarrett D'Amore return (ldata);
1904297a3b0SGarrett D'Amore }
1914297a3b0SGarrett D'Amore
1922d08521bSGarrett D'Amore static const int32_t *
substsearch(const struct lc_collate * lcc,const wchar_t key,int pass)1932d08521bSGarrett D'Amore substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
1944297a3b0SGarrett D'Amore {
1952d08521bSGarrett D'Amore const collate_subst_t *p;
1962d08521bSGarrett D'Amore int n = lcc->lc_subst_count[pass];
1974297a3b0SGarrett D'Amore
1986b5e5868SGarrett D'Amore if (n == 0)
1994297a3b0SGarrett D'Amore return (NULL);
2006b5e5868SGarrett D'Amore
2012d08521bSGarrett D'Amore if (pass >= lcc->lc_directive_count)
2024297a3b0SGarrett D'Amore return (NULL);
203723fee08SGarrett D'Amore
204723fee08SGarrett D'Amore if (!(key & COLLATE_SUBST_PRIORITY))
205723fee08SGarrett D'Amore return (NULL);
206723fee08SGarrett D'Amore
2072d08521bSGarrett D'Amore p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
208723fee08SGarrett D'Amore assert(p->key == key);
209723fee08SGarrett D'Amore return (p->pri);
2104297a3b0SGarrett D'Amore }
2116b5e5868SGarrett D'Amore
212723fee08SGarrett D'Amore /*
213723fee08SGarrett D'Amore * Note: for performance reasons, we have expanded bsearch here. This avoids
214723fee08SGarrett D'Amore * function call overhead with each comparison.
215723fee08SGarrett D'Amore */
216723fee08SGarrett D'Amore
217723fee08SGarrett D'Amore static collate_chain_t *
chainsearch(const struct lc_collate * lcc,const wchar_t * key,int * len)2182d08521bSGarrett D'Amore chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
2196b5e5868SGarrett D'Amore {
2206b5e5868SGarrett D'Amore int low;
2216b5e5868SGarrett D'Amore int high;
2226b5e5868SGarrett D'Amore int next, compar, l;
223723fee08SGarrett D'Amore collate_chain_t *p;
224723fee08SGarrett D'Amore collate_chain_t *tab;
2256b5e5868SGarrett D'Amore
2262d08521bSGarrett D'Amore if (lcc->lc_info->chain_count == 0)
2276b5e5868SGarrett D'Amore return (NULL);
2286b5e5868SGarrett D'Amore
2296b5e5868SGarrett D'Amore low = 0;
2302d08521bSGarrett D'Amore high = lcc->lc_info->chain_count - 1;
2312d08521bSGarrett D'Amore tab = lcc->lc_chain_table;
2326b5e5868SGarrett D'Amore
2336b5e5868SGarrett D'Amore while (low <= high) {
2346b5e5868SGarrett D'Amore next = (low + high) / 2;
2356b5e5868SGarrett D'Amore p = tab + next;
2366b5e5868SGarrett D'Amore compar = *key - *p->str;
2376b5e5868SGarrett D'Amore if (compar == 0) {
2386b5e5868SGarrett D'Amore l = wcsnlen(p->str, COLLATE_STR_LEN);
2396b5e5868SGarrett D'Amore compar = wcsncmp(key, p->str, l);
2406b5e5868SGarrett D'Amore if (compar == 0) {
2416b5e5868SGarrett D'Amore *len = l;
2426b5e5868SGarrett D'Amore return (p);
2434297a3b0SGarrett D'Amore }
2444297a3b0SGarrett D'Amore }
2456b5e5868SGarrett D'Amore if (compar > 0)
2466b5e5868SGarrett D'Amore low = next + 1;
2476b5e5868SGarrett D'Amore else
2486b5e5868SGarrett D'Amore high = next - 1;
2496b5e5868SGarrett D'Amore }
2506b5e5868SGarrett D'Amore return (NULL);
2516b5e5868SGarrett D'Amore }
2526b5e5868SGarrett D'Amore
253723fee08SGarrett D'Amore static collate_large_t *
largesearch(const struct lc_collate * lcc,const wchar_t key)2542d08521bSGarrett D'Amore largesearch(const struct lc_collate *lcc, const wchar_t key)
2556b5e5868SGarrett D'Amore {
2566b5e5868SGarrett D'Amore int low = 0;
2572d08521bSGarrett D'Amore int high = lcc->lc_info->large_count - 1;
2586b5e5868SGarrett D'Amore int next, compar;
259723fee08SGarrett D'Amore collate_large_t *p;
2602d08521bSGarrett D'Amore collate_large_t *tab = lcc->lc_large_table;
2616b5e5868SGarrett D'Amore
2622d08521bSGarrett D'Amore if (lcc->lc_info->large_count == 0)
2636b5e5868SGarrett D'Amore return (NULL);
2646b5e5868SGarrett D'Amore
2656b5e5868SGarrett D'Amore while (low <= high) {
2666b5e5868SGarrett D'Amore next = (low + high) / 2;
2676b5e5868SGarrett D'Amore p = tab + next;
2686b5e5868SGarrett D'Amore compar = key - p->val;
2696b5e5868SGarrett D'Amore if (compar == 0)
2706b5e5868SGarrett D'Amore return (p);
2716b5e5868SGarrett D'Amore if (compar > 0)
2726b5e5868SGarrett D'Amore low = next + 1;
2736b5e5868SGarrett D'Amore else
2746b5e5868SGarrett D'Amore high = next - 1;
2756b5e5868SGarrett D'Amore }
2766b5e5868SGarrett D'Amore return (NULL);
2774297a3b0SGarrett D'Amore }
2784297a3b0SGarrett D'Amore
2794297a3b0SGarrett D'Amore void
_collate_lookup(const struct lc_collate * lcc,const wchar_t * t,int * len,int * pri,int which,const int ** state)2802d08521bSGarrett D'Amore _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
2812d08521bSGarrett D'Amore int *len, int *pri, int which, const int **state)
2824297a3b0SGarrett D'Amore {
283723fee08SGarrett D'Amore collate_chain_t *p2;
284723fee08SGarrett D'Amore collate_large_t *match;
2856b5e5868SGarrett D'Amore int p, l;
2862d08521bSGarrett D'Amore const int *sptr;
2874297a3b0SGarrett D'Amore
2886b5e5868SGarrett D'Amore /*
289723fee08SGarrett D'Amore * If this is the "last" pass for the UNDEFINED, then
290723fee08SGarrett D'Amore * we just return the priority itself.
291723fee08SGarrett D'Amore */
2922d08521bSGarrett D'Amore if (which >= lcc->lc_directive_count) {
293723fee08SGarrett D'Amore *pri = *t;
294723fee08SGarrett D'Amore *len = 1;
295723fee08SGarrett D'Amore *state = NULL;
296723fee08SGarrett D'Amore return;
297723fee08SGarrett D'Amore }
298723fee08SGarrett D'Amore
299723fee08SGarrett D'Amore /*
3006b5e5868SGarrett D'Amore * If we have remaining substitution data from a previous
3016b5e5868SGarrett D'Amore * call, consume it first.
3026b5e5868SGarrett D'Amore */
3036b5e5868SGarrett D'Amore if ((sptr = *state) != NULL) {
3046b5e5868SGarrett D'Amore *pri = *sptr;
3056b5e5868SGarrett D'Amore sptr++;
3066b5e5868SGarrett D'Amore *state = *sptr ? sptr : NULL;
3076b5e5868SGarrett D'Amore *len = 0;
3084297a3b0SGarrett D'Amore return;
3094297a3b0SGarrett D'Amore }
3106b5e5868SGarrett D'Amore
3116b5e5868SGarrett D'Amore /* No active substitutions */
3126b5e5868SGarrett D'Amore *len = 1;
3136b5e5868SGarrett D'Amore
3146b5e5868SGarrett D'Amore /*
3156b5e5868SGarrett D'Amore * Check for composites such as dipthongs that collate as a
3166b5e5868SGarrett D'Amore * single element (aka chains or collating-elements).
3176b5e5868SGarrett D'Amore */
3182d08521bSGarrett D'Amore if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
3196b5e5868SGarrett D'Amore ((p = p2->pri[which]) >= 0)) {
3206b5e5868SGarrett D'Amore
3216b5e5868SGarrett D'Amore *len = l;
3226b5e5868SGarrett D'Amore *pri = p;
3236b5e5868SGarrett D'Amore
3246b5e5868SGarrett D'Amore } else if (*t <= UCHAR_MAX) {
3256b5e5868SGarrett D'Amore
3266b5e5868SGarrett D'Amore /*
3276b5e5868SGarrett D'Amore * Character is a small (8-bit) character.
3286b5e5868SGarrett D'Amore * We just look these up directly for speed.
3296b5e5868SGarrett D'Amore */
3302d08521bSGarrett D'Amore *pri = lcc->lc_char_table[*t].pri[which];
3316b5e5868SGarrett D'Amore
3322d08521bSGarrett D'Amore } else if ((lcc->lc_info->large_count > 0) &&
3332d08521bSGarrett D'Amore ((match = largesearch(lcc, *t)) != NULL)) {
3346b5e5868SGarrett D'Amore
3356b5e5868SGarrett D'Amore /*
3366b5e5868SGarrett D'Amore * Character was found in the extended table.
3376b5e5868SGarrett D'Amore */
3386b5e5868SGarrett D'Amore *pri = match->pri.pri[which];
3396b5e5868SGarrett D'Amore
3406b5e5868SGarrett D'Amore } else {
3416b5e5868SGarrett D'Amore /*
3426b5e5868SGarrett D'Amore * Character lacks a specific definition.
3436b5e5868SGarrett D'Amore */
3442d08521bSGarrett D'Amore if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
3456b5e5868SGarrett D'Amore /* Mask off sign bit to prevent ordering confusion. */
3466b5e5868SGarrett D'Amore *pri = (*t & COLLATE_MAX_PRIORITY);
3476b5e5868SGarrett D'Amore } else {
3482d08521bSGarrett D'Amore *pri = lcc->lc_undef_pri[which];
3494297a3b0SGarrett D'Amore }
3506b5e5868SGarrett D'Amore /* No substitutions for undefined characters! */
3516b5e5868SGarrett D'Amore return;
3526b5e5868SGarrett D'Amore }
3536b5e5868SGarrett D'Amore
3546b5e5868SGarrett D'Amore /*
3556b5e5868SGarrett D'Amore * Try substituting (expanding) the character. We are
3566b5e5868SGarrett D'Amore * currently doing this *after* the chain compression. I
3576b5e5868SGarrett D'Amore * think it should not matter, but this way might be slightly
3586b5e5868SGarrett D'Amore * faster.
3596b5e5868SGarrett D'Amore *
3606b5e5868SGarrett D'Amore * We do this after the priority search, as this will help us
3616b5e5868SGarrett D'Amore * to identify a single key value. In order for this to work,
3626b5e5868SGarrett D'Amore * its important that the priority assigned to a given element
3636b5e5868SGarrett D'Amore * to be substituted be unique for that level. The localedef
3646b5e5868SGarrett D'Amore * code ensures this for us.
3656b5e5868SGarrett D'Amore */
3662d08521bSGarrett D'Amore if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
3676b5e5868SGarrett D'Amore if ((*pri = *sptr) != 0) {
3686b5e5868SGarrett D'Amore sptr++;
3696b5e5868SGarrett D'Amore *state = *sptr ? sptr : NULL;
3706b5e5868SGarrett D'Amore }
3716b5e5868SGarrett D'Amore }
3726b5e5868SGarrett D'Amore
3736b5e5868SGarrett D'Amore }
3746b5e5868SGarrett D'Amore
3756b5e5868SGarrett D'Amore /*
3766b5e5868SGarrett D'Amore * This is the meaty part of wcsxfrm & strxfrm. Note that it does
3776b5e5868SGarrett D'Amore * NOT NULL terminate. That is left to the caller.
3786b5e5868SGarrett D'Amore */
3796b5e5868SGarrett D'Amore size_t
_collate_wxfrm(const struct lc_collate * lcc,const wchar_t * src,wchar_t * xf,size_t room)3802d08521bSGarrett D'Amore _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
3812d08521bSGarrett D'Amore size_t room)
3826b5e5868SGarrett D'Amore {
3836b5e5868SGarrett D'Amore int pri;
3846b5e5868SGarrett D'Amore int len;
3856b5e5868SGarrett D'Amore const wchar_t *t;
3866b5e5868SGarrett D'Amore wchar_t *tr = NULL;
3876b5e5868SGarrett D'Amore int direc;
3886b5e5868SGarrett D'Amore int pass;
3892d08521bSGarrett D'Amore const int32_t *state;
3906b5e5868SGarrett D'Amore size_t want = 0;
3916b5e5868SGarrett D'Amore size_t need = 0;
3922d08521bSGarrett D'Amore int ndir = lcc->lc_directive_count;
3936b5e5868SGarrett D'Amore
3946b5e5868SGarrett D'Amore assert(src);
3956b5e5868SGarrett D'Amore
3962d08521bSGarrett D'Amore for (pass = 0; pass <= ndir; pass++) {
3976b5e5868SGarrett D'Amore
3986b5e5868SGarrett D'Amore state = NULL;
3996b5e5868SGarrett D'Amore
4006b5e5868SGarrett D'Amore if (pass != 0) {
4016b5e5868SGarrett D'Amore /* insert level separator from the previous pass */
4026b5e5868SGarrett D'Amore if (room) {
4036b5e5868SGarrett D'Amore *xf++ = 1;
4046b5e5868SGarrett D'Amore room--;
4056b5e5868SGarrett D'Amore }
4066b5e5868SGarrett D'Amore want++;
4076b5e5868SGarrett D'Amore }
4086b5e5868SGarrett D'Amore
4096b5e5868SGarrett D'Amore /* special pass for undefined */
4102d08521bSGarrett D'Amore if (pass == ndir) {
4116b5e5868SGarrett D'Amore direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
4126b5e5868SGarrett D'Amore } else {
4132d08521bSGarrett D'Amore direc = lcc->lc_directive[pass];
4146b5e5868SGarrett D'Amore }
4156b5e5868SGarrett D'Amore
4166b5e5868SGarrett D'Amore t = src;
4176b5e5868SGarrett D'Amore
4186b5e5868SGarrett D'Amore if (direc & DIRECTIVE_BACKWARD) {
4196b5e5868SGarrett D'Amore wchar_t *bp, *fp, c;
4206b5e5868SGarrett D'Amore if (tr)
4216b5e5868SGarrett D'Amore free(tr);
4226b5e5868SGarrett D'Amore if ((tr = wcsdup(t)) == NULL) {
4236b5e5868SGarrett D'Amore errno = ENOMEM;
4246b5e5868SGarrett D'Amore goto fail;
4256b5e5868SGarrett D'Amore }
4266b5e5868SGarrett D'Amore bp = tr;
4276b5e5868SGarrett D'Amore fp = tr + wcslen(tr) - 1;
4286b5e5868SGarrett D'Amore while (bp < fp) {
4296b5e5868SGarrett D'Amore c = *bp;
4306b5e5868SGarrett D'Amore *bp++ = *fp;
4316b5e5868SGarrett D'Amore *fp-- = c;
4326b5e5868SGarrett D'Amore }
4336b5e5868SGarrett D'Amore t = (const wchar_t *)tr;
4346b5e5868SGarrett D'Amore }
4356b5e5868SGarrett D'Amore
4366b5e5868SGarrett D'Amore if (direc & DIRECTIVE_POSITION) {
4376b5e5868SGarrett D'Amore while (*t || state) {
4382d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass,
4392d08521bSGarrett D'Amore &state);
4406b5e5868SGarrett D'Amore t += len;
4416b5e5868SGarrett D'Amore if (pri <= 0) {
4426b5e5868SGarrett D'Amore if (pri < 0) {
4436b5e5868SGarrett D'Amore errno = EINVAL;
4446b5e5868SGarrett D'Amore goto fail;
4456b5e5868SGarrett D'Amore }
4466b5e5868SGarrett D'Amore pri = COLLATE_MAX_PRIORITY;
4476b5e5868SGarrett D'Amore }
4486b5e5868SGarrett D'Amore if (room) {
4496b5e5868SGarrett D'Amore *xf++ = pri;
4506b5e5868SGarrett D'Amore room--;
4516b5e5868SGarrett D'Amore }
4526b5e5868SGarrett D'Amore want++;
4536b5e5868SGarrett D'Amore need = want;
4546b5e5868SGarrett D'Amore }
4556b5e5868SGarrett D'Amore } else {
4566b5e5868SGarrett D'Amore while (*t || state) {
4572d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass,
4582d08521bSGarrett D'Amore &state);
4596b5e5868SGarrett D'Amore t += len;
4606b5e5868SGarrett D'Amore if (pri <= 0) {
4616b5e5868SGarrett D'Amore if (pri < 0) {
4626b5e5868SGarrett D'Amore errno = EINVAL;
4636b5e5868SGarrett D'Amore goto fail;
4646b5e5868SGarrett D'Amore }
4656b5e5868SGarrett D'Amore continue;
4666b5e5868SGarrett D'Amore }
4676b5e5868SGarrett D'Amore if (room) {
4686b5e5868SGarrett D'Amore *xf++ = pri;
4696b5e5868SGarrett D'Amore room--;
4706b5e5868SGarrett D'Amore }
4716b5e5868SGarrett D'Amore want++;
4726b5e5868SGarrett D'Amore need = want;
4736b5e5868SGarrett D'Amore }
4746b5e5868SGarrett D'Amore }
4756b5e5868SGarrett D'Amore }
4766b5e5868SGarrett D'Amore
4776b5e5868SGarrett D'Amore end:
4786b5e5868SGarrett D'Amore if (tr)
4796b5e5868SGarrett D'Amore free(tr);
4806b5e5868SGarrett D'Amore return (need);
4816b5e5868SGarrett D'Amore
4826b5e5868SGarrett D'Amore fail:
4836b5e5868SGarrett D'Amore if (tr)
4846b5e5868SGarrett D'Amore free(tr);
4856b5e5868SGarrett D'Amore return ((size_t)(-1));
4866b5e5868SGarrett D'Amore }
4876b5e5868SGarrett D'Amore
4886b5e5868SGarrett D'Amore /*
4896b5e5868SGarrett D'Amore * In the non-POSIX case, we transform each character into a string of
4906b5e5868SGarrett D'Amore * characters representing the character's priority. Since char is usually
4916b5e5868SGarrett D'Amore * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
4926b5e5868SGarrett D'Amore * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
4936b5e5868SGarrett D'Amore * bits per byte.
4946b5e5868SGarrett D'Amore *
4956b5e5868SGarrett D'Amore * It turns out that we sometimes have real priorities that are
4966b5e5868SGarrett D'Amore * 31-bits wide. (But: be careful using priorities where the high
4976b5e5868SGarrett D'Amore * order bit is set -- i.e. the priority is negative. The sort order
4986b5e5868SGarrett D'Amore * may be surprising!)
4996b5e5868SGarrett D'Amore *
5006b5e5868SGarrett D'Amore * TODO: This would be a good area to optimize somewhat. It turns out
5016b5e5868SGarrett D'Amore * that real prioririties *except for the last UNDEFINED pass* are generally
5026b5e5868SGarrett D'Amore * very small. We need the localedef code to precalculate the max
5036b5e5868SGarrett D'Amore * priority for us, and ideally also give us a mask, and then we could
5046b5e5868SGarrett D'Amore * severely limit what we expand to.
5056b5e5868SGarrett D'Amore */
5066b5e5868SGarrett D'Amore #define XFRM_BYTES 6
5076b5e5868SGarrett D'Amore #define XFRM_OFFSET ('0') /* make all printable characters */
5086b5e5868SGarrett D'Amore #define XFRM_SHIFT 6
5096b5e5868SGarrett D'Amore #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
5106b5e5868SGarrett D'Amore #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
5116b5e5868SGarrett D'Amore
512723fee08SGarrett D'Amore static int
xfrm(locale_t loc,unsigned char * p,int pri,int pass)5132d08521bSGarrett D'Amore xfrm(locale_t loc, unsigned char *p, int pri, int pass)
5146b5e5868SGarrett D'Amore {
515723fee08SGarrett D'Amore /* we use unsigned to ensure zero fill on right shift */
5162d08521bSGarrett D'Amore uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
517723fee08SGarrett D'Amore int nc = 0;
518723fee08SGarrett D'Amore
519723fee08SGarrett D'Amore while (val) {
520723fee08SGarrett D'Amore *p = (pri & XFRM_MASK) + XFRM_OFFSET;
5216b5e5868SGarrett D'Amore pri >>= XFRM_SHIFT;
522723fee08SGarrett D'Amore val >>= XFRM_SHIFT;
523723fee08SGarrett D'Amore p++;
524723fee08SGarrett D'Amore nc++;
5256b5e5868SGarrett D'Amore }
526723fee08SGarrett D'Amore return (nc);
5276b5e5868SGarrett D'Amore }
5286b5e5868SGarrett D'Amore
5296b5e5868SGarrett D'Amore size_t
_collate_sxfrm(const wchar_t * src,char * xf,size_t room,locale_t loc)5302d08521bSGarrett D'Amore _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
5316b5e5868SGarrett D'Amore {
5326b5e5868SGarrett D'Amore int pri;
5336b5e5868SGarrett D'Amore int len;
5346b5e5868SGarrett D'Amore const wchar_t *t;
5356b5e5868SGarrett D'Amore wchar_t *tr = NULL;
5366b5e5868SGarrett D'Amore int direc;
5376b5e5868SGarrett D'Amore int pass;
5382d08521bSGarrett D'Amore const int32_t *state;
5396b5e5868SGarrett D'Amore size_t want = 0;
5406b5e5868SGarrett D'Amore size_t need = 0;
5416b5e5868SGarrett D'Amore int b;
5426b5e5868SGarrett D'Amore uint8_t buf[XFRM_BYTES];
5432d08521bSGarrett D'Amore const struct lc_collate *lcc = loc->collate;
5442d08521bSGarrett D'Amore int ndir = lcc->lc_directive_count;
5456b5e5868SGarrett D'Amore
5466b5e5868SGarrett D'Amore assert(src);
5476b5e5868SGarrett D'Amore
5482d08521bSGarrett D'Amore for (pass = 0; pass <= ndir; pass++) {
5496b5e5868SGarrett D'Amore
5506b5e5868SGarrett D'Amore state = NULL;
5516b5e5868SGarrett D'Amore
5526b5e5868SGarrett D'Amore if (pass != 0) {
5536b5e5868SGarrett D'Amore /* insert level separator from the previous pass */
5546b5e5868SGarrett D'Amore if (room) {
5556b5e5868SGarrett D'Amore *xf++ = XFRM_SEP;
5566b5e5868SGarrett D'Amore room--;
5576b5e5868SGarrett D'Amore }
5586b5e5868SGarrett D'Amore want++;
5596b5e5868SGarrett D'Amore }
5606b5e5868SGarrett D'Amore
5616b5e5868SGarrett D'Amore /* special pass for undefined */
5622d08521bSGarrett D'Amore if (pass == ndir) {
5636b5e5868SGarrett D'Amore direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
5646b5e5868SGarrett D'Amore } else {
5652d08521bSGarrett D'Amore direc = lcc->lc_directive[pass];
5666b5e5868SGarrett D'Amore }
5676b5e5868SGarrett D'Amore
5686b5e5868SGarrett D'Amore t = src;
5696b5e5868SGarrett D'Amore
5706b5e5868SGarrett D'Amore if (direc & DIRECTIVE_BACKWARD) {
5716b5e5868SGarrett D'Amore wchar_t *bp, *fp, c;
5726b5e5868SGarrett D'Amore if (tr)
5736b5e5868SGarrett D'Amore free(tr);
5746b5e5868SGarrett D'Amore if ((tr = wcsdup(t)) == NULL) {
5756b5e5868SGarrett D'Amore errno = ENOMEM;
5766b5e5868SGarrett D'Amore goto fail;
5776b5e5868SGarrett D'Amore }
5786b5e5868SGarrett D'Amore bp = tr;
5796b5e5868SGarrett D'Amore fp = tr + wcslen(tr) - 1;
5806b5e5868SGarrett D'Amore while (bp < fp) {
5816b5e5868SGarrett D'Amore c = *bp;
5826b5e5868SGarrett D'Amore *bp++ = *fp;
5836b5e5868SGarrett D'Amore *fp-- = c;
5846b5e5868SGarrett D'Amore }
5856b5e5868SGarrett D'Amore t = (const wchar_t *)tr;
5866b5e5868SGarrett D'Amore }
5876b5e5868SGarrett D'Amore
5886b5e5868SGarrett D'Amore if (direc & DIRECTIVE_POSITION) {
5896b5e5868SGarrett D'Amore while (*t || state) {
5906b5e5868SGarrett D'Amore
5912d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass,
5922d08521bSGarrett D'Amore &state);
5936b5e5868SGarrett D'Amore t += len;
5946b5e5868SGarrett D'Amore if (pri <= 0) {
5956b5e5868SGarrett D'Amore if (pri < 0) {
5966b5e5868SGarrett D'Amore errno = EINVAL;
5976b5e5868SGarrett D'Amore goto fail;
5986b5e5868SGarrett D'Amore }
5996b5e5868SGarrett D'Amore pri = COLLATE_MAX_PRIORITY;
6006b5e5868SGarrett D'Amore }
6016b5e5868SGarrett D'Amore
6022d08521bSGarrett D'Amore b = xfrm(loc, buf, pri, pass);
603723fee08SGarrett D'Amore want += b;
6046b5e5868SGarrett D'Amore if (room) {
6056b5e5868SGarrett D'Amore while (b) {
6066b5e5868SGarrett D'Amore b--;
60763481050STheo Schlossnagle if (room) {
6086b5e5868SGarrett D'Amore *xf++ = buf[b];
6096b5e5868SGarrett D'Amore room--;
6106b5e5868SGarrett D'Amore }
6116b5e5868SGarrett D'Amore }
61263481050STheo Schlossnagle }
6136b5e5868SGarrett D'Amore need = want;
6146b5e5868SGarrett D'Amore }
6156b5e5868SGarrett D'Amore } else {
6166b5e5868SGarrett D'Amore while (*t || state) {
6172d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass,
6182d08521bSGarrett D'Amore &state);
6196b5e5868SGarrett D'Amore t += len;
6206b5e5868SGarrett D'Amore if (pri <= 0) {
6216b5e5868SGarrett D'Amore if (pri < 0) {
6226b5e5868SGarrett D'Amore errno = EINVAL;
6236b5e5868SGarrett D'Amore goto fail;
6246b5e5868SGarrett D'Amore }
6256b5e5868SGarrett D'Amore continue;
6266b5e5868SGarrett D'Amore }
627723fee08SGarrett D'Amore
6282d08521bSGarrett D'Amore b = xfrm(loc, buf, pri, pass);
629723fee08SGarrett D'Amore want += b;
6306b5e5868SGarrett D'Amore if (room) {
6316b5e5868SGarrett D'Amore
6326b5e5868SGarrett D'Amore while (b) {
6336b5e5868SGarrett D'Amore b--;
63463481050STheo Schlossnagle if (room) {
6356b5e5868SGarrett D'Amore *xf++ = buf[b];
6366b5e5868SGarrett D'Amore room--;
6376b5e5868SGarrett D'Amore }
6386b5e5868SGarrett D'Amore }
63963481050STheo Schlossnagle }
6406b5e5868SGarrett D'Amore need = want;
6416b5e5868SGarrett D'Amore }
6426b5e5868SGarrett D'Amore }
6436b5e5868SGarrett D'Amore }
6446b5e5868SGarrett D'Amore
6456b5e5868SGarrett D'Amore end:
6466b5e5868SGarrett D'Amore if (tr)
6476b5e5868SGarrett D'Amore free(tr);
6486b5e5868SGarrett D'Amore return (need);
6496b5e5868SGarrett D'Amore
6506b5e5868SGarrett D'Amore fail:
6516b5e5868SGarrett D'Amore if (tr)
6526b5e5868SGarrett D'Amore free(tr);
6536b5e5868SGarrett D'Amore return ((size_t)(-1));
6544297a3b0SGarrett D'Amore }
655