14297a3b0SGarrett D'Amore /* 22d08521bSGarrett D'Amore * Copyright 2014 Garrett D'Amore <garrett@damore.org> 32d08521bSGarrett D'Amore * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 44297a3b0SGarrett D'Amore * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 54297a3b0SGarrett D'Amore * at Electronni Visti IA, Kiev, Ukraine. 64297a3b0SGarrett D'Amore * All rights reserved. 74297a3b0SGarrett D'Amore * 84297a3b0SGarrett D'Amore * Redistribution and use in source and binary forms, with or without 94297a3b0SGarrett D'Amore * modification, are permitted provided that the following conditions 104297a3b0SGarrett D'Amore * are met: 114297a3b0SGarrett D'Amore * 1. Redistributions of source code must retain the above copyright 124297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer. 134297a3b0SGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright 144297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer in the 154297a3b0SGarrett D'Amore * documentation and/or other materials provided with the distribution. 164297a3b0SGarrett D'Amore * 174297a3b0SGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 184297a3b0SGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 194297a3b0SGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 204297a3b0SGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 214297a3b0SGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 224297a3b0SGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 234297a3b0SGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 244297a3b0SGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 254297a3b0SGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 264297a3b0SGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 274297a3b0SGarrett D'Amore * SUCH DAMAGE. 284297a3b0SGarrett D'Amore */ 294297a3b0SGarrett D'Amore 304297a3b0SGarrett D'Amore #include "lint.h" 314297a3b0SGarrett D'Amore #include "file64.h" 324297a3b0SGarrett D'Amore #include <stdio.h> 334297a3b0SGarrett D'Amore #include <stdlib.h> 346b5e5868SGarrett D'Amore #include <stddef.h> 354297a3b0SGarrett D'Amore #include <string.h> 366b5e5868SGarrett D'Amore #include <wchar.h> 374297a3b0SGarrett D'Amore #include <errno.h> 384297a3b0SGarrett D'Amore #include <unistd.h> 396b5e5868SGarrett D'Amore #include <ctype.h> 406b5e5868SGarrett D'Amore #include <unistd.h> 416b5e5868SGarrett D'Amore #include <fcntl.h> 426b5e5868SGarrett D'Amore #include <assert.h> 436b5e5868SGarrett D'Amore #include <sys/stat.h> 446b5e5868SGarrett D'Amore #include <sys/mman.h> 454297a3b0SGarrett D'Amore 464297a3b0SGarrett D'Amore #include "collate.h" 47*bc09504fSGordon Ross #include "localeimpl.h" 48*bc09504fSGordon Ross 49*bc09504fSGordon Ross /* Check file format vs libc runtime. (See collatefile.h) */ 50*bc09504fSGordon Ross #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX 51*bc09504fSGordon Ross #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX" 52*bc09504fSGordon Ross #endif 534297a3b0SGarrett D'Amore 54723fee08SGarrett D'Amore /* 55723fee08SGarrett D'Amore * See the comments in usr/src/cmd/localedef/collate.c for further 56723fee08SGarrett D'Amore * information. It would also be very helpful to have a copy of the 57723fee08SGarrett D'Amore * POSIX standard for collation (in the locale format manual page) 58723fee08SGarrett D'Amore * handy (www.opengroup.org). 59723fee08SGarrett D'Amore */ 60723fee08SGarrett D'Amore 612d08521bSGarrett D'Amore /* 622d08521bSGarrett D'Amore * POSIX uses empty tables and falls down to strcmp. 632d08521bSGarrett D'Amore */ 642d08521bSGarrett D'Amore struct lc_collate lc_collate_posix = { 652d08521bSGarrett D'Amore .lc_is_posix = 1, 662d08521bSGarrett D'Amore }; 674297a3b0SGarrett D'Amore 682d08521bSGarrett D'Amore struct locdata __posix_collate_locdata = { 692d08521bSGarrett D'Amore .l_lname = "C", 702d08521bSGarrett D'Amore .l_data = { &lc_collate_posix } 712d08521bSGarrett D'Amore }; 726b5e5868SGarrett D'Amore 734297a3b0SGarrett D'Amore 742d08521bSGarrett D'Amore struct locdata * 752d08521bSGarrett D'Amore __lc_collate_load(const char *locname) 764297a3b0SGarrett D'Amore { 776b5e5868SGarrett D'Amore int i, chains, z; 786b5e5868SGarrett D'Amore char buf[PATH_MAX]; 796b5e5868SGarrett D'Amore char *TMP; 806b5e5868SGarrett D'Amore char *map; 816b5e5868SGarrett D'Amore collate_info_t *info; 826b5e5868SGarrett D'Amore struct stat sbuf; 836b5e5868SGarrett D'Amore int fd; 842d08521bSGarrett D'Amore struct locdata *ldata; 852d08521bSGarrett D'Amore struct lc_collate *lcc; 864297a3b0SGarrett D'Amore 874297a3b0SGarrett D'Amore /* 884297a3b0SGarrett D'Amore * Slurp the locale file into the cache. 894297a3b0SGarrett D'Amore */ 904297a3b0SGarrett D'Amore 916da5aa94SGarrett D'Amore (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 922d08521bSGarrett D'Amore _PathLocale, locname); 936da5aa94SGarrett D'Amore 942d08521bSGarrett D'Amore if ((fd = open(buf, O_RDONLY)) < 0) { 952d08521bSGarrett D'Amore errno = EINVAL; 962d08521bSGarrett D'Amore return (NULL); 972d08521bSGarrett D'Amore } 986b5e5868SGarrett D'Amore if (fstat(fd, &sbuf) < 0) { 996b5e5868SGarrett D'Amore (void) close(fd); 1002d08521bSGarrett D'Amore errno = EINVAL; 1012d08521bSGarrett D'Amore return (NULL); 1026b5e5868SGarrett D'Amore } 1036b5e5868SGarrett D'Amore if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 1046b5e5868SGarrett D'Amore (void) close(fd); 1056b5e5868SGarrett D'Amore errno = EINVAL; 1062d08521bSGarrett D'Amore return (NULL); 1076b5e5868SGarrett D'Amore } 1086b5e5868SGarrett D'Amore map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 1096b5e5868SGarrett D'Amore (void) close(fd); 1106b5e5868SGarrett D'Amore if ((TMP = map) == NULL) { 1112d08521bSGarrett D'Amore errno = EINVAL; 1122d08521bSGarrett D'Amore return (NULL); 1136b5e5868SGarrett D'Amore } 1144297a3b0SGarrett D'Amore 1156b5e5868SGarrett D'Amore if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 1166b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size); 1174297a3b0SGarrett D'Amore errno = EINVAL; 1182d08521bSGarrett D'Amore return (NULL); 1194297a3b0SGarrett D'Amore } 1206b5e5868SGarrett D'Amore TMP += COLLATE_STR_LEN; 1216b5e5868SGarrett D'Amore 1226b5e5868SGarrett D'Amore info = (void *)TMP; 1236b5e5868SGarrett D'Amore TMP += sizeof (*info); 1246b5e5868SGarrett D'Amore 1256b5e5868SGarrett D'Amore if ((info->directive_count < 1) || 1266b5e5868SGarrett D'Amore (info->directive_count >= COLL_WEIGHTS_MAX) || 1276b5e5868SGarrett D'Amore ((chains = info->chain_count) < 0)) { 1286b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size); 1294297a3b0SGarrett D'Amore errno = EINVAL; 1302d08521bSGarrett D'Amore return (NULL); 1314297a3b0SGarrett D'Amore } 1326b5e5868SGarrett D'Amore 133723fee08SGarrett D'Amore i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 134723fee08SGarrett D'Amore (sizeof (collate_chain_t) * chains) + 135723fee08SGarrett D'Amore (sizeof (collate_large_t) * info->large_count); 1362d08521bSGarrett D'Amore for (z = 0; z < info->directive_count; z++) { 1376b5e5868SGarrett D'Amore i += sizeof (collate_subst_t) * info->subst_count[z]; 1386b5e5868SGarrett D'Amore } 1396b5e5868SGarrett D'Amore if (i != (sbuf.st_size - (TMP - map))) { 1406b5e5868SGarrett D'Amore (void) munmap(map, sbuf.st_size); 1416b5e5868SGarrett D'Amore errno = EINVAL; 1422d08521bSGarrett D'Amore return (NULL); 1436b5e5868SGarrett D'Amore } 1446b5e5868SGarrett D'Amore 1452d08521bSGarrett D'Amore 1462d08521bSGarrett D'Amore if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) { 1472d08521bSGarrett D'Amore (void) munmap(map, sbuf.st_size); 1482d08521bSGarrett D'Amore return (NULL); 1492d08521bSGarrett D'Amore } 1502d08521bSGarrett D'Amore lcc = ldata->l_data[0]; 1512d08521bSGarrett D'Amore ldata->l_map = map; 1522d08521bSGarrett D'Amore ldata->l_map_len = sbuf.st_size; 1532d08521bSGarrett D'Amore 1542d08521bSGarrett D'Amore lcc->lc_info = info; 1552d08521bSGarrett D'Amore lcc->lc_directive_count = info->directive_count; 1562d08521bSGarrett D'Amore lcc->lc_large_count = info->large_count; 1572d08521bSGarrett D'Amore 1582d08521bSGarrett D'Amore for (z = 0; z < COLL_WEIGHTS_MAX; z++) { 1592d08521bSGarrett D'Amore lcc->lc_directive[z] = info->directive[z]; 1602d08521bSGarrett D'Amore lcc->lc_subst_count[z] = info->subst_count[z]; 1612d08521bSGarrett D'Amore lcc->lc_pri_count[z] = info->pri_count[z]; 1622d08521bSGarrett D'Amore lcc->lc_undef_pri[z] = info->undef_pri[z]; 1632d08521bSGarrett D'Amore } 1642d08521bSGarrett D'Amore 1652d08521bSGarrett D'Amore lcc->lc_char_table = (void *)TMP; 166723fee08SGarrett D'Amore TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 1676b5e5868SGarrett D'Amore 1682d08521bSGarrett D'Amore for (z = 0; z < lcc->lc_directive_count; z++) { 1692d08521bSGarrett D'Amore int count; 1702d08521bSGarrett D'Amore if ((count = lcc->lc_subst_count[z]) > 0) { 1712d08521bSGarrett D'Amore lcc->lc_subst_table[z] = (void *)TMP; 1722d08521bSGarrett D'Amore TMP += count * sizeof (collate_subst_t); 1736b5e5868SGarrett D'Amore } else { 1742d08521bSGarrett D'Amore lcc->lc_subst_table[z] = NULL; 1756b5e5868SGarrett D'Amore } 1766b5e5868SGarrett D'Amore } 1776b5e5868SGarrett D'Amore 1786b5e5868SGarrett D'Amore if (chains > 0) { 1792d08521bSGarrett D'Amore lcc->lc_chain_table = (void *)TMP; 180723fee08SGarrett D'Amore TMP += chains * sizeof (collate_chain_t); 1814297a3b0SGarrett D'Amore } else 1822d08521bSGarrett D'Amore lcc->lc_chain_table = NULL; 1832d08521bSGarrett D'Amore lcc->lc_chain_count = chains; 1842d08521bSGarrett D'Amore if (lcc->lc_large_count > 0) 1852d08521bSGarrett D'Amore lcc->lc_large_table = (void *)TMP; 1866b5e5868SGarrett D'Amore else 1872d08521bSGarrett D'Amore lcc->lc_large_table = NULL; 1884297a3b0SGarrett D'Amore 1892d08521bSGarrett D'Amore return (ldata); 1904297a3b0SGarrett D'Amore } 1914297a3b0SGarrett D'Amore 1922d08521bSGarrett D'Amore static const int32_t * 1932d08521bSGarrett D'Amore substsearch(const struct lc_collate *lcc, const wchar_t key, int pass) 1944297a3b0SGarrett D'Amore { 1952d08521bSGarrett D'Amore const collate_subst_t *p; 1962d08521bSGarrett D'Amore int n = lcc->lc_subst_count[pass]; 1974297a3b0SGarrett D'Amore 1986b5e5868SGarrett D'Amore if (n == 0) 1994297a3b0SGarrett D'Amore return (NULL); 2006b5e5868SGarrett D'Amore 2012d08521bSGarrett D'Amore if (pass >= lcc->lc_directive_count) 2024297a3b0SGarrett D'Amore return (NULL); 203723fee08SGarrett D'Amore 204723fee08SGarrett D'Amore if (!(key & COLLATE_SUBST_PRIORITY)) 205723fee08SGarrett D'Amore return (NULL); 206723fee08SGarrett D'Amore 2072d08521bSGarrett D'Amore p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 208723fee08SGarrett D'Amore assert(p->key == key); 209723fee08SGarrett D'Amore return (p->pri); 2104297a3b0SGarrett D'Amore } 2116b5e5868SGarrett D'Amore 212723fee08SGarrett D'Amore /* 213723fee08SGarrett D'Amore * Note: for performance reasons, we have expanded bsearch here. This avoids 214723fee08SGarrett D'Amore * function call overhead with each comparison. 215723fee08SGarrett D'Amore */ 216723fee08SGarrett D'Amore 217723fee08SGarrett D'Amore static collate_chain_t * 2182d08521bSGarrett D'Amore chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len) 2196b5e5868SGarrett D'Amore { 2206b5e5868SGarrett D'Amore int low; 2216b5e5868SGarrett D'Amore int high; 2226b5e5868SGarrett D'Amore int next, compar, l; 223723fee08SGarrett D'Amore collate_chain_t *p; 224723fee08SGarrett D'Amore collate_chain_t *tab; 2256b5e5868SGarrett D'Amore 2262d08521bSGarrett D'Amore if (lcc->lc_info->chain_count == 0) 2276b5e5868SGarrett D'Amore return (NULL); 2286b5e5868SGarrett D'Amore 2296b5e5868SGarrett D'Amore low = 0; 2302d08521bSGarrett D'Amore high = lcc->lc_info->chain_count - 1; 2312d08521bSGarrett D'Amore tab = lcc->lc_chain_table; 2326b5e5868SGarrett D'Amore 2336b5e5868SGarrett D'Amore while (low <= high) { 2346b5e5868SGarrett D'Amore next = (low + high) / 2; 2356b5e5868SGarrett D'Amore p = tab + next; 2366b5e5868SGarrett D'Amore compar = *key - *p->str; 2376b5e5868SGarrett D'Amore if (compar == 0) { 2386b5e5868SGarrett D'Amore l = wcsnlen(p->str, COLLATE_STR_LEN); 2396b5e5868SGarrett D'Amore compar = wcsncmp(key, p->str, l); 2406b5e5868SGarrett D'Amore if (compar == 0) { 2416b5e5868SGarrett D'Amore *len = l; 2426b5e5868SGarrett D'Amore return (p); 2434297a3b0SGarrett D'Amore } 2444297a3b0SGarrett D'Amore } 2456b5e5868SGarrett D'Amore if (compar > 0) 2466b5e5868SGarrett D'Amore low = next + 1; 2476b5e5868SGarrett D'Amore else 2486b5e5868SGarrett D'Amore high = next - 1; 2496b5e5868SGarrett D'Amore } 2506b5e5868SGarrett D'Amore return (NULL); 2516b5e5868SGarrett D'Amore } 2526b5e5868SGarrett D'Amore 253723fee08SGarrett D'Amore static collate_large_t * 2542d08521bSGarrett D'Amore largesearch(const struct lc_collate *lcc, const wchar_t key) 2556b5e5868SGarrett D'Amore { 2566b5e5868SGarrett D'Amore int low = 0; 2572d08521bSGarrett D'Amore int high = lcc->lc_info->large_count - 1; 2586b5e5868SGarrett D'Amore int next, compar; 259723fee08SGarrett D'Amore collate_large_t *p; 2602d08521bSGarrett D'Amore collate_large_t *tab = lcc->lc_large_table; 2616b5e5868SGarrett D'Amore 2622d08521bSGarrett D'Amore if (lcc->lc_info->large_count == 0) 2636b5e5868SGarrett D'Amore return (NULL); 2646b5e5868SGarrett D'Amore 2656b5e5868SGarrett D'Amore while (low <= high) { 2666b5e5868SGarrett D'Amore next = (low + high) / 2; 2676b5e5868SGarrett D'Amore p = tab + next; 2686b5e5868SGarrett D'Amore compar = key - p->val; 2696b5e5868SGarrett D'Amore if (compar == 0) 2706b5e5868SGarrett D'Amore return (p); 2716b5e5868SGarrett D'Amore if (compar > 0) 2726b5e5868SGarrett D'Amore low = next + 1; 2736b5e5868SGarrett D'Amore else 2746b5e5868SGarrett D'Amore high = next - 1; 2756b5e5868SGarrett D'Amore } 2766b5e5868SGarrett D'Amore return (NULL); 2774297a3b0SGarrett D'Amore } 2784297a3b0SGarrett D'Amore 2794297a3b0SGarrett D'Amore void 2802d08521bSGarrett D'Amore _collate_lookup(const struct lc_collate *lcc, const wchar_t *t, 2812d08521bSGarrett D'Amore int *len, int *pri, int which, const int **state) 2824297a3b0SGarrett D'Amore { 283723fee08SGarrett D'Amore collate_chain_t *p2; 284723fee08SGarrett D'Amore collate_large_t *match; 2856b5e5868SGarrett D'Amore int p, l; 2862d08521bSGarrett D'Amore const int *sptr; 2874297a3b0SGarrett D'Amore 2886b5e5868SGarrett D'Amore /* 289723fee08SGarrett D'Amore * If this is the "last" pass for the UNDEFINED, then 290723fee08SGarrett D'Amore * we just return the priority itself. 291723fee08SGarrett D'Amore */ 2922d08521bSGarrett D'Amore if (which >= lcc->lc_directive_count) { 293723fee08SGarrett D'Amore *pri = *t; 294723fee08SGarrett D'Amore *len = 1; 295723fee08SGarrett D'Amore *state = NULL; 296723fee08SGarrett D'Amore return; 297723fee08SGarrett D'Amore } 298723fee08SGarrett D'Amore 299723fee08SGarrett D'Amore /* 3006b5e5868SGarrett D'Amore * If we have remaining substitution data from a previous 3016b5e5868SGarrett D'Amore * call, consume it first. 3026b5e5868SGarrett D'Amore */ 3036b5e5868SGarrett D'Amore if ((sptr = *state) != NULL) { 3046b5e5868SGarrett D'Amore *pri = *sptr; 3056b5e5868SGarrett D'Amore sptr++; 3066b5e5868SGarrett D'Amore *state = *sptr ? sptr : NULL; 3076b5e5868SGarrett D'Amore *len = 0; 3084297a3b0SGarrett D'Amore return; 3094297a3b0SGarrett D'Amore } 3106b5e5868SGarrett D'Amore 3116b5e5868SGarrett D'Amore /* No active substitutions */ 3126b5e5868SGarrett D'Amore *len = 1; 3136b5e5868SGarrett D'Amore 3146b5e5868SGarrett D'Amore /* 3156b5e5868SGarrett D'Amore * Check for composites such as dipthongs that collate as a 3166b5e5868SGarrett D'Amore * single element (aka chains or collating-elements). 3176b5e5868SGarrett D'Amore */ 3182d08521bSGarrett D'Amore if (((p2 = chainsearch(lcc, t, &l)) != NULL) && 3196b5e5868SGarrett D'Amore ((p = p2->pri[which]) >= 0)) { 3206b5e5868SGarrett D'Amore 3216b5e5868SGarrett D'Amore *len = l; 3226b5e5868SGarrett D'Amore *pri = p; 3236b5e5868SGarrett D'Amore 3246b5e5868SGarrett D'Amore } else if (*t <= UCHAR_MAX) { 3256b5e5868SGarrett D'Amore 3266b5e5868SGarrett D'Amore /* 3276b5e5868SGarrett D'Amore * Character is a small (8-bit) character. 3286b5e5868SGarrett D'Amore * We just look these up directly for speed. 3296b5e5868SGarrett D'Amore */ 3302d08521bSGarrett D'Amore *pri = lcc->lc_char_table[*t].pri[which]; 3316b5e5868SGarrett D'Amore 3322d08521bSGarrett D'Amore } else if ((lcc->lc_info->large_count > 0) && 3332d08521bSGarrett D'Amore ((match = largesearch(lcc, *t)) != NULL)) { 3346b5e5868SGarrett D'Amore 3356b5e5868SGarrett D'Amore /* 3366b5e5868SGarrett D'Amore * Character was found in the extended table. 3376b5e5868SGarrett D'Amore */ 3386b5e5868SGarrett D'Amore *pri = match->pri.pri[which]; 3396b5e5868SGarrett D'Amore 3406b5e5868SGarrett D'Amore } else { 3416b5e5868SGarrett D'Amore /* 3426b5e5868SGarrett D'Amore * Character lacks a specific definition. 3436b5e5868SGarrett D'Amore */ 3442d08521bSGarrett D'Amore if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) { 3456b5e5868SGarrett D'Amore /* Mask off sign bit to prevent ordering confusion. */ 3466b5e5868SGarrett D'Amore *pri = (*t & COLLATE_MAX_PRIORITY); 3476b5e5868SGarrett D'Amore } else { 3482d08521bSGarrett D'Amore *pri = lcc->lc_undef_pri[which]; 3494297a3b0SGarrett D'Amore } 3506b5e5868SGarrett D'Amore /* No substitutions for undefined characters! */ 3516b5e5868SGarrett D'Amore return; 3526b5e5868SGarrett D'Amore } 3536b5e5868SGarrett D'Amore 3546b5e5868SGarrett D'Amore /* 3556b5e5868SGarrett D'Amore * Try substituting (expanding) the character. We are 3566b5e5868SGarrett D'Amore * currently doing this *after* the chain compression. I 3576b5e5868SGarrett D'Amore * think it should not matter, but this way might be slightly 3586b5e5868SGarrett D'Amore * faster. 3596b5e5868SGarrett D'Amore * 3606b5e5868SGarrett D'Amore * We do this after the priority search, as this will help us 3616b5e5868SGarrett D'Amore * to identify a single key value. In order for this to work, 3626b5e5868SGarrett D'Amore * its important that the priority assigned to a given element 3636b5e5868SGarrett D'Amore * to be substituted be unique for that level. The localedef 3646b5e5868SGarrett D'Amore * code ensures this for us. 3656b5e5868SGarrett D'Amore */ 3662d08521bSGarrett D'Amore if ((sptr = substsearch(lcc, *pri, which)) != NULL) { 3676b5e5868SGarrett D'Amore if ((*pri = *sptr) != 0) { 3686b5e5868SGarrett D'Amore sptr++; 3696b5e5868SGarrett D'Amore *state = *sptr ? sptr : NULL; 3706b5e5868SGarrett D'Amore } 3716b5e5868SGarrett D'Amore } 3726b5e5868SGarrett D'Amore 3736b5e5868SGarrett D'Amore } 3746b5e5868SGarrett D'Amore 3756b5e5868SGarrett D'Amore /* 3766b5e5868SGarrett D'Amore * This is the meaty part of wcsxfrm & strxfrm. Note that it does 3776b5e5868SGarrett D'Amore * NOT NULL terminate. That is left to the caller. 3786b5e5868SGarrett D'Amore */ 3796b5e5868SGarrett D'Amore size_t 3802d08521bSGarrett D'Amore _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf, 3812d08521bSGarrett D'Amore size_t room) 3826b5e5868SGarrett D'Amore { 3836b5e5868SGarrett D'Amore int pri; 3846b5e5868SGarrett D'Amore int len; 3856b5e5868SGarrett D'Amore const wchar_t *t; 3866b5e5868SGarrett D'Amore wchar_t *tr = NULL; 3876b5e5868SGarrett D'Amore int direc; 3886b5e5868SGarrett D'Amore int pass; 3892d08521bSGarrett D'Amore const int32_t *state; 3906b5e5868SGarrett D'Amore size_t want = 0; 3916b5e5868SGarrett D'Amore size_t need = 0; 3922d08521bSGarrett D'Amore int ndir = lcc->lc_directive_count; 3936b5e5868SGarrett D'Amore 3946b5e5868SGarrett D'Amore assert(src); 3956b5e5868SGarrett D'Amore 3962d08521bSGarrett D'Amore for (pass = 0; pass <= ndir; pass++) { 3976b5e5868SGarrett D'Amore 3986b5e5868SGarrett D'Amore state = NULL; 3996b5e5868SGarrett D'Amore 4006b5e5868SGarrett D'Amore if (pass != 0) { 4016b5e5868SGarrett D'Amore /* insert level separator from the previous pass */ 4026b5e5868SGarrett D'Amore if (room) { 4036b5e5868SGarrett D'Amore *xf++ = 1; 4046b5e5868SGarrett D'Amore room--; 4056b5e5868SGarrett D'Amore } 4066b5e5868SGarrett D'Amore want++; 4076b5e5868SGarrett D'Amore } 4086b5e5868SGarrett D'Amore 4096b5e5868SGarrett D'Amore /* special pass for undefined */ 4102d08521bSGarrett D'Amore if (pass == ndir) { 4116b5e5868SGarrett D'Amore direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 4126b5e5868SGarrett D'Amore } else { 4132d08521bSGarrett D'Amore direc = lcc->lc_directive[pass]; 4146b5e5868SGarrett D'Amore } 4156b5e5868SGarrett D'Amore 4166b5e5868SGarrett D'Amore t = src; 4176b5e5868SGarrett D'Amore 4186b5e5868SGarrett D'Amore if (direc & DIRECTIVE_BACKWARD) { 4196b5e5868SGarrett D'Amore wchar_t *bp, *fp, c; 4206b5e5868SGarrett D'Amore if (tr) 4216b5e5868SGarrett D'Amore free(tr); 4226b5e5868SGarrett D'Amore if ((tr = wcsdup(t)) == NULL) { 4236b5e5868SGarrett D'Amore errno = ENOMEM; 4246b5e5868SGarrett D'Amore goto fail; 4256b5e5868SGarrett D'Amore } 4266b5e5868SGarrett D'Amore bp = tr; 4276b5e5868SGarrett D'Amore fp = tr + wcslen(tr) - 1; 4286b5e5868SGarrett D'Amore while (bp < fp) { 4296b5e5868SGarrett D'Amore c = *bp; 4306b5e5868SGarrett D'Amore *bp++ = *fp; 4316b5e5868SGarrett D'Amore *fp-- = c; 4326b5e5868SGarrett D'Amore } 4336b5e5868SGarrett D'Amore t = (const wchar_t *)tr; 4346b5e5868SGarrett D'Amore } 4356b5e5868SGarrett D'Amore 4366b5e5868SGarrett D'Amore if (direc & DIRECTIVE_POSITION) { 4376b5e5868SGarrett D'Amore while (*t || state) { 4382d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass, 4392d08521bSGarrett D'Amore &state); 4406b5e5868SGarrett D'Amore t += len; 4416b5e5868SGarrett D'Amore if (pri <= 0) { 4426b5e5868SGarrett D'Amore if (pri < 0) { 4436b5e5868SGarrett D'Amore errno = EINVAL; 4446b5e5868SGarrett D'Amore goto fail; 4456b5e5868SGarrett D'Amore } 4466b5e5868SGarrett D'Amore pri = COLLATE_MAX_PRIORITY; 4476b5e5868SGarrett D'Amore } 4486b5e5868SGarrett D'Amore if (room) { 4496b5e5868SGarrett D'Amore *xf++ = pri; 4506b5e5868SGarrett D'Amore room--; 4516b5e5868SGarrett D'Amore } 4526b5e5868SGarrett D'Amore want++; 4536b5e5868SGarrett D'Amore need = want; 4546b5e5868SGarrett D'Amore } 4556b5e5868SGarrett D'Amore } else { 4566b5e5868SGarrett D'Amore while (*t || state) { 4572d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass, 4582d08521bSGarrett D'Amore &state); 4596b5e5868SGarrett D'Amore t += len; 4606b5e5868SGarrett D'Amore if (pri <= 0) { 4616b5e5868SGarrett D'Amore if (pri < 0) { 4626b5e5868SGarrett D'Amore errno = EINVAL; 4636b5e5868SGarrett D'Amore goto fail; 4646b5e5868SGarrett D'Amore } 4656b5e5868SGarrett D'Amore continue; 4666b5e5868SGarrett D'Amore } 4676b5e5868SGarrett D'Amore if (room) { 4686b5e5868SGarrett D'Amore *xf++ = pri; 4696b5e5868SGarrett D'Amore room--; 4706b5e5868SGarrett D'Amore } 4716b5e5868SGarrett D'Amore want++; 4726b5e5868SGarrett D'Amore need = want; 4736b5e5868SGarrett D'Amore } 4746b5e5868SGarrett D'Amore } 4756b5e5868SGarrett D'Amore } 4766b5e5868SGarrett D'Amore 4776b5e5868SGarrett D'Amore end: 4786b5e5868SGarrett D'Amore if (tr) 4796b5e5868SGarrett D'Amore free(tr); 4806b5e5868SGarrett D'Amore return (need); 4816b5e5868SGarrett D'Amore 4826b5e5868SGarrett D'Amore fail: 4836b5e5868SGarrett D'Amore if (tr) 4846b5e5868SGarrett D'Amore free(tr); 4856b5e5868SGarrett D'Amore return ((size_t)(-1)); 4866b5e5868SGarrett D'Amore } 4876b5e5868SGarrett D'Amore 4886b5e5868SGarrett D'Amore /* 4896b5e5868SGarrett D'Amore * In the non-POSIX case, we transform each character into a string of 4906b5e5868SGarrett D'Amore * characters representing the character's priority. Since char is usually 4916b5e5868SGarrett D'Amore * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 4926b5e5868SGarrett D'Amore * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 4936b5e5868SGarrett D'Amore * bits per byte. 4946b5e5868SGarrett D'Amore * 4956b5e5868SGarrett D'Amore * It turns out that we sometimes have real priorities that are 4966b5e5868SGarrett D'Amore * 31-bits wide. (But: be careful using priorities where the high 4976b5e5868SGarrett D'Amore * order bit is set -- i.e. the priority is negative. The sort order 4986b5e5868SGarrett D'Amore * may be surprising!) 4996b5e5868SGarrett D'Amore * 5006b5e5868SGarrett D'Amore * TODO: This would be a good area to optimize somewhat. It turns out 5016b5e5868SGarrett D'Amore * that real prioririties *except for the last UNDEFINED pass* are generally 5026b5e5868SGarrett D'Amore * very small. We need the localedef code to precalculate the max 5036b5e5868SGarrett D'Amore * priority for us, and ideally also give us a mask, and then we could 5046b5e5868SGarrett D'Amore * severely limit what we expand to. 5056b5e5868SGarrett D'Amore */ 5066b5e5868SGarrett D'Amore #define XFRM_BYTES 6 5076b5e5868SGarrett D'Amore #define XFRM_OFFSET ('0') /* make all printable characters */ 5086b5e5868SGarrett D'Amore #define XFRM_SHIFT 6 5096b5e5868SGarrett D'Amore #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 5106b5e5868SGarrett D'Amore #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 5116b5e5868SGarrett D'Amore 512723fee08SGarrett D'Amore static int 5132d08521bSGarrett D'Amore xfrm(locale_t loc, unsigned char *p, int pri, int pass) 5146b5e5868SGarrett D'Amore { 515723fee08SGarrett D'Amore /* we use unsigned to ensure zero fill on right shift */ 5162d08521bSGarrett D'Amore uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass]; 517723fee08SGarrett D'Amore int nc = 0; 518723fee08SGarrett D'Amore 519723fee08SGarrett D'Amore while (val) { 520723fee08SGarrett D'Amore *p = (pri & XFRM_MASK) + XFRM_OFFSET; 5216b5e5868SGarrett D'Amore pri >>= XFRM_SHIFT; 522723fee08SGarrett D'Amore val >>= XFRM_SHIFT; 523723fee08SGarrett D'Amore p++; 524723fee08SGarrett D'Amore nc++; 5256b5e5868SGarrett D'Amore } 526723fee08SGarrett D'Amore return (nc); 5276b5e5868SGarrett D'Amore } 5286b5e5868SGarrett D'Amore 5296b5e5868SGarrett D'Amore size_t 5302d08521bSGarrett D'Amore _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc) 5316b5e5868SGarrett D'Amore { 5326b5e5868SGarrett D'Amore int pri; 5336b5e5868SGarrett D'Amore int len; 5346b5e5868SGarrett D'Amore const wchar_t *t; 5356b5e5868SGarrett D'Amore wchar_t *tr = NULL; 5366b5e5868SGarrett D'Amore int direc; 5376b5e5868SGarrett D'Amore int pass; 5382d08521bSGarrett D'Amore const int32_t *state; 5396b5e5868SGarrett D'Amore size_t want = 0; 5406b5e5868SGarrett D'Amore size_t need = 0; 5416b5e5868SGarrett D'Amore int b; 5426b5e5868SGarrett D'Amore uint8_t buf[XFRM_BYTES]; 5432d08521bSGarrett D'Amore const struct lc_collate *lcc = loc->collate; 5442d08521bSGarrett D'Amore int ndir = lcc->lc_directive_count; 5456b5e5868SGarrett D'Amore 5466b5e5868SGarrett D'Amore assert(src); 5476b5e5868SGarrett D'Amore 5482d08521bSGarrett D'Amore for (pass = 0; pass <= ndir; pass++) { 5496b5e5868SGarrett D'Amore 5506b5e5868SGarrett D'Amore state = NULL; 5516b5e5868SGarrett D'Amore 5526b5e5868SGarrett D'Amore if (pass != 0) { 5536b5e5868SGarrett D'Amore /* insert level separator from the previous pass */ 5546b5e5868SGarrett D'Amore if (room) { 5556b5e5868SGarrett D'Amore *xf++ = XFRM_SEP; 5566b5e5868SGarrett D'Amore room--; 5576b5e5868SGarrett D'Amore } 5586b5e5868SGarrett D'Amore want++; 5596b5e5868SGarrett D'Amore } 5606b5e5868SGarrett D'Amore 5616b5e5868SGarrett D'Amore /* special pass for undefined */ 5622d08521bSGarrett D'Amore if (pass == ndir) { 5636b5e5868SGarrett D'Amore direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 5646b5e5868SGarrett D'Amore } else { 5652d08521bSGarrett D'Amore direc = lcc->lc_directive[pass]; 5666b5e5868SGarrett D'Amore } 5676b5e5868SGarrett D'Amore 5686b5e5868SGarrett D'Amore t = src; 5696b5e5868SGarrett D'Amore 5706b5e5868SGarrett D'Amore if (direc & DIRECTIVE_BACKWARD) { 5716b5e5868SGarrett D'Amore wchar_t *bp, *fp, c; 5726b5e5868SGarrett D'Amore if (tr) 5736b5e5868SGarrett D'Amore free(tr); 5746b5e5868SGarrett D'Amore if ((tr = wcsdup(t)) == NULL) { 5756b5e5868SGarrett D'Amore errno = ENOMEM; 5766b5e5868SGarrett D'Amore goto fail; 5776b5e5868SGarrett D'Amore } 5786b5e5868SGarrett D'Amore bp = tr; 5796b5e5868SGarrett D'Amore fp = tr + wcslen(tr) - 1; 5806b5e5868SGarrett D'Amore while (bp < fp) { 5816b5e5868SGarrett D'Amore c = *bp; 5826b5e5868SGarrett D'Amore *bp++ = *fp; 5836b5e5868SGarrett D'Amore *fp-- = c; 5846b5e5868SGarrett D'Amore } 5856b5e5868SGarrett D'Amore t = (const wchar_t *)tr; 5866b5e5868SGarrett D'Amore } 5876b5e5868SGarrett D'Amore 5886b5e5868SGarrett D'Amore if (direc & DIRECTIVE_POSITION) { 5896b5e5868SGarrett D'Amore while (*t || state) { 5906b5e5868SGarrett D'Amore 5912d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass, 5922d08521bSGarrett D'Amore &state); 5936b5e5868SGarrett D'Amore t += len; 5946b5e5868SGarrett D'Amore if (pri <= 0) { 5956b5e5868SGarrett D'Amore if (pri < 0) { 5966b5e5868SGarrett D'Amore errno = EINVAL; 5976b5e5868SGarrett D'Amore goto fail; 5986b5e5868SGarrett D'Amore } 5996b5e5868SGarrett D'Amore pri = COLLATE_MAX_PRIORITY; 6006b5e5868SGarrett D'Amore } 6016b5e5868SGarrett D'Amore 6022d08521bSGarrett D'Amore b = xfrm(loc, buf, pri, pass); 603723fee08SGarrett D'Amore want += b; 6046b5e5868SGarrett D'Amore if (room) { 6056b5e5868SGarrett D'Amore while (b) { 6066b5e5868SGarrett D'Amore b--; 60763481050STheo Schlossnagle if (room) { 6086b5e5868SGarrett D'Amore *xf++ = buf[b]; 6096b5e5868SGarrett D'Amore room--; 6106b5e5868SGarrett D'Amore } 6116b5e5868SGarrett D'Amore } 61263481050STheo Schlossnagle } 6136b5e5868SGarrett D'Amore need = want; 6146b5e5868SGarrett D'Amore } 6156b5e5868SGarrett D'Amore } else { 6166b5e5868SGarrett D'Amore while (*t || state) { 6172d08521bSGarrett D'Amore _collate_lookup(lcc, t, &len, &pri, pass, 6182d08521bSGarrett D'Amore &state); 6196b5e5868SGarrett D'Amore t += len; 6206b5e5868SGarrett D'Amore if (pri <= 0) { 6216b5e5868SGarrett D'Amore if (pri < 0) { 6226b5e5868SGarrett D'Amore errno = EINVAL; 6236b5e5868SGarrett D'Amore goto fail; 6246b5e5868SGarrett D'Amore } 6256b5e5868SGarrett D'Amore continue; 6266b5e5868SGarrett D'Amore } 627723fee08SGarrett D'Amore 6282d08521bSGarrett D'Amore b = xfrm(loc, buf, pri, pass); 629723fee08SGarrett D'Amore want += b; 6306b5e5868SGarrett D'Amore if (room) { 6316b5e5868SGarrett D'Amore 6326b5e5868SGarrett D'Amore while (b) { 6336b5e5868SGarrett D'Amore b--; 63463481050STheo Schlossnagle if (room) { 6356b5e5868SGarrett D'Amore *xf++ = buf[b]; 6366b5e5868SGarrett D'Amore room--; 6376b5e5868SGarrett D'Amore } 6386b5e5868SGarrett D'Amore } 63963481050STheo Schlossnagle } 6406b5e5868SGarrett D'Amore need = want; 6416b5e5868SGarrett D'Amore } 6426b5e5868SGarrett D'Amore } 6436b5e5868SGarrett D'Amore } 6446b5e5868SGarrett D'Amore 6456b5e5868SGarrett D'Amore end: 6466b5e5868SGarrett D'Amore if (tr) 6476b5e5868SGarrett D'Amore free(tr); 6486b5e5868SGarrett D'Amore return (need); 6496b5e5868SGarrett D'Amore 6506b5e5868SGarrett D'Amore fail: 6516b5e5868SGarrett D'Amore if (tr) 6526b5e5868SGarrett D'Amore free(tr); 6536b5e5868SGarrett D'Amore return ((size_t)(-1)); 6544297a3b0SGarrett D'Amore } 655