xref: /titanic_41/usr/src/lib/libc/port/locale/collate.c (revision 62c3776affc53153ce74e14a6e7ce91950c28102)
14297a3b0SGarrett D'Amore /*
2*62c3776aSGarrett D'Amore  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3*62c3776aSGarrett D'Amore  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
44297a3b0SGarrett D'Amore  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
54297a3b0SGarrett D'Amore  *		at Electronni Visti IA, Kiev, Ukraine.
64297a3b0SGarrett D'Amore  *			All rights reserved.
74297a3b0SGarrett D'Amore  *
84297a3b0SGarrett D'Amore  * Redistribution and use in source and binary forms, with or without
94297a3b0SGarrett D'Amore  * modification, are permitted provided that the following conditions
104297a3b0SGarrett D'Amore  * are met:
114297a3b0SGarrett D'Amore  * 1. Redistributions of source code must retain the above copyright
124297a3b0SGarrett D'Amore  *    notice, this list of conditions and the following disclaimer.
134297a3b0SGarrett D'Amore  * 2. Redistributions in binary form must reproduce the above copyright
144297a3b0SGarrett D'Amore  *    notice, this list of conditions and the following disclaimer in the
154297a3b0SGarrett D'Amore  *    documentation and/or other materials provided with the distribution.
164297a3b0SGarrett D'Amore  *
174297a3b0SGarrett D'Amore  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
184297a3b0SGarrett D'Amore  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
194297a3b0SGarrett D'Amore  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
204297a3b0SGarrett D'Amore  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
214297a3b0SGarrett D'Amore  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
224297a3b0SGarrett D'Amore  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
234297a3b0SGarrett D'Amore  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
244297a3b0SGarrett D'Amore  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
254297a3b0SGarrett D'Amore  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
264297a3b0SGarrett D'Amore  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
274297a3b0SGarrett D'Amore  * SUCH DAMAGE.
284297a3b0SGarrett D'Amore  */
294297a3b0SGarrett D'Amore 
304297a3b0SGarrett D'Amore #include "lint.h"
314297a3b0SGarrett D'Amore #include "file64.h"
324297a3b0SGarrett D'Amore #include <stdio.h>
334297a3b0SGarrett D'Amore #include <stdlib.h>
346b5e5868SGarrett D'Amore #include <stddef.h>
354297a3b0SGarrett D'Amore #include <string.h>
366b5e5868SGarrett D'Amore #include <wchar.h>
374297a3b0SGarrett D'Amore #include <errno.h>
384297a3b0SGarrett D'Amore #include <unistd.h>
396b5e5868SGarrett D'Amore #include <ctype.h>
406b5e5868SGarrett D'Amore #include <unistd.h>
416b5e5868SGarrett D'Amore #include <fcntl.h>
426b5e5868SGarrett D'Amore #include <assert.h>
436b5e5868SGarrett D'Amore #include <sys/stat.h>
446b5e5868SGarrett D'Amore #include <sys/mman.h>
454297a3b0SGarrett D'Amore 
464297a3b0SGarrett D'Amore #include "collate.h"
474297a3b0SGarrett D'Amore #include "setlocale.h"
484297a3b0SGarrett D'Amore 
49723fee08SGarrett D'Amore /*
50723fee08SGarrett D'Amore  * See the comments in usr/src/cmd/localedef/collate.c for further
51723fee08SGarrett D'Amore  * information.  It would also be very helpful to have a copy of the
52723fee08SGarrett D'Amore  * POSIX standard for collation (in the locale format manual page)
53723fee08SGarrett D'Amore  * handy (www.opengroup.org).
54723fee08SGarrett D'Amore  */
55723fee08SGarrett D'Amore 
56*62c3776aSGarrett D'Amore /*
57*62c3776aSGarrett D'Amore  * POSIX uses empty tables and falls down to strcmp.
58*62c3776aSGarrett D'Amore  */
59*62c3776aSGarrett D'Amore struct lc_collate lc_collate_posix = {
60*62c3776aSGarrett D'Amore 	.lc_is_posix = 1,
61*62c3776aSGarrett D'Amore };
624297a3b0SGarrett D'Amore 
63*62c3776aSGarrett D'Amore struct locdata __posix_collate_locdata = {
64*62c3776aSGarrett D'Amore 	.l_lname = "C",
65*62c3776aSGarrett D'Amore 	.l_data = { &lc_collate_posix }
66*62c3776aSGarrett D'Amore };
676b5e5868SGarrett D'Amore 
684297a3b0SGarrett D'Amore 
69*62c3776aSGarrett D'Amore struct locdata *
__lc_collate_load(const char * locname)70*62c3776aSGarrett D'Amore __lc_collate_load(const char *locname)
714297a3b0SGarrett D'Amore {
726b5e5868SGarrett D'Amore 	int i, chains, z;
736b5e5868SGarrett D'Amore 	char buf[PATH_MAX];
746b5e5868SGarrett D'Amore 	char *TMP;
756b5e5868SGarrett D'Amore 	char *map;
766b5e5868SGarrett D'Amore 	collate_info_t *info;
776b5e5868SGarrett D'Amore 	struct stat sbuf;
786b5e5868SGarrett D'Amore 	int fd;
79*62c3776aSGarrett D'Amore 	struct locdata *ldata;
80*62c3776aSGarrett D'Amore 	struct lc_collate *lcc;
814297a3b0SGarrett D'Amore 
824297a3b0SGarrett D'Amore 	/*
834297a3b0SGarrett D'Amore 	 * Slurp the locale file into the cache.
844297a3b0SGarrett D'Amore 	 */
854297a3b0SGarrett D'Amore 
866da5aa94SGarrett D'Amore 	(void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA",
87*62c3776aSGarrett D'Amore 	    _PathLocale, locname);
886da5aa94SGarrett D'Amore 
89*62c3776aSGarrett D'Amore 	if ((fd = open(buf, O_RDONLY)) < 0) {
90*62c3776aSGarrett D'Amore 		errno = EINVAL;
91*62c3776aSGarrett D'Amore 		return (NULL);
92*62c3776aSGarrett D'Amore 	}
936b5e5868SGarrett D'Amore 	if (fstat(fd, &sbuf) < 0) {
946b5e5868SGarrett D'Amore 		(void) close(fd);
95*62c3776aSGarrett D'Amore 		errno = EINVAL;
96*62c3776aSGarrett D'Amore 		return (NULL);
976b5e5868SGarrett D'Amore 	}
986b5e5868SGarrett D'Amore 	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
996b5e5868SGarrett D'Amore 		(void) close(fd);
1006b5e5868SGarrett D'Amore 		errno = EINVAL;
101*62c3776aSGarrett D'Amore 		return (NULL);
1026b5e5868SGarrett D'Amore 	}
1036b5e5868SGarrett D'Amore 	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
1046b5e5868SGarrett D'Amore 	(void) close(fd);
1056b5e5868SGarrett D'Amore 	if ((TMP = map) == NULL) {
106*62c3776aSGarrett D'Amore 		errno = EINVAL;
107*62c3776aSGarrett D'Amore 		return (NULL);
1086b5e5868SGarrett D'Amore 	}
1094297a3b0SGarrett D'Amore 
1106b5e5868SGarrett D'Amore 	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
1116b5e5868SGarrett D'Amore 		(void) munmap(map, sbuf.st_size);
1124297a3b0SGarrett D'Amore 		errno = EINVAL;
113*62c3776aSGarrett D'Amore 		return (NULL);
1144297a3b0SGarrett D'Amore 	}
1156b5e5868SGarrett D'Amore 	TMP += COLLATE_STR_LEN;
1166b5e5868SGarrett D'Amore 
1176b5e5868SGarrett D'Amore 	info = (void *)TMP;
1186b5e5868SGarrett D'Amore 	TMP += sizeof (*info);
1196b5e5868SGarrett D'Amore 
1206b5e5868SGarrett D'Amore 	if ((info->directive_count < 1) ||
1216b5e5868SGarrett D'Amore 	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
1226b5e5868SGarrett D'Amore 	    ((chains = info->chain_count) < 0)) {
1236b5e5868SGarrett D'Amore 		(void) munmap(map, sbuf.st_size);
1244297a3b0SGarrett D'Amore 		errno = EINVAL;
125*62c3776aSGarrett D'Amore 		return (NULL);
1264297a3b0SGarrett D'Amore 	}
1276b5e5868SGarrett D'Amore 
128723fee08SGarrett D'Amore 	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
129723fee08SGarrett D'Amore 	    (sizeof (collate_chain_t) * chains) +
130723fee08SGarrett D'Amore 	    (sizeof (collate_large_t) * info->large_count);
131*62c3776aSGarrett D'Amore 	for (z = 0; z < info->directive_count; z++) {
1326b5e5868SGarrett D'Amore 		i += sizeof (collate_subst_t) * info->subst_count[z];
1336b5e5868SGarrett D'Amore 	}
1346b5e5868SGarrett D'Amore 	if (i != (sbuf.st_size - (TMP - map))) {
1356b5e5868SGarrett D'Amore 		(void) munmap(map, sbuf.st_size);
1366b5e5868SGarrett D'Amore 		errno = EINVAL;
137*62c3776aSGarrett D'Amore 		return (NULL);
1386b5e5868SGarrett D'Amore 	}
1396b5e5868SGarrett D'Amore 
140*62c3776aSGarrett D'Amore 
141*62c3776aSGarrett D'Amore 	if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) {
142*62c3776aSGarrett D'Amore 		(void) munmap(map, sbuf.st_size);
143*62c3776aSGarrett D'Amore 		return (NULL);
144*62c3776aSGarrett D'Amore 	}
145*62c3776aSGarrett D'Amore 	lcc = ldata->l_data[0];
146*62c3776aSGarrett D'Amore 	ldata->l_map = map;
147*62c3776aSGarrett D'Amore 	ldata->l_map_len = sbuf.st_size;
148*62c3776aSGarrett D'Amore 
149*62c3776aSGarrett D'Amore 	lcc->lc_info = info;
150*62c3776aSGarrett D'Amore 	lcc->lc_directive_count = info->directive_count;
151*62c3776aSGarrett D'Amore 	lcc->lc_large_count = info->large_count;
152*62c3776aSGarrett D'Amore 
153*62c3776aSGarrett D'Amore 	for (z = 0; z < COLL_WEIGHTS_MAX; z++) {
154*62c3776aSGarrett D'Amore 		lcc->lc_directive[z] = info->directive[z];
155*62c3776aSGarrett D'Amore 		lcc->lc_subst_count[z] = info->subst_count[z];
156*62c3776aSGarrett D'Amore 		lcc->lc_pri_count[z] = info->pri_count[z];
157*62c3776aSGarrett D'Amore 		lcc->lc_undef_pri[z] = info->undef_pri[z];
158*62c3776aSGarrett D'Amore 	}
159*62c3776aSGarrett D'Amore 
160*62c3776aSGarrett D'Amore 	lcc->lc_char_table = (void *)TMP;
161723fee08SGarrett D'Amore 	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
1626b5e5868SGarrett D'Amore 
163*62c3776aSGarrett D'Amore 	for (z = 0; z < lcc->lc_directive_count; z++) {
164*62c3776aSGarrett D'Amore 		int count;
165*62c3776aSGarrett D'Amore 		if ((count = lcc->lc_subst_count[z]) > 0) {
166*62c3776aSGarrett D'Amore 			lcc->lc_subst_table[z] = (void *)TMP;
167*62c3776aSGarrett D'Amore 			TMP += count * sizeof (collate_subst_t);
1686b5e5868SGarrett D'Amore 		} else {
169*62c3776aSGarrett D'Amore 			lcc->lc_subst_table[z] = NULL;
1706b5e5868SGarrett D'Amore 		}
1716b5e5868SGarrett D'Amore 	}
1726b5e5868SGarrett D'Amore 
1736b5e5868SGarrett D'Amore 	if (chains > 0) {
174*62c3776aSGarrett D'Amore 		lcc->lc_chain_table = (void *)TMP;
175723fee08SGarrett D'Amore 		TMP += chains * sizeof (collate_chain_t);
1764297a3b0SGarrett D'Amore 	} else
177*62c3776aSGarrett D'Amore 		lcc->lc_chain_table = NULL;
178*62c3776aSGarrett D'Amore 	lcc->lc_chain_count = chains;
179*62c3776aSGarrett D'Amore 	if (lcc->lc_large_count > 0)
180*62c3776aSGarrett D'Amore 		lcc->lc_large_table = (void *)TMP;
1816b5e5868SGarrett D'Amore 	else
182*62c3776aSGarrett D'Amore 		lcc->lc_large_table = NULL;
1834297a3b0SGarrett D'Amore 
184*62c3776aSGarrett D'Amore 	return (ldata);
1854297a3b0SGarrett D'Amore }
1864297a3b0SGarrett D'Amore 
187*62c3776aSGarrett D'Amore static const int32_t *
substsearch(const struct lc_collate * lcc,const wchar_t key,int pass)188*62c3776aSGarrett D'Amore substsearch(const struct lc_collate *lcc, const wchar_t key, int pass)
1894297a3b0SGarrett D'Amore {
190*62c3776aSGarrett D'Amore 	const collate_subst_t *p;
191*62c3776aSGarrett D'Amore 	int n = lcc->lc_subst_count[pass];
1924297a3b0SGarrett D'Amore 
1936b5e5868SGarrett D'Amore 	if (n == 0)
1944297a3b0SGarrett D'Amore 		return (NULL);
1956b5e5868SGarrett D'Amore 
196*62c3776aSGarrett D'Amore 	if (pass >= lcc->lc_directive_count)
1974297a3b0SGarrett D'Amore 		return (NULL);
198723fee08SGarrett D'Amore 
199723fee08SGarrett D'Amore 	if (!(key & COLLATE_SUBST_PRIORITY))
200723fee08SGarrett D'Amore 		return (NULL);
201723fee08SGarrett D'Amore 
202*62c3776aSGarrett D'Amore 	p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
203723fee08SGarrett D'Amore 	assert(p->key == key);
204723fee08SGarrett D'Amore 	return (p->pri);
2054297a3b0SGarrett D'Amore }
2066b5e5868SGarrett D'Amore 
207723fee08SGarrett D'Amore /*
208723fee08SGarrett D'Amore  * Note: for performance reasons, we have expanded bsearch here.  This avoids
209723fee08SGarrett D'Amore  * function call overhead with each comparison.
210723fee08SGarrett D'Amore  */
211723fee08SGarrett D'Amore 
212723fee08SGarrett D'Amore static collate_chain_t *
chainsearch(const struct lc_collate * lcc,const wchar_t * key,int * len)213*62c3776aSGarrett D'Amore chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len)
2146b5e5868SGarrett D'Amore {
2156b5e5868SGarrett D'Amore 	int low;
2166b5e5868SGarrett D'Amore 	int high;
2176b5e5868SGarrett D'Amore 	int next, compar, l;
218723fee08SGarrett D'Amore 	collate_chain_t *p;
219723fee08SGarrett D'Amore 	collate_chain_t *tab;
2206b5e5868SGarrett D'Amore 
221*62c3776aSGarrett D'Amore 	if (lcc->lc_info->chain_count == 0)
2226b5e5868SGarrett D'Amore 		return (NULL);
2236b5e5868SGarrett D'Amore 
2246b5e5868SGarrett D'Amore 	low = 0;
225*62c3776aSGarrett D'Amore 	high = lcc->lc_info->chain_count - 1;
226*62c3776aSGarrett D'Amore 	tab = lcc->lc_chain_table;
2276b5e5868SGarrett D'Amore 
2286b5e5868SGarrett D'Amore 	while (low <= high) {
2296b5e5868SGarrett D'Amore 		next = (low + high) / 2;
2306b5e5868SGarrett D'Amore 		p = tab + next;
2316b5e5868SGarrett D'Amore 		compar = *key - *p->str;
2326b5e5868SGarrett D'Amore 		if (compar == 0) {
2336b5e5868SGarrett D'Amore 			l = wcsnlen(p->str, COLLATE_STR_LEN);
2346b5e5868SGarrett D'Amore 			compar = wcsncmp(key, p->str, l);
2356b5e5868SGarrett D'Amore 			if (compar == 0) {
2366b5e5868SGarrett D'Amore 				*len = l;
2376b5e5868SGarrett D'Amore 				return (p);
2384297a3b0SGarrett D'Amore 			}
2394297a3b0SGarrett D'Amore 		}
2406b5e5868SGarrett D'Amore 		if (compar > 0)
2416b5e5868SGarrett D'Amore 			low = next + 1;
2426b5e5868SGarrett D'Amore 		else
2436b5e5868SGarrett D'Amore 			high = next - 1;
2446b5e5868SGarrett D'Amore 	}
2456b5e5868SGarrett D'Amore 	return (NULL);
2466b5e5868SGarrett D'Amore }
2476b5e5868SGarrett D'Amore 
248723fee08SGarrett D'Amore static collate_large_t *
largesearch(const struct lc_collate * lcc,const wchar_t key)249*62c3776aSGarrett D'Amore largesearch(const struct lc_collate *lcc, const wchar_t key)
2506b5e5868SGarrett D'Amore {
2516b5e5868SGarrett D'Amore 	int low = 0;
252*62c3776aSGarrett D'Amore 	int high = lcc->lc_info->large_count - 1;
2536b5e5868SGarrett D'Amore 	int next, compar;
254723fee08SGarrett D'Amore 	collate_large_t *p;
255*62c3776aSGarrett D'Amore 	collate_large_t *tab = lcc->lc_large_table;
2566b5e5868SGarrett D'Amore 
257*62c3776aSGarrett D'Amore 	if (lcc->lc_info->large_count == 0)
2586b5e5868SGarrett D'Amore 		return (NULL);
2596b5e5868SGarrett D'Amore 
2606b5e5868SGarrett D'Amore 	while (low <= high) {
2616b5e5868SGarrett D'Amore 		next = (low + high) / 2;
2626b5e5868SGarrett D'Amore 		p = tab + next;
2636b5e5868SGarrett D'Amore 		compar = key - p->val;
2646b5e5868SGarrett D'Amore 		if (compar == 0)
2656b5e5868SGarrett D'Amore 			return (p);
2666b5e5868SGarrett D'Amore 		if (compar > 0)
2676b5e5868SGarrett D'Amore 			low = next + 1;
2686b5e5868SGarrett D'Amore 		else
2696b5e5868SGarrett D'Amore 			high = next - 1;
2706b5e5868SGarrett D'Amore 	}
2716b5e5868SGarrett D'Amore 	return (NULL);
2724297a3b0SGarrett D'Amore }
2734297a3b0SGarrett D'Amore 
2744297a3b0SGarrett D'Amore void
_collate_lookup(const struct lc_collate * lcc,const wchar_t * t,int * len,int * pri,int which,const int ** state)275*62c3776aSGarrett D'Amore _collate_lookup(const struct lc_collate *lcc, const wchar_t *t,
276*62c3776aSGarrett D'Amore     int *len, int *pri, int which, const int **state)
2774297a3b0SGarrett D'Amore {
278723fee08SGarrett D'Amore 	collate_chain_t *p2;
279723fee08SGarrett D'Amore 	collate_large_t *match;
2806b5e5868SGarrett D'Amore 	int p, l;
281*62c3776aSGarrett D'Amore 	const int *sptr;
2824297a3b0SGarrett D'Amore 
2836b5e5868SGarrett D'Amore 	/*
284723fee08SGarrett D'Amore 	 * If this is the "last" pass for the UNDEFINED, then
285723fee08SGarrett D'Amore 	 * we just return the priority itself.
286723fee08SGarrett D'Amore 	 */
287*62c3776aSGarrett D'Amore 	if (which >= lcc->lc_directive_count) {
288723fee08SGarrett D'Amore 		*pri = *t;
289723fee08SGarrett D'Amore 		*len = 1;
290723fee08SGarrett D'Amore 		*state = NULL;
291723fee08SGarrett D'Amore 		return;
292723fee08SGarrett D'Amore 	}
293723fee08SGarrett D'Amore 
294723fee08SGarrett D'Amore 	/*
2956b5e5868SGarrett D'Amore 	 * If we have remaining substitution data from a previous
2966b5e5868SGarrett D'Amore 	 * call, consume it first.
2976b5e5868SGarrett D'Amore 	 */
2986b5e5868SGarrett D'Amore 	if ((sptr = *state) != NULL) {
2996b5e5868SGarrett D'Amore 		*pri = *sptr;
3006b5e5868SGarrett D'Amore 		sptr++;
3016b5e5868SGarrett D'Amore 		*state = *sptr ? sptr : NULL;
3026b5e5868SGarrett D'Amore 		*len = 0;
3034297a3b0SGarrett D'Amore 		return;
3044297a3b0SGarrett D'Amore 	}
3056b5e5868SGarrett D'Amore 
3066b5e5868SGarrett D'Amore 	/* No active substitutions */
3076b5e5868SGarrett D'Amore 	*len = 1;
3086b5e5868SGarrett D'Amore 
3096b5e5868SGarrett D'Amore 	/*
3106b5e5868SGarrett D'Amore 	 * Check for composites such as dipthongs that collate as a
3116b5e5868SGarrett D'Amore 	 * single element (aka chains or collating-elements).
3126b5e5868SGarrett D'Amore 	 */
313*62c3776aSGarrett D'Amore 	if (((p2 = chainsearch(lcc, t, &l)) != NULL) &&
3146b5e5868SGarrett D'Amore 	    ((p = p2->pri[which]) >= 0)) {
3156b5e5868SGarrett D'Amore 
3166b5e5868SGarrett D'Amore 		*len = l;
3176b5e5868SGarrett D'Amore 		*pri = p;
3186b5e5868SGarrett D'Amore 
3196b5e5868SGarrett D'Amore 	} else if (*t <= UCHAR_MAX) {
3206b5e5868SGarrett D'Amore 
3216b5e5868SGarrett D'Amore 		/*
3226b5e5868SGarrett D'Amore 		 * Character is a small (8-bit) character.
3236b5e5868SGarrett D'Amore 		 * We just look these up directly for speed.
3246b5e5868SGarrett D'Amore 		 */
325*62c3776aSGarrett D'Amore 		*pri = lcc->lc_char_table[*t].pri[which];
3266b5e5868SGarrett D'Amore 
327*62c3776aSGarrett D'Amore 	} else if ((lcc->lc_info->large_count > 0) &&
328*62c3776aSGarrett D'Amore 	    ((match = largesearch(lcc, *t)) != NULL)) {
3296b5e5868SGarrett D'Amore 
3306b5e5868SGarrett D'Amore 		/*
3316b5e5868SGarrett D'Amore 		 * Character was found in the extended table.
3326b5e5868SGarrett D'Amore 		 */
3336b5e5868SGarrett D'Amore 		*pri = match->pri.pri[which];
3346b5e5868SGarrett D'Amore 
3356b5e5868SGarrett D'Amore 	} else {
3366b5e5868SGarrett D'Amore 		/*
3376b5e5868SGarrett D'Amore 		 * Character lacks a specific definition.
3386b5e5868SGarrett D'Amore 		 */
339*62c3776aSGarrett D'Amore 		if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) {
3406b5e5868SGarrett D'Amore 			/* Mask off sign bit to prevent ordering confusion. */
3416b5e5868SGarrett D'Amore 			*pri = (*t & COLLATE_MAX_PRIORITY);
3426b5e5868SGarrett D'Amore 		} else {
343*62c3776aSGarrett D'Amore 			*pri = lcc->lc_undef_pri[which];
3444297a3b0SGarrett D'Amore 		}
3456b5e5868SGarrett D'Amore 		/* No substitutions for undefined characters! */
3466b5e5868SGarrett D'Amore 		return;
3476b5e5868SGarrett D'Amore 	}
3486b5e5868SGarrett D'Amore 
3496b5e5868SGarrett D'Amore 	/*
3506b5e5868SGarrett D'Amore 	 * Try substituting (expanding) the character.  We are
3516b5e5868SGarrett D'Amore 	 * currently doing this *after* the chain compression.  I
3526b5e5868SGarrett D'Amore 	 * think it should not matter, but this way might be slightly
3536b5e5868SGarrett D'Amore 	 * faster.
3546b5e5868SGarrett D'Amore 	 *
3556b5e5868SGarrett D'Amore 	 * We do this after the priority search, as this will help us
3566b5e5868SGarrett D'Amore 	 * to identify a single key value.  In order for this to work,
3576b5e5868SGarrett D'Amore 	 * its important that the priority assigned to a given element
3586b5e5868SGarrett D'Amore 	 * to be substituted be unique for that level.  The localedef
3596b5e5868SGarrett D'Amore 	 * code ensures this for us.
3606b5e5868SGarrett D'Amore 	 */
361*62c3776aSGarrett D'Amore 	if ((sptr = substsearch(lcc, *pri, which)) != NULL) {
3626b5e5868SGarrett D'Amore 		if ((*pri = *sptr) != 0) {
3636b5e5868SGarrett D'Amore 			sptr++;
3646b5e5868SGarrett D'Amore 			*state = *sptr ? sptr : NULL;
3656b5e5868SGarrett D'Amore 		}
3666b5e5868SGarrett D'Amore 	}
3676b5e5868SGarrett D'Amore 
3686b5e5868SGarrett D'Amore }
3696b5e5868SGarrett D'Amore 
3706b5e5868SGarrett D'Amore /*
3716b5e5868SGarrett D'Amore  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
3726b5e5868SGarrett D'Amore  * NOT NULL terminate.  That is left to the caller.
3736b5e5868SGarrett D'Amore  */
3746b5e5868SGarrett D'Amore size_t
_collate_wxfrm(const struct lc_collate * lcc,const wchar_t * src,wchar_t * xf,size_t room)375*62c3776aSGarrett D'Amore _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf,
376*62c3776aSGarrett D'Amore     size_t room)
3776b5e5868SGarrett D'Amore {
3786b5e5868SGarrett D'Amore 	int		pri;
3796b5e5868SGarrett D'Amore 	int		len;
3806b5e5868SGarrett D'Amore 	const wchar_t	*t;
3816b5e5868SGarrett D'Amore 	wchar_t		*tr = NULL;
3826b5e5868SGarrett D'Amore 	int		direc;
3836b5e5868SGarrett D'Amore 	int		pass;
384*62c3776aSGarrett D'Amore 	const int32_t 	*state;
3856b5e5868SGarrett D'Amore 	size_t		want = 0;
3866b5e5868SGarrett D'Amore 	size_t		need = 0;
387*62c3776aSGarrett D'Amore 	int		ndir = lcc->lc_directive_count;
3886b5e5868SGarrett D'Amore 
3896b5e5868SGarrett D'Amore 	assert(src);
3906b5e5868SGarrett D'Amore 
391*62c3776aSGarrett D'Amore 	for (pass = 0; pass <= ndir; pass++) {
3926b5e5868SGarrett D'Amore 
3936b5e5868SGarrett D'Amore 		state = NULL;
3946b5e5868SGarrett D'Amore 
3956b5e5868SGarrett D'Amore 		if (pass != 0) {
3966b5e5868SGarrett D'Amore 			/* insert level separator from the previous pass */
3976b5e5868SGarrett D'Amore 			if (room) {
3986b5e5868SGarrett D'Amore 				*xf++ = 1;
3996b5e5868SGarrett D'Amore 				room--;
4006b5e5868SGarrett D'Amore 			}
4016b5e5868SGarrett D'Amore 			want++;
4026b5e5868SGarrett D'Amore 		}
4036b5e5868SGarrett D'Amore 
4046b5e5868SGarrett D'Amore 		/* special pass for undefined */
405*62c3776aSGarrett D'Amore 		if (pass == ndir) {
4066b5e5868SGarrett D'Amore 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
4076b5e5868SGarrett D'Amore 		} else {
408*62c3776aSGarrett D'Amore 			direc = lcc->lc_directive[pass];
4096b5e5868SGarrett D'Amore 		}
4106b5e5868SGarrett D'Amore 
4116b5e5868SGarrett D'Amore 		t = src;
4126b5e5868SGarrett D'Amore 
4136b5e5868SGarrett D'Amore 		if (direc & DIRECTIVE_BACKWARD) {
4146b5e5868SGarrett D'Amore 			wchar_t *bp, *fp, c;
4156b5e5868SGarrett D'Amore 			if (tr)
4166b5e5868SGarrett D'Amore 				free(tr);
4176b5e5868SGarrett D'Amore 			if ((tr = wcsdup(t)) == NULL) {
4186b5e5868SGarrett D'Amore 				errno = ENOMEM;
4196b5e5868SGarrett D'Amore 				goto fail;
4206b5e5868SGarrett D'Amore 			}
4216b5e5868SGarrett D'Amore 			bp = tr;
4226b5e5868SGarrett D'Amore 			fp = tr + wcslen(tr) - 1;
4236b5e5868SGarrett D'Amore 			while (bp < fp) {
4246b5e5868SGarrett D'Amore 				c = *bp;
4256b5e5868SGarrett D'Amore 				*bp++ = *fp;
4266b5e5868SGarrett D'Amore 				*fp-- = c;
4276b5e5868SGarrett D'Amore 			}
4286b5e5868SGarrett D'Amore 			t = (const wchar_t *)tr;
4296b5e5868SGarrett D'Amore 		}
4306b5e5868SGarrett D'Amore 
4316b5e5868SGarrett D'Amore 		if (direc & DIRECTIVE_POSITION) {
4326b5e5868SGarrett D'Amore 			while (*t || state) {
433*62c3776aSGarrett D'Amore 				_collate_lookup(lcc, t, &len, &pri, pass,
434*62c3776aSGarrett D'Amore 				    &state);
4356b5e5868SGarrett D'Amore 				t += len;
4366b5e5868SGarrett D'Amore 				if (pri <= 0) {
4376b5e5868SGarrett D'Amore 					if (pri < 0) {
4386b5e5868SGarrett D'Amore 						errno = EINVAL;
4396b5e5868SGarrett D'Amore 						goto fail;
4406b5e5868SGarrett D'Amore 					}
4416b5e5868SGarrett D'Amore 					pri = COLLATE_MAX_PRIORITY;
4426b5e5868SGarrett D'Amore 				}
4436b5e5868SGarrett D'Amore 				if (room) {
4446b5e5868SGarrett D'Amore 					*xf++ = pri;
4456b5e5868SGarrett D'Amore 					room--;
4466b5e5868SGarrett D'Amore 				}
4476b5e5868SGarrett D'Amore 				want++;
4486b5e5868SGarrett D'Amore 				need = want;
4496b5e5868SGarrett D'Amore 			}
4506b5e5868SGarrett D'Amore 		} else {
4516b5e5868SGarrett D'Amore 			while (*t || state) {
452*62c3776aSGarrett D'Amore 				_collate_lookup(lcc, t, &len, &pri, pass,
453*62c3776aSGarrett D'Amore 				    &state);
4546b5e5868SGarrett D'Amore 				t += len;
4556b5e5868SGarrett D'Amore 				if (pri <= 0) {
4566b5e5868SGarrett D'Amore 					if (pri < 0) {
4576b5e5868SGarrett D'Amore 						errno = EINVAL;
4586b5e5868SGarrett D'Amore 						goto fail;
4596b5e5868SGarrett D'Amore 					}
4606b5e5868SGarrett D'Amore 					continue;
4616b5e5868SGarrett D'Amore 				}
4626b5e5868SGarrett D'Amore 				if (room) {
4636b5e5868SGarrett D'Amore 					*xf++ = pri;
4646b5e5868SGarrett D'Amore 					room--;
4656b5e5868SGarrett D'Amore 				}
4666b5e5868SGarrett D'Amore 				want++;
4676b5e5868SGarrett D'Amore 				need = want;
4686b5e5868SGarrett D'Amore 			}
4696b5e5868SGarrett D'Amore 		}
4706b5e5868SGarrett D'Amore 	}
4716b5e5868SGarrett D'Amore 
4726b5e5868SGarrett D'Amore end:
4736b5e5868SGarrett D'Amore 	if (tr)
4746b5e5868SGarrett D'Amore 		free(tr);
4756b5e5868SGarrett D'Amore 	return (need);
4766b5e5868SGarrett D'Amore 
4776b5e5868SGarrett D'Amore fail:
4786b5e5868SGarrett D'Amore 	if (tr)
4796b5e5868SGarrett D'Amore 		free(tr);
4806b5e5868SGarrett D'Amore 	return ((size_t)(-1));
4816b5e5868SGarrett D'Amore }
4826b5e5868SGarrett D'Amore 
4836b5e5868SGarrett D'Amore /*
4846b5e5868SGarrett D'Amore  * In the non-POSIX case, we transform each character into a string of
4856b5e5868SGarrett D'Amore  * characters representing the character's priority.  Since char is usually
4866b5e5868SGarrett D'Amore  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
4876b5e5868SGarrett D'Amore  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
4886b5e5868SGarrett D'Amore  * bits per byte.
4896b5e5868SGarrett D'Amore  *
4906b5e5868SGarrett D'Amore  * It turns out that we sometimes have real priorities that are
4916b5e5868SGarrett D'Amore  * 31-bits wide.  (But: be careful using priorities where the high
4926b5e5868SGarrett D'Amore  * order bit is set -- i.e. the priority is negative.  The sort order
4936b5e5868SGarrett D'Amore  * may be surprising!)
4946b5e5868SGarrett D'Amore  *
4956b5e5868SGarrett D'Amore  * TODO: This would be a good area to optimize somewhat.  It turns out
4966b5e5868SGarrett D'Amore  * that real prioririties *except for the last UNDEFINED pass* are generally
4976b5e5868SGarrett D'Amore  * very small.  We need the localedef code to precalculate the max
4986b5e5868SGarrett D'Amore  * priority for us, and ideally also give us a mask, and then we could
4996b5e5868SGarrett D'Amore  * severely limit what we expand to.
5006b5e5868SGarrett D'Amore  */
5016b5e5868SGarrett D'Amore #define	XFRM_BYTES	6
5026b5e5868SGarrett D'Amore #define	XFRM_OFFSET	('0')	/* make all printable characters */
5036b5e5868SGarrett D'Amore #define	XFRM_SHIFT	6
5046b5e5868SGarrett D'Amore #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
5056b5e5868SGarrett D'Amore #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
5066b5e5868SGarrett D'Amore 
507723fee08SGarrett D'Amore static int
xfrm(locale_t loc,unsigned char * p,int pri,int pass)508*62c3776aSGarrett D'Amore xfrm(locale_t loc, unsigned char *p, int pri, int pass)
5096b5e5868SGarrett D'Amore {
510723fee08SGarrett D'Amore 	/* we use unsigned to ensure zero fill on right shift */
511*62c3776aSGarrett D'Amore 	uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass];
512723fee08SGarrett D'Amore 	int nc = 0;
513723fee08SGarrett D'Amore 
514723fee08SGarrett D'Amore 	while (val) {
515723fee08SGarrett D'Amore 		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
5166b5e5868SGarrett D'Amore 		pri >>= XFRM_SHIFT;
517723fee08SGarrett D'Amore 		val >>= XFRM_SHIFT;
518723fee08SGarrett D'Amore 		p++;
519723fee08SGarrett D'Amore 		nc++;
5206b5e5868SGarrett D'Amore 	}
521723fee08SGarrett D'Amore 	return (nc);
5226b5e5868SGarrett D'Amore }
5236b5e5868SGarrett D'Amore 
5246b5e5868SGarrett D'Amore size_t
_collate_sxfrm(const wchar_t * src,char * xf,size_t room,locale_t loc)525*62c3776aSGarrett D'Amore _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc)
5266b5e5868SGarrett D'Amore {
5276b5e5868SGarrett D'Amore 	int		pri;
5286b5e5868SGarrett D'Amore 	int		len;
5296b5e5868SGarrett D'Amore 	const wchar_t	*t;
5306b5e5868SGarrett D'Amore 	wchar_t		*tr = NULL;
5316b5e5868SGarrett D'Amore 	int		direc;
5326b5e5868SGarrett D'Amore 	int		pass;
533*62c3776aSGarrett D'Amore 	const int32_t 	*state;
5346b5e5868SGarrett D'Amore 	size_t		want = 0;
5356b5e5868SGarrett D'Amore 	size_t		need = 0;
5366b5e5868SGarrett D'Amore 	int		b;
5376b5e5868SGarrett D'Amore 	uint8_t		buf[XFRM_BYTES];
538*62c3776aSGarrett D'Amore 	const struct lc_collate *lcc = loc->collate;
539*62c3776aSGarrett D'Amore 	int		ndir = lcc->lc_directive_count;
5406b5e5868SGarrett D'Amore 
5416b5e5868SGarrett D'Amore 	assert(src);
5426b5e5868SGarrett D'Amore 
543*62c3776aSGarrett D'Amore 	for (pass = 0; pass <= ndir; pass++) {
5446b5e5868SGarrett D'Amore 
5456b5e5868SGarrett D'Amore 		state = NULL;
5466b5e5868SGarrett D'Amore 
5476b5e5868SGarrett D'Amore 		if (pass != 0) {
5486b5e5868SGarrett D'Amore 			/* insert level separator from the previous pass */
5496b5e5868SGarrett D'Amore 			if (room) {
5506b5e5868SGarrett D'Amore 				*xf++ = XFRM_SEP;
5516b5e5868SGarrett D'Amore 				room--;
5526b5e5868SGarrett D'Amore 			}
5536b5e5868SGarrett D'Amore 			want++;
5546b5e5868SGarrett D'Amore 		}
5556b5e5868SGarrett D'Amore 
5566b5e5868SGarrett D'Amore 		/* special pass for undefined */
557*62c3776aSGarrett D'Amore 		if (pass == ndir) {
5586b5e5868SGarrett D'Amore 			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
5596b5e5868SGarrett D'Amore 		} else {
560*62c3776aSGarrett D'Amore 			direc = lcc->lc_directive[pass];
5616b5e5868SGarrett D'Amore 		}
5626b5e5868SGarrett D'Amore 
5636b5e5868SGarrett D'Amore 		t = src;
5646b5e5868SGarrett D'Amore 
5656b5e5868SGarrett D'Amore 		if (direc & DIRECTIVE_BACKWARD) {
5666b5e5868SGarrett D'Amore 			wchar_t *bp, *fp, c;
5676b5e5868SGarrett D'Amore 			if (tr)
5686b5e5868SGarrett D'Amore 				free(tr);
5696b5e5868SGarrett D'Amore 			if ((tr = wcsdup(t)) == NULL) {
5706b5e5868SGarrett D'Amore 				errno = ENOMEM;
5716b5e5868SGarrett D'Amore 				goto fail;
5726b5e5868SGarrett D'Amore 			}
5736b5e5868SGarrett D'Amore 			bp = tr;
5746b5e5868SGarrett D'Amore 			fp = tr + wcslen(tr) - 1;
5756b5e5868SGarrett D'Amore 			while (bp < fp) {
5766b5e5868SGarrett D'Amore 				c = *bp;
5776b5e5868SGarrett D'Amore 				*bp++ = *fp;
5786b5e5868SGarrett D'Amore 				*fp-- = c;
5796b5e5868SGarrett D'Amore 			}
5806b5e5868SGarrett D'Amore 			t = (const wchar_t *)tr;
5816b5e5868SGarrett D'Amore 		}
5826b5e5868SGarrett D'Amore 
5836b5e5868SGarrett D'Amore 		if (direc & DIRECTIVE_POSITION) {
5846b5e5868SGarrett D'Amore 			while (*t || state) {
5856b5e5868SGarrett D'Amore 
586*62c3776aSGarrett D'Amore 				_collate_lookup(lcc, t, &len, &pri, pass,
587*62c3776aSGarrett D'Amore 				    &state);
5886b5e5868SGarrett D'Amore 				t += len;
5896b5e5868SGarrett D'Amore 				if (pri <= 0) {
5906b5e5868SGarrett D'Amore 					if (pri < 0) {
5916b5e5868SGarrett D'Amore 						errno = EINVAL;
5926b5e5868SGarrett D'Amore 						goto fail;
5936b5e5868SGarrett D'Amore 					}
5946b5e5868SGarrett D'Amore 					pri = COLLATE_MAX_PRIORITY;
5956b5e5868SGarrett D'Amore 				}
5966b5e5868SGarrett D'Amore 
597*62c3776aSGarrett D'Amore 				b = xfrm(loc, buf, pri, pass);
598723fee08SGarrett D'Amore 				want += b;
5996b5e5868SGarrett D'Amore 				if (room) {
6006b5e5868SGarrett D'Amore 					while (b) {
6016b5e5868SGarrett D'Amore 						b--;
60263481050STheo Schlossnagle 						if (room) {
6036b5e5868SGarrett D'Amore 							*xf++ = buf[b];
6046b5e5868SGarrett D'Amore 							room--;
6056b5e5868SGarrett D'Amore 						}
6066b5e5868SGarrett D'Amore 					}
60763481050STheo Schlossnagle 				}
6086b5e5868SGarrett D'Amore 				need = want;
6096b5e5868SGarrett D'Amore 			}
6106b5e5868SGarrett D'Amore 		} else {
6116b5e5868SGarrett D'Amore 			while (*t || state) {
612*62c3776aSGarrett D'Amore 				_collate_lookup(lcc, t, &len, &pri, pass,
613*62c3776aSGarrett D'Amore 				    &state);
6146b5e5868SGarrett D'Amore 				t += len;
6156b5e5868SGarrett D'Amore 				if (pri <= 0) {
6166b5e5868SGarrett D'Amore 					if (pri < 0) {
6176b5e5868SGarrett D'Amore 						errno = EINVAL;
6186b5e5868SGarrett D'Amore 						goto fail;
6196b5e5868SGarrett D'Amore 					}
6206b5e5868SGarrett D'Amore 					continue;
6216b5e5868SGarrett D'Amore 				}
622723fee08SGarrett D'Amore 
623*62c3776aSGarrett D'Amore 				b = xfrm(loc, buf, pri, pass);
624723fee08SGarrett D'Amore 				want += b;
6256b5e5868SGarrett D'Amore 				if (room) {
6266b5e5868SGarrett D'Amore 
6276b5e5868SGarrett D'Amore 					while (b) {
6286b5e5868SGarrett D'Amore 						b--;
62963481050STheo Schlossnagle 						if (room) {
6306b5e5868SGarrett D'Amore 							*xf++ = buf[b];
6316b5e5868SGarrett D'Amore 							room--;
6326b5e5868SGarrett D'Amore 						}
6336b5e5868SGarrett D'Amore 					}
63463481050STheo Schlossnagle 				}
6356b5e5868SGarrett D'Amore 				need = want;
6366b5e5868SGarrett D'Amore 			}
6376b5e5868SGarrett D'Amore 		}
6386b5e5868SGarrett D'Amore 	}
6396b5e5868SGarrett D'Amore 
6406b5e5868SGarrett D'Amore end:
6416b5e5868SGarrett D'Amore 	if (tr)
6426b5e5868SGarrett D'Amore 		free(tr);
6436b5e5868SGarrett D'Amore 	return (need);
6446b5e5868SGarrett D'Amore 
6456b5e5868SGarrett D'Amore fail:
6466b5e5868SGarrett D'Amore 	if (tr)
6476b5e5868SGarrett D'Amore 		free(tr);
6486b5e5868SGarrett D'Amore 	return ((size_t)(-1));
6494297a3b0SGarrett D'Amore }
650