xref: /illumos-gate/usr/src/cmd/localedef/collate.c (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Nexenta Systems, Inc.
14  */
15 
16 /*
17  * LC_COLLATE database generation routines for localedef.
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <wchar.h>
29 #include <widec.h>
30 #include <limits.h>
31 #include "localedef.h"
32 #include "parser.tab.h"
33 #include "collatefile.h"
34 
35 /*
36  * Design notes.
37  *
38  * It will be extremely helpful to the reader if they have access to
39  * the localedef and locale file format specifications available.
40  * Latest versions of these are available from www.opengroup.org.
41  *
42  * The design for the collation code is a bit complex.  The goal is a
43  * single collation database as described in collate.h (in
44  * libc/port/locale).  However, there are some other tidbits:
45  *
46  * a) The substitution entries are now a directly indexable array.  A
47  * priority elsewhere in the table is taken as an index into the
48  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49  * set.  (The bit is cleared and the result is the index into the
50  * table.
51  *
52  * b) We eliminate duplicate entries into the substitution table.
53  * This saves a lot of space.
54  *
55  * c) The priorities for each level are "compressed", so that each
56  * sorting level has consecutively numbered priorities starting at 1.
57  * (O is reserved for the ignore priority.)  This means sort levels
58  * which only have a few distinct priorities can represent the
59  * priority level in fewer bits, which makes the strxfrm output
60  * smaller.
61  *
62  * d) We record the total number of priorities so that strxfrm can
63  * figure out how many bytes to expand a numeric priority into.
64  *
65  * e) For the UNDEFINED pass (the last pass), we record the maximum
66  * number of bits needed to uniquely prioritize these entries, so that
67  * the last pass can also use smaller strxfrm output when possible.
68  *
69  * f) Priorities with the sign bit set are verboten.  This works out
70  * because no active character set needs that bit to carry significant
71  * information once the character is in wide form.
72  *
73  * To process the entire data to make the database, we actually run
74  * multiple passes over the data.
75  *
76  * The first pass, which is done at parse time, identifies elements,
77  * substitutions, and such, and records them in priority order.  As
78  * some priorities can refer to other priorities, using forward
79  * references, we use a table of references indicating whether the
80  * priority's value has been resolved, or whether it is still a
81  * reference.
82  *
83  * The second pass walks over all the items in priority order, noting
84  * that they are used directly, and not just an indirect reference.
85  * This is done by creating a "weight" structure for the item.  The
86  * weights are stashed in an AVL tree sorted by relative "priority".
87  *
88  * The third pass walks over all the weight structures, in priority
89  * order, and assigns a new monotonically increasing (per sort level)
90  * weight value to them.  These are the values that will actually be
91  * written to the file.
92  *
93  * The fourth pass just writes the data out.
94  */
95 
96 /*
97  * In order to resolve the priorities, we create a table of priorities.
98  * Entries in the table can be in one of three states.
99  *
100  * UNKNOWN is for newly allocated entries, and indicates that nothing
101  * is known about the priority.  (For example, when new entries are created
102  * for collating-symbols, this is the value assigned for them until the
103  * collating symbol's order has been determined.
104  *
105  * RESOLVED is used for an entry where the priority indicates the final
106  * numeric weight.
107  *
108  * REFER is used for entries that reference other entries.  Typically
109  * this is used for forward references.  A collating-symbol can never
110  * have this value.
111  *
112  * The "pass" field is used during final resolution to aid in detection
113  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
114  * priority dependent on <A>.)
115  */
116 typedef enum {
117 	UNKNOWN,	/* priority is totally unknown */
118 	RESOLVED,	/* priority value fully resolved */
119 	REFER		/* priority is a reference (index) */
120 } res_t;
121 
122 typedef struct weight {
123 	int32_t		pri;
124 	int		opt;
125 	avl_node_t	avl;
126 } weight_t;
127 
128 typedef struct priority {
129 	res_t		res;
130 	int32_t		pri;
131 	int		pass;
132 	int		lineno;
133 } collpri_t;
134 
135 #define	NUM_WT	collinfo.directive_count
136 
137 /*
138  * These are the abstract collating symbols, which are just a symbolic
139  * way to reference a priority.
140  */
141 struct collsym {
142 	char		*name;
143 	int32_t		ref;
144 	avl_node_t	avl;
145 };
146 
147 /*
148  * These are also abstract collating symbols, but we allow them to have
149  * different priorities at different levels.
150  */
151 typedef struct collundef {
152 	char		*name;
153 	int32_t		ref[COLL_WEIGHTS_MAX];
154 	avl_node_t	avl;
155 } collundef_t;
156 
157 /*
158  * These are called "chains" in libc.  This records the fact that two
159  * more characters should be treated as a single collating entity when
160  * they appear together.  For example, in Spanish <C><h> gets collated
161  * as a character between <C> and <D>.
162  */
163 struct collelem {
164 	char		*symbol;
165 	wchar_t		*expand;
166 	int32_t		ref[COLL_WEIGHTS_MAX];
167 	avl_node_t	avl_bysymbol;
168 	avl_node_t	avl_byexpand;
169 };
170 
171 /*
172  * Individual characters have a sequence of weights as well.
173  */
174 typedef struct collchar {
175 	wchar_t		wc;
176 	int32_t		ref[COLL_WEIGHTS_MAX];
177 	avl_node_t	avl;
178 } collchar_t;
179 
180 /*
181  * Substitution entries.  The key is itself a priority.  Note that
182  * when we create one of these, we *automatically* wind up with a
183  * fully resolved priority for the key, because creation of
184  * substitutions creates a resolved priority at the same time.
185  */
186 typedef struct {
187 	int32_t		key;
188 	int32_t		ref[COLLATE_STR_LEN];
189 	avl_node_t	avl;
190 	avl_node_t	avl_ref;
191 } subst_t;
192 
193 static avl_tree_t	collsyms;
194 static avl_tree_t	collundefs;
195 static avl_tree_t	elem_by_symbol;
196 static avl_tree_t	elem_by_expand;
197 static avl_tree_t	collchars;
198 static avl_tree_t	substs[COLL_WEIGHTS_MAX];
199 static avl_tree_t	substs_ref[COLL_WEIGHTS_MAX];
200 static avl_tree_t	weights[COLL_WEIGHTS_MAX];
201 static int32_t		nweight[COLL_WEIGHTS_MAX];
202 
203 /*
204  * This is state tracking for the ellipsis token.  Note that we start
205  * the initial values so that the ellipsis logic will think we got a
206  * magic starting value of NUL.  It starts at minus one because the
207  * starting point is exclusive -- i.e. the starting point is not
208  * itself handled by the ellipsis code.
209  */
210 static int currorder = EOF;
211 static int lastorder = EOF;
212 static collelem_t *currelem;
213 static collchar_t *currchar;
214 static collundef_t *currundef;
215 static wchar_t ellipsis_start = 0;
216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
217 
218 /*
219  * We keep a running tally of weights.
220  */
221 static int nextpri = 1;
222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
223 
224 /*
225  * This array collects up the weights for each level.
226  */
227 static int32_t order_weights[COLL_WEIGHTS_MAX];
228 static int curr_weight = 0;
229 static int32_t subst_weights[COLLATE_STR_LEN];
230 static int curr_subst = 0;
231 
232 /*
233  * Some initial priority values.
234  */
235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
236 static int32_t pri_ignore;
237 
238 static collate_info_t collinfo;
239 
240 static collpri_t	*prilist = NULL;
241 static int		numpri = 0;
242 static int		maxpri = 0;
243 
244 static void start_order(int);
245 
246 static int32_t
247 new_pri(void)
248 {
249 	int i;
250 
251 	if (numpri >= maxpri) {
252 		maxpri = maxpri ? maxpri * 2 : 1024;
253 		prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
254 		if (prilist == NULL) {
255 			errf(_("out of memory"));
256 			return (-1);
257 		}
258 		for (i = numpri; i < maxpri; i++) {
259 			prilist[i].res = UNKNOWN;
260 			prilist[i].pri = 0;
261 			prilist[i].pass = 0;
262 		}
263 	}
264 	return (numpri++);
265 }
266 
267 static collpri_t *
268 get_pri(int32_t ref)
269 {
270 	if ((ref < 0) || (ref > numpri)) {
271 		INTERR;
272 		return (NULL);
273 	}
274 	return (&prilist[ref]);
275 }
276 
277 static void
278 set_pri(int32_t ref, int32_t v, res_t res)
279 {
280 	collpri_t	*pri;
281 
282 	pri = get_pri(ref);
283 
284 	if ((res == REFER) && ((v < 0) || (v >= numpri))) {
285 		INTERR;
286 	}
287 
288 	/* Resolve self references */
289 	if ((res == REFER) && (ref == v)) {
290 		v = nextpri;
291 		res = RESOLVED;
292 	}
293 
294 	if (pri->res != UNKNOWN) {
295 		warn(_("repeated item in order list (first on %d)"),
296 		    pri->lineno);
297 		return;
298 	}
299 	pri->lineno = lineno;
300 	pri->pri = v;
301 	pri->res = res;
302 }
303 
304 static int32_t
305 resolve_pri(int32_t ref)
306 {
307 	collpri_t	*pri;
308 	static int32_t	pass = 0;
309 
310 	pri = get_pri(ref);
311 	pass++;
312 	while (pri->res == REFER) {
313 		if (pri->pass == pass) {
314 			/* report a line with the circular symbol */
315 			lineno = pri->lineno;
316 			errf(_("circular reference in order list"));
317 			return (-1);
318 		}
319 		if ((pri->pri < 0) || (pri->pri >= numpri)) {
320 			INTERR;
321 			return (-1);
322 		}
323 		pri->pass = pass;
324 		pri = &prilist[pri->pri];
325 	}
326 
327 	if (pri->res == UNKNOWN) {
328 		return (-1);
329 	}
330 	if (pri->res != RESOLVED)
331 		INTERR;
332 
333 	return (pri->pri);
334 }
335 
336 static int
337 weight_compare(const void *n1, const void *n2)
338 {
339 	int32_t	k1 = ((const weight_t *)n1)->pri;
340 	int32_t	k2 = ((const weight_t *)n2)->pri;
341 
342 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
343 }
344 
345 static int
346 collsym_compare(const void *n1, const void *n2)
347 {
348 	const collsym_t *c1 = n1;
349 	const collsym_t *c2 = n2;
350 	int rv;
351 
352 	rv = strcmp(c1->name, c2->name);
353 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
354 }
355 
356 static int
357 collundef_compare(const void *n1, const void *n2)
358 {
359 	const collundef_t *c1 = n1;
360 	const collundef_t *c2 = n2;
361 	int rv;
362 
363 	rv = strcmp(c1->name, c2->name);
364 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
365 }
366 
367 static int
368 element_compare_symbol(const void *n1, const void *n2)
369 {
370 	const collelem_t *c1 = n1;
371 	const collelem_t *c2 = n2;
372 	int rv;
373 
374 	rv = strcmp(c1->symbol, c2->symbol);
375 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
376 }
377 
378 static int
379 element_compare_expand(const void *n1, const void *n2)
380 {
381 	const collelem_t *c1 = n1;
382 	const collelem_t *c2 = n2;
383 	int rv;
384 
385 	rv = wcscmp(c1->expand, c2->expand);
386 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
387 }
388 
389 static int
390 collchar_compare(const void *n1, const void *n2)
391 {
392 	wchar_t	k1 = ((const collchar_t *)n1)->wc;
393 	wchar_t	k2 = ((const collchar_t *)n2)->wc;
394 
395 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
396 }
397 
398 static int
399 subst_compare(const void *n1, const void *n2)
400 {
401 	int32_t	k1 = ((const subst_t *)n1)->key;
402 	int32_t	k2 = ((const subst_t *)n2)->key;
403 
404 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
405 }
406 
407 static int
408 subst_compare_ref(const void *n1, const void *n2)
409 {
410 	int32_t *c1 = ((subst_t *)n1)->ref;
411 	int32_t *c2 = ((subst_t *)n2)->ref;
412 	int rv;
413 
414 	rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
415 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
416 }
417 
418 void
419 init_collate(void)
420 {
421 	int i;
422 
423 	avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
424 	    offsetof(collsym_t, avl));
425 
426 	avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
427 	    offsetof(collundef_t, avl));
428 
429 	avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
430 	    offsetof(collelem_t, avl_bysymbol));
431 	avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
432 	    offsetof(collelem_t, avl_byexpand));
433 
434 	avl_create(&collchars, collchar_compare, sizeof (collchar_t),
435 	    offsetof(collchar_t, avl));
436 
437 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
438 		avl_create(&substs[i], subst_compare, sizeof (subst_t),
439 		    offsetof(subst_t, avl));
440 		avl_create(&substs_ref[i], subst_compare_ref,
441 		    sizeof (subst_t), offsetof(subst_t, avl_ref));
442 		avl_create(&weights[i], weight_compare, sizeof (weight_t),
443 		    offsetof(weight_t, avl));
444 		nweight[i] = 1;
445 	}
446 
447 	(void) memset(&collinfo, 0, sizeof (collinfo));
448 
449 	/* allocate some initial priorities */
450 	pri_ignore = new_pri();
451 
452 	set_pri(pri_ignore, 0, RESOLVED);
453 
454 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
455 		pri_undefined[i] = new_pri();
456 
457 		/* we will override this later */
458 		set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
459 	}
460 }
461 
462 void
463 define_collsym(char *name)
464 {
465 	collsym_t	*sym;
466 	avl_index_t	where;
467 
468 	if ((sym = calloc(1, sizeof (*sym))) == NULL) {
469 		errf(_("out of memory"));
470 		return;
471 	}
472 	sym->name = name;
473 	sym->ref = new_pri();
474 
475 	if (avl_find(&collsyms, sym, &where) != NULL) {
476 		/*
477 		 * This should never happen because we are only called
478 		 * for undefined symbols.
479 		 */
480 		free(sym);
481 		INTERR;
482 		return;
483 	}
484 	avl_insert(&collsyms, sym, where);
485 }
486 
487 collsym_t *
488 lookup_collsym(char *name)
489 {
490 	collsym_t	srch;
491 
492 	srch.name = name;
493 	return (avl_find(&collsyms, &srch, NULL));
494 }
495 
496 collelem_t *
497 lookup_collelem(char *symbol)
498 {
499 	collelem_t	srch;
500 
501 	srch.symbol = symbol;
502 	return (avl_find(&elem_by_symbol, &srch, NULL));
503 }
504 
505 static collundef_t *
506 get_collundef(char *name)
507 {
508 	collundef_t	srch;
509 	collundef_t	*ud;
510 	avl_index_t	where;
511 	int		i;
512 
513 	srch.name = name;
514 	if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
515 		if (((ud = calloc(1, sizeof (*ud))) == NULL) ||
516 		    ((ud->name = strdup(name)) == NULL)) {
517 			errf(_("out of memory"));
518 			free(ud);
519 			return (NULL);
520 		}
521 		for (i = 0; i < NUM_WT; i++) {
522 			ud->ref[i] = new_pri();
523 		}
524 		avl_insert(&collundefs, ud, where);
525 	}
526 	add_charmap_undefined(name);
527 	return (ud);
528 }
529 
530 static collchar_t *
531 get_collchar(wchar_t wc, int create)
532 {
533 	collchar_t	srch;
534 	collchar_t	*cc;
535 	avl_index_t	where;
536 	int		i;
537 
538 	srch.wc = wc;
539 	cc = avl_find(&collchars, &srch, &where);
540 	if ((cc == NULL) && create) {
541 		if ((cc = calloc(1, sizeof (*cc))) == NULL) {
542 			errf(_("out of memory"));
543 			return (NULL);
544 		}
545 		for (i = 0; i < NUM_WT; i++) {
546 			cc->ref[i] = new_pri();
547 		}
548 		cc->wc = wc;
549 		avl_insert(&collchars, cc, where);
550 	}
551 	return (cc);
552 }
553 
554 void
555 end_order_collsym(collsym_t *sym)
556 {
557 	start_order(T_COLLSYM);
558 	/* update the weight */
559 
560 	set_pri(sym->ref, nextpri, RESOLVED);
561 	nextpri++;
562 }
563 
564 void
565 end_order(void)
566 {
567 	int		i;
568 	int32_t		pri;
569 	int32_t		ref;
570 	collpri_t	*p;
571 
572 	/* advance the priority/weight */
573 	pri = nextpri;
574 
575 	switch (currorder) {
576 	case T_CHAR:
577 		for (i = 0; i < NUM_WT; i++) {
578 			if (((ref = order_weights[i]) < 0) ||
579 			    ((p = get_pri(ref)) == NULL) ||
580 			    (p->pri == -1)) {
581 				/* unspecified weight is a self reference */
582 				set_pri(currchar->ref[i], pri, RESOLVED);
583 			} else {
584 				set_pri(currchar->ref[i], ref, REFER);
585 			}
586 			order_weights[i] = -1;
587 		}
588 
589 		/* leave a cookie trail in case next symbol is ellipsis */
590 		ellipsis_start = currchar->wc + 1;
591 		currchar = NULL;
592 		break;
593 
594 	case T_ELLIPSIS:
595 		/* save off the weights were we can find them */
596 		for (i = 0; i < NUM_WT; i++) {
597 			ellipsis_weights[i] = order_weights[i];
598 			order_weights[i] = -1;
599 		}
600 		break;
601 
602 	case T_COLLELEM:
603 		if (currelem == NULL) {
604 			INTERR;
605 		} else {
606 			for (i = 0; i < NUM_WT; i++) {
607 
608 				if (((ref = order_weights[i]) < 0) ||
609 				    ((p = get_pri(ref)) == NULL) ||
610 				    (p->pri == -1)) {
611 					set_pri(currelem->ref[i], pri,
612 					    RESOLVED);
613 				} else {
614 					set_pri(currelem->ref[i], ref, REFER);
615 				}
616 				order_weights[i] = -1;
617 			}
618 		}
619 		break;
620 
621 	case T_UNDEFINED:
622 		for (i = 0; i < NUM_WT; i++) {
623 			if (((ref = order_weights[i]) < 0) ||
624 			    ((p = get_pri(ref)) == NULL) ||
625 			    (p->pri == -1)) {
626 				set_pri(pri_undefined[i], -1, RESOLVED);
627 			} else {
628 				set_pri(pri_undefined[i], ref, REFER);
629 			}
630 			order_weights[i] = -1;
631 		}
632 		break;
633 
634 	case T_SYMBOL:
635 		for (i = 0; i < NUM_WT; i++) {
636 			if (((ref = order_weights[i]) < 0) ||
637 			    ((p = get_pri(ref)) == NULL) ||
638 			    (p->pri == -1)) {
639 				set_pri(currundef->ref[i], pri, RESOLVED);
640 			} else {
641 				set_pri(currundef->ref[i], ref, REFER);
642 			}
643 			order_weights[i] = -1;
644 		}
645 		break;
646 
647 	default:
648 		INTERR;
649 	}
650 
651 	nextpri++;
652 }
653 
654 static void
655 start_order(int type)
656 {
657 	int	i;
658 
659 	lastorder = currorder;
660 	currorder = type;
661 
662 	/* this is used to protect ELLIPSIS processing */
663 	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
664 		errf(_("character value expected"));
665 	}
666 
667 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
668 		order_weights[i] = -1;
669 	}
670 	curr_weight = 0;
671 }
672 
673 void
674 start_order_undefined(void)
675 {
676 	start_order(T_UNDEFINED);
677 }
678 
679 void
680 start_order_symbol(char *name)
681 {
682 	currundef = get_collundef(name);
683 	start_order(T_SYMBOL);
684 }
685 
686 void
687 start_order_char(wchar_t wc)
688 {
689 	collchar_t	*cc;
690 	int32_t		ref;
691 
692 	start_order(T_CHAR);
693 
694 	/*
695 	 * If we last saw an ellipsis, then we need to close the range.
696 	 * Handle that here.  Note that we have to be careful because the
697 	 * items *inside* the range are treated exclusiveley to the items
698 	 * outside of the range.  The ends of the range can have quite
699 	 * different weights than the range members.
700 	 */
701 	if (lastorder == T_ELLIPSIS) {
702 		int		i;
703 
704 		if (wc < ellipsis_start) {
705 			errf(_("malformed range!"));
706 			return;
707 		}
708 		while (ellipsis_start < wc) {
709 			/*
710 			 * pick all of the saved weights for the
711 			 * ellipsis.  note that -1 encodes for the
712 			 * ellipsis itself, which means to take the
713 			 * current relative priority.
714 			 */
715 			if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
716 				INTERR;
717 				return;
718 			}
719 			for (i = 0; i < NUM_WT; i++) {
720 				collpri_t *p;
721 				if (((ref = ellipsis_weights[i]) == -1) ||
722 				    ((p = get_pri(ref)) == NULL) ||
723 				    (p->pri == -1)) {
724 					set_pri(cc->ref[i], nextpri, RESOLVED);
725 				} else {
726 					set_pri(cc->ref[i], ref, REFER);
727 				}
728 				ellipsis_weights[i] = 0;
729 			}
730 			ellipsis_start++;
731 			nextpri++;
732 		}
733 	}
734 
735 	currchar = get_collchar(wc, 1);
736 }
737 
738 void
739 start_order_collelem(collelem_t *e)
740 {
741 	start_order(T_COLLELEM);
742 	currelem = e;
743 }
744 
745 void
746 start_order_ellipsis(void)
747 {
748 	int	i;
749 
750 	start_order(T_ELLIPSIS);
751 
752 	if (lastorder != T_CHAR) {
753 		errf(_("illegal starting point for range"));
754 		return;
755 	}
756 
757 	for (i = 0; i < NUM_WT; i++) {
758 		ellipsis_weights[i] = order_weights[i];
759 	}
760 }
761 
762 void
763 define_collelem(char *name, wchar_t *wcs)
764 {
765 	collelem_t	*e;
766 	avl_index_t	where1;
767 	avl_index_t	where2;
768 	int		i;
769 
770 	if (wcslen(wcs) >= COLLATE_STR_LEN) {
771 		errf(_("expanded collation element too long"));
772 		return;
773 	}
774 
775 	if ((e = calloc(1, sizeof (*e))) == NULL) {
776 		errf(_("out of memory"));
777 		return;
778 	}
779 	e->expand = wcs;
780 	e->symbol = name;
781 
782 	/*
783 	 * This is executed before the order statement, so we don't
784 	 * know how many priorities we *really* need.  We allocate one
785 	 * for each possible weight.  Not a big deal, as collating-elements
786 	 * prove to be quite rare.
787 	 */
788 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
789 		e->ref[i] = new_pri();
790 	}
791 
792 	/* A character sequence can only reduce to one element. */
793 	if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
794 	    (avl_find(&elem_by_expand, e, &where2) != NULL)) {
795 		errf(_("duplicate collating element definition"));
796 		free(e);
797 		return;
798 	}
799 	avl_insert(&elem_by_symbol, e, where1);
800 	avl_insert(&elem_by_expand, e, where2);
801 }
802 
803 void
804 add_order_bit(int kw)
805 {
806 	uint8_t bit = DIRECTIVE_UNDEF;
807 
808 	switch (kw) {
809 	case T_FORWARD:
810 		bit = DIRECTIVE_FORWARD;
811 		break;
812 	case T_BACKWARD:
813 		bit = DIRECTIVE_BACKWARD;
814 		break;
815 	case T_POSITION:
816 		bit = DIRECTIVE_POSITION;
817 		break;
818 	default:
819 		INTERR;
820 		break;
821 	}
822 	collinfo.directive[collinfo.directive_count] |= bit;
823 }
824 
825 void
826 add_order_directive(void)
827 {
828 	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
829 		errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
830 	}
831 	collinfo.directive_count++;
832 }
833 
834 static void
835 add_order_pri(int32_t ref)
836 {
837 	if (curr_weight >= NUM_WT) {
838 		errf(_("too many weights (max %d)"), NUM_WT);
839 		return;
840 	}
841 	order_weights[curr_weight] = ref;
842 	curr_weight++;
843 }
844 
845 void
846 add_order_collsym(collsym_t *s)
847 {
848 	add_order_pri(s->ref);
849 }
850 
851 void
852 add_order_char(wchar_t wc)
853 {
854 	collchar_t *cc;
855 
856 	if ((cc = get_collchar(wc, 1)) == NULL) {
857 		INTERR;
858 		return;
859 	}
860 
861 	add_order_pri(cc->ref[curr_weight]);
862 }
863 
864 void
865 add_order_collelem(collelem_t *e)
866 {
867 	add_order_pri(e->ref[curr_weight]);
868 }
869 
870 void
871 add_order_ignore(void)
872 {
873 	add_order_pri(pri_ignore);
874 }
875 
876 void
877 add_order_symbol(char *sym)
878 {
879 	collundef_t *c;
880 	if ((c = get_collundef(sym)) == NULL) {
881 		INTERR;
882 		return;
883 	}
884 	add_order_pri(c->ref[curr_weight]);
885 }
886 
887 void
888 add_order_ellipsis(void)
889 {
890 	/* special 0 value indicates self reference */
891 	add_order_pri(0);
892 }
893 
894 void
895 add_order_subst(void)
896 {
897 	subst_t srch;
898 	subst_t	*s;
899 	avl_index_t where;
900 	int i;
901 
902 	(void) memset(&srch, 0, sizeof (srch));
903 	for (i = 0; i < curr_subst; i++) {
904 		srch.ref[i] = subst_weights[i];
905 		subst_weights[i] = 0;
906 	}
907 	s = avl_find(&substs_ref[curr_weight], &srch, &where);
908 
909 	if (s == NULL) {
910 		if ((s = calloc(1, sizeof (*s))) == NULL) {
911 			errf(_("out of memory"));
912 			return;
913 		}
914 		s->key = new_pri();
915 
916 		/*
917 		 * We use a self reference for our key, but we set a
918 		 * high bit to indicate that this is a substitution
919 		 * reference.  This will expedite table lookups later,
920 		 * and prevent table lookups for situations that don't
921 		 * require it.  (In short, its a big win, because we
922 		 * can skip a lot of binary searching.)
923 		 */
924 		set_pri(s->key,
925 		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
926 		    RESOLVED);
927 		nextsubst[curr_weight] += 1;
928 
929 		for (i = 0; i < curr_subst; i++) {
930 			s->ref[i] = srch.ref[i];
931 		}
932 
933 		avl_insert(&substs_ref[curr_weight], s, where);
934 
935 		if (avl_find(&substs[curr_weight], s, &where) != NULL) {
936 			INTERR;
937 			return;
938 		}
939 		avl_insert(&substs[curr_weight], s, where);
940 	}
941 	curr_subst = 0;
942 
943 
944 	/*
945 	 * We are using the current (unique) priority as a search key
946 	 * in the substitution table.
947 	 */
948 	add_order_pri(s->key);
949 }
950 
951 static void
952 add_subst_pri(int32_t ref)
953 {
954 	if (curr_subst >= COLLATE_STR_LEN) {
955 		errf(_("substitution string is too long"));
956 		return;
957 	}
958 	subst_weights[curr_subst] = ref;
959 	curr_subst++;
960 }
961 
962 void
963 add_subst_char(wchar_t wc)
964 {
965 	collchar_t *cc;
966 
967 
968 	if (((cc = get_collchar(wc, 1)) == NULL) ||
969 	    (cc->wc != wc)) {
970 		INTERR;
971 		return;
972 	}
973 	/* we take the weight for the character at that position */
974 	add_subst_pri(cc->ref[curr_weight]);
975 }
976 
977 void
978 add_subst_collelem(collelem_t *e)
979 {
980 	add_subst_pri(e->ref[curr_weight]);
981 }
982 
983 void
984 add_subst_collsym(collsym_t *s)
985 {
986 	add_subst_pri(s->ref);
987 }
988 
989 void
990 add_subst_symbol(char *ptr)
991 {
992 	collundef_t *cu;
993 
994 	if ((cu = get_collundef(ptr)) != NULL) {
995 		add_subst_pri(cu->ref[curr_weight]);
996 	}
997 }
998 
999 void
1000 add_weight(int32_t ref, int pass)
1001 {
1002 	weight_t srch;
1003 	weight_t *w;
1004 	avl_index_t where;
1005 
1006 	srch.pri = resolve_pri(ref);
1007 
1008 	/* No translation of ignores */
1009 	if (srch.pri == 0)
1010 		return;
1011 
1012 	/* Substitution priorities are not weights */
1013 	if (srch.pri & COLLATE_SUBST_PRIORITY)
1014 		return;
1015 
1016 	if (avl_find(&weights[pass], &srch, &where) != NULL)
1017 		return;
1018 
1019 	if ((w = calloc(1, sizeof (*w))) == NULL) {
1020 		errf(_("out of memory"));
1021 		return;
1022 	}
1023 	w->pri = srch.pri;
1024 	avl_insert(&weights[pass], w, where);
1025 }
1026 
1027 void
1028 add_weights(int32_t *refs)
1029 {
1030 	int i;
1031 	for (i = 0; i < NUM_WT; i++) {
1032 		add_weight(refs[i], i);
1033 	}
1034 }
1035 
1036 int32_t
1037 get_weight(int32_t ref, int pass)
1038 {
1039 	weight_t	srch;
1040 	weight_t	*w;
1041 	int32_t		pri;
1042 
1043 	pri = resolve_pri(ref);
1044 	if (pri & COLLATE_SUBST_PRIORITY) {
1045 		return (pri);
1046 	}
1047 	if (pri <= 0) {
1048 		return (pri);
1049 	}
1050 	srch.pri = pri;
1051 	if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1052 		INTERR;
1053 		return (-1);
1054 	}
1055 	return (w->opt);
1056 }
1057 
1058 void
1059 dump_collate(void)
1060 {
1061 	FILE			*f;
1062 	int			i, j, n;
1063 	size_t			sz;
1064 	int32_t			pri;
1065 	collelem_t		*ce;
1066 	collchar_t		*cc;
1067 	subst_t			*sb;
1068 	char			vers[COLLATE_STR_LEN];
1069 	collate_char_t		chars[UCHAR_MAX + 1];
1070 	collate_large_t		*large;
1071 	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
1072 	collate_chain_t		*chain;
1073 
1074 	/*
1075 	 * We have to run through a preliminary pass to identify all the
1076 	 * weights that we use for each sorting level.
1077 	 */
1078 	for (i = 0; i < NUM_WT; i++) {
1079 		add_weight(pri_ignore, i);
1080 	}
1081 	for (i = 0; i < NUM_WT; i++) {
1082 		for (sb = avl_first(&substs[i]); sb;
1083 		    sb = AVL_NEXT(&substs[i], sb)) {
1084 			for (j = 0; sb->ref[j]; j++) {
1085 				add_weight(sb->ref[j], i);
1086 			}
1087 		}
1088 	}
1089 	for (ce = avl_first(&elem_by_expand);
1090 	    ce != NULL;
1091 	    ce = AVL_NEXT(&elem_by_expand, ce)) {
1092 		add_weights(ce->ref);
1093 	}
1094 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1095 		add_weights(cc->ref);
1096 	}
1097 
1098 	/*
1099 	 * Now we walk the entire set of weights, removing the gaps
1100 	 * in the weights.  This gives us optimum usage.  The walk
1101 	 * occurs in priority.
1102 	 */
1103 	for (i = 0; i < NUM_WT; i++) {
1104 		weight_t *w;
1105 		for (w = avl_first(&weights[i]); w;
1106 		    w = AVL_NEXT(&weights[i], w)) {
1107 			w->opt = nweight[i];
1108 			nweight[i] += 1;
1109 		}
1110 	}
1111 
1112 	(void) memset(&chars, 0, sizeof (chars));
1113 	(void) memset(vers, 0, COLLATE_STR_LEN);
1114 	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1115 
1116 	/*
1117 	 * We need to make sure we arrange for the UNDEFINED field
1118 	 * to show up.  Also, set the total weight counts.
1119 	 */
1120 	for (i = 0; i < NUM_WT; i++) {
1121 		if (resolve_pri(pri_undefined[i]) == -1) {
1122 			set_pri(pri_undefined[i], -1, RESOLVED);
1123 			/* they collate at the end of everything else */
1124 			collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1125 		}
1126 		collinfo.pri_count[i] = nweight[i];
1127 	}
1128 
1129 	collinfo.pri_count[NUM_WT] = max_wide();
1130 	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1131 	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1132 
1133 	/*
1134 	 * Ordinary character priorities
1135 	 */
1136 	for (i = 0; i <= UCHAR_MAX; i++) {
1137 		if ((cc = get_collchar(i, 0)) != NULL) {
1138 			for (j = 0; j < NUM_WT; j++) {
1139 				chars[i].pri[j] = get_weight(cc->ref[j], j);
1140 			}
1141 		} else {
1142 			for (j = 0; j < NUM_WT; j++) {
1143 				chars[i].pri[j] =
1144 				    get_weight(pri_undefined[j], j);
1145 			}
1146 			/*
1147 			 * Per POSIX, for undefined characters, we
1148 			 * also have to add a last item, which is the
1149 			 * character code.
1150 			 */
1151 			chars[i].pri[NUM_WT] = i;
1152 		}
1153 	}
1154 
1155 	/*
1156 	 * Substitution tables
1157 	 */
1158 	for (i = 0; i < NUM_WT; i++) {
1159 		collate_subst_t *st = NULL;
1160 		n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1161 		if ((st = calloc(n, sizeof (collate_subst_t))) == NULL) {
1162 			errf(_("out of memory"));
1163 			return;
1164 		}
1165 		n = 0;
1166 		for (sb = avl_first(&substs[i]); sb;
1167 		    sb = AVL_NEXT(&substs[i], sb)) {
1168 			if ((st[n].key = resolve_pri(sb->key)) < 0) {
1169 				/* by definition these resolve! */
1170 				INTERR;
1171 			}
1172 			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1173 				INTERR;
1174 			}
1175 			for (j = 0; sb->ref[j]; j++) {
1176 				st[n].pri[j] = get_weight(sb->ref[j], i);
1177 			}
1178 			n++;
1179 		}
1180 		if (n != collinfo.subst_count[i])
1181 			INTERR;
1182 		subst[i] = st;
1183 	}
1184 
1185 
1186 	/*
1187 	 * Chains, i.e. collating elements
1188 	 */
1189 	collinfo.chain_count = avl_numnodes(&elem_by_expand);
1190 	chain = calloc(collinfo.chain_count, sizeof (collate_chain_t));
1191 	if (chain == NULL) {
1192 		errf(_("out of memory"));
1193 		return;
1194 	}
1195 	for (n = 0, ce = avl_first(&elem_by_expand);
1196 	    ce != NULL;
1197 	    ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1198 		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1199 		for (i = 0; i < NUM_WT; i++) {
1200 			chain[n].pri[i] = get_weight(ce->ref[i], i);
1201 		}
1202 	}
1203 	if (n != collinfo.chain_count)
1204 		INTERR;
1205 
1206 	/*
1207 	 * Large (> UCHAR_MAX) character priorities
1208 	 */
1209 	large = calloc(avl_numnodes(&collchars), sizeof (collate_large_t));
1210 	if (large == NULL) {
1211 		errf(_("out of memory"));
1212 		return;
1213 	}
1214 
1215 	i = 0;
1216 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1217 		int	undef = 0;
1218 		/* we already gathered those */
1219 		if (cc->wc <= UCHAR_MAX)
1220 			continue;
1221 		for (j = 0; j < NUM_WT; j++) {
1222 			if ((pri = get_weight(cc->ref[j], j)) < 0) {
1223 				undef = 1;
1224 			}
1225 			if (undef && (pri >= 0)) {
1226 				/* if undefined, then all priorities are */
1227 				INTERR;
1228 			} else {
1229 				large[i].pri.pri[j] = pri;
1230 			}
1231 		}
1232 		if (!undef) {
1233 			large[i].val = cc->wc;
1234 			collinfo.large_count = i++;
1235 		}
1236 	}
1237 
1238 	if ((f = open_category()) == NULL) {
1239 		return;
1240 	}
1241 
1242 	/* Time to write the entire data set out */
1243 
1244 	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1245 	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1246 	    (wr_category(&chars, sizeof (chars), f) < 0)) {
1247 		delete_category(f);
1248 		return;
1249 	}
1250 
1251 	for (i = 0; i < NUM_WT; i++) {
1252 		sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1253 		if (wr_category(subst[i], sz, f) < 0) {
1254 			delete_category(f);
1255 			return;
1256 		}
1257 	}
1258 	sz = sizeof (collate_chain_t) * collinfo.chain_count;
1259 	if (wr_category(chain, sz, f) < 0) {
1260 		delete_category(f);
1261 		return;
1262 	}
1263 	sz = sizeof (collate_large_t) * collinfo.large_count;
1264 	if (wr_category(large, sz, f) < 0) {
1265 		delete_category(f);
1266 		return;
1267 	}
1268 
1269 	close_category(f);
1270 }
1271