xref: /titanic_50/usr/src/cmd/localedef/collate.c (revision 1b1b6fbd07b95cdbe86f7b731a2fc22050bb8ae8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14  */
15 
16 /*
17  * LC_COLLATE database generation routines for localedef.
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <wchar.h>
29 #include <widec.h>
30 #include <limits.h>
31 #include "localedef.h"
32 #include "parser.tab.h"
33 #include "collate.h"
34 
35 /*
36  * Design notes.
37  *
38  * It will be extremely helpful to the reader if they have access to
39  * the localedef and locale file format specifications available.
40  * Latest versions of these are available from www.opengroup.org.
41  *
42  * The design for the collation code is a bit complex.  The goal is a
43  * single collation database as described in collate.h (in
44  * libc/port/locale).  However, there are some other tidbits:
45  *
46  * a) The substitution entries are now a directly indexable array.  A
47  * priority elsewhere in the table is taken as an index into the
48  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49  * set.  (The bit is cleared and the result is the index into the
50  * table.
51  *
52  * b) We eliminate duplicate entries into the substitution table.
53  * This saves a lot of space.
54  *
55  * c) The priorities for each level are "compressed", so that each
56  * sorting level has consecutively numbered priorities starting at 1.
57  * (O is reserved for the ignore priority.)  This means sort levels
58  * which only have a few distinct priorities can represent the
59  * priority level in fewer bits, which makes the strxfrm output
60  * smaller.
61  *
62  * d) We record the total number of priorities so that strxfrm can
63  * figure out how many bytes to expand a numeric priority into.
64  *
65  * e) For the UNDEFINED pass (the last pass), we record the maximum
66  * number of bits needed to uniquely prioritize these entries, so that
67  * the last pass can also use smaller strxfrm output when possible.
68  *
69  * f) Priorities with the sign bit set are verboten.  This works out
70  * because no active character set needs that bit to carry significant
71  * information once the character is in wide form.
72  *
73  * To process the entire data to make the database, we actually run
74  * multiple passes over the data.
75  *
76  * The first pass, which is done at parse time, identifies elements,
77  * substitutions, and such, and records them in priority order.  As
78  * some priorities can refer to other priorities, using forward
79  * references, we use a table of references indicating whether the
80  * priority's value has been resolved, or whether it is still a
81  * reference.
82  *
83  * The second pass walks over all the items in priority order, noting
84  * that they are used directly, and not just an indirect reference.
85  * This is done by creating a "weight" structure for the item.  The
86  * weights are stashed in an AVL tree sorted by relative "priority".
87  *
88  * The third pass walks over all the weight structures, in priority
89  * order, and assigns a new monotonically increasing (per sort level)
90  * weight value to them.  These are the values that will actually be
91  * written to the file.
92  *
93  * The fourth pass just writes the data out.
94  */
95 
96 /*
97  * In order to resolve the priorities, we create a table of priorities.
98  * Entries in the table can be in one of three states.
99  *
100  * UNKNOWN is for newly allocated entries, and indicates that nothing
101  * is known about the priority.  (For example, when new entries are created
102  * for collating-symbols, this is the value assigned for them until the
103  * collating symbol's order has been determined.
104  *
105  * RESOLVED is used for an entry where the priority indicates the final
106  * numeric weight.
107  *
108  * REFER is used for entries that reference other entries.  Typically
109  * this is used for forward references.  A collating-symbol can never
110  * have this value.
111  *
112  * The "pass" field is used during final resolution to aid in detection
113  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
114  * priority dependent on <A>.)
115  */
116 typedef enum {
117 	UNKNOWN,	/* priority is totally unknown */
118 	RESOLVED,	/* priority value fully resolved */
119 	REFER		/* priority is a reference (index) */
120 } res_t;
121 
122 typedef struct weight {
123 	int32_t		pri;
124 	int		opt;
125 	avl_node_t	avl;
126 } weight_t;
127 
128 typedef struct priority {
129 	res_t		res;
130 	int32_t		pri;
131 	int		pass;
132 	int		lineno;
133 } collpri_t;
134 
135 #define	NUM_WT	collinfo.directive_count
136 
137 /*
138  * These are the abstract collating symbols, which are just a symbolic
139  * way to reference a priority.
140  */
141 struct collsym {
142 	char		*name;
143 	int32_t		ref;
144 	avl_node_t	avl;
145 };
146 
147 /*
148  * These are also abstract collating symbols, but we allow them to have
149  * different priorities at different levels.
150  */
151 typedef struct collundef {
152 	char		*name;
153 	int32_t		ref[COLL_WEIGHTS_MAX];
154 	avl_node_t	avl;
155 } collundef_t;
156 
157 /*
158  * These are called "chains" in libc.  This records the fact that two
159  * more characters should be treated as a single collating entity when
160  * they appear together.  For example, in Spanish <C><h> gets collated
161  * as a character between <C> and <D>.
162  */
163 struct collelem {
164 	char		*symbol;
165 	wchar_t		*expand;
166 	int32_t		ref[COLL_WEIGHTS_MAX];
167 	avl_node_t	avl_bysymbol;
168 	avl_node_t	avl_byexpand;
169 };
170 
171 /*
172  * Individual characters have a sequence of weights as well.
173  */
174 typedef struct collchar {
175 	wchar_t		wc;
176 	int32_t		ref[COLL_WEIGHTS_MAX];
177 	avl_node_t	avl;
178 } collchar_t;
179 
180 /*
181  * Substitution entries.  The key is itself a priority.  Note that
182  * when we create one of these, we *automatically* wind up with a
183  * fully resolved priority for the key, because creation of
184  * substitutions creates a resolved priority at the same time.
185  */
186 typedef struct {
187 	int32_t		key;
188 	int32_t		ref[COLLATE_STR_LEN];
189 	avl_node_t	avl;
190 	avl_node_t	avl_ref;
191 } subst_t;
192 
193 static avl_tree_t	collsyms;
194 static avl_tree_t	collundefs;
195 static avl_tree_t	elem_by_symbol;
196 static avl_tree_t	elem_by_expand;
197 static avl_tree_t	collchars;
198 static avl_tree_t	substs[COLL_WEIGHTS_MAX];
199 static avl_tree_t	substs_ref[COLL_WEIGHTS_MAX];
200 static avl_tree_t	weights[COLL_WEIGHTS_MAX];
201 static int32_t		nweight[COLL_WEIGHTS_MAX];
202 
203 /*
204  * This is state tracking for the ellipsis token.  Note that we start
205  * the initial values so that the ellipsis logic will think we got a
206  * magic starting value of NUL.  It starts at minus one because the
207  * starting point is exclusive -- i.e. the starting point is not
208  * itself handled by the ellipsis code.
209  */
210 static int currorder = EOF;
211 static int lastorder = EOF;
212 static collelem_t *currelem;
213 static collchar_t *currchar;
214 static collundef_t *currundef;
215 static wchar_t ellipsis_start = 0;
216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
217 
218 /*
219  * We keep a running tally of weights.
220  */
221 static int nextpri = 1;
222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
223 
224 /*
225  * This array collects up the weights for each level.
226  */
227 static int32_t order_weights[COLL_WEIGHTS_MAX];
228 static int curr_weight = 0;
229 static int32_t subst_weights[COLLATE_STR_LEN];
230 static int curr_subst = 0;
231 
232 /*
233  * Some initial priority values.
234  */
235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
236 static int32_t pri_ignore;
237 
238 static collate_info_t collinfo;
239 
240 static collpri_t	*prilist = NULL;
241 static int		numpri = 0;
242 static int		maxpri = 0;
243 
244 static void start_order(int);
245 
246 static int32_t
247 new_pri(void)
248 {
249 	int i;
250 
251 	if (numpri >= maxpri) {
252 		maxpri = maxpri ? maxpri * 2 : 1024;
253 		prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
254 		if (prilist == NULL) {
255 			errf(_("out of memory"));
256 			return (-1);
257 		}
258 		for (i = numpri; i < maxpri; i++) {
259 			prilist[i].res = UNKNOWN;
260 			prilist[i].pri = 0;
261 			prilist[i].pass = 0;
262 		}
263 	}
264 	return (numpri++);
265 }
266 
267 static collpri_t *
268 get_pri(int32_t ref)
269 {
270 	if ((ref < 0) || (ref > numpri)) {
271 		INTERR;
272 		return (NULL);
273 	}
274 	return (&prilist[ref]);
275 }
276 
277 static void
278 set_pri(int32_t ref, int32_t v, res_t res)
279 {
280 	collpri_t	*pri;
281 
282 	pri = get_pri(ref);
283 
284 	if ((res == REFER) && ((v < 0) || (v >= numpri))) {
285 		INTERR;
286 	}
287 
288 	/* Resolve self references */
289 	if ((res == REFER) && (ref == v)) {
290 		v = nextpri;
291 		res = RESOLVED;
292 	}
293 
294 	if (pri->res != UNKNOWN) {
295 		warn(_("repeated item in order list (first on %d)"),
296 		    pri->lineno);
297 		return;
298 	}
299 	pri->lineno = lineno;
300 	pri->pri = v;
301 	pri->res = res;
302 }
303 
304 static int32_t
305 resolve_pri(int32_t ref)
306 {
307 	collpri_t	*pri;
308 	static int32_t	pass = 0;
309 
310 	pri = get_pri(ref);
311 	pass++;
312 	while (pri->res == REFER) {
313 		if (pri->pass == pass) {
314 			/* report a line with the circular symbol */
315 			lineno = pri->lineno;
316 			errf(_("circular reference in order list"));
317 			return (-1);
318 		}
319 		if ((pri->pri < 0) || (pri->pri >= numpri)) {
320 			INTERR;
321 			return (-1);
322 		}
323 		pri->pass = pass;
324 		pri = &prilist[pri->pri];
325 	}
326 
327 	if (pri->res == UNKNOWN) {
328 		return (-1);
329 	}
330 	if (pri->res != RESOLVED)
331 		INTERR;
332 
333 	return (pri->pri);
334 }
335 
336 static int
337 weight_compare(const void *n1, const void *n2)
338 {
339 	int32_t	k1 = ((const weight_t *)n1)->pri;
340 	int32_t	k2 = ((const weight_t *)n2)->pri;
341 
342 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
343 }
344 
345 static int
346 collsym_compare(const void *n1, const void *n2)
347 {
348 	const collsym_t *c1 = n1;
349 	const collsym_t *c2 = n2;
350 	int rv;
351 
352 	rv = strcmp(c1->name, c2->name);
353 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
354 }
355 
356 static int
357 collundef_compare(const void *n1, const void *n2)
358 {
359 	const collundef_t *c1 = n1;
360 	const collundef_t *c2 = n2;
361 	int rv;
362 
363 	rv = strcmp(c1->name, c2->name);
364 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
365 }
366 
367 static int
368 element_compare_symbol(const void *n1, const void *n2)
369 {
370 	const collelem_t *c1 = n1;
371 	const collelem_t *c2 = n2;
372 	int rv;
373 
374 	rv = strcmp(c1->symbol, c2->symbol);
375 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
376 }
377 
378 static int
379 element_compare_expand(const void *n1, const void *n2)
380 {
381 	const collelem_t *c1 = n1;
382 	const collelem_t *c2 = n2;
383 	int rv;
384 
385 	rv = wcscmp(c1->expand, c2->expand);
386 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
387 }
388 
389 static int
390 collchar_compare(const void *n1, const void *n2)
391 {
392 	wchar_t	k1 = ((const collchar_t *)n1)->wc;
393 	wchar_t	k2 = ((const collchar_t *)n2)->wc;
394 
395 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
396 }
397 
398 static int
399 subst_compare(const void *n1, const void *n2)
400 {
401 	int32_t	k1 = ((const subst_t *)n1)->key;
402 	int32_t	k2 = ((const subst_t *)n2)->key;
403 
404 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
405 }
406 
407 static int
408 subst_compare_ref(const void *n1, const void *n2)
409 {
410 	int32_t *c1 = ((subst_t *)n1)->ref;
411 	int32_t *c2 = ((subst_t *)n2)->ref;
412 	int rv;
413 
414 	rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
415 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
416 }
417 
418 void
419 init_collate(void)
420 {
421 	int i;
422 
423 	avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
424 	    offsetof(collsym_t, avl));
425 
426 	avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
427 	    offsetof(collundef_t, avl));
428 
429 	avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
430 	    offsetof(collelem_t, avl_bysymbol));
431 	avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
432 	    offsetof(collelem_t, avl_byexpand));
433 
434 	avl_create(&collchars, collchar_compare, sizeof (collchar_t),
435 	    offsetof(collchar_t, avl));
436 
437 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
438 		avl_create(&substs[i], subst_compare, sizeof (subst_t),
439 		    offsetof(subst_t, avl));
440 		avl_create(&substs_ref[i], subst_compare_ref,
441 		    sizeof (subst_t), offsetof(subst_t, avl_ref));
442 		avl_create(&weights[i], weight_compare, sizeof (weight_t),
443 		    offsetof(weight_t, avl));
444 		nweight[i] = 1;
445 	}
446 
447 	(void) memset(&collinfo, 0, sizeof (collinfo));
448 
449 	/* allocate some initial priorities */
450 	pri_ignore = new_pri();
451 
452 	set_pri(pri_ignore, 0, RESOLVED);
453 
454 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
455 		pri_undefined[i] = new_pri();
456 
457 		/* we will override this later */
458 		set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
459 	}
460 }
461 
462 void
463 define_collsym(char *name)
464 {
465 	collsym_t	*sym;
466 	avl_index_t	where;
467 
468 	if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
469 		errf(_("out of memory"));
470 		return;
471 	}
472 	sym->name = name;
473 	sym->ref = new_pri();
474 
475 	if (avl_find(&collsyms, sym, &where) != NULL) {
476 		/*
477 		 * This should never happen because we are only called
478 		 * for undefined symbols.
479 		 */
480 		INTERR;
481 		return;
482 	}
483 	avl_insert(&collsyms, sym, where);
484 }
485 
486 collsym_t *
487 lookup_collsym(char *name)
488 {
489 	collsym_t	srch;
490 
491 	srch.name = name;
492 	return (avl_find(&collsyms, &srch, NULL));
493 }
494 
495 collelem_t *
496 lookup_collelem(char *symbol)
497 {
498 	collelem_t	srch;
499 
500 	srch.symbol = symbol;
501 	return (avl_find(&elem_by_symbol, &srch, NULL));
502 }
503 
504 static collundef_t *
505 get_collundef(char *name)
506 {
507 	collundef_t	srch;
508 	collundef_t	*ud;
509 	avl_index_t	where;
510 	int		i;
511 
512 	srch.name = name;
513 	if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
514 		if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
515 		    ((ud->name = strdup(name)) == NULL)) {
516 			errf(_("out of memory"));
517 			return (NULL);
518 		}
519 		for (i = 0; i < NUM_WT; i++) {
520 			ud->ref[i] = new_pri();
521 		}
522 		avl_insert(&collundefs, ud, where);
523 	}
524 	add_charmap_undefined(name);
525 	return (ud);
526 }
527 
528 static collchar_t *
529 get_collchar(wchar_t wc, int create)
530 {
531 	collchar_t	srch;
532 	collchar_t	*cc;
533 	avl_index_t	where;
534 	int		i;
535 
536 	srch.wc = wc;
537 	cc = avl_find(&collchars, &srch, &where);
538 	if ((cc == NULL) && create) {
539 		if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
540 			errf(_("out of memory"));
541 			return (NULL);
542 		}
543 		for (i = 0; i < NUM_WT; i++) {
544 			cc->ref[i] = new_pri();
545 		}
546 		cc->wc = wc;
547 		avl_insert(&collchars, cc, where);
548 	}
549 	return (cc);
550 }
551 
552 void
553 end_order_collsym(collsym_t *sym)
554 {
555 	start_order(T_COLLSYM);
556 	/* update the weight */
557 
558 	set_pri(sym->ref, nextpri, RESOLVED);
559 	nextpri++;
560 }
561 
562 void
563 end_order(void)
564 {
565 	int		i;
566 	int32_t		pri;
567 	int32_t		ref;
568 	collpri_t	*p;
569 
570 	/* advance the priority/weight */
571 	pri = nextpri;
572 
573 	switch (currorder) {
574 	case T_CHAR:
575 		for (i = 0; i < NUM_WT; i++) {
576 			if (((ref = order_weights[i]) < 0) ||
577 			    ((p = get_pri(ref)) == NULL) ||
578 			    (p->pri == -1)) {
579 				/* unspecified weight is a self reference */
580 				set_pri(currchar->ref[i], pri, RESOLVED);
581 			} else {
582 				set_pri(currchar->ref[i], ref, REFER);
583 			}
584 			order_weights[i] = -1;
585 		}
586 
587 		/* leave a cookie trail in case next symbol is ellipsis */
588 		ellipsis_start = currchar->wc + 1;
589 		currchar = NULL;
590 		break;
591 
592 	case T_ELLIPSIS:
593 		/* save off the weights were we can find them */
594 		for (i = 0; i < NUM_WT; i++) {
595 			ellipsis_weights[i] = order_weights[i];
596 			order_weights[i] = -1;
597 		}
598 		break;
599 
600 	case T_COLLELEM:
601 		if (currelem == NULL) {
602 			INTERR;
603 		} else {
604 			for (i = 0; i < NUM_WT; i++) {
605 
606 				if (((ref = order_weights[i]) < 0) ||
607 				    ((p = get_pri(ref)) == NULL) ||
608 				    (p->pri == -1)) {
609 					set_pri(currelem->ref[i], pri,
610 					    RESOLVED);
611 				} else {
612 					set_pri(currelem->ref[i], ref, REFER);
613 				}
614 				order_weights[i] = -1;
615 			}
616 		}
617 		break;
618 
619 	case T_UNDEFINED:
620 		for (i = 0; i < NUM_WT; i++) {
621 			if (((ref = order_weights[i]) < 0) ||
622 			    ((p = get_pri(ref)) == NULL) ||
623 			    (p->pri == -1)) {
624 				set_pri(pri_undefined[i], -1, RESOLVED);
625 			} else {
626 				set_pri(pri_undefined[i], ref, REFER);
627 			}
628 			order_weights[i] = -1;
629 		}
630 		break;
631 
632 	case T_SYMBOL:
633 		if (((ref = order_weights[i]) < 0) ||
634 		    ((p = get_pri(ref)) == NULL) ||
635 		    (p->pri == -1)) {
636 			set_pri(currundef->ref[i], pri, RESOLVED);
637 		} else {
638 			set_pri(currundef->ref[i], ref, REFER);
639 		}
640 		order_weights[i] = -1;
641 		break;
642 
643 	default:
644 		INTERR;
645 	}
646 
647 	nextpri++;
648 }
649 
650 static void
651 start_order(int type)
652 {
653 	int	i;
654 
655 	lastorder = currorder;
656 	currorder = type;
657 
658 	/* this is used to protect ELLIPSIS processing */
659 	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
660 		errf(_("character value expected"));
661 	}
662 
663 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
664 		order_weights[i] = -1;
665 	}
666 	curr_weight = 0;
667 }
668 
669 void
670 start_order_undefined(void)
671 {
672 	start_order(T_UNDEFINED);
673 }
674 
675 void
676 start_order_symbol(char *name)
677 {
678 	currundef = get_collundef(name);
679 	start_order(T_SYMBOL);
680 }
681 
682 void
683 start_order_char(wchar_t wc)
684 {
685 	collchar_t	*cc;
686 	int32_t		ref;
687 
688 	start_order(T_CHAR);
689 
690 	/*
691 	 * If we last saw an ellipsis, then we need to close the range.
692 	 * Handle that here.  Note that we have to be careful because the
693 	 * items *inside* the range are treated exclusiveley to the items
694 	 * outside of the range.  The ends of the range can have quite
695 	 * different weights than the range members.
696 	 */
697 	if (lastorder == T_ELLIPSIS) {
698 		int		i;
699 
700 		if (wc < ellipsis_start) {
701 			errf(_("malformed range!"));
702 			return;
703 		}
704 		while (ellipsis_start < wc) {
705 			/*
706 			 * pick all of the saved weights for the
707 			 * ellipsis.  note that -1 encodes for the
708 			 * ellipsis itself, which means to take the
709 			 * current relative priority.
710 			 */
711 			if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
712 				INTERR;
713 				return;
714 			}
715 			for (i = 0; i < NUM_WT; i++) {
716 				collpri_t *p;
717 				if (((ref = ellipsis_weights[i]) == -1) ||
718 				    ((p = get_pri(ref)) == NULL) ||
719 				    (p->pri == -1)) {
720 					set_pri(cc->ref[i], nextpri, RESOLVED);
721 				} else {
722 					set_pri(cc->ref[i], ref, REFER);
723 				}
724 				ellipsis_weights[i] = NULL;
725 			}
726 			ellipsis_start++;
727 			nextpri++;
728 		}
729 	}
730 
731 	currchar = get_collchar(wc, 1);
732 }
733 
734 void
735 start_order_collelem(collelem_t *e)
736 {
737 	start_order(T_COLLELEM);
738 	currelem = e;
739 }
740 
741 void
742 start_order_ellipsis(void)
743 {
744 	int	i;
745 
746 	start_order(T_ELLIPSIS);
747 
748 	if (lastorder != T_CHAR) {
749 		errf(_("illegal starting point for range"));
750 		return;
751 	}
752 
753 	for (i = 0; i < NUM_WT; i++) {
754 		ellipsis_weights[i] = order_weights[i];
755 	}
756 }
757 
758 void
759 define_collelem(char *name, wchar_t *wcs)
760 {
761 	collelem_t	*e;
762 	avl_index_t	where1;
763 	avl_index_t	where2;
764 	int		i;
765 
766 	if (wcslen(wcs) >= COLLATE_STR_LEN) {
767 		errf(_("expanded collation element too long"));
768 		return;
769 	}
770 
771 	if ((e = calloc(sizeof (*e), 1)) == NULL) {
772 		errf(_("out of memory"));
773 		return;
774 	}
775 	e->expand = wcs;
776 	e->symbol = name;
777 
778 	/*
779 	 * This is executed before the order statement, so we don't
780 	 * know how many priorities we *really* need.  We allocate one
781 	 * for each possible weight.  Not a big deal, as collating-elements
782 	 * prove to be quite rare.
783 	 */
784 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
785 		e->ref[i] = new_pri();
786 	}
787 
788 	/* A character sequence can only reduce to one element. */
789 	if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
790 	    (avl_find(&elem_by_expand, e, &where2) != NULL)) {
791 		errf(_("duplicate collating element definition"));
792 		return;
793 	}
794 	avl_insert(&elem_by_symbol, e, where1);
795 	avl_insert(&elem_by_expand, e, where2);
796 }
797 
798 void
799 add_order_bit(int kw)
800 {
801 	uint8_t bit = DIRECTIVE_UNDEF;
802 
803 	switch (kw) {
804 	case T_FORWARD:
805 		bit = DIRECTIVE_FORWARD;
806 		break;
807 	case T_BACKWARD:
808 		bit = DIRECTIVE_BACKWARD;
809 		break;
810 	case T_POSITION:
811 		bit = DIRECTIVE_POSITION;
812 		break;
813 	default:
814 		INTERR;
815 		break;
816 	}
817 	collinfo.directive[collinfo.directive_count] |= bit;
818 }
819 
820 void
821 add_order_directive(void)
822 {
823 	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
824 		errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
825 	}
826 	collinfo.directive_count++;
827 }
828 
829 static void
830 add_order_pri(int32_t ref)
831 {
832 	if (curr_weight >= NUM_WT) {
833 		errf(_("too many weights (max %d)"), NUM_WT);
834 		return;
835 	}
836 	order_weights[curr_weight] = ref;
837 	curr_weight++;
838 }
839 
840 void
841 add_order_collsym(collsym_t *s)
842 {
843 	add_order_pri(s->ref);
844 }
845 
846 void
847 add_order_char(wchar_t wc)
848 {
849 	collchar_t *cc;
850 
851 	if ((cc = get_collchar(wc, 1)) == NULL) {
852 		INTERR;
853 		return;
854 	}
855 
856 	add_order_pri(cc->ref[curr_weight]);
857 }
858 
859 void
860 add_order_collelem(collelem_t *e)
861 {
862 	add_order_pri(e->ref[curr_weight]);
863 }
864 
865 void
866 add_order_ignore(void)
867 {
868 	add_order_pri(pri_ignore);
869 }
870 
871 void
872 add_order_symbol(char *sym)
873 {
874 	collundef_t *c;
875 	if ((c = get_collundef(sym)) == NULL) {
876 		INTERR;
877 		return;
878 	}
879 	add_order_pri(c->ref[curr_weight]);
880 }
881 
882 void
883 add_order_ellipsis(void)
884 {
885 	/* special NULL value indicates self reference */
886 	add_order_pri(NULL);
887 }
888 
889 void
890 add_order_subst(void)
891 {
892 	subst_t srch;
893 	subst_t	*s;
894 	avl_index_t where;
895 	int i;
896 
897 	(void) memset(&srch, 0, sizeof (srch));
898 	for (i = 0; i < curr_subst; i++) {
899 		srch.ref[i] = subst_weights[i];
900 		subst_weights[i] = 0;
901 	}
902 	s = avl_find(&substs_ref[curr_weight], &srch, &where);
903 
904 	if (s == NULL) {
905 		if ((s = calloc(sizeof (*s), 1)) == NULL) {
906 			errf(_("out of memory"));
907 			return;
908 		}
909 		s->key = new_pri();
910 
911 		/*
912 		 * We use a self reference for our key, but we set a
913 		 * high bit to indicate that this is a substitution
914 		 * reference.  This will expedite table lookups later,
915 		 * and prevent table lookups for situations that don't
916 		 * require it.  (In short, its a big win, because we
917 		 * can skip a lot of binary searching.)
918 		 */
919 		set_pri(s->key,
920 		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
921 		    RESOLVED);
922 		nextsubst[curr_weight] += 1;
923 
924 		for (i = 0; i < curr_subst; i++) {
925 			s->ref[i] = srch.ref[i];
926 		}
927 
928 		avl_insert(&substs_ref[curr_weight], s, where);
929 
930 		if (avl_find(&substs[curr_weight], s, &where) != NULL) {
931 			INTERR;
932 			return;
933 		}
934 		avl_insert(&substs[curr_weight], s, where);
935 	}
936 	curr_subst = 0;
937 
938 
939 	/*
940 	 * We are using the current (unique) priority as a search key
941 	 * in the substitution table.
942 	 */
943 	add_order_pri(s->key);
944 }
945 
946 static void
947 add_subst_pri(int32_t ref)
948 {
949 	if (curr_subst >= COLLATE_STR_LEN) {
950 		errf(_("substitution string is too long"));
951 		return;
952 	}
953 	subst_weights[curr_subst] = ref;
954 	curr_subst++;
955 }
956 
957 void
958 add_subst_char(wchar_t wc)
959 {
960 	collchar_t *cc;
961 
962 
963 	if (((cc = get_collchar(wc, 1)) == NULL) ||
964 	    (cc->wc != wc)) {
965 		INTERR;
966 		return;
967 	}
968 	/* we take the weight for the character at that position */
969 	add_subst_pri(cc->ref[curr_weight]);
970 }
971 
972 void
973 add_subst_collelem(collelem_t *e)
974 {
975 	add_subst_pri(e->ref[curr_weight]);
976 }
977 
978 void
979 add_subst_collsym(collsym_t *s)
980 {
981 	add_subst_pri(s->ref);
982 }
983 
984 void
985 add_subst_symbol(char *ptr)
986 {
987 	collundef_t *cu;
988 
989 	if ((cu = get_collundef(ptr)) != NULL) {
990 		add_subst_pri(cu->ref[curr_weight]);
991 	}
992 }
993 
994 void
995 add_weight(int32_t ref, int pass)
996 {
997 	weight_t srch;
998 	weight_t *w;
999 	avl_index_t where;
1000 
1001 	srch.pri = resolve_pri(ref);
1002 
1003 	/* No translation of ignores */
1004 	if (srch.pri == 0)
1005 		return;
1006 
1007 	/* Substitution priorities are not weights */
1008 	if (srch.pri & COLLATE_SUBST_PRIORITY)
1009 		return;
1010 
1011 	if (avl_find(&weights[pass], &srch, &where) != NULL)
1012 		return;
1013 
1014 	if ((w = calloc(sizeof (*w), 1)) == NULL) {
1015 		errf(_("out of memory"));
1016 		return;
1017 	}
1018 	w->pri = srch.pri;
1019 	avl_insert(&weights[pass], w, where);
1020 }
1021 
1022 void
1023 add_weights(int32_t *refs)
1024 {
1025 	int i;
1026 	for (i = 0; i < NUM_WT; i++) {
1027 		add_weight(refs[i], i);
1028 	}
1029 }
1030 
1031 int32_t
1032 get_weight(int32_t ref, int pass)
1033 {
1034 	weight_t	srch;
1035 	weight_t	*w;
1036 	int32_t		pri;
1037 
1038 	pri = resolve_pri(ref);
1039 	if (pri & COLLATE_SUBST_PRIORITY) {
1040 		return (pri);
1041 	}
1042 	if (pri <= 0) {
1043 		return (pri);
1044 	}
1045 	srch.pri = pri;
1046 	if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1047 		INTERR;
1048 		return (-1);
1049 	}
1050 	return (w->opt);
1051 }
1052 
1053 void
1054 dump_collate(void)
1055 {
1056 	FILE			*f;
1057 	int			i, j, n;
1058 	size_t			sz;
1059 	int32_t			pri;
1060 	collelem_t		*ce;
1061 	collchar_t		*cc;
1062 	subst_t			*sb;
1063 	char			vers[COLLATE_STR_LEN];
1064 	collate_char_t		chars[UCHAR_MAX + 1];
1065 	collate_large_t		*large;
1066 	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
1067 	collate_chain_t		*chain;
1068 
1069 	/*
1070 	 * We have to run throught a preliminary pass to identify all the
1071 	 * weights that we use for each sorting level.
1072 	 */
1073 	for (i = 0; i < NUM_WT; i++) {
1074 		add_weight(pri_ignore, i);
1075 	}
1076 	for (i = 0; i < NUM_WT; i++) {
1077 		for (sb = avl_first(&substs[i]); sb;
1078 		    sb = AVL_NEXT(&substs[i], sb)) {
1079 			for (j = 0; sb->ref[j]; j++) {
1080 				add_weight(sb->ref[j], i);
1081 			}
1082 		}
1083 	}
1084 	for (ce = avl_first(&elem_by_expand);
1085 	    ce != NULL;
1086 	    ce = AVL_NEXT(&elem_by_expand, ce)) {
1087 		add_weights(ce->ref);
1088 	}
1089 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1090 		add_weights(cc->ref);
1091 	}
1092 
1093 	/*
1094 	 * Now we walk the entire set of weights, removing the gaps
1095 	 * in the weights.  This gives us optimum usage.  The walk
1096 	 * occurs in priority.
1097 	 */
1098 	for (i = 0; i < NUM_WT; i++) {
1099 		weight_t *w;
1100 		for (w = avl_first(&weights[i]); w;
1101 		    w = AVL_NEXT(&weights[i], w)) {
1102 			w->opt = nweight[i];
1103 			nweight[i] += 1;
1104 		}
1105 	}
1106 
1107 	(void) memset(&chars, 0, sizeof (chars));
1108 	(void) memset(vers, 0, COLLATE_STR_LEN);
1109 	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1110 
1111 	/*
1112 	 * We need to make sure we arrange for the UNDEFINED field
1113 	 * to show up.  Also, set the total weight counts.
1114 	 */
1115 	for (i = 0; i < NUM_WT; i++) {
1116 		if (resolve_pri(pri_undefined[i]) == -1) {
1117 			set_pri(pri_undefined[i], -1, RESOLVED);
1118 			/* they collate at the end of everything else */
1119 			collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1120 		}
1121 		collinfo.pri_count[i] = nweight[i];
1122 	}
1123 
1124 	collinfo.pri_count[NUM_WT] = max_wide();
1125 	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1126 	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1127 
1128 	/*
1129 	 * Ordinary character priorities
1130 	 */
1131 	for (i = 0; i <= UCHAR_MAX; i++) {
1132 		if ((cc = get_collchar(i, 0)) != NULL) {
1133 			for (j = 0; j < NUM_WT; j++) {
1134 				chars[i].pri[j] = get_weight(cc->ref[j], j);
1135 			}
1136 		} else {
1137 			for (j = 0; j < NUM_WT; j++) {
1138 				chars[i].pri[j] =
1139 				    get_weight(pri_undefined[j], j);
1140 			}
1141 			/*
1142 			 * Per POSIX, for undefined characters, we
1143 			 * also have to add a last item, which is the
1144 			 * character code.
1145 			 */
1146 			chars[i].pri[NUM_WT] = i;
1147 		}
1148 	}
1149 
1150 	/*
1151 	 * Substitution tables
1152 	 */
1153 	for (i = 0; i < NUM_WT; i++) {
1154 		collate_subst_t *st = NULL;
1155 		n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1156 		if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1157 			errf(_("out of memory"));
1158 			return;
1159 		}
1160 		n = 0;
1161 		for (sb = avl_first(&substs[i]); sb;
1162 		    sb = AVL_NEXT(&substs[i], sb)) {
1163 			if ((st[n].key = resolve_pri(sb->key)) < 0) {
1164 				/* by definition these resolve! */
1165 				INTERR;
1166 			}
1167 			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1168 				INTERR;
1169 			}
1170 			for (j = 0; sb->ref[j]; j++) {
1171 				st[n].pri[j] = get_weight(sb->ref[j], i);
1172 			}
1173 			n++;
1174 		}
1175 		if (n != collinfo.subst_count[i])
1176 			INTERR;
1177 		subst[i] = st;
1178 	}
1179 
1180 
1181 	/*
1182 	 * Chains, i.e. collating elements
1183 	 */
1184 	collinfo.chain_count = avl_numnodes(&elem_by_expand);
1185 	chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1186 	if (chain == NULL) {
1187 		errf(_("out of memory"));
1188 		return;
1189 	}
1190 	for (n = 0, ce = avl_first(&elem_by_expand);
1191 	    ce != NULL;
1192 	    ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1193 		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1194 		for (i = 0; i < NUM_WT; i++) {
1195 			chain[n].pri[i] = get_weight(ce->ref[i], i);
1196 		}
1197 	}
1198 	if (n != collinfo.chain_count)
1199 		INTERR;
1200 
1201 	/*
1202 	 * Large (> UCHAR_MAX) character priorities
1203 	 */
1204 	large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1205 	if (large == NULL) {
1206 		errf(_("out of memory"));
1207 		return;
1208 	}
1209 
1210 	i = 0;
1211 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1212 		int	undef = 0;
1213 		/* we already gathered those */
1214 		if (cc->wc <= UCHAR_MAX)
1215 			continue;
1216 		for (j = 0; j < NUM_WT; j++) {
1217 			if ((pri = get_weight(cc->ref[j], j)) < 0) {
1218 				undef = 1;
1219 			}
1220 			if (undef && (pri >= 0)) {
1221 				/* if undefined, then all priorities are */
1222 				INTERR;
1223 			} else {
1224 				large[i].pri.pri[j] = pri;
1225 			}
1226 		}
1227 		if (!undef) {
1228 			large[i].val = cc->wc;
1229 			collinfo.large_count = i++;
1230 		}
1231 	}
1232 
1233 	if ((f = open_category()) == NULL) {
1234 		return;
1235 	}
1236 
1237 	/* Time to write the entire data set out */
1238 
1239 	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1240 	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1241 	    (wr_category(&chars, sizeof (chars), f) < 0)) {
1242 		return;
1243 	}
1244 
1245 	for (i = 0; i < NUM_WT; i++) {
1246 		sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1247 		if (wr_category(subst[i], sz, f) < 0) {
1248 			return;
1249 		}
1250 	}
1251 	sz = sizeof (collate_chain_t) * collinfo.chain_count;
1252 	if (wr_category(chain, sz, f) < 0) {
1253 		return;
1254 	}
1255 	sz = sizeof (collate_large_t) * collinfo.large_count;
1256 	if (wr_category(large, sz, f) < 0) {
1257 		return;
1258 	}
1259 
1260 	close_category(f);
1261 }
1262