1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 */
15
16 /*
17 * LC_COLLATE database generation routines for localedef.
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <wchar.h>
29 #include <widec.h>
30 #include <limits.h>
31 #include "localedef.h"
32 #include "parser.tab.h"
33 #include "collatefile.h"
34
35 /*
36 * Design notes.
37 *
38 * It will be extremely helpful to the reader if they have access to
39 * the localedef and locale file format specifications available.
40 * Latest versions of these are available from www.opengroup.org.
41 *
42 * The design for the collation code is a bit complex. The goal is a
43 * single collation database as described in collate.h (in
44 * libc/port/locale). However, there are some other tidbits:
45 *
46 * a) The substitution entries are now a directly indexable array. A
47 * priority elsewhere in the table is taken as an index into the
48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49 * set. (The bit is cleared and the result is the index into the
50 * table.
51 *
52 * b) We eliminate duplicate entries into the substitution table.
53 * This saves a lot of space.
54 *
55 * c) The priorities for each level are "compressed", so that each
56 * sorting level has consecutively numbered priorities starting at 1.
57 * (O is reserved for the ignore priority.) This means sort levels
58 * which only have a few distinct priorities can represent the
59 * priority level in fewer bits, which makes the strxfrm output
60 * smaller.
61 *
62 * d) We record the total number of priorities so that strxfrm can
63 * figure out how many bytes to expand a numeric priority into.
64 *
65 * e) For the UNDEFINED pass (the last pass), we record the maximum
66 * number of bits needed to uniquely prioritize these entries, so that
67 * the last pass can also use smaller strxfrm output when possible.
68 *
69 * f) Priorities with the sign bit set are verboten. This works out
70 * because no active character set needs that bit to carry significant
71 * information once the character is in wide form.
72 *
73 * To process the entire data to make the database, we actually run
74 * multiple passes over the data.
75 *
76 * The first pass, which is done at parse time, identifies elements,
77 * substitutions, and such, and records them in priority order. As
78 * some priorities can refer to other priorities, using forward
79 * references, we use a table of references indicating whether the
80 * priority's value has been resolved, or whether it is still a
81 * reference.
82 *
83 * The second pass walks over all the items in priority order, noting
84 * that they are used directly, and not just an indirect reference.
85 * This is done by creating a "weight" structure for the item. The
86 * weights are stashed in an AVL tree sorted by relative "priority".
87 *
88 * The third pass walks over all the weight structures, in priority
89 * order, and assigns a new monotonically increasing (per sort level)
90 * weight value to them. These are the values that will actually be
91 * written to the file.
92 *
93 * The fourth pass just writes the data out.
94 */
95
96 /*
97 * In order to resolve the priorities, we create a table of priorities.
98 * Entries in the table can be in one of three states.
99 *
100 * UNKNOWN is for newly allocated entries, and indicates that nothing
101 * is known about the priority. (For example, when new entries are created
102 * for collating-symbols, this is the value assigned for them until the
103 * collating symbol's order has been determined.
104 *
105 * RESOLVED is used for an entry where the priority indicates the final
106 * numeric weight.
107 *
108 * REFER is used for entries that reference other entries. Typically
109 * this is used for forward references. A collating-symbol can never
110 * have this value.
111 *
112 * The "pass" field is used during final resolution to aid in detection
113 * of referencing loops. (For example <A> depends on <B>, but <B> has its
114 * priority dependent on <A>.)
115 */
116 typedef enum {
117 UNKNOWN, /* priority is totally unknown */
118 RESOLVED, /* priority value fully resolved */
119 REFER /* priority is a reference (index) */
120 } res_t;
121
122 typedef struct weight {
123 int32_t pri;
124 int opt;
125 avl_node_t avl;
126 } weight_t;
127
128 typedef struct priority {
129 res_t res;
130 int32_t pri;
131 int pass;
132 int lineno;
133 } collpri_t;
134
135 #define NUM_WT collinfo.directive_count
136
137 /*
138 * These are the abstract collating symbols, which are just a symbolic
139 * way to reference a priority.
140 */
141 struct collsym {
142 char *name;
143 int32_t ref;
144 avl_node_t avl;
145 };
146
147 /*
148 * These are also abstract collating symbols, but we allow them to have
149 * different priorities at different levels.
150 */
151 typedef struct collundef {
152 char *name;
153 int32_t ref[COLL_WEIGHTS_MAX];
154 avl_node_t avl;
155 } collundef_t;
156
157 /*
158 * These are called "chains" in libc. This records the fact that two
159 * more characters should be treated as a single collating entity when
160 * they appear together. For example, in Spanish <C><h> gets collated
161 * as a character between <C> and <D>.
162 */
163 struct collelem {
164 char *symbol;
165 wchar_t *expand;
166 int32_t ref[COLL_WEIGHTS_MAX];
167 avl_node_t avl_bysymbol;
168 avl_node_t avl_byexpand;
169 };
170
171 /*
172 * Individual characters have a sequence of weights as well.
173 */
174 typedef struct collchar {
175 wchar_t wc;
176 int32_t ref[COLL_WEIGHTS_MAX];
177 avl_node_t avl;
178 } collchar_t;
179
180 /*
181 * Substitution entries. The key is itself a priority. Note that
182 * when we create one of these, we *automatically* wind up with a
183 * fully resolved priority for the key, because creation of
184 * substitutions creates a resolved priority at the same time.
185 */
186 typedef struct {
187 int32_t key;
188 int32_t ref[COLLATE_STR_LEN];
189 avl_node_t avl;
190 avl_node_t avl_ref;
191 } subst_t;
192
193 static avl_tree_t collsyms;
194 static avl_tree_t collundefs;
195 static avl_tree_t elem_by_symbol;
196 static avl_tree_t elem_by_expand;
197 static avl_tree_t collchars;
198 static avl_tree_t substs[COLL_WEIGHTS_MAX];
199 static avl_tree_t substs_ref[COLL_WEIGHTS_MAX];
200 static avl_tree_t weights[COLL_WEIGHTS_MAX];
201 static int32_t nweight[COLL_WEIGHTS_MAX];
202
203 /*
204 * This is state tracking for the ellipsis token. Note that we start
205 * the initial values so that the ellipsis logic will think we got a
206 * magic starting value of NUL. It starts at minus one because the
207 * starting point is exclusive -- i.e. the starting point is not
208 * itself handled by the ellipsis code.
209 */
210 static int currorder = EOF;
211 static int lastorder = EOF;
212 static collelem_t *currelem;
213 static collchar_t *currchar;
214 static collundef_t *currundef;
215 static wchar_t ellipsis_start = 0;
216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
217
218 /*
219 * We keep a running tally of weights.
220 */
221 static int nextpri = 1;
222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
223
224 /*
225 * This array collects up the weights for each level.
226 */
227 static int32_t order_weights[COLL_WEIGHTS_MAX];
228 static int curr_weight = 0;
229 static int32_t subst_weights[COLLATE_STR_LEN];
230 static int curr_subst = 0;
231
232 /*
233 * Some initial priority values.
234 */
235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
236 static int32_t pri_ignore;
237
238 static collate_info_t collinfo;
239
240 static collpri_t *prilist = NULL;
241 static int numpri = 0;
242 static int maxpri = 0;
243
244 static void start_order(int);
245
246 static int32_t
new_pri(void)247 new_pri(void)
248 {
249 int i;
250
251 if (numpri >= maxpri) {
252 maxpri = maxpri ? maxpri * 2 : 1024;
253 prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
254 if (prilist == NULL) {
255 errf(_("out of memory"));
256 return (-1);
257 }
258 for (i = numpri; i < maxpri; i++) {
259 prilist[i].res = UNKNOWN;
260 prilist[i].pri = 0;
261 prilist[i].pass = 0;
262 }
263 }
264 return (numpri++);
265 }
266
267 static collpri_t *
get_pri(int32_t ref)268 get_pri(int32_t ref)
269 {
270 if ((ref < 0) || (ref > numpri)) {
271 INTERR;
272 return (NULL);
273 }
274 return (&prilist[ref]);
275 }
276
277 static void
set_pri(int32_t ref,int32_t v,res_t res)278 set_pri(int32_t ref, int32_t v, res_t res)
279 {
280 collpri_t *pri;
281
282 pri = get_pri(ref);
283
284 if ((res == REFER) && ((v < 0) || (v >= numpri))) {
285 INTERR;
286 }
287
288 /* Resolve self references */
289 if ((res == REFER) && (ref == v)) {
290 v = nextpri;
291 res = RESOLVED;
292 }
293
294 if (pri->res != UNKNOWN) {
295 warn(_("repeated item in order list (first on %d)"),
296 pri->lineno);
297 return;
298 }
299 pri->lineno = lineno;
300 pri->pri = v;
301 pri->res = res;
302 }
303
304 static int32_t
resolve_pri(int32_t ref)305 resolve_pri(int32_t ref)
306 {
307 collpri_t *pri;
308 static int32_t pass = 0;
309
310 pri = get_pri(ref);
311 pass++;
312 while (pri->res == REFER) {
313 if (pri->pass == pass) {
314 /* report a line with the circular symbol */
315 lineno = pri->lineno;
316 errf(_("circular reference in order list"));
317 return (-1);
318 }
319 if ((pri->pri < 0) || (pri->pri >= numpri)) {
320 INTERR;
321 return (-1);
322 }
323 pri->pass = pass;
324 pri = &prilist[pri->pri];
325 }
326
327 if (pri->res == UNKNOWN) {
328 return (-1);
329 }
330 if (pri->res != RESOLVED)
331 INTERR;
332
333 return (pri->pri);
334 }
335
336 static int
weight_compare(const void * n1,const void * n2)337 weight_compare(const void *n1, const void *n2)
338 {
339 int32_t k1 = ((const weight_t *)n1)->pri;
340 int32_t k2 = ((const weight_t *)n2)->pri;
341
342 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
343 }
344
345 static int
collsym_compare(const void * n1,const void * n2)346 collsym_compare(const void *n1, const void *n2)
347 {
348 const collsym_t *c1 = n1;
349 const collsym_t *c2 = n2;
350 int rv;
351
352 rv = strcmp(c1->name, c2->name);
353 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
354 }
355
356 static int
collundef_compare(const void * n1,const void * n2)357 collundef_compare(const void *n1, const void *n2)
358 {
359 const collundef_t *c1 = n1;
360 const collundef_t *c2 = n2;
361 int rv;
362
363 rv = strcmp(c1->name, c2->name);
364 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
365 }
366
367 static int
element_compare_symbol(const void * n1,const void * n2)368 element_compare_symbol(const void *n1, const void *n2)
369 {
370 const collelem_t *c1 = n1;
371 const collelem_t *c2 = n2;
372 int rv;
373
374 rv = strcmp(c1->symbol, c2->symbol);
375 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
376 }
377
378 static int
element_compare_expand(const void * n1,const void * n2)379 element_compare_expand(const void *n1, const void *n2)
380 {
381 const collelem_t *c1 = n1;
382 const collelem_t *c2 = n2;
383 int rv;
384
385 rv = wcscmp(c1->expand, c2->expand);
386 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
387 }
388
389 static int
collchar_compare(const void * n1,const void * n2)390 collchar_compare(const void *n1, const void *n2)
391 {
392 wchar_t k1 = ((const collchar_t *)n1)->wc;
393 wchar_t k2 = ((const collchar_t *)n2)->wc;
394
395 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
396 }
397
398 static int
subst_compare(const void * n1,const void * n2)399 subst_compare(const void *n1, const void *n2)
400 {
401 int32_t k1 = ((const subst_t *)n1)->key;
402 int32_t k2 = ((const subst_t *)n2)->key;
403
404 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
405 }
406
407 static int
subst_compare_ref(const void * n1,const void * n2)408 subst_compare_ref(const void *n1, const void *n2)
409 {
410 int32_t *c1 = ((subst_t *)n1)->ref;
411 int32_t *c2 = ((subst_t *)n2)->ref;
412 int rv;
413
414 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
415 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
416 }
417
418 void
init_collate(void)419 init_collate(void)
420 {
421 int i;
422
423 avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
424 offsetof(collsym_t, avl));
425
426 avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
427 offsetof(collundef_t, avl));
428
429 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
430 offsetof(collelem_t, avl_bysymbol));
431 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
432 offsetof(collelem_t, avl_byexpand));
433
434 avl_create(&collchars, collchar_compare, sizeof (collchar_t),
435 offsetof(collchar_t, avl));
436
437 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
438 avl_create(&substs[i], subst_compare, sizeof (subst_t),
439 offsetof(subst_t, avl));
440 avl_create(&substs_ref[i], subst_compare_ref,
441 sizeof (subst_t), offsetof(subst_t, avl_ref));
442 avl_create(&weights[i], weight_compare, sizeof (weight_t),
443 offsetof(weight_t, avl));
444 nweight[i] = 1;
445 }
446
447 (void) memset(&collinfo, 0, sizeof (collinfo));
448
449 /* allocate some initial priorities */
450 pri_ignore = new_pri();
451
452 set_pri(pri_ignore, 0, RESOLVED);
453
454 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
455 pri_undefined[i] = new_pri();
456
457 /* we will override this later */
458 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
459 }
460 }
461
462 void
define_collsym(char * name)463 define_collsym(char *name)
464 {
465 collsym_t *sym;
466 avl_index_t where;
467
468 if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
469 errf(_("out of memory"));
470 return;
471 }
472 sym->name = name;
473 sym->ref = new_pri();
474
475 if (avl_find(&collsyms, sym, &where) != NULL) {
476 /*
477 * This should never happen because we are only called
478 * for undefined symbols.
479 */
480 INTERR;
481 return;
482 }
483 avl_insert(&collsyms, sym, where);
484 }
485
486 collsym_t *
lookup_collsym(char * name)487 lookup_collsym(char *name)
488 {
489 collsym_t srch;
490
491 srch.name = name;
492 return (avl_find(&collsyms, &srch, NULL));
493 }
494
495 collelem_t *
lookup_collelem(char * symbol)496 lookup_collelem(char *symbol)
497 {
498 collelem_t srch;
499
500 srch.symbol = symbol;
501 return (avl_find(&elem_by_symbol, &srch, NULL));
502 }
503
504 static collundef_t *
get_collundef(char * name)505 get_collundef(char *name)
506 {
507 collundef_t srch;
508 collundef_t *ud;
509 avl_index_t where;
510 int i;
511
512 srch.name = name;
513 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
514 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
515 ((ud->name = strdup(name)) == NULL)) {
516 errf(_("out of memory"));
517 return (NULL);
518 }
519 for (i = 0; i < NUM_WT; i++) {
520 ud->ref[i] = new_pri();
521 }
522 avl_insert(&collundefs, ud, where);
523 }
524 add_charmap_undefined(name);
525 return (ud);
526 }
527
528 static collchar_t *
get_collchar(wchar_t wc,int create)529 get_collchar(wchar_t wc, int create)
530 {
531 collchar_t srch;
532 collchar_t *cc;
533 avl_index_t where;
534 int i;
535
536 srch.wc = wc;
537 cc = avl_find(&collchars, &srch, &where);
538 if ((cc == NULL) && create) {
539 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
540 errf(_("out of memory"));
541 return (NULL);
542 }
543 for (i = 0; i < NUM_WT; i++) {
544 cc->ref[i] = new_pri();
545 }
546 cc->wc = wc;
547 avl_insert(&collchars, cc, where);
548 }
549 return (cc);
550 }
551
552 void
end_order_collsym(collsym_t * sym)553 end_order_collsym(collsym_t *sym)
554 {
555 start_order(T_COLLSYM);
556 /* update the weight */
557
558 set_pri(sym->ref, nextpri, RESOLVED);
559 nextpri++;
560 }
561
562 void
end_order(void)563 end_order(void)
564 {
565 int i;
566 int32_t pri;
567 int32_t ref;
568 collpri_t *p;
569
570 /* advance the priority/weight */
571 pri = nextpri;
572
573 switch (currorder) {
574 case T_CHAR:
575 for (i = 0; i < NUM_WT; i++) {
576 if (((ref = order_weights[i]) < 0) ||
577 ((p = get_pri(ref)) == NULL) ||
578 (p->pri == -1)) {
579 /* unspecified weight is a self reference */
580 set_pri(currchar->ref[i], pri, RESOLVED);
581 } else {
582 set_pri(currchar->ref[i], ref, REFER);
583 }
584 order_weights[i] = -1;
585 }
586
587 /* leave a cookie trail in case next symbol is ellipsis */
588 ellipsis_start = currchar->wc + 1;
589 currchar = NULL;
590 break;
591
592 case T_ELLIPSIS:
593 /* save off the weights were we can find them */
594 for (i = 0; i < NUM_WT; i++) {
595 ellipsis_weights[i] = order_weights[i];
596 order_weights[i] = -1;
597 }
598 break;
599
600 case T_COLLELEM:
601 if (currelem == NULL) {
602 INTERR;
603 } else {
604 for (i = 0; i < NUM_WT; i++) {
605
606 if (((ref = order_weights[i]) < 0) ||
607 ((p = get_pri(ref)) == NULL) ||
608 (p->pri == -1)) {
609 set_pri(currelem->ref[i], pri,
610 RESOLVED);
611 } else {
612 set_pri(currelem->ref[i], ref, REFER);
613 }
614 order_weights[i] = -1;
615 }
616 }
617 break;
618
619 case T_UNDEFINED:
620 for (i = 0; i < NUM_WT; i++) {
621 if (((ref = order_weights[i]) < 0) ||
622 ((p = get_pri(ref)) == NULL) ||
623 (p->pri == -1)) {
624 set_pri(pri_undefined[i], -1, RESOLVED);
625 } else {
626 set_pri(pri_undefined[i], ref, REFER);
627 }
628 order_weights[i] = -1;
629 }
630 break;
631
632 case T_SYMBOL:
633 for (i = 0; i < NUM_WT; i++) {
634 if (((ref = order_weights[i]) < 0) ||
635 ((p = get_pri(ref)) == NULL) ||
636 (p->pri == -1)) {
637 set_pri(currundef->ref[i], pri, RESOLVED);
638 } else {
639 set_pri(currundef->ref[i], ref, REFER);
640 }
641 order_weights[i] = -1;
642 }
643 break;
644
645 default:
646 INTERR;
647 }
648
649 nextpri++;
650 }
651
652 static void
start_order(int type)653 start_order(int type)
654 {
655 int i;
656
657 lastorder = currorder;
658 currorder = type;
659
660 /* this is used to protect ELLIPSIS processing */
661 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
662 errf(_("character value expected"));
663 }
664
665 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
666 order_weights[i] = -1;
667 }
668 curr_weight = 0;
669 }
670
671 void
start_order_undefined(void)672 start_order_undefined(void)
673 {
674 start_order(T_UNDEFINED);
675 }
676
677 void
start_order_symbol(char * name)678 start_order_symbol(char *name)
679 {
680 currundef = get_collundef(name);
681 start_order(T_SYMBOL);
682 }
683
684 void
start_order_char(wchar_t wc)685 start_order_char(wchar_t wc)
686 {
687 collchar_t *cc;
688 int32_t ref;
689
690 start_order(T_CHAR);
691
692 /*
693 * If we last saw an ellipsis, then we need to close the range.
694 * Handle that here. Note that we have to be careful because the
695 * items *inside* the range are treated exclusiveley to the items
696 * outside of the range. The ends of the range can have quite
697 * different weights than the range members.
698 */
699 if (lastorder == T_ELLIPSIS) {
700 int i;
701
702 if (wc < ellipsis_start) {
703 errf(_("malformed range!"));
704 return;
705 }
706 while (ellipsis_start < wc) {
707 /*
708 * pick all of the saved weights for the
709 * ellipsis. note that -1 encodes for the
710 * ellipsis itself, which means to take the
711 * current relative priority.
712 */
713 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
714 INTERR;
715 return;
716 }
717 for (i = 0; i < NUM_WT; i++) {
718 collpri_t *p;
719 if (((ref = ellipsis_weights[i]) == -1) ||
720 ((p = get_pri(ref)) == NULL) ||
721 (p->pri == -1)) {
722 set_pri(cc->ref[i], nextpri, RESOLVED);
723 } else {
724 set_pri(cc->ref[i], ref, REFER);
725 }
726 ellipsis_weights[i] = 0;
727 }
728 ellipsis_start++;
729 nextpri++;
730 }
731 }
732
733 currchar = get_collchar(wc, 1);
734 }
735
736 void
start_order_collelem(collelem_t * e)737 start_order_collelem(collelem_t *e)
738 {
739 start_order(T_COLLELEM);
740 currelem = e;
741 }
742
743 void
start_order_ellipsis(void)744 start_order_ellipsis(void)
745 {
746 int i;
747
748 start_order(T_ELLIPSIS);
749
750 if (lastorder != T_CHAR) {
751 errf(_("illegal starting point for range"));
752 return;
753 }
754
755 for (i = 0; i < NUM_WT; i++) {
756 ellipsis_weights[i] = order_weights[i];
757 }
758 }
759
760 void
define_collelem(char * name,wchar_t * wcs)761 define_collelem(char *name, wchar_t *wcs)
762 {
763 collelem_t *e;
764 avl_index_t where1;
765 avl_index_t where2;
766 int i;
767
768 if (wcslen(wcs) >= COLLATE_STR_LEN) {
769 errf(_("expanded collation element too long"));
770 return;
771 }
772
773 if ((e = calloc(sizeof (*e), 1)) == NULL) {
774 errf(_("out of memory"));
775 return;
776 }
777 e->expand = wcs;
778 e->symbol = name;
779
780 /*
781 * This is executed before the order statement, so we don't
782 * know how many priorities we *really* need. We allocate one
783 * for each possible weight. Not a big deal, as collating-elements
784 * prove to be quite rare.
785 */
786 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
787 e->ref[i] = new_pri();
788 }
789
790 /* A character sequence can only reduce to one element. */
791 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
792 (avl_find(&elem_by_expand, e, &where2) != NULL)) {
793 errf(_("duplicate collating element definition"));
794 return;
795 }
796 avl_insert(&elem_by_symbol, e, where1);
797 avl_insert(&elem_by_expand, e, where2);
798 }
799
800 void
add_order_bit(int kw)801 add_order_bit(int kw)
802 {
803 uint8_t bit = DIRECTIVE_UNDEF;
804
805 switch (kw) {
806 case T_FORWARD:
807 bit = DIRECTIVE_FORWARD;
808 break;
809 case T_BACKWARD:
810 bit = DIRECTIVE_BACKWARD;
811 break;
812 case T_POSITION:
813 bit = DIRECTIVE_POSITION;
814 break;
815 default:
816 INTERR;
817 break;
818 }
819 collinfo.directive[collinfo.directive_count] |= bit;
820 }
821
822 void
add_order_directive(void)823 add_order_directive(void)
824 {
825 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
826 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
827 }
828 collinfo.directive_count++;
829 }
830
831 static void
add_order_pri(int32_t ref)832 add_order_pri(int32_t ref)
833 {
834 if (curr_weight >= NUM_WT) {
835 errf(_("too many weights (max %d)"), NUM_WT);
836 return;
837 }
838 order_weights[curr_weight] = ref;
839 curr_weight++;
840 }
841
842 void
add_order_collsym(collsym_t * s)843 add_order_collsym(collsym_t *s)
844 {
845 add_order_pri(s->ref);
846 }
847
848 void
add_order_char(wchar_t wc)849 add_order_char(wchar_t wc)
850 {
851 collchar_t *cc;
852
853 if ((cc = get_collchar(wc, 1)) == NULL) {
854 INTERR;
855 return;
856 }
857
858 add_order_pri(cc->ref[curr_weight]);
859 }
860
861 void
add_order_collelem(collelem_t * e)862 add_order_collelem(collelem_t *e)
863 {
864 add_order_pri(e->ref[curr_weight]);
865 }
866
867 void
add_order_ignore(void)868 add_order_ignore(void)
869 {
870 add_order_pri(pri_ignore);
871 }
872
873 void
add_order_symbol(char * sym)874 add_order_symbol(char *sym)
875 {
876 collundef_t *c;
877 if ((c = get_collundef(sym)) == NULL) {
878 INTERR;
879 return;
880 }
881 add_order_pri(c->ref[curr_weight]);
882 }
883
884 void
add_order_ellipsis(void)885 add_order_ellipsis(void)
886 {
887 /* special 0 value indicates self reference */
888 add_order_pri(0);
889 }
890
891 void
add_order_subst(void)892 add_order_subst(void)
893 {
894 subst_t srch;
895 subst_t *s;
896 avl_index_t where;
897 int i;
898
899 (void) memset(&srch, 0, sizeof (srch));
900 for (i = 0; i < curr_subst; i++) {
901 srch.ref[i] = subst_weights[i];
902 subst_weights[i] = 0;
903 }
904 s = avl_find(&substs_ref[curr_weight], &srch, &where);
905
906 if (s == NULL) {
907 if ((s = calloc(sizeof (*s), 1)) == NULL) {
908 errf(_("out of memory"));
909 return;
910 }
911 s->key = new_pri();
912
913 /*
914 * We use a self reference for our key, but we set a
915 * high bit to indicate that this is a substitution
916 * reference. This will expedite table lookups later,
917 * and prevent table lookups for situations that don't
918 * require it. (In short, its a big win, because we
919 * can skip a lot of binary searching.)
920 */
921 set_pri(s->key,
922 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
923 RESOLVED);
924 nextsubst[curr_weight] += 1;
925
926 for (i = 0; i < curr_subst; i++) {
927 s->ref[i] = srch.ref[i];
928 }
929
930 avl_insert(&substs_ref[curr_weight], s, where);
931
932 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
933 INTERR;
934 return;
935 }
936 avl_insert(&substs[curr_weight], s, where);
937 }
938 curr_subst = 0;
939
940
941 /*
942 * We are using the current (unique) priority as a search key
943 * in the substitution table.
944 */
945 add_order_pri(s->key);
946 }
947
948 static void
add_subst_pri(int32_t ref)949 add_subst_pri(int32_t ref)
950 {
951 if (curr_subst >= COLLATE_STR_LEN) {
952 errf(_("substitution string is too long"));
953 return;
954 }
955 subst_weights[curr_subst] = ref;
956 curr_subst++;
957 }
958
959 void
add_subst_char(wchar_t wc)960 add_subst_char(wchar_t wc)
961 {
962 collchar_t *cc;
963
964
965 if (((cc = get_collchar(wc, 1)) == NULL) ||
966 (cc->wc != wc)) {
967 INTERR;
968 return;
969 }
970 /* we take the weight for the character at that position */
971 add_subst_pri(cc->ref[curr_weight]);
972 }
973
974 void
add_subst_collelem(collelem_t * e)975 add_subst_collelem(collelem_t *e)
976 {
977 add_subst_pri(e->ref[curr_weight]);
978 }
979
980 void
add_subst_collsym(collsym_t * s)981 add_subst_collsym(collsym_t *s)
982 {
983 add_subst_pri(s->ref);
984 }
985
986 void
add_subst_symbol(char * ptr)987 add_subst_symbol(char *ptr)
988 {
989 collundef_t *cu;
990
991 if ((cu = get_collundef(ptr)) != NULL) {
992 add_subst_pri(cu->ref[curr_weight]);
993 }
994 }
995
996 void
add_weight(int32_t ref,int pass)997 add_weight(int32_t ref, int pass)
998 {
999 weight_t srch;
1000 weight_t *w;
1001 avl_index_t where;
1002
1003 srch.pri = resolve_pri(ref);
1004
1005 /* No translation of ignores */
1006 if (srch.pri == 0)
1007 return;
1008
1009 /* Substitution priorities are not weights */
1010 if (srch.pri & COLLATE_SUBST_PRIORITY)
1011 return;
1012
1013 if (avl_find(&weights[pass], &srch, &where) != NULL)
1014 return;
1015
1016 if ((w = calloc(sizeof (*w), 1)) == NULL) {
1017 errf(_("out of memory"));
1018 return;
1019 }
1020 w->pri = srch.pri;
1021 avl_insert(&weights[pass], w, where);
1022 }
1023
1024 void
add_weights(int32_t * refs)1025 add_weights(int32_t *refs)
1026 {
1027 int i;
1028 for (i = 0; i < NUM_WT; i++) {
1029 add_weight(refs[i], i);
1030 }
1031 }
1032
1033 int32_t
get_weight(int32_t ref,int pass)1034 get_weight(int32_t ref, int pass)
1035 {
1036 weight_t srch;
1037 weight_t *w;
1038 int32_t pri;
1039
1040 pri = resolve_pri(ref);
1041 if (pri & COLLATE_SUBST_PRIORITY) {
1042 return (pri);
1043 }
1044 if (pri <= 0) {
1045 return (pri);
1046 }
1047 srch.pri = pri;
1048 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1049 INTERR;
1050 return (-1);
1051 }
1052 return (w->opt);
1053 }
1054
1055 void
dump_collate(void)1056 dump_collate(void)
1057 {
1058 FILE *f;
1059 int i, j, n;
1060 size_t sz;
1061 int32_t pri;
1062 collelem_t *ce;
1063 collchar_t *cc;
1064 subst_t *sb;
1065 char vers[COLLATE_STR_LEN];
1066 collate_char_t chars[UCHAR_MAX + 1];
1067 collate_large_t *large;
1068 collate_subst_t *subst[COLL_WEIGHTS_MAX];
1069 collate_chain_t *chain;
1070
1071 /*
1072 * We have to run throught a preliminary pass to identify all the
1073 * weights that we use for each sorting level.
1074 */
1075 for (i = 0; i < NUM_WT; i++) {
1076 add_weight(pri_ignore, i);
1077 }
1078 for (i = 0; i < NUM_WT; i++) {
1079 for (sb = avl_first(&substs[i]); sb;
1080 sb = AVL_NEXT(&substs[i], sb)) {
1081 for (j = 0; sb->ref[j]; j++) {
1082 add_weight(sb->ref[j], i);
1083 }
1084 }
1085 }
1086 for (ce = avl_first(&elem_by_expand);
1087 ce != NULL;
1088 ce = AVL_NEXT(&elem_by_expand, ce)) {
1089 add_weights(ce->ref);
1090 }
1091 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1092 add_weights(cc->ref);
1093 }
1094
1095 /*
1096 * Now we walk the entire set of weights, removing the gaps
1097 * in the weights. This gives us optimum usage. The walk
1098 * occurs in priority.
1099 */
1100 for (i = 0; i < NUM_WT; i++) {
1101 weight_t *w;
1102 for (w = avl_first(&weights[i]); w;
1103 w = AVL_NEXT(&weights[i], w)) {
1104 w->opt = nweight[i];
1105 nweight[i] += 1;
1106 }
1107 }
1108
1109 (void) memset(&chars, 0, sizeof (chars));
1110 (void) memset(vers, 0, COLLATE_STR_LEN);
1111 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1112
1113 /*
1114 * We need to make sure we arrange for the UNDEFINED field
1115 * to show up. Also, set the total weight counts.
1116 */
1117 for (i = 0; i < NUM_WT; i++) {
1118 if (resolve_pri(pri_undefined[i]) == -1) {
1119 set_pri(pri_undefined[i], -1, RESOLVED);
1120 /* they collate at the end of everything else */
1121 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1122 }
1123 collinfo.pri_count[i] = nweight[i];
1124 }
1125
1126 collinfo.pri_count[NUM_WT] = max_wide();
1127 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1128 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1129
1130 /*
1131 * Ordinary character priorities
1132 */
1133 for (i = 0; i <= UCHAR_MAX; i++) {
1134 if ((cc = get_collchar(i, 0)) != NULL) {
1135 for (j = 0; j < NUM_WT; j++) {
1136 chars[i].pri[j] = get_weight(cc->ref[j], j);
1137 }
1138 } else {
1139 for (j = 0; j < NUM_WT; j++) {
1140 chars[i].pri[j] =
1141 get_weight(pri_undefined[j], j);
1142 }
1143 /*
1144 * Per POSIX, for undefined characters, we
1145 * also have to add a last item, which is the
1146 * character code.
1147 */
1148 chars[i].pri[NUM_WT] = i;
1149 }
1150 }
1151
1152 /*
1153 * Substitution tables
1154 */
1155 for (i = 0; i < NUM_WT; i++) {
1156 collate_subst_t *st = NULL;
1157 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1158 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1159 errf(_("out of memory"));
1160 return;
1161 }
1162 n = 0;
1163 for (sb = avl_first(&substs[i]); sb;
1164 sb = AVL_NEXT(&substs[i], sb)) {
1165 if ((st[n].key = resolve_pri(sb->key)) < 0) {
1166 /* by definition these resolve! */
1167 INTERR;
1168 }
1169 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1170 INTERR;
1171 }
1172 for (j = 0; sb->ref[j]; j++) {
1173 st[n].pri[j] = get_weight(sb->ref[j], i);
1174 }
1175 n++;
1176 }
1177 if (n != collinfo.subst_count[i])
1178 INTERR;
1179 subst[i] = st;
1180 }
1181
1182
1183 /*
1184 * Chains, i.e. collating elements
1185 */
1186 collinfo.chain_count = avl_numnodes(&elem_by_expand);
1187 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1188 if (chain == NULL) {
1189 errf(_("out of memory"));
1190 return;
1191 }
1192 for (n = 0, ce = avl_first(&elem_by_expand);
1193 ce != NULL;
1194 ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1195 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1196 for (i = 0; i < NUM_WT; i++) {
1197 chain[n].pri[i] = get_weight(ce->ref[i], i);
1198 }
1199 }
1200 if (n != collinfo.chain_count)
1201 INTERR;
1202
1203 /*
1204 * Large (> UCHAR_MAX) character priorities
1205 */
1206 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1207 if (large == NULL) {
1208 errf(_("out of memory"));
1209 return;
1210 }
1211
1212 i = 0;
1213 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1214 int undef = 0;
1215 /* we already gathered those */
1216 if (cc->wc <= UCHAR_MAX)
1217 continue;
1218 for (j = 0; j < NUM_WT; j++) {
1219 if ((pri = get_weight(cc->ref[j], j)) < 0) {
1220 undef = 1;
1221 }
1222 if (undef && (pri >= 0)) {
1223 /* if undefined, then all priorities are */
1224 INTERR;
1225 } else {
1226 large[i].pri.pri[j] = pri;
1227 }
1228 }
1229 if (!undef) {
1230 large[i].val = cc->wc;
1231 collinfo.large_count = i++;
1232 }
1233 }
1234
1235 if ((f = open_category()) == NULL) {
1236 return;
1237 }
1238
1239 /* Time to write the entire data set out */
1240
1241 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1242 (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1243 (wr_category(&chars, sizeof (chars), f) < 0)) {
1244 return;
1245 }
1246
1247 for (i = 0; i < NUM_WT; i++) {
1248 sz = sizeof (collate_subst_t) * collinfo.subst_count[i];
1249 if (wr_category(subst[i], sz, f) < 0) {
1250 return;
1251 }
1252 }
1253 sz = sizeof (collate_chain_t) * collinfo.chain_count;
1254 if (wr_category(chain, sz, f) < 0) {
1255 return;
1256 }
1257 sz = sizeof (collate_large_t) * collinfo.large_count;
1258 if (wr_category(large, sz, f) < 0) {
1259 return;
1260 }
1261
1262 close_category(f);
1263 }
1264