1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms version 1.0 5 * of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * LC_COLLATE database generation routines for localedef. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <errno.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <string.h> 27 #include <unistd.h> 28 #include <wchar.h> 29 #include <widec.h> 30 #include <limits.h> 31 #include "localedef.h" 32 #include "parser.tab.h" 33 #include "collate.h" 34 35 /* 36 * In order to resolve the priorities, we create a table of priorities. 37 * Entries in the table can be in one of three states. 38 * 39 * UNKNOWN is for newly allocated entries, and indicates that nothing 40 * is known about the priority. (For example, when new entries are created 41 * for collating-symbols, this is the value assigned for them until the 42 * collating symbol's order has been determined. 43 * 44 * RESOLVED is used for an entry where the priority indicates the final 45 * numeric weight. 46 * 47 * REFER is used for entries that reference other entries. Typically 48 * this is used for forward references. A collating-symbol can never 49 * have this value. 50 * 51 * The "self" field is a reference to or own index in the pri list. 52 * 53 * The "pass" field is used during final resolution to aid in detection 54 * of referencing loops. (For example <A> depends on <B>, but <B> has its 55 * priority dependent on <A>.) 56 */ 57 typedef enum { 58 UNKNOWN, /* priority is totally unknown */ 59 RESOLVED, /* priority value fully resolved */ 60 REFER /* priority is a reference (index) */ 61 } res_t; 62 63 typedef struct priority { 64 res_t res; 65 int32_t pri; 66 int32_t self; 67 int pass; 68 int lineno; 69 } collpri_t; 70 71 72 #define NUM_WT collinfo.directive_count 73 74 /* 75 * These are the abstract collating symbols, which are just a symbolic 76 * way to reference a priority. 77 */ 78 struct collsym { 79 char *name; 80 int32_t ref; 81 avl_node_t avl; 82 }; 83 84 /* 85 * These are also abstract collating symbols, but we allow them to have 86 * different priorities at different levels. 87 */ 88 typedef struct collundef { 89 char *name; 90 int32_t ref[COLL_WEIGHTS_MAX]; 91 avl_node_t avl; 92 } collundef_t; 93 94 /* 95 * These are called "chains" in libc. This records the fact that two 96 * more characters should be treated as a single collating entity when 97 * they appear together. For example, in Spanish <C><h> gets collated 98 * as a character between <C> and <D>. 99 */ 100 struct collelem { 101 char *symbol; 102 wchar_t *expand; 103 int32_t ref[COLL_WEIGHTS_MAX]; 104 avl_node_t avl_bysymbol; 105 avl_node_t avl_byexpand; 106 }; 107 108 /* 109 * Individual characters have a sequence of weights as well. 110 */ 111 typedef struct collchar { 112 wchar_t wc; 113 wchar_t ref[COLL_WEIGHTS_MAX]; 114 avl_node_t avl; 115 } collchar_t; 116 117 /* 118 * Substitution entries. The key is itself a priority. Note that 119 * when we create one of these, we *automatically* wind up with a 120 * fully resolved priority for the key, because creation of 121 * substitutions creates a resolved priority at the same time. 122 */ 123 typedef struct { 124 int32_t key; 125 int32_t ref[COLLATE_STR_LEN]; 126 avl_node_t avl; 127 } subst_t; 128 129 static avl_tree_t collsyms; 130 static avl_tree_t collundefs; 131 static avl_tree_t elem_by_symbol; 132 static avl_tree_t elem_by_expand; 133 static avl_tree_t collchars; 134 static avl_tree_t substs[COLL_WEIGHTS_MAX]; 135 136 /* 137 * This is state tracking for the ellipsis token. Note that we start 138 * the initial values so that the ellipsis logic will think we got a 139 * magic starting value of NUL. It starts at minus one because the 140 * starting point is exclusive -- i.e. the starting point is not 141 * itself handled by the ellipsis code. 142 */ 143 static int currorder = EOF; 144 static int lastorder = EOF; 145 static collelem_t *currelem; 146 static collchar_t *currchar; 147 static collundef_t *currundef; 148 static wchar_t ellipsis_start = 0; 149 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; 150 151 /* 152 * We keep a running tally of weights. 153 */ 154 static int nextpri = 1; 155 156 /* 157 * This array collects up the weights for each level. 158 */ 159 static int32_t order_weights[COLL_WEIGHTS_MAX]; 160 static int curr_weight = 0; 161 static int32_t subst_weights[COLLATE_STR_LEN]; 162 static int curr_subst = 0; 163 164 /* 165 * Some initial priority values. 166 */ 167 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; 168 static int32_t pri_ignore; 169 170 static collate_info_t collinfo; 171 172 static collpri_t *prilist = NULL; 173 static int numpri = 0; 174 static int maxpri = 0; 175 176 static void start_order(int); 177 178 static int32_t 179 new_pri(void) 180 { 181 int i; 182 183 if (numpri >= maxpri) { 184 maxpri = maxpri ? maxpri * 2 : 1024; 185 prilist = realloc(prilist, sizeof (collpri_t) * maxpri); 186 if (prilist == NULL) { 187 errf(_("out of memory")); 188 return (-1); 189 } 190 for (i = numpri; i < maxpri; i++) { 191 prilist[i].res = UNKNOWN; 192 prilist[i].pri = 0; 193 prilist[i].pass = 0; 194 prilist[i].self = i; 195 } 196 } 197 return (numpri++); 198 } 199 200 static collpri_t * 201 get_pri(int32_t ref) 202 { 203 if ((ref < 0) || (ref > numpri)) { 204 INTERR; 205 return (NULL); 206 } 207 return (&prilist[ref]); 208 } 209 210 static void 211 set_pri(int32_t ref, int32_t v, res_t res) 212 { 213 collpri_t *pri; 214 215 pri = get_pri(ref); 216 217 if ((res == REFER) && ((v < 0) || (v >= numpri))) { 218 INTERR; 219 } 220 221 /* Resolve self references */ 222 if ((res == REFER) && (ref == v)) { 223 v = nextpri; 224 res = RESOLVED; 225 } 226 227 if (pri->res != UNKNOWN) { 228 warn(_("repeated item in order list (first on %d)"), 229 pri->lineno); 230 return; 231 } 232 pri->lineno = lineno; 233 pri->pri = v; 234 pri->res = res; 235 } 236 237 static int32_t 238 resolve_pri(int32_t ref) 239 { 240 collpri_t *pri; 241 static int32_t pass = 0; 242 243 pri = get_pri(ref); 244 pass++; 245 while (pri->res == REFER) { 246 if (pri->pass == pass) { 247 /* report a line with the circular symbol */ 248 lineno = pri->lineno; 249 errf(_("circular reference in order list")); 250 return (-1); 251 } 252 if ((pri->pri < 0) || (pri->pri >= numpri)) { 253 INTERR; 254 return (-1); 255 } 256 pri->pass = pass; 257 pri = &prilist[pri->pri]; 258 } 259 260 if (pri->res == UNKNOWN) { 261 return (-1); 262 } 263 if (pri->res != RESOLVED) 264 INTERR; 265 266 return (pri->pri); 267 } 268 269 static int 270 collsym_compare(const void *n1, const void *n2) 271 { 272 const collsym_t *c1 = n1; 273 const collsym_t *c2 = n2; 274 int rv; 275 276 rv = strcmp(c1->name, c2->name); 277 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 278 } 279 280 static int 281 collundef_compare(const void *n1, const void *n2) 282 { 283 const collundef_t *c1 = n1; 284 const collundef_t *c2 = n2; 285 int rv; 286 287 rv = strcmp(c1->name, c2->name); 288 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 289 } 290 291 static int 292 element_compare_symbol(const void *n1, const void *n2) 293 { 294 const collelem_t *c1 = n1; 295 const collelem_t *c2 = n2; 296 int rv; 297 298 rv = strcmp(c1->symbol, c2->symbol); 299 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 300 } 301 302 static int 303 element_compare_expand(const void *n1, const void *n2) 304 { 305 const collelem_t *c1 = n1; 306 const collelem_t *c2 = n2; 307 int rv; 308 309 rv = wcscmp(c1->expand, c2->expand); 310 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 311 } 312 313 static int 314 collchar_compare(const void *n1, const void *n2) 315 { 316 wchar_t k1 = ((const collchar_t *)n1)->wc; 317 wchar_t k2 = ((const collchar_t *)n2)->wc; 318 319 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 320 } 321 322 static int 323 subst_compare(const void *n1, const void *n2) 324 { 325 int32_t k1 = ((const subst_t *)n1)->key; 326 int32_t k2 = ((const subst_t *)n2)->key; 327 328 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 329 } 330 331 void 332 init_collate(void) 333 { 334 int i; 335 336 avl_create(&collsyms, collsym_compare, sizeof (collsym_t), 337 offsetof(collsym_t, avl)); 338 339 avl_create(&collundefs, collundef_compare, sizeof (collsym_t), 340 offsetof(collundef_t, avl)); 341 342 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t), 343 offsetof(collelem_t, avl_bysymbol)); 344 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t), 345 offsetof(collelem_t, avl_byexpand)); 346 347 avl_create(&collchars, collchar_compare, sizeof (collchar_t), 348 offsetof(collchar_t, avl)); 349 350 for (i = 0; i < COLL_WEIGHTS_MAX; i++) 351 avl_create(&substs[i], subst_compare, sizeof (subst_t), 352 offsetof(subst_t, avl)); 353 354 (void) memset(&collinfo, 0, sizeof (collinfo)); 355 356 /* allocate some initial priorities */ 357 pri_ignore = new_pri(); 358 359 set_pri(pri_ignore, 0, RESOLVED); 360 361 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 362 pri_undefined[i] = new_pri(); 363 364 /* we will override this later */ 365 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); 366 } 367 } 368 369 void 370 define_collsym(char *name) 371 { 372 collsym_t *sym; 373 avl_index_t where; 374 375 if ((sym = calloc(sizeof (*sym), 1)) == NULL) { 376 errf(_("out of memory")); 377 return; 378 } 379 sym->name = name; 380 sym->ref = new_pri(); 381 382 if (avl_find(&collsyms, sym, &where) != NULL) { 383 /* 384 * This should never happen because we are only called 385 * for undefined symbols. 386 */ 387 INTERR; 388 return; 389 } 390 avl_insert(&collsyms, sym, where); 391 } 392 393 collsym_t * 394 lookup_collsym(char *name) 395 { 396 collsym_t srch; 397 398 srch.name = name; 399 return (avl_find(&collsyms, &srch, NULL)); 400 } 401 402 collelem_t * 403 lookup_collelem(char *symbol) 404 { 405 collelem_t srch; 406 407 srch.symbol = symbol; 408 return (avl_find(&elem_by_symbol, &srch, NULL)); 409 } 410 411 static collundef_t * 412 get_collundef(char *name) 413 { 414 collundef_t srch; 415 collundef_t *ud; 416 avl_index_t where; 417 int i; 418 419 srch.name = name; 420 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) { 421 if (((ud = calloc(sizeof (*ud), 1)) == NULL) || 422 ((ud->name = strdup(name)) == NULL)) { 423 errf(_("out of memory")); 424 return (NULL); 425 } 426 for (i = 0; i < NUM_WT; i++) { 427 ud->ref[i] = new_pri(); 428 } 429 avl_insert(&collundefs, ud, where); 430 } 431 add_charmap_undefined(name); 432 return (ud); 433 } 434 435 static collchar_t * 436 get_collchar(wchar_t wc, int create) 437 { 438 collchar_t srch; 439 collchar_t *cc; 440 avl_index_t where; 441 int i; 442 443 srch.wc = wc; 444 cc = avl_find(&collchars, &srch, &where); 445 if ((cc == NULL) && create) { 446 if ((cc = calloc(sizeof (*cc), 1)) == NULL) { 447 errf(_("out of memory")); 448 return (NULL); 449 } 450 for (i = 0; i < NUM_WT; i++) { 451 cc->ref[i] = new_pri(); 452 } 453 cc->wc = wc; 454 avl_insert(&collchars, cc, where); 455 } 456 return (cc); 457 } 458 459 void 460 end_order_collsym(collsym_t *sym) 461 { 462 start_order(T_COLLSYM); 463 /* update the weight */ 464 465 set_pri(sym->ref, nextpri, RESOLVED); 466 nextpri++; 467 } 468 469 void 470 end_order(void) 471 { 472 int i; 473 int32_t pri; 474 int32_t ref; 475 collpri_t *p; 476 477 /* advance the priority/weight */ 478 pri = nextpri; 479 480 switch (currorder) { 481 case T_CHAR: 482 for (i = 0; i < NUM_WT; i++) { 483 if (((ref = order_weights[i]) < 0) || 484 ((p = get_pri(ref)) == NULL) || 485 (p->pri == -1)) { 486 /* unspecified weight is a self reference */ 487 set_pri(currchar->ref[i], pri, RESOLVED); 488 } else { 489 set_pri(currchar->ref[i], ref, REFER); 490 } 491 order_weights[i] = -1; 492 } 493 494 /* leave a cookie trail in case next symbol is ellipsis */ 495 ellipsis_start = currchar->wc + 1; 496 currchar = NULL; 497 break; 498 499 case T_ELLIPSIS: 500 /* save off the weights were we can find them */ 501 for (i = 0; i < NUM_WT; i++) { 502 ellipsis_weights[i] = order_weights[i]; 503 order_weights[i] = -1; 504 } 505 break; 506 507 case T_COLLELEM: 508 if (currelem == NULL) { 509 INTERR; 510 } else { 511 for (i = 0; i < NUM_WT; i++) { 512 513 if (((ref = order_weights[i]) < 0) || 514 ((p = get_pri(ref)) == NULL) || 515 (p->pri == -1)) { 516 set_pri(currelem->ref[i], pri, 517 RESOLVED); 518 } else { 519 set_pri(currelem->ref[i], ref, REFER); 520 } 521 order_weights[i] = -1; 522 } 523 } 524 break; 525 526 case T_UNDEFINED: 527 for (i = 0; i < NUM_WT; i++) { 528 if (((ref = order_weights[i]) < 0) || 529 ((p = get_pri(ref)) == NULL) || 530 (p->pri == -1)) { 531 set_pri(pri_undefined[i], -1, RESOLVED); 532 } else { 533 set_pri(pri_undefined[i], ref, REFER); 534 } 535 order_weights[i] = -1; 536 } 537 break; 538 539 case T_SYMBOL: 540 if (((ref = order_weights[i]) < 0) || 541 ((p = get_pri(ref)) == NULL) || 542 (p->pri == -1)) { 543 set_pri(currundef->ref[i], pri, RESOLVED); 544 } else { 545 set_pri(currundef->ref[i], ref, REFER); 546 } 547 order_weights[i] = -1; 548 break; 549 550 default: 551 INTERR; 552 } 553 554 nextpri++; 555 } 556 557 static void 558 start_order(int type) 559 { 560 int i; 561 562 lastorder = currorder; 563 currorder = type; 564 565 /* this is used to protect ELLIPSIS processing */ 566 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { 567 errf(_("character value expected")); 568 } 569 570 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 571 order_weights[i] = -1; 572 } 573 curr_weight = 0; 574 } 575 576 void 577 start_order_undefined(void) 578 { 579 start_order(T_UNDEFINED); 580 } 581 582 void 583 start_order_symbol(char *name) 584 { 585 currundef = get_collundef(name); 586 start_order(T_SYMBOL); 587 } 588 589 void 590 start_order_char(wchar_t wc) 591 { 592 collchar_t *cc; 593 int32_t ref; 594 595 start_order(T_CHAR); 596 597 /* 598 * If we last saw an ellipsis, then we need to close the range. 599 * Handle that here. Note that we have to be careful because the 600 * items *inside* the range are treated exclusiveley to the items 601 * outside of the range. The ends of the range can have quite 602 * different weights than the range members. 603 */ 604 if (lastorder == T_ELLIPSIS) { 605 int i; 606 607 if (wc < ellipsis_start) { 608 errf(_("malformed range!")); 609 return; 610 } 611 while (ellipsis_start < wc) { 612 /* 613 * pick all of the saved weights for the 614 * ellipsis. note that -1 encodes for the 615 * ellipsis itself, which means to take the 616 * current relative priority. 617 */ 618 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { 619 INTERR; 620 return; 621 } 622 for (i = 0; i < NUM_WT; i++) { 623 collpri_t *p; 624 if (((ref = ellipsis_weights[i]) == -1) || 625 ((p = get_pri(ref)) == NULL) || 626 (p->pri == -1)) { 627 set_pri(cc->ref[i], nextpri, RESOLVED); 628 } else { 629 set_pri(cc->ref[i], ref, REFER); 630 } 631 ellipsis_weights[i] = NULL; 632 } 633 ellipsis_start++; 634 nextpri++; 635 } 636 } 637 638 currchar = get_collchar(wc, 1); 639 } 640 641 void 642 start_order_collelem(collelem_t *e) 643 { 644 start_order(T_COLLELEM); 645 currelem = e; 646 } 647 648 void 649 start_order_ellipsis(void) 650 { 651 int i; 652 653 start_order(T_ELLIPSIS); 654 655 if (lastorder != T_CHAR) { 656 errf(_("illegal starting point for range")); 657 return; 658 } 659 660 for (i = 0; i < NUM_WT; i++) { 661 ellipsis_weights[i] = order_weights[i]; 662 } 663 } 664 665 void 666 define_collelem(char *name, wchar_t *wcs) 667 { 668 collelem_t *e; 669 avl_index_t where1; 670 avl_index_t where2; 671 int i; 672 673 if (wcslen(wcs) >= COLLATE_STR_LEN) { 674 errf(_("expanded collation element too long")); 675 return; 676 } 677 678 if ((e = calloc(sizeof (*e), 1)) == NULL) { 679 errf(_("out of memory")); 680 return; 681 } 682 e->expand = wcs; 683 e->symbol = name; 684 685 /* 686 * This is executed before the order statement, so we don't 687 * know how many priorities we *really* need. We allocate one 688 * for each possible weight. Not a big deal, as collating-elements 689 * prove to be quite rare. 690 */ 691 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 692 e->ref[i] = new_pri(); 693 } 694 695 /* A character sequence can only reduce to one element. */ 696 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) || 697 (avl_find(&elem_by_expand, e, &where2) != NULL)) { 698 errf(_("duplicate collating element definition")); 699 return; 700 } 701 avl_insert(&elem_by_symbol, e, where1); 702 avl_insert(&elem_by_expand, e, where2); 703 } 704 705 void 706 add_order_bit(int kw) 707 { 708 uint8_t bit = DIRECTIVE_UNDEF; 709 710 switch (kw) { 711 case T_FORWARD: 712 bit = DIRECTIVE_FORWARD; 713 break; 714 case T_BACKWARD: 715 bit = DIRECTIVE_BACKWARD; 716 break; 717 case T_POSITION: 718 bit = DIRECTIVE_POSITION; 719 break; 720 default: 721 INTERR; 722 break; 723 } 724 collinfo.directive[collinfo.directive_count] |= bit; 725 } 726 727 void 728 add_order_directive(void) 729 { 730 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { 731 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX); 732 } 733 collinfo.directive_count++; 734 } 735 736 static void 737 add_order_pri(int32_t ref) 738 { 739 if (curr_weight >= NUM_WT) { 740 errf(_("too many weights (max %d)"), NUM_WT); 741 return; 742 } 743 order_weights[curr_weight] = ref; 744 curr_weight++; 745 } 746 747 void 748 add_order_collsym(collsym_t *s) 749 { 750 add_order_pri(s->ref); 751 } 752 753 void 754 add_order_char(wchar_t wc) 755 { 756 collchar_t *cc; 757 758 if ((cc = get_collchar(wc, 1)) == NULL) { 759 INTERR; 760 return; 761 } 762 763 add_order_pri(cc->ref[curr_weight]); 764 } 765 766 void 767 add_order_collelem(collelem_t *e) 768 { 769 add_order_pri(e->ref[curr_weight]); 770 } 771 772 void 773 add_order_ignore(void) 774 { 775 add_order_pri(pri_ignore); 776 } 777 778 void 779 add_order_symbol(char *sym) 780 { 781 collundef_t *c; 782 if ((c = get_collundef(sym)) == NULL) { 783 INTERR; 784 return; 785 } 786 add_order_pri(c->ref[curr_weight]); 787 } 788 789 void 790 add_order_ellipsis(void) 791 { 792 /* special NULL value indicates self reference */ 793 add_order_pri(NULL); 794 } 795 796 void 797 add_order_subst(void) 798 { 799 subst_t *s; 800 avl_index_t where; 801 int i; 802 803 if ((s = calloc(sizeof (*s), 1)) == NULL) { 804 errf(_("out of memory")); 805 return; 806 } 807 s->key = new_pri(); 808 809 /* 810 * We use a self reference for our key, but we set a high bit 811 * to indicate that this is a substitution reference. This 812 * will expedite table lookups later, and prevent table 813 * lookups for situations that don't require it. (In short, 814 * its a big win, because we can skip a lot of binary 815 * searching.) 816 */ 817 set_pri(s->key, (nextpri | COLLATE_SUBST_PRIORITY), RESOLVED); 818 819 for (i = 0; i < curr_subst; i++) { 820 s->ref[i] = subst_weights[i]; 821 subst_weights[i] = 0; 822 } 823 curr_subst = 0; 824 825 if (avl_find(&substs[curr_weight], s, &where) != NULL) { 826 INTERR; 827 return; 828 } 829 avl_insert(&substs[curr_weight], s, where); 830 831 /* 832 * We are using the current (unique) priority as a search key 833 * in the substitution table. 834 */ 835 add_order_pri(s->key); 836 } 837 838 static void 839 add_subst_pri(int32_t ref) 840 { 841 if (curr_subst >= COLLATE_STR_LEN) { 842 errf(_("substitution string is too long")); 843 return; 844 } 845 subst_weights[curr_subst] = ref; 846 curr_subst++; 847 } 848 849 void 850 add_subst_char(wchar_t wc) 851 { 852 collchar_t *cc; 853 854 855 if (((cc = get_collchar(wc, 1)) == NULL) || 856 (cc->wc != wc)) { 857 INTERR; 858 return; 859 } 860 /* we take the weight for the character at that position */ 861 add_subst_pri(cc->ref[curr_weight]); 862 } 863 864 void 865 add_subst_collelem(collelem_t *e) 866 { 867 add_subst_pri(e->ref[curr_weight]); 868 } 869 870 void 871 add_subst_collsym(collsym_t *s) 872 { 873 add_subst_pri(s->ref); 874 } 875 876 void 877 add_subst_symbol(char *ptr) 878 { 879 collundef_t *cu; 880 881 if ((cu = get_collundef(ptr)) != NULL) { 882 add_subst_pri(cu->ref[curr_weight]); 883 } 884 } 885 886 void 887 dump_collate(void) 888 { 889 FILE *f; 890 int i, j, n; 891 size_t sz; 892 int32_t pri; 893 collelem_t *ce; 894 collchar_t *cc; 895 subst_t *sb; 896 char vers[COLLATE_STR_LEN]; 897 collate_char_pri_t char_pri[UCHAR_MAX + 1]; 898 collate_large_pri_t *large; 899 collate_subst_t *subst[COLL_WEIGHTS_MAX]; 900 collate_chain_pri_t *chain; 901 902 (void) memset(&char_pri, 0, sizeof (char_pri)); 903 (void) memset(vers, 0, COLLATE_STR_LEN); 904 (void) snprintf(vers, sizeof (vers), COLLATE_VERSION); 905 906 /* 907 * We need to make sure we arrange for the UNDEFINED field 908 * to show up. 909 */ 910 for (i = 0; i < NUM_WT; i++) { 911 if (resolve_pri(pri_undefined[i]) == -1) { 912 set_pri(pri_undefined[i], -1, RESOLVED); 913 /* they collate at the end of everything else */ 914 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; 915 } 916 } 917 918 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; 919 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; 920 921 /* 922 * Ordinary character priorities 923 */ 924 for (i = 0; i <= UCHAR_MAX; i++) { 925 if ((cc = get_collchar(i, 0)) != NULL) { 926 for (j = 0; j < NUM_WT; j++) { 927 char_pri[i].pri[j] = resolve_pri(cc->ref[j]); 928 } 929 } else { 930 for (j = 0; j < NUM_WT; j++) { 931 char_pri[i].pri[j] = 932 resolve_pri(pri_undefined[j]); 933 } 934 /* 935 * Per POSIX, for undefined characters, we 936 * also have to add a last item, which is the 937 * character code. 938 */ 939 char_pri[i].pri[NUM_WT] = i; 940 } 941 } 942 943 /* 944 * Substitution tables 945 */ 946 for (i = 0; i < collinfo.directive_count; i++) { 947 collate_subst_t *st = NULL; 948 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); 949 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { 950 errf(_("out of memory")); 951 return; 952 } 953 n = 0; 954 for (sb = avl_first(&substs[i]); sb; 955 sb = AVL_NEXT(&substs[i], sb)) { 956 if ((st[n].key = resolve_pri(sb->key)) < 0) { 957 /* by definition these resolve! */ 958 INTERR; 959 } 960 for (j = 0; sb->ref[j]; j++) { 961 st[n].pri[j] = resolve_pri(sb->ref[j]); 962 } 963 n++; 964 } 965 if (n != collinfo.subst_count[i]) 966 INTERR; 967 subst[i] = st; 968 } 969 970 /* 971 * Chains, i.e. collating elements 972 */ 973 collinfo.chain_count = avl_numnodes(&elem_by_expand); 974 chain = calloc(sizeof (collate_chain_pri_t), collinfo.chain_count); 975 if (chain == NULL) { 976 errf(_("out of memory")); 977 return; 978 } 979 for (n = 0, ce = avl_first(&elem_by_expand); 980 ce != NULL; 981 ce = AVL_NEXT(&elem_by_expand, ce), n++) { 982 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); 983 for (i = 0; i < NUM_WT; i++) { 984 chain[n].pri[i] = resolve_pri(ce->ref[i]); 985 } 986 } 987 if (n != collinfo.chain_count) 988 INTERR; 989 990 /* 991 * Large (> UCHAR_MAX) character priorities 992 */ 993 large = calloc(sizeof (collate_large_pri_t) * avl_numnodes(&collchars), 994 1); 995 if (large == NULL) { 996 errf(_("out of memory")); 997 return; 998 } 999 1000 i = 0; 1001 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1002 int undef = 0; 1003 /* we already gathered those */ 1004 if (cc->wc <= UCHAR_MAX) 1005 continue; 1006 for (j = 0; j < NUM_WT; j++) { 1007 if ((pri = resolve_pri(cc->ref[j])) < 0) { 1008 undef = 1; 1009 } 1010 if (undef && (pri >= 0)) { 1011 /* if undefined, then all priorities are */ 1012 INTERR; 1013 } else { 1014 large[i].pri.pri[j] = pri; 1015 } 1016 } 1017 if (!undef) { 1018 large[i].val = cc->wc; 1019 collinfo.large_pri_count = i++; 1020 } 1021 } 1022 1023 if ((f = open_category()) == NULL) { 1024 return; 1025 } 1026 1027 /* Time to write the entire data set out */ 1028 1029 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || 1030 (wr_category(&collinfo, sizeof (collinfo), f) < 0) || 1031 (wr_category(&char_pri, sizeof (char_pri), f) < 0)) { 1032 return; 1033 } 1034 1035 for (i = 0; i < NUM_WT; i++) { 1036 sz = sizeof (collate_subst_t) * collinfo.subst_count[i]; 1037 if (wr_category(subst[i], sz, f) < 0) { 1038 return; 1039 } 1040 } 1041 sz = sizeof (collate_chain_pri_t) * collinfo.chain_count; 1042 if (wr_category(chain, sz, f) < 0) { 1043 return; 1044 } 1045 sz = sizeof (collate_large_pri_t) * collinfo.large_pri_count; 1046 if (wr_category(large, sz, f) < 0) { 1047 return; 1048 } 1049 1050 close_category(f); 1051 } 1052