1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2017 Nexenta Systems, Inc. 14 */ 15 16 /* 17 * LC_COLLATE database generation routines for localedef. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <errno.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <string.h> 27 #include <unistd.h> 28 #include <wchar.h> 29 #include <widec.h> 30 #include <limits.h> 31 #include "localedef.h" 32 #include "parser.tab.h" 33 #include "collatefile.h" 34 35 /* 36 * Design notes. 37 * 38 * It will be extremely helpful to the reader if they have access to 39 * the localedef and locale file format specifications available. 40 * Latest versions of these are available from www.opengroup.org. 41 * 42 * The design for the collation code is a bit complex. The goal is a 43 * single collation database as described in collate.h (in 44 * libc/port/locale). However, there are some other tidbits: 45 * 46 * a) The substitution entries are now a directly indexable array. A 47 * priority elsewhere in the table is taken as an index into the 48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) 49 * set. (The bit is cleared and the result is the index into the 50 * table. 51 * 52 * b) We eliminate duplicate entries into the substitution table. 53 * This saves a lot of space. 54 * 55 * c) The priorities for each level are "compressed", so that each 56 * sorting level has consecutively numbered priorities starting at 1. 57 * (O is reserved for the ignore priority.) This means sort levels 58 * which only have a few distinct priorities can represent the 59 * priority level in fewer bits, which makes the strxfrm output 60 * smaller. 61 * 62 * d) We record the total number of priorities so that strxfrm can 63 * figure out how many bytes to expand a numeric priority into. 64 * 65 * e) For the UNDEFINED pass (the last pass), we record the maximum 66 * number of bits needed to uniquely prioritize these entries, so that 67 * the last pass can also use smaller strxfrm output when possible. 68 * 69 * f) Priorities with the sign bit set are verboten. This works out 70 * because no active character set needs that bit to carry significant 71 * information once the character is in wide form. 72 * 73 * To process the entire data to make the database, we actually run 74 * multiple passes over the data. 75 * 76 * The first pass, which is done at parse time, identifies elements, 77 * substitutions, and such, and records them in priority order. As 78 * some priorities can refer to other priorities, using forward 79 * references, we use a table of references indicating whether the 80 * priority's value has been resolved, or whether it is still a 81 * reference. 82 * 83 * The second pass walks over all the items in priority order, noting 84 * that they are used directly, and not just an indirect reference. 85 * This is done by creating a "weight" structure for the item. The 86 * weights are stashed in an AVL tree sorted by relative "priority". 87 * 88 * The third pass walks over all the weight structures, in priority 89 * order, and assigns a new monotonically increasing (per sort level) 90 * weight value to them. These are the values that will actually be 91 * written to the file. 92 * 93 * The fourth pass just writes the data out. 94 */ 95 96 /* 97 * In order to resolve the priorities, we create a table of priorities. 98 * Entries in the table can be in one of three states. 99 * 100 * UNKNOWN is for newly allocated entries, and indicates that nothing 101 * is known about the priority. (For example, when new entries are created 102 * for collating-symbols, this is the value assigned for them until the 103 * collating symbol's order has been determined. 104 * 105 * RESOLVED is used for an entry where the priority indicates the final 106 * numeric weight. 107 * 108 * REFER is used for entries that reference other entries. Typically 109 * this is used for forward references. A collating-symbol can never 110 * have this value. 111 * 112 * The "pass" field is used during final resolution to aid in detection 113 * of referencing loops. (For example <A> depends on <B>, but <B> has its 114 * priority dependent on <A>.) 115 */ 116 typedef enum { 117 UNKNOWN, /* priority is totally unknown */ 118 RESOLVED, /* priority value fully resolved */ 119 REFER /* priority is a reference (index) */ 120 } res_t; 121 122 typedef struct weight { 123 int32_t pri; 124 int opt; 125 avl_node_t avl; 126 } weight_t; 127 128 typedef struct priority { 129 res_t res; 130 int32_t pri; 131 int pass; 132 int lineno; 133 } collpri_t; 134 135 #define NUM_WT collinfo.directive_count 136 137 /* 138 * These are the abstract collating symbols, which are just a symbolic 139 * way to reference a priority. 140 */ 141 struct collsym { 142 char *name; 143 int32_t ref; 144 avl_node_t avl; 145 }; 146 147 /* 148 * These are also abstract collating symbols, but we allow them to have 149 * different priorities at different levels. 150 */ 151 typedef struct collundef { 152 char *name; 153 int32_t ref[COLL_WEIGHTS_MAX]; 154 avl_node_t avl; 155 } collundef_t; 156 157 /* 158 * These are called "chains" in libc. This records the fact that two 159 * more characters should be treated as a single collating entity when 160 * they appear together. For example, in Spanish <C><h> gets collated 161 * as a character between <C> and <D>. 162 */ 163 struct collelem { 164 char *symbol; 165 wchar_t *expand; 166 int32_t ref[COLL_WEIGHTS_MAX]; 167 avl_node_t avl_bysymbol; 168 avl_node_t avl_byexpand; 169 }; 170 171 /* 172 * Individual characters have a sequence of weights as well. 173 */ 174 typedef struct collchar { 175 wchar_t wc; 176 int32_t ref[COLL_WEIGHTS_MAX]; 177 avl_node_t avl; 178 } collchar_t; 179 180 /* 181 * Substitution entries. The key is itself a priority. Note that 182 * when we create one of these, we *automatically* wind up with a 183 * fully resolved priority for the key, because creation of 184 * substitutions creates a resolved priority at the same time. 185 */ 186 typedef struct { 187 int32_t key; 188 int32_t ref[COLLATE_STR_LEN]; 189 avl_node_t avl; 190 avl_node_t avl_ref; 191 } subst_t; 192 193 static avl_tree_t collsyms; 194 static avl_tree_t collundefs; 195 static avl_tree_t elem_by_symbol; 196 static avl_tree_t elem_by_expand; 197 static avl_tree_t collchars; 198 static avl_tree_t substs[COLL_WEIGHTS_MAX]; 199 static avl_tree_t substs_ref[COLL_WEIGHTS_MAX]; 200 static avl_tree_t weights[COLL_WEIGHTS_MAX]; 201 static int32_t nweight[COLL_WEIGHTS_MAX]; 202 203 /* 204 * This is state tracking for the ellipsis token. Note that we start 205 * the initial values so that the ellipsis logic will think we got a 206 * magic starting value of NUL. It starts at minus one because the 207 * starting point is exclusive -- i.e. the starting point is not 208 * itself handled by the ellipsis code. 209 */ 210 static int currorder = EOF; 211 static int lastorder = EOF; 212 static collelem_t *currelem; 213 static collchar_t *currchar; 214 static collundef_t *currundef; 215 static wchar_t ellipsis_start = 0; 216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; 217 218 /* 219 * We keep a running tally of weights. 220 */ 221 static int nextpri = 1; 222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; 223 224 /* 225 * This array collects up the weights for each level. 226 */ 227 static int32_t order_weights[COLL_WEIGHTS_MAX]; 228 static int curr_weight = 0; 229 static int32_t subst_weights[COLLATE_STR_LEN]; 230 static int curr_subst = 0; 231 232 /* 233 * Some initial priority values. 234 */ 235 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; 236 static int32_t pri_ignore; 237 238 static collate_info_t collinfo; 239 240 static collpri_t *prilist = NULL; 241 static int numpri = 0; 242 static int maxpri = 0; 243 244 static void start_order(int); 245 246 static int32_t 247 new_pri(void) 248 { 249 int i; 250 251 if (numpri >= maxpri) { 252 maxpri = maxpri ? maxpri * 2 : 1024; 253 prilist = realloc(prilist, sizeof (collpri_t) * maxpri); 254 if (prilist == NULL) { 255 errf(_("out of memory")); 256 return (-1); 257 } 258 for (i = numpri; i < maxpri; i++) { 259 prilist[i].res = UNKNOWN; 260 prilist[i].pri = 0; 261 prilist[i].pass = 0; 262 } 263 } 264 return (numpri++); 265 } 266 267 static collpri_t * 268 get_pri(int32_t ref) 269 { 270 if ((ref < 0) || (ref > numpri)) { 271 INTERR; 272 return (NULL); 273 } 274 return (&prilist[ref]); 275 } 276 277 static void 278 set_pri(int32_t ref, int32_t v, res_t res) 279 { 280 collpri_t *pri; 281 282 pri = get_pri(ref); 283 284 if ((res == REFER) && ((v < 0) || (v >= numpri))) { 285 INTERR; 286 } 287 288 /* Resolve self references */ 289 if ((res == REFER) && (ref == v)) { 290 v = nextpri; 291 res = RESOLVED; 292 } 293 294 if (pri->res != UNKNOWN) { 295 warn(_("repeated item in order list (first on %d)"), 296 pri->lineno); 297 return; 298 } 299 pri->lineno = lineno; 300 pri->pri = v; 301 pri->res = res; 302 } 303 304 static int32_t 305 resolve_pri(int32_t ref) 306 { 307 collpri_t *pri; 308 static int32_t pass = 0; 309 310 pri = get_pri(ref); 311 pass++; 312 while (pri->res == REFER) { 313 if (pri->pass == pass) { 314 /* report a line with the circular symbol */ 315 lineno = pri->lineno; 316 errf(_("circular reference in order list")); 317 return (-1); 318 } 319 if ((pri->pri < 0) || (pri->pri >= numpri)) { 320 INTERR; 321 return (-1); 322 } 323 pri->pass = pass; 324 pri = &prilist[pri->pri]; 325 } 326 327 if (pri->res == UNKNOWN) { 328 return (-1); 329 } 330 if (pri->res != RESOLVED) 331 INTERR; 332 333 return (pri->pri); 334 } 335 336 static int 337 weight_compare(const void *n1, const void *n2) 338 { 339 int32_t k1 = ((const weight_t *)n1)->pri; 340 int32_t k2 = ((const weight_t *)n2)->pri; 341 342 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 343 } 344 345 static int 346 collsym_compare(const void *n1, const void *n2) 347 { 348 const collsym_t *c1 = n1; 349 const collsym_t *c2 = n2; 350 int rv; 351 352 rv = strcmp(c1->name, c2->name); 353 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 354 } 355 356 static int 357 collundef_compare(const void *n1, const void *n2) 358 { 359 const collundef_t *c1 = n1; 360 const collundef_t *c2 = n2; 361 int rv; 362 363 rv = strcmp(c1->name, c2->name); 364 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 365 } 366 367 static int 368 element_compare_symbol(const void *n1, const void *n2) 369 { 370 const collelem_t *c1 = n1; 371 const collelem_t *c2 = n2; 372 int rv; 373 374 rv = strcmp(c1->symbol, c2->symbol); 375 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 376 } 377 378 static int 379 element_compare_expand(const void *n1, const void *n2) 380 { 381 const collelem_t *c1 = n1; 382 const collelem_t *c2 = n2; 383 int rv; 384 385 rv = wcscmp(c1->expand, c2->expand); 386 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 387 } 388 389 static int 390 collchar_compare(const void *n1, const void *n2) 391 { 392 wchar_t k1 = ((const collchar_t *)n1)->wc; 393 wchar_t k2 = ((const collchar_t *)n2)->wc; 394 395 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 396 } 397 398 static int 399 subst_compare(const void *n1, const void *n2) 400 { 401 int32_t k1 = ((const subst_t *)n1)->key; 402 int32_t k2 = ((const subst_t *)n2)->key; 403 404 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 405 } 406 407 static int 408 subst_compare_ref(const void *n1, const void *n2) 409 { 410 int32_t *c1 = ((subst_t *)n1)->ref; 411 int32_t *c2 = ((subst_t *)n2)->ref; 412 int rv; 413 414 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2); 415 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 416 } 417 418 void 419 init_collate(void) 420 { 421 int i; 422 423 avl_create(&collsyms, collsym_compare, sizeof (collsym_t), 424 offsetof(collsym_t, avl)); 425 426 avl_create(&collundefs, collundef_compare, sizeof (collsym_t), 427 offsetof(collundef_t, avl)); 428 429 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t), 430 offsetof(collelem_t, avl_bysymbol)); 431 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t), 432 offsetof(collelem_t, avl_byexpand)); 433 434 avl_create(&collchars, collchar_compare, sizeof (collchar_t), 435 offsetof(collchar_t, avl)); 436 437 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 438 avl_create(&substs[i], subst_compare, sizeof (subst_t), 439 offsetof(subst_t, avl)); 440 avl_create(&substs_ref[i], subst_compare_ref, 441 sizeof (subst_t), offsetof(subst_t, avl_ref)); 442 avl_create(&weights[i], weight_compare, sizeof (weight_t), 443 offsetof(weight_t, avl)); 444 nweight[i] = 1; 445 } 446 447 (void) memset(&collinfo, 0, sizeof (collinfo)); 448 449 /* allocate some initial priorities */ 450 pri_ignore = new_pri(); 451 452 set_pri(pri_ignore, 0, RESOLVED); 453 454 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 455 pri_undefined[i] = new_pri(); 456 457 /* we will override this later */ 458 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); 459 } 460 } 461 462 void 463 define_collsym(char *name) 464 { 465 collsym_t *sym; 466 avl_index_t where; 467 468 if ((sym = calloc(sizeof (*sym), 1)) == NULL) { 469 errf(_("out of memory")); 470 return; 471 } 472 sym->name = name; 473 sym->ref = new_pri(); 474 475 if (avl_find(&collsyms, sym, &where) != NULL) { 476 /* 477 * This should never happen because we are only called 478 * for undefined symbols. 479 */ 480 free(sym); 481 INTERR; 482 return; 483 } 484 avl_insert(&collsyms, sym, where); 485 } 486 487 collsym_t * 488 lookup_collsym(char *name) 489 { 490 collsym_t srch; 491 492 srch.name = name; 493 return (avl_find(&collsyms, &srch, NULL)); 494 } 495 496 collelem_t * 497 lookup_collelem(char *symbol) 498 { 499 collelem_t srch; 500 501 srch.symbol = symbol; 502 return (avl_find(&elem_by_symbol, &srch, NULL)); 503 } 504 505 static collundef_t * 506 get_collundef(char *name) 507 { 508 collundef_t srch; 509 collundef_t *ud; 510 avl_index_t where; 511 int i; 512 513 srch.name = name; 514 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) { 515 if (((ud = calloc(sizeof (*ud), 1)) == NULL) || 516 ((ud->name = strdup(name)) == NULL)) { 517 errf(_("out of memory")); 518 free(ud); 519 return (NULL); 520 } 521 for (i = 0; i < NUM_WT; i++) { 522 ud->ref[i] = new_pri(); 523 } 524 avl_insert(&collundefs, ud, where); 525 } 526 add_charmap_undefined(name); 527 return (ud); 528 } 529 530 static collchar_t * 531 get_collchar(wchar_t wc, int create) 532 { 533 collchar_t srch; 534 collchar_t *cc; 535 avl_index_t where; 536 int i; 537 538 srch.wc = wc; 539 cc = avl_find(&collchars, &srch, &where); 540 if ((cc == NULL) && create) { 541 if ((cc = calloc(sizeof (*cc), 1)) == NULL) { 542 errf(_("out of memory")); 543 return (NULL); 544 } 545 for (i = 0; i < NUM_WT; i++) { 546 cc->ref[i] = new_pri(); 547 } 548 cc->wc = wc; 549 avl_insert(&collchars, cc, where); 550 } 551 return (cc); 552 } 553 554 void 555 end_order_collsym(collsym_t *sym) 556 { 557 start_order(T_COLLSYM); 558 /* update the weight */ 559 560 set_pri(sym->ref, nextpri, RESOLVED); 561 nextpri++; 562 } 563 564 void 565 end_order(void) 566 { 567 int i; 568 int32_t pri; 569 int32_t ref; 570 collpri_t *p; 571 572 /* advance the priority/weight */ 573 pri = nextpri; 574 575 switch (currorder) { 576 case T_CHAR: 577 for (i = 0; i < NUM_WT; i++) { 578 if (((ref = order_weights[i]) < 0) || 579 ((p = get_pri(ref)) == NULL) || 580 (p->pri == -1)) { 581 /* unspecified weight is a self reference */ 582 set_pri(currchar->ref[i], pri, RESOLVED); 583 } else { 584 set_pri(currchar->ref[i], ref, REFER); 585 } 586 order_weights[i] = -1; 587 } 588 589 /* leave a cookie trail in case next symbol is ellipsis */ 590 ellipsis_start = currchar->wc + 1; 591 currchar = NULL; 592 break; 593 594 case T_ELLIPSIS: 595 /* save off the weights were we can find them */ 596 for (i = 0; i < NUM_WT; i++) { 597 ellipsis_weights[i] = order_weights[i]; 598 order_weights[i] = -1; 599 } 600 break; 601 602 case T_COLLELEM: 603 if (currelem == NULL) { 604 INTERR; 605 } else { 606 for (i = 0; i < NUM_WT; i++) { 607 608 if (((ref = order_weights[i]) < 0) || 609 ((p = get_pri(ref)) == NULL) || 610 (p->pri == -1)) { 611 set_pri(currelem->ref[i], pri, 612 RESOLVED); 613 } else { 614 set_pri(currelem->ref[i], ref, REFER); 615 } 616 order_weights[i] = -1; 617 } 618 } 619 break; 620 621 case T_UNDEFINED: 622 for (i = 0; i < NUM_WT; i++) { 623 if (((ref = order_weights[i]) < 0) || 624 ((p = get_pri(ref)) == NULL) || 625 (p->pri == -1)) { 626 set_pri(pri_undefined[i], -1, RESOLVED); 627 } else { 628 set_pri(pri_undefined[i], ref, REFER); 629 } 630 order_weights[i] = -1; 631 } 632 break; 633 634 case T_SYMBOL: 635 for (i = 0; i < NUM_WT; i++) { 636 if (((ref = order_weights[i]) < 0) || 637 ((p = get_pri(ref)) == NULL) || 638 (p->pri == -1)) { 639 set_pri(currundef->ref[i], pri, RESOLVED); 640 } else { 641 set_pri(currundef->ref[i], ref, REFER); 642 } 643 order_weights[i] = -1; 644 } 645 break; 646 647 default: 648 INTERR; 649 } 650 651 nextpri++; 652 } 653 654 static void 655 start_order(int type) 656 { 657 int i; 658 659 lastorder = currorder; 660 currorder = type; 661 662 /* this is used to protect ELLIPSIS processing */ 663 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { 664 errf(_("character value expected")); 665 } 666 667 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 668 order_weights[i] = -1; 669 } 670 curr_weight = 0; 671 } 672 673 void 674 start_order_undefined(void) 675 { 676 start_order(T_UNDEFINED); 677 } 678 679 void 680 start_order_symbol(char *name) 681 { 682 currundef = get_collundef(name); 683 start_order(T_SYMBOL); 684 } 685 686 void 687 start_order_char(wchar_t wc) 688 { 689 collchar_t *cc; 690 int32_t ref; 691 692 start_order(T_CHAR); 693 694 /* 695 * If we last saw an ellipsis, then we need to close the range. 696 * Handle that here. Note that we have to be careful because the 697 * items *inside* the range are treated exclusiveley to the items 698 * outside of the range. The ends of the range can have quite 699 * different weights than the range members. 700 */ 701 if (lastorder == T_ELLIPSIS) { 702 int i; 703 704 if (wc < ellipsis_start) { 705 errf(_("malformed range!")); 706 return; 707 } 708 while (ellipsis_start < wc) { 709 /* 710 * pick all of the saved weights for the 711 * ellipsis. note that -1 encodes for the 712 * ellipsis itself, which means to take the 713 * current relative priority. 714 */ 715 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { 716 INTERR; 717 return; 718 } 719 for (i = 0; i < NUM_WT; i++) { 720 collpri_t *p; 721 if (((ref = ellipsis_weights[i]) == -1) || 722 ((p = get_pri(ref)) == NULL) || 723 (p->pri == -1)) { 724 set_pri(cc->ref[i], nextpri, RESOLVED); 725 } else { 726 set_pri(cc->ref[i], ref, REFER); 727 } 728 ellipsis_weights[i] = 0; 729 } 730 ellipsis_start++; 731 nextpri++; 732 } 733 } 734 735 currchar = get_collchar(wc, 1); 736 } 737 738 void 739 start_order_collelem(collelem_t *e) 740 { 741 start_order(T_COLLELEM); 742 currelem = e; 743 } 744 745 void 746 start_order_ellipsis(void) 747 { 748 int i; 749 750 start_order(T_ELLIPSIS); 751 752 if (lastorder != T_CHAR) { 753 errf(_("illegal starting point for range")); 754 return; 755 } 756 757 for (i = 0; i < NUM_WT; i++) { 758 ellipsis_weights[i] = order_weights[i]; 759 } 760 } 761 762 void 763 define_collelem(char *name, wchar_t *wcs) 764 { 765 collelem_t *e; 766 avl_index_t where1; 767 avl_index_t where2; 768 int i; 769 770 if (wcslen(wcs) >= COLLATE_STR_LEN) { 771 errf(_("expanded collation element too long")); 772 return; 773 } 774 775 if ((e = calloc(sizeof (*e), 1)) == NULL) { 776 errf(_("out of memory")); 777 return; 778 } 779 e->expand = wcs; 780 e->symbol = name; 781 782 /* 783 * This is executed before the order statement, so we don't 784 * know how many priorities we *really* need. We allocate one 785 * for each possible weight. Not a big deal, as collating-elements 786 * prove to be quite rare. 787 */ 788 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 789 e->ref[i] = new_pri(); 790 } 791 792 /* A character sequence can only reduce to one element. */ 793 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) || 794 (avl_find(&elem_by_expand, e, &where2) != NULL)) { 795 errf(_("duplicate collating element definition")); 796 free(e); 797 return; 798 } 799 avl_insert(&elem_by_symbol, e, where1); 800 avl_insert(&elem_by_expand, e, where2); 801 } 802 803 void 804 add_order_bit(int kw) 805 { 806 uint8_t bit = DIRECTIVE_UNDEF; 807 808 switch (kw) { 809 case T_FORWARD: 810 bit = DIRECTIVE_FORWARD; 811 break; 812 case T_BACKWARD: 813 bit = DIRECTIVE_BACKWARD; 814 break; 815 case T_POSITION: 816 bit = DIRECTIVE_POSITION; 817 break; 818 default: 819 INTERR; 820 break; 821 } 822 collinfo.directive[collinfo.directive_count] |= bit; 823 } 824 825 void 826 add_order_directive(void) 827 { 828 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { 829 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX); 830 } 831 collinfo.directive_count++; 832 } 833 834 static void 835 add_order_pri(int32_t ref) 836 { 837 if (curr_weight >= NUM_WT) { 838 errf(_("too many weights (max %d)"), NUM_WT); 839 return; 840 } 841 order_weights[curr_weight] = ref; 842 curr_weight++; 843 } 844 845 void 846 add_order_collsym(collsym_t *s) 847 { 848 add_order_pri(s->ref); 849 } 850 851 void 852 add_order_char(wchar_t wc) 853 { 854 collchar_t *cc; 855 856 if ((cc = get_collchar(wc, 1)) == NULL) { 857 INTERR; 858 return; 859 } 860 861 add_order_pri(cc->ref[curr_weight]); 862 } 863 864 void 865 add_order_collelem(collelem_t *e) 866 { 867 add_order_pri(e->ref[curr_weight]); 868 } 869 870 void 871 add_order_ignore(void) 872 { 873 add_order_pri(pri_ignore); 874 } 875 876 void 877 add_order_symbol(char *sym) 878 { 879 collundef_t *c; 880 if ((c = get_collundef(sym)) == NULL) { 881 INTERR; 882 return; 883 } 884 add_order_pri(c->ref[curr_weight]); 885 } 886 887 void 888 add_order_ellipsis(void) 889 { 890 /* special 0 value indicates self reference */ 891 add_order_pri(0); 892 } 893 894 void 895 add_order_subst(void) 896 { 897 subst_t srch; 898 subst_t *s; 899 avl_index_t where; 900 int i; 901 902 (void) memset(&srch, 0, sizeof (srch)); 903 for (i = 0; i < curr_subst; i++) { 904 srch.ref[i] = subst_weights[i]; 905 subst_weights[i] = 0; 906 } 907 s = avl_find(&substs_ref[curr_weight], &srch, &where); 908 909 if (s == NULL) { 910 if ((s = calloc(sizeof (*s), 1)) == NULL) { 911 errf(_("out of memory")); 912 return; 913 } 914 s->key = new_pri(); 915 916 /* 917 * We use a self reference for our key, but we set a 918 * high bit to indicate that this is a substitution 919 * reference. This will expedite table lookups later, 920 * and prevent table lookups for situations that don't 921 * require it. (In short, its a big win, because we 922 * can skip a lot of binary searching.) 923 */ 924 set_pri(s->key, 925 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), 926 RESOLVED); 927 nextsubst[curr_weight] += 1; 928 929 for (i = 0; i < curr_subst; i++) { 930 s->ref[i] = srch.ref[i]; 931 } 932 933 avl_insert(&substs_ref[curr_weight], s, where); 934 935 if (avl_find(&substs[curr_weight], s, &where) != NULL) { 936 INTERR; 937 return; 938 } 939 avl_insert(&substs[curr_weight], s, where); 940 } 941 curr_subst = 0; 942 943 944 /* 945 * We are using the current (unique) priority as a search key 946 * in the substitution table. 947 */ 948 add_order_pri(s->key); 949 } 950 951 static void 952 add_subst_pri(int32_t ref) 953 { 954 if (curr_subst >= COLLATE_STR_LEN) { 955 errf(_("substitution string is too long")); 956 return; 957 } 958 subst_weights[curr_subst] = ref; 959 curr_subst++; 960 } 961 962 void 963 add_subst_char(wchar_t wc) 964 { 965 collchar_t *cc; 966 967 968 if (((cc = get_collchar(wc, 1)) == NULL) || 969 (cc->wc != wc)) { 970 INTERR; 971 return; 972 } 973 /* we take the weight for the character at that position */ 974 add_subst_pri(cc->ref[curr_weight]); 975 } 976 977 void 978 add_subst_collelem(collelem_t *e) 979 { 980 add_subst_pri(e->ref[curr_weight]); 981 } 982 983 void 984 add_subst_collsym(collsym_t *s) 985 { 986 add_subst_pri(s->ref); 987 } 988 989 void 990 add_subst_symbol(char *ptr) 991 { 992 collundef_t *cu; 993 994 if ((cu = get_collundef(ptr)) != NULL) { 995 add_subst_pri(cu->ref[curr_weight]); 996 } 997 } 998 999 void 1000 add_weight(int32_t ref, int pass) 1001 { 1002 weight_t srch; 1003 weight_t *w; 1004 avl_index_t where; 1005 1006 srch.pri = resolve_pri(ref); 1007 1008 /* No translation of ignores */ 1009 if (srch.pri == 0) 1010 return; 1011 1012 /* Substitution priorities are not weights */ 1013 if (srch.pri & COLLATE_SUBST_PRIORITY) 1014 return; 1015 1016 if (avl_find(&weights[pass], &srch, &where) != NULL) 1017 return; 1018 1019 if ((w = calloc(sizeof (*w), 1)) == NULL) { 1020 errf(_("out of memory")); 1021 return; 1022 } 1023 w->pri = srch.pri; 1024 avl_insert(&weights[pass], w, where); 1025 } 1026 1027 void 1028 add_weights(int32_t *refs) 1029 { 1030 int i; 1031 for (i = 0; i < NUM_WT; i++) { 1032 add_weight(refs[i], i); 1033 } 1034 } 1035 1036 int32_t 1037 get_weight(int32_t ref, int pass) 1038 { 1039 weight_t srch; 1040 weight_t *w; 1041 int32_t pri; 1042 1043 pri = resolve_pri(ref); 1044 if (pri & COLLATE_SUBST_PRIORITY) { 1045 return (pri); 1046 } 1047 if (pri <= 0) { 1048 return (pri); 1049 } 1050 srch.pri = pri; 1051 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) { 1052 INTERR; 1053 return (-1); 1054 } 1055 return (w->opt); 1056 } 1057 1058 void 1059 dump_collate(void) 1060 { 1061 FILE *f; 1062 int i, j, n; 1063 size_t sz; 1064 int32_t pri; 1065 collelem_t *ce; 1066 collchar_t *cc; 1067 subst_t *sb; 1068 char vers[COLLATE_STR_LEN]; 1069 collate_char_t chars[UCHAR_MAX + 1]; 1070 collate_large_t *large; 1071 collate_subst_t *subst[COLL_WEIGHTS_MAX]; 1072 collate_chain_t *chain; 1073 1074 /* 1075 * We have to run throught a preliminary pass to identify all the 1076 * weights that we use for each sorting level. 1077 */ 1078 for (i = 0; i < NUM_WT; i++) { 1079 add_weight(pri_ignore, i); 1080 } 1081 for (i = 0; i < NUM_WT; i++) { 1082 for (sb = avl_first(&substs[i]); sb; 1083 sb = AVL_NEXT(&substs[i], sb)) { 1084 for (j = 0; sb->ref[j]; j++) { 1085 add_weight(sb->ref[j], i); 1086 } 1087 } 1088 } 1089 for (ce = avl_first(&elem_by_expand); 1090 ce != NULL; 1091 ce = AVL_NEXT(&elem_by_expand, ce)) { 1092 add_weights(ce->ref); 1093 } 1094 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1095 add_weights(cc->ref); 1096 } 1097 1098 /* 1099 * Now we walk the entire set of weights, removing the gaps 1100 * in the weights. This gives us optimum usage. The walk 1101 * occurs in priority. 1102 */ 1103 for (i = 0; i < NUM_WT; i++) { 1104 weight_t *w; 1105 for (w = avl_first(&weights[i]); w; 1106 w = AVL_NEXT(&weights[i], w)) { 1107 w->opt = nweight[i]; 1108 nweight[i] += 1; 1109 } 1110 } 1111 1112 (void) memset(&chars, 0, sizeof (chars)); 1113 (void) memset(vers, 0, COLLATE_STR_LEN); 1114 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); 1115 1116 /* 1117 * We need to make sure we arrange for the UNDEFINED field 1118 * to show up. Also, set the total weight counts. 1119 */ 1120 for (i = 0; i < NUM_WT; i++) { 1121 if (resolve_pri(pri_undefined[i]) == -1) { 1122 set_pri(pri_undefined[i], -1, RESOLVED); 1123 /* they collate at the end of everything else */ 1124 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; 1125 } 1126 collinfo.pri_count[i] = nweight[i]; 1127 } 1128 1129 collinfo.pri_count[NUM_WT] = max_wide(); 1130 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; 1131 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; 1132 1133 /* 1134 * Ordinary character priorities 1135 */ 1136 for (i = 0; i <= UCHAR_MAX; i++) { 1137 if ((cc = get_collchar(i, 0)) != NULL) { 1138 for (j = 0; j < NUM_WT; j++) { 1139 chars[i].pri[j] = get_weight(cc->ref[j], j); 1140 } 1141 } else { 1142 for (j = 0; j < NUM_WT; j++) { 1143 chars[i].pri[j] = 1144 get_weight(pri_undefined[j], j); 1145 } 1146 /* 1147 * Per POSIX, for undefined characters, we 1148 * also have to add a last item, which is the 1149 * character code. 1150 */ 1151 chars[i].pri[NUM_WT] = i; 1152 } 1153 } 1154 1155 /* 1156 * Substitution tables 1157 */ 1158 for (i = 0; i < NUM_WT; i++) { 1159 collate_subst_t *st = NULL; 1160 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); 1161 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { 1162 errf(_("out of memory")); 1163 return; 1164 } 1165 n = 0; 1166 for (sb = avl_first(&substs[i]); sb; 1167 sb = AVL_NEXT(&substs[i], sb)) { 1168 if ((st[n].key = resolve_pri(sb->key)) < 0) { 1169 /* by definition these resolve! */ 1170 INTERR; 1171 } 1172 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { 1173 INTERR; 1174 } 1175 for (j = 0; sb->ref[j]; j++) { 1176 st[n].pri[j] = get_weight(sb->ref[j], i); 1177 } 1178 n++; 1179 } 1180 if (n != collinfo.subst_count[i]) 1181 INTERR; 1182 subst[i] = st; 1183 } 1184 1185 1186 /* 1187 * Chains, i.e. collating elements 1188 */ 1189 collinfo.chain_count = avl_numnodes(&elem_by_expand); 1190 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count); 1191 if (chain == NULL) { 1192 errf(_("out of memory")); 1193 return; 1194 } 1195 for (n = 0, ce = avl_first(&elem_by_expand); 1196 ce != NULL; 1197 ce = AVL_NEXT(&elem_by_expand, ce), n++) { 1198 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); 1199 for (i = 0; i < NUM_WT; i++) { 1200 chain[n].pri[i] = get_weight(ce->ref[i], i); 1201 } 1202 } 1203 if (n != collinfo.chain_count) 1204 INTERR; 1205 1206 /* 1207 * Large (> UCHAR_MAX) character priorities 1208 */ 1209 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1); 1210 if (large == NULL) { 1211 errf(_("out of memory")); 1212 return; 1213 } 1214 1215 i = 0; 1216 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1217 int undef = 0; 1218 /* we already gathered those */ 1219 if (cc->wc <= UCHAR_MAX) 1220 continue; 1221 for (j = 0; j < NUM_WT; j++) { 1222 if ((pri = get_weight(cc->ref[j], j)) < 0) { 1223 undef = 1; 1224 } 1225 if (undef && (pri >= 0)) { 1226 /* if undefined, then all priorities are */ 1227 INTERR; 1228 } else { 1229 large[i].pri.pri[j] = pri; 1230 } 1231 } 1232 if (!undef) { 1233 large[i].val = cc->wc; 1234 collinfo.large_count = i++; 1235 } 1236 } 1237 1238 if ((f = open_category()) == NULL) { 1239 return; 1240 } 1241 1242 /* Time to write the entire data set out */ 1243 1244 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || 1245 (wr_category(&collinfo, sizeof (collinfo), f) < 0) || 1246 (wr_category(&chars, sizeof (chars), f) < 0)) { 1247 delete_category(f); 1248 return; 1249 } 1250 1251 for (i = 0; i < NUM_WT; i++) { 1252 sz = sizeof (collate_subst_t) * collinfo.subst_count[i]; 1253 if (wr_category(subst[i], sz, f) < 0) { 1254 delete_category(f); 1255 return; 1256 } 1257 } 1258 sz = sizeof (collate_chain_t) * collinfo.chain_count; 1259 if (wr_category(chain, sz, f) < 0) { 1260 delete_category(f); 1261 return; 1262 } 1263 sz = sizeof (collate_large_t) * collinfo.large_count; 1264 if (wr_category(large, sz, f) < 0) { 1265 delete_category(f); 1266 return; 1267 } 1268 1269 close_category(f); 1270 } 1271