1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms version 1.0 5 * of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * LC_COLLATE database generation routines for localedef. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <errno.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <string.h> 27 #include <unistd.h> 28 #include <wchar.h> 29 #include <widec.h> 30 #include <limits.h> 31 #include "localedef.h" 32 #include "parser.tab.h" 33 #include "collate.h" 34 35 /* 36 * Design notes. 37 * 38 * It will be extremely helpful to the reader if they have access to 39 * the localedef and locale file format specifications available. 40 * Latest versions of these are available from www.opengroup.org. 41 * 42 * The design for the collation code is a bit complex. The goal is a 43 * single collation database as described in collate.h (in 44 * libc/port/locale). However, there are some other tidbits: 45 * 46 * a) The substitution entries are now a directly indexable array. A 47 * priority elsewhere in the table is taken as an index into the 48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) 49 * set. (The bit is cleared and the result is the index into the 50 * table. 51 * 52 * b) We eliminate duplicate entries into the substitution table. 53 * This saves a lot of space. 54 * 55 * c) The priorities for each level are "compressed", so that each 56 * sorting level has consecutively numbered priorities starting at 1. 57 * (O is reserved for the ignore priority.) This means sort levels 58 * which only have a few distinct priorities can represent the 59 * priority level in fewer bits, which makes the strxfrm output 60 * smaller. 61 * 62 * d) We record the total number of priorities so that strxfrm can 63 * figure out how many bytes to expand a numeric priority into. 64 * 65 * e) For the UNDEFINED pass (the last pass), we record the maximum 66 * number of bits needed to uniquely prioritize these entries, so that 67 * the last pass can also use smaller strxfrm output when possible. 68 * 69 * f) Priorities with the sign bit set are verboten. This works out 70 * because no active character set needs that bit to carry significant 71 * information once the character is in wide form. 72 * 73 * To process the entire data to make the database, we actually run 74 * multiple passes over the data. 75 * 76 * The first pass, which is done at parse time, identifies elements, 77 * substitutions, and such, and records them in priority order. As 78 * some priorities can refer to other priorities, using forward 79 * references, we use a table of references indicating whether the 80 * priority's value has been resolved, or whether it is still a 81 * reference. 82 * 83 * The second pass walks over all the items in priority order, noting 84 * that they are used directly, and not just an indirect reference. 85 * This is done by creating a "weight" structure for the item. The 86 * weights are stashed in an AVL tree sorted by relative "priority". 87 * 88 * The third pass walks over all the weight structures, in priority 89 * order, and assigns a new monotonically increasing (per sort level) 90 * weight value to them. These are the values that will actually be 91 * written to the file. 92 * 93 * The fourth pass just writes the data out. 94 */ 95 96 /* 97 * In order to resolve the priorities, we create a table of priorities. 98 * Entries in the table can be in one of three states. 99 * 100 * UNKNOWN is for newly allocated entries, and indicates that nothing 101 * is known about the priority. (For example, when new entries are created 102 * for collating-symbols, this is the value assigned for them until the 103 * collating symbol's order has been determined. 104 * 105 * RESOLVED is used for an entry where the priority indicates the final 106 * numeric weight. 107 * 108 * REFER is used for entries that reference other entries. Typically 109 * this is used for forward references. A collating-symbol can never 110 * have this value. 111 * 112 * The "pass" field is used during final resolution to aid in detection 113 * of referencing loops. (For example <A> depends on <B>, but <B> has its 114 * priority dependent on <A>.) 115 */ 116 typedef enum { 117 UNKNOWN, /* priority is totally unknown */ 118 RESOLVED, /* priority value fully resolved */ 119 REFER /* priority is a reference (index) */ 120 } res_t; 121 122 typedef struct weight { 123 int32_t pri; 124 int opt; 125 avl_node_t avl; 126 } weight_t; 127 128 typedef struct priority { 129 res_t res; 130 int32_t pri; 131 int pass; 132 int lineno; 133 } collpri_t; 134 135 #define NUM_WT collinfo.directive_count 136 137 /* 138 * These are the abstract collating symbols, which are just a symbolic 139 * way to reference a priority. 140 */ 141 struct collsym { 142 char *name; 143 int32_t ref; 144 avl_node_t avl; 145 }; 146 147 /* 148 * These are also abstract collating symbols, but we allow them to have 149 * different priorities at different levels. 150 */ 151 typedef struct collundef { 152 char *name; 153 int32_t ref[COLL_WEIGHTS_MAX]; 154 avl_node_t avl; 155 } collundef_t; 156 157 /* 158 * These are called "chains" in libc. This records the fact that two 159 * more characters should be treated as a single collating entity when 160 * they appear together. For example, in Spanish <C><h> gets collated 161 * as a character between <C> and <D>. 162 */ 163 struct collelem { 164 char *symbol; 165 wchar_t *expand; 166 int32_t ref[COLL_WEIGHTS_MAX]; 167 avl_node_t avl_bysymbol; 168 avl_node_t avl_byexpand; 169 }; 170 171 /* 172 * Individual characters have a sequence of weights as well. 173 */ 174 typedef struct collchar { 175 wchar_t wc; 176 int32_t ref[COLL_WEIGHTS_MAX]; 177 avl_node_t avl; 178 } collchar_t; 179 180 /* 181 * Substitution entries. The key is itself a priority. Note that 182 * when we create one of these, we *automatically* wind up with a 183 * fully resolved priority for the key, because creation of 184 * substitutions creates a resolved priority at the same time. 185 */ 186 typedef struct { 187 int32_t key; 188 int32_t ref[COLLATE_STR_LEN]; 189 avl_node_t avl; 190 avl_node_t avl_ref; 191 } subst_t; 192 193 static avl_tree_t collsyms; 194 static avl_tree_t collundefs; 195 static avl_tree_t elem_by_symbol; 196 static avl_tree_t elem_by_expand; 197 static avl_tree_t collchars; 198 static avl_tree_t substs[COLL_WEIGHTS_MAX]; 199 static avl_tree_t substs_ref[COLL_WEIGHTS_MAX]; 200 static avl_tree_t weights[COLL_WEIGHTS_MAX]; 201 static int32_t nweight[COLL_WEIGHTS_MAX]; 202 203 /* 204 * This is state tracking for the ellipsis token. Note that we start 205 * the initial values so that the ellipsis logic will think we got a 206 * magic starting value of NUL. It starts at minus one because the 207 * starting point is exclusive -- i.e. the starting point is not 208 * itself handled by the ellipsis code. 209 */ 210 static int currorder = EOF; 211 static int lastorder = EOF; 212 static collelem_t *currelem; 213 static collchar_t *currchar; 214 static collundef_t *currundef; 215 static wchar_t ellipsis_start = 0; 216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; 217 218 /* 219 * We keep a running tally of weights. 220 */ 221 static int nextpri = 1; 222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; 223 224 /* 225 * This array collects up the weights for each level. 226 */ 227 static int32_t order_weights[COLL_WEIGHTS_MAX]; 228 static int curr_weight = 0; 229 static int32_t subst_weights[COLLATE_STR_LEN]; 230 static int curr_subst = 0; 231 232 /* 233 * Some initial priority values. 234 */ 235 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; 236 static int32_t pri_ignore; 237 238 static collate_info_t collinfo; 239 240 static collpri_t *prilist = NULL; 241 static int numpri = 0; 242 static int maxpri = 0; 243 244 static void start_order(int); 245 246 static int32_t 247 new_pri(void) 248 { 249 int i; 250 251 if (numpri >= maxpri) { 252 maxpri = maxpri ? maxpri * 2 : 1024; 253 prilist = realloc(prilist, sizeof (collpri_t) * maxpri); 254 if (prilist == NULL) { 255 errf(_("out of memory")); 256 return (-1); 257 } 258 for (i = numpri; i < maxpri; i++) { 259 prilist[i].res = UNKNOWN; 260 prilist[i].pri = 0; 261 prilist[i].pass = 0; 262 } 263 } 264 return (numpri++); 265 } 266 267 static collpri_t * 268 get_pri(int32_t ref) 269 { 270 if ((ref < 0) || (ref > numpri)) { 271 INTERR; 272 return (NULL); 273 } 274 return (&prilist[ref]); 275 } 276 277 static void 278 set_pri(int32_t ref, int32_t v, res_t res) 279 { 280 collpri_t *pri; 281 282 pri = get_pri(ref); 283 284 if ((res == REFER) && ((v < 0) || (v >= numpri))) { 285 INTERR; 286 } 287 288 /* Resolve self references */ 289 if ((res == REFER) && (ref == v)) { 290 v = nextpri; 291 res = RESOLVED; 292 } 293 294 if (pri->res != UNKNOWN) { 295 warn(_("repeated item in order list (first on %d)"), 296 pri->lineno); 297 return; 298 } 299 pri->lineno = lineno; 300 pri->pri = v; 301 pri->res = res; 302 } 303 304 static int32_t 305 resolve_pri(int32_t ref) 306 { 307 collpri_t *pri; 308 static int32_t pass = 0; 309 310 pri = get_pri(ref); 311 pass++; 312 while (pri->res == REFER) { 313 if (pri->pass == pass) { 314 /* report a line with the circular symbol */ 315 lineno = pri->lineno; 316 errf(_("circular reference in order list")); 317 return (-1); 318 } 319 if ((pri->pri < 0) || (pri->pri >= numpri)) { 320 INTERR; 321 return (-1); 322 } 323 pri->pass = pass; 324 pri = &prilist[pri->pri]; 325 } 326 327 if (pri->res == UNKNOWN) { 328 return (-1); 329 } 330 if (pri->res != RESOLVED) 331 INTERR; 332 333 return (pri->pri); 334 } 335 336 static int 337 weight_compare(const void *n1, const void *n2) 338 { 339 int32_t k1 = ((const weight_t *)n1)->pri; 340 int32_t k2 = ((const weight_t *)n2)->pri; 341 342 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 343 } 344 345 static int 346 collsym_compare(const void *n1, const void *n2) 347 { 348 const collsym_t *c1 = n1; 349 const collsym_t *c2 = n2; 350 int rv; 351 352 rv = strcmp(c1->name, c2->name); 353 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 354 } 355 356 static int 357 collundef_compare(const void *n1, const void *n2) 358 { 359 const collundef_t *c1 = n1; 360 const collundef_t *c2 = n2; 361 int rv; 362 363 rv = strcmp(c1->name, c2->name); 364 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 365 } 366 367 static int 368 element_compare_symbol(const void *n1, const void *n2) 369 { 370 const collelem_t *c1 = n1; 371 const collelem_t *c2 = n2; 372 int rv; 373 374 rv = strcmp(c1->symbol, c2->symbol); 375 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 376 } 377 378 static int 379 element_compare_expand(const void *n1, const void *n2) 380 { 381 const collelem_t *c1 = n1; 382 const collelem_t *c2 = n2; 383 int rv; 384 385 rv = wcscmp(c1->expand, c2->expand); 386 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 387 } 388 389 static int 390 collchar_compare(const void *n1, const void *n2) 391 { 392 wchar_t k1 = ((const collchar_t *)n1)->wc; 393 wchar_t k2 = ((const collchar_t *)n2)->wc; 394 395 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 396 } 397 398 static int 399 subst_compare(const void *n1, const void *n2) 400 { 401 int32_t k1 = ((const subst_t *)n1)->key; 402 int32_t k2 = ((const subst_t *)n2)->key; 403 404 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 405 } 406 407 static int 408 subst_compare_ref(const void *n1, const void *n2) 409 { 410 int32_t *c1 = ((subst_t *)n1)->ref; 411 int32_t *c2 = ((subst_t *)n2)->ref; 412 int rv; 413 414 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2); 415 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 416 } 417 418 void 419 init_collate(void) 420 { 421 int i; 422 423 avl_create(&collsyms, collsym_compare, sizeof (collsym_t), 424 offsetof(collsym_t, avl)); 425 426 avl_create(&collundefs, collundef_compare, sizeof (collsym_t), 427 offsetof(collundef_t, avl)); 428 429 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t), 430 offsetof(collelem_t, avl_bysymbol)); 431 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t), 432 offsetof(collelem_t, avl_byexpand)); 433 434 avl_create(&collchars, collchar_compare, sizeof (collchar_t), 435 offsetof(collchar_t, avl)); 436 437 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 438 avl_create(&substs[i], subst_compare, sizeof (subst_t), 439 offsetof(subst_t, avl)); 440 avl_create(&substs_ref[i], subst_compare_ref, 441 sizeof (subst_t), offsetof(subst_t, avl_ref)); 442 avl_create(&weights[i], weight_compare, sizeof (weight_t), 443 offsetof(weight_t, avl)); 444 nweight[i] = 1; 445 } 446 447 (void) memset(&collinfo, 0, sizeof (collinfo)); 448 449 /* allocate some initial priorities */ 450 pri_ignore = new_pri(); 451 452 set_pri(pri_ignore, 0, RESOLVED); 453 454 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 455 pri_undefined[i] = new_pri(); 456 457 /* we will override this later */ 458 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); 459 } 460 } 461 462 void 463 define_collsym(char *name) 464 { 465 collsym_t *sym; 466 avl_index_t where; 467 468 if ((sym = calloc(sizeof (*sym), 1)) == NULL) { 469 errf(_("out of memory")); 470 return; 471 } 472 sym->name = name; 473 sym->ref = new_pri(); 474 475 if (avl_find(&collsyms, sym, &where) != NULL) { 476 /* 477 * This should never happen because we are only called 478 * for undefined symbols. 479 */ 480 INTERR; 481 return; 482 } 483 avl_insert(&collsyms, sym, where); 484 } 485 486 collsym_t * 487 lookup_collsym(char *name) 488 { 489 collsym_t srch; 490 491 srch.name = name; 492 return (avl_find(&collsyms, &srch, NULL)); 493 } 494 495 collelem_t * 496 lookup_collelem(char *symbol) 497 { 498 collelem_t srch; 499 500 srch.symbol = symbol; 501 return (avl_find(&elem_by_symbol, &srch, NULL)); 502 } 503 504 static collundef_t * 505 get_collundef(char *name) 506 { 507 collundef_t srch; 508 collundef_t *ud; 509 avl_index_t where; 510 int i; 511 512 srch.name = name; 513 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) { 514 if (((ud = calloc(sizeof (*ud), 1)) == NULL) || 515 ((ud->name = strdup(name)) == NULL)) { 516 errf(_("out of memory")); 517 return (NULL); 518 } 519 for (i = 0; i < NUM_WT; i++) { 520 ud->ref[i] = new_pri(); 521 } 522 avl_insert(&collundefs, ud, where); 523 } 524 add_charmap_undefined(name); 525 return (ud); 526 } 527 528 static collchar_t * 529 get_collchar(wchar_t wc, int create) 530 { 531 collchar_t srch; 532 collchar_t *cc; 533 avl_index_t where; 534 int i; 535 536 srch.wc = wc; 537 cc = avl_find(&collchars, &srch, &where); 538 if ((cc == NULL) && create) { 539 if ((cc = calloc(sizeof (*cc), 1)) == NULL) { 540 errf(_("out of memory")); 541 return (NULL); 542 } 543 for (i = 0; i < NUM_WT; i++) { 544 cc->ref[i] = new_pri(); 545 } 546 cc->wc = wc; 547 avl_insert(&collchars, cc, where); 548 } 549 return (cc); 550 } 551 552 void 553 end_order_collsym(collsym_t *sym) 554 { 555 start_order(T_COLLSYM); 556 /* update the weight */ 557 558 set_pri(sym->ref, nextpri, RESOLVED); 559 nextpri++; 560 } 561 562 void 563 end_order(void) 564 { 565 int i; 566 int32_t pri; 567 int32_t ref; 568 collpri_t *p; 569 570 /* advance the priority/weight */ 571 pri = nextpri; 572 573 switch (currorder) { 574 case T_CHAR: 575 for (i = 0; i < NUM_WT; i++) { 576 if (((ref = order_weights[i]) < 0) || 577 ((p = get_pri(ref)) == NULL) || 578 (p->pri == -1)) { 579 /* unspecified weight is a self reference */ 580 set_pri(currchar->ref[i], pri, RESOLVED); 581 } else { 582 set_pri(currchar->ref[i], ref, REFER); 583 } 584 order_weights[i] = -1; 585 } 586 587 /* leave a cookie trail in case next symbol is ellipsis */ 588 ellipsis_start = currchar->wc + 1; 589 currchar = NULL; 590 break; 591 592 case T_ELLIPSIS: 593 /* save off the weights were we can find them */ 594 for (i = 0; i < NUM_WT; i++) { 595 ellipsis_weights[i] = order_weights[i]; 596 order_weights[i] = -1; 597 } 598 break; 599 600 case T_COLLELEM: 601 if (currelem == NULL) { 602 INTERR; 603 } else { 604 for (i = 0; i < NUM_WT; i++) { 605 606 if (((ref = order_weights[i]) < 0) || 607 ((p = get_pri(ref)) == NULL) || 608 (p->pri == -1)) { 609 set_pri(currelem->ref[i], pri, 610 RESOLVED); 611 } else { 612 set_pri(currelem->ref[i], ref, REFER); 613 } 614 order_weights[i] = -1; 615 } 616 } 617 break; 618 619 case T_UNDEFINED: 620 for (i = 0; i < NUM_WT; i++) { 621 if (((ref = order_weights[i]) < 0) || 622 ((p = get_pri(ref)) == NULL) || 623 (p->pri == -1)) { 624 set_pri(pri_undefined[i], -1, RESOLVED); 625 } else { 626 set_pri(pri_undefined[i], ref, REFER); 627 } 628 order_weights[i] = -1; 629 } 630 break; 631 632 case T_SYMBOL: 633 if (((ref = order_weights[i]) < 0) || 634 ((p = get_pri(ref)) == NULL) || 635 (p->pri == -1)) { 636 set_pri(currundef->ref[i], pri, RESOLVED); 637 } else { 638 set_pri(currundef->ref[i], ref, REFER); 639 } 640 order_weights[i] = -1; 641 break; 642 643 default: 644 INTERR; 645 } 646 647 nextpri++; 648 } 649 650 static void 651 start_order(int type) 652 { 653 int i; 654 655 lastorder = currorder; 656 currorder = type; 657 658 /* this is used to protect ELLIPSIS processing */ 659 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { 660 errf(_("character value expected")); 661 } 662 663 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 664 order_weights[i] = -1; 665 } 666 curr_weight = 0; 667 } 668 669 void 670 start_order_undefined(void) 671 { 672 start_order(T_UNDEFINED); 673 } 674 675 void 676 start_order_symbol(char *name) 677 { 678 currundef = get_collundef(name); 679 start_order(T_SYMBOL); 680 } 681 682 void 683 start_order_char(wchar_t wc) 684 { 685 collchar_t *cc; 686 int32_t ref; 687 688 start_order(T_CHAR); 689 690 /* 691 * If we last saw an ellipsis, then we need to close the range. 692 * Handle that here. Note that we have to be careful because the 693 * items *inside* the range are treated exclusiveley to the items 694 * outside of the range. The ends of the range can have quite 695 * different weights than the range members. 696 */ 697 if (lastorder == T_ELLIPSIS) { 698 int i; 699 700 if (wc < ellipsis_start) { 701 errf(_("malformed range!")); 702 return; 703 } 704 while (ellipsis_start < wc) { 705 /* 706 * pick all of the saved weights for the 707 * ellipsis. note that -1 encodes for the 708 * ellipsis itself, which means to take the 709 * current relative priority. 710 */ 711 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { 712 INTERR; 713 return; 714 } 715 for (i = 0; i < NUM_WT; i++) { 716 collpri_t *p; 717 if (((ref = ellipsis_weights[i]) == -1) || 718 ((p = get_pri(ref)) == NULL) || 719 (p->pri == -1)) { 720 set_pri(cc->ref[i], nextpri, RESOLVED); 721 } else { 722 set_pri(cc->ref[i], ref, REFER); 723 } 724 ellipsis_weights[i] = NULL; 725 } 726 ellipsis_start++; 727 nextpri++; 728 } 729 } 730 731 currchar = get_collchar(wc, 1); 732 } 733 734 void 735 start_order_collelem(collelem_t *e) 736 { 737 start_order(T_COLLELEM); 738 currelem = e; 739 } 740 741 void 742 start_order_ellipsis(void) 743 { 744 int i; 745 746 start_order(T_ELLIPSIS); 747 748 if (lastorder != T_CHAR) { 749 errf(_("illegal starting point for range")); 750 return; 751 } 752 753 for (i = 0; i < NUM_WT; i++) { 754 ellipsis_weights[i] = order_weights[i]; 755 } 756 } 757 758 void 759 define_collelem(char *name, wchar_t *wcs) 760 { 761 collelem_t *e; 762 avl_index_t where1; 763 avl_index_t where2; 764 int i; 765 766 if (wcslen(wcs) >= COLLATE_STR_LEN) { 767 errf(_("expanded collation element too long")); 768 return; 769 } 770 771 if ((e = calloc(sizeof (*e), 1)) == NULL) { 772 errf(_("out of memory")); 773 return; 774 } 775 e->expand = wcs; 776 e->symbol = name; 777 778 /* 779 * This is executed before the order statement, so we don't 780 * know how many priorities we *really* need. We allocate one 781 * for each possible weight. Not a big deal, as collating-elements 782 * prove to be quite rare. 783 */ 784 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 785 e->ref[i] = new_pri(); 786 } 787 788 /* A character sequence can only reduce to one element. */ 789 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) || 790 (avl_find(&elem_by_expand, e, &where2) != NULL)) { 791 errf(_("duplicate collating element definition")); 792 return; 793 } 794 avl_insert(&elem_by_symbol, e, where1); 795 avl_insert(&elem_by_expand, e, where2); 796 } 797 798 void 799 add_order_bit(int kw) 800 { 801 uint8_t bit = DIRECTIVE_UNDEF; 802 803 switch (kw) { 804 case T_FORWARD: 805 bit = DIRECTIVE_FORWARD; 806 break; 807 case T_BACKWARD: 808 bit = DIRECTIVE_BACKWARD; 809 break; 810 case T_POSITION: 811 bit = DIRECTIVE_POSITION; 812 break; 813 default: 814 INTERR; 815 break; 816 } 817 collinfo.directive[collinfo.directive_count] |= bit; 818 } 819 820 void 821 add_order_directive(void) 822 { 823 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { 824 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX); 825 } 826 collinfo.directive_count++; 827 } 828 829 static void 830 add_order_pri(int32_t ref) 831 { 832 if (curr_weight >= NUM_WT) { 833 errf(_("too many weights (max %d)"), NUM_WT); 834 return; 835 } 836 order_weights[curr_weight] = ref; 837 curr_weight++; 838 } 839 840 void 841 add_order_collsym(collsym_t *s) 842 { 843 add_order_pri(s->ref); 844 } 845 846 void 847 add_order_char(wchar_t wc) 848 { 849 collchar_t *cc; 850 851 if ((cc = get_collchar(wc, 1)) == NULL) { 852 INTERR; 853 return; 854 } 855 856 add_order_pri(cc->ref[curr_weight]); 857 } 858 859 void 860 add_order_collelem(collelem_t *e) 861 { 862 add_order_pri(e->ref[curr_weight]); 863 } 864 865 void 866 add_order_ignore(void) 867 { 868 add_order_pri(pri_ignore); 869 } 870 871 void 872 add_order_symbol(char *sym) 873 { 874 collundef_t *c; 875 if ((c = get_collundef(sym)) == NULL) { 876 INTERR; 877 return; 878 } 879 add_order_pri(c->ref[curr_weight]); 880 } 881 882 void 883 add_order_ellipsis(void) 884 { 885 /* special NULL value indicates self reference */ 886 add_order_pri(NULL); 887 } 888 889 void 890 add_order_subst(void) 891 { 892 subst_t srch; 893 subst_t *s; 894 avl_index_t where; 895 int i; 896 897 (void) memset(&srch, 0, sizeof (srch)); 898 for (i = 0; i < curr_subst; i++) { 899 srch.ref[i] = subst_weights[i]; 900 subst_weights[i] = 0; 901 } 902 s = avl_find(&substs_ref[curr_weight], &srch, &where); 903 904 if (s == NULL) { 905 if ((s = calloc(sizeof (*s), 1)) == NULL) { 906 errf(_("out of memory")); 907 return; 908 } 909 s->key = new_pri(); 910 911 /* 912 * We use a self reference for our key, but we set a 913 * high bit to indicate that this is a substitution 914 * reference. This will expedite table lookups later, 915 * and prevent table lookups for situations that don't 916 * require it. (In short, its a big win, because we 917 * can skip a lot of binary searching.) 918 */ 919 set_pri(s->key, 920 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), 921 RESOLVED); 922 nextsubst[curr_weight] += 1; 923 924 for (i = 0; i < curr_subst; i++) { 925 s->ref[i] = srch.ref[i]; 926 } 927 928 avl_insert(&substs_ref[curr_weight], s, where); 929 930 if (avl_find(&substs[curr_weight], s, &where) != NULL) { 931 INTERR; 932 return; 933 } 934 avl_insert(&substs[curr_weight], s, where); 935 } 936 curr_subst = 0; 937 938 939 /* 940 * We are using the current (unique) priority as a search key 941 * in the substitution table. 942 */ 943 add_order_pri(s->key); 944 } 945 946 static void 947 add_subst_pri(int32_t ref) 948 { 949 if (curr_subst >= COLLATE_STR_LEN) { 950 errf(_("substitution string is too long")); 951 return; 952 } 953 subst_weights[curr_subst] = ref; 954 curr_subst++; 955 } 956 957 void 958 add_subst_char(wchar_t wc) 959 { 960 collchar_t *cc; 961 962 963 if (((cc = get_collchar(wc, 1)) == NULL) || 964 (cc->wc != wc)) { 965 INTERR; 966 return; 967 } 968 /* we take the weight for the character at that position */ 969 add_subst_pri(cc->ref[curr_weight]); 970 } 971 972 void 973 add_subst_collelem(collelem_t *e) 974 { 975 add_subst_pri(e->ref[curr_weight]); 976 } 977 978 void 979 add_subst_collsym(collsym_t *s) 980 { 981 add_subst_pri(s->ref); 982 } 983 984 void 985 add_subst_symbol(char *ptr) 986 { 987 collundef_t *cu; 988 989 if ((cu = get_collundef(ptr)) != NULL) { 990 add_subst_pri(cu->ref[curr_weight]); 991 } 992 } 993 994 void 995 add_weight(int32_t ref, int pass) 996 { 997 weight_t srch; 998 weight_t *w; 999 avl_index_t where; 1000 1001 srch.pri = resolve_pri(ref); 1002 1003 /* No translation of ignores */ 1004 if (srch.pri == 0) 1005 return; 1006 1007 /* Substitution priorities are not weights */ 1008 if (srch.pri & COLLATE_SUBST_PRIORITY) 1009 return; 1010 1011 if (avl_find(&weights[pass], &srch, &where) != NULL) 1012 return; 1013 1014 if ((w = calloc(sizeof (*w), 1)) == NULL) { 1015 errf(_("out of memory")); 1016 return; 1017 } 1018 w->pri = srch.pri; 1019 avl_insert(&weights[pass], w, where); 1020 } 1021 1022 void 1023 add_weights(int32_t *refs) 1024 { 1025 int i; 1026 for (i = 0; i < NUM_WT; i++) { 1027 add_weight(refs[i], i); 1028 } 1029 } 1030 1031 int32_t 1032 get_weight(int32_t ref, int pass) 1033 { 1034 weight_t srch; 1035 weight_t *w; 1036 int32_t pri; 1037 1038 pri = resolve_pri(ref); 1039 if (pri & COLLATE_SUBST_PRIORITY) { 1040 return (pri); 1041 } 1042 if (pri <= 0) { 1043 return (pri); 1044 } 1045 srch.pri = pri; 1046 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) { 1047 INTERR; 1048 return (-1); 1049 } 1050 return (w->opt); 1051 } 1052 1053 void 1054 dump_collate(void) 1055 { 1056 FILE *f; 1057 int i, j, n; 1058 size_t sz; 1059 int32_t pri; 1060 collelem_t *ce; 1061 collchar_t *cc; 1062 subst_t *sb; 1063 char vers[COLLATE_STR_LEN]; 1064 collate_char_t chars[UCHAR_MAX + 1]; 1065 collate_large_t *large; 1066 collate_subst_t *subst[COLL_WEIGHTS_MAX]; 1067 collate_chain_t *chain; 1068 1069 /* 1070 * We have to run throught a preliminary pass to identify all the 1071 * weights that we use for each sorting level. 1072 */ 1073 for (i = 0; i < NUM_WT; i++) { 1074 add_weight(pri_ignore, i); 1075 } 1076 for (i = 0; i < NUM_WT; i++) { 1077 for (sb = avl_first(&substs[i]); sb; 1078 sb = AVL_NEXT(&substs[i], sb)) { 1079 for (j = 0; sb->ref[j]; j++) { 1080 add_weight(sb->ref[j], i); 1081 } 1082 } 1083 } 1084 for (ce = avl_first(&elem_by_expand); 1085 ce != NULL; 1086 ce = AVL_NEXT(&elem_by_expand, ce)) { 1087 add_weights(ce->ref); 1088 } 1089 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1090 add_weights(cc->ref); 1091 } 1092 1093 /* 1094 * Now we walk the entire set of weights, removing the gaps 1095 * in the weights. This gives us optimum usage. The walk 1096 * occurs in priority. 1097 */ 1098 for (i = 0; i < NUM_WT; i++) { 1099 weight_t *w; 1100 for (w = avl_first(&weights[i]); w; 1101 w = AVL_NEXT(&weights[i], w)) { 1102 w->opt = nweight[i]; 1103 nweight[i] += 1; 1104 } 1105 } 1106 1107 (void) memset(&chars, 0, sizeof (chars)); 1108 (void) memset(vers, 0, COLLATE_STR_LEN); 1109 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); 1110 1111 /* 1112 * We need to make sure we arrange for the UNDEFINED field 1113 * to show up. Also, set the total weight counts. 1114 */ 1115 for (i = 0; i < NUM_WT; i++) { 1116 if (resolve_pri(pri_undefined[i]) == -1) { 1117 set_pri(pri_undefined[i], -1, RESOLVED); 1118 /* they collate at the end of everything else */ 1119 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; 1120 } 1121 collinfo.pri_count[i] = nweight[i]; 1122 } 1123 1124 collinfo.pri_count[NUM_WT] = max_wide(); 1125 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; 1126 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; 1127 1128 /* 1129 * Ordinary character priorities 1130 */ 1131 for (i = 0; i <= UCHAR_MAX; i++) { 1132 if ((cc = get_collchar(i, 0)) != NULL) { 1133 for (j = 0; j < NUM_WT; j++) { 1134 chars[i].pri[j] = get_weight(cc->ref[j], j); 1135 } 1136 } else { 1137 for (j = 0; j < NUM_WT; j++) { 1138 chars[i].pri[j] = 1139 get_weight(pri_undefined[j], j); 1140 } 1141 /* 1142 * Per POSIX, for undefined characters, we 1143 * also have to add a last item, which is the 1144 * character code. 1145 */ 1146 chars[i].pri[NUM_WT] = i; 1147 } 1148 } 1149 1150 /* 1151 * Substitution tables 1152 */ 1153 for (i = 0; i < NUM_WT; i++) { 1154 collate_subst_t *st = NULL; 1155 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); 1156 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { 1157 errf(_("out of memory")); 1158 return; 1159 } 1160 n = 0; 1161 for (sb = avl_first(&substs[i]); sb; 1162 sb = AVL_NEXT(&substs[i], sb)) { 1163 if ((st[n].key = resolve_pri(sb->key)) < 0) { 1164 /* by definition these resolve! */ 1165 INTERR; 1166 } 1167 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { 1168 INTERR; 1169 } 1170 for (j = 0; sb->ref[j]; j++) { 1171 st[n].pri[j] = get_weight(sb->ref[j], i); 1172 } 1173 n++; 1174 } 1175 if (n != collinfo.subst_count[i]) 1176 INTERR; 1177 subst[i] = st; 1178 } 1179 1180 1181 /* 1182 * Chains, i.e. collating elements 1183 */ 1184 collinfo.chain_count = avl_numnodes(&elem_by_expand); 1185 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count); 1186 if (chain == NULL) { 1187 errf(_("out of memory")); 1188 return; 1189 } 1190 for (n = 0, ce = avl_first(&elem_by_expand); 1191 ce != NULL; 1192 ce = AVL_NEXT(&elem_by_expand, ce), n++) { 1193 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); 1194 for (i = 0; i < NUM_WT; i++) { 1195 chain[n].pri[i] = get_weight(ce->ref[i], i); 1196 } 1197 } 1198 if (n != collinfo.chain_count) 1199 INTERR; 1200 1201 /* 1202 * Large (> UCHAR_MAX) character priorities 1203 */ 1204 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1); 1205 if (large == NULL) { 1206 errf(_("out of memory")); 1207 return; 1208 } 1209 1210 i = 0; 1211 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1212 int undef = 0; 1213 /* we already gathered those */ 1214 if (cc->wc <= UCHAR_MAX) 1215 continue; 1216 for (j = 0; j < NUM_WT; j++) { 1217 if ((pri = get_weight(cc->ref[j], j)) < 0) { 1218 undef = 1; 1219 } 1220 if (undef && (pri >= 0)) { 1221 /* if undefined, then all priorities are */ 1222 INTERR; 1223 } else { 1224 large[i].pri.pri[j] = pri; 1225 } 1226 } 1227 if (!undef) { 1228 large[i].val = cc->wc; 1229 collinfo.large_count = i++; 1230 } 1231 } 1232 1233 if ((f = open_category()) == NULL) { 1234 return; 1235 } 1236 1237 /* Time to write the entire data set out */ 1238 1239 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || 1240 (wr_category(&collinfo, sizeof (collinfo), f) < 0) || 1241 (wr_category(&chars, sizeof (chars), f) < 0)) { 1242 return; 1243 } 1244 1245 for (i = 0; i < NUM_WT; i++) { 1246 sz = sizeof (collate_subst_t) * collinfo.subst_count[i]; 1247 if (wr_category(subst[i], sz, f) < 0) { 1248 return; 1249 } 1250 } 1251 sz = sizeof (collate_chain_t) * collinfo.chain_count; 1252 if (wr_category(chain, sz, f) < 0) { 1253 return; 1254 } 1255 sz = sizeof (collate_large_t) * collinfo.large_count; 1256 if (wr_category(large, sz, f) < 0) { 1257 return; 1258 } 1259 1260 close_category(f); 1261 } 1262