1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 14 */ 15 16 /* 17 * LC_COLLATE database generation routines for localedef. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <errno.h> 23 #include <string.h> 24 #include <sys/types.h> 25 #include <sys/avl.h> 26 #include <string.h> 27 #include <unistd.h> 28 #include <wchar.h> 29 #include <widec.h> 30 #include <limits.h> 31 #include "localedef.h" 32 #include "parser.tab.h" 33 #include "collate.h" 34 35 /* 36 * Design notes. 37 * 38 * It will be extremely helpful to the reader if they have access to 39 * the localedef and locale file format specifications available. 40 * Latest versions of these are available from www.opengroup.org. 41 * 42 * The design for the collation code is a bit complex. The goal is a 43 * single collation database as described in collate.h (in 44 * libc/port/locale). However, there are some other tidbits: 45 * 46 * a) The substitution entries are now a directly indexable array. A 47 * priority elsewhere in the table is taken as an index into the 48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) 49 * set. (The bit is cleared and the result is the index into the 50 * table. 51 * 52 * b) We eliminate duplicate entries into the substitution table. 53 * This saves a lot of space. 54 * 55 * c) The priorities for each level are "compressed", so that each 56 * sorting level has consecutively numbered priorities starting at 1. 57 * (O is reserved for the ignore priority.) This means sort levels 58 * which only have a few distinct priorities can represent the 59 * priority level in fewer bits, which makes the strxfrm output 60 * smaller. 61 * 62 * d) We record the total number of priorities so that strxfrm can 63 * figure out how many bytes to expand a numeric priority into. 64 * 65 * e) For the UNDEFINED pass (the last pass), we record the maximum 66 * number of bits needed to uniquely prioritize these entries, so that 67 * the last pass can also use smaller strxfrm output when possible. 68 * 69 * f) Priorities with the sign bit set are verboten. This works out 70 * because no active character set needs that bit to carry significant 71 * information once the character is in wide form. 72 * 73 * To process the entire data to make the database, we actually run 74 * multiple passes over the data. 75 * 76 * The first pass, which is done at parse time, identifies elements, 77 * substitutions, and such, and records them in priority order. As 78 * some priorities can refer to other priorities, using forward 79 * references, we use a table of references indicating whether the 80 * priority's value has been resolved, or whether it is still a 81 * reference. 82 * 83 * The second pass walks over all the items in priority order, noting 84 * that they are used directly, and not just an indirect reference. 85 * This is done by creating a "weight" structure for the item. The 86 * weights are stashed in an AVL tree sorted by relative "priority". 87 * 88 * The third pass walks over all the weight structures, in priority 89 * order, and assigns a new monotonically increasing (per sort level) 90 * weight value to them. These are the values that will actually be 91 * written to the file. 92 * 93 * The fourth pass just writes the data out. 94 */ 95 96 /* 97 * In order to resolve the priorities, we create a table of priorities. 98 * Entries in the table can be in one of three states. 99 * 100 * UNKNOWN is for newly allocated entries, and indicates that nothing 101 * is known about the priority. (For example, when new entries are created 102 * for collating-symbols, this is the value assigned for them until the 103 * collating symbol's order has been determined. 104 * 105 * RESOLVED is used for an entry where the priority indicates the final 106 * numeric weight. 107 * 108 * REFER is used for entries that reference other entries. Typically 109 * this is used for forward references. A collating-symbol can never 110 * have this value. 111 * 112 * The "pass" field is used during final resolution to aid in detection 113 * of referencing loops. (For example <A> depends on <B>, but <B> has its 114 * priority dependent on <A>.) 115 */ 116 typedef enum { 117 UNKNOWN, /* priority is totally unknown */ 118 RESOLVED, /* priority value fully resolved */ 119 REFER /* priority is a reference (index) */ 120 } res_t; 121 122 typedef struct weight { 123 int32_t pri; 124 int opt; 125 avl_node_t avl; 126 } weight_t; 127 128 typedef struct priority { 129 res_t res; 130 int32_t pri; 131 int pass; 132 int lineno; 133 } collpri_t; 134 135 #define NUM_WT collinfo.directive_count 136 137 /* 138 * These are the abstract collating symbols, which are just a symbolic 139 * way to reference a priority. 140 */ 141 struct collsym { 142 char *name; 143 int32_t ref; 144 avl_node_t avl; 145 }; 146 147 /* 148 * These are also abstract collating symbols, but we allow them to have 149 * different priorities at different levels. 150 */ 151 typedef struct collundef { 152 char *name; 153 int32_t ref[COLL_WEIGHTS_MAX]; 154 avl_node_t avl; 155 } collundef_t; 156 157 /* 158 * These are called "chains" in libc. This records the fact that two 159 * more characters should be treated as a single collating entity when 160 * they appear together. For example, in Spanish <C><h> gets collated 161 * as a character between <C> and <D>. 162 */ 163 struct collelem { 164 char *symbol; 165 wchar_t *expand; 166 int32_t ref[COLL_WEIGHTS_MAX]; 167 avl_node_t avl_bysymbol; 168 avl_node_t avl_byexpand; 169 }; 170 171 /* 172 * Individual characters have a sequence of weights as well. 173 */ 174 typedef struct collchar { 175 wchar_t wc; 176 int32_t ref[COLL_WEIGHTS_MAX]; 177 avl_node_t avl; 178 } collchar_t; 179 180 /* 181 * Substitution entries. The key is itself a priority. Note that 182 * when we create one of these, we *automatically* wind up with a 183 * fully resolved priority for the key, because creation of 184 * substitutions creates a resolved priority at the same time. 185 */ 186 typedef struct { 187 int32_t key; 188 int32_t ref[COLLATE_STR_LEN]; 189 avl_node_t avl; 190 avl_node_t avl_ref; 191 } subst_t; 192 193 static avl_tree_t collsyms; 194 static avl_tree_t collundefs; 195 static avl_tree_t elem_by_symbol; 196 static avl_tree_t elem_by_expand; 197 static avl_tree_t collchars; 198 static avl_tree_t substs[COLL_WEIGHTS_MAX]; 199 static avl_tree_t substs_ref[COLL_WEIGHTS_MAX]; 200 static avl_tree_t weights[COLL_WEIGHTS_MAX]; 201 static int32_t nweight[COLL_WEIGHTS_MAX]; 202 203 /* 204 * This is state tracking for the ellipsis token. Note that we start 205 * the initial values so that the ellipsis logic will think we got a 206 * magic starting value of NUL. It starts at minus one because the 207 * starting point is exclusive -- i.e. the starting point is not 208 * itself handled by the ellipsis code. 209 */ 210 static int currorder = EOF; 211 static int lastorder = EOF; 212 static collelem_t *currelem; 213 static collchar_t *currchar; 214 static collundef_t *currundef; 215 static wchar_t ellipsis_start = 0; 216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; 217 218 /* 219 * We keep a running tally of weights. 220 */ 221 static int nextpri = 1; 222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; 223 224 /* 225 * This array collects up the weights for each level. 226 */ 227 static int32_t order_weights[COLL_WEIGHTS_MAX]; 228 static int curr_weight = 0; 229 static int32_t subst_weights[COLLATE_STR_LEN]; 230 static int curr_subst = 0; 231 232 /* 233 * Some initial priority values. 234 */ 235 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; 236 static int32_t pri_ignore; 237 238 static collate_info_t collinfo; 239 240 static collpri_t *prilist = NULL; 241 static int numpri = 0; 242 static int maxpri = 0; 243 244 static void start_order(int); 245 246 static int32_t 247 new_pri(void) 248 { 249 int i; 250 251 if (numpri >= maxpri) { 252 maxpri = maxpri ? maxpri * 2 : 1024; 253 prilist = realloc(prilist, sizeof (collpri_t) * maxpri); 254 if (prilist == NULL) { 255 errf(_("out of memory")); 256 return (-1); 257 } 258 for (i = numpri; i < maxpri; i++) { 259 prilist[i].res = UNKNOWN; 260 prilist[i].pri = 0; 261 prilist[i].pass = 0; 262 } 263 } 264 return (numpri++); 265 } 266 267 static collpri_t * 268 get_pri(int32_t ref) 269 { 270 if ((ref < 0) || (ref > numpri)) { 271 INTERR; 272 return (NULL); 273 } 274 return (&prilist[ref]); 275 } 276 277 static void 278 set_pri(int32_t ref, int32_t v, res_t res) 279 { 280 collpri_t *pri; 281 282 pri = get_pri(ref); 283 284 if ((res == REFER) && ((v < 0) || (v >= numpri))) { 285 INTERR; 286 } 287 288 /* Resolve self references */ 289 if ((res == REFER) && (ref == v)) { 290 v = nextpri; 291 res = RESOLVED; 292 } 293 294 if (pri->res != UNKNOWN) { 295 warn(_("repeated item in order list (first on %d)"), 296 pri->lineno); 297 return; 298 } 299 pri->lineno = lineno; 300 pri->pri = v; 301 pri->res = res; 302 } 303 304 static int32_t 305 resolve_pri(int32_t ref) 306 { 307 collpri_t *pri; 308 static int32_t pass = 0; 309 310 pri = get_pri(ref); 311 pass++; 312 while (pri->res == REFER) { 313 if (pri->pass == pass) { 314 /* report a line with the circular symbol */ 315 lineno = pri->lineno; 316 errf(_("circular reference in order list")); 317 return (-1); 318 } 319 if ((pri->pri < 0) || (pri->pri >= numpri)) { 320 INTERR; 321 return (-1); 322 } 323 pri->pass = pass; 324 pri = &prilist[pri->pri]; 325 } 326 327 if (pri->res == UNKNOWN) { 328 return (-1); 329 } 330 if (pri->res != RESOLVED) 331 INTERR; 332 333 return (pri->pri); 334 } 335 336 static int 337 weight_compare(const void *n1, const void *n2) 338 { 339 int32_t k1 = ((const weight_t *)n1)->pri; 340 int32_t k2 = ((const weight_t *)n2)->pri; 341 342 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 343 } 344 345 static int 346 collsym_compare(const void *n1, const void *n2) 347 { 348 const collsym_t *c1 = n1; 349 const collsym_t *c2 = n2; 350 int rv; 351 352 rv = strcmp(c1->name, c2->name); 353 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 354 } 355 356 static int 357 collundef_compare(const void *n1, const void *n2) 358 { 359 const collundef_t *c1 = n1; 360 const collundef_t *c2 = n2; 361 int rv; 362 363 rv = strcmp(c1->name, c2->name); 364 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 365 } 366 367 static int 368 element_compare_symbol(const void *n1, const void *n2) 369 { 370 const collelem_t *c1 = n1; 371 const collelem_t *c2 = n2; 372 int rv; 373 374 rv = strcmp(c1->symbol, c2->symbol); 375 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 376 } 377 378 static int 379 element_compare_expand(const void *n1, const void *n2) 380 { 381 const collelem_t *c1 = n1; 382 const collelem_t *c2 = n2; 383 int rv; 384 385 rv = wcscmp(c1->expand, c2->expand); 386 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 387 } 388 389 static int 390 collchar_compare(const void *n1, const void *n2) 391 { 392 wchar_t k1 = ((const collchar_t *)n1)->wc; 393 wchar_t k2 = ((const collchar_t *)n2)->wc; 394 395 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 396 } 397 398 static int 399 subst_compare(const void *n1, const void *n2) 400 { 401 int32_t k1 = ((const subst_t *)n1)->key; 402 int32_t k2 = ((const subst_t *)n2)->key; 403 404 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 405 } 406 407 static int 408 subst_compare_ref(const void *n1, const void *n2) 409 { 410 int32_t *c1 = ((subst_t *)n1)->ref; 411 int32_t *c2 = ((subst_t *)n2)->ref; 412 int rv; 413 414 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2); 415 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 416 } 417 418 void 419 init_collate(void) 420 { 421 int i; 422 423 avl_create(&collsyms, collsym_compare, sizeof (collsym_t), 424 offsetof(collsym_t, avl)); 425 426 avl_create(&collundefs, collundef_compare, sizeof (collsym_t), 427 offsetof(collundef_t, avl)); 428 429 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t), 430 offsetof(collelem_t, avl_bysymbol)); 431 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t), 432 offsetof(collelem_t, avl_byexpand)); 433 434 avl_create(&collchars, collchar_compare, sizeof (collchar_t), 435 offsetof(collchar_t, avl)); 436 437 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 438 avl_create(&substs[i], subst_compare, sizeof (subst_t), 439 offsetof(subst_t, avl)); 440 avl_create(&substs_ref[i], subst_compare_ref, 441 sizeof (subst_t), offsetof(subst_t, avl_ref)); 442 avl_create(&weights[i], weight_compare, sizeof (weight_t), 443 offsetof(weight_t, avl)); 444 nweight[i] = 1; 445 } 446 447 (void) memset(&collinfo, 0, sizeof (collinfo)); 448 449 /* allocate some initial priorities */ 450 pri_ignore = new_pri(); 451 452 set_pri(pri_ignore, 0, RESOLVED); 453 454 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 455 pri_undefined[i] = new_pri(); 456 457 /* we will override this later */ 458 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); 459 } 460 } 461 462 void 463 define_collsym(char *name) 464 { 465 collsym_t *sym; 466 avl_index_t where; 467 468 if ((sym = calloc(sizeof (*sym), 1)) == NULL) { 469 errf(_("out of memory")); 470 return; 471 } 472 sym->name = name; 473 sym->ref = new_pri(); 474 475 if (avl_find(&collsyms, sym, &where) != NULL) { 476 /* 477 * This should never happen because we are only called 478 * for undefined symbols. 479 */ 480 INTERR; 481 return; 482 } 483 avl_insert(&collsyms, sym, where); 484 } 485 486 collsym_t * 487 lookup_collsym(char *name) 488 { 489 collsym_t srch; 490 491 srch.name = name; 492 return (avl_find(&collsyms, &srch, NULL)); 493 } 494 495 collelem_t * 496 lookup_collelem(char *symbol) 497 { 498 collelem_t srch; 499 500 srch.symbol = symbol; 501 return (avl_find(&elem_by_symbol, &srch, NULL)); 502 } 503 504 static collundef_t * 505 get_collundef(char *name) 506 { 507 collundef_t srch; 508 collundef_t *ud; 509 avl_index_t where; 510 int i; 511 512 srch.name = name; 513 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) { 514 if (((ud = calloc(sizeof (*ud), 1)) == NULL) || 515 ((ud->name = strdup(name)) == NULL)) { 516 errf(_("out of memory")); 517 return (NULL); 518 } 519 for (i = 0; i < NUM_WT; i++) { 520 ud->ref[i] = new_pri(); 521 } 522 avl_insert(&collundefs, ud, where); 523 } 524 add_charmap_undefined(name); 525 return (ud); 526 } 527 528 static collchar_t * 529 get_collchar(wchar_t wc, int create) 530 { 531 collchar_t srch; 532 collchar_t *cc; 533 avl_index_t where; 534 int i; 535 536 srch.wc = wc; 537 cc = avl_find(&collchars, &srch, &where); 538 if ((cc == NULL) && create) { 539 if ((cc = calloc(sizeof (*cc), 1)) == NULL) { 540 errf(_("out of memory")); 541 return (NULL); 542 } 543 for (i = 0; i < NUM_WT; i++) { 544 cc->ref[i] = new_pri(); 545 } 546 cc->wc = wc; 547 avl_insert(&collchars, cc, where); 548 } 549 return (cc); 550 } 551 552 void 553 end_order_collsym(collsym_t *sym) 554 { 555 start_order(T_COLLSYM); 556 /* update the weight */ 557 558 set_pri(sym->ref, nextpri, RESOLVED); 559 nextpri++; 560 } 561 562 void 563 end_order(void) 564 { 565 int i; 566 int32_t pri; 567 int32_t ref; 568 collpri_t *p; 569 570 /* advance the priority/weight */ 571 pri = nextpri; 572 573 switch (currorder) { 574 case T_CHAR: 575 for (i = 0; i < NUM_WT; i++) { 576 if (((ref = order_weights[i]) < 0) || 577 ((p = get_pri(ref)) == NULL) || 578 (p->pri == -1)) { 579 /* unspecified weight is a self reference */ 580 set_pri(currchar->ref[i], pri, RESOLVED); 581 } else { 582 set_pri(currchar->ref[i], ref, REFER); 583 } 584 order_weights[i] = -1; 585 } 586 587 /* leave a cookie trail in case next symbol is ellipsis */ 588 ellipsis_start = currchar->wc + 1; 589 currchar = NULL; 590 break; 591 592 case T_ELLIPSIS: 593 /* save off the weights were we can find them */ 594 for (i = 0; i < NUM_WT; i++) { 595 ellipsis_weights[i] = order_weights[i]; 596 order_weights[i] = -1; 597 } 598 break; 599 600 case T_COLLELEM: 601 if (currelem == NULL) { 602 INTERR; 603 } else { 604 for (i = 0; i < NUM_WT; i++) { 605 606 if (((ref = order_weights[i]) < 0) || 607 ((p = get_pri(ref)) == NULL) || 608 (p->pri == -1)) { 609 set_pri(currelem->ref[i], pri, 610 RESOLVED); 611 } else { 612 set_pri(currelem->ref[i], ref, REFER); 613 } 614 order_weights[i] = -1; 615 } 616 } 617 break; 618 619 case T_UNDEFINED: 620 for (i = 0; i < NUM_WT; i++) { 621 if (((ref = order_weights[i]) < 0) || 622 ((p = get_pri(ref)) == NULL) || 623 (p->pri == -1)) { 624 set_pri(pri_undefined[i], -1, RESOLVED); 625 } else { 626 set_pri(pri_undefined[i], ref, REFER); 627 } 628 order_weights[i] = -1; 629 } 630 break; 631 632 case T_SYMBOL: 633 for (i = 0; i < NUM_WT; i++) { 634 if (((ref = order_weights[i]) < 0) || 635 ((p = get_pri(ref)) == NULL) || 636 (p->pri == -1)) { 637 set_pri(currundef->ref[i], pri, RESOLVED); 638 } else { 639 set_pri(currundef->ref[i], ref, REFER); 640 } 641 order_weights[i] = -1; 642 } 643 break; 644 645 default: 646 INTERR; 647 } 648 649 nextpri++; 650 } 651 652 static void 653 start_order(int type) 654 { 655 int i; 656 657 lastorder = currorder; 658 currorder = type; 659 660 /* this is used to protect ELLIPSIS processing */ 661 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { 662 errf(_("character value expected")); 663 } 664 665 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 666 order_weights[i] = -1; 667 } 668 curr_weight = 0; 669 } 670 671 void 672 start_order_undefined(void) 673 { 674 start_order(T_UNDEFINED); 675 } 676 677 void 678 start_order_symbol(char *name) 679 { 680 currundef = get_collundef(name); 681 start_order(T_SYMBOL); 682 } 683 684 void 685 start_order_char(wchar_t wc) 686 { 687 collchar_t *cc; 688 int32_t ref; 689 690 start_order(T_CHAR); 691 692 /* 693 * If we last saw an ellipsis, then we need to close the range. 694 * Handle that here. Note that we have to be careful because the 695 * items *inside* the range are treated exclusiveley to the items 696 * outside of the range. The ends of the range can have quite 697 * different weights than the range members. 698 */ 699 if (lastorder == T_ELLIPSIS) { 700 int i; 701 702 if (wc < ellipsis_start) { 703 errf(_("malformed range!")); 704 return; 705 } 706 while (ellipsis_start < wc) { 707 /* 708 * pick all of the saved weights for the 709 * ellipsis. note that -1 encodes for the 710 * ellipsis itself, which means to take the 711 * current relative priority. 712 */ 713 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { 714 INTERR; 715 return; 716 } 717 for (i = 0; i < NUM_WT; i++) { 718 collpri_t *p; 719 if (((ref = ellipsis_weights[i]) == -1) || 720 ((p = get_pri(ref)) == NULL) || 721 (p->pri == -1)) { 722 set_pri(cc->ref[i], nextpri, RESOLVED); 723 } else { 724 set_pri(cc->ref[i], ref, REFER); 725 } 726 ellipsis_weights[i] = NULL; 727 } 728 ellipsis_start++; 729 nextpri++; 730 } 731 } 732 733 currchar = get_collchar(wc, 1); 734 } 735 736 void 737 start_order_collelem(collelem_t *e) 738 { 739 start_order(T_COLLELEM); 740 currelem = e; 741 } 742 743 void 744 start_order_ellipsis(void) 745 { 746 int i; 747 748 start_order(T_ELLIPSIS); 749 750 if (lastorder != T_CHAR) { 751 errf(_("illegal starting point for range")); 752 return; 753 } 754 755 for (i = 0; i < NUM_WT; i++) { 756 ellipsis_weights[i] = order_weights[i]; 757 } 758 } 759 760 void 761 define_collelem(char *name, wchar_t *wcs) 762 { 763 collelem_t *e; 764 avl_index_t where1; 765 avl_index_t where2; 766 int i; 767 768 if (wcslen(wcs) >= COLLATE_STR_LEN) { 769 errf(_("expanded collation element too long")); 770 return; 771 } 772 773 if ((e = calloc(sizeof (*e), 1)) == NULL) { 774 errf(_("out of memory")); 775 return; 776 } 777 e->expand = wcs; 778 e->symbol = name; 779 780 /* 781 * This is executed before the order statement, so we don't 782 * know how many priorities we *really* need. We allocate one 783 * for each possible weight. Not a big deal, as collating-elements 784 * prove to be quite rare. 785 */ 786 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 787 e->ref[i] = new_pri(); 788 } 789 790 /* A character sequence can only reduce to one element. */ 791 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) || 792 (avl_find(&elem_by_expand, e, &where2) != NULL)) { 793 errf(_("duplicate collating element definition")); 794 return; 795 } 796 avl_insert(&elem_by_symbol, e, where1); 797 avl_insert(&elem_by_expand, e, where2); 798 } 799 800 void 801 add_order_bit(int kw) 802 { 803 uint8_t bit = DIRECTIVE_UNDEF; 804 805 switch (kw) { 806 case T_FORWARD: 807 bit = DIRECTIVE_FORWARD; 808 break; 809 case T_BACKWARD: 810 bit = DIRECTIVE_BACKWARD; 811 break; 812 case T_POSITION: 813 bit = DIRECTIVE_POSITION; 814 break; 815 default: 816 INTERR; 817 break; 818 } 819 collinfo.directive[collinfo.directive_count] |= bit; 820 } 821 822 void 823 add_order_directive(void) 824 { 825 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { 826 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX); 827 } 828 collinfo.directive_count++; 829 } 830 831 static void 832 add_order_pri(int32_t ref) 833 { 834 if (curr_weight >= NUM_WT) { 835 errf(_("too many weights (max %d)"), NUM_WT); 836 return; 837 } 838 order_weights[curr_weight] = ref; 839 curr_weight++; 840 } 841 842 void 843 add_order_collsym(collsym_t *s) 844 { 845 add_order_pri(s->ref); 846 } 847 848 void 849 add_order_char(wchar_t wc) 850 { 851 collchar_t *cc; 852 853 if ((cc = get_collchar(wc, 1)) == NULL) { 854 INTERR; 855 return; 856 } 857 858 add_order_pri(cc->ref[curr_weight]); 859 } 860 861 void 862 add_order_collelem(collelem_t *e) 863 { 864 add_order_pri(e->ref[curr_weight]); 865 } 866 867 void 868 add_order_ignore(void) 869 { 870 add_order_pri(pri_ignore); 871 } 872 873 void 874 add_order_symbol(char *sym) 875 { 876 collundef_t *c; 877 if ((c = get_collundef(sym)) == NULL) { 878 INTERR; 879 return; 880 } 881 add_order_pri(c->ref[curr_weight]); 882 } 883 884 void 885 add_order_ellipsis(void) 886 { 887 /* special NULL value indicates self reference */ 888 add_order_pri(NULL); 889 } 890 891 void 892 add_order_subst(void) 893 { 894 subst_t srch; 895 subst_t *s; 896 avl_index_t where; 897 int i; 898 899 (void) memset(&srch, 0, sizeof (srch)); 900 for (i = 0; i < curr_subst; i++) { 901 srch.ref[i] = subst_weights[i]; 902 subst_weights[i] = 0; 903 } 904 s = avl_find(&substs_ref[curr_weight], &srch, &where); 905 906 if (s == NULL) { 907 if ((s = calloc(sizeof (*s), 1)) == NULL) { 908 errf(_("out of memory")); 909 return; 910 } 911 s->key = new_pri(); 912 913 /* 914 * We use a self reference for our key, but we set a 915 * high bit to indicate that this is a substitution 916 * reference. This will expedite table lookups later, 917 * and prevent table lookups for situations that don't 918 * require it. (In short, its a big win, because we 919 * can skip a lot of binary searching.) 920 */ 921 set_pri(s->key, 922 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), 923 RESOLVED); 924 nextsubst[curr_weight] += 1; 925 926 for (i = 0; i < curr_subst; i++) { 927 s->ref[i] = srch.ref[i]; 928 } 929 930 avl_insert(&substs_ref[curr_weight], s, where); 931 932 if (avl_find(&substs[curr_weight], s, &where) != NULL) { 933 INTERR; 934 return; 935 } 936 avl_insert(&substs[curr_weight], s, where); 937 } 938 curr_subst = 0; 939 940 941 /* 942 * We are using the current (unique) priority as a search key 943 * in the substitution table. 944 */ 945 add_order_pri(s->key); 946 } 947 948 static void 949 add_subst_pri(int32_t ref) 950 { 951 if (curr_subst >= COLLATE_STR_LEN) { 952 errf(_("substitution string is too long")); 953 return; 954 } 955 subst_weights[curr_subst] = ref; 956 curr_subst++; 957 } 958 959 void 960 add_subst_char(wchar_t wc) 961 { 962 collchar_t *cc; 963 964 965 if (((cc = get_collchar(wc, 1)) == NULL) || 966 (cc->wc != wc)) { 967 INTERR; 968 return; 969 } 970 /* we take the weight for the character at that position */ 971 add_subst_pri(cc->ref[curr_weight]); 972 } 973 974 void 975 add_subst_collelem(collelem_t *e) 976 { 977 add_subst_pri(e->ref[curr_weight]); 978 } 979 980 void 981 add_subst_collsym(collsym_t *s) 982 { 983 add_subst_pri(s->ref); 984 } 985 986 void 987 add_subst_symbol(char *ptr) 988 { 989 collundef_t *cu; 990 991 if ((cu = get_collundef(ptr)) != NULL) { 992 add_subst_pri(cu->ref[curr_weight]); 993 } 994 } 995 996 void 997 add_weight(int32_t ref, int pass) 998 { 999 weight_t srch; 1000 weight_t *w; 1001 avl_index_t where; 1002 1003 srch.pri = resolve_pri(ref); 1004 1005 /* No translation of ignores */ 1006 if (srch.pri == 0) 1007 return; 1008 1009 /* Substitution priorities are not weights */ 1010 if (srch.pri & COLLATE_SUBST_PRIORITY) 1011 return; 1012 1013 if (avl_find(&weights[pass], &srch, &where) != NULL) 1014 return; 1015 1016 if ((w = calloc(sizeof (*w), 1)) == NULL) { 1017 errf(_("out of memory")); 1018 return; 1019 } 1020 w->pri = srch.pri; 1021 avl_insert(&weights[pass], w, where); 1022 } 1023 1024 void 1025 add_weights(int32_t *refs) 1026 { 1027 int i; 1028 for (i = 0; i < NUM_WT; i++) { 1029 add_weight(refs[i], i); 1030 } 1031 } 1032 1033 int32_t 1034 get_weight(int32_t ref, int pass) 1035 { 1036 weight_t srch; 1037 weight_t *w; 1038 int32_t pri; 1039 1040 pri = resolve_pri(ref); 1041 if (pri & COLLATE_SUBST_PRIORITY) { 1042 return (pri); 1043 } 1044 if (pri <= 0) { 1045 return (pri); 1046 } 1047 srch.pri = pri; 1048 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) { 1049 INTERR; 1050 return (-1); 1051 } 1052 return (w->opt); 1053 } 1054 1055 void 1056 dump_collate(void) 1057 { 1058 FILE *f; 1059 int i, j, n; 1060 size_t sz; 1061 int32_t pri; 1062 collelem_t *ce; 1063 collchar_t *cc; 1064 subst_t *sb; 1065 char vers[COLLATE_STR_LEN]; 1066 collate_char_t chars[UCHAR_MAX + 1]; 1067 collate_large_t *large; 1068 collate_subst_t *subst[COLL_WEIGHTS_MAX]; 1069 collate_chain_t *chain; 1070 1071 /* 1072 * We have to run throught a preliminary pass to identify all the 1073 * weights that we use for each sorting level. 1074 */ 1075 for (i = 0; i < NUM_WT; i++) { 1076 add_weight(pri_ignore, i); 1077 } 1078 for (i = 0; i < NUM_WT; i++) { 1079 for (sb = avl_first(&substs[i]); sb; 1080 sb = AVL_NEXT(&substs[i], sb)) { 1081 for (j = 0; sb->ref[j]; j++) { 1082 add_weight(sb->ref[j], i); 1083 } 1084 } 1085 } 1086 for (ce = avl_first(&elem_by_expand); 1087 ce != NULL; 1088 ce = AVL_NEXT(&elem_by_expand, ce)) { 1089 add_weights(ce->ref); 1090 } 1091 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1092 add_weights(cc->ref); 1093 } 1094 1095 /* 1096 * Now we walk the entire set of weights, removing the gaps 1097 * in the weights. This gives us optimum usage. The walk 1098 * occurs in priority. 1099 */ 1100 for (i = 0; i < NUM_WT; i++) { 1101 weight_t *w; 1102 for (w = avl_first(&weights[i]); w; 1103 w = AVL_NEXT(&weights[i], w)) { 1104 w->opt = nweight[i]; 1105 nweight[i] += 1; 1106 } 1107 } 1108 1109 (void) memset(&chars, 0, sizeof (chars)); 1110 (void) memset(vers, 0, COLLATE_STR_LEN); 1111 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); 1112 1113 /* 1114 * We need to make sure we arrange for the UNDEFINED field 1115 * to show up. Also, set the total weight counts. 1116 */ 1117 for (i = 0; i < NUM_WT; i++) { 1118 if (resolve_pri(pri_undefined[i]) == -1) { 1119 set_pri(pri_undefined[i], -1, RESOLVED); 1120 /* they collate at the end of everything else */ 1121 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; 1122 } 1123 collinfo.pri_count[i] = nweight[i]; 1124 } 1125 1126 collinfo.pri_count[NUM_WT] = max_wide(); 1127 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; 1128 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; 1129 1130 /* 1131 * Ordinary character priorities 1132 */ 1133 for (i = 0; i <= UCHAR_MAX; i++) { 1134 if ((cc = get_collchar(i, 0)) != NULL) { 1135 for (j = 0; j < NUM_WT; j++) { 1136 chars[i].pri[j] = get_weight(cc->ref[j], j); 1137 } 1138 } else { 1139 for (j = 0; j < NUM_WT; j++) { 1140 chars[i].pri[j] = 1141 get_weight(pri_undefined[j], j); 1142 } 1143 /* 1144 * Per POSIX, for undefined characters, we 1145 * also have to add a last item, which is the 1146 * character code. 1147 */ 1148 chars[i].pri[NUM_WT] = i; 1149 } 1150 } 1151 1152 /* 1153 * Substitution tables 1154 */ 1155 for (i = 0; i < NUM_WT; i++) { 1156 collate_subst_t *st = NULL; 1157 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); 1158 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { 1159 errf(_("out of memory")); 1160 return; 1161 } 1162 n = 0; 1163 for (sb = avl_first(&substs[i]); sb; 1164 sb = AVL_NEXT(&substs[i], sb)) { 1165 if ((st[n].key = resolve_pri(sb->key)) < 0) { 1166 /* by definition these resolve! */ 1167 INTERR; 1168 } 1169 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { 1170 INTERR; 1171 } 1172 for (j = 0; sb->ref[j]; j++) { 1173 st[n].pri[j] = get_weight(sb->ref[j], i); 1174 } 1175 n++; 1176 } 1177 if (n != collinfo.subst_count[i]) 1178 INTERR; 1179 subst[i] = st; 1180 } 1181 1182 1183 /* 1184 * Chains, i.e. collating elements 1185 */ 1186 collinfo.chain_count = avl_numnodes(&elem_by_expand); 1187 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count); 1188 if (chain == NULL) { 1189 errf(_("out of memory")); 1190 return; 1191 } 1192 for (n = 0, ce = avl_first(&elem_by_expand); 1193 ce != NULL; 1194 ce = AVL_NEXT(&elem_by_expand, ce), n++) { 1195 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); 1196 for (i = 0; i < NUM_WT; i++) { 1197 chain[n].pri[i] = get_weight(ce->ref[i], i); 1198 } 1199 } 1200 if (n != collinfo.chain_count) 1201 INTERR; 1202 1203 /* 1204 * Large (> UCHAR_MAX) character priorities 1205 */ 1206 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1); 1207 if (large == NULL) { 1208 errf(_("out of memory")); 1209 return; 1210 } 1211 1212 i = 0; 1213 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { 1214 int undef = 0; 1215 /* we already gathered those */ 1216 if (cc->wc <= UCHAR_MAX) 1217 continue; 1218 for (j = 0; j < NUM_WT; j++) { 1219 if ((pri = get_weight(cc->ref[j], j)) < 0) { 1220 undef = 1; 1221 } 1222 if (undef && (pri >= 0)) { 1223 /* if undefined, then all priorities are */ 1224 INTERR; 1225 } else { 1226 large[i].pri.pri[j] = pri; 1227 } 1228 } 1229 if (!undef) { 1230 large[i].val = cc->wc; 1231 collinfo.large_count = i++; 1232 } 1233 } 1234 1235 if ((f = open_category()) == NULL) { 1236 return; 1237 } 1238 1239 /* Time to write the entire data set out */ 1240 1241 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || 1242 (wr_category(&collinfo, sizeof (collinfo), f) < 0) || 1243 (wr_category(&chars, sizeof (chars), f) < 0)) { 1244 return; 1245 } 1246 1247 for (i = 0; i < NUM_WT; i++) { 1248 sz = sizeof (collate_subst_t) * collinfo.subst_count[i]; 1249 if (wr_category(subst[i], sz, f) < 0) { 1250 return; 1251 } 1252 } 1253 sz = sizeof (collate_chain_t) * collinfo.chain_count; 1254 if (wr_category(chain, sz, f) < 0) { 1255 return; 1256 } 1257 sz = sizeof (collate_large_t) * collinfo.large_count; 1258 if (wr_category(large, sz, f) < 0) { 1259 return; 1260 } 1261 1262 close_category(f); 1263 } 1264