1 /*- 2 * Copyright 2018 Nexenta Systems, Inc. 3 * Copyright 2015 John Marino <draco@marino.st> 4 * 5 * This source code is derived from the illumos localedef command, and 6 * provided under BSD-style license terms by Nexenta Systems, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* 32 * LC_COLLATE database generation routines for localedef. 33 */ 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <sys/types.h> 38 #include <sys/tree.h> 39 40 #include <stdio.h> 41 #include <stddef.h> 42 #include <stdlib.h> 43 #include <errno.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <wchar.h> 47 #include <limits.h> 48 #include "localedef.h" 49 #include "parser.h" 50 #include "collate.h" 51 52 _Static_assert(COLL_WEIGHTS_MAX == 10, "This code assumes a value of 10"); 53 54 /* 55 * Design notes. 56 * 57 * It will be extremely helpful to the reader if they have access to 58 * the localedef and locale file format specifications available. 59 * Latest versions of these are available from www.opengroup.org. 60 * 61 * The design for the collation code is a bit complex. The goal is a 62 * single collation database as described in collate.h (in 63 * libc/port/locale). However, there are some other tidbits: 64 * 65 * a) The substitution entries are now a directly indexable array. A 66 * priority elsewhere in the table is taken as an index into the 67 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) 68 * set. (The bit is cleared and the result is the index into the 69 * table. 70 * 71 * b) We eliminate duplicate entries into the substitution table. 72 * This saves a lot of space. 73 * 74 * c) The priorities for each level are "compressed", so that each 75 * sorting level has consecutively numbered priorities starting at 1. 76 * (O is reserved for the ignore priority.) This means sort levels 77 * which only have a few distinct priorities can represent the 78 * priority level in fewer bits, which makes the strxfrm output 79 * smaller. 80 * 81 * d) We record the total number of priorities so that strxfrm can 82 * figure out how many bytes to expand a numeric priority into. 83 * 84 * e) For the UNDEFINED pass (the last pass), we record the maximum 85 * number of bits needed to uniquely prioritize these entries, so that 86 * the last pass can also use smaller strxfrm output when possible. 87 * 88 * f) Priorities with the sign bit set are verboten. This works out 89 * because no active character set needs that bit to carry significant 90 * information once the character is in wide form. 91 * 92 * To process the entire data to make the database, we actually run 93 * multiple passes over the data. 94 * 95 * The first pass, which is done at parse time, identifies elements, 96 * substitutions, and such, and records them in priority order. As 97 * some priorities can refer to other priorities, using forward 98 * references, we use a table of references indicating whether the 99 * priority's value has been resolved, or whether it is still a 100 * reference. 101 * 102 * The second pass walks over all the items in priority order, noting 103 * that they are used directly, and not just an indirect reference. 104 * This is done by creating a "weight" structure for the item. The 105 * weights are stashed in an RB tree sorted by relative "priority". 106 * 107 * The third pass walks over all the weight structures, in priority 108 * order, and assigns a new monotonically increasing (per sort level) 109 * weight value to them. These are the values that will actually be 110 * written to the file. 111 * 112 * The fourth pass just writes the data out. 113 */ 114 115 /* 116 * In order to resolve the priorities, we create a table of priorities. 117 * Entries in the table can be in one of three states. 118 * 119 * UNKNOWN is for newly allocated entries, and indicates that nothing 120 * is known about the priority. (For example, when new entries are created 121 * for collating-symbols, this is the value assigned for them until the 122 * collating symbol's order has been determined. 123 * 124 * RESOLVED is used for an entry where the priority indicates the final 125 * numeric weight. 126 * 127 * REFER is used for entries that reference other entries. Typically 128 * this is used for forward references. A collating-symbol can never 129 * have this value. 130 * 131 * The "pass" field is used during final resolution to aid in detection 132 * of referencing loops. (For example <A> depends on <B>, but <B> has its 133 * priority dependent on <A>.) 134 */ 135 typedef enum { 136 UNKNOWN, /* priority is totally unknown */ 137 RESOLVED, /* priority value fully resolved */ 138 REFER /* priority is a reference (index) */ 139 } res_t; 140 141 typedef struct weight { 142 int32_t pri; 143 int opt; 144 RB_ENTRY(weight) entry; 145 } weight_t; 146 147 typedef struct priority { 148 res_t res; 149 int32_t pri; 150 int pass; 151 int lineno; 152 } collpri_t; 153 154 #define NUM_WT collinfo.directive_count 155 156 /* 157 * These are the abstract collating symbols, which are just a symbolic 158 * way to reference a priority. 159 */ 160 struct collsym { 161 char *name; 162 int32_t ref; 163 RB_ENTRY(collsym) entry; 164 }; 165 166 /* 167 * These are also abstract collating symbols, but we allow them to have 168 * different priorities at different levels. 169 */ 170 typedef struct collundef { 171 char *name; 172 int32_t ref[COLL_WEIGHTS_MAX]; 173 RB_ENTRY(collundef) entry; 174 } collundef_t; 175 176 /* 177 * These are called "chains" in libc. This records the fact that two 178 * more characters should be treated as a single collating entity when 179 * they appear together. For example, in Spanish <C><h> gets collated 180 * as a character between <C> and <D>. 181 */ 182 struct collelem { 183 char *symbol; 184 wchar_t *expand; 185 int32_t ref[COLL_WEIGHTS_MAX]; 186 RB_ENTRY(collelem) rb_bysymbol; 187 RB_ENTRY(collelem) rb_byexpand; 188 }; 189 190 /* 191 * Individual characters have a sequence of weights as well. 192 */ 193 typedef struct collchar { 194 wchar_t wc; 195 int32_t ref[COLL_WEIGHTS_MAX]; 196 RB_ENTRY(collchar) entry; 197 } collchar_t; 198 199 /* 200 * Substitution entries. The key is itself a priority. Note that 201 * when we create one of these, we *automatically* wind up with a 202 * fully resolved priority for the key, because creation of 203 * substitutions creates a resolved priority at the same time. 204 */ 205 typedef struct subst{ 206 int32_t key; 207 int32_t ref[COLLATE_STR_LEN]; 208 RB_ENTRY(subst) entry; 209 RB_ENTRY(subst) entry_ref; 210 } subst_t; 211 212 static RB_HEAD(collsyms, collsym) collsyms; 213 static RB_HEAD(collundefs, collundef) collundefs; 214 static RB_HEAD(elem_by_symbol, collelem) elem_by_symbol; 215 static RB_HEAD(elem_by_expand, collelem) elem_by_expand; 216 static RB_HEAD(collchars, collchar) collchars; 217 static RB_HEAD(substs, subst) substs[COLL_WEIGHTS_MAX]; 218 static RB_HEAD(substs_ref, subst) substs_ref[COLL_WEIGHTS_MAX]; 219 static RB_HEAD(weights, weight) weights[COLL_WEIGHTS_MAX]; 220 static int32_t nweight[COLL_WEIGHTS_MAX]; 221 222 /* 223 * This is state tracking for the ellipsis token. Note that we start 224 * the initial values so that the ellipsis logic will think we got a 225 * magic starting value of NUL. It starts at minus one because the 226 * starting point is exclusive -- i.e. the starting point is not 227 * itself handled by the ellipsis code. 228 */ 229 static int currorder = EOF; 230 static int lastorder = EOF; 231 static collelem_t *currelem; 232 static collchar_t *currchar; 233 static collundef_t *currundef; 234 static wchar_t ellipsis_start = 0; 235 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; 236 237 /* 238 * We keep a running tally of weights. 239 */ 240 static int nextpri = 1; 241 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; 242 243 /* 244 * This array collects up the weights for each level. 245 */ 246 static int32_t order_weights[COLL_WEIGHTS_MAX]; 247 static int curr_weight = 0; 248 static int32_t subst_weights[COLLATE_STR_LEN]; 249 static int curr_subst = 0; 250 251 /* 252 * Some initial priority values. 253 */ 254 static int32_t pri_undefined[COLL_WEIGHTS_MAX]; 255 static int32_t pri_ignore; 256 257 static collate_info_t collinfo; 258 static int32_t subst_count[COLL_WEIGHTS_MAX]; 259 static int32_t chain_count; 260 static int32_t large_count; 261 262 static collpri_t *prilist = NULL; 263 static int numpri = 0; 264 static int maxpri = 0; 265 266 static void start_order(int); 267 268 static int32_t 269 new_pri(void) 270 { 271 int i; 272 273 if (numpri >= maxpri) { 274 maxpri = maxpri ? maxpri * 2 : 1024; 275 prilist = realloc(prilist, sizeof (collpri_t) * maxpri); 276 if (prilist == NULL) { 277 fprintf(stderr,"out of memory"); 278 return (-1); 279 } 280 for (i = numpri; i < maxpri; i++) { 281 prilist[i].res = UNKNOWN; 282 prilist[i].pri = 0; 283 prilist[i].pass = 0; 284 } 285 } 286 return (numpri++); 287 } 288 289 static collpri_t * 290 get_pri(int32_t ref) 291 { 292 if ((ref < 0) || (ref > numpri)) { 293 INTERR; 294 return (NULL); 295 } 296 return (&prilist[ref]); 297 } 298 299 static void 300 set_pri(int32_t ref, int32_t v, res_t res) 301 { 302 collpri_t *pri; 303 304 pri = get_pri(ref); 305 306 if ((res == REFER) && ((v < 0) || (v >= numpri))) { 307 INTERR; 308 } 309 310 /* Resolve self references */ 311 if ((res == REFER) && (ref == v)) { 312 v = nextpri; 313 res = RESOLVED; 314 } 315 316 if (pri->res != UNKNOWN) { 317 warn("repeated item in order list (first on %d)", 318 pri->lineno); 319 return; 320 } 321 pri->lineno = lineno; 322 pri->pri = v; 323 pri->res = res; 324 } 325 326 static int32_t 327 resolve_pri(int32_t ref) 328 { 329 collpri_t *pri; 330 static int32_t pass = 0; 331 332 pri = get_pri(ref); 333 pass++; 334 while (pri->res == REFER) { 335 if (pri->pass == pass) { 336 /* report a line with the circular symbol */ 337 lineno = pri->lineno; 338 fprintf(stderr,"circular reference in order list"); 339 return (-1); 340 } 341 if ((pri->pri < 0) || (pri->pri >= numpri)) { 342 INTERR; 343 return (-1); 344 } 345 pri->pass = pass; 346 pri = &prilist[pri->pri]; 347 } 348 349 if (pri->res == UNKNOWN) { 350 return (-1); 351 } 352 if (pri->res != RESOLVED) 353 INTERR; 354 355 return (pri->pri); 356 } 357 358 static int 359 weight_compare(const void *n1, const void *n2) 360 { 361 int32_t k1 = ((const weight_t *)n1)->pri; 362 int32_t k2 = ((const weight_t *)n2)->pri; 363 364 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 365 } 366 367 RB_GENERATE_STATIC(weights, weight, entry, weight_compare); 368 369 static int 370 collsym_compare(const void *n1, const void *n2) 371 { 372 const collsym_t *c1 = n1; 373 const collsym_t *c2 = n2; 374 int rv; 375 376 rv = strcmp(c1->name, c2->name); 377 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 378 } 379 380 RB_GENERATE_STATIC(collsyms, collsym, entry, collsym_compare); 381 382 static int 383 collundef_compare(const void *n1, const void *n2) 384 { 385 const collundef_t *c1 = n1; 386 const collundef_t *c2 = n2; 387 int rv; 388 389 rv = strcmp(c1->name, c2->name); 390 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 391 } 392 393 RB_GENERATE_STATIC(collundefs, collundef, entry, collundef_compare); 394 395 static int 396 element_compare_symbol(const void *n1, const void *n2) 397 { 398 const collelem_t *c1 = n1; 399 const collelem_t *c2 = n2; 400 int rv; 401 402 rv = strcmp(c1->symbol, c2->symbol); 403 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 404 } 405 406 RB_GENERATE_STATIC(elem_by_symbol, collelem, rb_bysymbol, element_compare_symbol); 407 408 static int 409 element_compare_expand(const void *n1, const void *n2) 410 { 411 const collelem_t *c1 = n1; 412 const collelem_t *c2 = n2; 413 int rv; 414 415 rv = wcscmp(c1->expand, c2->expand); 416 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 417 } 418 419 RB_GENERATE_STATIC(elem_by_expand, collelem, rb_byexpand, element_compare_expand); 420 421 static int 422 collchar_compare(const void *n1, const void *n2) 423 { 424 wchar_t k1 = ((const collchar_t *)n1)->wc; 425 wchar_t k2 = ((const collchar_t *)n2)->wc; 426 427 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 428 } 429 430 RB_GENERATE_STATIC(collchars, collchar, entry, collchar_compare); 431 432 static int 433 subst_compare(const void *n1, const void *n2) 434 { 435 int32_t k1 = ((const subst_t *)n1)->key; 436 int32_t k2 = ((const subst_t *)n2)->key; 437 438 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); 439 } 440 441 RB_GENERATE_STATIC(substs, subst, entry, subst_compare); 442 443 static int 444 subst_compare_ref(const void *n1, const void *n2) 445 { 446 const wchar_t *c1 = ((const subst_t *)n1)->ref; 447 const wchar_t *c2 = ((const subst_t *)n2)->ref; 448 int rv; 449 450 rv = wcscmp(c1, c2); 451 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); 452 } 453 454 RB_GENERATE_STATIC(substs_ref, subst, entry_ref, subst_compare_ref); 455 456 void 457 init_collate(void) 458 { 459 int i; 460 461 RB_INIT(&collsyms); 462 463 RB_INIT(&collundefs); 464 465 RB_INIT(&elem_by_symbol); 466 467 RB_INIT(&elem_by_expand); 468 469 RB_INIT(&collchars); 470 471 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 472 RB_INIT(&substs[i]); 473 RB_INIT(&substs_ref[i]); 474 RB_INIT(&weights[i]); 475 nweight[i] = 1; 476 } 477 478 (void) memset(&collinfo, 0, sizeof (collinfo)); 479 480 /* allocate some initial priorities */ 481 pri_ignore = new_pri(); 482 483 set_pri(pri_ignore, 0, RESOLVED); 484 485 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 486 pri_undefined[i] = new_pri(); 487 488 /* we will override this later */ 489 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); 490 } 491 } 492 493 void 494 define_collsym(char *name) 495 { 496 collsym_t *sym; 497 498 if ((sym = calloc(1, sizeof(*sym))) == NULL) { 499 fprintf(stderr,"out of memory"); 500 return; 501 } 502 sym->name = name; 503 sym->ref = new_pri(); 504 505 if (RB_FIND(collsyms, &collsyms, sym) != NULL) { 506 /* 507 * This should never happen because we are only called 508 * for undefined symbols. 509 */ 510 free(sym); 511 INTERR; 512 return; 513 } 514 RB_INSERT(collsyms, &collsyms, sym); 515 } 516 517 collsym_t * 518 lookup_collsym(char *name) 519 { 520 collsym_t srch; 521 522 srch.name = name; 523 return (RB_FIND(collsyms, &collsyms, &srch)); 524 } 525 526 collelem_t * 527 lookup_collelem(char *symbol) 528 { 529 collelem_t srch; 530 531 srch.symbol = symbol; 532 return (RB_FIND(elem_by_symbol, &elem_by_symbol, &srch)); 533 } 534 535 static collundef_t * 536 get_collundef(char *name) 537 { 538 collundef_t srch; 539 collundef_t *ud; 540 int i; 541 542 srch.name = name; 543 if ((ud = RB_FIND(collundefs, &collundefs, &srch)) == NULL) { 544 if (((ud = calloc(1, sizeof(*ud))) == NULL) || 545 ((ud->name = strdup(name)) == NULL)) { 546 fprintf(stderr,"out of memory"); 547 free(ud); 548 return (NULL); 549 } 550 for (i = 0; i < NUM_WT; i++) { 551 ud->ref[i] = new_pri(); 552 } 553 RB_INSERT(collundefs, &collundefs, ud); 554 } 555 add_charmap_undefined(name); 556 return (ud); 557 } 558 559 static collchar_t * 560 get_collchar(wchar_t wc, int create) 561 { 562 collchar_t srch; 563 collchar_t *cc; 564 int i; 565 566 srch.wc = wc; 567 cc = RB_FIND(collchars, &collchars, &srch); 568 if ((cc == NULL) && create) { 569 if ((cc = calloc(1, sizeof(*cc))) == NULL) { 570 fprintf(stderr, "out of memory"); 571 return (NULL); 572 } 573 for (i = 0; i < NUM_WT; i++) { 574 cc->ref[i] = new_pri(); 575 } 576 cc->wc = wc; 577 RB_INSERT(collchars, &collchars, cc); 578 } 579 return (cc); 580 } 581 582 void 583 end_order_collsym(collsym_t *sym) 584 { 585 start_order(T_COLLSYM); 586 /* update the weight */ 587 588 set_pri(sym->ref, nextpri, RESOLVED); 589 nextpri++; 590 } 591 592 void 593 end_order(void) 594 { 595 int i; 596 int32_t pri; 597 int32_t ref; 598 collpri_t *p; 599 600 /* advance the priority/weight */ 601 pri = nextpri; 602 603 switch (currorder) { 604 case T_CHAR: 605 for (i = 0; i < NUM_WT; i++) { 606 if (((ref = order_weights[i]) < 0) || 607 ((p = get_pri(ref)) == NULL) || 608 (p->pri == -1)) { 609 /* unspecified weight is a self reference */ 610 set_pri(currchar->ref[i], pri, RESOLVED); 611 } else { 612 set_pri(currchar->ref[i], ref, REFER); 613 } 614 order_weights[i] = -1; 615 } 616 617 /* leave a cookie trail in case next symbol is ellipsis */ 618 ellipsis_start = currchar->wc + 1; 619 currchar = NULL; 620 break; 621 622 case T_ELLIPSIS: 623 /* save off the weights were we can find them */ 624 for (i = 0; i < NUM_WT; i++) { 625 ellipsis_weights[i] = order_weights[i]; 626 order_weights[i] = -1; 627 } 628 break; 629 630 case T_COLLELEM: 631 if (currelem == NULL) { 632 INTERR; 633 } else { 634 for (i = 0; i < NUM_WT; i++) { 635 636 if (((ref = order_weights[i]) < 0) || 637 ((p = get_pri(ref)) == NULL) || 638 (p->pri == -1)) { 639 set_pri(currelem->ref[i], pri, 640 RESOLVED); 641 } else { 642 set_pri(currelem->ref[i], ref, REFER); 643 } 644 order_weights[i] = -1; 645 } 646 } 647 break; 648 649 case T_UNDEFINED: 650 for (i = 0; i < NUM_WT; i++) { 651 if (((ref = order_weights[i]) < 0) || 652 ((p = get_pri(ref)) == NULL) || 653 (p->pri == -1)) { 654 set_pri(pri_undefined[i], -1, RESOLVED); 655 } else { 656 set_pri(pri_undefined[i], ref, REFER); 657 } 658 order_weights[i] = -1; 659 } 660 break; 661 662 case T_SYMBOL: 663 for (i = 0; i < NUM_WT; i++) { 664 if (((ref = order_weights[i]) < 0) || 665 ((p = get_pri(ref)) == NULL) || 666 (p->pri == -1)) { 667 set_pri(currundef->ref[i], pri, RESOLVED); 668 } else { 669 set_pri(currundef->ref[i], ref, REFER); 670 } 671 order_weights[i] = -1; 672 } 673 break; 674 675 default: 676 INTERR; 677 } 678 679 nextpri++; 680 } 681 682 static void 683 start_order(int type) 684 { 685 int i; 686 687 lastorder = currorder; 688 currorder = type; 689 690 /* this is used to protect ELLIPSIS processing */ 691 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { 692 fprintf(stderr, "character value expected"); 693 } 694 695 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 696 order_weights[i] = -1; 697 } 698 curr_weight = 0; 699 } 700 701 void 702 start_order_undefined(void) 703 { 704 start_order(T_UNDEFINED); 705 } 706 707 void 708 start_order_symbol(char *name) 709 { 710 currundef = get_collundef(name); 711 start_order(T_SYMBOL); 712 } 713 714 void 715 start_order_char(wchar_t wc) 716 { 717 collchar_t *cc; 718 int32_t ref; 719 720 start_order(T_CHAR); 721 722 /* 723 * If we last saw an ellipsis, then we need to close the range. 724 * Handle that here. Note that we have to be careful because the 725 * items *inside* the range are treated exclusiveley to the items 726 * outside of the range. The ends of the range can have quite 727 * different weights than the range members. 728 */ 729 if (lastorder == T_ELLIPSIS) { 730 int i; 731 732 if (wc < ellipsis_start) { 733 fprintf(stderr, "malformed range!"); 734 return; 735 } 736 while (ellipsis_start < wc) { 737 /* 738 * pick all of the saved weights for the 739 * ellipsis. note that -1 encodes for the 740 * ellipsis itself, which means to take the 741 * current relative priority. 742 */ 743 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { 744 INTERR; 745 return; 746 } 747 for (i = 0; i < NUM_WT; i++) { 748 collpri_t *p; 749 if (((ref = ellipsis_weights[i]) == -1) || 750 ((p = get_pri(ref)) == NULL) || 751 (p->pri == -1)) { 752 set_pri(cc->ref[i], nextpri, RESOLVED); 753 } else { 754 set_pri(cc->ref[i], ref, REFER); 755 } 756 ellipsis_weights[i] = 0; 757 } 758 ellipsis_start++; 759 nextpri++; 760 } 761 } 762 763 currchar = get_collchar(wc, 1); 764 } 765 766 void 767 start_order_collelem(collelem_t *e) 768 { 769 start_order(T_COLLELEM); 770 currelem = e; 771 } 772 773 void 774 start_order_ellipsis(void) 775 { 776 int i; 777 778 start_order(T_ELLIPSIS); 779 780 if (lastorder != T_CHAR) { 781 fprintf(stderr, "illegal starting point for range"); 782 return; 783 } 784 785 for (i = 0; i < NUM_WT; i++) { 786 ellipsis_weights[i] = order_weights[i]; 787 } 788 } 789 790 void 791 define_collelem(char *name, wchar_t *wcs) 792 { 793 collelem_t *e; 794 int i; 795 796 if (wcslen(wcs) >= COLLATE_STR_LEN) { 797 fprintf(stderr,"expanded collation element too long"); 798 return; 799 } 800 801 if ((e = calloc(1, sizeof(*e))) == NULL) { 802 fprintf(stderr, "out of memory"); 803 return; 804 } 805 e->expand = wcs; 806 e->symbol = name; 807 808 /* 809 * This is executed before the order statement, so we don't 810 * know how many priorities we *really* need. We allocate one 811 * for each possible weight. Not a big deal, as collating-elements 812 * prove to be quite rare. 813 */ 814 for (i = 0; i < COLL_WEIGHTS_MAX; i++) { 815 e->ref[i] = new_pri(); 816 } 817 818 /* A character sequence can only reduce to one element. */ 819 if ((RB_FIND(elem_by_symbol, &elem_by_symbol, e) != NULL) || 820 (RB_FIND(elem_by_expand, &elem_by_expand, e) != NULL)) { 821 fprintf(stderr, "duplicate collating element definition"); 822 free(e); 823 return; 824 } 825 RB_INSERT(elem_by_symbol, &elem_by_symbol, e); 826 RB_INSERT(elem_by_expand, &elem_by_expand, e); 827 } 828 829 void 830 add_order_bit(int kw) 831 { 832 uint8_t bit = DIRECTIVE_UNDEF; 833 834 switch (kw) { 835 case T_FORWARD: 836 bit = DIRECTIVE_FORWARD; 837 break; 838 case T_BACKWARD: 839 bit = DIRECTIVE_BACKWARD; 840 break; 841 case T_POSITION: 842 bit = DIRECTIVE_POSITION; 843 break; 844 default: 845 INTERR; 846 break; 847 } 848 collinfo.directive[collinfo.directive_count] |= bit; 849 } 850 851 void 852 add_order_directive(void) 853 { 854 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { 855 fprintf(stderr, "too many directives (max %d)\n", COLL_WEIGHTS_MAX); 856 return; 857 } 858 collinfo.directive_count++; 859 } 860 861 static void 862 add_order_pri(int32_t ref) 863 { 864 if (curr_weight >= NUM_WT) { 865 fprintf(stderr, "too many weights (max %d)\n", NUM_WT); 866 return; 867 } 868 order_weights[curr_weight] = ref; 869 curr_weight++; 870 } 871 872 void 873 add_order_collsym(collsym_t *s) 874 { 875 add_order_pri(s->ref); 876 } 877 878 void 879 add_order_char(wchar_t wc) 880 { 881 collchar_t *cc; 882 883 if ((cc = get_collchar(wc, 1)) == NULL) { 884 INTERR; 885 return; 886 } 887 888 add_order_pri(cc->ref[curr_weight]); 889 } 890 891 void 892 add_order_collelem(collelem_t *e) 893 { 894 add_order_pri(e->ref[curr_weight]); 895 } 896 897 void 898 add_order_ignore(void) 899 { 900 add_order_pri(pri_ignore); 901 } 902 903 void 904 add_order_symbol(char *sym) 905 { 906 collundef_t *c; 907 if ((c = get_collundef(sym)) == NULL) { 908 INTERR; 909 return; 910 } 911 add_order_pri(c->ref[curr_weight]); 912 } 913 914 void 915 add_order_ellipsis(void) 916 { 917 /* special NULL value indicates self reference */ 918 add_order_pri(0); 919 } 920 921 void 922 add_order_subst(void) 923 { 924 subst_t srch; 925 subst_t *s; 926 int i; 927 928 (void) memset(&srch, 0, sizeof (srch)); 929 for (i = 0; i < curr_subst; i++) { 930 srch.ref[i] = subst_weights[i]; 931 subst_weights[i] = 0; 932 } 933 s = RB_FIND(substs_ref, &substs_ref[curr_weight], &srch); 934 935 if (s == NULL) { 936 if ((s = calloc(1, sizeof(*s))) == NULL) { 937 fprintf(stderr,"out of memory"); 938 return; 939 } 940 s->key = new_pri(); 941 942 /* 943 * We use a self reference for our key, but we set a 944 * high bit to indicate that this is a substitution 945 * reference. This will expedite table lookups later, 946 * and prevent table lookups for situations that don't 947 * require it. (In short, its a big win, because we 948 * can skip a lot of binary searching.) 949 */ 950 set_pri(s->key, 951 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), 952 RESOLVED); 953 nextsubst[curr_weight] += 1; 954 955 for (i = 0; i < curr_subst; i++) { 956 s->ref[i] = srch.ref[i]; 957 } 958 959 RB_INSERT(substs_ref, &substs_ref[curr_weight], s); 960 961 if (RB_FIND(substs, &substs[curr_weight], s) != NULL) { 962 INTERR; 963 return; 964 } 965 RB_INSERT(substs, &substs[curr_weight], s); 966 } 967 curr_subst = 0; 968 969 970 /* 971 * We are using the current (unique) priority as a search key 972 * in the substitution table. 973 */ 974 add_order_pri(s->key); 975 } 976 977 static void 978 add_subst_pri(int32_t ref) 979 { 980 if (curr_subst >= COLLATE_STR_LEN) { 981 fprintf(stderr,"substitution string is too long"); 982 return; 983 } 984 subst_weights[curr_subst] = ref; 985 curr_subst++; 986 } 987 988 void 989 add_subst_char(wchar_t wc) 990 { 991 collchar_t *cc; 992 993 994 if (((cc = get_collchar(wc, 1)) == NULL) || 995 (cc->wc != wc)) { 996 INTERR; 997 return; 998 } 999 /* we take the weight for the character at that position */ 1000 add_subst_pri(cc->ref[curr_weight]); 1001 } 1002 1003 void 1004 add_subst_collelem(collelem_t *e) 1005 { 1006 add_subst_pri(e->ref[curr_weight]); 1007 } 1008 1009 void 1010 add_subst_collsym(collsym_t *s) 1011 { 1012 add_subst_pri(s->ref); 1013 } 1014 1015 void 1016 add_subst_symbol(char *ptr) 1017 { 1018 collundef_t *cu; 1019 1020 if ((cu = get_collundef(ptr)) != NULL) { 1021 add_subst_pri(cu->ref[curr_weight]); 1022 } 1023 } 1024 1025 void 1026 add_weight(int32_t ref, int pass) 1027 { 1028 weight_t srch; 1029 weight_t *w; 1030 1031 srch.pri = resolve_pri(ref); 1032 1033 /* No translation of ignores */ 1034 if (srch.pri == 0) 1035 return; 1036 1037 /* Substitution priorities are not weights */ 1038 if (srch.pri & COLLATE_SUBST_PRIORITY) 1039 return; 1040 1041 if (RB_FIND(weights, &weights[pass], &srch) != NULL) 1042 return; 1043 1044 if ((w = calloc(1, sizeof(*w))) == NULL) { 1045 fprintf(stderr, "out of memory"); 1046 return; 1047 } 1048 w->pri = srch.pri; 1049 RB_INSERT(weights, &weights[pass], w); 1050 } 1051 1052 void 1053 add_weights(int32_t *refs) 1054 { 1055 int i; 1056 for (i = 0; i < NUM_WT; i++) { 1057 add_weight(refs[i], i); 1058 } 1059 } 1060 1061 int32_t 1062 get_weight(int32_t ref, int pass) 1063 { 1064 weight_t srch; 1065 weight_t *w; 1066 int32_t pri; 1067 1068 pri = resolve_pri(ref); 1069 if (pri & COLLATE_SUBST_PRIORITY) { 1070 return (pri); 1071 } 1072 if (pri <= 0) { 1073 return (pri); 1074 } 1075 srch.pri = pri; 1076 if ((w = RB_FIND(weights, &weights[pass], &srch)) == NULL) { 1077 INTERR; 1078 return (-1); 1079 } 1080 return (w->opt); 1081 } 1082 1083 wchar_t * 1084 wsncpy(wchar_t *s1, const wchar_t *s2, size_t n) 1085 { 1086 wchar_t *os1 = s1; 1087 1088 n++; 1089 while (--n > 0 && (*s1++ = htote(*s2++)) != 0) 1090 continue; 1091 if (n > 0) 1092 while (--n > 0) 1093 *s1++ = 0; 1094 return (os1); 1095 } 1096 1097 #define RB_COUNT(x, name, head, cnt) do { \ 1098 (cnt) = 0; \ 1099 RB_FOREACH(x, name, (head)) { \ 1100 (cnt)++; \ 1101 } \ 1102 } while (0) 1103 1104 #define RB_NUMNODES(type, name, head, cnt) do { \ 1105 type *t; \ 1106 cnt = 0; \ 1107 RB_FOREACH(t, name, head) { \ 1108 cnt++; \ 1109 } \ 1110 } while (0) 1111 1112 void 1113 dump_collate(void) 1114 { 1115 FILE *f; 1116 int i, j, n; 1117 size_t sz; 1118 int32_t pri; 1119 collelem_t *ce; 1120 collchar_t *cc; 1121 subst_t *sb; 1122 char vers[COLLATE_STR_LEN]; 1123 collate_char_t chars[UCHAR_MAX + 1]; 1124 collate_large_t *large; 1125 collate_subst_t *subst[COLL_WEIGHTS_MAX]; 1126 collate_chain_t *chain; 1127 1128 /* 1129 * We have to run through a preliminary pass to identify all the 1130 * weights that we use for each sorting level. 1131 */ 1132 for (i = 0; i < NUM_WT; i++) { 1133 add_weight(pri_ignore, i); 1134 } 1135 for (i = 0; i < NUM_WT; i++) { 1136 RB_FOREACH(sb, substs, &substs[i]) { 1137 for (j = 0; sb->ref[j]; j++) { 1138 add_weight(sb->ref[j], i); 1139 } 1140 } 1141 } 1142 RB_FOREACH(ce, elem_by_expand, &elem_by_expand) { 1143 add_weights(ce->ref); 1144 } 1145 RB_FOREACH(cc, collchars, &collchars) { 1146 add_weights(cc->ref); 1147 } 1148 1149 /* 1150 * Now we walk the entire set of weights, removing the gaps 1151 * in the weights. This gives us optimum usage. The walk 1152 * occurs in priority. 1153 */ 1154 for (i = 0; i < NUM_WT; i++) { 1155 weight_t *w; 1156 RB_FOREACH(w, weights, &weights[i]) { 1157 w->opt = nweight[i]; 1158 nweight[i] += 1; 1159 } 1160 } 1161 1162 (void) memset(&chars, 0, sizeof (chars)); 1163 (void) memset(vers, 0, COLLATE_STR_LEN); 1164 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); 1165 1166 /* 1167 * We need to make sure we arrange for the UNDEFINED field 1168 * to show up. Also, set the total weight counts. 1169 */ 1170 for (i = 0; i < NUM_WT; i++) { 1171 if (resolve_pri(pri_undefined[i]) == -1) { 1172 set_pri(pri_undefined[i], -1, RESOLVED); 1173 /* they collate at the end of everything else */ 1174 collinfo.undef_pri[i] = htote(COLLATE_MAX_PRIORITY); 1175 } 1176 collinfo.pri_count[i] = htote(nweight[i]); 1177 } 1178 1179 collinfo.pri_count[NUM_WT] = htote(max_wide()); 1180 collinfo.undef_pri[NUM_WT] = htote(COLLATE_MAX_PRIORITY); 1181 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; 1182 1183 /* 1184 * Ordinary character priorities 1185 */ 1186 for (i = 0; i <= UCHAR_MAX; i++) { 1187 if ((cc = get_collchar(i, 0)) != NULL) { 1188 for (j = 0; j < NUM_WT; j++) { 1189 chars[i].pri[j] = 1190 htote(get_weight(cc->ref[j], j)); 1191 } 1192 } else { 1193 for (j = 0; j < NUM_WT; j++) { 1194 chars[i].pri[j] = 1195 htote(get_weight(pri_undefined[j], j)); 1196 } 1197 /* 1198 * Per POSIX, for undefined characters, we 1199 * also have to add a last item, which is the 1200 * character code. 1201 */ 1202 chars[i].pri[NUM_WT] = htote(i); 1203 } 1204 } 1205 1206 /* 1207 * Substitution tables 1208 */ 1209 for (i = 0; i < NUM_WT; i++) { 1210 collate_subst_t *st = NULL; 1211 subst_t *temp; 1212 RB_COUNT(temp, substs, &substs[i], n); 1213 subst_count[i] = n; 1214 if ((st = calloc(n, sizeof(collate_subst_t))) == NULL) { 1215 fprintf(stderr, "out of memory"); 1216 return; 1217 } 1218 n = 0; 1219 RB_FOREACH(sb, substs, &substs[i]) { 1220 if ((st[n].key = resolve_pri(sb->key)) < 0) { 1221 /* by definition these resolve! */ 1222 INTERR; 1223 } 1224 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { 1225 INTERR; 1226 } 1227 st[n].key = htote(st[n].key); 1228 for (j = 0; sb->ref[j]; j++) { 1229 st[n].pri[j] = htote(get_weight(sb->ref[j], 1230 i)); 1231 } 1232 n++; 1233 } 1234 if (n != subst_count[i]) 1235 INTERR; 1236 subst[i] = st; 1237 } 1238 1239 1240 /* 1241 * Chains, i.e. collating elements 1242 */ 1243 RB_NUMNODES(collelem_t, elem_by_expand, &elem_by_expand, chain_count); 1244 chain = calloc(chain_count, sizeof(collate_chain_t)); 1245 if (chain == NULL) { 1246 fprintf(stderr, "out of memory"); 1247 return; 1248 } 1249 n = 0; 1250 RB_FOREACH(ce, elem_by_expand, &elem_by_expand) { 1251 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); 1252 for (i = 0; i < NUM_WT; i++) { 1253 chain[n].pri[i] = htote(get_weight(ce->ref[i], i)); 1254 } 1255 n++; 1256 } 1257 if (n != chain_count) 1258 INTERR; 1259 1260 /* 1261 * Large (> UCHAR_MAX) character priorities 1262 */ 1263 RB_NUMNODES(collchar_t, collchars, &collchars, n); 1264 large = calloc(n, sizeof(collate_large_t)); 1265 if (large == NULL) { 1266 fprintf(stderr, "out of memory"); 1267 return; 1268 } 1269 1270 i = 0; 1271 RB_FOREACH(cc, collchars, &collchars) { 1272 int undef = 0; 1273 /* we already gathered those */ 1274 if (cc->wc <= UCHAR_MAX) 1275 continue; 1276 for (j = 0; j < NUM_WT; j++) { 1277 if ((pri = get_weight(cc->ref[j], j)) < 0) { 1278 undef = 1; 1279 } 1280 if (undef && (pri >= 0)) { 1281 /* if undefined, then all priorities are */ 1282 INTERR; 1283 } else { 1284 large[i].pri.pri[j] = htote(pri); 1285 } 1286 } 1287 if (!undef) { 1288 large[i].val = htote(cc->wc); 1289 large_count = i++; 1290 } 1291 } 1292 1293 if ((f = open_category()) == NULL) { 1294 return; 1295 } 1296 1297 /* Time to write the entire data set out */ 1298 1299 for (i = 0; i < NUM_WT; i++) 1300 collinfo.subst_count[i] = htote(subst_count[i]); 1301 collinfo.chain_count = htote(chain_count); 1302 collinfo.large_count = htote(large_count); 1303 1304 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || 1305 (wr_category(&collinfo, sizeof (collinfo), f) < 0) || 1306 (wr_category(&chars, sizeof (chars), f) < 0)) { 1307 return; 1308 } 1309 1310 for (i = 0; i < NUM_WT; i++) { 1311 sz = sizeof (collate_subst_t) * subst_count[i]; 1312 if (wr_category(subst[i], sz, f) < 0) { 1313 return; 1314 } 1315 } 1316 sz = sizeof (collate_chain_t) * chain_count; 1317 if (wr_category(chain, sz, f) < 0) { 1318 return; 1319 } 1320 sz = sizeof (collate_large_t) * large_count; 1321 if (wr_category(large, sz, f) < 0) { 1322 return; 1323 } 1324 1325 close_category(f); 1326 } 1327