1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include "namespace.h" 40 41 #include <sys/types.h> 42 #include <sys/stat.h> 43 #include <sys/mman.h> 44 45 #include <assert.h> 46 #include <stdio.h> 47 #include <stdlib.h> 48 #include <string.h> 49 #include <wchar.h> 50 #include <errno.h> 51 #include <unistd.h> 52 #include <fcntl.h> 53 #include "un-namespace.h" 54 55 #include "collate.h" 56 #include "setlocale.h" 57 #include "ldpart.h" 58 #include "libc_private.h" 59 60 struct xlocale_collate __xlocale_global_collate = { 61 {{0}, "C"}, 1, 0, 0, 0 62 }; 63 64 struct xlocale_collate __xlocale_C_collate = { 65 {{0}, "C"}, 1, 0, 0, 0 66 }; 67 68 struct xlocale_collate __xlocale_POSIX_collate = { 69 {{0}, "POSIX"}, 1, 0, 0, 0 70 }; 71 72 struct xlocale_collate __xlocale_CUTF8_collate = { 73 {{0}, "C.UTF-8"}, 1, 0, 0, 0 74 }; 75 76 static int 77 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 78 79 static void 80 destruct_collate(void *t) 81 { 82 struct xlocale_collate *table = t; 83 if (table->map && (table->maplen > 0)) { 84 (void) munmap(table->map, table->maplen); 85 } 86 free(t); 87 } 88 89 void * 90 __collate_load(const char *encoding, __unused locale_t unused) 91 { 92 if (strcmp(encoding, "C") == 0) 93 return (&__xlocale_C_collate); 94 else if (strcmp(encoding, "POSIX") == 0) 95 return (&__xlocale_POSIX_collate); 96 else if (strcmp(encoding, "C.UTF-8") == 0) 97 return (&__xlocale_CUTF8_collate); 98 99 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 100 1); 101 if (table == NULL) 102 return (NULL); 103 table->header.header.destructor = destruct_collate; 104 105 /* 106 * FIXME: Make sure that _LDP_CACHE is never returned. We 107 * should be doing the caching outside of this section. 108 */ 109 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 110 xlocale_release(table); 111 return (NULL); 112 } 113 return (table); 114 } 115 116 /** 117 * Load the collation tables for the specified encoding into the global table. 118 */ 119 int 120 __collate_load_tables(const char *encoding) 121 { 122 123 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 124 } 125 126 static int 127 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 128 { 129 int i, chains, z; 130 char *buf; 131 char *TMP; 132 char *map; 133 collate_info_t *info; 134 struct stat sbuf; 135 int fd; 136 137 table->__collate_load_error = 1; 138 139 /* 'encoding' must be already checked. */ 140 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 141 strncmp(encoding, "C.", 2) == 0) { 142 return (_LDP_CACHE); 143 } 144 145 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 146 return (_LDP_ERROR); 147 148 if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) { 149 free(buf); 150 return (_LDP_ERROR); 151 } 152 free(buf); 153 if (_fstat(fd, &sbuf) < 0) { 154 (void) _close(fd); 155 return (_LDP_ERROR); 156 } 157 if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN + 158 XLOCALE_DEF_VERSION_LEN + 159 sizeof (*info))) { 160 (void) _close(fd); 161 errno = EINVAL; 162 return (_LDP_ERROR); 163 } 164 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 165 (void) _close(fd); 166 if ((TMP = map) == MAP_FAILED) { 167 return (_LDP_ERROR); 168 } 169 170 if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) { 171 (void) munmap(map, sbuf.st_size); 172 errno = EINVAL; 173 return (_LDP_ERROR); 174 } 175 TMP += COLLATE_FMT_VERSION_LEN; 176 strlcat(table->header.version, TMP, sizeof (table->header.version)); 177 TMP += XLOCALE_DEF_VERSION_LEN; 178 179 info = (void *)TMP; 180 TMP += sizeof (*info); 181 182 if ((info->directive_count < 1) || 183 (info->directive_count >= COLL_WEIGHTS_MAX) || 184 ((chains = info->chain_count) < 0)) { 185 (void) munmap(map, sbuf.st_size); 186 errno = EINVAL; 187 return (_LDP_ERROR); 188 } 189 190 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 191 (sizeof (collate_chain_t) * chains) + 192 (sizeof (collate_large_t) * info->large_count); 193 for (z = 0; z < info->directive_count; z++) { 194 i += sizeof (collate_subst_t) * info->subst_count[z]; 195 } 196 if (i != (sbuf.st_size - (TMP - map))) { 197 (void) munmap(map, sbuf.st_size); 198 errno = EINVAL; 199 return (_LDP_ERROR); 200 } 201 202 if (table->map && (table->maplen > 0)) { 203 (void) munmap(table->map, table->maplen); 204 } 205 table->map = map; 206 table->maplen = sbuf.st_size; 207 table->info = info; 208 table->char_pri_table = (void *)TMP; 209 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 210 211 for (z = 0; z < info->directive_count; z++) { 212 if (info->subst_count[z] > 0) { 213 table->subst_table[z] = (void *)TMP; 214 TMP += info->subst_count[z] * sizeof (collate_subst_t); 215 } else { 216 table->subst_table[z] = NULL; 217 } 218 } 219 220 if (chains > 0) { 221 table->chain_pri_table = (void *)TMP; 222 TMP += chains * sizeof (collate_chain_t); 223 } else 224 table->chain_pri_table = NULL; 225 if (info->large_count > 0) 226 table->large_pri_table = (void *)TMP; 227 else 228 table->large_pri_table = NULL; 229 230 table->__collate_load_error = 0; 231 return (_LDP_LOADED); 232 } 233 234 static const int32_t * 235 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 236 { 237 const collate_subst_t *p; 238 int n = table->info->subst_count[pass]; 239 240 if (n == 0) 241 return (NULL); 242 243 if (pass >= table->info->directive_count) 244 return (NULL); 245 246 if (!(key & COLLATE_SUBST_PRIORITY)) 247 return (NULL); 248 249 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 250 assert(p->key == key); 251 252 return (p->pri); 253 } 254 255 static collate_chain_t * 256 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 257 { 258 int low = 0; 259 int high = table->info->chain_count - 1; 260 int next, compar, l; 261 collate_chain_t *p; 262 collate_chain_t *tab = table->chain_pri_table; 263 264 if (high < 0) 265 return (NULL); 266 267 while (low <= high) { 268 next = (low + high) / 2; 269 p = tab + next; 270 compar = *key - *p->str; 271 if (compar == 0) { 272 l = wcsnlen(p->str, COLLATE_STR_LEN); 273 compar = wcsncmp(key, p->str, l); 274 if (compar == 0) { 275 *len = l; 276 return (p); 277 } 278 } 279 if (compar > 0) 280 low = next + 1; 281 else 282 high = next - 1; 283 } 284 return (NULL); 285 } 286 287 static collate_large_t * 288 largesearch(struct xlocale_collate *table, const wchar_t key) 289 { 290 int low = 0; 291 int high = table->info->large_count - 1; 292 int next, compar; 293 collate_large_t *p; 294 collate_large_t *tab = table->large_pri_table; 295 296 if (high < 0) 297 return (NULL); 298 299 while (low <= high) { 300 next = (low + high) / 2; 301 p = tab + next; 302 compar = key - p->val; 303 if (compar == 0) 304 return (p); 305 if (compar > 0) 306 low = next + 1; 307 else 308 high = next - 1; 309 } 310 return (NULL); 311 } 312 313 void 314 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 315 int *pri, int which, const int **state) 316 { 317 collate_chain_t *p2; 318 collate_large_t *match; 319 int p, l; 320 const int *sptr; 321 322 /* 323 * If this is the "last" pass for the UNDEFINED, then 324 * we just return the priority itself. 325 */ 326 if (which >= table->info->directive_count) { 327 *pri = *t; 328 *len = 1; 329 *state = NULL; 330 return; 331 } 332 333 /* 334 * If we have remaining substitution data from a previous 335 * call, consume it first. 336 */ 337 if ((sptr = *state) != NULL) { 338 *pri = *sptr; 339 sptr++; 340 if ((sptr == *state) || (sptr == NULL)) 341 *state = NULL; 342 else 343 *state = sptr; 344 *len = 0; 345 return; 346 } 347 348 /* No active substitutions */ 349 *len = 1; 350 351 /* 352 * Check for composites such as diphthongs that collate as a 353 * single element (aka chains or collating-elements). 354 */ 355 if (((p2 = chainsearch(table, t, &l)) != NULL) && 356 ((p = p2->pri[which]) >= 0)) { 357 358 *len = l; 359 *pri = p; 360 361 } else if (*t <= UCHAR_MAX) { 362 363 /* 364 * Character is a small (8-bit) character. 365 * We just look these up directly for speed. 366 */ 367 *pri = table->char_pri_table[*t].pri[which]; 368 369 } else if ((table->info->large_count > 0) && 370 ((match = largesearch(table, *t)) != NULL)) { 371 372 /* 373 * Character was found in the extended table. 374 */ 375 *pri = match->pri.pri[which]; 376 377 } else { 378 /* 379 * Character lacks a specific definition. 380 */ 381 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 382 /* Mask off sign bit to prevent ordering confusion. */ 383 *pri = (*t & COLLATE_MAX_PRIORITY); 384 } else { 385 *pri = table->info->undef_pri[which]; 386 } 387 /* No substitutions for undefined characters! */ 388 return; 389 } 390 391 /* 392 * Try substituting (expanding) the character. We are 393 * currently doing this *after* the chain compression. I 394 * think it should not matter, but this way might be slightly 395 * faster. 396 * 397 * We do this after the priority search, as this will help us 398 * to identify a single key value. In order for this to work, 399 * its important that the priority assigned to a given element 400 * to be substituted be unique for that level. The localedef 401 * code ensures this for us. 402 */ 403 if ((sptr = substsearch(table, *pri, which)) != NULL) { 404 if ((*pri = *sptr) > 0) { 405 sptr++; 406 *state = *sptr ? sptr : NULL; 407 } 408 } 409 410 } 411 412 /* 413 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 414 * NOT NULL terminate. That is left to the caller. 415 */ 416 size_t 417 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 418 size_t room) 419 { 420 int pri; 421 int len; 422 const wchar_t *t; 423 wchar_t *tr = NULL; 424 int direc; 425 int pass; 426 const int32_t *state; 427 size_t want = 0; 428 size_t need = 0; 429 int ndir = table->info->directive_count; 430 431 assert(src); 432 433 for (pass = 0; pass <= ndir; pass++) { 434 435 state = NULL; 436 437 if (pass != 0) { 438 /* insert level separator from the previous pass */ 439 if (room) { 440 *xf++ = 1; 441 room--; 442 } 443 want++; 444 } 445 446 /* special pass for undefined */ 447 if (pass == ndir) { 448 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 449 } else { 450 direc = table->info->directive[pass]; 451 } 452 453 t = src; 454 455 if (direc & DIRECTIVE_BACKWARD) { 456 wchar_t *bp, *fp, c; 457 free(tr); 458 if ((tr = wcsdup(t)) == NULL) { 459 errno = ENOMEM; 460 goto fail; 461 } 462 bp = tr; 463 fp = tr + wcslen(tr) - 1; 464 while (bp < fp) { 465 c = *bp; 466 *bp++ = *fp; 467 *fp-- = c; 468 } 469 t = (const wchar_t *)tr; 470 } 471 472 if (direc & DIRECTIVE_POSITION) { 473 while (*t || state) { 474 _collate_lookup(table, t, &len, &pri, pass, &state); 475 t += len; 476 if (pri <= 0) { 477 if (pri < 0) { 478 errno = EINVAL; 479 goto fail; 480 } 481 state = NULL; 482 pri = COLLATE_MAX_PRIORITY; 483 } 484 if (room) { 485 *xf++ = pri; 486 room--; 487 } 488 want++; 489 need = want; 490 } 491 } else { 492 while (*t || state) { 493 _collate_lookup(table, t, &len, &pri, pass, &state); 494 t += len; 495 if (pri <= 0) { 496 if (pri < 0) { 497 errno = EINVAL; 498 goto fail; 499 } 500 state = NULL; 501 continue; 502 } 503 if (room) { 504 *xf++ = pri; 505 room--; 506 } 507 want++; 508 need = want; 509 } 510 } 511 } 512 free(tr); 513 return (need); 514 515 fail: 516 free(tr); 517 return ((size_t)(-1)); 518 } 519 520 /* 521 * In the non-POSIX case, we transform each character into a string of 522 * characters representing the character's priority. Since char is usually 523 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 524 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 525 * bits per byte. 526 * 527 * It turns out that we sometimes have real priorities that are 528 * 31-bits wide. (But: be careful using priorities where the high 529 * order bit is set -- i.e. the priority is negative. The sort order 530 * may be surprising!) 531 * 532 * TODO: This would be a good area to optimize somewhat. It turns out 533 * that real prioririties *except for the last UNDEFINED pass* are generally 534 * very small. We need the localedef code to precalculate the max 535 * priority for us, and ideally also give us a mask, and then we could 536 * severely limit what we expand to. 537 */ 538 #define XFRM_BYTES 6 539 #define XFRM_OFFSET ('0') /* make all printable characters */ 540 #define XFRM_SHIFT 6 541 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 542 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 543 544 static int 545 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 546 { 547 /* we use unsigned to ensure zero fill on right shift */ 548 uint32_t val = (uint32_t)table->info->pri_count[pass]; 549 int nc = 0; 550 551 while (val) { 552 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 553 pri >>= XFRM_SHIFT; 554 val >>= XFRM_SHIFT; 555 p++; 556 nc++; 557 } 558 return (nc); 559 } 560 561 size_t 562 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 563 size_t room) 564 { 565 int pri; 566 int len; 567 const wchar_t *t; 568 wchar_t *tr = NULL; 569 int direc; 570 int pass; 571 const int32_t *state; 572 size_t want = 0; 573 size_t need = 0; 574 int b; 575 uint8_t buf[XFRM_BYTES]; 576 int ndir = table->info->directive_count; 577 578 assert(src); 579 580 for (pass = 0; pass <= ndir; pass++) { 581 582 state = NULL; 583 584 if (pass != 0) { 585 /* insert level separator from the previous pass */ 586 if (room) { 587 *xf++ = XFRM_SEP; 588 room--; 589 } 590 want++; 591 } 592 593 /* special pass for undefined */ 594 if (pass == ndir) { 595 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 596 } else { 597 direc = table->info->directive[pass]; 598 } 599 600 t = src; 601 602 if (direc & DIRECTIVE_BACKWARD) { 603 wchar_t *bp, *fp, c; 604 free(tr); 605 if ((tr = wcsdup(t)) == NULL) { 606 errno = ENOMEM; 607 goto fail; 608 } 609 bp = tr; 610 fp = tr + wcslen(tr) - 1; 611 while (bp < fp) { 612 c = *bp; 613 *bp++ = *fp; 614 *fp-- = c; 615 } 616 t = (const wchar_t *)tr; 617 } 618 619 if (direc & DIRECTIVE_POSITION) { 620 while (*t || state) { 621 622 _collate_lookup(table, t, &len, &pri, pass, &state); 623 t += len; 624 if (pri <= 0) { 625 if (pri < 0) { 626 errno = EINVAL; 627 goto fail; 628 } 629 state = NULL; 630 pri = COLLATE_MAX_PRIORITY; 631 } 632 633 b = xfrm(table, buf, pri, pass); 634 want += b; 635 if (room) { 636 while (b) { 637 b--; 638 if (room) { 639 *xf++ = buf[b]; 640 room--; 641 } 642 } 643 } 644 need = want; 645 } 646 } else { 647 while (*t || state) { 648 _collate_lookup(table, t, &len, &pri, pass, &state); 649 t += len; 650 if (pri <= 0) { 651 if (pri < 0) { 652 errno = EINVAL; 653 goto fail; 654 } 655 state = NULL; 656 continue; 657 } 658 659 b = xfrm(table, buf, pri, pass); 660 want += b; 661 if (room) { 662 663 while (b) { 664 b--; 665 if (room) { 666 *xf++ = buf[b]; 667 room--; 668 } 669 } 670 } 671 need = want; 672 } 673 } 674 } 675 free(tr); 676 return (need); 677 678 fail: 679 free(tr); 680 return ((size_t)(-1)); 681 } 682 683 /* 684 * __collate_equiv_value returns the primary collation value for the given 685 * collating symbol specified by str and len. Zero or negative is returned 686 * if the collating symbol was not found. This function is used by bracket 687 * code in the TRE regex library. 688 */ 689 int 690 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 691 { 692 int32_t e; 693 694 if (len < 1 || len >= COLLATE_STR_LEN) 695 return (-1); 696 697 FIX_LOCALE(locale); 698 struct xlocale_collate *table = 699 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 700 701 if (table->__collate_load_error) 702 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 703 704 if (len == 1) { 705 e = -1; 706 if (*str <= UCHAR_MAX) 707 e = table->char_pri_table[*str].pri[0]; 708 else if (table->info->large_count > 0) { 709 collate_large_t *match_large; 710 match_large = largesearch(table, *str); 711 if (match_large) 712 e = match_large->pri.pri[0]; 713 } 714 if (e == 0) 715 return (1); 716 return (e > 0 ? e : 0); 717 } 718 if (table->info->chain_count > 0) { 719 wchar_t name[COLLATE_STR_LEN]; 720 collate_chain_t *match_chain; 721 int clen; 722 723 wcsncpy (name, str, len); 724 name[len] = 0; 725 match_chain = chainsearch(table, name, &clen); 726 if (match_chain) { 727 e = match_chain->pri[0]; 728 if (e == 0) 729 return (1); 730 return (e < 0 ? -e : e); 731 } 732 } 733 return (0); 734 } 735