1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "collate.h" 59 #include "setlocale.h" 60 #include "ldpart.h" 61 #include "libc_private.h" 62 63 struct xlocale_collate __xlocale_global_collate = { 64 {{0}, "C"}, 1, 0, 0, 0 65 }; 66 67 struct xlocale_collate __xlocale_C_collate = { 68 {{0}, "C"}, 1, 0, 0, 0 69 }; 70 71 struct xlocale_collate __xlocale_POSIX_collate = { 72 {{0}, "POSIX"}, 1, 0, 0, 0 73 }; 74 75 struct xlocale_collate __xlocale_CUTF8_collate = { 76 {{0}, "C.UTF-8"}, 1, 0, 0, 0 77 }; 78 79 static int 80 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 81 82 static void 83 destruct_collate(void *t) 84 { 85 struct xlocale_collate *table = t; 86 if (table->map && (table->maplen > 0)) { 87 (void) munmap(table->map, table->maplen); 88 } 89 free(t); 90 } 91 92 void * 93 __collate_load(const char *encoding, __unused locale_t unused) 94 { 95 if (strcmp(encoding, "C") == 0) 96 return (&__xlocale_C_collate); 97 else if (strcmp(encoding, "POSIX") == 0) 98 return (&__xlocale_POSIX_collate); 99 else if (strcmp(encoding, "C.UTF-8") == 0) 100 return (&__xlocale_CUTF8_collate); 101 102 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 103 1); 104 if (table == NULL) 105 return (NULL); 106 table->header.header.destructor = destruct_collate; 107 108 /* 109 * FIXME: Make sure that _LDP_CACHE is never returned. We 110 * should be doing the caching outside of this section. 111 */ 112 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 113 xlocale_release(table); 114 return (NULL); 115 } 116 return (table); 117 } 118 119 /** 120 * Load the collation tables for the specified encoding into the global table. 121 */ 122 int 123 __collate_load_tables(const char *encoding) 124 { 125 126 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 127 } 128 129 static int 130 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 131 { 132 int i, chains, z; 133 char *buf; 134 char *TMP; 135 char *map; 136 collate_info_t *info; 137 struct stat sbuf; 138 int fd; 139 140 table->__collate_load_error = 1; 141 142 /* 'encoding' must be already checked. */ 143 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 144 strncmp(encoding, "C.", 2) == 0) { 145 return (_LDP_CACHE); 146 } 147 148 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 149 return (_LDP_ERROR); 150 151 if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) { 152 free(buf); 153 return (_LDP_ERROR); 154 } 155 free(buf); 156 if (_fstat(fd, &sbuf) < 0) { 157 (void) _close(fd); 158 return (_LDP_ERROR); 159 } 160 if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN + 161 XLOCALE_DEF_VERSION_LEN + 162 sizeof (*info))) { 163 (void) _close(fd); 164 errno = EINVAL; 165 return (_LDP_ERROR); 166 } 167 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 168 (void) _close(fd); 169 if ((TMP = map) == MAP_FAILED) { 170 return (_LDP_ERROR); 171 } 172 173 if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) { 174 (void) munmap(map, sbuf.st_size); 175 errno = EINVAL; 176 return (_LDP_ERROR); 177 } 178 TMP += COLLATE_FMT_VERSION_LEN; 179 strlcat(table->header.version, TMP, sizeof (table->header.version)); 180 TMP += XLOCALE_DEF_VERSION_LEN; 181 182 info = (void *)TMP; 183 TMP += sizeof (*info); 184 185 if ((info->directive_count < 1) || 186 (info->directive_count >= COLL_WEIGHTS_MAX) || 187 ((chains = info->chain_count) < 0)) { 188 (void) munmap(map, sbuf.st_size); 189 errno = EINVAL; 190 return (_LDP_ERROR); 191 } 192 193 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 194 (sizeof (collate_chain_t) * chains) + 195 (sizeof (collate_large_t) * info->large_count); 196 for (z = 0; z < info->directive_count; z++) { 197 i += sizeof (collate_subst_t) * info->subst_count[z]; 198 } 199 if (i != (sbuf.st_size - (TMP - map))) { 200 (void) munmap(map, sbuf.st_size); 201 errno = EINVAL; 202 return (_LDP_ERROR); 203 } 204 205 if (table->map && (table->maplen > 0)) { 206 (void) munmap(table->map, table->maplen); 207 } 208 table->map = map; 209 table->maplen = sbuf.st_size; 210 table->info = info; 211 table->char_pri_table = (void *)TMP; 212 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 213 214 for (z = 0; z < info->directive_count; z++) { 215 if (info->subst_count[z] > 0) { 216 table->subst_table[z] = (void *)TMP; 217 TMP += info->subst_count[z] * sizeof (collate_subst_t); 218 } else { 219 table->subst_table[z] = NULL; 220 } 221 } 222 223 if (chains > 0) { 224 table->chain_pri_table = (void *)TMP; 225 TMP += chains * sizeof (collate_chain_t); 226 } else 227 table->chain_pri_table = NULL; 228 if (info->large_count > 0) 229 table->large_pri_table = (void *)TMP; 230 else 231 table->large_pri_table = NULL; 232 233 table->__collate_load_error = 0; 234 return (_LDP_LOADED); 235 } 236 237 static const int32_t * 238 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 239 { 240 const collate_subst_t *p; 241 int n = table->info->subst_count[pass]; 242 243 if (n == 0) 244 return (NULL); 245 246 if (pass >= table->info->directive_count) 247 return (NULL); 248 249 if (!(key & COLLATE_SUBST_PRIORITY)) 250 return (NULL); 251 252 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 253 assert(p->key == key); 254 255 return (p->pri); 256 } 257 258 static collate_chain_t * 259 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 260 { 261 int low = 0; 262 int high = table->info->chain_count - 1; 263 int next, compar, l; 264 collate_chain_t *p; 265 collate_chain_t *tab = table->chain_pri_table; 266 267 if (high < 0) 268 return (NULL); 269 270 while (low <= high) { 271 next = (low + high) / 2; 272 p = tab + next; 273 compar = *key - *p->str; 274 if (compar == 0) { 275 l = wcsnlen(p->str, COLLATE_STR_LEN); 276 compar = wcsncmp(key, p->str, l); 277 if (compar == 0) { 278 *len = l; 279 return (p); 280 } 281 } 282 if (compar > 0) 283 low = next + 1; 284 else 285 high = next - 1; 286 } 287 return (NULL); 288 } 289 290 static collate_large_t * 291 largesearch(struct xlocale_collate *table, const wchar_t key) 292 { 293 int low = 0; 294 int high = table->info->large_count - 1; 295 int next, compar; 296 collate_large_t *p; 297 collate_large_t *tab = table->large_pri_table; 298 299 if (high < 0) 300 return (NULL); 301 302 while (low <= high) { 303 next = (low + high) / 2; 304 p = tab + next; 305 compar = key - p->val; 306 if (compar == 0) 307 return (p); 308 if (compar > 0) 309 low = next + 1; 310 else 311 high = next - 1; 312 } 313 return (NULL); 314 } 315 316 void 317 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 318 int *pri, int which, const int **state) 319 { 320 collate_chain_t *p2; 321 collate_large_t *match; 322 int p, l; 323 const int *sptr; 324 325 /* 326 * If this is the "last" pass for the UNDEFINED, then 327 * we just return the priority itself. 328 */ 329 if (which >= table->info->directive_count) { 330 *pri = *t; 331 *len = 1; 332 *state = NULL; 333 return; 334 } 335 336 /* 337 * If we have remaining substitution data from a previous 338 * call, consume it first. 339 */ 340 if ((sptr = *state) != NULL) { 341 *pri = *sptr; 342 sptr++; 343 if ((sptr == *state) || (sptr == NULL)) 344 *state = NULL; 345 else 346 *state = sptr; 347 *len = 0; 348 return; 349 } 350 351 /* No active substitutions */ 352 *len = 1; 353 354 /* 355 * Check for composites such as diphthongs that collate as a 356 * single element (aka chains or collating-elements). 357 */ 358 if (((p2 = chainsearch(table, t, &l)) != NULL) && 359 ((p = p2->pri[which]) >= 0)) { 360 361 *len = l; 362 *pri = p; 363 364 } else if (*t <= UCHAR_MAX) { 365 366 /* 367 * Character is a small (8-bit) character. 368 * We just look these up directly for speed. 369 */ 370 *pri = table->char_pri_table[*t].pri[which]; 371 372 } else if ((table->info->large_count > 0) && 373 ((match = largesearch(table, *t)) != NULL)) { 374 375 /* 376 * Character was found in the extended table. 377 */ 378 *pri = match->pri.pri[which]; 379 380 } else { 381 /* 382 * Character lacks a specific definition. 383 */ 384 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 385 /* Mask off sign bit to prevent ordering confusion. */ 386 *pri = (*t & COLLATE_MAX_PRIORITY); 387 } else { 388 *pri = table->info->undef_pri[which]; 389 } 390 /* No substitutions for undefined characters! */ 391 return; 392 } 393 394 /* 395 * Try substituting (expanding) the character. We are 396 * currently doing this *after* the chain compression. I 397 * think it should not matter, but this way might be slightly 398 * faster. 399 * 400 * We do this after the priority search, as this will help us 401 * to identify a single key value. In order for this to work, 402 * its important that the priority assigned to a given element 403 * to be substituted be unique for that level. The localedef 404 * code ensures this for us. 405 */ 406 if ((sptr = substsearch(table, *pri, which)) != NULL) { 407 if ((*pri = *sptr) > 0) { 408 sptr++; 409 *state = *sptr ? sptr : NULL; 410 } 411 } 412 413 } 414 415 /* 416 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 417 * NOT NULL terminate. That is left to the caller. 418 */ 419 size_t 420 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 421 size_t room) 422 { 423 int pri; 424 int len; 425 const wchar_t *t; 426 wchar_t *tr = NULL; 427 int direc; 428 int pass; 429 const int32_t *state; 430 size_t want = 0; 431 size_t need = 0; 432 int ndir = table->info->directive_count; 433 434 assert(src); 435 436 for (pass = 0; pass <= ndir; pass++) { 437 438 state = NULL; 439 440 if (pass != 0) { 441 /* insert level separator from the previous pass */ 442 if (room) { 443 *xf++ = 1; 444 room--; 445 } 446 want++; 447 } 448 449 /* special pass for undefined */ 450 if (pass == ndir) { 451 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 452 } else { 453 direc = table->info->directive[pass]; 454 } 455 456 t = src; 457 458 if (direc & DIRECTIVE_BACKWARD) { 459 wchar_t *bp, *fp, c; 460 free(tr); 461 if ((tr = wcsdup(t)) == NULL) { 462 errno = ENOMEM; 463 goto fail; 464 } 465 bp = tr; 466 fp = tr + wcslen(tr) - 1; 467 while (bp < fp) { 468 c = *bp; 469 *bp++ = *fp; 470 *fp-- = c; 471 } 472 t = (const wchar_t *)tr; 473 } 474 475 if (direc & DIRECTIVE_POSITION) { 476 while (*t || state) { 477 _collate_lookup(table, t, &len, &pri, pass, &state); 478 t += len; 479 if (pri <= 0) { 480 if (pri < 0) { 481 errno = EINVAL; 482 goto fail; 483 } 484 state = NULL; 485 pri = COLLATE_MAX_PRIORITY; 486 } 487 if (room) { 488 *xf++ = pri; 489 room--; 490 } 491 want++; 492 need = want; 493 } 494 } else { 495 while (*t || state) { 496 _collate_lookup(table, t, &len, &pri, pass, &state); 497 t += len; 498 if (pri <= 0) { 499 if (pri < 0) { 500 errno = EINVAL; 501 goto fail; 502 } 503 state = NULL; 504 continue; 505 } 506 if (room) { 507 *xf++ = pri; 508 room--; 509 } 510 want++; 511 need = want; 512 } 513 } 514 } 515 free(tr); 516 return (need); 517 518 fail: 519 free(tr); 520 return ((size_t)(-1)); 521 } 522 523 /* 524 * In the non-POSIX case, we transform each character into a string of 525 * characters representing the character's priority. Since char is usually 526 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 527 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 528 * bits per byte. 529 * 530 * It turns out that we sometimes have real priorities that are 531 * 31-bits wide. (But: be careful using priorities where the high 532 * order bit is set -- i.e. the priority is negative. The sort order 533 * may be surprising!) 534 * 535 * TODO: This would be a good area to optimize somewhat. It turns out 536 * that real prioririties *except for the last UNDEFINED pass* are generally 537 * very small. We need the localedef code to precalculate the max 538 * priority for us, and ideally also give us a mask, and then we could 539 * severely limit what we expand to. 540 */ 541 #define XFRM_BYTES 6 542 #define XFRM_OFFSET ('0') /* make all printable characters */ 543 #define XFRM_SHIFT 6 544 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 545 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 546 547 static int 548 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 549 { 550 /* we use unsigned to ensure zero fill on right shift */ 551 uint32_t val = (uint32_t)table->info->pri_count[pass]; 552 int nc = 0; 553 554 while (val) { 555 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 556 pri >>= XFRM_SHIFT; 557 val >>= XFRM_SHIFT; 558 p++; 559 nc++; 560 } 561 return (nc); 562 } 563 564 size_t 565 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 566 size_t room) 567 { 568 int pri; 569 int len; 570 const wchar_t *t; 571 wchar_t *tr = NULL; 572 int direc; 573 int pass; 574 const int32_t *state; 575 size_t want = 0; 576 size_t need = 0; 577 int b; 578 uint8_t buf[XFRM_BYTES]; 579 int ndir = table->info->directive_count; 580 581 assert(src); 582 583 for (pass = 0; pass <= ndir; pass++) { 584 585 state = NULL; 586 587 if (pass != 0) { 588 /* insert level separator from the previous pass */ 589 if (room) { 590 *xf++ = XFRM_SEP; 591 room--; 592 } 593 want++; 594 } 595 596 /* special pass for undefined */ 597 if (pass == ndir) { 598 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 599 } else { 600 direc = table->info->directive[pass]; 601 } 602 603 t = src; 604 605 if (direc & DIRECTIVE_BACKWARD) { 606 wchar_t *bp, *fp, c; 607 free(tr); 608 if ((tr = wcsdup(t)) == NULL) { 609 errno = ENOMEM; 610 goto fail; 611 } 612 bp = tr; 613 fp = tr + wcslen(tr) - 1; 614 while (bp < fp) { 615 c = *bp; 616 *bp++ = *fp; 617 *fp-- = c; 618 } 619 t = (const wchar_t *)tr; 620 } 621 622 if (direc & DIRECTIVE_POSITION) { 623 while (*t || state) { 624 625 _collate_lookup(table, t, &len, &pri, pass, &state); 626 t += len; 627 if (pri <= 0) { 628 if (pri < 0) { 629 errno = EINVAL; 630 goto fail; 631 } 632 state = NULL; 633 pri = COLLATE_MAX_PRIORITY; 634 } 635 636 b = xfrm(table, buf, pri, pass); 637 want += b; 638 if (room) { 639 while (b) { 640 b--; 641 if (room) { 642 *xf++ = buf[b]; 643 room--; 644 } 645 } 646 } 647 need = want; 648 } 649 } else { 650 while (*t || state) { 651 _collate_lookup(table, t, &len, &pri, pass, &state); 652 t += len; 653 if (pri <= 0) { 654 if (pri < 0) { 655 errno = EINVAL; 656 goto fail; 657 } 658 state = NULL; 659 continue; 660 } 661 662 b = xfrm(table, buf, pri, pass); 663 want += b; 664 if (room) { 665 666 while (b) { 667 b--; 668 if (room) { 669 *xf++ = buf[b]; 670 room--; 671 } 672 } 673 } 674 need = want; 675 } 676 } 677 } 678 free(tr); 679 return (need); 680 681 fail: 682 free(tr); 683 return ((size_t)(-1)); 684 } 685 686 /* 687 * __collate_equiv_value returns the primary collation value for the given 688 * collating symbol specified by str and len. Zero or negative is returned 689 * if the collating symbol was not found. This function is used by bracket 690 * code in the TRE regex library. 691 */ 692 int 693 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 694 { 695 int32_t e; 696 697 if (len < 1 || len >= COLLATE_STR_LEN) 698 return (-1); 699 700 FIX_LOCALE(locale); 701 struct xlocale_collate *table = 702 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 703 704 if (table->__collate_load_error) 705 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 706 707 if (len == 1) { 708 e = -1; 709 if (*str <= UCHAR_MAX) 710 e = table->char_pri_table[*str].pri[0]; 711 else if (table->info->large_count > 0) { 712 collate_large_t *match_large; 713 match_large = largesearch(table, *str); 714 if (match_large) 715 e = match_large->pri.pri[0]; 716 } 717 if (e == 0) 718 return (1); 719 return (e > 0 ? e : 0); 720 } 721 if (table->info->chain_count > 0) { 722 wchar_t name[COLLATE_STR_LEN]; 723 collate_chain_t *match_chain; 724 int clen; 725 726 wcsncpy (name, str, len); 727 name[len] = 0; 728 match_chain = chainsearch(table, name, &clen); 729 if (match_chain) { 730 e = match_chain->pri[0]; 731 if (e == 0) 732 return (1); 733 return (e < 0 ? -e : e); 734 } 735 } 736 return (0); 737 } 738