1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 #include "namespace.h" 41 42 #include <sys/types.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include <assert.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include <errno.h> 52 #include <unistd.h> 53 #include <fcntl.h> 54 #include "un-namespace.h" 55 56 #include "collate.h" 57 #include "setlocale.h" 58 #include "ldpart.h" 59 #include "libc_private.h" 60 61 struct xlocale_collate __xlocale_global_collate = { 62 {{0}, "C"}, 1, 0, 0, 0 63 }; 64 65 struct xlocale_collate __xlocale_C_collate = { 66 {{0}, "C"}, 1, 0, 0, 0 67 }; 68 69 struct xlocale_collate __xlocale_POSIX_collate = { 70 {{0}, "POSIX"}, 1, 0, 0, 0 71 }; 72 73 struct xlocale_collate __xlocale_CUTF8_collate = { 74 {{0}, "C.UTF-8"}, 1, 0, 0, 0 75 }; 76 77 static int 78 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 79 80 static void 81 destruct_collate(void *t) 82 { 83 struct xlocale_collate *table = t; 84 if (table->map && (table->maplen > 0)) { 85 (void) munmap(table->map, table->maplen); 86 } 87 free(t); 88 } 89 90 void * 91 __collate_load(const char *encoding, __unused locale_t unused) 92 { 93 if (strcmp(encoding, "C") == 0) 94 return (&__xlocale_C_collate); 95 else if (strcmp(encoding, "POSIX") == 0) 96 return (&__xlocale_POSIX_collate); 97 else if (strcmp(encoding, "C.UTF-8") == 0) 98 return (&__xlocale_CUTF8_collate); 99 100 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 101 1); 102 if (table == NULL) 103 return (NULL); 104 table->header.header.destructor = destruct_collate; 105 106 /* 107 * FIXME: Make sure that _LDP_CACHE is never returned. We 108 * should be doing the caching outside of this section. 109 */ 110 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 111 xlocale_release(table); 112 return (NULL); 113 } 114 return (table); 115 } 116 117 /** 118 * Load the collation tables for the specified encoding into the global table. 119 */ 120 int 121 __collate_load_tables(const char *encoding) 122 { 123 124 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 125 } 126 127 static int 128 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 129 { 130 int i, chains, z; 131 char *buf; 132 char *TMP; 133 char *map; 134 collate_info_t *info; 135 struct stat sbuf; 136 int fd; 137 138 table->__collate_load_error = 1; 139 140 /* 'encoding' must be already checked. */ 141 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 142 strncmp(encoding, "C.", 2) == 0) { 143 return (_LDP_CACHE); 144 } 145 146 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 147 return (_LDP_ERROR); 148 149 if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) { 150 free(buf); 151 return (_LDP_ERROR); 152 } 153 free(buf); 154 if (_fstat(fd, &sbuf) < 0) { 155 (void) _close(fd); 156 return (_LDP_ERROR); 157 } 158 if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN + 159 XLOCALE_DEF_VERSION_LEN + 160 sizeof (*info))) { 161 (void) _close(fd); 162 errno = EINVAL; 163 return (_LDP_ERROR); 164 } 165 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 166 (void) _close(fd); 167 if ((TMP = map) == MAP_FAILED) { 168 return (_LDP_ERROR); 169 } 170 171 if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) { 172 (void) munmap(map, sbuf.st_size); 173 errno = EINVAL; 174 return (_LDP_ERROR); 175 } 176 TMP += COLLATE_FMT_VERSION_LEN; 177 strlcat(table->header.version, TMP, sizeof (table->header.version)); 178 TMP += XLOCALE_DEF_VERSION_LEN; 179 180 info = (void *)TMP; 181 TMP += sizeof (*info); 182 183 if ((info->directive_count < 1) || 184 (info->directive_count >= COLL_WEIGHTS_MAX) || 185 ((chains = info->chain_count) < 0)) { 186 (void) munmap(map, sbuf.st_size); 187 errno = EINVAL; 188 return (_LDP_ERROR); 189 } 190 191 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 192 (sizeof (collate_chain_t) * chains) + 193 (sizeof (collate_large_t) * info->large_count); 194 for (z = 0; z < info->directive_count; z++) { 195 i += sizeof (collate_subst_t) * info->subst_count[z]; 196 } 197 if (i != (sbuf.st_size - (TMP - map))) { 198 (void) munmap(map, sbuf.st_size); 199 errno = EINVAL; 200 return (_LDP_ERROR); 201 } 202 203 if (table->map && (table->maplen > 0)) { 204 (void) munmap(table->map, table->maplen); 205 } 206 table->map = map; 207 table->maplen = sbuf.st_size; 208 table->info = info; 209 table->char_pri_table = (void *)TMP; 210 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 211 212 for (z = 0; z < info->directive_count; z++) { 213 if (info->subst_count[z] > 0) { 214 table->subst_table[z] = (void *)TMP; 215 TMP += info->subst_count[z] * sizeof (collate_subst_t); 216 } else { 217 table->subst_table[z] = NULL; 218 } 219 } 220 221 if (chains > 0) { 222 table->chain_pri_table = (void *)TMP; 223 TMP += chains * sizeof (collate_chain_t); 224 } else 225 table->chain_pri_table = NULL; 226 if (info->large_count > 0) 227 table->large_pri_table = (void *)TMP; 228 else 229 table->large_pri_table = NULL; 230 231 table->__collate_load_error = 0; 232 return (_LDP_LOADED); 233 } 234 235 static const int32_t * 236 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 237 { 238 const collate_subst_t *p; 239 int n = table->info->subst_count[pass]; 240 241 if (n == 0) 242 return (NULL); 243 244 if (pass >= table->info->directive_count) 245 return (NULL); 246 247 if (!(key & COLLATE_SUBST_PRIORITY)) 248 return (NULL); 249 250 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 251 assert(p->key == key); 252 253 return (p->pri); 254 } 255 256 static collate_chain_t * 257 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 258 { 259 int low = 0; 260 int high = table->info->chain_count - 1; 261 int next, compar, l; 262 collate_chain_t *p; 263 collate_chain_t *tab = table->chain_pri_table; 264 265 if (high < 0) 266 return (NULL); 267 268 while (low <= high) { 269 next = (low + high) / 2; 270 p = tab + next; 271 compar = *key - *p->str; 272 if (compar == 0) { 273 l = wcsnlen(p->str, COLLATE_STR_LEN); 274 compar = wcsncmp(key, p->str, l); 275 if (compar == 0) { 276 *len = l; 277 return (p); 278 } 279 } 280 if (compar > 0) 281 low = next + 1; 282 else 283 high = next - 1; 284 } 285 return (NULL); 286 } 287 288 static collate_large_t * 289 largesearch(struct xlocale_collate *table, const wchar_t key) 290 { 291 int low = 0; 292 int high = table->info->large_count - 1; 293 int next, compar; 294 collate_large_t *p; 295 collate_large_t *tab = table->large_pri_table; 296 297 if (high < 0) 298 return (NULL); 299 300 while (low <= high) { 301 next = (low + high) / 2; 302 p = tab + next; 303 compar = key - p->val; 304 if (compar == 0) 305 return (p); 306 if (compar > 0) 307 low = next + 1; 308 else 309 high = next - 1; 310 } 311 return (NULL); 312 } 313 314 void 315 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 316 int *pri, int which, const int **state) 317 { 318 collate_chain_t *p2; 319 collate_large_t *match; 320 int p, l; 321 const int *sptr; 322 323 /* 324 * If this is the "last" pass for the UNDEFINED, then 325 * we just return the priority itself. 326 */ 327 if (which >= table->info->directive_count) { 328 *pri = *t; 329 *len = 1; 330 *state = NULL; 331 return; 332 } 333 334 /* 335 * If we have remaining substitution data from a previous 336 * call, consume it first. 337 */ 338 if ((sptr = *state) != NULL) { 339 *pri = *sptr; 340 sptr++; 341 if ((sptr == *state) || (sptr == NULL)) 342 *state = NULL; 343 else 344 *state = sptr; 345 *len = 0; 346 return; 347 } 348 349 /* No active substitutions */ 350 *len = 1; 351 352 /* 353 * Check for composites such as diphthongs that collate as a 354 * single element (aka chains or collating-elements). 355 */ 356 if (((p2 = chainsearch(table, t, &l)) != NULL) && 357 ((p = p2->pri[which]) >= 0)) { 358 359 *len = l; 360 *pri = p; 361 362 } else if (*t <= UCHAR_MAX) { 363 364 /* 365 * Character is a small (8-bit) character. 366 * We just look these up directly for speed. 367 */ 368 *pri = table->char_pri_table[*t].pri[which]; 369 370 } else if ((table->info->large_count > 0) && 371 ((match = largesearch(table, *t)) != NULL)) { 372 373 /* 374 * Character was found in the extended table. 375 */ 376 *pri = match->pri.pri[which]; 377 378 } else { 379 /* 380 * Character lacks a specific definition. 381 */ 382 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 383 /* Mask off sign bit to prevent ordering confusion. */ 384 *pri = (*t & COLLATE_MAX_PRIORITY); 385 } else { 386 *pri = table->info->undef_pri[which]; 387 } 388 /* No substitutions for undefined characters! */ 389 return; 390 } 391 392 /* 393 * Try substituting (expanding) the character. We are 394 * currently doing this *after* the chain compression. I 395 * think it should not matter, but this way might be slightly 396 * faster. 397 * 398 * We do this after the priority search, as this will help us 399 * to identify a single key value. In order for this to work, 400 * its important that the priority assigned to a given element 401 * to be substituted be unique for that level. The localedef 402 * code ensures this for us. 403 */ 404 if ((sptr = substsearch(table, *pri, which)) != NULL) { 405 if ((*pri = *sptr) > 0) { 406 sptr++; 407 *state = *sptr ? sptr : NULL; 408 } 409 } 410 411 } 412 413 /* 414 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 415 * NOT NULL terminate. That is left to the caller. 416 */ 417 size_t 418 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 419 size_t room) 420 { 421 int pri; 422 int len; 423 const wchar_t *t; 424 wchar_t *tr = NULL; 425 int direc; 426 int pass; 427 const int32_t *state; 428 size_t want = 0; 429 size_t need = 0; 430 int ndir = table->info->directive_count; 431 432 assert(src); 433 434 for (pass = 0; pass <= ndir; pass++) { 435 436 state = NULL; 437 438 if (pass != 0) { 439 /* insert level separator from the previous pass */ 440 if (room) { 441 *xf++ = 1; 442 room--; 443 } 444 want++; 445 } 446 447 /* special pass for undefined */ 448 if (pass == ndir) { 449 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 450 } else { 451 direc = table->info->directive[pass]; 452 } 453 454 t = src; 455 456 if (direc & DIRECTIVE_BACKWARD) { 457 wchar_t *bp, *fp, c; 458 free(tr); 459 if ((tr = wcsdup(t)) == NULL) { 460 errno = ENOMEM; 461 goto fail; 462 } 463 bp = tr; 464 fp = tr + wcslen(tr) - 1; 465 while (bp < fp) { 466 c = *bp; 467 *bp++ = *fp; 468 *fp-- = c; 469 } 470 t = (const wchar_t *)tr; 471 } 472 473 if (direc & DIRECTIVE_POSITION) { 474 while (*t || state) { 475 _collate_lookup(table, t, &len, &pri, pass, &state); 476 t += len; 477 if (pri <= 0) { 478 if (pri < 0) { 479 errno = EINVAL; 480 goto fail; 481 } 482 state = NULL; 483 pri = COLLATE_MAX_PRIORITY; 484 } 485 if (room) { 486 *xf++ = pri; 487 room--; 488 } 489 want++; 490 need = want; 491 } 492 } else { 493 while (*t || state) { 494 _collate_lookup(table, t, &len, &pri, pass, &state); 495 t += len; 496 if (pri <= 0) { 497 if (pri < 0) { 498 errno = EINVAL; 499 goto fail; 500 } 501 state = NULL; 502 continue; 503 } 504 if (room) { 505 *xf++ = pri; 506 room--; 507 } 508 want++; 509 need = want; 510 } 511 } 512 } 513 free(tr); 514 return (need); 515 516 fail: 517 free(tr); 518 return ((size_t)(-1)); 519 } 520 521 /* 522 * In the non-POSIX case, we transform each character into a string of 523 * characters representing the character's priority. Since char is usually 524 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 525 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 526 * bits per byte. 527 * 528 * It turns out that we sometimes have real priorities that are 529 * 31-bits wide. (But: be careful using priorities where the high 530 * order bit is set -- i.e. the priority is negative. The sort order 531 * may be surprising!) 532 * 533 * TODO: This would be a good area to optimize somewhat. It turns out 534 * that real prioririties *except for the last UNDEFINED pass* are generally 535 * very small. We need the localedef code to precalculate the max 536 * priority for us, and ideally also give us a mask, and then we could 537 * severely limit what we expand to. 538 */ 539 #define XFRM_BYTES 6 540 #define XFRM_OFFSET ('0') /* make all printable characters */ 541 #define XFRM_SHIFT 6 542 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 543 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 544 545 static int 546 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 547 { 548 /* we use unsigned to ensure zero fill on right shift */ 549 uint32_t val = (uint32_t)table->info->pri_count[pass]; 550 int nc = 0; 551 552 while (val) { 553 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 554 pri >>= XFRM_SHIFT; 555 val >>= XFRM_SHIFT; 556 p++; 557 nc++; 558 } 559 return (nc); 560 } 561 562 size_t 563 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 564 size_t room) 565 { 566 int pri; 567 int len; 568 const wchar_t *t; 569 wchar_t *tr = NULL; 570 int direc; 571 int pass; 572 const int32_t *state; 573 size_t want = 0; 574 size_t need = 0; 575 int b; 576 uint8_t buf[XFRM_BYTES]; 577 int ndir = table->info->directive_count; 578 579 assert(src); 580 581 for (pass = 0; pass <= ndir; pass++) { 582 583 state = NULL; 584 585 if (pass != 0) { 586 /* insert level separator from the previous pass */ 587 if (room) { 588 *xf++ = XFRM_SEP; 589 room--; 590 } 591 want++; 592 } 593 594 /* special pass for undefined */ 595 if (pass == ndir) { 596 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 597 } else { 598 direc = table->info->directive[pass]; 599 } 600 601 t = src; 602 603 if (direc & DIRECTIVE_BACKWARD) { 604 wchar_t *bp, *fp, c; 605 free(tr); 606 if ((tr = wcsdup(t)) == NULL) { 607 errno = ENOMEM; 608 goto fail; 609 } 610 bp = tr; 611 fp = tr + wcslen(tr) - 1; 612 while (bp < fp) { 613 c = *bp; 614 *bp++ = *fp; 615 *fp-- = c; 616 } 617 t = (const wchar_t *)tr; 618 } 619 620 if (direc & DIRECTIVE_POSITION) { 621 while (*t || state) { 622 623 _collate_lookup(table, t, &len, &pri, pass, &state); 624 t += len; 625 if (pri <= 0) { 626 if (pri < 0) { 627 errno = EINVAL; 628 goto fail; 629 } 630 state = NULL; 631 pri = COLLATE_MAX_PRIORITY; 632 } 633 634 b = xfrm(table, buf, pri, pass); 635 want += b; 636 if (room) { 637 while (b) { 638 b--; 639 if (room) { 640 *xf++ = buf[b]; 641 room--; 642 } 643 } 644 } 645 need = want; 646 } 647 } else { 648 while (*t || state) { 649 _collate_lookup(table, t, &len, &pri, pass, &state); 650 t += len; 651 if (pri <= 0) { 652 if (pri < 0) { 653 errno = EINVAL; 654 goto fail; 655 } 656 state = NULL; 657 continue; 658 } 659 660 b = xfrm(table, buf, pri, pass); 661 want += b; 662 if (room) { 663 664 while (b) { 665 b--; 666 if (room) { 667 *xf++ = buf[b]; 668 room--; 669 } 670 } 671 } 672 need = want; 673 } 674 } 675 } 676 free(tr); 677 return (need); 678 679 fail: 680 free(tr); 681 return ((size_t)(-1)); 682 } 683 684 /* 685 * __collate_equiv_value returns the primary collation value for the given 686 * collating symbol specified by str and len. Zero or negative is returned 687 * if the collating symbol was not found. This function is used by bracket 688 * code in the TRE regex library. 689 */ 690 int 691 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 692 { 693 int32_t e; 694 695 if (len < 1 || len >= COLLATE_STR_LEN) 696 return (-1); 697 698 FIX_LOCALE(locale); 699 struct xlocale_collate *table = 700 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 701 702 if (table->__collate_load_error) 703 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 704 705 if (len == 1) { 706 e = -1; 707 if (*str <= UCHAR_MAX) 708 e = table->char_pri_table[*str].pri[0]; 709 else if (table->info->large_count > 0) { 710 collate_large_t *match_large; 711 match_large = largesearch(table, *str); 712 if (match_large) 713 e = match_large->pri.pri[0]; 714 } 715 if (e == 0) 716 return (1); 717 return (e > 0 ? e : 0); 718 } 719 if (table->info->chain_count > 0) { 720 wchar_t name[COLLATE_STR_LEN]; 721 collate_chain_t *match_chain; 722 int clen; 723 724 wcsncpy (name, str, len); 725 name[len] = 0; 726 match_chain = chainsearch(table, name, &clen); 727 if (match_chain) { 728 e = match_chain->pri[0]; 729 if (e == 0) 730 return (1); 731 return (e < 0 ? -e : e); 732 } 733 } 734 return (0); 735 } 736