1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "collate.h" 59 #include "setlocale.h" 60 #include "ldpart.h" 61 #include "libc_private.h" 62 63 struct xlocale_collate __xlocale_global_collate = { 64 {{0}, "C"}, 1, 0, 0, 0 65 }; 66 67 struct xlocale_collate __xlocale_C_collate = { 68 {{0}, "C"}, 1, 0, 0, 0 69 }; 70 71 static int 72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 73 74 static void 75 destruct_collate(void *t) 76 { 77 struct xlocale_collate *table = t; 78 if (table->map && (table->maplen > 0)) { 79 (void) munmap(table->map, table->maplen); 80 } 81 free(t); 82 } 83 84 void * 85 __collate_load(const char *encoding, __unused locale_t unused) 86 { 87 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 88 strncmp(encoding, "C.", 2) == 0) { 89 return (&__xlocale_C_collate); 90 } 91 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 92 1); 93 if (table == NULL) 94 return (NULL); 95 table->header.header.destructor = destruct_collate; 96 97 /* 98 * FIXME: Make sure that _LDP_CACHE is never returned. We 99 * should be doing the caching outside of this section. 100 */ 101 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 102 xlocale_release(table); 103 return (NULL); 104 } 105 return (table); 106 } 107 108 /** 109 * Load the collation tables for the specified encoding into the global table. 110 */ 111 int 112 __collate_load_tables(const char *encoding) 113 { 114 115 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 116 } 117 118 static int 119 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 120 { 121 int i, chains, z; 122 char *buf; 123 char *TMP; 124 char *map; 125 collate_info_t *info; 126 struct stat sbuf; 127 int fd; 128 129 table->__collate_load_error = 1; 130 131 /* 'encoding' must be already checked. */ 132 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 133 strncmp(encoding, "C.", 2) == 0) { 134 return (_LDP_CACHE); 135 } 136 137 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 138 return (_LDP_ERROR); 139 140 if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) { 141 free(buf); 142 return (_LDP_ERROR); 143 } 144 free(buf); 145 if (_fstat(fd, &sbuf) < 0) { 146 (void) _close(fd); 147 return (_LDP_ERROR); 148 } 149 if (sbuf.st_size < (COLLATE_FMT_VERSION_LEN + 150 XLOCALE_DEF_VERSION_LEN + 151 sizeof (info))) { 152 (void) _close(fd); 153 errno = EINVAL; 154 return (_LDP_ERROR); 155 } 156 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 157 (void) _close(fd); 158 if ((TMP = map) == MAP_FAILED) { 159 return (_LDP_ERROR); 160 } 161 162 if (strncmp(TMP, COLLATE_FMT_VERSION, COLLATE_FMT_VERSION_LEN) != 0) { 163 (void) munmap(map, sbuf.st_size); 164 errno = EINVAL; 165 return (_LDP_ERROR); 166 } 167 TMP += COLLATE_FMT_VERSION_LEN; 168 strlcat(table->header.version, TMP, sizeof (table->header.version)); 169 TMP += XLOCALE_DEF_VERSION_LEN; 170 171 info = (void *)TMP; 172 TMP += sizeof (*info); 173 174 if ((info->directive_count < 1) || 175 (info->directive_count >= COLL_WEIGHTS_MAX) || 176 ((chains = info->chain_count) < 0)) { 177 (void) munmap(map, sbuf.st_size); 178 errno = EINVAL; 179 return (_LDP_ERROR); 180 } 181 182 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 183 (sizeof (collate_chain_t) * chains) + 184 (sizeof (collate_large_t) * info->large_count); 185 for (z = 0; z < info->directive_count; z++) { 186 i += sizeof (collate_subst_t) * info->subst_count[z]; 187 } 188 if (i != (sbuf.st_size - (TMP - map))) { 189 (void) munmap(map, sbuf.st_size); 190 errno = EINVAL; 191 return (_LDP_ERROR); 192 } 193 194 if (table->map && (table->maplen > 0)) { 195 (void) munmap(table->map, table->maplen); 196 } 197 table->map = map; 198 table->maplen = sbuf.st_size; 199 table->info = info; 200 table->char_pri_table = (void *)TMP; 201 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 202 203 for (z = 0; z < info->directive_count; z++) { 204 if (info->subst_count[z] > 0) { 205 table->subst_table[z] = (void *)TMP; 206 TMP += info->subst_count[z] * sizeof (collate_subst_t); 207 } else { 208 table->subst_table[z] = NULL; 209 } 210 } 211 212 if (chains > 0) { 213 table->chain_pri_table = (void *)TMP; 214 TMP += chains * sizeof (collate_chain_t); 215 } else 216 table->chain_pri_table = NULL; 217 if (info->large_count > 0) 218 table->large_pri_table = (void *)TMP; 219 else 220 table->large_pri_table = NULL; 221 222 table->__collate_load_error = 0; 223 return (_LDP_LOADED); 224 } 225 226 static const int32_t * 227 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 228 { 229 const collate_subst_t *p; 230 int n = table->info->subst_count[pass]; 231 232 if (n == 0) 233 return (NULL); 234 235 if (pass >= table->info->directive_count) 236 return (NULL); 237 238 if (!(key & COLLATE_SUBST_PRIORITY)) 239 return (NULL); 240 241 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 242 assert(p->key == key); 243 244 return (p->pri); 245 } 246 247 static collate_chain_t * 248 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 249 { 250 int low = 0; 251 int high = table->info->chain_count - 1; 252 int next, compar, l; 253 collate_chain_t *p; 254 collate_chain_t *tab = table->chain_pri_table; 255 256 if (high < 0) 257 return (NULL); 258 259 while (low <= high) { 260 next = (low + high) / 2; 261 p = tab + next; 262 compar = *key - *p->str; 263 if (compar == 0) { 264 l = wcsnlen(p->str, COLLATE_STR_LEN); 265 compar = wcsncmp(key, p->str, l); 266 if (compar == 0) { 267 *len = l; 268 return (p); 269 } 270 } 271 if (compar > 0) 272 low = next + 1; 273 else 274 high = next - 1; 275 } 276 return (NULL); 277 } 278 279 static collate_large_t * 280 largesearch(struct xlocale_collate *table, const wchar_t key) 281 { 282 int low = 0; 283 int high = table->info->large_count - 1; 284 int next, compar; 285 collate_large_t *p; 286 collate_large_t *tab = table->large_pri_table; 287 288 if (high < 0) 289 return (NULL); 290 291 while (low <= high) { 292 next = (low + high) / 2; 293 p = tab + next; 294 compar = key - p->val; 295 if (compar == 0) 296 return (p); 297 if (compar > 0) 298 low = next + 1; 299 else 300 high = next - 1; 301 } 302 return (NULL); 303 } 304 305 void 306 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 307 int *pri, int which, const int **state) 308 { 309 collate_chain_t *p2; 310 collate_large_t *match; 311 int p, l; 312 const int *sptr; 313 314 /* 315 * If this is the "last" pass for the UNDEFINED, then 316 * we just return the priority itself. 317 */ 318 if (which >= table->info->directive_count) { 319 *pri = *t; 320 *len = 1; 321 *state = NULL; 322 return; 323 } 324 325 /* 326 * If we have remaining substitution data from a previous 327 * call, consume it first. 328 */ 329 if ((sptr = *state) != NULL) { 330 *pri = *sptr; 331 sptr++; 332 if ((sptr == *state) || (sptr == NULL)) 333 *state = NULL; 334 else 335 *state = sptr; 336 *len = 0; 337 return; 338 } 339 340 /* No active substitutions */ 341 *len = 1; 342 343 /* 344 * Check for composites such as diphthongs that collate as a 345 * single element (aka chains or collating-elements). 346 */ 347 if (((p2 = chainsearch(table, t, &l)) != NULL) && 348 ((p = p2->pri[which]) >= 0)) { 349 350 *len = l; 351 *pri = p; 352 353 } else if (*t <= UCHAR_MAX) { 354 355 /* 356 * Character is a small (8-bit) character. 357 * We just look these up directly for speed. 358 */ 359 *pri = table->char_pri_table[*t].pri[which]; 360 361 } else if ((table->info->large_count > 0) && 362 ((match = largesearch(table, *t)) != NULL)) { 363 364 /* 365 * Character was found in the extended table. 366 */ 367 *pri = match->pri.pri[which]; 368 369 } else { 370 /* 371 * Character lacks a specific definition. 372 */ 373 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 374 /* Mask off sign bit to prevent ordering confusion. */ 375 *pri = (*t & COLLATE_MAX_PRIORITY); 376 } else { 377 *pri = table->info->undef_pri[which]; 378 } 379 /* No substitutions for undefined characters! */ 380 return; 381 } 382 383 /* 384 * Try substituting (expanding) the character. We are 385 * currently doing this *after* the chain compression. I 386 * think it should not matter, but this way might be slightly 387 * faster. 388 * 389 * We do this after the priority search, as this will help us 390 * to identify a single key value. In order for this to work, 391 * its important that the priority assigned to a given element 392 * to be substituted be unique for that level. The localedef 393 * code ensures this for us. 394 */ 395 if ((sptr = substsearch(table, *pri, which)) != NULL) { 396 if ((*pri = *sptr) > 0) { 397 sptr++; 398 *state = *sptr ? sptr : NULL; 399 } 400 } 401 402 } 403 404 /* 405 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 406 * NOT NULL terminate. That is left to the caller. 407 */ 408 size_t 409 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 410 size_t room) 411 { 412 int pri; 413 int len; 414 const wchar_t *t; 415 wchar_t *tr = NULL; 416 int direc; 417 int pass; 418 const int32_t *state; 419 size_t want = 0; 420 size_t need = 0; 421 int ndir = table->info->directive_count; 422 423 assert(src); 424 425 for (pass = 0; pass <= ndir; pass++) { 426 427 state = NULL; 428 429 if (pass != 0) { 430 /* insert level separator from the previous pass */ 431 if (room) { 432 *xf++ = 1; 433 room--; 434 } 435 want++; 436 } 437 438 /* special pass for undefined */ 439 if (pass == ndir) { 440 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 441 } else { 442 direc = table->info->directive[pass]; 443 } 444 445 t = src; 446 447 if (direc & DIRECTIVE_BACKWARD) { 448 wchar_t *bp, *fp, c; 449 free(tr); 450 if ((tr = wcsdup(t)) == NULL) { 451 errno = ENOMEM; 452 goto fail; 453 } 454 bp = tr; 455 fp = tr + wcslen(tr) - 1; 456 while (bp < fp) { 457 c = *bp; 458 *bp++ = *fp; 459 *fp-- = c; 460 } 461 t = (const wchar_t *)tr; 462 } 463 464 if (direc & DIRECTIVE_POSITION) { 465 while (*t || state) { 466 _collate_lookup(table, t, &len, &pri, pass, &state); 467 t += len; 468 if (pri <= 0) { 469 if (pri < 0) { 470 errno = EINVAL; 471 goto fail; 472 } 473 state = NULL; 474 pri = COLLATE_MAX_PRIORITY; 475 } 476 if (room) { 477 *xf++ = pri; 478 room--; 479 } 480 want++; 481 need = want; 482 } 483 } else { 484 while (*t || state) { 485 _collate_lookup(table, t, &len, &pri, pass, &state); 486 t += len; 487 if (pri <= 0) { 488 if (pri < 0) { 489 errno = EINVAL; 490 goto fail; 491 } 492 state = NULL; 493 continue; 494 } 495 if (room) { 496 *xf++ = pri; 497 room--; 498 } 499 want++; 500 need = want; 501 } 502 } 503 } 504 free(tr); 505 return (need); 506 507 fail: 508 free(tr); 509 return ((size_t)(-1)); 510 } 511 512 /* 513 * In the non-POSIX case, we transform each character into a string of 514 * characters representing the character's priority. Since char is usually 515 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 516 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 517 * bits per byte. 518 * 519 * It turns out that we sometimes have real priorities that are 520 * 31-bits wide. (But: be careful using priorities where the high 521 * order bit is set -- i.e. the priority is negative. The sort order 522 * may be surprising!) 523 * 524 * TODO: This would be a good area to optimize somewhat. It turns out 525 * that real prioririties *except for the last UNDEFINED pass* are generally 526 * very small. We need the localedef code to precalculate the max 527 * priority for us, and ideally also give us a mask, and then we could 528 * severely limit what we expand to. 529 */ 530 #define XFRM_BYTES 6 531 #define XFRM_OFFSET ('0') /* make all printable characters */ 532 #define XFRM_SHIFT 6 533 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 534 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 535 536 static int 537 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 538 { 539 /* we use unsigned to ensure zero fill on right shift */ 540 uint32_t val = (uint32_t)table->info->pri_count[pass]; 541 int nc = 0; 542 543 while (val) { 544 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 545 pri >>= XFRM_SHIFT; 546 val >>= XFRM_SHIFT; 547 p++; 548 nc++; 549 } 550 return (nc); 551 } 552 553 size_t 554 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 555 size_t room) 556 { 557 int pri; 558 int len; 559 const wchar_t *t; 560 wchar_t *tr = NULL; 561 int direc; 562 int pass; 563 const int32_t *state; 564 size_t want = 0; 565 size_t need = 0; 566 int b; 567 uint8_t buf[XFRM_BYTES]; 568 int ndir = table->info->directive_count; 569 570 assert(src); 571 572 for (pass = 0; pass <= ndir; pass++) { 573 574 state = NULL; 575 576 if (pass != 0) { 577 /* insert level separator from the previous pass */ 578 if (room) { 579 *xf++ = XFRM_SEP; 580 room--; 581 } 582 want++; 583 } 584 585 /* special pass for undefined */ 586 if (pass == ndir) { 587 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 588 } else { 589 direc = table->info->directive[pass]; 590 } 591 592 t = src; 593 594 if (direc & DIRECTIVE_BACKWARD) { 595 wchar_t *bp, *fp, c; 596 free(tr); 597 if ((tr = wcsdup(t)) == NULL) { 598 errno = ENOMEM; 599 goto fail; 600 } 601 bp = tr; 602 fp = tr + wcslen(tr) - 1; 603 while (bp < fp) { 604 c = *bp; 605 *bp++ = *fp; 606 *fp-- = c; 607 } 608 t = (const wchar_t *)tr; 609 } 610 611 if (direc & DIRECTIVE_POSITION) { 612 while (*t || state) { 613 614 _collate_lookup(table, t, &len, &pri, pass, &state); 615 t += len; 616 if (pri <= 0) { 617 if (pri < 0) { 618 errno = EINVAL; 619 goto fail; 620 } 621 state = NULL; 622 pri = COLLATE_MAX_PRIORITY; 623 } 624 625 b = xfrm(table, buf, pri, pass); 626 want += b; 627 if (room) { 628 while (b) { 629 b--; 630 if (room) { 631 *xf++ = buf[b]; 632 room--; 633 } 634 } 635 } 636 need = want; 637 } 638 } else { 639 while (*t || state) { 640 _collate_lookup(table, t, &len, &pri, pass, &state); 641 t += len; 642 if (pri <= 0) { 643 if (pri < 0) { 644 errno = EINVAL; 645 goto fail; 646 } 647 state = NULL; 648 continue; 649 } 650 651 b = xfrm(table, buf, pri, pass); 652 want += b; 653 if (room) { 654 655 while (b) { 656 b--; 657 if (room) { 658 *xf++ = buf[b]; 659 room--; 660 } 661 } 662 } 663 need = want; 664 } 665 } 666 } 667 free(tr); 668 return (need); 669 670 fail: 671 free(tr); 672 return ((size_t)(-1)); 673 } 674 675 /* 676 * __collate_equiv_value returns the primary collation value for the given 677 * collating symbol specified by str and len. Zero or negative is returned 678 * if the collating symbol was not found. This function is used by bracket 679 * code in the TRE regex library. 680 */ 681 int 682 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 683 { 684 int32_t e; 685 686 if (len < 1 || len >= COLLATE_STR_LEN) 687 return (-1); 688 689 FIX_LOCALE(locale); 690 struct xlocale_collate *table = 691 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 692 693 if (table->__collate_load_error) 694 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 695 696 if (len == 1) { 697 e = -1; 698 if (*str <= UCHAR_MAX) 699 e = table->char_pri_table[*str].pri[0]; 700 else if (table->info->large_count > 0) { 701 collate_large_t *match_large; 702 match_large = largesearch(table, *str); 703 if (match_large) 704 e = match_large->pri.pri[0]; 705 } 706 if (e == 0) 707 return (1); 708 return (e > 0 ? e : 0); 709 } 710 if (table->info->chain_count > 0) { 711 wchar_t name[COLLATE_STR_LEN]; 712 collate_chain_t *match_chain; 713 int clen; 714 715 wcsncpy (name, str, len); 716 name[len] = 0; 717 match_chain = chainsearch(table, name, &clen); 718 if (match_chain) { 719 e = match_chain->pri[0]; 720 if (e == 0) 721 return (1); 722 return (e < 0 ? -e : e); 723 } 724 } 725 return (0); 726 } 727