1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "endian.h" 59 #include "collate.h" 60 #include "setlocale.h" 61 #include "ldpart.h" 62 #include "libc_private.h" 63 64 struct xlocale_collate __xlocale_global_collate = { 65 {{0}, "C"}, 1, 0, 0, 0 66 }; 67 68 struct xlocale_collate __xlocale_C_collate = { 69 {{0}, "C"}, 1, 0, 0, 0 70 }; 71 72 static int 73 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 74 75 static void 76 destruct_collate(void *t) 77 { 78 struct xlocale_collate *table = t; 79 if (table->map && (table->maplen > 0)) { 80 (void) munmap(table->map, table->maplen); 81 } 82 free(t); 83 } 84 85 void * 86 __collate_load(const char *encoding, __unused locale_t unused) 87 { 88 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 89 return &__xlocale_C_collate; 90 } 91 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 92 table->header.header.destructor = destruct_collate; 93 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 94 // the caching outside of this section 95 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 96 xlocale_release(table); 97 return NULL; 98 } 99 return table; 100 } 101 102 /** 103 * Load the collation tables for the specified encoding into the global table. 104 */ 105 int 106 __collate_load_tables(const char *encoding) 107 { 108 109 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 110 } 111 112 int 113 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 114 { 115 int i, chains, z; 116 char *buf; 117 char *TMP; 118 char *map; 119 collate_info_t *info; 120 struct stat sbuf; 121 int fd; 122 123 table->__collate_load_error = 1; 124 125 /* 'encoding' must be already checked. */ 126 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 127 return (_LDP_CACHE); 128 } 129 130 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 131 return (_LDP_ERROR); 132 133 if ((fd = _open(buf, O_RDONLY)) < 0) { 134 free(buf); 135 return (_LDP_ERROR); 136 } 137 free(buf); 138 if (_fstat(fd, &sbuf) < 0) { 139 (void) _close(fd); 140 return (_LDP_ERROR); 141 } 142 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 143 (void) _close(fd); 144 errno = EINVAL; 145 return (_LDP_ERROR); 146 } 147 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 148 (void) _close(fd); 149 if ((TMP = map) == NULL) { 150 return (_LDP_ERROR); 151 } 152 153 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 154 (void) munmap(map, sbuf.st_size); 155 errno = EINVAL; 156 return (_LDP_ERROR); 157 } 158 TMP += COLLATE_STR_LEN; 159 160 info = (void *)TMP; 161 TMP += sizeof (*info); 162 163 if ((info->directive_count < 1) || 164 (info->directive_count >= COLL_WEIGHTS_MAX) || 165 ((chains = BSWAP(info->chain_count)) < 0)) { 166 (void) munmap(map, sbuf.st_size); 167 errno = EINVAL; 168 return (_LDP_ERROR); 169 } 170 171 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 172 (sizeof (collate_chain_t) * chains) + 173 (sizeof (collate_large_t) * BSWAP(info->large_count)); 174 for (z = 0; z < info->directive_count; z++) { 175 i += sizeof (collate_subst_t) * BSWAP(info->subst_count[z]); 176 } 177 if (i != (sbuf.st_size - (TMP - map))) { 178 (void) munmap(map, sbuf.st_size); 179 errno = EINVAL; 180 return (_LDP_ERROR); 181 } 182 183 table->info = info; 184 table->char_pri_table = (void *)TMP; 185 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 186 187 for (z = 0; z < info->directive_count; z++) { 188 if (BSWAP(info->subst_count[z]) > 0) { 189 table->subst_table[z] = (void *)TMP; 190 TMP += BSWAP(info->subst_count[z]) * sizeof (collate_subst_t); 191 } else { 192 table->subst_table[z] = NULL; 193 } 194 } 195 196 if (chains > 0) { 197 table->chain_pri_table = (void *)TMP; 198 TMP += chains * sizeof (collate_chain_t); 199 } else 200 table->chain_pri_table = NULL; 201 if (BSWAP(info->large_count) > 0) 202 table->large_pri_table = (void *)TMP; 203 else 204 table->large_pri_table = NULL; 205 206 table->__collate_load_error = 0; 207 return (_LDP_LOADED); 208 } 209 210 static const int32_t * 211 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 212 { 213 const collate_subst_t *p; 214 int n = BSWAP(table->info->subst_count[pass]); 215 216 if (n == 0) 217 return (NULL); 218 219 if (pass >= table->info->directive_count) 220 return (NULL); 221 222 if (!(key & COLLATE_SUBST_PRIORITY)) 223 return (NULL); 224 225 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 226 assert(BSWAP(p->key) == key); 227 228 return (p->pri); 229 } 230 231 static collate_chain_t * 232 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 233 { 234 int low = 0; 235 int high = BSWAP(table->info->chain_count) - 1; 236 int next, compar, l; 237 collate_chain_t *p; 238 collate_chain_t *tab = table->chain_pri_table; 239 240 if (high < 0) 241 return (NULL); 242 243 while (low <= high) { 244 next = (low + high) / 2; 245 p = tab + next; 246 compar = *key - le16toh(*p->str); 247 if (compar == 0) { 248 l = wcsnlen(p->str, COLLATE_STR_LEN); 249 compar = wcsncmp(key, p->str, l); 250 if (compar == 0) { 251 *len = l; 252 return (p); 253 } 254 } 255 if (compar > 0) 256 low = next + 1; 257 else 258 high = next - 1; 259 } 260 return (NULL); 261 } 262 263 static collate_large_t * 264 largesearch(struct xlocale_collate *table, const wchar_t key) 265 { 266 int low = 0; 267 int high = BSWAP(table->info->large_count) - 1; 268 int next, compar; 269 collate_large_t *p; 270 collate_large_t *tab = table->large_pri_table; 271 272 if (high < 0) 273 return (NULL); 274 275 while (low <= high) { 276 next = (low + high) / 2; 277 p = tab + next; 278 compar = key - BSWAP(p->val); 279 if (compar == 0) 280 return (p); 281 if (compar > 0) 282 low = next + 1; 283 else 284 high = next - 1; 285 } 286 return (NULL); 287 } 288 289 void 290 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 291 int *pri, int which, const int **state) 292 { 293 collate_chain_t *p2; 294 collate_large_t *match; 295 int p, l; 296 const int *sptr; 297 298 /* 299 * If this is the "last" pass for the UNDEFINED, then 300 * we just return the priority itself. 301 */ 302 if (which >= table->info->directive_count) { 303 *pri = *t; 304 *len = 1; 305 *state = NULL; 306 return; 307 } 308 309 /* 310 * If we have remaining substitution data from a previous 311 * call, consume it first. 312 */ 313 if ((sptr = *state) != NULL) { 314 *pri = *sptr; 315 sptr++; 316 if ((sptr == *state) || (sptr == NULL)) 317 *state = NULL; 318 else 319 *state = sptr; 320 *len = 0; 321 return; 322 } 323 324 /* No active substitutions */ 325 *len = 1; 326 327 /* 328 * Check for composites such as diphthongs that collate as a 329 * single element (aka chains or collating-elements). 330 */ 331 if (((p2 = chainsearch(table, t, &l)) != NULL) && 332 ((p = p2->pri[which]) >= 0)) { 333 334 *len = l; 335 *pri = p; 336 337 } else if (*t <= UCHAR_MAX) { 338 339 /* 340 * Character is a small (8-bit) character. 341 * We just look these up directly for speed. 342 */ 343 *pri = BSWAP(table->char_pri_table[*t].pri[which]); 344 345 } else if ((BSWAP(table->info->large_count) > 0) && 346 ((match = largesearch(table, *t)) != NULL)) { 347 348 /* 349 * Character was found in the extended table. 350 */ 351 *pri = BSWAP(match->pri.pri[which]); 352 353 } else { 354 /* 355 * Character lacks a specific definition. 356 */ 357 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 358 /* Mask off sign bit to prevent ordering confusion. */ 359 *pri = (*t & COLLATE_MAX_PRIORITY); 360 } else { 361 *pri = BSWAP(table->info->undef_pri[which]); 362 } 363 /* No substitutions for undefined characters! */ 364 return; 365 } 366 367 /* 368 * Try substituting (expanding) the character. We are 369 * currently doing this *after* the chain compression. I 370 * think it should not matter, but this way might be slightly 371 * faster. 372 * 373 * We do this after the priority search, as this will help us 374 * to identify a single key value. In order for this to work, 375 * its important that the priority assigned to a given element 376 * to be substituted be unique for that level. The localedef 377 * code ensures this for us. 378 */ 379 if ((sptr = substsearch(table, *pri, which)) != NULL) { 380 if ((*pri = BSWAP(*sptr)) > 0) { 381 sptr++; 382 *state = BSWAP(*sptr) ? sptr : NULL; 383 } 384 } 385 386 } 387 388 /* 389 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 390 * NOT NULL terminate. That is left to the caller. 391 */ 392 size_t 393 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 394 size_t room) 395 { 396 int pri; 397 int len; 398 const wchar_t *t; 399 wchar_t *tr = NULL; 400 int direc; 401 int pass; 402 const int32_t *state; 403 size_t want = 0; 404 size_t need = 0; 405 int ndir = table->info->directive_count; 406 407 assert(src); 408 409 for (pass = 0; pass <= ndir; pass++) { 410 411 state = NULL; 412 413 if (pass != 0) { 414 /* insert level separator from the previous pass */ 415 if (room) { 416 *xf++ = 1; 417 room--; 418 } 419 want++; 420 } 421 422 /* special pass for undefined */ 423 if (pass == ndir) { 424 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 425 } else { 426 direc = table->info->directive[pass]; 427 } 428 429 t = src; 430 431 if (direc & DIRECTIVE_BACKWARD) { 432 wchar_t *bp, *fp, c; 433 free(tr); 434 if ((tr = wcsdup(t)) == NULL) { 435 errno = ENOMEM; 436 goto fail; 437 } 438 bp = tr; 439 fp = tr + wcslen(tr) - 1; 440 while (bp < fp) { 441 c = *bp; 442 *bp++ = *fp; 443 *fp-- = c; 444 } 445 t = (const wchar_t *)tr; 446 } 447 448 if (direc & DIRECTIVE_POSITION) { 449 while (*t || state) { 450 _collate_lookup(table, t, &len, &pri, pass, &state); 451 t += len; 452 if (pri <= 0) { 453 if (pri < 0) { 454 errno = EINVAL; 455 goto fail; 456 } 457 state = NULL; 458 pri = COLLATE_MAX_PRIORITY; 459 } 460 if (room) { 461 *xf++ = pri; 462 room--; 463 } 464 want++; 465 need = want; 466 } 467 } else { 468 while (*t || state) { 469 _collate_lookup(table, t, &len, &pri, pass, &state); 470 t += len; 471 if (pri <= 0) { 472 if (pri < 0) { 473 errno = EINVAL; 474 goto fail; 475 } 476 state = NULL; 477 continue; 478 } 479 if (room) { 480 *xf++ = pri; 481 room--; 482 } 483 want++; 484 need = want; 485 } 486 } 487 } 488 free(tr); 489 return (need); 490 491 fail: 492 free(tr); 493 return ((size_t)(-1)); 494 } 495 496 /* 497 * In the non-POSIX case, we transform each character into a string of 498 * characters representing the character's priority. Since char is usually 499 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 500 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 501 * bits per byte. 502 * 503 * It turns out that we sometimes have real priorities that are 504 * 31-bits wide. (But: be careful using priorities where the high 505 * order bit is set -- i.e. the priority is negative. The sort order 506 * may be surprising!) 507 * 508 * TODO: This would be a good area to optimize somewhat. It turns out 509 * that real prioririties *except for the last UNDEFINED pass* are generally 510 * very small. We need the localedef code to precalculate the max 511 * priority for us, and ideally also give us a mask, and then we could 512 * severely limit what we expand to. 513 */ 514 #define XFRM_BYTES 6 515 #define XFRM_OFFSET ('0') /* make all printable characters */ 516 #define XFRM_SHIFT 6 517 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 518 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 519 520 static int 521 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 522 { 523 /* we use unsigned to ensure zero fill on right shift */ 524 uint32_t val = BSWAP((uint32_t)table->info->pri_count[pass]); 525 int nc = 0; 526 527 while (val) { 528 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 529 pri >>= XFRM_SHIFT; 530 val >>= XFRM_SHIFT; 531 p++; 532 nc++; 533 } 534 return (nc); 535 } 536 537 size_t 538 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 539 size_t room) 540 { 541 int pri; 542 int len; 543 const wchar_t *t; 544 wchar_t *tr = NULL; 545 int direc; 546 int pass; 547 const int32_t *state; 548 size_t want = 0; 549 size_t need = 0; 550 int b; 551 uint8_t buf[XFRM_BYTES]; 552 int ndir = table->info->directive_count; 553 554 assert(src); 555 556 for (pass = 0; pass <= ndir; pass++) { 557 558 state = NULL; 559 560 if (pass != 0) { 561 /* insert level separator from the previous pass */ 562 if (room) { 563 *xf++ = XFRM_SEP; 564 room--; 565 } 566 want++; 567 } 568 569 /* special pass for undefined */ 570 if (pass == ndir) { 571 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 572 } else { 573 direc = table->info->directive[pass]; 574 } 575 576 t = src; 577 578 if (direc & DIRECTIVE_BACKWARD) { 579 wchar_t *bp, *fp, c; 580 free(tr); 581 if ((tr = wcsdup(t)) == NULL) { 582 errno = ENOMEM; 583 goto fail; 584 } 585 bp = tr; 586 fp = tr + wcslen(tr) - 1; 587 while (bp < fp) { 588 c = *bp; 589 *bp++ = *fp; 590 *fp-- = c; 591 } 592 t = (const wchar_t *)tr; 593 } 594 595 if (direc & DIRECTIVE_POSITION) { 596 while (*t || state) { 597 598 _collate_lookup(table, t, &len, &pri, pass, &state); 599 t += len; 600 if (pri <= 0) { 601 if (pri < 0) { 602 errno = EINVAL; 603 goto fail; 604 } 605 state = NULL; 606 pri = COLLATE_MAX_PRIORITY; 607 } 608 609 b = xfrm(table, buf, pri, pass); 610 want += b; 611 if (room) { 612 while (b) { 613 b--; 614 if (room) { 615 *xf++ = buf[b]; 616 room--; 617 } 618 } 619 } 620 need = want; 621 } 622 } else { 623 while (*t || state) { 624 _collate_lookup(table, t, &len, &pri, pass, &state); 625 t += len; 626 if (pri <= 0) { 627 if (pri < 0) { 628 errno = EINVAL; 629 goto fail; 630 } 631 state = NULL; 632 continue; 633 } 634 635 b = xfrm(table, buf, pri, pass); 636 want += b; 637 if (room) { 638 639 while (b) { 640 b--; 641 if (room) { 642 *xf++ = buf[b]; 643 room--; 644 } 645 } 646 } 647 need = want; 648 } 649 } 650 } 651 free(tr); 652 return (need); 653 654 fail: 655 free(tr); 656 return ((size_t)(-1)); 657 } 658 659 /* 660 * __collate_equiv_value returns the primary collation value for the given 661 * collating symbol specified by str and len. Zero or negative is returned 662 * if the collating symbol was not found. This function is used by bracket 663 * code in the TRE regex library. 664 */ 665 int 666 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 667 { 668 int32_t e; 669 670 if (len < 1 || len >= COLLATE_STR_LEN) 671 return (-1); 672 673 FIX_LOCALE(locale); 674 struct xlocale_collate *table = 675 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 676 677 if (table->__collate_load_error) 678 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 679 680 if (len == 1) { 681 e = -1; 682 if (*str <= UCHAR_MAX) 683 e = table->char_pri_table[*str].pri[0]; 684 else if (BSWAP(table->info->large_count) > 0) { 685 collate_large_t *match_large; 686 match_large = largesearch(table, *str); 687 if (match_large) 688 e = match_large->pri.pri[0]; 689 } 690 if (e == 0) 691 return (1); 692 return (e > 0 ? e : 0); 693 } 694 if (BSWAP(table->info->chain_count) > 0) { 695 wchar_t name[COLLATE_STR_LEN]; 696 collate_chain_t *match_chain; 697 int clen; 698 699 wcsncpy (name, str, len); 700 name[len] = 0; 701 match_chain = chainsearch(table, name, &clen); 702 if (match_chain) { 703 e = match_chain->pri[0]; 704 if (e == 0) 705 return (1); 706 return (e < 0 ? -e : e); 707 } 708 } 709 return (0); 710 } 711