1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "collate.h" 59 #include "setlocale.h" 60 #include "ldpart.h" 61 #include "libc_private.h" 62 63 struct xlocale_collate __xlocale_global_collate = { 64 {{0}, "C"}, 1, 0, 0, 0 65 }; 66 67 struct xlocale_collate __xlocale_C_collate = { 68 {{0}, "C"}, 1, 0, 0, 0 69 }; 70 71 static int 72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 73 74 static void 75 destruct_collate(void *t) 76 { 77 struct xlocale_collate *table = t; 78 if (table->map && (table->maplen > 0)) { 79 (void) munmap(table->map, table->maplen); 80 } 81 free(t); 82 } 83 84 void * 85 __collate_load(const char *encoding, __unused locale_t unused) 86 { 87 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 88 return &__xlocale_C_collate; 89 } 90 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 91 table->header.header.destructor = destruct_collate; 92 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 93 // the caching outside of this section 94 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 95 xlocale_release(table); 96 return NULL; 97 } 98 return table; 99 } 100 101 /** 102 * Load the collation tables for the specified encoding into the global table. 103 */ 104 int 105 __collate_load_tables(const char *encoding) 106 { 107 108 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 109 } 110 111 int 112 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 113 { 114 int i, chains, z; 115 char *buf; 116 char *TMP; 117 char *map; 118 collate_info_t *info; 119 struct stat sbuf; 120 int fd; 121 122 table->__collate_load_error = 1; 123 124 /* 'encoding' must be already checked. */ 125 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 126 return (_LDP_CACHE); 127 } 128 129 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 130 return (_LDP_ERROR); 131 132 if ((fd = _open(buf, O_RDONLY)) < 0) { 133 free(buf); 134 return (_LDP_ERROR); 135 } 136 free(buf); 137 if (_fstat(fd, &sbuf) < 0) { 138 (void) _close(fd); 139 return (_LDP_ERROR); 140 } 141 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 142 (void) _close(fd); 143 errno = EINVAL; 144 return (_LDP_ERROR); 145 } 146 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 147 (void) _close(fd); 148 if ((TMP = map) == NULL) { 149 return (_LDP_ERROR); 150 } 151 152 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 153 (void) munmap(map, sbuf.st_size); 154 errno = EINVAL; 155 return (_LDP_ERROR); 156 } 157 TMP += COLLATE_STR_LEN; 158 159 info = (void *)TMP; 160 TMP += sizeof (*info); 161 162 if ((info->directive_count < 1) || 163 (info->directive_count >= COLL_WEIGHTS_MAX) || 164 ((chains = info->chain_count) < 0)) { 165 (void) munmap(map, sbuf.st_size); 166 errno = EINVAL; 167 return (_LDP_ERROR); 168 } 169 170 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 171 (sizeof (collate_chain_t) * chains) + 172 (sizeof (collate_large_t) * info->large_count); 173 for (z = 0; z < info->directive_count; z++) { 174 i += sizeof (collate_subst_t) * info->subst_count[z]; 175 } 176 if (i != (sbuf.st_size - (TMP - map))) { 177 (void) munmap(map, sbuf.st_size); 178 errno = EINVAL; 179 return (_LDP_ERROR); 180 } 181 182 table->info = info; 183 table->char_pri_table = (void *)TMP; 184 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 185 186 for (z = 0; z < info->directive_count; z++) { 187 if (info->subst_count[z] > 0) { 188 table->subst_table[z] = (void *)TMP; 189 TMP += info->subst_count[z] * sizeof (collate_subst_t); 190 } else { 191 table->subst_table[z] = NULL; 192 } 193 } 194 195 if (chains > 0) { 196 table->chain_pri_table = (void *)TMP; 197 TMP += chains * sizeof (collate_chain_t); 198 } else 199 table->chain_pri_table = NULL; 200 if (info->large_count > 0) 201 table->large_pri_table = (void *)TMP; 202 else 203 table->large_pri_table = NULL; 204 205 table->__collate_load_error = 0; 206 return (_LDP_LOADED); 207 } 208 209 static const int32_t * 210 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 211 { 212 const collate_subst_t *p; 213 int n = table->info->subst_count[pass]; 214 215 if (n == 0) 216 return (NULL); 217 218 if (pass >= table->info->directive_count) 219 return (NULL); 220 221 if (!(key & COLLATE_SUBST_PRIORITY)) 222 return (NULL); 223 224 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 225 assert(p->key == key); 226 227 return (p->pri); 228 } 229 230 static collate_chain_t * 231 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 232 { 233 int low = 0; 234 int high = table->info->chain_count - 1; 235 int next, compar, l; 236 collate_chain_t *p; 237 collate_chain_t *tab = table->chain_pri_table; 238 239 if (high < 0) 240 return (NULL); 241 242 while (low <= high) { 243 next = (low + high) / 2; 244 p = tab + next; 245 compar = *key - *p->str; 246 if (compar == 0) { 247 l = wcsnlen(p->str, COLLATE_STR_LEN); 248 compar = wcsncmp(key, p->str, l); 249 if (compar == 0) { 250 *len = l; 251 return (p); 252 } 253 } 254 if (compar > 0) 255 low = next + 1; 256 else 257 high = next - 1; 258 } 259 return (NULL); 260 } 261 262 static collate_large_t * 263 largesearch(struct xlocale_collate *table, const wchar_t key) 264 { 265 int low = 0; 266 int high = table->info->large_count - 1; 267 int next, compar; 268 collate_large_t *p; 269 collate_large_t *tab = table->large_pri_table; 270 271 if (high < 0) 272 return (NULL); 273 274 while (low <= high) { 275 next = (low + high) / 2; 276 p = tab + next; 277 compar = key - p->val; 278 if (compar == 0) 279 return (p); 280 if (compar > 0) 281 low = next + 1; 282 else 283 high = next - 1; 284 } 285 return (NULL); 286 } 287 288 void 289 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 290 int *pri, int which, const int **state) 291 { 292 collate_chain_t *p2; 293 collate_large_t *match; 294 int p, l; 295 const int *sptr; 296 297 /* 298 * If this is the "last" pass for the UNDEFINED, then 299 * we just return the priority itself. 300 */ 301 if (which >= table->info->directive_count) { 302 *pri = *t; 303 *len = 1; 304 *state = NULL; 305 return; 306 } 307 308 /* 309 * If we have remaining substitution data from a previous 310 * call, consume it first. 311 */ 312 if ((sptr = *state) != NULL) { 313 *pri = *sptr; 314 sptr++; 315 if ((sptr == *state) || (sptr == NULL)) 316 *state = NULL; 317 else 318 *state = sptr; 319 *len = 0; 320 return; 321 } 322 323 /* No active substitutions */ 324 *len = 1; 325 326 /* 327 * Check for composites such as diphthongs that collate as a 328 * single element (aka chains or collating-elements). 329 */ 330 if (((p2 = chainsearch(table, t, &l)) != NULL) && 331 ((p = p2->pri[which]) >= 0)) { 332 333 *len = l; 334 *pri = p; 335 336 } else if (*t <= UCHAR_MAX) { 337 338 /* 339 * Character is a small (8-bit) character. 340 * We just look these up directly for speed. 341 */ 342 *pri = table->char_pri_table[*t].pri[which]; 343 344 } else if ((table->info->large_count > 0) && 345 ((match = largesearch(table, *t)) != NULL)) { 346 347 /* 348 * Character was found in the extended table. 349 */ 350 *pri = match->pri.pri[which]; 351 352 } else { 353 /* 354 * Character lacks a specific definition. 355 */ 356 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 357 /* Mask off sign bit to prevent ordering confusion. */ 358 *pri = (*t & COLLATE_MAX_PRIORITY); 359 } else { 360 *pri = table->info->undef_pri[which]; 361 } 362 /* No substitutions for undefined characters! */ 363 return; 364 } 365 366 /* 367 * Try substituting (expanding) the character. We are 368 * currently doing this *after* the chain compression. I 369 * think it should not matter, but this way might be slightly 370 * faster. 371 * 372 * We do this after the priority search, as this will help us 373 * to identify a single key value. In order for this to work, 374 * its important that the priority assigned to a given element 375 * to be substituted be unique for that level. The localedef 376 * code ensures this for us. 377 */ 378 if ((sptr = substsearch(table, *pri, which)) != NULL) { 379 if ((*pri = *sptr) > 0) { 380 sptr++; 381 *state = *sptr ? sptr : NULL; 382 } 383 } 384 385 } 386 387 /* 388 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 389 * NOT NULL terminate. That is left to the caller. 390 */ 391 size_t 392 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 393 size_t room) 394 { 395 int pri; 396 int len; 397 const wchar_t *t; 398 wchar_t *tr = NULL; 399 int direc; 400 int pass; 401 const int32_t *state; 402 size_t want = 0; 403 size_t need = 0; 404 int ndir = table->info->directive_count; 405 406 assert(src); 407 408 for (pass = 0; pass <= ndir; pass++) { 409 410 state = NULL; 411 412 if (pass != 0) { 413 /* insert level separator from the previous pass */ 414 if (room) { 415 *xf++ = 1; 416 room--; 417 } 418 want++; 419 } 420 421 /* special pass for undefined */ 422 if (pass == ndir) { 423 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 424 } else { 425 direc = table->info->directive[pass]; 426 } 427 428 t = src; 429 430 if (direc & DIRECTIVE_BACKWARD) { 431 wchar_t *bp, *fp, c; 432 free(tr); 433 if ((tr = wcsdup(t)) == NULL) { 434 errno = ENOMEM; 435 goto fail; 436 } 437 bp = tr; 438 fp = tr + wcslen(tr) - 1; 439 while (bp < fp) { 440 c = *bp; 441 *bp++ = *fp; 442 *fp-- = c; 443 } 444 t = (const wchar_t *)tr; 445 } 446 447 if (direc & DIRECTIVE_POSITION) { 448 while (*t || state) { 449 _collate_lookup(table, t, &len, &pri, pass, &state); 450 t += len; 451 if (pri <= 0) { 452 if (pri < 0) { 453 errno = EINVAL; 454 goto fail; 455 } 456 state = NULL; 457 pri = COLLATE_MAX_PRIORITY; 458 } 459 if (room) { 460 *xf++ = pri; 461 room--; 462 } 463 want++; 464 need = want; 465 } 466 } else { 467 while (*t || state) { 468 _collate_lookup(table, t, &len, &pri, pass, &state); 469 t += len; 470 if (pri <= 0) { 471 if (pri < 0) { 472 errno = EINVAL; 473 goto fail; 474 } 475 state = NULL; 476 continue; 477 } 478 if (room) { 479 *xf++ = pri; 480 room--; 481 } 482 want++; 483 need = want; 484 } 485 } 486 } 487 free(tr); 488 return (need); 489 490 fail: 491 free(tr); 492 return ((size_t)(-1)); 493 } 494 495 /* 496 * In the non-POSIX case, we transform each character into a string of 497 * characters representing the character's priority. Since char is usually 498 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 499 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 500 * bits per byte. 501 * 502 * It turns out that we sometimes have real priorities that are 503 * 31-bits wide. (But: be careful using priorities where the high 504 * order bit is set -- i.e. the priority is negative. The sort order 505 * may be surprising!) 506 * 507 * TODO: This would be a good area to optimize somewhat. It turns out 508 * that real prioririties *except for the last UNDEFINED pass* are generally 509 * very small. We need the localedef code to precalculate the max 510 * priority for us, and ideally also give us a mask, and then we could 511 * severely limit what we expand to. 512 */ 513 #define XFRM_BYTES 6 514 #define XFRM_OFFSET ('0') /* make all printable characters */ 515 #define XFRM_SHIFT 6 516 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 517 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 518 519 static int 520 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 521 { 522 /* we use unsigned to ensure zero fill on right shift */ 523 uint32_t val = (uint32_t)table->info->pri_count[pass]; 524 int nc = 0; 525 526 while (val) { 527 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 528 pri >>= XFRM_SHIFT; 529 val >>= XFRM_SHIFT; 530 p++; 531 nc++; 532 } 533 return (nc); 534 } 535 536 size_t 537 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 538 size_t room) 539 { 540 int pri; 541 int len; 542 const wchar_t *t; 543 wchar_t *tr = NULL; 544 int direc; 545 int pass; 546 const int32_t *state; 547 size_t want = 0; 548 size_t need = 0; 549 int b; 550 uint8_t buf[XFRM_BYTES]; 551 int ndir = table->info->directive_count; 552 553 assert(src); 554 555 for (pass = 0; pass <= ndir; pass++) { 556 557 state = NULL; 558 559 if (pass != 0) { 560 /* insert level separator from the previous pass */ 561 if (room) { 562 *xf++ = XFRM_SEP; 563 room--; 564 } 565 want++; 566 } 567 568 /* special pass for undefined */ 569 if (pass == ndir) { 570 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 571 } else { 572 direc = table->info->directive[pass]; 573 } 574 575 t = src; 576 577 if (direc & DIRECTIVE_BACKWARD) { 578 wchar_t *bp, *fp, c; 579 free(tr); 580 if ((tr = wcsdup(t)) == NULL) { 581 errno = ENOMEM; 582 goto fail; 583 } 584 bp = tr; 585 fp = tr + wcslen(tr) - 1; 586 while (bp < fp) { 587 c = *bp; 588 *bp++ = *fp; 589 *fp-- = c; 590 } 591 t = (const wchar_t *)tr; 592 } 593 594 if (direc & DIRECTIVE_POSITION) { 595 while (*t || state) { 596 597 _collate_lookup(table, t, &len, &pri, pass, &state); 598 t += len; 599 if (pri <= 0) { 600 if (pri < 0) { 601 errno = EINVAL; 602 goto fail; 603 } 604 state = NULL; 605 pri = COLLATE_MAX_PRIORITY; 606 } 607 608 b = xfrm(table, buf, pri, pass); 609 want += b; 610 if (room) { 611 while (b) { 612 b--; 613 if (room) { 614 *xf++ = buf[b]; 615 room--; 616 } 617 } 618 } 619 need = want; 620 } 621 } else { 622 while (*t || state) { 623 _collate_lookup(table, t, &len, &pri, pass, &state); 624 t += len; 625 if (pri <= 0) { 626 if (pri < 0) { 627 errno = EINVAL; 628 goto fail; 629 } 630 state = NULL; 631 continue; 632 } 633 634 b = xfrm(table, buf, pri, pass); 635 want += b; 636 if (room) { 637 638 while (b) { 639 b--; 640 if (room) { 641 *xf++ = buf[b]; 642 room--; 643 } 644 } 645 } 646 need = want; 647 } 648 } 649 } 650 free(tr); 651 return (need); 652 653 fail: 654 free(tr); 655 return ((size_t)(-1)); 656 } 657 658 /* 659 * __collate_equiv_value returns the primary collation value for the given 660 * collating symbol specified by str and len. Zero or negative is returned 661 * if the collating symbol was not found. This function is used by bracket 662 * code in the TRE regex library. 663 */ 664 int 665 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 666 { 667 int32_t e; 668 669 if (len < 1 || len >= COLLATE_STR_LEN) 670 return (-1); 671 672 FIX_LOCALE(locale); 673 struct xlocale_collate *table = 674 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 675 676 if (table->__collate_load_error) 677 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 678 679 if (len == 1) { 680 e = -1; 681 if (*str <= UCHAR_MAX) 682 e = table->char_pri_table[*str].pri[0]; 683 else if (table->info->large_count > 0) { 684 collate_large_t *match_large; 685 match_large = largesearch(table, *str); 686 if (match_large) 687 e = match_large->pri.pri[0]; 688 } 689 if (e == 0) 690 return (1); 691 return (e > 0 ? e : 0); 692 } 693 if (table->info->chain_count > 0) { 694 wchar_t name[COLLATE_STR_LEN]; 695 collate_chain_t *match_chain; 696 int clen; 697 698 wcsncpy (name, str, len); 699 name[len] = 0; 700 match_chain = chainsearch(table, name, &clen); 701 if (match_chain) { 702 e = match_chain->pri[0]; 703 if (e == 0) 704 return (1); 705 return (e < 0 ? -e : e); 706 } 707 } 708 return (0); 709 } 710