1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "collate.h" 59 #include "setlocale.h" 60 #include "ldpart.h" 61 #include "libc_private.h" 62 63 struct xlocale_collate __xlocale_global_collate = { 64 {{0}, "C"}, 1, 0, 0, 0 65 }; 66 67 struct xlocale_collate __xlocale_C_collate = { 68 {{0}, "C"}, 1, 0, 0, 0 69 }; 70 71 static int 72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 73 74 static void 75 destruct_collate(void *t) 76 { 77 struct xlocale_collate *table = t; 78 if (table->map && (table->maplen > 0)) { 79 (void) munmap(table->map, table->maplen); 80 } 81 free(t); 82 } 83 84 void * 85 __collate_load(const char *encoding, __unused locale_t unused) 86 { 87 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 88 strncmp(encoding, "C.", 2) == 0) { 89 return &__xlocale_C_collate; 90 } 91 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 92 table->header.header.destructor = destruct_collate; 93 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 94 // the caching outside of this section 95 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 96 xlocale_release(table); 97 return NULL; 98 } 99 return table; 100 } 101 102 /** 103 * Load the collation tables for the specified encoding into the global table. 104 */ 105 int 106 __collate_load_tables(const char *encoding) 107 { 108 109 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 110 } 111 112 int 113 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 114 { 115 int i, chains, z; 116 char *buf; 117 char *TMP; 118 char *map; 119 collate_info_t *info; 120 struct stat sbuf; 121 int fd; 122 123 table->__collate_load_error = 1; 124 125 /* 'encoding' must be already checked. */ 126 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 127 strncmp(encoding, "C.", 2) == 0) { 128 return (_LDP_CACHE); 129 } 130 131 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 132 return (_LDP_ERROR); 133 134 if ((fd = _open(buf, O_RDONLY)) < 0) { 135 free(buf); 136 return (_LDP_ERROR); 137 } 138 free(buf); 139 if (_fstat(fd, &sbuf) < 0) { 140 (void) _close(fd); 141 return (_LDP_ERROR); 142 } 143 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 144 (void) _close(fd); 145 errno = EINVAL; 146 return (_LDP_ERROR); 147 } 148 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 149 (void) _close(fd); 150 if ((TMP = map) == NULL) { 151 return (_LDP_ERROR); 152 } 153 154 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 155 (void) munmap(map, sbuf.st_size); 156 errno = EINVAL; 157 return (_LDP_ERROR); 158 } 159 TMP += COLLATE_STR_LEN; 160 161 info = (void *)TMP; 162 TMP += sizeof (*info); 163 164 if ((info->directive_count < 1) || 165 (info->directive_count >= COLL_WEIGHTS_MAX) || 166 ((chains = info->chain_count) < 0)) { 167 (void) munmap(map, sbuf.st_size); 168 errno = EINVAL; 169 return (_LDP_ERROR); 170 } 171 172 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 173 (sizeof (collate_chain_t) * chains) + 174 (sizeof (collate_large_t) * info->large_count); 175 for (z = 0; z < info->directive_count; z++) { 176 i += sizeof (collate_subst_t) * info->subst_count[z]; 177 } 178 if (i != (sbuf.st_size - (TMP - map))) { 179 (void) munmap(map, sbuf.st_size); 180 errno = EINVAL; 181 return (_LDP_ERROR); 182 } 183 184 table->info = info; 185 table->char_pri_table = (void *)TMP; 186 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 187 188 for (z = 0; z < info->directive_count; z++) { 189 if (info->subst_count[z] > 0) { 190 table->subst_table[z] = (void *)TMP; 191 TMP += info->subst_count[z] * sizeof (collate_subst_t); 192 } else { 193 table->subst_table[z] = NULL; 194 } 195 } 196 197 if (chains > 0) { 198 table->chain_pri_table = (void *)TMP; 199 TMP += chains * sizeof (collate_chain_t); 200 } else 201 table->chain_pri_table = NULL; 202 if (info->large_count > 0) 203 table->large_pri_table = (void *)TMP; 204 else 205 table->large_pri_table = NULL; 206 207 table->__collate_load_error = 0; 208 return (_LDP_LOADED); 209 } 210 211 static const int32_t * 212 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 213 { 214 const collate_subst_t *p; 215 int n = table->info->subst_count[pass]; 216 217 if (n == 0) 218 return (NULL); 219 220 if (pass >= table->info->directive_count) 221 return (NULL); 222 223 if (!(key & COLLATE_SUBST_PRIORITY)) 224 return (NULL); 225 226 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 227 assert(p->key == key); 228 229 return (p->pri); 230 } 231 232 static collate_chain_t * 233 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 234 { 235 int low = 0; 236 int high = table->info->chain_count - 1; 237 int next, compar, l; 238 collate_chain_t *p; 239 collate_chain_t *tab = table->chain_pri_table; 240 241 if (high < 0) 242 return (NULL); 243 244 while (low <= high) { 245 next = (low + high) / 2; 246 p = tab + next; 247 compar = *key - *p->str; 248 if (compar == 0) { 249 l = wcsnlen(p->str, COLLATE_STR_LEN); 250 compar = wcsncmp(key, p->str, l); 251 if (compar == 0) { 252 *len = l; 253 return (p); 254 } 255 } 256 if (compar > 0) 257 low = next + 1; 258 else 259 high = next - 1; 260 } 261 return (NULL); 262 } 263 264 static collate_large_t * 265 largesearch(struct xlocale_collate *table, const wchar_t key) 266 { 267 int low = 0; 268 int high = table->info->large_count - 1; 269 int next, compar; 270 collate_large_t *p; 271 collate_large_t *tab = table->large_pri_table; 272 273 if (high < 0) 274 return (NULL); 275 276 while (low <= high) { 277 next = (low + high) / 2; 278 p = tab + next; 279 compar = key - p->val; 280 if (compar == 0) 281 return (p); 282 if (compar > 0) 283 low = next + 1; 284 else 285 high = next - 1; 286 } 287 return (NULL); 288 } 289 290 void 291 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 292 int *pri, int which, const int **state) 293 { 294 collate_chain_t *p2; 295 collate_large_t *match; 296 int p, l; 297 const int *sptr; 298 299 /* 300 * If this is the "last" pass for the UNDEFINED, then 301 * we just return the priority itself. 302 */ 303 if (which >= table->info->directive_count) { 304 *pri = *t; 305 *len = 1; 306 *state = NULL; 307 return; 308 } 309 310 /* 311 * If we have remaining substitution data from a previous 312 * call, consume it first. 313 */ 314 if ((sptr = *state) != NULL) { 315 *pri = *sptr; 316 sptr++; 317 if ((sptr == *state) || (sptr == NULL)) 318 *state = NULL; 319 else 320 *state = sptr; 321 *len = 0; 322 return; 323 } 324 325 /* No active substitutions */ 326 *len = 1; 327 328 /* 329 * Check for composites such as diphthongs that collate as a 330 * single element (aka chains or collating-elements). 331 */ 332 if (((p2 = chainsearch(table, t, &l)) != NULL) && 333 ((p = p2->pri[which]) >= 0)) { 334 335 *len = l; 336 *pri = p; 337 338 } else if (*t <= UCHAR_MAX) { 339 340 /* 341 * Character is a small (8-bit) character. 342 * We just look these up directly for speed. 343 */ 344 *pri = table->char_pri_table[*t].pri[which]; 345 346 } else if ((table->info->large_count > 0) && 347 ((match = largesearch(table, *t)) != NULL)) { 348 349 /* 350 * Character was found in the extended table. 351 */ 352 *pri = match->pri.pri[which]; 353 354 } else { 355 /* 356 * Character lacks a specific definition. 357 */ 358 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 359 /* Mask off sign bit to prevent ordering confusion. */ 360 *pri = (*t & COLLATE_MAX_PRIORITY); 361 } else { 362 *pri = table->info->undef_pri[which]; 363 } 364 /* No substitutions for undefined characters! */ 365 return; 366 } 367 368 /* 369 * Try substituting (expanding) the character. We are 370 * currently doing this *after* the chain compression. I 371 * think it should not matter, but this way might be slightly 372 * faster. 373 * 374 * We do this after the priority search, as this will help us 375 * to identify a single key value. In order for this to work, 376 * its important that the priority assigned to a given element 377 * to be substituted be unique for that level. The localedef 378 * code ensures this for us. 379 */ 380 if ((sptr = substsearch(table, *pri, which)) != NULL) { 381 if ((*pri = *sptr) > 0) { 382 sptr++; 383 *state = *sptr ? sptr : NULL; 384 } 385 } 386 387 } 388 389 /* 390 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 391 * NOT NULL terminate. That is left to the caller. 392 */ 393 size_t 394 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 395 size_t room) 396 { 397 int pri; 398 int len; 399 const wchar_t *t; 400 wchar_t *tr = NULL; 401 int direc; 402 int pass; 403 const int32_t *state; 404 size_t want = 0; 405 size_t need = 0; 406 int ndir = table->info->directive_count; 407 408 assert(src); 409 410 for (pass = 0; pass <= ndir; pass++) { 411 412 state = NULL; 413 414 if (pass != 0) { 415 /* insert level separator from the previous pass */ 416 if (room) { 417 *xf++ = 1; 418 room--; 419 } 420 want++; 421 } 422 423 /* special pass for undefined */ 424 if (pass == ndir) { 425 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 426 } else { 427 direc = table->info->directive[pass]; 428 } 429 430 t = src; 431 432 if (direc & DIRECTIVE_BACKWARD) { 433 wchar_t *bp, *fp, c; 434 free(tr); 435 if ((tr = wcsdup(t)) == NULL) { 436 errno = ENOMEM; 437 goto fail; 438 } 439 bp = tr; 440 fp = tr + wcslen(tr) - 1; 441 while (bp < fp) { 442 c = *bp; 443 *bp++ = *fp; 444 *fp-- = c; 445 } 446 t = (const wchar_t *)tr; 447 } 448 449 if (direc & DIRECTIVE_POSITION) { 450 while (*t || state) { 451 _collate_lookup(table, t, &len, &pri, pass, &state); 452 t += len; 453 if (pri <= 0) { 454 if (pri < 0) { 455 errno = EINVAL; 456 goto fail; 457 } 458 state = NULL; 459 pri = COLLATE_MAX_PRIORITY; 460 } 461 if (room) { 462 *xf++ = pri; 463 room--; 464 } 465 want++; 466 need = want; 467 } 468 } else { 469 while (*t || state) { 470 _collate_lookup(table, t, &len, &pri, pass, &state); 471 t += len; 472 if (pri <= 0) { 473 if (pri < 0) { 474 errno = EINVAL; 475 goto fail; 476 } 477 state = NULL; 478 continue; 479 } 480 if (room) { 481 *xf++ = pri; 482 room--; 483 } 484 want++; 485 need = want; 486 } 487 } 488 } 489 free(tr); 490 return (need); 491 492 fail: 493 free(tr); 494 return ((size_t)(-1)); 495 } 496 497 /* 498 * In the non-POSIX case, we transform each character into a string of 499 * characters representing the character's priority. Since char is usually 500 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 501 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 502 * bits per byte. 503 * 504 * It turns out that we sometimes have real priorities that are 505 * 31-bits wide. (But: be careful using priorities where the high 506 * order bit is set -- i.e. the priority is negative. The sort order 507 * may be surprising!) 508 * 509 * TODO: This would be a good area to optimize somewhat. It turns out 510 * that real prioririties *except for the last UNDEFINED pass* are generally 511 * very small. We need the localedef code to precalculate the max 512 * priority for us, and ideally also give us a mask, and then we could 513 * severely limit what we expand to. 514 */ 515 #define XFRM_BYTES 6 516 #define XFRM_OFFSET ('0') /* make all printable characters */ 517 #define XFRM_SHIFT 6 518 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 519 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 520 521 static int 522 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 523 { 524 /* we use unsigned to ensure zero fill on right shift */ 525 uint32_t val = (uint32_t)table->info->pri_count[pass]; 526 int nc = 0; 527 528 while (val) { 529 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 530 pri >>= XFRM_SHIFT; 531 val >>= XFRM_SHIFT; 532 p++; 533 nc++; 534 } 535 return (nc); 536 } 537 538 size_t 539 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 540 size_t room) 541 { 542 int pri; 543 int len; 544 const wchar_t *t; 545 wchar_t *tr = NULL; 546 int direc; 547 int pass; 548 const int32_t *state; 549 size_t want = 0; 550 size_t need = 0; 551 int b; 552 uint8_t buf[XFRM_BYTES]; 553 int ndir = table->info->directive_count; 554 555 assert(src); 556 557 for (pass = 0; pass <= ndir; pass++) { 558 559 state = NULL; 560 561 if (pass != 0) { 562 /* insert level separator from the previous pass */ 563 if (room) { 564 *xf++ = XFRM_SEP; 565 room--; 566 } 567 want++; 568 } 569 570 /* special pass for undefined */ 571 if (pass == ndir) { 572 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 573 } else { 574 direc = table->info->directive[pass]; 575 } 576 577 t = src; 578 579 if (direc & DIRECTIVE_BACKWARD) { 580 wchar_t *bp, *fp, c; 581 free(tr); 582 if ((tr = wcsdup(t)) == NULL) { 583 errno = ENOMEM; 584 goto fail; 585 } 586 bp = tr; 587 fp = tr + wcslen(tr) - 1; 588 while (bp < fp) { 589 c = *bp; 590 *bp++ = *fp; 591 *fp-- = c; 592 } 593 t = (const wchar_t *)tr; 594 } 595 596 if (direc & DIRECTIVE_POSITION) { 597 while (*t || state) { 598 599 _collate_lookup(table, t, &len, &pri, pass, &state); 600 t += len; 601 if (pri <= 0) { 602 if (pri < 0) { 603 errno = EINVAL; 604 goto fail; 605 } 606 state = NULL; 607 pri = COLLATE_MAX_PRIORITY; 608 } 609 610 b = xfrm(table, buf, pri, pass); 611 want += b; 612 if (room) { 613 while (b) { 614 b--; 615 if (room) { 616 *xf++ = buf[b]; 617 room--; 618 } 619 } 620 } 621 need = want; 622 } 623 } else { 624 while (*t || state) { 625 _collate_lookup(table, t, &len, &pri, pass, &state); 626 t += len; 627 if (pri <= 0) { 628 if (pri < 0) { 629 errno = EINVAL; 630 goto fail; 631 } 632 state = NULL; 633 continue; 634 } 635 636 b = xfrm(table, buf, pri, pass); 637 want += b; 638 if (room) { 639 640 while (b) { 641 b--; 642 if (room) { 643 *xf++ = buf[b]; 644 room--; 645 } 646 } 647 } 648 need = want; 649 } 650 } 651 } 652 free(tr); 653 return (need); 654 655 fail: 656 free(tr); 657 return ((size_t)(-1)); 658 } 659 660 /* 661 * __collate_equiv_value returns the primary collation value for the given 662 * collating symbol specified by str and len. Zero or negative is returned 663 * if the collating symbol was not found. This function is used by bracket 664 * code in the TRE regex library. 665 */ 666 int 667 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 668 { 669 int32_t e; 670 671 if (len < 1 || len >= COLLATE_STR_LEN) 672 return (-1); 673 674 FIX_LOCALE(locale); 675 struct xlocale_collate *table = 676 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 677 678 if (table->__collate_load_error) 679 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 680 681 if (len == 1) { 682 e = -1; 683 if (*str <= UCHAR_MAX) 684 e = table->char_pri_table[*str].pri[0]; 685 else if (table->info->large_count > 0) { 686 collate_large_t *match_large; 687 match_large = largesearch(table, *str); 688 if (match_large) 689 e = match_large->pri.pri[0]; 690 } 691 if (e == 0) 692 return (1); 693 return (e > 0 ? e : 0); 694 } 695 if (table->info->chain_count > 0) { 696 wchar_t name[COLLATE_STR_LEN]; 697 collate_chain_t *match_chain; 698 int clen; 699 700 wcsncpy (name, str, len); 701 name[len] = 0; 702 match_chain = chainsearch(table, name, &clen); 703 if (match_chain) { 704 e = match_chain->pri[0]; 705 if (e == 0) 706 return (1); 707 return (e < 0 ? -e : e); 708 } 709 } 710 return (0); 711 } 712