1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 7 * at Electronni Visti IA, Kiev, Ukraine. 8 * All rights reserved. 9 * 10 * Copyright (c) 2011 The FreeBSD Foundation 11 * All rights reserved. 12 * Portions of this software were developed by David Chisnall 13 * under sponsorship from the FreeBSD Foundation. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * Adapted to xlocale by John Marino <draco@marino.st> 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include "namespace.h" 43 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/mman.h> 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <wchar.h> 53 #include <errno.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include "un-namespace.h" 57 58 #include "collate.h" 59 #include "setlocale.h" 60 #include "ldpart.h" 61 #include "libc_private.h" 62 63 struct xlocale_collate __xlocale_global_collate = { 64 {{0}, "C"}, 1, 0, 0, 0 65 }; 66 67 struct xlocale_collate __xlocale_C_collate = { 68 {{0}, "C"}, 1, 0, 0, 0 69 }; 70 71 static int 72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 73 74 static void 75 destruct_collate(void *t) 76 { 77 struct xlocale_collate *table = t; 78 if (table->map && (table->maplen > 0)) { 79 (void) munmap(table->map, table->maplen); 80 } 81 free(t); 82 } 83 84 void * 85 __collate_load(const char *encoding, __unused locale_t unused) 86 { 87 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 88 strncmp(encoding, "C.", 2) == 0) { 89 return &__xlocale_C_collate; 90 } 91 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 92 table->header.header.destructor = destruct_collate; 93 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 94 // the caching outside of this section 95 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 96 xlocale_release(table); 97 return NULL; 98 } 99 return table; 100 } 101 102 /** 103 * Load the collation tables for the specified encoding into the global table. 104 */ 105 int 106 __collate_load_tables(const char *encoding) 107 { 108 109 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 110 } 111 112 static int 113 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 114 { 115 int i, chains, z; 116 char *buf; 117 char *TMP; 118 char *map; 119 collate_info_t *info; 120 struct stat sbuf; 121 int fd; 122 123 table->__collate_load_error = 1; 124 125 /* 'encoding' must be already checked. */ 126 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 || 127 strncmp(encoding, "C.", 2) == 0) { 128 return (_LDP_CACHE); 129 } 130 131 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 132 return (_LDP_ERROR); 133 134 if ((fd = _open(buf, O_RDONLY)) < 0) { 135 free(buf); 136 return (_LDP_ERROR); 137 } 138 free(buf); 139 if (_fstat(fd, &sbuf) < 0) { 140 (void) _close(fd); 141 return (_LDP_ERROR); 142 } 143 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 144 (void) _close(fd); 145 errno = EINVAL; 146 return (_LDP_ERROR); 147 } 148 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 149 (void) _close(fd); 150 if ((TMP = map) == MAP_FAILED) { 151 return (_LDP_ERROR); 152 } 153 154 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 155 (void) munmap(map, sbuf.st_size); 156 errno = EINVAL; 157 return (_LDP_ERROR); 158 } 159 TMP += COLLATE_STR_LEN; 160 161 info = (void *)TMP; 162 TMP += sizeof (*info); 163 164 if ((info->directive_count < 1) || 165 (info->directive_count >= COLL_WEIGHTS_MAX) || 166 ((chains = info->chain_count) < 0)) { 167 (void) munmap(map, sbuf.st_size); 168 errno = EINVAL; 169 return (_LDP_ERROR); 170 } 171 172 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 173 (sizeof (collate_chain_t) * chains) + 174 (sizeof (collate_large_t) * info->large_count); 175 for (z = 0; z < info->directive_count; z++) { 176 i += sizeof (collate_subst_t) * info->subst_count[z]; 177 } 178 if (i != (sbuf.st_size - (TMP - map))) { 179 (void) munmap(map, sbuf.st_size); 180 errno = EINVAL; 181 return (_LDP_ERROR); 182 } 183 184 if (table->map && (table->maplen > 0)) { 185 (void) munmap(table->map, table->maplen); 186 } 187 table->map = map; 188 table->maplen = sbuf.st_size; 189 table->info = info; 190 table->char_pri_table = (void *)TMP; 191 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 192 193 for (z = 0; z < info->directive_count; z++) { 194 if (info->subst_count[z] > 0) { 195 table->subst_table[z] = (void *)TMP; 196 TMP += info->subst_count[z] * sizeof (collate_subst_t); 197 } else { 198 table->subst_table[z] = NULL; 199 } 200 } 201 202 if (chains > 0) { 203 table->chain_pri_table = (void *)TMP; 204 TMP += chains * sizeof (collate_chain_t); 205 } else 206 table->chain_pri_table = NULL; 207 if (info->large_count > 0) 208 table->large_pri_table = (void *)TMP; 209 else 210 table->large_pri_table = NULL; 211 212 table->__collate_load_error = 0; 213 return (_LDP_LOADED); 214 } 215 216 static const int32_t * 217 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 218 { 219 const collate_subst_t *p; 220 int n = table->info->subst_count[pass]; 221 222 if (n == 0) 223 return (NULL); 224 225 if (pass >= table->info->directive_count) 226 return (NULL); 227 228 if (!(key & COLLATE_SUBST_PRIORITY)) 229 return (NULL); 230 231 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 232 assert(p->key == key); 233 234 return (p->pri); 235 } 236 237 static collate_chain_t * 238 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 239 { 240 int low = 0; 241 int high = table->info->chain_count - 1; 242 int next, compar, l; 243 collate_chain_t *p; 244 collate_chain_t *tab = table->chain_pri_table; 245 246 if (high < 0) 247 return (NULL); 248 249 while (low <= high) { 250 next = (low + high) / 2; 251 p = tab + next; 252 compar = *key - *p->str; 253 if (compar == 0) { 254 l = wcsnlen(p->str, COLLATE_STR_LEN); 255 compar = wcsncmp(key, p->str, l); 256 if (compar == 0) { 257 *len = l; 258 return (p); 259 } 260 } 261 if (compar > 0) 262 low = next + 1; 263 else 264 high = next - 1; 265 } 266 return (NULL); 267 } 268 269 static collate_large_t * 270 largesearch(struct xlocale_collate *table, const wchar_t key) 271 { 272 int low = 0; 273 int high = table->info->large_count - 1; 274 int next, compar; 275 collate_large_t *p; 276 collate_large_t *tab = table->large_pri_table; 277 278 if (high < 0) 279 return (NULL); 280 281 while (low <= high) { 282 next = (low + high) / 2; 283 p = tab + next; 284 compar = key - p->val; 285 if (compar == 0) 286 return (p); 287 if (compar > 0) 288 low = next + 1; 289 else 290 high = next - 1; 291 } 292 return (NULL); 293 } 294 295 void 296 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 297 int *pri, int which, const int **state) 298 { 299 collate_chain_t *p2; 300 collate_large_t *match; 301 int p, l; 302 const int *sptr; 303 304 /* 305 * If this is the "last" pass for the UNDEFINED, then 306 * we just return the priority itself. 307 */ 308 if (which >= table->info->directive_count) { 309 *pri = *t; 310 *len = 1; 311 *state = NULL; 312 return; 313 } 314 315 /* 316 * If we have remaining substitution data from a previous 317 * call, consume it first. 318 */ 319 if ((sptr = *state) != NULL) { 320 *pri = *sptr; 321 sptr++; 322 if ((sptr == *state) || (sptr == NULL)) 323 *state = NULL; 324 else 325 *state = sptr; 326 *len = 0; 327 return; 328 } 329 330 /* No active substitutions */ 331 *len = 1; 332 333 /* 334 * Check for composites such as diphthongs that collate as a 335 * single element (aka chains or collating-elements). 336 */ 337 if (((p2 = chainsearch(table, t, &l)) != NULL) && 338 ((p = p2->pri[which]) >= 0)) { 339 340 *len = l; 341 *pri = p; 342 343 } else if (*t <= UCHAR_MAX) { 344 345 /* 346 * Character is a small (8-bit) character. 347 * We just look these up directly for speed. 348 */ 349 *pri = table->char_pri_table[*t].pri[which]; 350 351 } else if ((table->info->large_count > 0) && 352 ((match = largesearch(table, *t)) != NULL)) { 353 354 /* 355 * Character was found in the extended table. 356 */ 357 *pri = match->pri.pri[which]; 358 359 } else { 360 /* 361 * Character lacks a specific definition. 362 */ 363 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 364 /* Mask off sign bit to prevent ordering confusion. */ 365 *pri = (*t & COLLATE_MAX_PRIORITY); 366 } else { 367 *pri = table->info->undef_pri[which]; 368 } 369 /* No substitutions for undefined characters! */ 370 return; 371 } 372 373 /* 374 * Try substituting (expanding) the character. We are 375 * currently doing this *after* the chain compression. I 376 * think it should not matter, but this way might be slightly 377 * faster. 378 * 379 * We do this after the priority search, as this will help us 380 * to identify a single key value. In order for this to work, 381 * its important that the priority assigned to a given element 382 * to be substituted be unique for that level. The localedef 383 * code ensures this for us. 384 */ 385 if ((sptr = substsearch(table, *pri, which)) != NULL) { 386 if ((*pri = *sptr) > 0) { 387 sptr++; 388 *state = *sptr ? sptr : NULL; 389 } 390 } 391 392 } 393 394 /* 395 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 396 * NOT NULL terminate. That is left to the caller. 397 */ 398 size_t 399 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 400 size_t room) 401 { 402 int pri; 403 int len; 404 const wchar_t *t; 405 wchar_t *tr = NULL; 406 int direc; 407 int pass; 408 const int32_t *state; 409 size_t want = 0; 410 size_t need = 0; 411 int ndir = table->info->directive_count; 412 413 assert(src); 414 415 for (pass = 0; pass <= ndir; pass++) { 416 417 state = NULL; 418 419 if (pass != 0) { 420 /* insert level separator from the previous pass */ 421 if (room) { 422 *xf++ = 1; 423 room--; 424 } 425 want++; 426 } 427 428 /* special pass for undefined */ 429 if (pass == ndir) { 430 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 431 } else { 432 direc = table->info->directive[pass]; 433 } 434 435 t = src; 436 437 if (direc & DIRECTIVE_BACKWARD) { 438 wchar_t *bp, *fp, c; 439 free(tr); 440 if ((tr = wcsdup(t)) == NULL) { 441 errno = ENOMEM; 442 goto fail; 443 } 444 bp = tr; 445 fp = tr + wcslen(tr) - 1; 446 while (bp < fp) { 447 c = *bp; 448 *bp++ = *fp; 449 *fp-- = c; 450 } 451 t = (const wchar_t *)tr; 452 } 453 454 if (direc & DIRECTIVE_POSITION) { 455 while (*t || state) { 456 _collate_lookup(table, t, &len, &pri, pass, &state); 457 t += len; 458 if (pri <= 0) { 459 if (pri < 0) { 460 errno = EINVAL; 461 goto fail; 462 } 463 state = NULL; 464 pri = COLLATE_MAX_PRIORITY; 465 } 466 if (room) { 467 *xf++ = pri; 468 room--; 469 } 470 want++; 471 need = want; 472 } 473 } else { 474 while (*t || state) { 475 _collate_lookup(table, t, &len, &pri, pass, &state); 476 t += len; 477 if (pri <= 0) { 478 if (pri < 0) { 479 errno = EINVAL; 480 goto fail; 481 } 482 state = NULL; 483 continue; 484 } 485 if (room) { 486 *xf++ = pri; 487 room--; 488 } 489 want++; 490 need = want; 491 } 492 } 493 } 494 free(tr); 495 return (need); 496 497 fail: 498 free(tr); 499 return ((size_t)(-1)); 500 } 501 502 /* 503 * In the non-POSIX case, we transform each character into a string of 504 * characters representing the character's priority. Since char is usually 505 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 506 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 507 * bits per byte. 508 * 509 * It turns out that we sometimes have real priorities that are 510 * 31-bits wide. (But: be careful using priorities where the high 511 * order bit is set -- i.e. the priority is negative. The sort order 512 * may be surprising!) 513 * 514 * TODO: This would be a good area to optimize somewhat. It turns out 515 * that real prioririties *except for the last UNDEFINED pass* are generally 516 * very small. We need the localedef code to precalculate the max 517 * priority for us, and ideally also give us a mask, and then we could 518 * severely limit what we expand to. 519 */ 520 #define XFRM_BYTES 6 521 #define XFRM_OFFSET ('0') /* make all printable characters */ 522 #define XFRM_SHIFT 6 523 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 524 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 525 526 static int 527 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 528 { 529 /* we use unsigned to ensure zero fill on right shift */ 530 uint32_t val = (uint32_t)table->info->pri_count[pass]; 531 int nc = 0; 532 533 while (val) { 534 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 535 pri >>= XFRM_SHIFT; 536 val >>= XFRM_SHIFT; 537 p++; 538 nc++; 539 } 540 return (nc); 541 } 542 543 size_t 544 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 545 size_t room) 546 { 547 int pri; 548 int len; 549 const wchar_t *t; 550 wchar_t *tr = NULL; 551 int direc; 552 int pass; 553 const int32_t *state; 554 size_t want = 0; 555 size_t need = 0; 556 int b; 557 uint8_t buf[XFRM_BYTES]; 558 int ndir = table->info->directive_count; 559 560 assert(src); 561 562 for (pass = 0; pass <= ndir; pass++) { 563 564 state = NULL; 565 566 if (pass != 0) { 567 /* insert level separator from the previous pass */ 568 if (room) { 569 *xf++ = XFRM_SEP; 570 room--; 571 } 572 want++; 573 } 574 575 /* special pass for undefined */ 576 if (pass == ndir) { 577 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 578 } else { 579 direc = table->info->directive[pass]; 580 } 581 582 t = src; 583 584 if (direc & DIRECTIVE_BACKWARD) { 585 wchar_t *bp, *fp, c; 586 free(tr); 587 if ((tr = wcsdup(t)) == NULL) { 588 errno = ENOMEM; 589 goto fail; 590 } 591 bp = tr; 592 fp = tr + wcslen(tr) - 1; 593 while (bp < fp) { 594 c = *bp; 595 *bp++ = *fp; 596 *fp-- = c; 597 } 598 t = (const wchar_t *)tr; 599 } 600 601 if (direc & DIRECTIVE_POSITION) { 602 while (*t || state) { 603 604 _collate_lookup(table, t, &len, &pri, pass, &state); 605 t += len; 606 if (pri <= 0) { 607 if (pri < 0) { 608 errno = EINVAL; 609 goto fail; 610 } 611 state = NULL; 612 pri = COLLATE_MAX_PRIORITY; 613 } 614 615 b = xfrm(table, buf, pri, pass); 616 want += b; 617 if (room) { 618 while (b) { 619 b--; 620 if (room) { 621 *xf++ = buf[b]; 622 room--; 623 } 624 } 625 } 626 need = want; 627 } 628 } else { 629 while (*t || state) { 630 _collate_lookup(table, t, &len, &pri, pass, &state); 631 t += len; 632 if (pri <= 0) { 633 if (pri < 0) { 634 errno = EINVAL; 635 goto fail; 636 } 637 state = NULL; 638 continue; 639 } 640 641 b = xfrm(table, buf, pri, pass); 642 want += b; 643 if (room) { 644 645 while (b) { 646 b--; 647 if (room) { 648 *xf++ = buf[b]; 649 room--; 650 } 651 } 652 } 653 need = want; 654 } 655 } 656 } 657 free(tr); 658 return (need); 659 660 fail: 661 free(tr); 662 return ((size_t)(-1)); 663 } 664 665 /* 666 * __collate_equiv_value returns the primary collation value for the given 667 * collating symbol specified by str and len. Zero or negative is returned 668 * if the collating symbol was not found. This function is used by bracket 669 * code in the TRE regex library. 670 */ 671 int 672 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 673 { 674 int32_t e; 675 676 if (len < 1 || len >= COLLATE_STR_LEN) 677 return (-1); 678 679 FIX_LOCALE(locale); 680 struct xlocale_collate *table = 681 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 682 683 if (table->__collate_load_error) 684 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 685 686 if (len == 1) { 687 e = -1; 688 if (*str <= UCHAR_MAX) 689 e = table->char_pri_table[*str].pri[0]; 690 else if (table->info->large_count > 0) { 691 collate_large_t *match_large; 692 match_large = largesearch(table, *str); 693 if (match_large) 694 e = match_large->pri.pri[0]; 695 } 696 if (e == 0) 697 return (1); 698 return (e > 0 ? e : 0); 699 } 700 if (table->info->chain_count > 0) { 701 wchar_t name[COLLATE_STR_LEN]; 702 collate_chain_t *match_chain; 703 int clen; 704 705 wcsncpy (name, str, len); 706 name[len] = 0; 707 match_chain = chainsearch(table, name, &clen); 708 if (match_chain) { 709 e = match_chain->pri[0]; 710 if (e == 0) 711 return (1); 712 return (e < 0 ? -e : e); 713 } 714 } 715 return (0); 716 } 717