1 /*- 2 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 5 * at Electronni Visti IA, Kiev, Ukraine. 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Adapted to xlocale by John Marino <draco@marino.st> 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "namespace.h" 41 42 #include <sys/types.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include <assert.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include <errno.h> 52 #include <unistd.h> 53 #include <fcntl.h> 54 #include "un-namespace.h" 55 56 #include "endian.h" 57 #include "collate.h" 58 #include "setlocale.h" 59 #include "ldpart.h" 60 #include "libc_private.h" 61 62 struct xlocale_collate __xlocale_global_collate = { 63 {{0}, "C"}, 1, 0, 0, 0 64 }; 65 66 struct xlocale_collate __xlocale_C_collate = { 67 {{0}, "C"}, 1, 0, 0, 0 68 }; 69 70 static int 71 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 72 73 static void 74 destruct_collate(void *t) 75 { 76 struct xlocale_collate *table = t; 77 if (table->map && (table->maplen > 0)) { 78 (void) munmap(table->map, table->maplen); 79 } 80 free(t); 81 } 82 83 void * 84 __collate_load(const char *encoding, __unused locale_t unused) 85 { 86 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 87 return &__xlocale_C_collate; 88 } 89 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 90 table->header.header.destructor = destruct_collate; 91 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 92 // the caching outside of this section 93 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 94 xlocale_release(table); 95 return NULL; 96 } 97 return table; 98 } 99 100 /** 101 * Load the collation tables for the specified encoding into the global table. 102 */ 103 int 104 __collate_load_tables(const char *encoding) 105 { 106 107 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 108 } 109 110 int 111 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 112 { 113 int i, chains, z; 114 char *buf; 115 char *TMP; 116 char *map; 117 collate_info_t *info; 118 struct stat sbuf; 119 int fd; 120 121 table->__collate_load_error = 1; 122 123 /* 'encoding' must be already checked. */ 124 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 125 return (_LDP_CACHE); 126 } 127 128 if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1) 129 return (_LDP_ERROR); 130 131 if ((fd = _open(buf, O_RDONLY)) < 0) { 132 free(buf); 133 return (_LDP_ERROR); 134 } 135 free(buf); 136 if (_fstat(fd, &sbuf) < 0) { 137 (void) _close(fd); 138 return (_LDP_ERROR); 139 } 140 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 141 (void) _close(fd); 142 errno = EINVAL; 143 return (_LDP_ERROR); 144 } 145 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 146 (void) _close(fd); 147 if ((TMP = map) == NULL) { 148 return (_LDP_ERROR); 149 } 150 151 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 152 (void) munmap(map, sbuf.st_size); 153 errno = EINVAL; 154 return (_LDP_ERROR); 155 } 156 TMP += COLLATE_STR_LEN; 157 158 info = (void *)TMP; 159 TMP += sizeof (*info); 160 161 if ((info->directive_count < 1) || 162 (info->directive_count >= COLL_WEIGHTS_MAX) || 163 ((chains = BSWAP(info->chain_count)) < 0)) { 164 (void) munmap(map, sbuf.st_size); 165 errno = EINVAL; 166 return (_LDP_ERROR); 167 } 168 169 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 170 (sizeof (collate_chain_t) * chains) + 171 (sizeof (collate_large_t) * BSWAP(info->large_count)); 172 for (z = 0; z < info->directive_count; z++) { 173 i += sizeof (collate_subst_t) * BSWAP(info->subst_count[z]); 174 } 175 if (i != (sbuf.st_size - (TMP - map))) { 176 (void) munmap(map, sbuf.st_size); 177 errno = EINVAL; 178 return (_LDP_ERROR); 179 } 180 181 table->info = info; 182 table->char_pri_table = (void *)TMP; 183 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 184 185 for (z = 0; z < info->directive_count; z++) { 186 if (BSWAP(info->subst_count[z]) > 0) { 187 table->subst_table[z] = (void *)TMP; 188 TMP += BSWAP(info->subst_count[z]) * sizeof (collate_subst_t); 189 } else { 190 table->subst_table[z] = NULL; 191 } 192 } 193 194 if (chains > 0) { 195 table->chain_pri_table = (void *)TMP; 196 TMP += chains * sizeof (collate_chain_t); 197 } else 198 table->chain_pri_table = NULL; 199 if (BSWAP(info->large_count) > 0) 200 table->large_pri_table = (void *)TMP; 201 else 202 table->large_pri_table = NULL; 203 204 table->__collate_load_error = 0; 205 return (_LDP_LOADED); 206 } 207 208 static const int32_t * 209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 210 { 211 const collate_subst_t *p; 212 int n = BSWAP(table->info->subst_count[pass]); 213 214 if (n == 0) 215 return (NULL); 216 217 if (pass >= table->info->directive_count) 218 return (NULL); 219 220 if (!(key & COLLATE_SUBST_PRIORITY)) 221 return (NULL); 222 223 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 224 assert(BSWAP(p->key) == key); 225 226 return (p->pri); 227 } 228 229 static collate_chain_t * 230 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 231 { 232 int low = 0; 233 int high = BSWAP(table->info->chain_count) - 1; 234 int next, compar, l; 235 collate_chain_t *p; 236 collate_chain_t *tab = table->chain_pri_table; 237 238 if (high < 0) 239 return (NULL); 240 241 while (low <= high) { 242 next = (low + high) / 2; 243 p = tab + next; 244 compar = *key - le16toh(*p->str); 245 if (compar == 0) { 246 l = wcsnlen(p->str, COLLATE_STR_LEN); 247 compar = wcsncmp(key, p->str, l); 248 if (compar == 0) { 249 *len = l; 250 return (p); 251 } 252 } 253 if (compar > 0) 254 low = next + 1; 255 else 256 high = next - 1; 257 } 258 return (NULL); 259 } 260 261 static collate_large_t * 262 largesearch(struct xlocale_collate *table, const wchar_t key) 263 { 264 int low = 0; 265 int high = BSWAP(table->info->large_count) - 1; 266 int next, compar; 267 collate_large_t *p; 268 collate_large_t *tab = table->large_pri_table; 269 270 if (high < 0) 271 return (NULL); 272 273 while (low <= high) { 274 next = (low + high) / 2; 275 p = tab + next; 276 compar = key - BSWAP(p->val); 277 if (compar == 0) 278 return (p); 279 if (compar > 0) 280 low = next + 1; 281 else 282 high = next - 1; 283 } 284 return (NULL); 285 } 286 287 void 288 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 289 int *pri, int which, const int **state) 290 { 291 collate_chain_t *p2; 292 collate_large_t *match; 293 int p, l; 294 const int *sptr; 295 296 /* 297 * If this is the "last" pass for the UNDEFINED, then 298 * we just return the priority itself. 299 */ 300 if (which >= table->info->directive_count) { 301 *pri = *t; 302 *len = 1; 303 *state = NULL; 304 return; 305 } 306 307 /* 308 * If we have remaining substitution data from a previous 309 * call, consume it first. 310 */ 311 if ((sptr = *state) != NULL) { 312 *pri = *sptr; 313 sptr++; 314 if ((sptr == *state) || (sptr == NULL)) 315 *state = NULL; 316 else 317 *state = sptr; 318 *len = 0; 319 return; 320 } 321 322 /* No active substitutions */ 323 *len = 1; 324 325 /* 326 * Check for composites such as diphthongs that collate as a 327 * single element (aka chains or collating-elements). 328 */ 329 if (((p2 = chainsearch(table, t, &l)) != NULL) && 330 ((p = p2->pri[which]) >= 0)) { 331 332 *len = l; 333 *pri = p; 334 335 } else if (*t <= UCHAR_MAX) { 336 337 /* 338 * Character is a small (8-bit) character. 339 * We just look these up directly for speed. 340 */ 341 *pri = BSWAP(table->char_pri_table[*t].pri[which]); 342 343 } else if ((BSWAP(table->info->large_count) > 0) && 344 ((match = largesearch(table, *t)) != NULL)) { 345 346 /* 347 * Character was found in the extended table. 348 */ 349 *pri = BSWAP(match->pri.pri[which]); 350 351 } else { 352 /* 353 * Character lacks a specific definition. 354 */ 355 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 356 /* Mask off sign bit to prevent ordering confusion. */ 357 *pri = (*t & COLLATE_MAX_PRIORITY); 358 } else { 359 *pri = BSWAP(table->info->undef_pri[which]); 360 } 361 /* No substitutions for undefined characters! */ 362 return; 363 } 364 365 /* 366 * Try substituting (expanding) the character. We are 367 * currently doing this *after* the chain compression. I 368 * think it should not matter, but this way might be slightly 369 * faster. 370 * 371 * We do this after the priority search, as this will help us 372 * to identify a single key value. In order for this to work, 373 * its important that the priority assigned to a given element 374 * to be substituted be unique for that level. The localedef 375 * code ensures this for us. 376 */ 377 if ((sptr = substsearch(table, *pri, which)) != NULL) { 378 if ((*pri = BSWAP(*sptr)) > 0) { 379 sptr++; 380 *state = BSWAP(*sptr) ? sptr : NULL; 381 } 382 } 383 384 } 385 386 /* 387 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 388 * NOT NULL terminate. That is left to the caller. 389 */ 390 size_t 391 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 392 size_t room) 393 { 394 int pri; 395 int len; 396 const wchar_t *t; 397 wchar_t *tr = NULL; 398 int direc; 399 int pass; 400 const int32_t *state; 401 size_t want = 0; 402 size_t need = 0; 403 int ndir = table->info->directive_count; 404 405 assert(src); 406 407 for (pass = 0; pass <= ndir; pass++) { 408 409 state = NULL; 410 411 if (pass != 0) { 412 /* insert level separator from the previous pass */ 413 if (room) { 414 *xf++ = 1; 415 room--; 416 } 417 want++; 418 } 419 420 /* special pass for undefined */ 421 if (pass == ndir) { 422 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 423 } else { 424 direc = table->info->directive[pass]; 425 } 426 427 t = src; 428 429 if (direc & DIRECTIVE_BACKWARD) { 430 wchar_t *bp, *fp, c; 431 free(tr); 432 if ((tr = wcsdup(t)) == NULL) { 433 errno = ENOMEM; 434 goto fail; 435 } 436 bp = tr; 437 fp = tr + wcslen(tr) - 1; 438 while (bp < fp) { 439 c = *bp; 440 *bp++ = *fp; 441 *fp-- = c; 442 } 443 t = (const wchar_t *)tr; 444 } 445 446 if (direc & DIRECTIVE_POSITION) { 447 while (*t || state) { 448 _collate_lookup(table, t, &len, &pri, pass, &state); 449 t += len; 450 if (pri <= 0) { 451 if (pri < 0) { 452 errno = EINVAL; 453 goto fail; 454 } 455 state = NULL; 456 pri = COLLATE_MAX_PRIORITY; 457 } 458 if (room) { 459 *xf++ = pri; 460 room--; 461 } 462 want++; 463 need = want; 464 } 465 } else { 466 while (*t || state) { 467 _collate_lookup(table, t, &len, &pri, pass, &state); 468 t += len; 469 if (pri <= 0) { 470 if (pri < 0) { 471 errno = EINVAL; 472 goto fail; 473 } 474 state = NULL; 475 continue; 476 } 477 if (room) { 478 *xf++ = pri; 479 room--; 480 } 481 want++; 482 need = want; 483 } 484 } 485 } 486 free(tr); 487 return (need); 488 489 fail: 490 free(tr); 491 return ((size_t)(-1)); 492 } 493 494 /* 495 * In the non-POSIX case, we transform each character into a string of 496 * characters representing the character's priority. Since char is usually 497 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 498 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 499 * bits per byte. 500 * 501 * It turns out that we sometimes have real priorities that are 502 * 31-bits wide. (But: be careful using priorities where the high 503 * order bit is set -- i.e. the priority is negative. The sort order 504 * may be surprising!) 505 * 506 * TODO: This would be a good area to optimize somewhat. It turns out 507 * that real prioririties *except for the last UNDEFINED pass* are generally 508 * very small. We need the localedef code to precalculate the max 509 * priority for us, and ideally also give us a mask, and then we could 510 * severely limit what we expand to. 511 */ 512 #define XFRM_BYTES 6 513 #define XFRM_OFFSET ('0') /* make all printable characters */ 514 #define XFRM_SHIFT 6 515 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 516 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 517 518 static int 519 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 520 { 521 /* we use unsigned to ensure zero fill on right shift */ 522 uint32_t val = BSWAP((uint32_t)table->info->pri_count[pass]); 523 int nc = 0; 524 525 while (val) { 526 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 527 pri >>= XFRM_SHIFT; 528 val >>= XFRM_SHIFT; 529 p++; 530 nc++; 531 } 532 return (nc); 533 } 534 535 size_t 536 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 537 size_t room) 538 { 539 int pri; 540 int len; 541 const wchar_t *t; 542 wchar_t *tr = NULL; 543 int direc; 544 int pass; 545 const int32_t *state; 546 size_t want = 0; 547 size_t need = 0; 548 int b; 549 uint8_t buf[XFRM_BYTES]; 550 int ndir = table->info->directive_count; 551 552 assert(src); 553 554 for (pass = 0; pass <= ndir; pass++) { 555 556 state = NULL; 557 558 if (pass != 0) { 559 /* insert level separator from the previous pass */ 560 if (room) { 561 *xf++ = XFRM_SEP; 562 room--; 563 } 564 want++; 565 } 566 567 /* special pass for undefined */ 568 if (pass == ndir) { 569 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 570 } else { 571 direc = table->info->directive[pass]; 572 } 573 574 t = src; 575 576 if (direc & DIRECTIVE_BACKWARD) { 577 wchar_t *bp, *fp, c; 578 free(tr); 579 if ((tr = wcsdup(t)) == NULL) { 580 errno = ENOMEM; 581 goto fail; 582 } 583 bp = tr; 584 fp = tr + wcslen(tr) - 1; 585 while (bp < fp) { 586 c = *bp; 587 *bp++ = *fp; 588 *fp-- = c; 589 } 590 t = (const wchar_t *)tr; 591 } 592 593 if (direc & DIRECTIVE_POSITION) { 594 while (*t || state) { 595 596 _collate_lookup(table, t, &len, &pri, pass, &state); 597 t += len; 598 if (pri <= 0) { 599 if (pri < 0) { 600 errno = EINVAL; 601 goto fail; 602 } 603 state = NULL; 604 pri = COLLATE_MAX_PRIORITY; 605 } 606 607 b = xfrm(table, buf, pri, pass); 608 want += b; 609 if (room) { 610 while (b) { 611 b--; 612 if (room) { 613 *xf++ = buf[b]; 614 room--; 615 } 616 } 617 } 618 need = want; 619 } 620 } else { 621 while (*t || state) { 622 _collate_lookup(table, t, &len, &pri, pass, &state); 623 t += len; 624 if (pri <= 0) { 625 if (pri < 0) { 626 errno = EINVAL; 627 goto fail; 628 } 629 state = NULL; 630 continue; 631 } 632 633 b = xfrm(table, buf, pri, pass); 634 want += b; 635 if (room) { 636 637 while (b) { 638 b--; 639 if (room) { 640 *xf++ = buf[b]; 641 room--; 642 } 643 } 644 } 645 need = want; 646 } 647 } 648 } 649 free(tr); 650 return (need); 651 652 fail: 653 free(tr); 654 return ((size_t)(-1)); 655 } 656 657 /* 658 * __collate_equiv_value returns the primary collation value for the given 659 * collating symbol specified by str and len. Zero or negative is returned 660 * if the collating symbol was not found. This function is used by bracket 661 * code in the TRE regex library. 662 */ 663 int 664 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 665 { 666 int32_t e; 667 668 if (len < 1 || len >= COLLATE_STR_LEN) 669 return (-1); 670 671 FIX_LOCALE(locale); 672 struct xlocale_collate *table = 673 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 674 675 if (table->__collate_load_error) 676 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 677 678 if (len == 1) { 679 e = -1; 680 if (*str <= UCHAR_MAX) 681 e = table->char_pri_table[*str].pri[0]; 682 else if (BSWAP(table->info->large_count) > 0) { 683 collate_large_t *match_large; 684 match_large = largesearch(table, *str); 685 if (match_large) 686 e = match_large->pri.pri[0]; 687 } 688 if (e == 0) 689 return (1); 690 return (e > 0 ? e : 0); 691 } 692 if (BSWAP(table->info->chain_count) > 0) { 693 wchar_t name[COLLATE_STR_LEN]; 694 collate_chain_t *match_chain; 695 int clen; 696 697 wcsncpy (name, str, len); 698 name[len] = 0; 699 match_chain = chainsearch(table, name, &clen); 700 if (match_chain) { 701 e = match_chain->pri[0]; 702 if (e == 0) 703 return (1); 704 return (e < 0 ? -e : e); 705 } 706 } 707 return (0); 708 } 709