1 /*- 2 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 5 * at Electronni Visti IA, Kiev, Ukraine. 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Adapted to xlocale by John Marino <draco@marino.st> 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "namespace.h" 41 42 #include <sys/types.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include <assert.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include <errno.h> 52 #include <unistd.h> 53 #include <fcntl.h> 54 #include "un-namespace.h" 55 56 #include "collate.h" 57 #include "setlocale.h" 58 #include "ldpart.h" 59 #include "libc_private.h" 60 61 struct xlocale_collate __xlocale_global_collate = { 62 {{0}, "C"}, 1, 0, 0, 0 63 }; 64 65 struct xlocale_collate __xlocale_C_collate = { 66 {{0}, "C"}, 1, 0, 0, 0 67 }; 68 69 static int 70 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 71 72 static void 73 destruct_collate(void *t) 74 { 75 struct xlocale_collate *table = t; 76 if (table->map && (table->maplen > 0)) { 77 (void) munmap(table->map, table->maplen); 78 } 79 free(t); 80 } 81 82 void * 83 __collate_load(const char *encoding, __unused locale_t unused) 84 { 85 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 86 return &__xlocale_C_collate; 87 } 88 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 89 table->header.header.destructor = destruct_collate; 90 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 91 // the caching outside of this section 92 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 93 xlocale_release(table); 94 return NULL; 95 } 96 return table; 97 } 98 99 /** 100 * Load the collation tables for the specified encoding into the global table. 101 */ 102 int 103 __collate_load_tables(const char *encoding) 104 { 105 106 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 107 } 108 109 int 110 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 111 { 112 int i, chains, z; 113 char *buf; 114 char *TMP; 115 char *map; 116 collate_info_t *info; 117 struct stat sbuf; 118 int fd; 119 120 table->__collate_load_error = 1; 121 122 /* 'encoding' must be already checked. */ 123 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 124 return (_LDP_CACHE); 125 } 126 127 asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding); 128 if (buf == NULL) 129 return (_LDP_ERROR); 130 131 if ((fd = _open(buf, O_RDONLY)) < 0) { 132 free(buf); 133 return (_LDP_ERROR); 134 } 135 free(buf); 136 if (_fstat(fd, &sbuf) < 0) { 137 (void) _close(fd); 138 return (_LDP_ERROR); 139 } 140 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 141 (void) _close(fd); 142 errno = EINVAL; 143 return (_LDP_ERROR); 144 } 145 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 146 (void) _close(fd); 147 if ((TMP = map) == NULL) { 148 return (_LDP_ERROR); 149 } 150 151 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 152 (void) munmap(map, sbuf.st_size); 153 errno = EINVAL; 154 return (_LDP_ERROR); 155 } 156 TMP += COLLATE_STR_LEN; 157 158 info = (void *)TMP; 159 TMP += sizeof (*info); 160 161 if ((info->directive_count < 1) || 162 (info->directive_count >= COLL_WEIGHTS_MAX) || 163 ((chains = info->chain_count) < 0)) { 164 (void) munmap(map, sbuf.st_size); 165 errno = EINVAL; 166 return (_LDP_ERROR); 167 } 168 169 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 170 (sizeof (collate_chain_t) * chains) + 171 (sizeof (collate_large_t) * info->large_count); 172 for (z = 0; z < info->directive_count; z++) { 173 i += sizeof (collate_subst_t) * info->subst_count[z]; 174 } 175 if (i != (sbuf.st_size - (TMP - map))) { 176 (void) munmap(map, sbuf.st_size); 177 errno = EINVAL; 178 return (_LDP_ERROR); 179 } 180 181 table->info = info; 182 table->char_pri_table = (void *)TMP; 183 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 184 185 for (z = 0; z < info->directive_count; z++) { 186 if (info->subst_count[z] > 0) { 187 table->subst_table[z] = (void *)TMP; 188 TMP += info->subst_count[z] * sizeof (collate_subst_t); 189 } else { 190 table->subst_table[z] = NULL; 191 } 192 } 193 194 if (chains > 0) { 195 table->chain_pri_table = (void *)TMP; 196 TMP += chains * sizeof (collate_chain_t); 197 } else 198 table->chain_pri_table = NULL; 199 if (info->large_count > 0) 200 table->large_pri_table = (void *)TMP; 201 else 202 table->large_pri_table = NULL; 203 204 table->__collate_load_error = 0; 205 return (_LDP_LOADED); 206 } 207 208 static const int32_t * 209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 210 { 211 const collate_subst_t *p; 212 int n = table->info->subst_count[pass]; 213 214 if (n == 0) 215 return (NULL); 216 217 if (pass >= table->info->directive_count) 218 return (NULL); 219 220 if (!(key & COLLATE_SUBST_PRIORITY)) 221 return (NULL); 222 223 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 224 assert(p->key == key); 225 return (p->pri); 226 } 227 228 static collate_chain_t * 229 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 230 { 231 int low = 0; 232 int high = table->info->chain_count - 1;; 233 int next, compar, l; 234 collate_chain_t *p; 235 collate_chain_t *tab = table->chain_pri_table; 236 237 if (high < 0) 238 return (NULL); 239 240 while (low <= high) { 241 next = (low + high) / 2; 242 p = tab + next; 243 compar = *key - *p->str; 244 if (compar == 0) { 245 l = wcsnlen(p->str, COLLATE_STR_LEN); 246 compar = wcsncmp(key, p->str, l); 247 if (compar == 0) { 248 *len = l; 249 return (p); 250 } 251 } 252 if (compar > 0) 253 low = next + 1; 254 else 255 high = next - 1; 256 } 257 return (NULL); 258 } 259 260 static collate_large_t * 261 largesearch(struct xlocale_collate *table, const wchar_t key) 262 { 263 int low = 0; 264 int high = table->info->large_count - 1; 265 int next, compar; 266 collate_large_t *p; 267 collate_large_t *tab = table->large_pri_table; 268 269 if (high < 0) 270 return (NULL); 271 272 while (low <= high) { 273 next = (low + high) / 2; 274 p = tab + next; 275 compar = key - p->val; 276 if (compar == 0) 277 return (p); 278 if (compar > 0) 279 low = next + 1; 280 else 281 high = next - 1; 282 } 283 return (NULL); 284 } 285 286 void 287 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 288 int *pri, int which, const int **state) 289 { 290 collate_chain_t *p2; 291 collate_large_t *match; 292 int p, l; 293 const int *sptr; 294 295 /* 296 * If this is the "last" pass for the UNDEFINED, then 297 * we just return the priority itself. 298 */ 299 if (which >= table->info->directive_count) { 300 *pri = *t; 301 *len = 1; 302 *state = NULL; 303 return; 304 } 305 306 /* 307 * If we have remaining substitution data from a previous 308 * call, consume it first. 309 */ 310 if ((sptr = *state) != NULL) { 311 *pri = *sptr; 312 sptr++; 313 if ((sptr == *state) || (sptr == NULL)) 314 *state = NULL; 315 else 316 *state = sptr; 317 *len = 0; 318 return; 319 } 320 321 /* No active substitutions */ 322 *len = 1; 323 324 /* 325 * Check for composites such as diphthongs that collate as a 326 * single element (aka chains or collating-elements). 327 */ 328 if (((p2 = chainsearch(table, t, &l)) != NULL) && 329 ((p = p2->pri[which]) >= 0)) { 330 331 *len = l; 332 *pri = p; 333 334 } else if (*t <= UCHAR_MAX) { 335 336 /* 337 * Character is a small (8-bit) character. 338 * We just look these up directly for speed. 339 */ 340 *pri = table->char_pri_table[*t].pri[which]; 341 342 } else if ((table->info->large_count > 0) && 343 ((match = largesearch(table, *t)) != NULL)) { 344 345 /* 346 * Character was found in the extended table. 347 */ 348 *pri = match->pri.pri[which]; 349 350 } else { 351 /* 352 * Character lacks a specific definition. 353 */ 354 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 355 /* Mask off sign bit to prevent ordering confusion. */ 356 *pri = (*t & COLLATE_MAX_PRIORITY); 357 } else { 358 *pri = table->info->undef_pri[which]; 359 } 360 /* No substitutions for undefined characters! */ 361 return; 362 } 363 364 /* 365 * Try substituting (expanding) the character. We are 366 * currently doing this *after* the chain compression. I 367 * think it should not matter, but this way might be slightly 368 * faster. 369 * 370 * We do this after the priority search, as this will help us 371 * to identify a single key value. In order for this to work, 372 * its important that the priority assigned to a given element 373 * to be substituted be unique for that level. The localedef 374 * code ensures this for us. 375 */ 376 if ((sptr = substsearch(table, *pri, which)) != NULL) { 377 if ((*pri = *sptr) > 0) { 378 sptr++; 379 *state = *sptr ? sptr : NULL; 380 } 381 } 382 383 } 384 385 /* 386 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 387 * NOT NULL terminate. That is left to the caller. 388 */ 389 size_t 390 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 391 size_t room) 392 { 393 int pri; 394 int len; 395 const wchar_t *t; 396 wchar_t *tr = NULL; 397 int direc; 398 int pass; 399 const int32_t *state; 400 size_t want = 0; 401 size_t need = 0; 402 int ndir = table->info->directive_count; 403 404 assert(src); 405 406 for (pass = 0; pass <= ndir; pass++) { 407 408 state = NULL; 409 410 if (pass != 0) { 411 /* insert level separator from the previous pass */ 412 if (room) { 413 *xf++ = 1; 414 room--; 415 } 416 want++; 417 } 418 419 /* special pass for undefined */ 420 if (pass == ndir) { 421 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 422 } else { 423 direc = table->info->directive[pass]; 424 } 425 426 t = src; 427 428 if (direc & DIRECTIVE_BACKWARD) { 429 wchar_t *bp, *fp, c; 430 free(tr); 431 if ((tr = wcsdup(t)) == NULL) { 432 errno = ENOMEM; 433 goto fail; 434 } 435 bp = tr; 436 fp = tr + wcslen(tr) - 1; 437 while (bp < fp) { 438 c = *bp; 439 *bp++ = *fp; 440 *fp-- = c; 441 } 442 t = (const wchar_t *)tr; 443 } 444 445 if (direc & DIRECTIVE_POSITION) { 446 while (*t || state) { 447 _collate_lookup(table, t, &len, &pri, pass, &state); 448 t += len; 449 if (pri <= 0) { 450 if (pri < 0) { 451 errno = EINVAL; 452 goto fail; 453 } 454 state = NULL; 455 pri = COLLATE_MAX_PRIORITY; 456 } 457 if (room) { 458 *xf++ = pri; 459 room--; 460 } 461 want++; 462 need = want; 463 } 464 } else { 465 while (*t || state) { 466 _collate_lookup(table, t, &len, &pri, pass, &state); 467 t += len; 468 if (pri <= 0) { 469 if (pri < 0) { 470 errno = EINVAL; 471 goto fail; 472 } 473 state = NULL; 474 continue; 475 } 476 if (room) { 477 *xf++ = pri; 478 room--; 479 } 480 want++; 481 need = want; 482 } 483 } 484 } 485 free(tr); 486 return (need); 487 488 fail: 489 free(tr); 490 return ((size_t)(-1)); 491 } 492 493 /* 494 * In the non-POSIX case, we transform each character into a string of 495 * characters representing the character's priority. Since char is usually 496 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 497 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 498 * bits per byte. 499 * 500 * It turns out that we sometimes have real priorities that are 501 * 31-bits wide. (But: be careful using priorities where the high 502 * order bit is set -- i.e. the priority is negative. The sort order 503 * may be surprising!) 504 * 505 * TODO: This would be a good area to optimize somewhat. It turns out 506 * that real prioririties *except for the last UNDEFINED pass* are generally 507 * very small. We need the localedef code to precalculate the max 508 * priority for us, and ideally also give us a mask, and then we could 509 * severely limit what we expand to. 510 */ 511 #define XFRM_BYTES 6 512 #define XFRM_OFFSET ('0') /* make all printable characters */ 513 #define XFRM_SHIFT 6 514 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 515 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 516 517 static int 518 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 519 { 520 /* we use unsigned to ensure zero fill on right shift */ 521 uint32_t val = (uint32_t)table->info->pri_count[pass]; 522 int nc = 0; 523 524 while (val) { 525 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 526 pri >>= XFRM_SHIFT; 527 val >>= XFRM_SHIFT; 528 p++; 529 nc++; 530 } 531 return (nc); 532 } 533 534 size_t 535 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 536 size_t room) 537 { 538 int pri; 539 int len; 540 const wchar_t *t; 541 wchar_t *tr = NULL; 542 int direc; 543 int pass; 544 const int32_t *state; 545 size_t want = 0; 546 size_t need = 0; 547 int b; 548 uint8_t buf[XFRM_BYTES]; 549 int ndir = table->info->directive_count; 550 551 assert(src); 552 553 for (pass = 0; pass <= ndir; pass++) { 554 555 state = NULL; 556 557 if (pass != 0) { 558 /* insert level separator from the previous pass */ 559 if (room) { 560 *xf++ = XFRM_SEP; 561 room--; 562 } 563 want++; 564 } 565 566 /* special pass for undefined */ 567 if (pass == ndir) { 568 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 569 } else { 570 direc = table->info->directive[pass]; 571 } 572 573 t = src; 574 575 if (direc & DIRECTIVE_BACKWARD) { 576 wchar_t *bp, *fp, c; 577 free(tr); 578 if ((tr = wcsdup(t)) == NULL) { 579 errno = ENOMEM; 580 goto fail; 581 } 582 bp = tr; 583 fp = tr + wcslen(tr) - 1; 584 while (bp < fp) { 585 c = *bp; 586 *bp++ = *fp; 587 *fp-- = c; 588 } 589 t = (const wchar_t *)tr; 590 } 591 592 if (direc & DIRECTIVE_POSITION) { 593 while (*t || state) { 594 595 _collate_lookup(table, t, &len, &pri, pass, &state); 596 t += len; 597 if (pri <= 0) { 598 if (pri < 0) { 599 errno = EINVAL; 600 goto fail; 601 } 602 state = NULL; 603 pri = COLLATE_MAX_PRIORITY; 604 } 605 606 b = xfrm(table, buf, pri, pass); 607 want += b; 608 if (room) { 609 while (b) { 610 b--; 611 if (room) { 612 *xf++ = buf[b]; 613 room--; 614 } 615 } 616 } 617 need = want; 618 } 619 } else { 620 while (*t || state) { 621 _collate_lookup(table, t, &len, &pri, pass, &state); 622 t += len; 623 if (pri <= 0) { 624 if (pri < 0) { 625 errno = EINVAL; 626 goto fail; 627 } 628 state = NULL; 629 continue; 630 } 631 632 b = xfrm(table, buf, pri, pass); 633 want += b; 634 if (room) { 635 636 while (b) { 637 b--; 638 if (room) { 639 *xf++ = buf[b]; 640 room--; 641 } 642 } 643 } 644 need = want; 645 } 646 } 647 } 648 free(tr); 649 return (need); 650 651 fail: 652 free(tr); 653 return ((size_t)(-1)); 654 } 655 656 /* 657 * __collate_equiv_value returns the primary collation value for the given 658 * collating symbol specified by str and len. Zero or negative is returned 659 * if the collating symbol was not found. This function is used by bracket 660 * code in the TRE regex library. 661 */ 662 int 663 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 664 { 665 int32_t e; 666 667 if (len < 1 || len >= COLLATE_STR_LEN) 668 return (-1); 669 670 FIX_LOCALE(locale); 671 struct xlocale_collate *table = 672 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 673 674 if (table->__collate_load_error) 675 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 676 677 if (len == 1) { 678 e = -1; 679 if (*str <= UCHAR_MAX) 680 e = table->char_pri_table[*str].pri[0]; 681 else if (table->info->large_count > 0) { 682 collate_large_t *match_large; 683 match_large = largesearch(table, *str); 684 if (match_large) 685 e = match_large->pri.pri[0]; 686 } 687 if (e == 0) 688 return (1); 689 return (e > 0 ? e : 0); 690 } 691 if (table->info->chain_count > 0) { 692 wchar_t name[COLLATE_STR_LEN]; 693 collate_chain_t *match_chain; 694 int clen; 695 696 wcsncpy (name, str, len); 697 name[len] = 0; 698 match_chain = chainsearch(table, name, &clen); 699 if (match_chain) { 700 e = match_chain->pri[0]; 701 if (e == 0) 702 return (1); 703 return (e < 0 ? -e : e); 704 } 705 } 706 return (0); 707 } 708