1 /*- 2 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 5 * at Electronni Visti IA, Kiev, Ukraine. 6 * All rights reserved. 7 * 8 * Copyright (c) 2011 The FreeBSD Foundation 9 * All rights reserved. 10 * Portions of this software were developed by David Chisnall 11 * under sponsorship from the FreeBSD Foundation. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Adapted to xlocale by John Marino <draco@marino.st> 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "namespace.h" 41 42 #include <sys/types.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include <assert.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <wchar.h> 51 #include <errno.h> 52 #include <unistd.h> 53 #include <fcntl.h> 54 #include "un-namespace.h" 55 56 #include "collate.h" 57 #include "setlocale.h" 58 #include "ldpart.h" 59 #include "libc_private.h" 60 61 struct xlocale_collate __xlocale_global_collate = { 62 {{0}, "C"}, 1, 0, 0, 0 63 }; 64 65 struct xlocale_collate __xlocale_C_collate = { 66 {{0}, "C"}, 1, 0, 0, 0 67 }; 68 69 static int 70 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); 71 72 static void 73 destruct_collate(void *t) 74 { 75 struct xlocale_collate *table = t; 76 if (table->map && (table->maplen > 0)) { 77 (void) munmap(table->map, table->maplen); 78 } 79 free(t); 80 } 81 82 void * 83 __collate_load(const char *encoding, __unused locale_t unused) 84 { 85 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 86 return &__xlocale_C_collate; 87 } 88 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1); 89 table->header.header.destructor = destruct_collate; 90 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing 91 // the caching outside of this section 92 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) { 93 xlocale_release(table); 94 return NULL; 95 } 96 return table; 97 } 98 99 /** 100 * Load the collation tables for the specified encoding into the global table. 101 */ 102 int 103 __collate_load_tables(const char *encoding) 104 { 105 106 return (__collate_load_tables_l(encoding, &__xlocale_global_collate)); 107 } 108 109 int 110 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) 111 { 112 int i, chains, z; 113 char *buf; 114 char *TMP; 115 char *map; 116 collate_info_t *info; 117 struct stat sbuf; 118 int fd; 119 120 table->__collate_load_error = 1; 121 122 /* 'encoding' must be already checked. */ 123 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 124 return (_LDP_CACHE); 125 } 126 127 asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding); 128 if (buf == NULL) 129 return (_LDP_ERROR); 130 131 if ((fd = _open(buf, O_RDONLY)) < 0) { 132 free(buf); 133 return (_LDP_ERROR); 134 } 135 free(buf); 136 if (_fstat(fd, &sbuf) < 0) { 137 (void) _close(fd); 138 return (_LDP_ERROR); 139 } 140 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 141 (void) _close(fd); 142 errno = EINVAL; 143 return (_LDP_ERROR); 144 } 145 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 146 (void) _close(fd); 147 if ((TMP = map) == NULL) { 148 return (_LDP_ERROR); 149 } 150 151 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 152 (void) munmap(map, sbuf.st_size); 153 errno = EINVAL; 154 return (_LDP_ERROR); 155 } 156 TMP += COLLATE_STR_LEN; 157 158 info = (void *)TMP; 159 TMP += sizeof (*info); 160 161 if ((info->directive_count < 1) || 162 (info->directive_count >= COLL_WEIGHTS_MAX) || 163 ((chains = info->chain_count) < 0)) { 164 (void) munmap(map, sbuf.st_size); 165 errno = EINVAL; 166 return (_LDP_ERROR); 167 } 168 169 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 170 (sizeof (collate_chain_t) * chains) + 171 (sizeof (collate_large_t) * info->large_count); 172 for (z = 0; z < info->directive_count; z++) { 173 i += sizeof (collate_subst_t) * info->subst_count[z]; 174 } 175 if (i != (sbuf.st_size - (TMP - map))) { 176 (void) munmap(map, sbuf.st_size); 177 errno = EINVAL; 178 return (_LDP_ERROR); 179 } 180 181 table->info = info; 182 table->char_pri_table = (void *)TMP; 183 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 184 185 for (z = 0; z < info->directive_count; z++) { 186 if (info->subst_count[z] > 0) { 187 table->subst_table[z] = (void *)TMP; 188 TMP += info->subst_count[z] * sizeof (collate_subst_t); 189 } else { 190 table->subst_table[z] = NULL; 191 } 192 } 193 194 if (chains > 0) { 195 table->chain_pri_table = (void *)TMP; 196 TMP += chains * sizeof (collate_chain_t); 197 } else 198 table->chain_pri_table = NULL; 199 if (info->large_count > 0) 200 table->large_pri_table = (void *)TMP; 201 else 202 table->large_pri_table = NULL; 203 204 table->__collate_load_error = 0; 205 return (_LDP_LOADED); 206 } 207 208 static const int32_t * 209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass) 210 { 211 const collate_subst_t *p; 212 int n = table->info->subst_count[pass]; 213 214 if (n == 0) 215 return (NULL); 216 217 if (pass >= table->info->directive_count) 218 return (NULL); 219 220 if (!(key & COLLATE_SUBST_PRIORITY)) 221 return (NULL); 222 223 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 224 assert(p->key == key); 225 return (p->pri); 226 } 227 228 static collate_chain_t * 229 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) 230 { 231 int low = 0; 232 int high = table->info->chain_count - 1;; 233 int next, compar, l; 234 collate_chain_t *p; 235 collate_chain_t *tab = table->chain_pri_table; 236 237 if (high < 0) 238 return (NULL); 239 240 while (low <= high) { 241 next = (low + high) / 2; 242 p = tab + next; 243 compar = *key - *p->str; 244 if (compar == 0) { 245 l = wcsnlen(p->str, COLLATE_STR_LEN); 246 compar = wcsncmp(key, p->str, l); 247 if (compar == 0) { 248 *len = l; 249 return (p); 250 } 251 } 252 if (compar > 0) 253 low = next + 1; 254 else 255 high = next - 1; 256 } 257 return (NULL); 258 } 259 260 static collate_large_t * 261 largesearch(struct xlocale_collate *table, const wchar_t key) 262 { 263 int low = 0; 264 int high = table->info->large_count - 1; 265 int next, compar; 266 collate_large_t *p; 267 collate_large_t *tab = table->large_pri_table; 268 269 if (high < 0) 270 return (NULL); 271 272 while (low <= high) { 273 next = (low + high) / 2; 274 p = tab + next; 275 compar = key - p->val; 276 if (compar == 0) 277 return (p); 278 if (compar > 0) 279 low = next + 1; 280 else 281 high = next - 1; 282 } 283 return (NULL); 284 } 285 286 void 287 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, 288 int *pri, int which, const int **state) 289 { 290 collate_chain_t *p2; 291 collate_large_t *match; 292 int p, l; 293 const int *sptr; 294 295 /* 296 * If this is the "last" pass for the UNDEFINED, then 297 * we just return the priority itself. 298 */ 299 if (which >= table->info->directive_count) { 300 *pri = *t; 301 *len = 1; 302 *state = NULL; 303 return; 304 } 305 306 /* 307 * If we have remaining substitution data from a previous 308 * call, consume it first. 309 */ 310 if ((sptr = *state) != NULL) { 311 *pri = *sptr; 312 sptr++; 313 if ((sptr == *state) || (sptr == NULL)) 314 *state = NULL; 315 else 316 *state = sptr; 317 *len = 0; 318 return; 319 } 320 321 /* No active substitutions */ 322 *len = 1; 323 324 /* 325 * Check for composites such as dipthongs that collate as a 326 * single element (aka chains or collating-elements). 327 */ 328 if (((p2 = chainsearch(table, t, &l)) != NULL) && 329 ((p = p2->pri[which]) >= 0)) { 330 331 *len = l; 332 *pri = p; 333 334 } else if (*t <= UCHAR_MAX) { 335 336 /* 337 * Character is a small (8-bit) character. 338 * We just look these up directly for speed. 339 */ 340 *pri = table->char_pri_table[*t].pri[which]; 341 342 } else if ((table->info->large_count > 0) && 343 ((match = largesearch(table, *t)) != NULL)) { 344 345 /* 346 * Character was found in the extended table. 347 */ 348 *pri = match->pri.pri[which]; 349 350 } else { 351 /* 352 * Character lacks a specific definition. 353 */ 354 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { 355 /* Mask off sign bit to prevent ordering confusion. */ 356 *pri = (*t & COLLATE_MAX_PRIORITY); 357 } else { 358 *pri = table->info->undef_pri[which]; 359 } 360 /* No substitutions for undefined characters! */ 361 return; 362 } 363 364 /* 365 * Try substituting (expanding) the character. We are 366 * currently doing this *after* the chain compression. I 367 * think it should not matter, but this way might be slightly 368 * faster. 369 * 370 * We do this after the priority search, as this will help us 371 * to identify a single key value. In order for this to work, 372 * its important that the priority assigned to a given element 373 * to be substituted be unique for that level. The localedef 374 * code ensures this for us. 375 */ 376 if ((sptr = substsearch(table, *pri, which)) != NULL) { 377 if ((*pri = *sptr) > 0) { 378 sptr++; 379 *state = *sptr ? sptr : NULL; 380 } 381 } 382 383 } 384 385 /* 386 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 387 * NOT NULL terminate. That is left to the caller. 388 */ 389 size_t 390 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, 391 size_t room) 392 { 393 int pri; 394 int len; 395 const wchar_t *t; 396 wchar_t *tr = NULL; 397 int direc; 398 int pass; 399 const int32_t *state; 400 size_t want = 0; 401 size_t need = 0; 402 int ndir = table->info->directive_count; 403 404 assert(src); 405 406 for (pass = 0; pass <= ndir; pass++) { 407 408 state = NULL; 409 410 if (pass != 0) { 411 /* insert level separator from the previous pass */ 412 if (room) { 413 *xf++ = 1; 414 room--; 415 } 416 want++; 417 } 418 419 /* special pass for undefined */ 420 if (pass == ndir) { 421 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 422 } else { 423 direc = table->info->directive[pass]; 424 } 425 426 t = src; 427 428 if (direc & DIRECTIVE_BACKWARD) { 429 wchar_t *bp, *fp, c; 430 free(tr); 431 if ((tr = wcsdup(t)) == NULL) { 432 errno = ENOMEM; 433 goto fail; 434 } 435 bp = tr; 436 fp = tr + wcslen(tr) - 1; 437 while (bp < fp) { 438 c = *bp; 439 *bp++ = *fp; 440 *fp-- = c; 441 } 442 t = (const wchar_t *)tr; 443 } 444 445 if (direc & DIRECTIVE_POSITION) { 446 while (*t || state) { 447 _collate_lookup(table, t, &len, &pri, pass, &state); 448 t += len; 449 if (pri <= 0) { 450 if (pri < 0) { 451 errno = EINVAL; 452 goto fail; 453 } 454 pri = COLLATE_MAX_PRIORITY; 455 } 456 if (room) { 457 *xf++ = pri; 458 room--; 459 } 460 want++; 461 need = want; 462 } 463 } else { 464 while (*t || state) { 465 _collate_lookup(table, t, &len, &pri, pass, &state); 466 t += len; 467 if (pri <= 0) { 468 if (pri < 0) { 469 errno = EINVAL; 470 goto fail; 471 } 472 continue; 473 } 474 if (room) { 475 *xf++ = pri; 476 room--; 477 } 478 want++; 479 need = want; 480 } 481 } 482 } 483 free(tr); 484 return (need); 485 486 fail: 487 free(tr); 488 return ((size_t)(-1)); 489 } 490 491 /* 492 * In the non-POSIX case, we transform each character into a string of 493 * characters representing the character's priority. Since char is usually 494 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 495 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 496 * bits per byte. 497 * 498 * It turns out that we sometimes have real priorities that are 499 * 31-bits wide. (But: be careful using priorities where the high 500 * order bit is set -- i.e. the priority is negative. The sort order 501 * may be surprising!) 502 * 503 * TODO: This would be a good area to optimize somewhat. It turns out 504 * that real prioririties *except for the last UNDEFINED pass* are generally 505 * very small. We need the localedef code to precalculate the max 506 * priority for us, and ideally also give us a mask, and then we could 507 * severely limit what we expand to. 508 */ 509 #define XFRM_BYTES 6 510 #define XFRM_OFFSET ('0') /* make all printable characters */ 511 #define XFRM_SHIFT 6 512 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 513 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 514 515 static int 516 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) 517 { 518 /* we use unsigned to ensure zero fill on right shift */ 519 uint32_t val = (uint32_t)table->info->pri_count[pass]; 520 int nc = 0; 521 522 while (val) { 523 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 524 pri >>= XFRM_SHIFT; 525 val >>= XFRM_SHIFT; 526 p++; 527 nc++; 528 } 529 return (nc); 530 } 531 532 size_t 533 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, 534 size_t room) 535 { 536 int pri; 537 int len; 538 const wchar_t *t; 539 wchar_t *tr = NULL; 540 int direc; 541 int pass; 542 const int32_t *state; 543 size_t want = 0; 544 size_t need = 0; 545 int b; 546 uint8_t buf[XFRM_BYTES]; 547 int ndir = table->info->directive_count; 548 549 assert(src); 550 551 for (pass = 0; pass <= ndir; pass++) { 552 553 state = NULL; 554 555 if (pass != 0) { 556 /* insert level separator from the previous pass */ 557 if (room) { 558 *xf++ = XFRM_SEP; 559 room--; 560 } 561 want++; 562 } 563 564 /* special pass for undefined */ 565 if (pass == ndir) { 566 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 567 } else { 568 direc = table->info->directive[pass]; 569 } 570 571 t = src; 572 573 if (direc & DIRECTIVE_BACKWARD) { 574 wchar_t *bp, *fp, c; 575 free(tr); 576 if ((tr = wcsdup(t)) == NULL) { 577 errno = ENOMEM; 578 goto fail; 579 } 580 bp = tr; 581 fp = tr + wcslen(tr) - 1; 582 while (bp < fp) { 583 c = *bp; 584 *bp++ = *fp; 585 *fp-- = c; 586 } 587 t = (const wchar_t *)tr; 588 } 589 590 if (direc & DIRECTIVE_POSITION) { 591 while (*t || state) { 592 593 _collate_lookup(table, t, &len, &pri, pass, &state); 594 t += len; 595 if (pri <= 0) { 596 if (pri < 0) { 597 errno = EINVAL; 598 goto fail; 599 } 600 pri = COLLATE_MAX_PRIORITY; 601 } 602 603 b = xfrm(table, buf, pri, pass); 604 want += b; 605 if (room) { 606 while (b) { 607 b--; 608 if (room) { 609 *xf++ = buf[b]; 610 room--; 611 } 612 } 613 } 614 need = want; 615 } 616 } else { 617 while (*t || state) { 618 _collate_lookup(table, t, &len, &pri, pass, &state); 619 t += len; 620 if (pri <= 0) { 621 if (pri < 0) { 622 errno = EINVAL; 623 goto fail; 624 } 625 continue; 626 } 627 628 b = xfrm(table, buf, pri, pass); 629 want += b; 630 if (room) { 631 632 while (b) { 633 b--; 634 if (room) { 635 *xf++ = buf[b]; 636 room--; 637 } 638 } 639 } 640 need = want; 641 } 642 } 643 } 644 free(tr); 645 return (need); 646 647 fail: 648 free(tr); 649 return ((size_t)(-1)); 650 } 651 652 /* 653 * __collate_equiv_value returns the primary collation value for the given 654 * collating symbol specified by str and len. Zero or negative is returned 655 * if the collating symbol was not found. This function is used by bracket 656 * code in the TRE regex library. 657 */ 658 int 659 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) 660 { 661 int32_t e; 662 663 if (len < 1 || len >= COLLATE_STR_LEN) 664 return (-1); 665 666 FIX_LOCALE(locale); 667 struct xlocale_collate *table = 668 (struct xlocale_collate*)locale->components[XLC_COLLATE]; 669 670 if (table->__collate_load_error) 671 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); 672 673 if (len == 1) { 674 e = -1; 675 if (*str <= UCHAR_MAX) 676 e = table->char_pri_table[*str].pri[0]; 677 else if (table->info->large_count > 0) { 678 collate_large_t *match_large; 679 match_large = largesearch(table, *str); 680 if (match_large) 681 e = match_large->pri.pri[0]; 682 } 683 if (e == 0) 684 return (1); 685 return (e > 0 ? e : 0); 686 } 687 if (table->info->chain_count > 0) { 688 wchar_t name[COLLATE_STR_LEN]; 689 collate_chain_t *match_chain; 690 int clen; 691 692 wcsncpy (name, str, len); 693 name[len] = 0; 694 match_chain = chainsearch(table, name, &clen); 695 if (match_chain) { 696 e = match_chain->pri[0]; 697 if (e == 0) 698 return (1); 699 return (e < 0 ? -e : e); 700 } 701 } 702 return (0); 703 } 704