1 /* 2 * Copyright 2014 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 5 * at Electronni Visti IA, Kiev, Ukraine. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include "lint.h" 31 #include "file64.h" 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <stddef.h> 35 #include <string.h> 36 #include <wchar.h> 37 #include <errno.h> 38 #include <unistd.h> 39 #include <ctype.h> 40 #include <unistd.h> 41 #include <fcntl.h> 42 #include <assert.h> 43 #include <sys/stat.h> 44 #include <sys/mman.h> 45 46 #include "collate.h" 47 #include "localeimpl.h" 48 49 /* Check file format vs libc runtime. (See collatefile.h) */ 50 #if COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX 51 #error "COLL_WEIGHTS_MAX != COLLATE_WEIGHTS_MAX" 52 #endif 53 54 /* 55 * See the comments in usr/src/cmd/localedef/collate.c for further 56 * information. It would also be very helpful to have a copy of the 57 * POSIX standard for collation (in the locale format manual page) 58 * handy (www.opengroup.org). 59 */ 60 61 /* 62 * POSIX uses empty tables and falls down to strcmp. 63 */ 64 struct lc_collate lc_collate_posix = { 65 .lc_is_posix = 1, 66 }; 67 68 struct locdata __posix_collate_locdata = { 69 .l_lname = "C", 70 .l_data = { &lc_collate_posix } 71 }; 72 73 74 struct locdata * 75 __lc_collate_load(const char *locname) 76 { 77 int i, chains, z; 78 char buf[PATH_MAX]; 79 char *TMP; 80 char *map; 81 collate_info_t *info; 82 struct stat sbuf; 83 int fd; 84 struct locdata *ldata; 85 struct lc_collate *lcc; 86 87 /* 88 * Slurp the locale file into the cache. 89 */ 90 91 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 92 _PathLocale, locname); 93 94 if ((fd = open(buf, O_RDONLY)) < 0) { 95 errno = EINVAL; 96 return (NULL); 97 } 98 if (fstat(fd, &sbuf) < 0) { 99 (void) close(fd); 100 errno = EINVAL; 101 return (NULL); 102 } 103 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 104 (void) close(fd); 105 errno = EINVAL; 106 return (NULL); 107 } 108 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 109 (void) close(fd); 110 if ((TMP = map) == NULL) { 111 errno = EINVAL; 112 return (NULL); 113 } 114 115 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 116 (void) munmap(map, sbuf.st_size); 117 errno = EINVAL; 118 return (NULL); 119 } 120 TMP += COLLATE_STR_LEN; 121 122 info = (void *)TMP; 123 TMP += sizeof (*info); 124 125 if ((info->directive_count < 1) || 126 (info->directive_count >= COLL_WEIGHTS_MAX) || 127 ((chains = info->chain_count) < 0)) { 128 (void) munmap(map, sbuf.st_size); 129 errno = EINVAL; 130 return (NULL); 131 } 132 133 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + 134 (sizeof (collate_chain_t) * chains) + 135 (sizeof (collate_large_t) * info->large_count); 136 for (z = 0; z < info->directive_count; z++) { 137 i += sizeof (collate_subst_t) * info->subst_count[z]; 138 } 139 if (i != (sbuf.st_size - (TMP - map))) { 140 (void) munmap(map, sbuf.st_size); 141 errno = EINVAL; 142 return (NULL); 143 } 144 145 146 if ((ldata = __locdata_alloc(locname, sizeof (*lcc))) == NULL) { 147 (void) munmap(map, sbuf.st_size); 148 return (NULL); 149 } 150 lcc = ldata->l_data[0]; 151 ldata->l_map = map; 152 ldata->l_map_len = sbuf.st_size; 153 154 lcc->lc_info = info; 155 lcc->lc_directive_count = info->directive_count; 156 lcc->lc_large_count = info->large_count; 157 158 for (z = 0; z < COLL_WEIGHTS_MAX; z++) { 159 lcc->lc_directive[z] = info->directive[z]; 160 lcc->lc_subst_count[z] = info->subst_count[z]; 161 lcc->lc_pri_count[z] = info->pri_count[z]; 162 lcc->lc_undef_pri[z] = info->undef_pri[z]; 163 } 164 165 lcc->lc_char_table = (void *)TMP; 166 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); 167 168 for (z = 0; z < lcc->lc_directive_count; z++) { 169 int count; 170 if ((count = lcc->lc_subst_count[z]) > 0) { 171 lcc->lc_subst_table[z] = (void *)TMP; 172 TMP += count * sizeof (collate_subst_t); 173 } else { 174 lcc->lc_subst_table[z] = NULL; 175 } 176 } 177 178 if (chains > 0) { 179 lcc->lc_chain_table = (void *)TMP; 180 TMP += chains * sizeof (collate_chain_t); 181 } else 182 lcc->lc_chain_table = NULL; 183 lcc->lc_chain_count = chains; 184 if (lcc->lc_large_count > 0) 185 lcc->lc_large_table = (void *)TMP; 186 else 187 lcc->lc_large_table = NULL; 188 189 return (ldata); 190 } 191 192 static const int32_t * 193 substsearch(const struct lc_collate *lcc, const wchar_t key, int pass) 194 { 195 const collate_subst_t *p; 196 int n = lcc->lc_subst_count[pass]; 197 198 if (n == 0) 199 return (NULL); 200 201 if (pass >= lcc->lc_directive_count) 202 return (NULL); 203 204 if (!(key & COLLATE_SUBST_PRIORITY)) 205 return (NULL); 206 207 p = lcc->lc_subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); 208 assert(p->key == key); 209 return (p->pri); 210 } 211 212 static collate_chain_t * 213 chainsearch(const struct lc_collate *lcc, const wchar_t *key, int *len) 214 { 215 int low = 0; 216 int high = lcc->lc_info->chain_count - 1; 217 int next, compar, l; 218 collate_chain_t *p; 219 collate_chain_t *tab = lcc->lc_chain_table; 220 221 if (high < 0) 222 return (NULL); 223 224 while (low <= high) { 225 next = (low + high) / 2; 226 p = tab + next; 227 compar = *key - *p->str; 228 if (compar == 0) { 229 l = wcsnlen(p->str, COLLATE_STR_LEN); 230 compar = wcsncmp(key, p->str, l); 231 if (compar == 0) { 232 *len = l; 233 return (p); 234 } 235 } 236 if (compar > 0) 237 low = next + 1; 238 else 239 high = next - 1; 240 } 241 return (NULL); 242 } 243 244 static collate_large_t * 245 largesearch(const struct lc_collate *lcc, const wchar_t key) 246 { 247 int low = 0; 248 int high = lcc->lc_info->large_count - 1; 249 int next, compar; 250 collate_large_t *p; 251 collate_large_t *tab = lcc->lc_large_table; 252 253 if (high < 0) 254 return (NULL); 255 256 while (low <= high) { 257 next = (low + high) / 2; 258 p = tab + next; 259 compar = key - p->val; 260 if (compar == 0) 261 return (p); 262 if (compar > 0) 263 low = next + 1; 264 else 265 high = next - 1; 266 } 267 return (NULL); 268 } 269 270 void 271 _collate_lookup(const struct lc_collate *lcc, const wchar_t *t, 272 int *len, int *pri, int which, const int **state) 273 { 274 collate_chain_t *p2; 275 collate_large_t *match; 276 int p, l; 277 const int *sptr; 278 279 /* 280 * If this is the "last" pass for the UNDEFINED, then 281 * we just return the priority itself. 282 */ 283 if (which >= lcc->lc_directive_count) { 284 *pri = *t; 285 *len = 1; 286 *state = NULL; 287 return; 288 } 289 290 /* 291 * If we have remaining substitution data from a previous 292 * call, consume it first. 293 */ 294 if ((sptr = *state) != NULL) { 295 *pri = *sptr; 296 sptr++; 297 if ((sptr == *state) || (sptr == NULL)) 298 *state = NULL; 299 else 300 *state = sptr; 301 *len = 0; 302 return; 303 } 304 305 /* No active substitutions */ 306 *len = 1; 307 308 /* 309 * Check for composites such as dipthongs that collate as a 310 * single element (aka chains or collating-elements). 311 */ 312 if (((p2 = chainsearch(lcc, t, &l)) != NULL) && 313 ((p = p2->pri[which]) >= 0)) { 314 315 *len = l; 316 *pri = p; 317 318 } else if (*t <= UCHAR_MAX) { 319 320 /* 321 * Character is a small (8-bit) character. 322 * We just look these up directly for speed. 323 */ 324 *pri = lcc->lc_char_table[*t].pri[which]; 325 326 } else if ((lcc->lc_info->large_count > 0) && 327 ((match = largesearch(lcc, *t)) != NULL)) { 328 329 /* 330 * Character was found in the extended table. 331 */ 332 *pri = match->pri.pri[which]; 333 334 } else { 335 /* 336 * Character lacks a specific definition. 337 */ 338 if (lcc->lc_directive[which] & DIRECTIVE_UNDEFINED) { 339 /* Mask off sign bit to prevent ordering confusion. */ 340 *pri = (*t & COLLATE_MAX_PRIORITY); 341 } else { 342 *pri = lcc->lc_undef_pri[which]; 343 } 344 /* No substitutions for undefined characters! */ 345 return; 346 } 347 348 /* 349 * Try substituting (expanding) the character. We are 350 * currently doing this *after* the chain compression. I 351 * think it should not matter, but this way might be slightly 352 * faster. 353 * 354 * We do this after the priority search, as this will help us 355 * to identify a single key value. In order for this to work, 356 * its important that the priority assigned to a given element 357 * to be substituted be unique for that level. The localedef 358 * code ensures this for us. 359 */ 360 if ((sptr = substsearch(lcc, *pri, which)) != NULL) { 361 if ((*pri = *sptr) > 0) { 362 sptr++; 363 *state = *sptr ? sptr : NULL; 364 } 365 } 366 367 } 368 369 /* 370 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 371 * NOT NULL terminate. That is left to the caller. 372 */ 373 size_t 374 _collate_wxfrm(const struct lc_collate *lcc, const wchar_t *src, wchar_t *xf, 375 size_t room) 376 { 377 int pri; 378 int len; 379 const wchar_t *t; 380 wchar_t *tr = NULL; 381 int direc; 382 int pass; 383 const int32_t *state; 384 size_t want = 0; 385 size_t need = 0; 386 int ndir = lcc->lc_directive_count; 387 388 assert(src); 389 390 for (pass = 0; pass <= ndir; pass++) { 391 392 state = NULL; 393 394 if (pass != 0) { 395 /* insert level separator from the previous pass */ 396 if (room) { 397 *xf++ = 1; 398 room--; 399 } 400 want++; 401 } 402 403 /* special pass for undefined */ 404 if (pass == ndir) { 405 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 406 } else { 407 direc = lcc->lc_directive[pass]; 408 } 409 410 t = src; 411 412 if (direc & DIRECTIVE_BACKWARD) { 413 wchar_t *bp, *fp, c; 414 if (tr) 415 free(tr); 416 if ((tr = wcsdup(t)) == NULL) { 417 errno = ENOMEM; 418 goto fail; 419 } 420 bp = tr; 421 fp = tr + wcslen(tr) - 1; 422 while (bp < fp) { 423 c = *bp; 424 *bp++ = *fp; 425 *fp-- = c; 426 } 427 t = (const wchar_t *)tr; 428 } 429 430 if (direc & DIRECTIVE_POSITION) { 431 while (*t || state) { 432 _collate_lookup(lcc, t, &len, &pri, pass, 433 &state); 434 t += len; 435 if (pri <= 0) { 436 if (pri < 0) { 437 errno = EINVAL; 438 goto fail; 439 } 440 state = NULL; 441 pri = COLLATE_MAX_PRIORITY; 442 } 443 if (room) { 444 *xf++ = pri; 445 room--; 446 } 447 want++; 448 need = want; 449 } 450 } else { 451 while (*t || state) { 452 _collate_lookup(lcc, t, &len, &pri, pass, 453 &state); 454 t += len; 455 if (pri <= 0) { 456 if (pri < 0) { 457 errno = EINVAL; 458 goto fail; 459 } 460 state = NULL; 461 continue; 462 } 463 if (room) { 464 *xf++ = pri; 465 room--; 466 } 467 want++; 468 need = want; 469 } 470 } 471 } 472 473 free(tr); 474 return (need); 475 476 fail: 477 free(tr); 478 return ((size_t)(-1)); 479 } 480 481 /* 482 * In the non-POSIX case, we transform each character into a string of 483 * characters representing the character's priority. Since char is usually 484 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 485 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 486 * bits per byte. 487 * 488 * It turns out that we sometimes have real priorities that are 489 * 31-bits wide. (But: be careful using priorities where the high 490 * order bit is set -- i.e. the priority is negative. The sort order 491 * may be surprising!) 492 * 493 * TODO: This would be a good area to optimize somewhat. It turns out 494 * that real prioririties *except for the last UNDEFINED pass* are generally 495 * very small. We need the localedef code to precalculate the max 496 * priority for us, and ideally also give us a mask, and then we could 497 * severely limit what we expand to. 498 */ 499 #define XFRM_BYTES 6 500 #define XFRM_OFFSET ('0') /* make all printable characters */ 501 #define XFRM_SHIFT 6 502 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 503 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 504 505 static int 506 xfrm(locale_t loc, unsigned char *p, int pri, int pass) 507 { 508 /* we use unsigned to ensure zero fill on right shift */ 509 uint32_t val = (uint32_t)loc->collate->lc_pri_count[pass]; 510 int nc = 0; 511 512 while (val) { 513 *p = (pri & XFRM_MASK) + XFRM_OFFSET; 514 pri >>= XFRM_SHIFT; 515 val >>= XFRM_SHIFT; 516 p++; 517 nc++; 518 } 519 return (nc); 520 } 521 522 size_t 523 _collate_sxfrm(const wchar_t *src, char *xf, size_t room, locale_t loc) 524 { 525 int pri; 526 int len; 527 const wchar_t *t; 528 wchar_t *tr = NULL; 529 int direc; 530 int pass; 531 const int32_t *state; 532 size_t want = 0; 533 size_t need = 0; 534 int b; 535 uint8_t buf[XFRM_BYTES]; 536 const struct lc_collate *lcc = loc->collate; 537 int ndir = lcc->lc_directive_count; 538 539 assert(src); 540 541 for (pass = 0; pass <= ndir; pass++) { 542 543 state = NULL; 544 545 if (pass != 0) { 546 /* insert level separator from the previous pass */ 547 if (room) { 548 *xf++ = XFRM_SEP; 549 room--; 550 } 551 want++; 552 } 553 554 /* special pass for undefined */ 555 if (pass == ndir) { 556 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 557 } else { 558 direc = lcc->lc_directive[pass]; 559 } 560 561 t = src; 562 563 if (direc & DIRECTIVE_BACKWARD) { 564 wchar_t *bp, *fp, c; 565 if (tr) 566 free(tr); 567 if ((tr = wcsdup(t)) == NULL) { 568 errno = ENOMEM; 569 goto fail; 570 } 571 bp = tr; 572 fp = tr + wcslen(tr) - 1; 573 while (bp < fp) { 574 c = *bp; 575 *bp++ = *fp; 576 *fp-- = c; 577 } 578 t = (const wchar_t *)tr; 579 } 580 581 if (direc & DIRECTIVE_POSITION) { 582 while (*t || state) { 583 584 _collate_lookup(lcc, t, &len, &pri, pass, 585 &state); 586 t += len; 587 if (pri <= 0) { 588 if (pri < 0) { 589 errno = EINVAL; 590 goto fail; 591 } 592 state = NULL; 593 pri = COLLATE_MAX_PRIORITY; 594 } 595 596 b = xfrm(loc, buf, pri, pass); 597 want += b; 598 if (room) { 599 while (b) { 600 b--; 601 if (room) { 602 *xf++ = buf[b]; 603 room--; 604 } 605 } 606 } 607 need = want; 608 } 609 } else { 610 while (*t || state) { 611 _collate_lookup(lcc, t, &len, &pri, pass, 612 &state); 613 t += len; 614 if (pri <= 0) { 615 if (pri < 0) { 616 errno = EINVAL; 617 goto fail; 618 } 619 state = NULL; 620 continue; 621 } 622 623 b = xfrm(loc, buf, pri, pass); 624 want += b; 625 if (room) { 626 627 while (b) { 628 b--; 629 if (room) { 630 *xf++ = buf[b]; 631 room--; 632 } 633 } 634 } 635 need = want; 636 } 637 } 638 } 639 640 free(tr); 641 return (need); 642 643 fail: 644 free(tr); 645 return ((size_t)(-1)); 646 } 647