1 /* 2 * Copright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> 4 * at Electronni Visti IA, Kiev, Ukraine. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "lint.h" 30 #include "file64.h" 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <stddef.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include <errno.h> 37 #include <unistd.h> 38 #include <ctype.h> 39 #include <unistd.h> 40 #include <fcntl.h> 41 #include <assert.h> 42 #include <sys/stat.h> 43 #include <sys/mman.h> 44 45 #include "collate.h" 46 #include "setlocale.h" 47 #include "ldpart.h" 48 49 static collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; 50 static collate_char_pri_t *char_pri_table; 51 static collate_large_pri_t *large_pri_table; 52 static collate_chain_pri_t *chain_pri_table; 53 static char *cache = NULL; 54 static size_t cachesz; 55 static char collate_encoding[ENCODING_LEN + 1]; 56 57 /* Exposed externally to other parts of libc. */ 58 collate_info_t *_collate_info; 59 int _collate_load_error = 1; 60 61 62 int 63 _collate_load_tables(const char *encoding) 64 { 65 int i, chains, z; 66 char buf[PATH_MAX]; 67 char *TMP; 68 char *map; 69 collate_info_t *info; 70 struct stat sbuf; 71 int fd; 72 73 /* 'encoding' must be already checked. */ 74 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { 75 _collate_load_error = 1; 76 return (_LDP_CACHE); 77 } 78 79 /* 80 * If the locale name is the same as our cache, use the cache. 81 */ 82 if (cache && (strncmp(encoding, collate_encoding, ENCODING_LEN) == 0)) { 83 _collate_load_error = 0; 84 return (_LDP_CACHE); 85 } 86 87 /* 88 * Slurp the locale file into the cache. 89 */ 90 91 (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE/LCL_DATA", 92 _PathLocale, encoding); 93 94 if ((fd = open(buf, O_RDONLY)) < 0) 95 return (_LDP_ERROR); 96 if (fstat(fd, &sbuf) < 0) { 97 (void) close(fd); 98 return (_LDP_ERROR); 99 } 100 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { 101 (void) close(fd); 102 errno = EINVAL; 103 return (_LDP_ERROR); 104 } 105 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 106 (void) close(fd); 107 if ((TMP = map) == NULL) { 108 return (_LDP_ERROR); 109 } 110 111 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { 112 (void) munmap(map, sbuf.st_size); 113 errno = EINVAL; 114 return (_LDP_ERROR); 115 } 116 TMP += COLLATE_STR_LEN; 117 118 info = (void *)TMP; 119 TMP += sizeof (*info); 120 121 if ((info->directive_count < 1) || 122 (info->directive_count >= COLL_WEIGHTS_MAX) || 123 ((chains = info->chain_count) < 0)) { 124 (void) munmap(map, sbuf.st_size); 125 errno = EINVAL; 126 return (_LDP_ERROR); 127 } 128 129 i = (sizeof (collate_char_pri_t) * (UCHAR_MAX + 1)) + 130 (sizeof (collate_chain_pri_t) * chains) + 131 (sizeof (collate_large_pri_t) * info->large_pri_count); 132 for (z = 0; z < (info->directive_count); z++) { 133 i += sizeof (collate_subst_t) * info->subst_count[z]; 134 } 135 if (i != (sbuf.st_size - (TMP - map))) { 136 (void) munmap(map, sbuf.st_size); 137 errno = EINVAL; 138 return (_LDP_ERROR); 139 } 140 141 char_pri_table = (void *)TMP; 142 TMP += sizeof (collate_char_pri_t) * (UCHAR_MAX + 1); 143 144 for (z = 0; z < info->directive_count; z++) { 145 if (info->subst_count[z] > 0) { 146 subst_table[z] = (void *)TMP; 147 TMP += info->subst_count[z] * sizeof (collate_subst_t); 148 } else { 149 subst_table[z] = NULL; 150 } 151 } 152 153 if (chains > 0) { 154 chain_pri_table = (void *)TMP; 155 TMP += chains * sizeof (collate_chain_pri_t); 156 } else 157 chain_pri_table = NULL; 158 if (info->large_pri_count > 0) 159 large_pri_table = (void *)TMP; 160 else 161 large_pri_table = NULL; 162 163 (void) strlcpy(collate_encoding, encoding, ENCODING_LEN); 164 _collate_info = info; 165 166 if (cache) 167 (void) munmap(cache, cachesz); 168 169 cache = map; 170 cachesz = sbuf.st_size; 171 _collate_load_error = 0; 172 173 return (_LDP_LOADED); 174 } 175 176 /* 177 * Note: for performance reasons, we have expanded bsearch here. This avoids 178 * function call overhead with each comparison. 179 */ 180 181 static int32_t * 182 substsearch(const wchar_t key, int pass) 183 { 184 int low; 185 int high; 186 int next, compar; 187 collate_subst_t *p; 188 collate_subst_t *tab; 189 int n = _collate_info->subst_count[pass]; 190 191 if (n == 0) 192 return (NULL); 193 194 tab = subst_table[pass]; 195 low = 0; 196 high = n - 1; 197 while (low <= high) { 198 next = (low + high) / 2; 199 p = tab + next; 200 compar = key - p->key; 201 if (compar == 0) 202 return (p->pri); 203 if (compar > 0) 204 low = next + 1; 205 else 206 high = next - 1; 207 } 208 return (NULL); 209 } 210 211 static collate_chain_pri_t * 212 chainsearch(const wchar_t *key, int *len) 213 { 214 int low; 215 int high; 216 int next, compar, l; 217 collate_chain_pri_t *p; 218 collate_chain_pri_t *tab; 219 220 if (_collate_info->chain_count == 0) 221 return (NULL); 222 223 low = 0; 224 high = _collate_info->chain_count - 1; 225 tab = chain_pri_table; 226 227 while (low <= high) { 228 next = (low + high) / 2; 229 p = tab + next; 230 compar = *key - *p->str; 231 if (compar == 0) { 232 l = wcsnlen(p->str, COLLATE_STR_LEN); 233 compar = wcsncmp(key, p->str, l); 234 if (compar == 0) { 235 *len = l; 236 return (p); 237 } 238 } 239 if (compar > 0) 240 low = next + 1; 241 else 242 high = next - 1; 243 } 244 return (NULL); 245 } 246 247 static collate_large_pri_t * 248 largesearch(const wchar_t key) 249 { 250 int low = 0; 251 int high = _collate_info->large_pri_count - 1; 252 int next, compar; 253 collate_large_pri_t *p; 254 collate_large_pri_t *tab = large_pri_table; 255 256 if (_collate_info->large_pri_count == 0) 257 return (NULL); 258 259 while (low <= high) { 260 next = (low + high) / 2; 261 p = tab + next; 262 compar = key - p->val; 263 if (compar == 0) 264 return (p); 265 if (compar > 0) 266 low = next + 1; 267 else 268 high = next - 1; 269 } 270 return (NULL); 271 } 272 273 void 274 _collate_lookup(const wchar_t *t, int *len, int *pri, int which, 275 int **state) 276 { 277 collate_chain_pri_t *p2; 278 collate_large_pri_t *match; 279 int p, l; 280 int *sptr; 281 282 /* 283 * If we have remaining substitution data from a previous 284 * call, consume it first. 285 */ 286 if ((sptr = *state) != NULL) { 287 *pri = *sptr; 288 sptr++; 289 *state = *sptr ? sptr : NULL; 290 *len = 0; 291 return; 292 } 293 294 /* No active substitutions */ 295 *len = 1; 296 297 /* 298 * Check for composites such as dipthongs that collate as a 299 * single element (aka chains or collating-elements). 300 */ 301 if (((p2 = chainsearch(t, &l)) != NULL) && 302 ((p = p2->pri[which]) >= 0)) { 303 304 *len = l; 305 *pri = p; 306 307 } else if (*t <= UCHAR_MAX) { 308 309 /* 310 * Character is a small (8-bit) character. 311 * We just look these up directly for speed. 312 */ 313 *pri = char_pri_table[*t].pri[which]; 314 315 } else if ((_collate_info->large_pri_count > 0) && 316 ((match = largesearch(*t)) != NULL)) { 317 318 /* 319 * Character was found in the extended table. 320 */ 321 *pri = match->pri.pri[which]; 322 323 } else { 324 /* 325 * Character lacks a specific definition. 326 */ 327 if (_collate_info->directive[which] & DIRECTIVE_UNDEFINED) { 328 /* Mask off sign bit to prevent ordering confusion. */ 329 *pri = (*t & COLLATE_MAX_PRIORITY); 330 } else { 331 *pri = _collate_info->undef_pri[which]; 332 } 333 /* No substitutions for undefined characters! */ 334 return; 335 } 336 337 /* 338 * Try substituting (expanding) the character. We are 339 * currently doing this *after* the chain compression. I 340 * think it should not matter, but this way might be slightly 341 * faster. 342 * 343 * We do this after the priority search, as this will help us 344 * to identify a single key value. In order for this to work, 345 * its important that the priority assigned to a given element 346 * to be substituted be unique for that level. The localedef 347 * code ensures this for us. 348 */ 349 if ((*pri & COLLATE_SUBST_PRIORITY) && 350 (sptr = substsearch(*pri, which)) != NULL) { 351 if ((*pri = *sptr) != 0) { 352 sptr++; 353 *state = *sptr ? sptr : NULL; 354 } 355 } 356 357 } 358 359 /* 360 * This is the meaty part of wcsxfrm & strxfrm. Note that it does 361 * NOT NULL terminate. That is left to the caller. 362 */ 363 size_t 364 _collate_wxfrm(const wchar_t *src, wchar_t *xf, size_t room) 365 { 366 int pri; 367 int len; 368 const wchar_t *t; 369 wchar_t *tr = NULL; 370 int direc; 371 int pass; 372 int32_t *state; 373 size_t want = 0; 374 size_t need = 0; 375 376 assert(src); 377 378 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 379 380 state = NULL; 381 382 if (pass != 0) { 383 /* insert level separator from the previous pass */ 384 if (room) { 385 *xf++ = 1; 386 room--; 387 } 388 want++; 389 } 390 391 /* special pass for undefined */ 392 if (pass == _collate_info->directive_count) { 393 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 394 } else { 395 direc = _collate_info->directive[pass]; 396 } 397 398 t = src; 399 400 if (direc & DIRECTIVE_BACKWARD) { 401 wchar_t *bp, *fp, c; 402 if (tr) 403 free(tr); 404 if ((tr = wcsdup(t)) == NULL) { 405 errno = ENOMEM; 406 goto fail; 407 } 408 bp = tr; 409 fp = tr + wcslen(tr) - 1; 410 while (bp < fp) { 411 c = *bp; 412 *bp++ = *fp; 413 *fp-- = c; 414 } 415 t = (const wchar_t *)tr; 416 } 417 418 if (direc & DIRECTIVE_POSITION) { 419 while (*t || state) { 420 _collate_lookup(t, &len, &pri, pass, &state); 421 t += len; 422 if (pri <= 0) { 423 if (pri < 0) { 424 errno = EINVAL; 425 goto fail; 426 } 427 pri = COLLATE_MAX_PRIORITY; 428 } 429 if (room) { 430 *xf++ = pri; 431 room--; 432 } 433 want++; 434 need = want; 435 } 436 } else { 437 while (*t || state) { 438 _collate_lookup(t, &len, &pri, pass, &state); 439 t += len; 440 if (pri <= 0) { 441 if (pri < 0) { 442 errno = EINVAL; 443 goto fail; 444 } 445 continue; 446 } 447 if (room) { 448 *xf++ = pri; 449 room--; 450 } 451 want++; 452 need = want; 453 } 454 } 455 } 456 457 end: 458 if (tr) 459 free(tr); 460 return (need); 461 462 fail: 463 if (tr) 464 free(tr); 465 return ((size_t)(-1)); 466 } 467 468 /* 469 * In the non-POSIX case, we transform each character into a string of 470 * characters representing the character's priority. Since char is usually 471 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add 472 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 473 * bits per byte. 474 * 475 * It turns out that we sometimes have real priorities that are 476 * 31-bits wide. (But: be careful using priorities where the high 477 * order bit is set -- i.e. the priority is negative. The sort order 478 * may be surprising!) 479 * 480 * TODO: This would be a good area to optimize somewhat. It turns out 481 * that real prioririties *except for the last UNDEFINED pass* are generally 482 * very small. We need the localedef code to precalculate the max 483 * priority for us, and ideally also give us a mask, and then we could 484 * severely limit what we expand to. 485 */ 486 #define XFRM_BYTES 6 487 #define XFRM_OFFSET ('0') /* make all printable characters */ 488 #define XFRM_SHIFT 6 489 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) 490 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ 491 492 static void 493 xfrm(unsigned char *p, int pri) 494 { 495 int i; 496 for (i = 0; i < XFRM_BYTES; i++) { 497 p[i] = (pri & XFRM_MASK) + XFRM_OFFSET; 498 pri >>= XFRM_SHIFT; 499 } 500 } 501 502 size_t 503 _collate_sxfrm(const wchar_t *src, char *xf, size_t room) 504 { 505 int pri; 506 int len; 507 const wchar_t *t; 508 wchar_t *tr = NULL; 509 int direc; 510 int pass; 511 int32_t *state; 512 size_t want = 0; 513 size_t need = 0; 514 int b; 515 uint8_t buf[XFRM_BYTES]; 516 517 assert(src); 518 519 for (pass = 0; pass <= _collate_info->directive_count; pass++) { 520 521 state = NULL; 522 523 if (pass != 0) { 524 /* insert level separator from the previous pass */ 525 if (room) { 526 *xf++ = XFRM_SEP; 527 room--; 528 } 529 want++; 530 } 531 532 /* special pass for undefined */ 533 if (pass == _collate_info->directive_count) { 534 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; 535 } else { 536 direc = _collate_info->directive[pass]; 537 } 538 539 t = src; 540 541 if (direc & DIRECTIVE_BACKWARD) { 542 wchar_t *bp, *fp, c; 543 if (tr) 544 free(tr); 545 if ((tr = wcsdup(t)) == NULL) { 546 errno = ENOMEM; 547 goto fail; 548 } 549 bp = tr; 550 fp = tr + wcslen(tr) - 1; 551 while (bp < fp) { 552 c = *bp; 553 *bp++ = *fp; 554 *fp-- = c; 555 } 556 t = (const wchar_t *)tr; 557 } 558 559 if (direc & DIRECTIVE_POSITION) { 560 while (*t || state) { 561 562 _collate_lookup(t, &len, &pri, pass, &state); 563 t += len; 564 if (pri <= 0) { 565 if (pri < 0) { 566 errno = EINVAL; 567 goto fail; 568 } 569 pri = COLLATE_MAX_PRIORITY; 570 } 571 572 if (room) { 573 b = XFRM_BYTES; 574 xfrm(buf, pri); 575 576 while (b) { 577 b--; 578 if (room) 579 *xf++ = buf[b]; 580 room--; 581 } 582 } 583 want += XFRM_BYTES; 584 need = want; 585 } 586 } else { 587 while (*t || state) { 588 _collate_lookup(t, &len, &pri, pass, &state); 589 t += len; 590 if (pri <= 0) { 591 if (pri < 0) { 592 errno = EINVAL; 593 goto fail; 594 } 595 continue; 596 } 597 if (room) { 598 b = XFRM_BYTES; 599 xfrm(buf, pri); 600 601 while (b) { 602 b--; 603 if (room) 604 *xf++ = buf[b]; 605 room--; 606 } 607 } 608 want += XFRM_BYTES; 609 need = want; 610 } 611 } 612 } 613 614 end: 615 if (tr) 616 free(tr); 617 return (need); 618 619 fail: 620 if (tr) 621 free(tr); 622 return ((size_t)(-1)); 623 } 624