1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <_string_table.h> 28 #include <strings.h> 29 #include <sgs.h> 30 #include <stdio.h> 31 32 /* 33 * This file provides the interfaces to build a Str_tbl suitable for use by 34 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB) 35 * as created by ld(1). 36 * 37 * There are two modes which can be used when constructing a string table: 38 * 39 * st_new(0) 40 * standard string table - no compression. This is the 41 * traditional, fast method. 42 * 43 * st_new(FLG_STTAB_COMPRESS) 44 * builds a compressed string table which both eliminates 45 * duplicate strings, and permits strings with common suffixes 46 * (atexit vs. exit) to overlap in the table. This provides space 47 * savings for many string tables. Although more work than the 48 * traditional method, the algorithms used are designed to scale 49 * and keep any overhead at a minimum. 50 * 51 * These string tables are built with a common interface in a two-pass manner. 52 * The first pass finds all of the strings required for the string-table and 53 * calculates the size required for the final string table. 54 * 55 * The second pass allocates the string table, populates the strings into the 56 * table and returns the offsets the strings have been assigned. 57 * 58 * The calling sequence to build and populate a string table is: 59 * 60 * st_new(); // initialize strtab 61 * 62 * st_insert(st1); // first pass of strings ... 63 * // calculates size required for 64 * // string table 65 * 66 * st_delstring(st?); // remove string previously 67 * // inserted 68 * st_insert(stN); 69 * 70 * st_getstrtab_sz(); // freezes strtab and computes 71 * // size of table. 72 * 73 * st_setstrbuf(); // associates a final destination 74 * // for the string table 75 * 76 * st_setstring(st1); // populate the string table 77 * ... // offsets are based off of second 78 * // pass through the string table 79 * st_setstring(stN); 80 * 81 * st_destroy(); // tear down string table 82 * // structures. 83 * 84 * String Suffix Compression Algorithm: 85 * 86 * Here's a quick high level overview of the Suffix String 87 * compression algorithm used. First - the heart of the algorithm 88 * is a Hash table list which represents a dictionary of all unique 89 * strings inserted into the string table. The hash function for 90 * this table is a standard string hash except that the hash starts 91 * at the last character in the string (&str[n - 1]) and works towards 92 * the first character in the function (&str[0]). As we compute the 93 * HASH value for a given string, we also compute the hash values 94 * for all of the possible suffix strings for that string. 95 * 96 * As we compute the hash - at each character see if the current 97 * suffix string for that hash is already present in the table. If 98 * it is, and the string is a master string. Then change that 99 * string to a suffix string of the new string being inserted. 100 * 101 * When the final hash value is found (hash for str[0...n]), check 102 * to see if it is in the hash table - if so increment the reference 103 * count for the string. If it is not yet in the table, insert a 104 * new hash table entry for a master string. 105 * 106 * The above method will find all suffixes of a given string given 107 * that the strings are inserted from shortest to longest. That is 108 * why this is a two phase method, we first collect all of the 109 * strings and store them based off of their length in an AVL tree. 110 * Once all of the strings have been submitted we then start the 111 * hash table build by traversing the AVL tree in order and 112 * inserting the strings from shortest to longest as described 113 * above. 114 */ 115 116 /* LINTLIBRARY */ 117 118 static int 119 avl_len_compare(const void *n1, const void *n2) 120 { 121 size_t len1, len2; 122 123 len1 = ((LenNode *)n1)->ln_strlen; 124 len2 = ((LenNode *)n2)->ln_strlen; 125 126 if (len1 == len2) 127 return (0); 128 if (len2 < len1) 129 return (1); 130 return (-1); 131 } 132 133 static int 134 avl_str_compare(const void *n1, const void *n2) 135 { 136 const char *str1, *str2; 137 int rc; 138 139 str1 = ((StrNode *)n1)->sn_str; 140 str2 = ((StrNode *)n2)->sn_str; 141 142 rc = strcmp(str1, str2); 143 if (rc > 0) 144 return (1); 145 if (rc < 0) 146 return (-1); 147 return (0); 148 } 149 150 /* 151 * Return an initialized Str_tbl - returns NULL on failure. 152 * 153 * flags: 154 * FLG_STTAB_COMPRESS - build a compressed string table 155 */ 156 Str_tbl * 157 st_new(uint_t flags) 158 { 159 Str_tbl *stp; 160 161 if ((stp = calloc(1, sizeof (*stp))) == NULL) 162 return (NULL); 163 164 /* 165 * Start with a leading '\0' - it's tradition. 166 */ 167 stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1; 168 169 /* 170 * Do we compress this string table? 171 */ 172 stp->st_flags = flags; 173 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 174 return (stp); 175 176 if ((stp->st_lentree = calloc(1, sizeof (*stp->st_lentree))) == NULL) 177 return (NULL); 178 179 avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode), 180 SGSOFFSETOF(LenNode, ln_avlnode)); 181 182 return (stp); 183 } 184 185 /* 186 * Insert a new string into the Str_tbl. There are two AVL trees used. 187 * 188 * - The first LenNode AVL tree maintains a tree of nodes based on string 189 * sizes. 190 * - Each LenNode maintains a StrNode AVL tree for each string. Large 191 * applications have been known to contribute thousands of strings of 192 * the same size. Should strings need to be removed (-z ignore), then 193 * the string AVL tree makes this removal efficient and scalable. 194 */ 195 int 196 st_insert(Str_tbl *stp, const char *str) 197 { 198 size_t len; 199 StrNode *snp, sn = { 0 }; 200 LenNode *lnp, ln = { 0 }; 201 avl_index_t where; 202 203 /* 204 * String table can't have been cooked 205 */ 206 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 207 208 /* 209 * Null strings always point to the head of the string 210 * table - no reason to keep searching. 211 */ 212 if ((len = strlen(str)) == 0) 213 return (0); 214 215 stp->st_fullstrsize += len + 1; 216 stp->st_strcnt++; 217 218 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 219 return (0); 220 221 /* 222 * From the controlling string table, determine which LenNode AVL node 223 * provides for this string length. If the node doesn't exist, insert 224 * a new node to represent this string length. 225 */ 226 ln.ln_strlen = len; 227 if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) { 228 if ((lnp = calloc(1, sizeof (*lnp))) == NULL) 229 return (-1); 230 231 if ((lnp->ln_strtree = calloc(1, sizeof (*lnp->ln_strtree))) == 232 NULL) { 233 free(lnp); 234 return (-1); 235 } 236 237 lnp->ln_strlen = len; 238 avl_insert(stp->st_lentree, lnp, where); 239 240 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode), 241 SGSOFFSETOF(StrNode, sn_avlnode)); 242 } 243 244 /* 245 * From the string length AVL node determine whether a StrNode AVL node 246 * provides this string. If the node doesn't exist, insert a new node 247 * to represent this string. 248 */ 249 sn.sn_str = str; 250 if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) { 251 if ((snp = calloc(1, sizeof (*snp))) == NULL) 252 return (-1); 253 snp->sn_str = str; 254 avl_insert(lnp->ln_strtree, snp, where); 255 } 256 snp->sn_refcnt++; 257 258 return (0); 259 } 260 261 /* 262 * Remove a previously inserted string from the Str_tbl. 263 */ 264 int 265 st_delstring(Str_tbl *stp, const char *str) 266 { 267 size_t len; 268 LenNode *lnp, ln = { 0 }; 269 StrNode *snp, sn = { 0 }; 270 271 /* 272 * String table can't have been cooked 273 */ 274 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 275 276 len = strlen(str); 277 stp->st_fullstrsize -= len + 1; 278 279 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 280 return (0); 281 282 /* 283 * Determine which LenNode AVL node provides for this string length. 284 */ 285 ln.ln_strlen = len; 286 if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) { 287 sn.sn_str = str; 288 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) { 289 /* 290 * Reduce the reference count, and if zero remove the 291 * node. 292 */ 293 if (--snp->sn_refcnt == 0) 294 avl_remove(lnp->ln_strtree, snp); 295 return (0); 296 } 297 } 298 299 /* 300 * No strings of this length, or no string itself - someone goofed. 301 */ 302 return (-1); 303 } 304 305 /* 306 * Tear down a String_Table structure. 307 */ 308 void 309 st_destroy(Str_tbl *stp) 310 { 311 Str_hash *sthash, *psthash; 312 Str_master *mstr, *pmstr; 313 uint_t i; 314 315 /* 316 * cleanup the master strings 317 */ 318 for (mstr = stp->st_mstrlist, pmstr = 0; mstr; 319 mstr = mstr->sm_next) { 320 if (pmstr) 321 free(pmstr); 322 pmstr = mstr; 323 } 324 if (pmstr) 325 free(pmstr); 326 327 if (stp->st_hashbcks) { 328 for (i = 0; i < stp->st_hbckcnt; i++) { 329 for (sthash = stp->st_hashbcks[i], psthash = 0; 330 sthash; sthash = sthash->hi_next) { 331 if (psthash) 332 free(psthash); 333 psthash = sthash; 334 } 335 if (psthash) 336 free(psthash); 337 } 338 free(stp->st_hashbcks); 339 } 340 free(stp); 341 } 342 343 /* 344 * Hash a single additional character into hashval, separately so we can 345 * iteratively get suffix hashes. See st_string_hash and st_hash_insert 346 */ 347 static inline uint_t 348 st_string_hashround(uint_t hashval, char c) 349 { 350 /* h = ((h * 33) + c) */ 351 return (((hashval << 5) + hashval) + c); 352 } 353 354 /* 355 * We use a classic 'Bernstein k=33' hash function. But 356 * instead of hashing from the start of the string to the 357 * end, we do it in reverse. 358 * 359 * This way we are essentially building all of the 360 * suffix hashvalues as we go. We can check to see if 361 * any suffixes already exist in the tree as we generate 362 * the hash. 363 */ 364 static inline uint_t 365 st_string_hash(const char *str) 366 { 367 uint_t hashval = HASHSEED; 368 size_t stlen = strlen(str); 369 370 /* We should never be hashing the NUL string */ 371 assert(stlen > 0); 372 373 for (int i = stlen; i >= 0; i--) { 374 assert(i <= stlen); /* not unsigned->signed truncated */ 375 hashval = st_string_hashround(hashval, str[i]); 376 } 377 378 return (hashval); 379 } 380 381 /* 382 * For a given string - copy it into the buffer associated with the string 383 * table - and return the offset it has been assigned in stoff. 384 * 385 * If a value of '-1' is returned - the string was not found in 386 * the Str_tbl. 387 */ 388 int 389 st_setstring(Str_tbl *stp, const char *str, size_t *stoff) 390 { 391 size_t stlen; 392 uint_t hashval; 393 Str_hash *sthash; 394 Str_master *mstr; 395 396 /* 397 * String table *must* have been previously cooked 398 */ 399 assert(stp->st_strbuf != NULL); 400 401 assert(stp->st_flags & FLG_STTAB_COOKED); 402 stlen = strlen(str); 403 /* 404 * Null string always points to head of string table 405 */ 406 if (stlen == 0) { 407 if (stoff != NULL) 408 *stoff = 0; 409 return (0); 410 } 411 412 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 413 size_t _stoff; 414 415 stlen++; /* count for trailing '\0' */ 416 _stoff = stp->st_nextoff; 417 /* 418 * Have we overflowed our assigned buffer? 419 */ 420 if ((_stoff + stlen) > stp->st_fullstrsize) 421 return (-1); 422 memcpy(stp->st_strbuf + _stoff, str, stlen); 423 if (stoff != NULL) 424 *stoff = _stoff; 425 stp->st_nextoff += stlen; 426 return (0); 427 } 428 429 /* 430 * Calculate reverse hash for string. 431 */ 432 hashval = st_string_hash(str); 433 434 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash; 435 sthash = sthash->hi_next) { 436 const char *hstr; 437 438 if (sthash->hi_hashval != hashval) 439 continue; 440 441 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen - 442 sthash->hi_strlen]; 443 if (strcmp(str, hstr) == 0) 444 break; 445 } 446 447 /* 448 * Did we find the string? 449 */ 450 if (sthash == 0) 451 return (-1); 452 453 /* 454 * Has this string been copied into the string table? 455 */ 456 mstr = sthash->hi_mstr; 457 if (mstr->sm_stroff == 0) { 458 size_t mstrlen = mstr->sm_strlen + 1; 459 460 mstr->sm_stroff = stp->st_nextoff; 461 462 /* 463 * Have we overflowed our assigned buffer? 464 */ 465 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize) 466 return (-1); 467 468 (void) memcpy(stp->st_strbuf + mstr->sm_stroff, 469 mstr->sm_str, mstrlen); 470 stp->st_nextoff += mstrlen; 471 } 472 473 /* 474 * Calculate offset of (sub)string. 475 */ 476 if (stoff != NULL) 477 *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen; 478 479 return (0); 480 } 481 482 static int 483 st_hash_insert(Str_tbl *stp, const char *str, size_t len) 484 { 485 int i; 486 uint_t hashval = HASHSEED; 487 uint_t bckcnt = stp->st_hbckcnt; 488 Str_hash **hashbcks = stp->st_hashbcks; 489 Str_hash *sthash; 490 Str_master *mstr = 0; 491 492 for (i = len; i >= 0; i--) { 493 /* 494 * Build up 'hashval' character by character, so we always 495 * have the hash of the current string suffix 496 */ 497 hashval = st_string_hashround(hashval, str[i]); 498 499 for (sthash = hashbcks[hashval % bckcnt]; 500 sthash; sthash = sthash->hi_next) { 501 const char *hstr; 502 Str_master *_mstr; 503 504 if (sthash->hi_hashval != hashval) 505 continue; 506 507 _mstr = sthash->hi_mstr; 508 hstr = &_mstr->sm_str[_mstr->sm_strlen - 509 sthash->hi_strlen]; 510 511 if (strcmp(&str[i], hstr)) 512 continue; 513 514 if (i == 0) { 515 /* 516 * Entry already in table, increment refcnt and 517 * get out. 518 */ 519 sthash->hi_refcnt++; 520 return (0); 521 } else { 522 /* 523 * If this 'suffix' is presently a 'master 524 * string, then take over it's record. 525 */ 526 if (sthash->hi_strlen == _mstr->sm_strlen) { 527 /* 528 * we should only do this once. 529 */ 530 assert(mstr == 0); 531 mstr = _mstr; 532 } 533 } 534 } 535 } 536 537 /* 538 * Do we need a new master string, or can we take over 539 * one we already found in the table? 540 */ 541 if (mstr == 0) { 542 /* 543 * allocate a new master string 544 */ 545 if ((mstr = calloc(1, sizeof (*mstr))) == NULL) 546 return (-1); 547 mstr->sm_next = stp->st_mstrlist; 548 stp->st_mstrlist = mstr; 549 stp->st_strsize += len + 1; 550 } else { 551 /* 552 * We are taking over a existing master string, the string size 553 * only increments by the difference between the current string 554 * and the previous master. 555 */ 556 assert(len > mstr->sm_strlen); 557 stp->st_strsize += len - mstr->sm_strlen; 558 } 559 560 if ((sthash = calloc(1, sizeof (*sthash))) == NULL) 561 return (-1); 562 563 mstr->sm_hashval = sthash->hi_hashval = hashval; 564 mstr->sm_strlen = sthash->hi_strlen = len; 565 mstr->sm_str = str; 566 sthash->hi_refcnt = 1; 567 sthash->hi_mstr = mstr; 568 569 /* 570 * Insert string element into head of hash list 571 */ 572 hashval = hashval % bckcnt; 573 sthash->hi_next = hashbcks[hashval]; 574 hashbcks[hashval] = sthash; 575 return (0); 576 } 577 578 /* 579 * Return amount of space required for the string table. 580 */ 581 size_t 582 st_getstrtab_sz(Str_tbl *stp) 583 { 584 assert(stp->st_fullstrsize > 0); 585 586 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 587 stp->st_flags |= FLG_STTAB_COOKED; 588 return (stp->st_fullstrsize); 589 } 590 591 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) { 592 LenNode *lnp; 593 void *cookie; 594 595 stp->st_flags |= FLG_STTAB_COOKED; 596 /* 597 * allocate a hash table about the size of # of 598 * strings input. 599 */ 600 stp->st_hbckcnt = findprime(stp->st_strcnt); 601 if ((stp->st_hashbcks = calloc(stp->st_hbckcnt, 602 sizeof (*stp->st_hashbcks))) == NULL) 603 return (0); 604 605 /* 606 * We now walk all of the strings in the list, from shortest to 607 * longest, and insert them into the hashtable. 608 */ 609 if ((lnp = avl_first(stp->st_lentree)) == NULL) { 610 /* 611 * Is it possible we have an empty string table, if so, 612 * the table still contains '\0', so return the size. 613 */ 614 if (avl_numnodes(stp->st_lentree) == 0) { 615 assert(stp->st_strsize == 1); 616 return (stp->st_strsize); 617 } 618 return (0); 619 } 620 621 while (lnp) { 622 StrNode *snp; 623 624 /* 625 * Walk the string lists and insert them into the hash 626 * list. Once a string is inserted we no longer need 627 * it's entry, so the string can be freed. 628 */ 629 for (snp = avl_first(lnp->ln_strtree); snp; 630 snp = AVL_NEXT(lnp->ln_strtree, snp)) { 631 if (st_hash_insert(stp, snp->sn_str, 632 lnp->ln_strlen) == -1) 633 return (0); 634 } 635 636 /* 637 * Now that the strings have been copied, walk the 638 * StrNode tree and free all the AVL nodes. Note, 639 * avl_destroy_nodes() beats avl_remove() as the 640 * latter balances the nodes as they are removed. 641 * We just want to tear the whole thing down fast. 642 */ 643 cookie = NULL; 644 while ((snp = avl_destroy_nodes(lnp->ln_strtree, 645 &cookie)) != NULL) 646 free(snp); 647 avl_destroy(lnp->ln_strtree); 648 free(lnp->ln_strtree); 649 lnp->ln_strtree = NULL; 650 651 /* 652 * Move on to the next LenNode. 653 */ 654 lnp = AVL_NEXT(stp->st_lentree, lnp); 655 } 656 657 /* 658 * Now that all of the strings have been freed, walk the 659 * LenNode tree and free all of the AVL nodes. Note, 660 * avl_destroy_nodes() beats avl_remove() as the latter 661 * balances the nodes as they are removed. We just want to 662 * tear the whole thing down fast. 663 */ 664 cookie = NULL; 665 while ((lnp = avl_destroy_nodes(stp->st_lentree, 666 &cookie)) != NULL) 667 free(lnp); 668 avl_destroy(stp->st_lentree); 669 free(stp->st_lentree); 670 stp->st_lentree = 0; 671 } 672 673 assert(stp->st_strsize > 0); 674 assert(stp->st_fullstrsize >= stp->st_strsize); 675 676 return (stp->st_strsize); 677 } 678 679 const char * 680 st_getstrbuf(Str_tbl *stp) 681 { 682 return (stp->st_strbuf); 683 } 684 685 686 /* 687 * Associate a buffer with a string table. 688 */ 689 int 690 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize) 691 { 692 assert(stp->st_flags & FLG_STTAB_COOKED); 693 694 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 695 if (bufsize < stp->st_fullstrsize) 696 return (-1); 697 } else { 698 if (bufsize < stp->st_strsize) 699 return (-1); 700 } 701 702 stp->st_strbuf = stbuf; 703 #ifdef DEBUG 704 /* 705 * for debug builds - start with a stringtable filled in 706 * with '0xff'. This makes it very easy to spot unfilled 707 * holes in the strtab. 708 */ 709 memset(stbuf, 0xff, bufsize); 710 stbuf[0] = '\0'; 711 #else 712 memset(stbuf, 0x0, bufsize); 713 #endif 714 return (0); 715 } 716 717 /* 718 * Populate the buffer with all strings from stp. 719 * The table must be compressed and cooked 720 */ 721 void 722 st_setallstrings(Str_tbl *stp) 723 { 724 assert(stp->st_strbuf != NULL); 725 assert((stp->st_flags & FLG_STTAB_COOKED)); 726 assert((stp->st_flags & FLG_STTAB_COMPRESS)); 727 728 for (Str_master *str = stp->st_mstrlist; str != NULL; 729 str = str->sm_next) { 730 int res = 0; 731 732 res = st_setstring(stp, str->sm_str, NULL); 733 assert(res == 0); 734 } 735 } 736 737 /* 738 * Find str in the given table 739 * return it's offset, or -1 740 */ 741 off_t 742 st_findstring(Str_tbl *stp, const char *needle) 743 { 744 uint_t hashval; 745 Str_hash *sthash; 746 Str_master *mstr; 747 748 assert(stp->st_strbuf != NULL); 749 assert((stp->st_flags & FLG_STTAB_COOKED)); 750 751 /* The NUL string is always first */ 752 if (needle[0] == '\0') 753 return (0); 754 755 /* In the uncompressed case we must linear search */ 756 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 757 const char *str, *end; 758 759 end = stp->st_strbuf + stp->st_fullstrsize; 760 761 for (str = stp->st_strbuf; str < end; 762 str += strlen(str) + 1) { 763 if (strcmp(str, needle) == 0) 764 return (str - stp->st_strbuf); 765 } 766 767 return (-1); 768 } 769 770 hashval = st_string_hash(needle); 771 772 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; 773 sthash != NULL; 774 sthash = sthash->hi_next) { 775 const char *hstr; 776 777 if (sthash->hi_hashval != hashval) 778 continue; 779 780 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen - 781 sthash->hi_strlen]; 782 if (strcmp(needle, hstr) == 0) 783 break; 784 } 785 786 /* 787 * Did we find the string? 788 */ 789 if (sthash == NULL) 790 return (-1); 791 792 mstr = sthash->hi_mstr; 793 assert(mstr->sm_stroff != 0); 794 795 /* 796 * Calculate offset of (sub)string. 797 */ 798 return (mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen); 799 } 800