1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <_string_table.h> 30 #include <strings.h> 31 #include <sgs.h> 32 #include <stdio.h> 33 34 /* 35 * This file provides the interfaces to build a Str_tbl suitable for use by 36 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB) 37 * as created by ld(1). 38 * 39 * There are two modes which can be used when constructing a string table: 40 * 41 * st_new(0) 42 * standard string table - no compression. This is the 43 * traditional, fast method. 44 * 45 * st_new(FLG_STTAB_COMPRESS) 46 * builds a compressed string table which both eliminates 47 * duplicate strings, and permits strings with common suffixes 48 * (atexit vs. exit) to overlap in the table. This provides space 49 * savings for many string tables. Although more work than the 50 * traditional method, the algorithms used are designed to scale 51 * and keep any overhead at a minimum. 52 * 53 * These string tables are built with a common interface in a two-pass manner. 54 * The first pass finds all of the strings required for the string-table and 55 * calculates the size required for the final string table. 56 * 57 * The second pass allocates the string table, populates the strings into the 58 * table and returns the offsets the strings have been assigned. 59 * 60 * The calling sequence to build and populate a string table is: 61 * 62 * st_new(); // initialize strtab 63 * 64 * st_insert(st1); // first pass of strings ... 65 * // calculates size required for 66 * // string table 67 * 68 * st_delstring(st?); // remove string previously 69 * // inserted 70 * st_insert(stN); 71 * 72 * st_getstrtab_sz(); // freezes strtab and computes 73 * // size of table. 74 * 75 * st_setstrbuf(); // associates a final destination 76 * // for the string table 77 * 78 * st_setstring(st1); // populate the string table 79 * ... // offsets are based off of second 80 * // pass through the string table 81 * st_setstring(stN); 82 * 83 * st_destroy(); // tear down string table 84 * // structures. 85 * 86 * String Suffix Compression Algorithm: 87 * 88 * Here's a quick high level overview of the Suffix String 89 * compression algorithm used. First - the heart of the algorithm 90 * is a Hash table list which represents a dictionary of all unique 91 * strings inserted into the string table. The hash function for 92 * this table is a standard string hash except that the hash starts 93 * at the last character in the string (&str[n - 1]) and works towards 94 * the first character in the function (&str[0]). As we compute the 95 * HASH value for a given string, we also compute the hash values 96 * for all of the possible suffix strings for that string. 97 * 98 * As we compute the hash - at each character see if the current 99 * suffix string for that hash is already present in the table. If 100 * it is, and the string is a master string. Then change that 101 * string to a suffix string of the new string being inserted. 102 * 103 * When the final hash value is found (hash for str[0...n]), check 104 * to see if it is in the hash table - if so increment the reference 105 * count for the string. If it is not yet in the table, insert a 106 * new hash table entry for a master string. 107 * 108 * The above method will find all suffixes of a given string given 109 * that the strings are inserted from shortest to longest. That is 110 * why this is a two phase method, we first collect all of the 111 * strings and store them based off of their length in an AVL tree. 112 * Once all of the strings have been submitted we then start the 113 * hash table build by traversing the AVL tree in order and 114 * inserting the strings from shortest to longest as described 115 * above. 116 */ 117 118 /* LINTLIBRARY */ 119 120 static int 121 avl_len_compare(const void *n1, const void *n2) 122 { 123 size_t len1, len2; 124 125 len1 = ((LenNode *)n1)->ln_strlen; 126 len2 = ((LenNode *)n2)->ln_strlen; 127 128 if (len1 == len2) 129 return (0); 130 if (len2 < len1) 131 return (1); 132 return (-1); 133 } 134 135 static int 136 avl_str_compare(const void *n1, const void *n2) 137 { 138 const char *str1, *str2; 139 int rc; 140 141 str1 = ((StrNode *)n1)->sn_str; 142 str2 = ((StrNode *)n2)->sn_str; 143 144 rc = strcmp(str1, str2); 145 if (rc > 0) 146 return (1); 147 if (rc < 0) 148 return (-1); 149 return (0); 150 } 151 152 /* 153 * Return an initialized Str_tbl - returns NULL on failure. 154 * 155 * flags: 156 * FLG_STTAB_COMPRESS - build a compressed string table 157 */ 158 Str_tbl * 159 st_new(uint_t flags) 160 { 161 Str_tbl *stp; 162 163 if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL) 164 return (NULL); 165 166 /* 167 * Start with a leading '\0' - it's tradition. 168 */ 169 stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1; 170 171 /* 172 * Do we compress this string table? 173 */ 174 stp->st_flags = flags; 175 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 176 return (stp); 177 178 if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL) 179 return (NULL); 180 181 avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode), 182 SGSOFFSETOF(LenNode, ln_avlnode)); 183 184 return (stp); 185 } 186 187 /* 188 * Insert a new string into the Str_tbl. There are two AVL trees used. 189 * 190 * . The first LenNode AVL tree maintains a tree of nodes based on string 191 * sizes. 192 * . Each LenNode maintains a StrNode AVL tree for each string. Large 193 * applications have been known to contribute thousands of strings of 194 * the same size. Should strings need to be removed (-z ignore), then 195 * the string AVL tree makes this removal efficient and scalable. 196 */ 197 int 198 st_insert(Str_tbl *stp, const char *str) 199 { 200 size_t len; 201 StrNode *snp, sn = { 0 }; 202 LenNode *lnp, ln = { 0 }; 203 avl_index_t where; 204 205 /* 206 * String table can't have been cooked 207 */ 208 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 209 210 /* 211 * Null strings always point to the head of the string 212 * table - no reason to keep searching. 213 */ 214 if ((len = strlen(str)) == 0) 215 return (0); 216 217 stp->st_fullstrsize += len + 1; 218 stp->st_strcnt++; 219 220 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 221 return (0); 222 223 /* 224 * From the controlling string table, determine which LenNode AVL node 225 * provides for this string length. If the node doesn't exist, insert 226 * a new node to represent this string length. 227 */ 228 ln.ln_strlen = len; 229 if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) { 230 if ((lnp = calloc(sizeof (LenNode), 1)) == NULL) 231 return (-1); 232 lnp->ln_strlen = len; 233 avl_insert(stp->st_lentree, lnp, where); 234 235 if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL) 236 return (0); 237 238 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode), 239 SGSOFFSETOF(StrNode, sn_avlnode)); 240 } 241 242 /* 243 * From the string length AVL node determine whether a StrNode AVL node 244 * provides this string. If the node doesn't exist, insert a new node 245 * to represent this string. 246 */ 247 sn.sn_str = str; 248 if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) { 249 if ((snp = calloc(sizeof (StrNode), 1)) == NULL) 250 return (-1); 251 snp->sn_str = str; 252 avl_insert(lnp->ln_strtree, snp, where); 253 } 254 snp->sn_refcnt++; 255 256 return (0); 257 } 258 259 /* 260 * Remove a previously inserted string from the Str_tbl. 261 */ 262 int 263 st_delstring(Str_tbl *stp, const char *str) 264 { 265 size_t len; 266 LenNode *lnp, ln = { 0 }; 267 StrNode *snp, sn = { 0 }; 268 269 /* 270 * String table can't have been cooked 271 */ 272 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 273 274 len = strlen(str); 275 stp->st_fullstrsize -= len + 1; 276 277 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 278 return (0); 279 280 /* 281 * Determine which LenNode AVL node provides for this string length. 282 */ 283 ln.ln_strlen = len; 284 if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) { 285 sn.sn_str = str; 286 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) { 287 /* 288 * Reduce the reference count, and if zero remove the 289 * node. 290 */ 291 if (--snp->sn_refcnt == 0) 292 avl_remove(lnp->ln_strtree, snp); 293 return (0); 294 } 295 } 296 297 /* 298 * No strings of this length, or no string itself - someone goofed. 299 */ 300 return (-1); 301 } 302 303 /* 304 * Tear down a String_Table structure. 305 */ 306 void 307 st_destroy(Str_tbl *stp) 308 { 309 Str_hash *sthash, *psthash; 310 Str_master *mstr, *pmstr; 311 uint_t i; 312 313 /* 314 * cleanup the master strings 315 */ 316 for (mstr = stp->st_mstrlist, pmstr = 0; mstr; 317 mstr = mstr->sm_next) { 318 if (pmstr) 319 free(pmstr); 320 pmstr = mstr; 321 } 322 if (pmstr) 323 free(pmstr); 324 325 if (stp->st_hashbcks) { 326 for (i = 0; i < stp->st_hbckcnt; i++) { 327 for (sthash = stp->st_hashbcks[i], psthash = 0; 328 sthash; sthash = sthash->hi_next) { 329 if (psthash) 330 free(psthash); 331 psthash = sthash; 332 } 333 if (psthash) 334 free(psthash); 335 } 336 free(stp->st_hashbcks); 337 } 338 free(stp); 339 } 340 341 342 /* 343 * For a given string - copy it into the buffer associated with 344 * the string table - and return the offset it has been assigned. 345 * 346 * If a value of '-1' is returned - the string was not found in 347 * the Str_tbl. 348 */ 349 int 350 st_setstring(Str_tbl *stp, const char *str, size_t *stoff) 351 { 352 size_t stlen; 353 uint_t hashval; 354 Str_hash *sthash; 355 Str_master *mstr; 356 int i; 357 358 /* 359 * String table *must* have been previously cooked 360 */ 361 assert(stp->st_strbuf); 362 363 assert(stp->st_flags & FLG_STTAB_COOKED); 364 stlen = strlen(str); 365 /* 366 * Null string always points to head of string table 367 */ 368 if (stlen == 0) { 369 *stoff = 0; 370 return (0); 371 } 372 373 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 374 size_t _stoff; 375 376 stlen++; /* count for trailing '\0' */ 377 _stoff = stp->st_nextoff; 378 /* 379 * Have we overflowed our assigned buffer? 380 */ 381 if ((_stoff + stlen) > stp->st_fullstrsize) 382 return (-1); 383 memcpy(stp->st_strbuf + _stoff, str, stlen); 384 *stoff = _stoff; 385 stp->st_nextoff += stlen; 386 return (0); 387 } 388 389 /* 390 * Calculate reverse hash for string. 391 */ 392 hashval = HASHSEED; 393 for (i = stlen; i >= 0; i--) { 394 hashval = ((hashval << 5) + hashval) + 395 str[i]; /* h = ((h * 33) + c) */ 396 } 397 398 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash; 399 sthash = sthash->hi_next) { 400 const char *hstr; 401 402 if (sthash->hi_hashval != hashval) 403 continue; 404 405 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen - 406 sthash->hi_strlen]; 407 if (strcmp(str, hstr) == 0) 408 break; 409 } 410 411 /* 412 * Did we find the string? 413 */ 414 if (sthash == 0) 415 return (-1); 416 417 /* 418 * Has this string been copied into the string table? 419 */ 420 mstr = sthash->hi_mstr; 421 if (mstr->sm_stroff == 0) { 422 size_t mstrlen = mstr->sm_strlen + 1; 423 424 mstr->sm_stroff = stp->st_nextoff; 425 426 /* 427 * Have we overflowed our assigned buffer? 428 */ 429 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize) 430 return (-1); 431 432 (void) memcpy(stp->st_strbuf + mstr->sm_stroff, 433 mstr->sm_str, mstrlen); 434 stp->st_nextoff += mstrlen; 435 } 436 437 /* 438 * Calculate offset of (sub)string. 439 */ 440 *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen; 441 442 return (0); 443 } 444 445 446 static int 447 st_hash_insert(Str_tbl *stp, const char *str, size_t len) 448 { 449 int i; 450 uint_t hashval = HASHSEED; 451 uint_t bckcnt = stp->st_hbckcnt; 452 Str_hash **hashbcks = stp->st_hashbcks; 453 Str_hash *sthash; 454 Str_master *mstr = 0; 455 456 /* 457 * We use a classic 'Bernstein k=33' hash function. But 458 * instead of hashing from the start of the string to the 459 * end, we do it in reverse. 460 * 461 * This way - we are essentially building all of the 462 * suffix hashvalues as we go. We can check to see if 463 * any suffixes already exist in the tree as we generate 464 * the hash. 465 */ 466 for (i = len; i >= 0; i--) { 467 hashval = ((hashval << 5) + hashval) + 468 str[i]; /* h = ((h * 33) + c) */ 469 470 for (sthash = hashbcks[hashval % bckcnt]; 471 sthash; sthash = sthash->hi_next) { 472 const char *hstr; 473 Str_master *_mstr; 474 475 if (sthash->hi_hashval != hashval) 476 continue; 477 478 _mstr = sthash->hi_mstr; 479 hstr = &_mstr->sm_str[_mstr->sm_strlen - 480 sthash->hi_strlen]; 481 482 if (strcmp(&str[i], hstr)) 483 continue; 484 485 if (i == 0) { 486 /* 487 * Entry already in table, increment refcnt and 488 * get out. 489 */ 490 sthash->hi_refcnt++; 491 return (0); 492 } else { 493 /* 494 * If this 'suffix' is presently a 'master 495 * string, then take over it's record. 496 */ 497 if (sthash->hi_strlen == _mstr->sm_strlen) { 498 /* 499 * we should only do this once. 500 */ 501 assert(mstr == 0); 502 mstr = _mstr; 503 } 504 } 505 } 506 } 507 508 /* 509 * Do we need a new master string, or can we take over 510 * one we already found in the table? 511 */ 512 if (mstr == 0) { 513 /* 514 * allocate a new master string 515 */ 516 if ((mstr = calloc(sizeof (Str_hash), 1)) == 0) 517 return (-1); 518 mstr->sm_next = stp->st_mstrlist; 519 stp->st_mstrlist = mstr; 520 stp->st_strsize += len + 1; 521 } else { 522 /* 523 * We are taking over a existing master string, the string size 524 * only increments by the difference between the current string 525 * and the previous master. 526 */ 527 assert(len > mstr->sm_strlen); 528 stp->st_strsize += len - mstr->sm_strlen; 529 } 530 531 if ((sthash = calloc(sizeof (Str_hash), 1)) == 0) 532 return (-1); 533 534 mstr->sm_hashval = sthash->hi_hashval = hashval; 535 mstr->sm_strlen = sthash->hi_strlen = len; 536 mstr->sm_str = str; 537 sthash->hi_refcnt = 1; 538 sthash->hi_mstr = mstr; 539 540 /* 541 * Insert string element into head of hash list 542 */ 543 hashval = hashval % bckcnt; 544 sthash->hi_next = hashbcks[hashval]; 545 hashbcks[hashval] = sthash; 546 return (0); 547 } 548 549 /* 550 * Return amount of space required for the string table. 551 */ 552 size_t 553 st_getstrtab_sz(Str_tbl *stp) 554 { 555 assert(stp->st_fullstrsize > 0); 556 557 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 558 stp->st_flags |= FLG_STTAB_COOKED; 559 return (stp->st_fullstrsize); 560 } 561 562 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) { 563 LenNode *lnp; 564 void *cookie; 565 566 stp->st_flags |= FLG_STTAB_COOKED; 567 /* 568 * allocate a hash table about the size of # of 569 * strings input. 570 */ 571 stp->st_hbckcnt = findprime(stp->st_strcnt); 572 if ((stp->st_hashbcks = 573 calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL) 574 return (0); 575 576 /* 577 * We now walk all of the strings in the list, from shortest to 578 * longest, and insert them into the hashtable. 579 */ 580 if ((lnp = avl_first(stp->st_lentree)) == NULL) { 581 /* 582 * Is it possible we have an empty string table, if so, 583 * the table still contains '\0', so return the size. 584 */ 585 if (avl_numnodes(stp->st_lentree) == 0) { 586 assert(stp->st_strsize == 1); 587 return (stp->st_strsize); 588 } 589 return (0); 590 } 591 592 while (lnp) { 593 StrNode *snp; 594 595 /* 596 * Walk the string lists and insert them into the hash 597 * list. Once a string is inserted we no longer need 598 * it's entry, so the string can be freed. 599 */ 600 for (snp = avl_first(lnp->ln_strtree); snp; 601 snp = AVL_NEXT(lnp->ln_strtree, snp)) { 602 if (st_hash_insert(stp, snp->sn_str, 603 lnp->ln_strlen) == -1) 604 return (0); 605 } 606 607 /* 608 * Now that the strings have been copied, walk the 609 * StrNode tree and free all the AVL nodes. Note, 610 * avl_destroy_nodes() beats avl_remove() as the 611 * latter balances the nodes as they are removed. 612 * We just want to tear the whole thing down fast. 613 */ 614 cookie = NULL; 615 while ((snp = avl_destroy_nodes(lnp->ln_strtree, 616 &cookie)) != NULL) 617 free(snp); 618 avl_destroy(lnp->ln_strtree); 619 free(lnp->ln_strtree); 620 lnp->ln_strtree = NULL; 621 622 /* 623 * Move on to the next LenNode. 624 */ 625 lnp = AVL_NEXT(stp->st_lentree, lnp); 626 } 627 628 /* 629 * Now that all of the strings have been freed, walk the 630 * LenNode tree and free all of the AVL nodes. Note, 631 * avl_destroy_nodes() beats avl_remove() as the latter 632 * balances the nodes as they are removed. We just want to 633 * tear the whole thing down fast. 634 */ 635 cookie = NULL; 636 while ((lnp = avl_destroy_nodes(stp->st_lentree, 637 &cookie)) != NULL) 638 free(lnp); 639 avl_destroy(stp->st_lentree); 640 free(stp->st_lentree); 641 stp->st_lentree = 0; 642 } 643 644 assert(stp->st_strsize > 0); 645 assert(stp->st_fullstrsize >= stp->st_strsize); 646 647 return (stp->st_strsize); 648 } 649 650 /* 651 * Associate a buffer with a string table. 652 */ 653 const char * 654 st_getstrbuf(Str_tbl *stp) 655 { 656 return (stp->st_strbuf); 657 } 658 659 int 660 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize) 661 { 662 assert(stp->st_flags & FLG_STTAB_COOKED); 663 664 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 665 if (bufsize < stp->st_fullstrsize) 666 return (-1); 667 } else { 668 if (bufsize < stp->st_strsize) 669 return (-1); 670 } 671 672 stp->st_strbuf = stbuf; 673 #ifdef DEBUG 674 /* 675 * for debug builds - start with a stringtable filled in 676 * with '0xff'. This makes it very easy to find wholes 677 * which we failed to fill in - in the strtab. 678 */ 679 memset(stbuf, 0xff, bufsize); 680 stbuf[0] = '\0'; 681 #else 682 memset(stbuf, 0x0, bufsize); 683 #endif 684 return (0); 685 } 686