1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <_string_table.h> 28 #include <strings.h> 29 #include <sgs.h> 30 #include <stdio.h> 31 32 /* 33 * This file provides the interfaces to build a Str_tbl suitable for use by 34 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB) 35 * as created by ld(1). 36 * 37 * There are two modes which can be used when constructing a string table: 38 * 39 * st_new(0) 40 * standard string table - no compression. This is the 41 * traditional, fast method. 42 * 43 * st_new(FLG_STTAB_COMPRESS) 44 * builds a compressed string table which both eliminates 45 * duplicate strings, and permits strings with common suffixes 46 * (atexit vs. exit) to overlap in the table. This provides space 47 * savings for many string tables. Although more work than the 48 * traditional method, the algorithms used are designed to scale 49 * and keep any overhead at a minimum. 50 * 51 * These string tables are built with a common interface in a two-pass manner. 52 * The first pass finds all of the strings required for the string-table and 53 * calculates the size required for the final string table. 54 * 55 * The second pass allocates the string table, populates the strings into the 56 * table and returns the offsets the strings have been assigned. 57 * 58 * The calling sequence to build and populate a string table is: 59 * 60 * st_new(); // initialize strtab 61 * 62 * st_insert(st1); // first pass of strings ... 63 * // calculates size required for 64 * // string table 65 * 66 * st_delstring(st?); // remove string previously 67 * // inserted 68 * st_insert(stN); 69 * 70 * st_getstrtab_sz(); // freezes strtab and computes 71 * // size of table. 72 * 73 * st_setstrbuf(); // associates a final destination 74 * // for the string table 75 * 76 * st_setstring(st1); // populate the string table 77 * ... // offsets are based off of second 78 * // pass through the string table 79 * st_setstring(stN); 80 * 81 * st_destroy(); // tear down string table 82 * // structures. 83 * 84 * String Suffix Compression Algorithm: 85 * 86 * Here's a quick high level overview of the Suffix String 87 * compression algorithm used. First - the heart of the algorithm 88 * is a Hash table list which represents a dictionary of all unique 89 * strings inserted into the string table. The hash function for 90 * this table is a standard string hash except that the hash starts 91 * at the last character in the string (&str[n - 1]) and works towards 92 * the first character in the function (&str[0]). As we compute the 93 * HASH value for a given string, we also compute the hash values 94 * for all of the possible suffix strings for that string. 95 * 96 * As we compute the hash - at each character see if the current 97 * suffix string for that hash is already present in the table. If 98 * it is, and the string is a master string. Then change that 99 * string to a suffix string of the new string being inserted. 100 * 101 * When the final hash value is found (hash for str[0...n]), check 102 * to see if it is in the hash table - if so increment the reference 103 * count for the string. If it is not yet in the table, insert a 104 * new hash table entry for a master string. 105 * 106 * The above method will find all suffixes of a given string given 107 * that the strings are inserted from shortest to longest. That is 108 * why this is a two phase method, we first collect all of the 109 * strings and store them based off of their length in an AVL tree. 110 * Once all of the strings have been submitted we then start the 111 * hash table build by traversing the AVL tree in order and 112 * inserting the strings from shortest to longest as described 113 * above. 114 */ 115 116 /* LINTLIBRARY */ 117 118 static int 119 avl_len_compare(const void *n1, const void *n2) 120 { 121 size_t len1, len2; 122 123 len1 = ((LenNode *)n1)->ln_strlen; 124 len2 = ((LenNode *)n2)->ln_strlen; 125 126 if (len1 == len2) 127 return (0); 128 if (len2 < len1) 129 return (1); 130 return (-1); 131 } 132 133 static int 134 avl_str_compare(const void *n1, const void *n2) 135 { 136 const char *str1, *str2; 137 int rc; 138 139 str1 = ((StrNode *)n1)->sn_str; 140 str2 = ((StrNode *)n2)->sn_str; 141 142 rc = strcmp(str1, str2); 143 if (rc > 0) 144 return (1); 145 if (rc < 0) 146 return (-1); 147 return (0); 148 } 149 150 /* 151 * Return an initialized Str_tbl - returns NULL on failure. 152 * 153 * flags: 154 * FLG_STTAB_COMPRESS - build a compressed string table 155 */ 156 Str_tbl * 157 st_new(uint_t flags) 158 { 159 Str_tbl *stp; 160 161 if ((stp = calloc(sizeof (*stp), 1)) == NULL) 162 return (NULL); 163 164 /* 165 * Start with a leading '\0' - it's tradition. 166 */ 167 stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1; 168 169 /* 170 * Do we compress this string table? 171 */ 172 stp->st_flags = flags; 173 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 174 return (stp); 175 176 if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL) 177 return (NULL); 178 179 avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode), 180 SGSOFFSETOF(LenNode, ln_avlnode)); 181 182 return (stp); 183 } 184 185 /* 186 * Insert a new string into the Str_tbl. There are two AVL trees used. 187 * 188 * - The first LenNode AVL tree maintains a tree of nodes based on string 189 * sizes. 190 * - Each LenNode maintains a StrNode AVL tree for each string. Large 191 * applications have been known to contribute thousands of strings of 192 * the same size. Should strings need to be removed (-z ignore), then 193 * the string AVL tree makes this removal efficient and scalable. 194 */ 195 int 196 st_insert(Str_tbl *stp, const char *str) 197 { 198 size_t len; 199 StrNode *snp, sn = { 0 }; 200 LenNode *lnp, ln = { 0 }; 201 avl_index_t where; 202 203 /* 204 * String table can't have been cooked 205 */ 206 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 207 208 /* 209 * Null strings always point to the head of the string 210 * table - no reason to keep searching. 211 */ 212 if ((len = strlen(str)) == 0) 213 return (0); 214 215 stp->st_fullstrsize += len + 1; 216 stp->st_strcnt++; 217 218 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 219 return (0); 220 221 /* 222 * From the controlling string table, determine which LenNode AVL node 223 * provides for this string length. If the node doesn't exist, insert 224 * a new node to represent this string length. 225 */ 226 ln.ln_strlen = len; 227 if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) { 228 if ((lnp = calloc(sizeof (*lnp), 1)) == NULL) 229 return (-1); 230 lnp->ln_strlen = len; 231 avl_insert(stp->st_lentree, lnp, where); 232 233 if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) == 234 NULL) 235 return (0); 236 237 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode), 238 SGSOFFSETOF(StrNode, sn_avlnode)); 239 } 240 241 /* 242 * From the string length AVL node determine whether a StrNode AVL node 243 * provides this string. If the node doesn't exist, insert a new node 244 * to represent this string. 245 */ 246 sn.sn_str = str; 247 if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) { 248 if ((snp = calloc(sizeof (*snp), 1)) == NULL) 249 return (-1); 250 snp->sn_str = str; 251 avl_insert(lnp->ln_strtree, snp, where); 252 } 253 snp->sn_refcnt++; 254 255 return (0); 256 } 257 258 /* 259 * Remove a previously inserted string from the Str_tbl. 260 */ 261 int 262 st_delstring(Str_tbl *stp, const char *str) 263 { 264 size_t len; 265 LenNode *lnp, ln = { 0 }; 266 StrNode *snp, sn = { 0 }; 267 268 /* 269 * String table can't have been cooked 270 */ 271 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 272 273 len = strlen(str); 274 stp->st_fullstrsize -= len + 1; 275 276 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 277 return (0); 278 279 /* 280 * Determine which LenNode AVL node provides for this string length. 281 */ 282 ln.ln_strlen = len; 283 if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) { 284 sn.sn_str = str; 285 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) { 286 /* 287 * Reduce the reference count, and if zero remove the 288 * node. 289 */ 290 if (--snp->sn_refcnt == 0) 291 avl_remove(lnp->ln_strtree, snp); 292 return (0); 293 } 294 } 295 296 /* 297 * No strings of this length, or no string itself - someone goofed. 298 */ 299 return (-1); 300 } 301 302 /* 303 * Tear down a String_Table structure. 304 */ 305 void 306 st_destroy(Str_tbl *stp) 307 { 308 Str_hash *sthash, *psthash; 309 Str_master *mstr, *pmstr; 310 uint_t i; 311 312 /* 313 * cleanup the master strings 314 */ 315 for (mstr = stp->st_mstrlist, pmstr = 0; mstr; 316 mstr = mstr->sm_next) { 317 if (pmstr) 318 free(pmstr); 319 pmstr = mstr; 320 } 321 if (pmstr) 322 free(pmstr); 323 324 if (stp->st_hashbcks) { 325 for (i = 0; i < stp->st_hbckcnt; i++) { 326 for (sthash = stp->st_hashbcks[i], psthash = 0; 327 sthash; sthash = sthash->hi_next) { 328 if (psthash) 329 free(psthash); 330 psthash = sthash; 331 } 332 if (psthash) 333 free(psthash); 334 } 335 free(stp->st_hashbcks); 336 } 337 free(stp); 338 } 339 340 341 /* 342 * For a given string - copy it into the buffer associated with 343 * the string table - and return the offset it has been assigned. 344 * 345 * If a value of '-1' is returned - the string was not found in 346 * the Str_tbl. 347 */ 348 int 349 st_setstring(Str_tbl *stp, const char *str, size_t *stoff) 350 { 351 size_t stlen; 352 uint_t hashval; 353 Str_hash *sthash; 354 Str_master *mstr; 355 int i; 356 357 /* 358 * String table *must* have been previously cooked 359 */ 360 assert(stp->st_strbuf); 361 362 assert(stp->st_flags & FLG_STTAB_COOKED); 363 stlen = strlen(str); 364 /* 365 * Null string always points to head of string table 366 */ 367 if (stlen == 0) { 368 *stoff = 0; 369 return (0); 370 } 371 372 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 373 size_t _stoff; 374 375 stlen++; /* count for trailing '\0' */ 376 _stoff = stp->st_nextoff; 377 /* 378 * Have we overflowed our assigned buffer? 379 */ 380 if ((_stoff + stlen) > stp->st_fullstrsize) 381 return (-1); 382 memcpy(stp->st_strbuf + _stoff, str, stlen); 383 *stoff = _stoff; 384 stp->st_nextoff += stlen; 385 return (0); 386 } 387 388 /* 389 * Calculate reverse hash for string. 390 */ 391 hashval = HASHSEED; 392 for (i = stlen; i >= 0; i--) { 393 hashval = ((hashval << 5) + hashval) + 394 str[i]; /* h = ((h * 33) + c) */ 395 } 396 397 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash; 398 sthash = sthash->hi_next) { 399 const char *hstr; 400 401 if (sthash->hi_hashval != hashval) 402 continue; 403 404 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen - 405 sthash->hi_strlen]; 406 if (strcmp(str, hstr) == 0) 407 break; 408 } 409 410 /* 411 * Did we find the string? 412 */ 413 if (sthash == 0) 414 return (-1); 415 416 /* 417 * Has this string been copied into the string table? 418 */ 419 mstr = sthash->hi_mstr; 420 if (mstr->sm_stroff == 0) { 421 size_t mstrlen = mstr->sm_strlen + 1; 422 423 mstr->sm_stroff = stp->st_nextoff; 424 425 /* 426 * Have we overflowed our assigned buffer? 427 */ 428 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize) 429 return (-1); 430 431 (void) memcpy(stp->st_strbuf + mstr->sm_stroff, 432 mstr->sm_str, mstrlen); 433 stp->st_nextoff += mstrlen; 434 } 435 436 /* 437 * Calculate offset of (sub)string. 438 */ 439 *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen; 440 441 return (0); 442 } 443 444 445 static int 446 st_hash_insert(Str_tbl *stp, const char *str, size_t len) 447 { 448 int i; 449 uint_t hashval = HASHSEED; 450 uint_t bckcnt = stp->st_hbckcnt; 451 Str_hash **hashbcks = stp->st_hashbcks; 452 Str_hash *sthash; 453 Str_master *mstr = 0; 454 455 /* 456 * We use a classic 'Bernstein k=33' hash function. But 457 * instead of hashing from the start of the string to the 458 * end, we do it in reverse. 459 * 460 * This way - we are essentially building all of the 461 * suffix hashvalues as we go. We can check to see if 462 * any suffixes already exist in the tree as we generate 463 * the hash. 464 */ 465 for (i = len; i >= 0; i--) { 466 hashval = ((hashval << 5) + hashval) + 467 str[i]; /* h = ((h * 33) + c) */ 468 469 for (sthash = hashbcks[hashval % bckcnt]; 470 sthash; sthash = sthash->hi_next) { 471 const char *hstr; 472 Str_master *_mstr; 473 474 if (sthash->hi_hashval != hashval) 475 continue; 476 477 _mstr = sthash->hi_mstr; 478 hstr = &_mstr->sm_str[_mstr->sm_strlen - 479 sthash->hi_strlen]; 480 481 if (strcmp(&str[i], hstr)) 482 continue; 483 484 if (i == 0) { 485 /* 486 * Entry already in table, increment refcnt and 487 * get out. 488 */ 489 sthash->hi_refcnt++; 490 return (0); 491 } else { 492 /* 493 * If this 'suffix' is presently a 'master 494 * string, then take over it's record. 495 */ 496 if (sthash->hi_strlen == _mstr->sm_strlen) { 497 /* 498 * we should only do this once. 499 */ 500 assert(mstr == 0); 501 mstr = _mstr; 502 } 503 } 504 } 505 } 506 507 /* 508 * Do we need a new master string, or can we take over 509 * one we already found in the table? 510 */ 511 if (mstr == 0) { 512 /* 513 * allocate a new master string 514 */ 515 if ((mstr = calloc(sizeof (*mstr), 1)) == 0) 516 return (-1); 517 mstr->sm_next = stp->st_mstrlist; 518 stp->st_mstrlist = mstr; 519 stp->st_strsize += len + 1; 520 } else { 521 /* 522 * We are taking over a existing master string, the string size 523 * only increments by the difference between the current string 524 * and the previous master. 525 */ 526 assert(len > mstr->sm_strlen); 527 stp->st_strsize += len - mstr->sm_strlen; 528 } 529 530 if ((sthash = calloc(sizeof (*sthash), 1)) == 0) 531 return (-1); 532 533 mstr->sm_hashval = sthash->hi_hashval = hashval; 534 mstr->sm_strlen = sthash->hi_strlen = len; 535 mstr->sm_str = str; 536 sthash->hi_refcnt = 1; 537 sthash->hi_mstr = mstr; 538 539 /* 540 * Insert string element into head of hash list 541 */ 542 hashval = hashval % bckcnt; 543 sthash->hi_next = hashbcks[hashval]; 544 hashbcks[hashval] = sthash; 545 return (0); 546 } 547 548 /* 549 * Return amount of space required for the string table. 550 */ 551 size_t 552 st_getstrtab_sz(Str_tbl *stp) 553 { 554 assert(stp->st_fullstrsize > 0); 555 556 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 557 stp->st_flags |= FLG_STTAB_COOKED; 558 return (stp->st_fullstrsize); 559 } 560 561 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) { 562 LenNode *lnp; 563 void *cookie; 564 565 stp->st_flags |= FLG_STTAB_COOKED; 566 /* 567 * allocate a hash table about the size of # of 568 * strings input. 569 */ 570 stp->st_hbckcnt = findprime(stp->st_strcnt); 571 if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks), 572 stp->st_hbckcnt)) == NULL) 573 return (0); 574 575 /* 576 * We now walk all of the strings in the list, from shortest to 577 * longest, and insert them into the hashtable. 578 */ 579 if ((lnp = avl_first(stp->st_lentree)) == NULL) { 580 /* 581 * Is it possible we have an empty string table, if so, 582 * the table still contains '\0', so return the size. 583 */ 584 if (avl_numnodes(stp->st_lentree) == 0) { 585 assert(stp->st_strsize == 1); 586 return (stp->st_strsize); 587 } 588 return (0); 589 } 590 591 while (lnp) { 592 StrNode *snp; 593 594 /* 595 * Walk the string lists and insert them into the hash 596 * list. Once a string is inserted we no longer need 597 * it's entry, so the string can be freed. 598 */ 599 for (snp = avl_first(lnp->ln_strtree); snp; 600 snp = AVL_NEXT(lnp->ln_strtree, snp)) { 601 if (st_hash_insert(stp, snp->sn_str, 602 lnp->ln_strlen) == -1) 603 return (0); 604 } 605 606 /* 607 * Now that the strings have been copied, walk the 608 * StrNode tree and free all the AVL nodes. Note, 609 * avl_destroy_nodes() beats avl_remove() as the 610 * latter balances the nodes as they are removed. 611 * We just want to tear the whole thing down fast. 612 */ 613 cookie = NULL; 614 while ((snp = avl_destroy_nodes(lnp->ln_strtree, 615 &cookie)) != NULL) 616 free(snp); 617 avl_destroy(lnp->ln_strtree); 618 free(lnp->ln_strtree); 619 lnp->ln_strtree = NULL; 620 621 /* 622 * Move on to the next LenNode. 623 */ 624 lnp = AVL_NEXT(stp->st_lentree, lnp); 625 } 626 627 /* 628 * Now that all of the strings have been freed, walk the 629 * LenNode tree and free all of the AVL nodes. Note, 630 * avl_destroy_nodes() beats avl_remove() as the latter 631 * balances the nodes as they are removed. We just want to 632 * tear the whole thing down fast. 633 */ 634 cookie = NULL; 635 while ((lnp = avl_destroy_nodes(stp->st_lentree, 636 &cookie)) != NULL) 637 free(lnp); 638 avl_destroy(stp->st_lentree); 639 free(stp->st_lentree); 640 stp->st_lentree = 0; 641 } 642 643 assert(stp->st_strsize > 0); 644 assert(stp->st_fullstrsize >= stp->st_strsize); 645 646 return (stp->st_strsize); 647 } 648 649 /* 650 * Associate a buffer with a string table. 651 */ 652 const char * 653 st_getstrbuf(Str_tbl *stp) 654 { 655 return (stp->st_strbuf); 656 } 657 658 int 659 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize) 660 { 661 assert(stp->st_flags & FLG_STTAB_COOKED); 662 663 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 664 if (bufsize < stp->st_fullstrsize) 665 return (-1); 666 } else { 667 if (bufsize < stp->st_strsize) 668 return (-1); 669 } 670 671 stp->st_strbuf = stbuf; 672 #ifdef DEBUG 673 /* 674 * for debug builds - start with a stringtable filled in 675 * with '0xff'. This makes it very easy to spot unfilled 676 * holes in the strtab. 677 */ 678 memset(stbuf, 0xff, bufsize); 679 stbuf[0] = '\0'; 680 #else 681 memset(stbuf, 0x0, bufsize); 682 #endif 683 return (0); 684 } 685