1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <string_table.h> 30 #include <strings.h> 31 #include <sgs.h> 32 #include <stdio.h> 33 34 35 36 /* 37 * This file provides the interfaces to build a Str_tbl suitable 38 * for use by either the sgsmsg system or a standard ELF 39 * SHT_STRTAB. 40 * 41 * There are two modes which can be used when constructing a 42 * string table: 43 * 44 * st_new(0) 45 * standard string table - no compression. This is the 46 * traditional method and fast 47 * 48 * st_new(FLG_STNEW_COMPRESS) 49 * build a compressed string table which both 50 * eliminates duplicate strings and permits 51 * strings with common suffixes (atexit vs. exit) to 52 * overlap in the table. This provides space 53 * savings for many string tables. 54 * 55 * These string tables are now built with a common interface in a 56 * two-pass manner, the first pass it to find all of the strings 57 * required for the string-table and to calculate the size that 58 * will be required for the final string table. 59 * 60 * The second pass allocates the string table and populates the 61 * strings into the table and returns the offsets the strings 62 * have been assigned. 63 * 64 * The calling sequence to build and populate a string table is: 65 * 66 * st_new(); // initialize strtab 67 * 68 * st_insert(st1); // first pass of strings ... 69 * // calculates size required for 70 * // string table 71 * 72 * st_delstring(st?); // remove string previously 73 * // inserted 74 * 75 * st_insert(stN); 76 * 77 * st_getstrtab_sz(); // freezes strtab and computes 78 * // size of table. 79 * 80 * st_setstrbuf(); // associates a final destination 81 * // for the string table 82 * 83 * st_setstring(st1); // populate the string table 84 * ... // offsets are based off of second 85 * // pass through the string table 86 * st_setstring(stN); 87 * 88 * st_destroy(); // tear down string table 89 * // structures. 90 * 91 * String Suffix Compression Algorithm: 92 * 93 * Here's a quick high level overview of the Suffix String 94 * compression algorithm used. First - the heart of the algorithm 95 * is a Hash table list which represents a dictionary of all unique 96 * strings inserted into the string table. The hash function for 97 * this table is a standard string hash except that the hash starts 98 * at the last character in the string (&str[n - 1]) and works towards 99 * the first character in the function (&str[0]). As we compute the 100 * HASH value for a given string, we also compute the hash values 101 * for all of the possible suffix strings for that string. 102 * 103 * As we compute the hash - at each character see if the current 104 * suffix string for that hash is already present in the table. If 105 * it is, and the string is a master string. Then change that 106 * string to a suffix string of the new string being inserted. 107 * 108 * When the final hash value is found (hash for str[0...n]), check 109 * to see if it is in the hash table - if so increment the reference 110 * count for the string. If it is not yet in the table, insert a 111 * new hash table entry for a master string. 112 * 113 * The above method will find all suffixes of a given string given 114 * that the strings are inserted from shortest to longest. That is 115 * why this is a two phase method, we first collect all of the 116 * strings and store them based off of their length in a nice AVL tree. 117 * Once all of the strings have been submitted we then start the 118 * hash table build by traversing the AVL tree in order and 119 * inserting the strings from shortest to longest as described 120 * above. 121 * 122 */ 123 124 /* LINTLIBRARY */ 125 126 127 int 128 strlen_compare(const void *elem1, const void *elem2) 129 { 130 uint_t l1, l2; 131 l1 = ((Stringelem *)elem1)->se_stlen; 132 l2 = ((Stringelem *)elem2)->se_stlen; 133 134 if (l1 == l2) 135 return (0); 136 if (l2 < l1) 137 return (1); 138 139 return (-1); 140 } 141 142 /* 143 * Return a initialized Str_tbl - returns NULL on failure. 144 * 145 * stflags: 146 * 147 * FLG_STNEW_COMPRESS - build a compressed string table 148 * 149 */ 150 Str_tbl * 151 st_new(uint_t stflags) 152 { 153 Str_tbl *stp; 154 155 if ((stp = calloc(sizeof (Str_tbl), 1)) == 0) 156 return (0); 157 158 /* 159 * Start with a leading '\0' - it's tradition. 160 */ 161 stp->st_stringsize = stp->st_fullstringsize = stp->st_nextoff = 1; 162 163 /* 164 * Do we compress this string table 165 */ 166 if ((stflags & FLG_STNEW_COMPRESS) == 0) 167 return (stp); 168 169 stp->st_flags |= FLG_STTAB_COMPRESS; 170 if ((stp->st_strtree = calloc(sizeof (avl_tree_t), 1)) == 0) { 171 return (0); 172 } 173 174 avl_create(stp->st_strtree, &strlen_compare, sizeof (Stringelem), 175 SGSOFFSETOF(Stringelem, se_avlnode)); 176 177 return (stp); 178 } 179 180 /* 181 * Tear down a String_Table structure. 182 */ 183 void 184 st_destroy(Str_tbl *stp) 185 { 186 Str_hash *sthash, *psthash; 187 Str_master *mstr, *pmstr; 188 uint_t i; 189 190 /* 191 * cleanup the master strings 192 */ 193 for (mstr = stp->st_mstrlist, pmstr = 0; mstr; 194 mstr = mstr->sm_next) { 195 if (pmstr) 196 free(pmstr); 197 pmstr = mstr; 198 } 199 if (pmstr) 200 free(pmstr); 201 202 if (stp->st_hashbcks) { 203 for (i = 0; i < stp->st_hbckcnt; i++) { 204 for (sthash = stp->st_hashbcks[i], psthash = 0; 205 sthash; sthash = sthash->hi_next) { 206 if (psthash) 207 free(psthash); 208 psthash = sthash; 209 } 210 if (psthash) 211 free(psthash); 212 } 213 free(stp->st_hashbcks); 214 } 215 free(stp); 216 } 217 218 219 220 221 /* 222 * Remove a previously inserted string from the Str_tbl 223 */ 224 int 225 st_delstring(Str_tbl *stp, const char *str) 226 { 227 uint_t stlen; 228 Stringelem qstelem; 229 Stringelem *stelem; 230 Stringlist *stlist, *pstlist; 231 232 /* 233 * String table can't have been cooked 234 */ 235 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 236 237 stlen = (uint_t)strlen(str); 238 stp->st_fullstringsize -= stlen + 1; 239 240 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 241 return (0); 242 243 qstelem.se_stlen = stlen; 244 if ((stelem = avl_find(stp->st_strtree, &qstelem, 0)) == NULL) { 245 /* 246 * no strings of this length recorded, let alone 247 * this specific string - someone goofed. 248 */ 249 return (-1); 250 } 251 252 pstlist = 0; 253 for (stlist = stelem->se_strlist; stlist; stlist = stlist->sl_next) { 254 if (strcmp(str, stlist->sl_string) == 0) 255 break; 256 pstlist = stlist; 257 } 258 259 if (stlist == 0) { 260 /* 261 * string was not found 262 */ 263 return (-1); 264 } 265 266 if (pstlist == 0) { 267 /* 268 * String is first on list. 269 */ 270 stelem->se_strlist = stlist->sl_next; 271 } else { 272 /* 273 * remove string from list. 274 */ 275 pstlist->sl_next = stlist->sl_next; 276 } 277 278 free(stlist); 279 return (0); 280 } 281 282 283 /* 284 * Insert a new string into the Str_tbl 285 */ 286 int 287 st_insert(Str_tbl *stp, const char *str) 288 { 289 uint_t stlen; 290 Stringelem qstelem; 291 Stringelem *stelem; 292 Stringlist *strlist; 293 avl_index_t where; 294 295 /* 296 * String table can't have been cooked 297 */ 298 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 299 stlen = (uint_t)strlen(str); 300 /* 301 * Null strings always point to the head of the string 302 * table - no reason to keep searching. 303 */ 304 if (stlen == 0) 305 return (0); 306 307 stp->st_fullstringsize += stlen + 1; 308 stp->st_stringcnt++; 309 310 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 311 return (0); 312 313 qstelem.se_stlen = strlen(str); 314 if ((stelem = avl_find(stp->st_strtree, &qstelem, 315 &where)) == NULL) { 316 if ((stelem = calloc(sizeof (Stringelem), 1)) == 0) 317 return (-1); 318 stelem->se_stlen = qstelem.se_stlen; 319 avl_insert(stp->st_strtree, stelem, where); 320 } 321 if ((strlist = malloc(sizeof (Stringlist))) == 0) 322 return (-1); 323 324 strlist->sl_string = str; 325 strlist->sl_next = stelem->se_strlist; 326 stelem->se_strlist = strlist; 327 328 return (0); 329 } 330 331 332 /* 333 * For a given string - copy it into the buffer associated with 334 * the string table - and return the offset it has been assigned. 335 * 336 * If a value of '-1' is returned - the string was not found in 337 * the Str_tbl. 338 */ 339 int 340 st_setstring(Str_tbl *stp, const char *str, uint_t *stoff) 341 { 342 uint_t stlen; 343 uint_t hashval; 344 Str_hash *sthash; 345 Str_master *mstr; 346 int i; 347 348 /* 349 * String table *must* have been previously cooked 350 */ 351 assert(stp->st_strbuf); 352 353 assert(stp->st_flags & FLG_STTAB_COOKED); 354 stlen = (uint_t)strlen(str); 355 /* 356 * Null string always points to head of string table 357 */ 358 if (stlen == 0) { 359 *stoff = 0; 360 return (0); 361 } 362 363 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 364 uint_t _stoff; 365 366 stlen++; /* count for trailing '\0' */ 367 _stoff = stp->st_nextoff; 368 /* 369 * Have we overflowed our assigned buffer? 370 */ 371 if ((_stoff + stlen) > stp->st_fullstringsize) 372 return (-1); 373 memcpy(stp->st_strbuf + _stoff, str, stlen); 374 *stoff = _stoff; 375 stp->st_nextoff += stlen; 376 return (0); 377 } 378 379 /* 380 * Calculate reverse hash for string 381 */ 382 hashval = HASHSEED; 383 for (i = stlen; i >= 0; i--) { 384 hashval = ((hashval << 5) + hashval) + 385 str[i]; /* h = ((h * 33) + c) */ 386 } 387 388 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash; 389 sthash = sthash->hi_next) { 390 if (sthash->hi_hashval == hashval) { 391 const char *hstr; 392 393 hstr = &sthash->hi_mstr->sm_str[ 394 sthash->hi_mstr->sm_stlen - 395 sthash->hi_stlen]; 396 if (strcmp(str, hstr) == 0) { 397 break; 398 } 399 } 400 } 401 402 /* 403 * Did we find the string? 404 */ 405 if (sthash == 0) 406 return (-1); 407 408 /* 409 * Has this string been copied into the string table? 410 */ 411 mstr = sthash->hi_mstr; 412 if (mstr->sm_stoff == 0) { 413 uint_t mstlen = mstr->sm_stlen + 1; 414 mstr->sm_stoff = stp->st_nextoff; 415 /* 416 * Have we overflowed our assigned buffer? 417 */ 418 if ((mstr->sm_stoff + mstlen) > stp->st_fullstringsize) 419 return (-1); 420 memcpy(stp->st_strbuf + mstr->sm_stoff, mstr->sm_str, 421 mstlen); 422 stp->st_nextoff += mstlen; 423 } 424 /* 425 * Calculate offset of (sub)string 426 */ 427 *stoff = mstr->sm_stoff + mstr->sm_stlen - sthash->hi_stlen; 428 429 return (0); 430 } 431 432 433 static int 434 st_hash_insert(Str_tbl *stp, const char *str, uint_t stlen) 435 { 436 int i; 437 uint_t hashval = HASHSEED; 438 uint_t bckcnt = stp->st_hbckcnt; 439 Str_hash **hashbcks = stp->st_hashbcks; 440 Str_hash *sthash; 441 Str_master *mstr = 0; 442 443 /* 444 * We use a classic 'Bernstein k=33' hash function. But 445 * instead of hashing from the start of the string to the 446 * end, we do it in reverse. 447 * 448 * This way - we are essentially building all of the 449 * suffix hashvalues as we go. We can check to see if 450 * any suffixes already exist in the tree as we generate 451 * the hash. 452 */ 453 for (i = stlen; i >= 0; i--) { 454 455 hashval = ((hashval << 5) + hashval) + 456 str[i]; /* h = ((h * 33) + c) */ 457 for (sthash = hashbcks[hashval % bckcnt]; 458 sthash; sthash = sthash->hi_next) { 459 460 if (sthash->hi_hashval == hashval) { 461 const char *hstr; 462 Str_master *_mstr; 463 464 _mstr = sthash->hi_mstr; 465 hstr = &_mstr->sm_str[_mstr->sm_stlen - 466 sthash->hi_stlen]; 467 if (strcmp(&str[i], hstr) == 0) { 468 if (i == 0) { 469 /* 470 * Entry already in table, 471 * increment refcnt and get 472 * out. 473 */ 474 sthash->hi_refcnt++; 475 return (0); 476 } else { 477 /* 478 * If this 'suffix' is 479 * presently a 'master' string, 480 * then take over it's record. 481 */ 482 if (sthash->hi_stlen == 483 _mstr->sm_stlen) { 484 /* 485 * we should only do 486 * this once. 487 */ 488 assert(mstr == 0); 489 mstr = _mstr; 490 } 491 } 492 } 493 } 494 } 495 } 496 497 498 /* 499 * Do we need a new master string, or can we take over 500 * one we already found in the table? 501 */ 502 if (mstr == 0) { 503 /* 504 * allocate a new master string 505 */ 506 if ((mstr = calloc(sizeof (Str_hash), 1)) == 0) 507 return (-1); 508 mstr->sm_next = stp->st_mstrlist; 509 stp->st_mstrlist = mstr; 510 stp->st_stringsize += stlen + 1; 511 } else { 512 /* 513 * We are taking over a existing master string, 514 * the stringsize only increments by the 515 * difference between the currnet string and the 516 * previous master. 517 */ 518 assert(stlen > mstr->sm_stlen); 519 stp->st_stringsize += stlen - mstr->sm_stlen; 520 } 521 522 if ((sthash = calloc(sizeof (Str_hash), 1)) == 0) 523 return (-1); 524 525 mstr->sm_hashval = sthash->hi_hashval = hashval; 526 mstr->sm_stlen = sthash->hi_stlen = stlen; 527 mstr->sm_str = str; 528 sthash->hi_refcnt = 1; 529 sthash->hi_mstr = mstr; 530 531 /* 532 * Insert string element into head of hash list 533 */ 534 hashval = hashval % bckcnt; 535 sthash->hi_next = hashbcks[hashval]; 536 hashbcks[hashval] = sthash; 537 return (0); 538 } 539 540 /* 541 * Return amount of space required for the string table. 542 */ 543 uint_t 544 st_getstrtab_sz(Str_tbl *stp) 545 { 546 assert(stp->st_fullstringsize > 0); 547 548 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 549 stp->st_flags |= FLG_STTAB_COOKED; 550 return (stp->st_fullstringsize); 551 } 552 553 554 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) { 555 Stringelem *stelem; 556 void *cookie; 557 558 stp->st_flags |= FLG_STTAB_COOKED; 559 /* 560 * allocate a hash table about the size of # of 561 * strings input. 562 */ 563 stp->st_hbckcnt = findprime(stp->st_stringcnt); 564 if ((stp->st_hashbcks = 565 calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL) 566 return (0); 567 568 /* 569 * We now walk all of the strings in the list, 570 * from shortest to longest, and insert them into 571 * the hashtable. 572 */ 573 if ((stelem = avl_first(stp->st_strtree)) == NULL) { 574 /* 575 * Is it possible we have a empty string table, 576 * if so - the table still conains '\0' 577 * so still return the size. 578 */ 579 if (avl_numnodes(stp->st_strtree) == 0) { 580 assert(stp->st_stringsize == 1); 581 return (stp->st_stringsize); 582 } 583 return (0); 584 } 585 while (stelem) { 586 Stringlist *strlist, *pstrlist; 587 588 /* 589 * Walk the string lists and insert them 590 * into the hash list. Once a string is 591 * inserted we no longer need it's entry, 592 * so free it 593 */ 594 for (strlist = stelem->se_strlist, pstrlist = 0; 595 strlist; strlist = strlist->sl_next) { 596 if (st_hash_insert(stp, strlist->sl_string, 597 stelem->se_stlen) == -1) 598 return (0); 599 if (pstrlist) 600 free(pstrlist); 601 } 602 free(pstrlist); 603 stelem->se_strlist = 0; 604 stelem = AVL_NEXT(stp->st_strtree, stelem); 605 } 606 607 /* 608 * Now that all of the strings have been freed, 609 * go ahead and quickly re-walk the AVL tree and 610 * free all of the AVL nodes. 611 * 612 * avl_destroy_nodes() beats avl_remove() because 613 * avl_remove will 'ballance' the tree as nodes 614 * are deleted - we just want to tear the whole 615 * thing down now. 616 */ 617 cookie = NULL; 618 while ((stelem = avl_destroy_nodes(stp->st_strtree, 619 &cookie)) != NULL) 620 free(stelem); 621 avl_destroy(stp->st_strtree); 622 free(stp->st_strtree); 623 stp->st_strtree = 0; 624 } 625 626 assert(stp->st_stringsize > 0); 627 assert(stp->st_fullstringsize >= stp->st_stringsize); 628 629 return (stp->st_stringsize); 630 } 631 632 /* 633 * Associate a buffer with the string table. 634 */ 635 const char * 636 st_getstrbuf(Str_tbl *stp) 637 { 638 return (stp->st_strbuf); 639 } 640 641 int 642 st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize) 643 { 644 assert(stp->st_flags & FLG_STTAB_COOKED); 645 646 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 647 if (bufsize < stp->st_fullstringsize) 648 return (-1); 649 } else { 650 if (bufsize < stp->st_stringsize) 651 return (-1); 652 } 653 654 stp->st_strbuf = stbuf; 655 #ifdef DEBUG 656 /* 657 * for debug builds - start with a stringtable filled in 658 * with '0xff'. This makes it very easy to find wholes 659 * which we failed to fill in - in the strtab. 660 */ 661 memset(stbuf, 0xff, bufsize); 662 stbuf[0] = '\0'; 663 #else 664 memset(stbuf, 0x0, bufsize); 665 #endif 666 return (0); 667 } 668