1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2003 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <string_table.h> 30 #include <stdlib.h> 31 #include <strings.h> 32 #include <sgs.h> 33 #include <stdio.h> 34 35 36 37 /* 38 * This file provides the interfaces to build a Str_tbl suitable 39 * for use by either the sgsmsg system or a standard ELF 40 * SHT_STRTAB. 41 * 42 * There are two modes which can be used when constructing a 43 * string table: 44 * 45 * st_new(0) 46 * standard string table - no compression. This is the 47 * traditional method and fast 48 * 49 * st_new(FLG_STNEW_COMPRESS) 50 * build a compressed string table which both 51 * eliminates duplicate strings and permits 52 * strings with common suffixes (atexit vs. exit) to 53 * overlap in the table. This provides space 54 * savings for many string tables. 55 * 56 * These string tables are now built with a common interface in a 57 * two-pass manner, the first pass it to find all of the strings 58 * required for the string-table and to calculate the size that 59 * will be required for the final string table. 60 * 61 * The second pass allocates the string table and populates the 62 * strings into the table and returns the offsets the strings 63 * have been assigned. 64 * 65 * The calling sequence to build and populate a string table is: 66 * 67 * st_new(); // initialize strtab 68 * 69 * st_insert(st1); // first pass of strings ... 70 * // calculates size required for 71 * // string table 72 * 73 * st_delstring(st?); // remove string previously 74 * // inserted 75 * 76 * st_insert(stN); 77 * 78 * st_getstrtab_sz(); // freezes strtab and computes 79 * // size of table. 80 * 81 * st_setstrbuf(); // associates a final destination 82 * // for the string table 83 * 84 * st_setstring(st1); // populate the string table 85 * ... // offsets are based off of second 86 * // pass through the string table 87 * st_setstring(stN); 88 * 89 * st_destroy(); // tear down string table 90 * // structures. 91 * 92 * String Suffix Compression Algorithm: 93 * 94 * Here's a quick high level overview of the Suffix String 95 * compression algorithm used. First - the heart of the algorithm 96 * is a Hash table list which represents a dictionary of all unique 97 * strings inserted into the string table. The hash function for 98 * this table is a standard string hash except that the hash starts 99 * at the last character in the string (&str[n - 1]) and works towards 100 * the first character in the function (&str[0]). As we compute the 101 * HASH value for a given string, we also compute the hash values 102 * for all of the possible suffix strings for that string. 103 * 104 * As we compute the hash - at each character see if the current 105 * suffix string for that hash is already present in the table. If 106 * it is, and the string is a master string. Then change that 107 * string to a suffix string of the new string being inserted. 108 * 109 * When the final hash value is found (hash for str[0...n]), check 110 * to see if it is in the hash table - if so increment the reference 111 * count for the string. If it is not yet in the table, insert a 112 * new hash table entry for a master string. 113 * 114 * The above method will find all suffixes of a given string given 115 * that the strings are inserted from shortest to longest. That is 116 * why this is a two phase method, we first collect all of the 117 * strings and store them based off of their length in a nice AVL tree. 118 * Once all of the strings have been submitted we then start the 119 * hash table build by traversing the AVL tree in order and 120 * inserting the strings from shortest to longest as described 121 * above. 122 * 123 */ 124 125 /* LINTLIBRARY */ 126 127 128 int 129 strlen_compare(const void *elem1, const void *elem2) 130 { 131 uint_t l1, l2; 132 l1 = ((Stringelem *)elem1)->se_stlen; 133 l2 = ((Stringelem *)elem2)->se_stlen; 134 135 if (l1 == l2) 136 return (0); 137 if (l2 < l1) 138 return (1); 139 140 return (-1); 141 } 142 143 /* 144 * Return a initialized Str_tbl - returns NULL on failure. 145 * 146 * stflags: 147 * 148 * FLG_STNEW_COMPRESS - build a compressed string table 149 * 150 */ 151 Str_tbl * 152 st_new(uint_t stflags) 153 { 154 Str_tbl *stp; 155 156 if ((stp = calloc(sizeof (Str_tbl), 1)) == 0) 157 return (0); 158 159 /* 160 * Start with a leading '\0' - it's tradition. 161 */ 162 stp->st_stringsize = stp->st_fullstringsize = stp->st_nextoff = 1; 163 164 /* 165 * Do we compress this string table 166 */ 167 if ((stflags & FLG_STNEW_COMPRESS) == 0) 168 return (stp); 169 170 stp->st_flags |= FLG_STTAB_COMPRESS; 171 if ((stp->st_strtree = calloc(sizeof (avl_tree_t), 1)) == 0) { 172 return (0); 173 } 174 175 avl_create(stp->st_strtree, &strlen_compare, sizeof (Stringelem), 176 SGSOFFSETOF(Stringelem, se_avlnode)); 177 178 return (stp); 179 } 180 181 /* 182 * Tear down a String_Table structure. 183 */ 184 void 185 st_destroy(Str_tbl *stp) 186 { 187 Str_hash *sthash, *psthash; 188 Str_master *mstr, *pmstr; 189 uint_t i; 190 191 /* 192 * cleanup the master strings 193 */ 194 for (mstr = stp->st_mstrlist, pmstr = 0; mstr; 195 mstr = mstr->sm_next) { 196 if (pmstr) 197 free(pmstr); 198 pmstr = mstr; 199 } 200 if (pmstr) 201 free(pmstr); 202 203 if (stp->st_hashbcks) { 204 for (i = 0; i < stp->st_hbckcnt; i++) { 205 for (sthash = stp->st_hashbcks[i], psthash = 0; 206 sthash; sthash = sthash->hi_next) { 207 if (psthash) 208 free(psthash); 209 psthash = sthash; 210 } 211 if (psthash) 212 free(psthash); 213 } 214 free(stp->st_hashbcks); 215 } 216 free(stp); 217 } 218 219 220 221 222 /* 223 * Remove a previously inserted string from the Str_tbl 224 */ 225 int 226 st_delstring(Str_tbl *stp, const char *str) 227 { 228 uint_t stlen; 229 Stringelem qstelem; 230 Stringelem *stelem; 231 Stringlist *stlist, *pstlist; 232 233 /* 234 * String table can't have been cooked 235 */ 236 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 237 238 stlen = (uint_t)strlen(str); 239 stp->st_fullstringsize -= stlen + 1; 240 241 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 242 return (0); 243 244 qstelem.se_stlen = stlen; 245 if ((stelem = avl_find(stp->st_strtree, &qstelem, 0)) == NULL) { 246 /* 247 * no strings of this length recorded, let alone 248 * this specific string - someone goofed. 249 */ 250 return (-1); 251 } 252 253 pstlist = 0; 254 for (stlist = stelem->se_strlist; stlist; stlist = stlist->sl_next) { 255 if (strcmp(str, stlist->sl_string) == 0) 256 break; 257 pstlist = stlist; 258 } 259 260 if (stlist == 0) { 261 /* 262 * string was not found 263 */ 264 return (-1); 265 } 266 267 if (pstlist == 0) { 268 /* 269 * String is first on list. 270 */ 271 stelem->se_strlist = stlist->sl_next; 272 } else { 273 /* 274 * remove string from list. 275 */ 276 pstlist->sl_next = stlist->sl_next; 277 } 278 279 free(stlist); 280 return (0); 281 } 282 283 284 /* 285 * Insert a new string into the Str_tbl 286 */ 287 int 288 st_insert(Str_tbl *stp, const char *str) 289 { 290 uint_t stlen; 291 Stringelem qstelem; 292 Stringelem *stelem; 293 Stringlist *strlist; 294 avl_index_t where; 295 296 /* 297 * String table can't have been cooked 298 */ 299 assert((stp->st_flags & FLG_STTAB_COOKED) == 0); 300 stlen = (uint_t)strlen(str); 301 /* 302 * Null strings always point to the head of the string 303 * table - no reason to keep searching. 304 */ 305 if (stlen == 0) 306 return (0); 307 308 stp->st_fullstringsize += stlen + 1; 309 stp->st_stringcnt++; 310 311 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) 312 return (0); 313 314 qstelem.se_stlen = strlen(str); 315 if ((stelem = avl_find(stp->st_strtree, &qstelem, 316 &where)) == NULL) { 317 if ((stelem = calloc(sizeof (Stringelem), 1)) == 0) 318 return (-1); 319 stelem->se_stlen = qstelem.se_stlen; 320 avl_insert(stp->st_strtree, stelem, where); 321 } 322 if ((strlist = malloc(sizeof (Stringlist))) == 0) 323 return (-1); 324 325 strlist->sl_string = str; 326 strlist->sl_next = stelem->se_strlist; 327 stelem->se_strlist = strlist; 328 329 return (0); 330 } 331 332 333 /* 334 * For a given string - copy it into the buffer associated with 335 * the string table - and return the offset it has been assigned. 336 * 337 * If a value of '-1' is returned - the string was not found in 338 * the Str_tbl. 339 */ 340 int 341 st_setstring(Str_tbl *stp, const char *str, uint_t *stoff) 342 { 343 uint_t stlen; 344 uint_t hashval; 345 Str_hash *sthash; 346 Str_master *mstr; 347 int i; 348 349 /* 350 * String table *must* have been previously cooked 351 */ 352 assert(stp->st_strbuf); 353 354 assert(stp->st_flags & FLG_STTAB_COOKED); 355 stlen = (uint_t)strlen(str); 356 /* 357 * Null string always points to head of string table 358 */ 359 if (stlen == 0) { 360 *stoff = 0; 361 return (0); 362 } 363 364 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 365 uint_t _stoff; 366 367 stlen++; /* count for trailing '\0' */ 368 _stoff = stp->st_nextoff; 369 /* 370 * Have we overflowed our assigned buffer? 371 */ 372 if ((_stoff + stlen) > stp->st_fullstringsize) 373 return (-1); 374 memcpy(stp->st_strbuf + _stoff, str, stlen); 375 *stoff = _stoff; 376 stp->st_nextoff += stlen; 377 return (0); 378 } 379 380 /* 381 * Calculate reverse hash for string 382 */ 383 hashval = HASHSEED; 384 for (i = stlen; i >= 0; i--) { 385 hashval = ((hashval << 5) + hashval) + 386 str[i]; /* h = ((h * 33) + c) */ 387 } 388 389 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash; 390 sthash = sthash->hi_next) { 391 if (sthash->hi_hashval == hashval) { 392 const char *hstr; 393 394 hstr = &sthash->hi_mstr->sm_str[ 395 sthash->hi_mstr->sm_stlen - 396 sthash->hi_stlen]; 397 if (strcmp(str, hstr) == 0) { 398 break; 399 } 400 } 401 } 402 403 /* 404 * Did we find the string? 405 */ 406 if (sthash == 0) 407 return (-1); 408 409 /* 410 * Has this string been copied into the string table? 411 */ 412 mstr = sthash->hi_mstr; 413 if (mstr->sm_stoff == 0) { 414 uint_t mstlen = mstr->sm_stlen + 1; 415 mstr->sm_stoff = stp->st_nextoff; 416 /* 417 * Have we overflowed our assigned buffer? 418 */ 419 if ((mstr->sm_stoff + mstlen) > stp->st_fullstringsize) 420 return (-1); 421 memcpy(stp->st_strbuf + mstr->sm_stoff, mstr->sm_str, 422 mstlen); 423 stp->st_nextoff += mstlen; 424 } 425 /* 426 * Calculate offset of (sub)string 427 */ 428 *stoff = mstr->sm_stoff + mstr->sm_stlen - sthash->hi_stlen; 429 430 return (0); 431 } 432 433 434 static int 435 st_hash_insert(Str_tbl *stp, const char *str, uint_t stlen) 436 { 437 int i; 438 uint_t hashval = HASHSEED; 439 uint_t bckcnt = stp->st_hbckcnt; 440 Str_hash **hashbcks = stp->st_hashbcks; 441 Str_hash *sthash; 442 Str_master *mstr = 0; 443 444 /* 445 * We use a classic 'Bernstein k=33' hash function. But 446 * instead of hashing from the start of the string to the 447 * end, we do it in reverse. 448 * 449 * This way - we are essentially building all of the 450 * suffix hashvalues as we go. We can check to see if 451 * any suffixes already exist in the tree as we generate 452 * the hash. 453 */ 454 for (i = stlen; i >= 0; i--) { 455 456 hashval = ((hashval << 5) + hashval) + 457 str[i]; /* h = ((h * 33) + c) */ 458 for (sthash = hashbcks[hashval % bckcnt]; 459 sthash; sthash = sthash->hi_next) { 460 461 if (sthash->hi_hashval == hashval) { 462 const char *hstr; 463 Str_master *_mstr; 464 465 _mstr = sthash->hi_mstr; 466 hstr = &_mstr->sm_str[_mstr->sm_stlen - 467 sthash->hi_stlen]; 468 if (strcmp(&str[i], hstr) == 0) { 469 if (i == 0) { 470 /* 471 * Entry already in table, 472 * increment refcnt and get 473 * out. 474 */ 475 sthash->hi_refcnt++; 476 return (0); 477 } else { 478 /* 479 * If this 'suffix' is 480 * presently a 'master' string, 481 * then take over it's record. 482 */ 483 if (sthash->hi_stlen == 484 _mstr->sm_stlen) { 485 /* 486 * we should only do 487 * this once. 488 */ 489 assert(mstr == 0); 490 mstr = _mstr; 491 } 492 } 493 } 494 } 495 } 496 } 497 498 499 /* 500 * Do we need a new master string, or can we take over 501 * one we already found in the table? 502 */ 503 if (mstr == 0) { 504 /* 505 * allocate a new master string 506 */ 507 if ((mstr = calloc(sizeof (Str_hash), 1)) == 0) 508 return (-1); 509 mstr->sm_next = stp->st_mstrlist; 510 stp->st_mstrlist = mstr; 511 stp->st_stringsize += stlen + 1; 512 } else { 513 /* 514 * We are taking over a existing master string, 515 * the stringsize only increments by the 516 * difference between the currnet string and the 517 * previous master. 518 */ 519 assert(stlen > mstr->sm_stlen); 520 stp->st_stringsize += stlen - mstr->sm_stlen; 521 } 522 523 if ((sthash = calloc(sizeof (Str_hash), 1)) == 0) 524 return (-1); 525 526 mstr->sm_hashval = sthash->hi_hashval = hashval; 527 mstr->sm_stlen = sthash->hi_stlen = stlen; 528 mstr->sm_str = str; 529 sthash->hi_refcnt = 1; 530 sthash->hi_mstr = mstr; 531 532 /* 533 * Insert string element into head of hash list 534 */ 535 hashval = hashval % bckcnt; 536 sthash->hi_next = hashbcks[hashval]; 537 hashbcks[hashval] = sthash; 538 return (0); 539 } 540 541 /* 542 * Return amount of space required for the string table. 543 */ 544 uint_t 545 st_getstrtab_sz(Str_tbl *stp) 546 { 547 assert(stp->st_fullstringsize > 0); 548 549 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 550 stp->st_flags |= FLG_STTAB_COOKED; 551 return (stp->st_fullstringsize); 552 } 553 554 555 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) { 556 Stringelem *stelem; 557 void *cookie; 558 559 stp->st_flags |= FLG_STTAB_COOKED; 560 /* 561 * allocate a hash table about the size of # of 562 * strings input. 563 */ 564 stp->st_hbckcnt = findprime(stp->st_stringcnt); 565 if ((stp->st_hashbcks = 566 calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL) 567 return (0); 568 569 /* 570 * We now walk all of the strings in the list, 571 * from shortest to longest, and insert them into 572 * the hashtable. 573 */ 574 if ((stelem = avl_first(stp->st_strtree)) == NULL) { 575 /* 576 * Is it possible we have a empty string table, 577 * if so - the table still conains '\0' 578 * so still return the size. 579 */ 580 if (avl_numnodes(stp->st_strtree) == 0) { 581 assert(stp->st_stringsize == 1); 582 return (stp->st_stringsize); 583 } 584 return (0); 585 } 586 while (stelem) { 587 Stringlist *strlist, *pstrlist; 588 589 /* 590 * Walk the string lists and insert them 591 * into the hash list. Once a string is 592 * inserted we no longer need it's entry, 593 * so free it 594 */ 595 for (strlist = stelem->se_strlist, pstrlist = 0; 596 strlist; strlist = strlist->sl_next) { 597 if (st_hash_insert(stp, strlist->sl_string, 598 stelem->se_stlen) == -1) 599 return (0); 600 if (pstrlist) 601 free(pstrlist); 602 } 603 free(pstrlist); 604 stelem->se_strlist = 0; 605 stelem = AVL_NEXT(stp->st_strtree, stelem); 606 } 607 608 /* 609 * Now that all of the strings have been freed, 610 * go ahead and quickly re-walk the AVL tree and 611 * free all of the AVL nodes. 612 * 613 * avl_destroy_nodes() beats avl_remove() because 614 * avl_remove will 'ballance' the tree as nodes 615 * are deleted - we just want to tear the whole 616 * thing down now. 617 */ 618 cookie = NULL; 619 while ((stelem = avl_destroy_nodes(stp->st_strtree, 620 &cookie)) != NULL) 621 free(stelem); 622 avl_destroy(stp->st_strtree); 623 free(stp->st_strtree); 624 stp->st_strtree = 0; 625 } 626 627 assert(stp->st_stringsize > 0); 628 assert(stp->st_fullstringsize >= stp->st_stringsize); 629 630 return (stp->st_stringsize); 631 } 632 633 /* 634 * Associate a buffer with the string table. 635 */ 636 const char * 637 st_getstrbuf(Str_tbl *stp) 638 { 639 return (stp->st_strbuf); 640 } 641 642 int 643 st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize) 644 { 645 assert(stp->st_flags & FLG_STTAB_COOKED); 646 647 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { 648 if (bufsize < stp->st_fullstringsize) 649 return (-1); 650 } else { 651 if (bufsize < stp->st_stringsize) 652 return (-1); 653 } 654 655 stp->st_strbuf = stbuf; 656 #ifdef DEBUG 657 /* 658 * for debug builds - start with a stringtable filled in 659 * with '0xff'. This makes it very easy to find wholes 660 * which we failed to fill in - in the strtab. 661 */ 662 memset(stbuf, 0xff, bufsize); 663 stbuf[0] = '\0'; 664 #else 665 memset(stbuf, 0x0, bufsize); 666 #endif 667 return (0); 668 } 669