1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <_string_table.h>
28 #include <strings.h>
29 #include <sgs.h>
30 #include <stdio.h>
31
32 /*
33 * This file provides the interfaces to build a Str_tbl suitable for use by
34 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
35 * as created by ld(1).
36 *
37 * There are two modes which can be used when constructing a string table:
38 *
39 * st_new(0)
40 * standard string table - no compression. This is the
41 * traditional, fast method.
42 *
43 * st_new(FLG_STTAB_COMPRESS)
44 * builds a compressed string table which both eliminates
45 * duplicate strings, and permits strings with common suffixes
46 * (atexit vs. exit) to overlap in the table. This provides space
47 * savings for many string tables. Although more work than the
48 * traditional method, the algorithms used are designed to scale
49 * and keep any overhead at a minimum.
50 *
51 * These string tables are built with a common interface in a two-pass manner.
52 * The first pass finds all of the strings required for the string-table and
53 * calculates the size required for the final string table.
54 *
55 * The second pass allocates the string table, populates the strings into the
56 * table and returns the offsets the strings have been assigned.
57 *
58 * The calling sequence to build and populate a string table is:
59 *
60 * st_new(); // initialize strtab
61 *
62 * st_insert(st1); // first pass of strings ...
63 * // calculates size required for
64 * // string table
65 *
66 * st_delstring(st?); // remove string previously
67 * // inserted
68 * st_insert(stN);
69 *
70 * st_getstrtab_sz(); // freezes strtab and computes
71 * // size of table.
72 *
73 * st_setstrbuf(); // associates a final destination
74 * // for the string table
75 *
76 * st_setstring(st1); // populate the string table
77 * ... // offsets are based off of second
78 * // pass through the string table
79 * st_setstring(stN);
80 *
81 * st_destroy(); // tear down string table
82 * // structures.
83 *
84 * String Suffix Compression Algorithm:
85 *
86 * Here's a quick high level overview of the Suffix String
87 * compression algorithm used. First - the heart of the algorithm
88 * is a Hash table list which represents a dictionary of all unique
89 * strings inserted into the string table. The hash function for
90 * this table is a standard string hash except that the hash starts
 *    at the end of the string (the terminating '\0' at &str[n]) and works
 *    towards the first character in the string (&str[0]).  As we compute the
93 * HASH value for a given string, we also compute the hash values
94 * for all of the possible suffix strings for that string.
95 *
 *    As we compute the hash - at each character see if the current
 *    suffix string for that hash is already present in the table.  If
 *    it is, and that string is a master string, then change that
 *    string to a suffix string of the new string being inserted.
100 *
101 * When the final hash value is found (hash for str[0...n]), check
102 * to see if it is in the hash table - if so increment the reference
103 * count for the string. If it is not yet in the table, insert a
104 * new hash table entry for a master string.
105 *
106 * The above method will find all suffixes of a given string given
107 * that the strings are inserted from shortest to longest. That is
108 * why this is a two phase method, we first collect all of the
109 * strings and store them based off of their length in an AVL tree.
110 * Once all of the strings have been submitted we then start the
111 * hash table build by traversing the AVL tree in order and
112 * inserting the strings from shortest to longest as described
113 * above.
114 */
115
116 /* LINTLIBRARY */
117
118 static int
avl_len_compare(const void * n1,const void * n2)119 avl_len_compare(const void *n1, const void *n2)
120 {
121 size_t len1, len2;
122
123 len1 = ((LenNode *)n1)->ln_strlen;
124 len2 = ((LenNode *)n2)->ln_strlen;
125
126 if (len1 == len2)
127 return (0);
128 if (len2 < len1)
129 return (1);
130 return (-1);
131 }
132
133 static int
avl_str_compare(const void * n1,const void * n2)134 avl_str_compare(const void *n1, const void *n2)
135 {
136 const char *str1, *str2;
137 int rc;
138
139 str1 = ((StrNode *)n1)->sn_str;
140 str2 = ((StrNode *)n2)->sn_str;
141
142 rc = strcmp(str1, str2);
143 if (rc > 0)
144 return (1);
145 if (rc < 0)
146 return (-1);
147 return (0);
148 }
149
150 /*
151 * Return an initialized Str_tbl - returns NULL on failure.
152 *
153 * flags:
154 * FLG_STTAB_COMPRESS - build a compressed string table
155 */
156 Str_tbl *
st_new(uint_t flags)157 st_new(uint_t flags)
158 {
159 Str_tbl *stp;
160
161 if ((stp = calloc(sizeof (*stp), 1)) == NULL)
162 return (NULL);
163
164 /*
165 * Start with a leading '\0' - it's tradition.
166 */
167 stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
168
169 /*
170 * Do we compress this string table?
171 */
172 stp->st_flags = flags;
173 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
174 return (stp);
175
176 if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL)
177 return (NULL);
178
179 avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
180 SGSOFFSETOF(LenNode, ln_avlnode));
181
182 return (stp);
183 }
184
185 /*
186 * Insert a new string into the Str_tbl. There are two AVL trees used.
187 *
188 * - The first LenNode AVL tree maintains a tree of nodes based on string
189 * sizes.
190 * - Each LenNode maintains a StrNode AVL tree for each string. Large
191 * applications have been known to contribute thousands of strings of
192 * the same size. Should strings need to be removed (-z ignore), then
193 * the string AVL tree makes this removal efficient and scalable.
194 */
195 int
st_insert(Str_tbl * stp,const char * str)196 st_insert(Str_tbl *stp, const char *str)
197 {
198 size_t len;
199 StrNode *snp, sn = { 0 };
200 LenNode *lnp, ln = { 0 };
201 avl_index_t where;
202
203 /*
204 * String table can't have been cooked
205 */
206 assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
207
208 /*
209 * Null strings always point to the head of the string
210 * table - no reason to keep searching.
211 */
212 if ((len = strlen(str)) == 0)
213 return (0);
214
215 stp->st_fullstrsize += len + 1;
216 stp->st_strcnt++;
217
218 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
219 return (0);
220
221 /*
222 * From the controlling string table, determine which LenNode AVL node
223 * provides for this string length. If the node doesn't exist, insert
224 * a new node to represent this string length.
225 */
226 ln.ln_strlen = len;
227 if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
228 if ((lnp = calloc(sizeof (*lnp), 1)) == NULL)
229 return (-1);
230 lnp->ln_strlen = len;
231 avl_insert(stp->st_lentree, lnp, where);
232
233 if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) ==
234 NULL)
235 return (0);
236
237 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
238 SGSOFFSETOF(StrNode, sn_avlnode));
239 }
240
241 /*
242 * From the string length AVL node determine whether a StrNode AVL node
243 * provides this string. If the node doesn't exist, insert a new node
244 * to represent this string.
245 */
246 sn.sn_str = str;
247 if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
248 if ((snp = calloc(sizeof (*snp), 1)) == NULL)
249 return (-1);
250 snp->sn_str = str;
251 avl_insert(lnp->ln_strtree, snp, where);
252 }
253 snp->sn_refcnt++;
254
255 return (0);
256 }
257
258 /*
259 * Remove a previously inserted string from the Str_tbl.
260 */
261 int
st_delstring(Str_tbl * stp,const char * str)262 st_delstring(Str_tbl *stp, const char *str)
263 {
264 size_t len;
265 LenNode *lnp, ln = { 0 };
266 StrNode *snp, sn = { 0 };
267
268 /*
269 * String table can't have been cooked
270 */
271 assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
272
273 len = strlen(str);
274 stp->st_fullstrsize -= len + 1;
275
276 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
277 return (0);
278
279 /*
280 * Determine which LenNode AVL node provides for this string length.
281 */
282 ln.ln_strlen = len;
283 if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
284 sn.sn_str = str;
285 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
286 /*
287 * Reduce the reference count, and if zero remove the
288 * node.
289 */
290 if (--snp->sn_refcnt == 0)
291 avl_remove(lnp->ln_strtree, snp);
292 return (0);
293 }
294 }
295
296 /*
297 * No strings of this length, or no string itself - someone goofed.
298 */
299 return (-1);
300 }
301
302 /*
303 * Tear down a String_Table structure.
304 */
305 void
st_destroy(Str_tbl * stp)306 st_destroy(Str_tbl *stp)
307 {
308 Str_hash *sthash, *psthash;
309 Str_master *mstr, *pmstr;
310 uint_t i;
311
312 /*
313 * cleanup the master strings
314 */
315 for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
316 mstr = mstr->sm_next) {
317 if (pmstr)
318 free(pmstr);
319 pmstr = mstr;
320 }
321 if (pmstr)
322 free(pmstr);
323
324 if (stp->st_hashbcks) {
325 for (i = 0; i < stp->st_hbckcnt; i++) {
326 for (sthash = stp->st_hashbcks[i], psthash = 0;
327 sthash; sthash = sthash->hi_next) {
328 if (psthash)
329 free(psthash);
330 psthash = sthash;
331 }
332 if (psthash)
333 free(psthash);
334 }
335 free(stp->st_hashbcks);
336 }
337 free(stp);
338 }
339
340
341 /*
342 * For a given string - copy it into the buffer associated with
343 * the string table - and return the offset it has been assigned.
344 *
345 * If a value of '-1' is returned - the string was not found in
346 * the Str_tbl.
347 */
348 int
st_setstring(Str_tbl * stp,const char * str,size_t * stoff)349 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
350 {
351 size_t stlen;
352 uint_t hashval;
353 Str_hash *sthash;
354 Str_master *mstr;
355 int i;
356
357 /*
358 * String table *must* have been previously cooked
359 */
360 assert(stp->st_strbuf);
361
362 assert(stp->st_flags & FLG_STTAB_COOKED);
363 stlen = strlen(str);
364 /*
365 * Null string always points to head of string table
366 */
367 if (stlen == 0) {
368 *stoff = 0;
369 return (0);
370 }
371
372 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
373 size_t _stoff;
374
375 stlen++; /* count for trailing '\0' */
376 _stoff = stp->st_nextoff;
377 /*
378 * Have we overflowed our assigned buffer?
379 */
380 if ((_stoff + stlen) > stp->st_fullstrsize)
381 return (-1);
382 memcpy(stp->st_strbuf + _stoff, str, stlen);
383 *stoff = _stoff;
384 stp->st_nextoff += stlen;
385 return (0);
386 }
387
388 /*
389 * Calculate reverse hash for string.
390 */
391 hashval = HASHSEED;
392 for (i = stlen; i >= 0; i--) {
393 hashval = ((hashval << 5) + hashval) +
394 str[i]; /* h = ((h * 33) + c) */
395 }
396
397 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
398 sthash = sthash->hi_next) {
399 const char *hstr;
400
401 if (sthash->hi_hashval != hashval)
402 continue;
403
404 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
405 sthash->hi_strlen];
406 if (strcmp(str, hstr) == 0)
407 break;
408 }
409
410 /*
411 * Did we find the string?
412 */
413 if (sthash == 0)
414 return (-1);
415
416 /*
417 * Has this string been copied into the string table?
418 */
419 mstr = sthash->hi_mstr;
420 if (mstr->sm_stroff == 0) {
421 size_t mstrlen = mstr->sm_strlen + 1;
422
423 mstr->sm_stroff = stp->st_nextoff;
424
425 /*
426 * Have we overflowed our assigned buffer?
427 */
428 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
429 return (-1);
430
431 (void) memcpy(stp->st_strbuf + mstr->sm_stroff,
432 mstr->sm_str, mstrlen);
433 stp->st_nextoff += mstrlen;
434 }
435
436 /*
437 * Calculate offset of (sub)string.
438 */
439 *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
440
441 return (0);
442 }
443
444
445 static int
st_hash_insert(Str_tbl * stp,const char * str,size_t len)446 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
447 {
448 int i;
449 uint_t hashval = HASHSEED;
450 uint_t bckcnt = stp->st_hbckcnt;
451 Str_hash **hashbcks = stp->st_hashbcks;
452 Str_hash *sthash;
453 Str_master *mstr = 0;
454
455 /*
456 * We use a classic 'Bernstein k=33' hash function. But
457 * instead of hashing from the start of the string to the
458 * end, we do it in reverse.
459 *
460 * This way - we are essentially building all of the
461 * suffix hashvalues as we go. We can check to see if
462 * any suffixes already exist in the tree as we generate
463 * the hash.
464 */
465 for (i = len; i >= 0; i--) {
466 hashval = ((hashval << 5) + hashval) +
467 str[i]; /* h = ((h * 33) + c) */
468
469 for (sthash = hashbcks[hashval % bckcnt];
470 sthash; sthash = sthash->hi_next) {
471 const char *hstr;
472 Str_master *_mstr;
473
474 if (sthash->hi_hashval != hashval)
475 continue;
476
477 _mstr = sthash->hi_mstr;
478 hstr = &_mstr->sm_str[_mstr->sm_strlen -
479 sthash->hi_strlen];
480
481 if (strcmp(&str[i], hstr))
482 continue;
483
484 if (i == 0) {
485 /*
486 * Entry already in table, increment refcnt and
487 * get out.
488 */
489 sthash->hi_refcnt++;
490 return (0);
491 } else {
492 /*
493 * If this 'suffix' is presently a 'master
494 * string, then take over it's record.
495 */
496 if (sthash->hi_strlen == _mstr->sm_strlen) {
497 /*
498 * we should only do this once.
499 */
500 assert(mstr == 0);
501 mstr = _mstr;
502 }
503 }
504 }
505 }
506
507 /*
508 * Do we need a new master string, or can we take over
509 * one we already found in the table?
510 */
511 if (mstr == 0) {
512 /*
513 * allocate a new master string
514 */
515 if ((mstr = calloc(sizeof (*mstr), 1)) == 0)
516 return (-1);
517 mstr->sm_next = stp->st_mstrlist;
518 stp->st_mstrlist = mstr;
519 stp->st_strsize += len + 1;
520 } else {
521 /*
522 * We are taking over a existing master string, the string size
523 * only increments by the difference between the current string
524 * and the previous master.
525 */
526 assert(len > mstr->sm_strlen);
527 stp->st_strsize += len - mstr->sm_strlen;
528 }
529
530 if ((sthash = calloc(sizeof (*sthash), 1)) == 0)
531 return (-1);
532
533 mstr->sm_hashval = sthash->hi_hashval = hashval;
534 mstr->sm_strlen = sthash->hi_strlen = len;
535 mstr->sm_str = str;
536 sthash->hi_refcnt = 1;
537 sthash->hi_mstr = mstr;
538
539 /*
540 * Insert string element into head of hash list
541 */
542 hashval = hashval % bckcnt;
543 sthash->hi_next = hashbcks[hashval];
544 hashbcks[hashval] = sthash;
545 return (0);
546 }
547
548 /*
549 * Return amount of space required for the string table.
550 */
551 size_t
st_getstrtab_sz(Str_tbl * stp)552 st_getstrtab_sz(Str_tbl *stp)
553 {
554 assert(stp->st_fullstrsize > 0);
555
556 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
557 stp->st_flags |= FLG_STTAB_COOKED;
558 return (stp->st_fullstrsize);
559 }
560
561 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
562 LenNode *lnp;
563 void *cookie;
564
565 stp->st_flags |= FLG_STTAB_COOKED;
566 /*
567 * allocate a hash table about the size of # of
568 * strings input.
569 */
570 stp->st_hbckcnt = findprime(stp->st_strcnt);
571 if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks),
572 stp->st_hbckcnt)) == NULL)
573 return (0);
574
575 /*
576 * We now walk all of the strings in the list, from shortest to
577 * longest, and insert them into the hashtable.
578 */
579 if ((lnp = avl_first(stp->st_lentree)) == NULL) {
580 /*
581 * Is it possible we have an empty string table, if so,
582 * the table still contains '\0', so return the size.
583 */
584 if (avl_numnodes(stp->st_lentree) == 0) {
585 assert(stp->st_strsize == 1);
586 return (stp->st_strsize);
587 }
588 return (0);
589 }
590
591 while (lnp) {
592 StrNode *snp;
593
594 /*
595 * Walk the string lists and insert them into the hash
596 * list. Once a string is inserted we no longer need
597 * it's entry, so the string can be freed.
598 */
599 for (snp = avl_first(lnp->ln_strtree); snp;
600 snp = AVL_NEXT(lnp->ln_strtree, snp)) {
601 if (st_hash_insert(stp, snp->sn_str,
602 lnp->ln_strlen) == -1)
603 return (0);
604 }
605
606 /*
607 * Now that the strings have been copied, walk the
608 * StrNode tree and free all the AVL nodes. Note,
609 * avl_destroy_nodes() beats avl_remove() as the
610 * latter balances the nodes as they are removed.
611 * We just want to tear the whole thing down fast.
612 */
613 cookie = NULL;
614 while ((snp = avl_destroy_nodes(lnp->ln_strtree,
615 &cookie)) != NULL)
616 free(snp);
617 avl_destroy(lnp->ln_strtree);
618 free(lnp->ln_strtree);
619 lnp->ln_strtree = NULL;
620
621 /*
622 * Move on to the next LenNode.
623 */
624 lnp = AVL_NEXT(stp->st_lentree, lnp);
625 }
626
627 /*
628 * Now that all of the strings have been freed, walk the
629 * LenNode tree and free all of the AVL nodes. Note,
630 * avl_destroy_nodes() beats avl_remove() as the latter
631 * balances the nodes as they are removed. We just want to
632 * tear the whole thing down fast.
633 */
634 cookie = NULL;
635 while ((lnp = avl_destroy_nodes(stp->st_lentree,
636 &cookie)) != NULL)
637 free(lnp);
638 avl_destroy(stp->st_lentree);
639 free(stp->st_lentree);
640 stp->st_lentree = 0;
641 }
642
643 assert(stp->st_strsize > 0);
644 assert(stp->st_fullstrsize >= stp->st_strsize);
645
646 return (stp->st_strsize);
647 }
648
649 /*
650 * Associate a buffer with a string table.
651 */
652 const char *
st_getstrbuf(Str_tbl * stp)653 st_getstrbuf(Str_tbl *stp)
654 {
655 return (stp->st_strbuf);
656 }
657
658 int
st_setstrbuf(Str_tbl * stp,char * stbuf,size_t bufsize)659 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
660 {
661 assert(stp->st_flags & FLG_STTAB_COOKED);
662
663 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
664 if (bufsize < stp->st_fullstrsize)
665 return (-1);
666 } else {
667 if (bufsize < stp->st_strsize)
668 return (-1);
669 }
670
671 stp->st_strbuf = stbuf;
672 #ifdef DEBUG
673 /*
674 * for debug builds - start with a stringtable filled in
675 * with '0xff'. This makes it very easy to spot unfilled
676 * holes in the strtab.
677 */
678 memset(stbuf, 0xff, bufsize);
679 stbuf[0] = '\0';
680 #else
681 memset(stbuf, 0x0, bufsize);
682 #endif
683 return (0);
684 }
685