xref: /titanic_44/usr/src/cmd/sgs/tools/common/string_table.c (revision 7257d1b4d25bfac0c802847390e98a464fd787ac)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <_string_table.h>
30 #include <strings.h>
31 #include <sgs.h>
32 #include <stdio.h>
33 
34 /*
35  * This file provides the interfaces to build a Str_tbl suitable for use by
36  * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
37  * as created by ld(1).
38  *
39  * There are two modes which can be used when constructing a string table:
40  *
41  *	st_new(0)
42  *		standard string table - no compression.  This is the
43  *		traditional, fast method.
44  *
45  *	st_new(FLG_STTAB_COMPRESS)
46  *		builds a compressed string table which both eliminates
47  *		duplicate strings, and permits strings with common suffixes
48  *		(atexit vs. exit) to overlap in the table.  This provides space
49  *		savings for many string tables.  Although more work than the
50  *		traditional method, the algorithms used are designed to scale
51  *		and keep any overhead at a minimum.
52  *
53  * These string tables are built with a common interface in a two-pass manner.
54  * The first pass finds all of the strings required for the string-table and
55  * calculates the size required for the final string table.
56  *
57  * The second pass allocates the string table, populates the strings into the
58  * table and returns the offsets the strings have been assigned.
59  *
60  * The calling sequence to build and populate a string table is:
61  *
62  *		st_new();		// initialize strtab
63  *
64  *		st_insert(st1);		// first pass of strings ...
65  *					// calculates size required for
66  *					// string table
67  *
68  *		st_delstring(st?);	// remove string previously
69  *					// inserted
70  *		st_insert(stN);
71  *
72  *		st_getstrtab_sz();	// freezes strtab and computes
73  *					// size of table.
74  *
75  *		st_setstrbuf();		// associates a final destination
76  *					// for the string table
77  *
78  *		st_setstring(st1);	// populate the string table
79  *		...			// offsets are based off of second
80  *					// pass	through the string table
81  *		st_setstring(stN);
82  *
83  *		st_destroy();		// tear down string table
84  *					// structures.
85  *
86  * String Suffix Compression Algorithm:
87  *
88  *   Here's a quick high level overview of the Suffix String
89  *   compression algorithm used.  First - the heart of the algorithm
90  *   is a Hash table list which represents a dictionary of all unique
91  *   strings inserted into the string table.  The hash function for
92  *   this table is a standard string hash except that the hash starts
93  *   at the last character in the string (&str[n - 1]) and works towards
94  *   the first character in the function (&str[0]).  As we compute the
95  *   HASH value for a given string, we also compute the hash values
96  *   for all of the possible suffix strings for that string.
97  *
98  *   As we compute the hash - at each character see if the current
99  *   suffix string for that hash is already present in the table.  If
100  *   it is, and the string is a master string.  Then change that
101  *   string to a suffix string of the new string being inserted.
102  *
103  *   When the final hash value is found (hash for str[0...n]), check
104  *   to see if it is in the hash table - if so increment the reference
105  *   count for the string.  If it is not yet in the table, insert a
106  *   new hash table entry for a master string.
107  *
108  *   The above method will find all suffixes of a given string given
109  *   that the strings are inserted from shortest to longest.  That is
110  *   why this is a two phase method, we first collect all of the
111  *   strings and store them based off of their length in an AVL tree.
112  *   Once all of the strings have been submitted we then start the
113  *   hash table build by traversing the AVL tree in order and
114  *   inserting the strings from shortest to longest as described
115  *   above.
116  */
117 
118 /* LINTLIBRARY */
119 
120 static int
121 avl_len_compare(const void *n1, const void *n2)
122 {
123 	size_t	len1, len2;
124 
125 	len1 = ((LenNode *)n1)->ln_strlen;
126 	len2 = ((LenNode *)n2)->ln_strlen;
127 
128 	if (len1 == len2)
129 		return (0);
130 	if (len2 < len1)
131 		return (1);
132 	return (-1);
133 }
134 
135 static int
136 avl_str_compare(const void *n1, const void *n2)
137 {
138 	const char	*str1, *str2;
139 	int		rc;
140 
141 	str1 = ((StrNode *)n1)->sn_str;
142 	str2 = ((StrNode *)n2)->sn_str;
143 
144 	rc = strcmp(str1, str2);
145 	if (rc > 0)
146 		return (1);
147 	if (rc < 0)
148 		return (-1);
149 	return (0);
150 }
151 
152 /*
153  * Return an initialized Str_tbl - returns NULL on failure.
154  *
155  * flags:
156  *	FLG_STTAB_COMPRESS - build a compressed string table
157  */
158 Str_tbl *
159 st_new(uint_t flags)
160 {
161 	Str_tbl	*stp;
162 
163 	if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL)
164 		return (NULL);
165 
166 	/*
167 	 * Start with a leading '\0' - it's tradition.
168 	 */
169 	stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
170 
171 	/*
172 	 * Do we compress this string table?
173 	 */
174 	stp->st_flags = flags;
175 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
176 		return (stp);
177 
178 	if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL)
179 		return (NULL);
180 
181 	avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
182 	    SGSOFFSETOF(LenNode, ln_avlnode));
183 
184 	return (stp);
185 }
186 
187 /*
188  * Insert a new string into the Str_tbl.  There are two AVL trees used.
189  *
190  *  .	The first LenNode AVL tree maintains a tree of nodes based on string
191  *	sizes.
192  *  .	Each LenNode maintains a StrNode AVL tree for each string.  Large
193  *	applications have been known to contribute thousands of strings of
194  *	the same size.  Should strings need to be removed (-z ignore), then
195  *	the string AVL tree makes this removal efficient and scalable.
196  */
197 int
198 st_insert(Str_tbl *stp, const char *str)
199 {
200 	size_t		len;
201 	StrNode		*snp, sn = { 0 };
202 	LenNode		*lnp, ln = { 0 };
203 	avl_index_t	where;
204 
205 	/*
206 	 * String table can't have been cooked
207 	 */
208 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
209 
210 	/*
211 	 * Null strings always point to the head of the string
212 	 * table - no reason to keep searching.
213 	 */
214 	if ((len = strlen(str)) == 0)
215 		return (0);
216 
217 	stp->st_fullstrsize += len + 1;
218 	stp->st_strcnt++;
219 
220 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
221 		return (0);
222 
223 	/*
224 	 * From the controlling string table, determine which LenNode AVL node
225 	 * provides for this string length.  If the node doesn't exist, insert
226 	 * a new node to represent this string length.
227 	 */
228 	ln.ln_strlen = len;
229 	if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
230 		if ((lnp = calloc(sizeof (LenNode), 1)) == NULL)
231 			return (-1);
232 		lnp->ln_strlen = len;
233 		avl_insert(stp->st_lentree, lnp, where);
234 
235 		if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL)
236 			return (0);
237 
238 		avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
239 		    SGSOFFSETOF(StrNode, sn_avlnode));
240 	}
241 
242 	/*
243 	 * From the string length AVL node determine whether a StrNode AVL node
244 	 * provides this string.  If the node doesn't exist, insert a new node
245 	 * to represent this string.
246 	 */
247 	sn.sn_str = str;
248 	if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
249 		if ((snp = calloc(sizeof (StrNode), 1)) == NULL)
250 			return (-1);
251 		snp->sn_str = str;
252 		avl_insert(lnp->ln_strtree, snp, where);
253 	}
254 	snp->sn_refcnt++;
255 
256 	return (0);
257 }
258 
259 /*
260  * Remove a previously inserted string from the Str_tbl.
261  */
262 int
263 st_delstring(Str_tbl *stp, const char *str)
264 {
265 	size_t		len;
266 	LenNode		*lnp, ln = { 0 };
267 	StrNode		*snp, sn = { 0 };
268 
269 	/*
270 	 * String table can't have been cooked
271 	 */
272 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
273 
274 	len = strlen(str);
275 	stp->st_fullstrsize -= len + 1;
276 
277 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
278 		return (0);
279 
280 	/*
281 	 * Determine which LenNode AVL node provides for this string length.
282 	 */
283 	ln.ln_strlen = len;
284 	if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
285 		sn.sn_str = str;
286 		if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
287 			/*
288 			 * Reduce the reference count, and if zero remove the
289 			 * node.
290 			 */
291 			if (--snp->sn_refcnt == 0)
292 				avl_remove(lnp->ln_strtree, snp);
293 			return (0);
294 		}
295 	}
296 
297 	/*
298 	 * No strings of this length, or no string itself - someone goofed.
299 	 */
300 	return (-1);
301 }
302 
303 /*
304  * Tear down a String_Table structure.
305  */
306 void
307 st_destroy(Str_tbl *stp)
308 {
309 	Str_hash	*sthash, *psthash;
310 	Str_master	*mstr, *pmstr;
311 	uint_t		i;
312 
313 	/*
314 	 * cleanup the master strings
315 	 */
316 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
317 	    mstr = mstr->sm_next) {
318 		if (pmstr)
319 			free(pmstr);
320 		pmstr = mstr;
321 	}
322 	if (pmstr)
323 		free(pmstr);
324 
325 	if (stp->st_hashbcks) {
326 		for (i = 0; i < stp->st_hbckcnt; i++) {
327 			for (sthash = stp->st_hashbcks[i], psthash = 0;
328 			    sthash; sthash = sthash->hi_next) {
329 				if (psthash)
330 					free(psthash);
331 				psthash = sthash;
332 			}
333 			if (psthash)
334 				free(psthash);
335 		}
336 		free(stp->st_hashbcks);
337 	}
338 	free(stp);
339 }
340 
341 
342 /*
343  * For a given string - copy it into the buffer associated with
344  * the string table - and return the offset it has been assigned.
345  *
346  * If a value of '-1' is returned - the string was not found in
347  * the Str_tbl.
348  */
349 int
350 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
351 {
352 	size_t		stlen;
353 	uint_t		hashval;
354 	Str_hash	*sthash;
355 	Str_master	*mstr;
356 	int		i;
357 
358 	/*
359 	 * String table *must* have been previously cooked
360 	 */
361 	assert(stp->st_strbuf);
362 
363 	assert(stp->st_flags & FLG_STTAB_COOKED);
364 	stlen = strlen(str);
365 	/*
366 	 * Null string always points to head of string table
367 	 */
368 	if (stlen == 0) {
369 		*stoff = 0;
370 		return (0);
371 	}
372 
373 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
374 		size_t		_stoff;
375 
376 		stlen++;	/* count for trailing '\0' */
377 		_stoff = stp->st_nextoff;
378 		/*
379 		 * Have we overflowed our assigned buffer?
380 		 */
381 		if ((_stoff + stlen) > stp->st_fullstrsize)
382 			return (-1);
383 		memcpy(stp->st_strbuf + _stoff, str, stlen);
384 		*stoff = _stoff;
385 		stp->st_nextoff += stlen;
386 		return (0);
387 	}
388 
389 	/*
390 	 * Calculate reverse hash for string.
391 	 */
392 	hashval = HASHSEED;
393 	for (i = stlen; i >= 0; i--) {
394 		hashval = ((hashval << 5) + hashval) +
395 		    str[i];			/* h = ((h * 33) + c) */
396 	}
397 
398 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
399 	    sthash = sthash->hi_next) {
400 		const char	*hstr;
401 
402 		if (sthash->hi_hashval != hashval)
403 			continue;
404 
405 		hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
406 		    sthash->hi_strlen];
407 		if (strcmp(str, hstr) == 0)
408 			break;
409 	}
410 
411 	/*
412 	 * Did we find the string?
413 	 */
414 	if (sthash == 0)
415 		return (-1);
416 
417 	/*
418 	 * Has this string been copied into the string table?
419 	 */
420 	mstr = sthash->hi_mstr;
421 	if (mstr->sm_stroff == 0) {
422 		size_t	mstrlen = mstr->sm_strlen + 1;
423 
424 		mstr->sm_stroff = stp->st_nextoff;
425 
426 		/*
427 		 * Have we overflowed our assigned buffer?
428 		 */
429 		if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
430 			return (-1);
431 
432 		(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
433 		    mstr->sm_str, mstrlen);
434 		stp->st_nextoff += mstrlen;
435 	}
436 
437 	/*
438 	 * Calculate offset of (sub)string.
439 	 */
440 	*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
441 
442 	return (0);
443 }
444 
445 
446 static int
447 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
448 {
449 	int		i;
450 	uint_t		hashval = HASHSEED;
451 	uint_t		bckcnt = stp->st_hbckcnt;
452 	Str_hash	**hashbcks = stp->st_hashbcks;
453 	Str_hash	*sthash;
454 	Str_master	*mstr = 0;
455 
456 	/*
457 	 * We use a classic 'Bernstein k=33' hash function.  But
458 	 * instead of hashing from the start of the string to the
459 	 * end, we do it in reverse.
460 	 *
461 	 * This way - we are essentially building all of the
462 	 * suffix hashvalues as we go.  We can check to see if
463 	 * any suffixes already exist in the tree as we generate
464 	 * the hash.
465 	 */
466 	for (i = len; i >= 0; i--) {
467 		hashval = ((hashval << 5) + hashval) +
468 		    str[i];			/* h = ((h * 33) + c) */
469 
470 		for (sthash = hashbcks[hashval % bckcnt];
471 		    sthash; sthash = sthash->hi_next) {
472 			const char	*hstr;
473 			Str_master	*_mstr;
474 
475 			if (sthash->hi_hashval != hashval)
476 				continue;
477 
478 			_mstr = sthash->hi_mstr;
479 			hstr = &_mstr->sm_str[_mstr->sm_strlen -
480 			    sthash->hi_strlen];
481 
482 			if (strcmp(&str[i], hstr))
483 				continue;
484 
485 			if (i == 0) {
486 				/*
487 				 * Entry already in table, increment refcnt and
488 				 * get out.
489 				 */
490 				sthash->hi_refcnt++;
491 				return (0);
492 			} else {
493 				/*
494 				 * If this 'suffix' is presently a 'master
495 				 * string, then take over it's record.
496 				 */
497 				if (sthash->hi_strlen == _mstr->sm_strlen) {
498 					/*
499 					 * we should only do this once.
500 					 */
501 					assert(mstr == 0);
502 					mstr = _mstr;
503 				}
504 			}
505 		}
506 	}
507 
508 	/*
509 	 * Do we need a new master string, or can we take over
510 	 * one we already found in the table?
511 	 */
512 	if (mstr == 0) {
513 		/*
514 		 * allocate a new master string
515 		 */
516 		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
517 			return (-1);
518 		mstr->sm_next = stp->st_mstrlist;
519 		stp->st_mstrlist = mstr;
520 		stp->st_strsize += len + 1;
521 	} else {
522 		/*
523 		 * We are taking over a existing master string, the string size
524 		 * only increments by the difference between the current string
525 		 * and the previous master.
526 		 */
527 		assert(len > mstr->sm_strlen);
528 		stp->st_strsize += len - mstr->sm_strlen;
529 	}
530 
531 	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
532 		return (-1);
533 
534 	mstr->sm_hashval = sthash->hi_hashval = hashval;
535 	mstr->sm_strlen = sthash->hi_strlen = len;
536 	mstr->sm_str = str;
537 	sthash->hi_refcnt = 1;
538 	sthash->hi_mstr = mstr;
539 
540 	/*
541 	 * Insert string element into head of hash list
542 	 */
543 	hashval = hashval % bckcnt;
544 	sthash->hi_next = hashbcks[hashval];
545 	hashbcks[hashval] = sthash;
546 	return (0);
547 }
548 
549 /*
550  * Return amount of space required for the string table.
551  */
552 size_t
553 st_getstrtab_sz(Str_tbl *stp)
554 {
555 	assert(stp->st_fullstrsize > 0);
556 
557 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
558 		stp->st_flags |= FLG_STTAB_COOKED;
559 		return (stp->st_fullstrsize);
560 	}
561 
562 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
563 		LenNode		*lnp;
564 		void		*cookie;
565 
566 		stp->st_flags |= FLG_STTAB_COOKED;
567 		/*
568 		 * allocate a hash table about the size of # of
569 		 * strings input.
570 		 */
571 		stp->st_hbckcnt = findprime(stp->st_strcnt);
572 		if ((stp->st_hashbcks =
573 		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
574 			return (0);
575 
576 		/*
577 		 * We now walk all of the strings in the list, from shortest to
578 		 * longest, and insert them into the hashtable.
579 		 */
580 		if ((lnp = avl_first(stp->st_lentree)) == NULL) {
581 			/*
582 			 * Is it possible we have an empty string table, if so,
583 			 * the table still contains '\0', so return the size.
584 			 */
585 			if (avl_numnodes(stp->st_lentree) == 0) {
586 				assert(stp->st_strsize == 1);
587 				return (stp->st_strsize);
588 			}
589 			return (0);
590 		}
591 
592 		while (lnp) {
593 			StrNode	*snp;
594 
595 			/*
596 			 * Walk the string lists and insert them into the hash
597 			 * list.  Once a string is inserted we no longer need
598 			 * it's entry, so the string can be freed.
599 			 */
600 			for (snp = avl_first(lnp->ln_strtree); snp;
601 			    snp = AVL_NEXT(lnp->ln_strtree, snp)) {
602 				if (st_hash_insert(stp, snp->sn_str,
603 				    lnp->ln_strlen) == -1)
604 					return (0);
605 			}
606 
607 			/*
608 			 * Now that the strings have been copied, walk the
609 			 * StrNode tree and free all the AVL nodes.  Note,
610 			 * avl_destroy_nodes() beats avl_remove() as the
611 			 * latter balances the nodes as they are removed.
612 			 * We just want to tear the whole thing down fast.
613 			 */
614 			cookie = NULL;
615 			while ((snp = avl_destroy_nodes(lnp->ln_strtree,
616 			    &cookie)) != NULL)
617 				free(snp);
618 			avl_destroy(lnp->ln_strtree);
619 			free(lnp->ln_strtree);
620 			lnp->ln_strtree = NULL;
621 
622 			/*
623 			 * Move on to the next LenNode.
624 			 */
625 			lnp = AVL_NEXT(stp->st_lentree, lnp);
626 		}
627 
628 		/*
629 		 * Now that all of the strings have been freed, walk the
630 		 * LenNode tree and free all of the AVL nodes.  Note,
631 		 * avl_destroy_nodes() beats avl_remove() as the latter
632 		 * balances the nodes as they are removed. We just want to
633 		 * tear the whole thing down fast.
634 		 */
635 		cookie = NULL;
636 		while ((lnp = avl_destroy_nodes(stp->st_lentree,
637 		    &cookie)) != NULL)
638 			free(lnp);
639 		avl_destroy(stp->st_lentree);
640 		free(stp->st_lentree);
641 		stp->st_lentree = 0;
642 	}
643 
644 	assert(stp->st_strsize > 0);
645 	assert(stp->st_fullstrsize >= stp->st_strsize);
646 
647 	return (stp->st_strsize);
648 }
649 
650 /*
651  * Associate a buffer with a string table.
652  */
653 const char *
654 st_getstrbuf(Str_tbl *stp)
655 {
656 	return (stp->st_strbuf);
657 }
658 
659 int
660 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
661 {
662 	assert(stp->st_flags & FLG_STTAB_COOKED);
663 
664 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
665 		if (bufsize < stp->st_fullstrsize)
666 			return (-1);
667 	} else {
668 		if (bufsize < stp->st_strsize)
669 			return (-1);
670 	}
671 
672 	stp->st_strbuf = stbuf;
673 #ifdef	DEBUG
674 	/*
675 	 * for debug builds - start with a stringtable filled in
676 	 * with '0xff'.  This makes it very easy to find wholes
677 	 * which we failed to fill in - in the strtab.
678 	 */
679 	memset(stbuf, 0xff, bufsize);
680 	stbuf[0] = '\0';
681 #else
682 	memset(stbuf, 0x0, bufsize);
683 #endif
684 	return (0);
685 }
686