xref: /titanic_52/usr/src/cmd/sgs/tools/common/string_table.c (revision b819cea2f73f98c5662230cc9affc8cc84f77fcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <_string_table.h>
28 #include <strings.h>
29 #include <sgs.h>
30 #include <stdio.h>
31 
32 /*
33  * This file provides the interfaces to build a Str_tbl suitable for use by
34  * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
35  * as created by ld(1).
36  *
37  * There are two modes which can be used when constructing a string table:
38  *
39  *	st_new(0)
40  *		standard string table - no compression.  This is the
41  *		traditional, fast method.
42  *
43  *	st_new(FLG_STTAB_COMPRESS)
44  *		builds a compressed string table which both eliminates
45  *		duplicate strings, and permits strings with common suffixes
46  *		(atexit vs. exit) to overlap in the table.  This provides space
47  *		savings for many string tables.  Although more work than the
48  *		traditional method, the algorithms used are designed to scale
49  *		and keep any overhead at a minimum.
50  *
51  * These string tables are built with a common interface in a two-pass manner.
52  * The first pass finds all of the strings required for the string-table and
53  * calculates the size required for the final string table.
54  *
55  * The second pass allocates the string table, populates the strings into the
56  * table and returns the offsets the strings have been assigned.
57  *
58  * The calling sequence to build and populate a string table is:
59  *
60  *		st_new();		// initialize strtab
61  *
62  *		st_insert(st1);		// first pass of strings ...
63  *					// calculates size required for
64  *					// string table
65  *
66  *		st_delstring(st?);	// remove string previously
67  *					// inserted
68  *		st_insert(stN);
69  *
70  *		st_getstrtab_sz();	// freezes strtab and computes
71  *					// size of table.
72  *
73  *		st_setstrbuf();		// associates a final destination
74  *					// for the string table
75  *
76  *		st_setstring(st1);	// populate the string table
77  *		...			// offsets are based off of second
78  *					// pass	through the string table
79  *		st_setstring(stN);
80  *
81  *		st_destroy();		// tear down string table
82  *					// structures.
83  *
84  * String Suffix Compression Algorithm:
85  *
86  *   Here's a quick high level overview of the Suffix String
87  *   compression algorithm used.  First - the heart of the algorithm
88  *   is a Hash table list which represents a dictionary of all unique
89  *   strings inserted into the string table.  The hash function for
90  *   this table is a standard string hash except that the hash starts
91  *   at the last character in the string (&str[n - 1]) and works towards
 *   the first character in the string (&str[0]).  As we compute the
93  *   HASH value for a given string, we also compute the hash values
94  *   for all of the possible suffix strings for that string.
95  *
96  *   As we compute the hash - at each character see if the current
97  *   suffix string for that hash is already present in the table.  If
 *   it is, and the string is a master string, then change that
99  *   string to a suffix string of the new string being inserted.
100  *
101  *   When the final hash value is found (hash for str[0...n]), check
102  *   to see if it is in the hash table - if so increment the reference
103  *   count for the string.  If it is not yet in the table, insert a
104  *   new hash table entry for a master string.
105  *
106  *   The above method will find all suffixes of a given string given
107  *   that the strings are inserted from shortest to longest.  That is
108  *   why this is a two phase method, we first collect all of the
109  *   strings and store them based off of their length in an AVL tree.
110  *   Once all of the strings have been submitted we then start the
111  *   hash table build by traversing the AVL tree in order and
112  *   inserting the strings from shortest to longest as described
113  *   above.
114  */
115 
116 /* LINTLIBRARY */
117 
118 static int
119 avl_len_compare(const void *n1, const void *n2)
120 {
121 	size_t	len1, len2;
122 
123 	len1 = ((LenNode *)n1)->ln_strlen;
124 	len2 = ((LenNode *)n2)->ln_strlen;
125 
126 	if (len1 == len2)
127 		return (0);
128 	if (len2 < len1)
129 		return (1);
130 	return (-1);
131 }
132 
133 static int
134 avl_str_compare(const void *n1, const void *n2)
135 {
136 	const char	*str1, *str2;
137 	int		rc;
138 
139 	str1 = ((StrNode *)n1)->sn_str;
140 	str2 = ((StrNode *)n2)->sn_str;
141 
142 	rc = strcmp(str1, str2);
143 	if (rc > 0)
144 		return (1);
145 	if (rc < 0)
146 		return (-1);
147 	return (0);
148 }
149 
150 /*
151  * Return an initialized Str_tbl - returns NULL on failure.
152  *
153  * flags:
154  *	FLG_STTAB_COMPRESS - build a compressed string table
155  */
156 Str_tbl *
157 st_new(uint_t flags)
158 {
159 	Str_tbl	*stp;
160 
161 	if ((stp = calloc(sizeof (*stp), 1)) == NULL)
162 		return (NULL);
163 
164 	/*
165 	 * Start with a leading '\0' - it's tradition.
166 	 */
167 	stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
168 
169 	/*
170 	 * Do we compress this string table?
171 	 */
172 	stp->st_flags = flags;
173 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
174 		return (stp);
175 
176 	if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL)
177 		return (NULL);
178 
179 	avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
180 	    SGSOFFSETOF(LenNode, ln_avlnode));
181 
182 	return (stp);
183 }
184 
185 /*
186  * Insert a new string into the Str_tbl.  There are two AVL trees used.
187  *
188  *  -	The first LenNode AVL tree maintains a tree of nodes based on string
189  *	sizes.
190  *  -	Each LenNode maintains a StrNode AVL tree for each string.  Large
191  *	applications have been known to contribute thousands of strings of
192  *	the same size.  Should strings need to be removed (-z ignore), then
193  *	the string AVL tree makes this removal efficient and scalable.
194  */
195 int
196 st_insert(Str_tbl *stp, const char *str)
197 {
198 	size_t		len;
199 	StrNode		*snp, sn = { 0 };
200 	LenNode		*lnp, ln = { 0 };
201 	avl_index_t	where;
202 
203 	/*
204 	 * String table can't have been cooked
205 	 */
206 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
207 
208 	/*
209 	 * Null strings always point to the head of the string
210 	 * table - no reason to keep searching.
211 	 */
212 	if ((len = strlen(str)) == 0)
213 		return (0);
214 
215 	stp->st_fullstrsize += len + 1;
216 	stp->st_strcnt++;
217 
218 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
219 		return (0);
220 
221 	/*
222 	 * From the controlling string table, determine which LenNode AVL node
223 	 * provides for this string length.  If the node doesn't exist, insert
224 	 * a new node to represent this string length.
225 	 */
226 	ln.ln_strlen = len;
227 	if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
228 		if ((lnp = calloc(sizeof (*lnp), 1)) == NULL)
229 			return (-1);
230 		lnp->ln_strlen = len;
231 		avl_insert(stp->st_lentree, lnp, where);
232 
233 		if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) ==
234 		    NULL)
235 			return (0);
236 
237 		avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
238 		    SGSOFFSETOF(StrNode, sn_avlnode));
239 	}
240 
241 	/*
242 	 * From the string length AVL node determine whether a StrNode AVL node
243 	 * provides this string.  If the node doesn't exist, insert a new node
244 	 * to represent this string.
245 	 */
246 	sn.sn_str = str;
247 	if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
248 		if ((snp = calloc(sizeof (*snp), 1)) == NULL)
249 			return (-1);
250 		snp->sn_str = str;
251 		avl_insert(lnp->ln_strtree, snp, where);
252 	}
253 	snp->sn_refcnt++;
254 
255 	return (0);
256 }
257 
258 /*
259  * Remove a previously inserted string from the Str_tbl.
260  */
261 int
262 st_delstring(Str_tbl *stp, const char *str)
263 {
264 	size_t		len;
265 	LenNode		*lnp, ln = { 0 };
266 	StrNode		*snp, sn = { 0 };
267 
268 	/*
269 	 * String table can't have been cooked
270 	 */
271 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
272 
273 	len = strlen(str);
274 	stp->st_fullstrsize -= len + 1;
275 
276 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
277 		return (0);
278 
279 	/*
280 	 * Determine which LenNode AVL node provides for this string length.
281 	 */
282 	ln.ln_strlen = len;
283 	if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
284 		sn.sn_str = str;
285 		if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
286 			/*
287 			 * Reduce the reference count, and if zero remove the
288 			 * node.
289 			 */
290 			if (--snp->sn_refcnt == 0)
291 				avl_remove(lnp->ln_strtree, snp);
292 			return (0);
293 		}
294 	}
295 
296 	/*
297 	 * No strings of this length, or no string itself - someone goofed.
298 	 */
299 	return (-1);
300 }
301 
302 /*
303  * Tear down a String_Table structure.
304  */
305 void
306 st_destroy(Str_tbl *stp)
307 {
308 	Str_hash	*sthash, *psthash;
309 	Str_master	*mstr, *pmstr;
310 	uint_t		i;
311 
312 	/*
313 	 * cleanup the master strings
314 	 */
315 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
316 	    mstr = mstr->sm_next) {
317 		if (pmstr)
318 			free(pmstr);
319 		pmstr = mstr;
320 	}
321 	if (pmstr)
322 		free(pmstr);
323 
324 	if (stp->st_hashbcks) {
325 		for (i = 0; i < stp->st_hbckcnt; i++) {
326 			for (sthash = stp->st_hashbcks[i], psthash = 0;
327 			    sthash; sthash = sthash->hi_next) {
328 				if (psthash)
329 					free(psthash);
330 				psthash = sthash;
331 			}
332 			if (psthash)
333 				free(psthash);
334 		}
335 		free(stp->st_hashbcks);
336 	}
337 	free(stp);
338 }
339 
340 
341 /*
342  * For a given string - copy it into the buffer associated with
343  * the string table - and return the offset it has been assigned.
344  *
345  * If a value of '-1' is returned - the string was not found in
346  * the Str_tbl.
347  */
348 int
349 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
350 {
351 	size_t		stlen;
352 	uint_t		hashval;
353 	Str_hash	*sthash;
354 	Str_master	*mstr;
355 	int		i;
356 
357 	/*
358 	 * String table *must* have been previously cooked
359 	 */
360 	assert(stp->st_strbuf);
361 
362 	assert(stp->st_flags & FLG_STTAB_COOKED);
363 	stlen = strlen(str);
364 	/*
365 	 * Null string always points to head of string table
366 	 */
367 	if (stlen == 0) {
368 		*stoff = 0;
369 		return (0);
370 	}
371 
372 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
373 		size_t		_stoff;
374 
375 		stlen++;	/* count for trailing '\0' */
376 		_stoff = stp->st_nextoff;
377 		/*
378 		 * Have we overflowed our assigned buffer?
379 		 */
380 		if ((_stoff + stlen) > stp->st_fullstrsize)
381 			return (-1);
382 		memcpy(stp->st_strbuf + _stoff, str, stlen);
383 		*stoff = _stoff;
384 		stp->st_nextoff += stlen;
385 		return (0);
386 	}
387 
388 	/*
389 	 * Calculate reverse hash for string.
390 	 */
391 	hashval = HASHSEED;
392 	for (i = stlen; i >= 0; i--) {
393 		hashval = ((hashval << 5) + hashval) +
394 		    str[i];			/* h = ((h * 33) + c) */
395 	}
396 
397 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
398 	    sthash = sthash->hi_next) {
399 		const char	*hstr;
400 
401 		if (sthash->hi_hashval != hashval)
402 			continue;
403 
404 		hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
405 		    sthash->hi_strlen];
406 		if (strcmp(str, hstr) == 0)
407 			break;
408 	}
409 
410 	/*
411 	 * Did we find the string?
412 	 */
413 	if (sthash == 0)
414 		return (-1);
415 
416 	/*
417 	 * Has this string been copied into the string table?
418 	 */
419 	mstr = sthash->hi_mstr;
420 	if (mstr->sm_stroff == 0) {
421 		size_t	mstrlen = mstr->sm_strlen + 1;
422 
423 		mstr->sm_stroff = stp->st_nextoff;
424 
425 		/*
426 		 * Have we overflowed our assigned buffer?
427 		 */
428 		if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
429 			return (-1);
430 
431 		(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
432 		    mstr->sm_str, mstrlen);
433 		stp->st_nextoff += mstrlen;
434 	}
435 
436 	/*
437 	 * Calculate offset of (sub)string.
438 	 */
439 	*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
440 
441 	return (0);
442 }
443 
444 
445 static int
446 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
447 {
448 	int		i;
449 	uint_t		hashval = HASHSEED;
450 	uint_t		bckcnt = stp->st_hbckcnt;
451 	Str_hash	**hashbcks = stp->st_hashbcks;
452 	Str_hash	*sthash;
453 	Str_master	*mstr = 0;
454 
455 	/*
456 	 * We use a classic 'Bernstein k=33' hash function.  But
457 	 * instead of hashing from the start of the string to the
458 	 * end, we do it in reverse.
459 	 *
460 	 * This way - we are essentially building all of the
461 	 * suffix hashvalues as we go.  We can check to see if
462 	 * any suffixes already exist in the tree as we generate
463 	 * the hash.
464 	 */
465 	for (i = len; i >= 0; i--) {
466 		hashval = ((hashval << 5) + hashval) +
467 		    str[i];			/* h = ((h * 33) + c) */
468 
469 		for (sthash = hashbcks[hashval % bckcnt];
470 		    sthash; sthash = sthash->hi_next) {
471 			const char	*hstr;
472 			Str_master	*_mstr;
473 
474 			if (sthash->hi_hashval != hashval)
475 				continue;
476 
477 			_mstr = sthash->hi_mstr;
478 			hstr = &_mstr->sm_str[_mstr->sm_strlen -
479 			    sthash->hi_strlen];
480 
481 			if (strcmp(&str[i], hstr))
482 				continue;
483 
484 			if (i == 0) {
485 				/*
486 				 * Entry already in table, increment refcnt and
487 				 * get out.
488 				 */
489 				sthash->hi_refcnt++;
490 				return (0);
491 			} else {
492 				/*
493 				 * If this 'suffix' is presently a 'master
494 				 * string, then take over it's record.
495 				 */
496 				if (sthash->hi_strlen == _mstr->sm_strlen) {
497 					/*
498 					 * we should only do this once.
499 					 */
500 					assert(mstr == 0);
501 					mstr = _mstr;
502 				}
503 			}
504 		}
505 	}
506 
507 	/*
508 	 * Do we need a new master string, or can we take over
509 	 * one we already found in the table?
510 	 */
511 	if (mstr == 0) {
512 		/*
513 		 * allocate a new master string
514 		 */
515 		if ((mstr = calloc(sizeof (*mstr), 1)) == 0)
516 			return (-1);
517 		mstr->sm_next = stp->st_mstrlist;
518 		stp->st_mstrlist = mstr;
519 		stp->st_strsize += len + 1;
520 	} else {
521 		/*
522 		 * We are taking over a existing master string, the string size
523 		 * only increments by the difference between the current string
524 		 * and the previous master.
525 		 */
526 		assert(len > mstr->sm_strlen);
527 		stp->st_strsize += len - mstr->sm_strlen;
528 	}
529 
530 	if ((sthash = calloc(sizeof (*sthash), 1)) == 0)
531 		return (-1);
532 
533 	mstr->sm_hashval = sthash->hi_hashval = hashval;
534 	mstr->sm_strlen = sthash->hi_strlen = len;
535 	mstr->sm_str = str;
536 	sthash->hi_refcnt = 1;
537 	sthash->hi_mstr = mstr;
538 
539 	/*
540 	 * Insert string element into head of hash list
541 	 */
542 	hashval = hashval % bckcnt;
543 	sthash->hi_next = hashbcks[hashval];
544 	hashbcks[hashval] = sthash;
545 	return (0);
546 }
547 
548 /*
549  * Return amount of space required for the string table.
550  */
551 size_t
552 st_getstrtab_sz(Str_tbl *stp)
553 {
554 	assert(stp->st_fullstrsize > 0);
555 
556 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
557 		stp->st_flags |= FLG_STTAB_COOKED;
558 		return (stp->st_fullstrsize);
559 	}
560 
561 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
562 		LenNode		*lnp;
563 		void		*cookie;
564 
565 		stp->st_flags |= FLG_STTAB_COOKED;
566 		/*
567 		 * allocate a hash table about the size of # of
568 		 * strings input.
569 		 */
570 		stp->st_hbckcnt = findprime(stp->st_strcnt);
571 		if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks),
572 		    stp->st_hbckcnt)) == NULL)
573 			return (0);
574 
575 		/*
576 		 * We now walk all of the strings in the list, from shortest to
577 		 * longest, and insert them into the hashtable.
578 		 */
579 		if ((lnp = avl_first(stp->st_lentree)) == NULL) {
580 			/*
581 			 * Is it possible we have an empty string table, if so,
582 			 * the table still contains '\0', so return the size.
583 			 */
584 			if (avl_numnodes(stp->st_lentree) == 0) {
585 				assert(stp->st_strsize == 1);
586 				return (stp->st_strsize);
587 			}
588 			return (0);
589 		}
590 
591 		while (lnp) {
592 			StrNode	*snp;
593 
594 			/*
595 			 * Walk the string lists and insert them into the hash
596 			 * list.  Once a string is inserted we no longer need
597 			 * it's entry, so the string can be freed.
598 			 */
599 			for (snp = avl_first(lnp->ln_strtree); snp;
600 			    snp = AVL_NEXT(lnp->ln_strtree, snp)) {
601 				if (st_hash_insert(stp, snp->sn_str,
602 				    lnp->ln_strlen) == -1)
603 					return (0);
604 			}
605 
606 			/*
607 			 * Now that the strings have been copied, walk the
608 			 * StrNode tree and free all the AVL nodes.  Note,
609 			 * avl_destroy_nodes() beats avl_remove() as the
610 			 * latter balances the nodes as they are removed.
611 			 * We just want to tear the whole thing down fast.
612 			 */
613 			cookie = NULL;
614 			while ((snp = avl_destroy_nodes(lnp->ln_strtree,
615 			    &cookie)) != NULL)
616 				free(snp);
617 			avl_destroy(lnp->ln_strtree);
618 			free(lnp->ln_strtree);
619 			lnp->ln_strtree = NULL;
620 
621 			/*
622 			 * Move on to the next LenNode.
623 			 */
624 			lnp = AVL_NEXT(stp->st_lentree, lnp);
625 		}
626 
627 		/*
628 		 * Now that all of the strings have been freed, walk the
629 		 * LenNode tree and free all of the AVL nodes.  Note,
630 		 * avl_destroy_nodes() beats avl_remove() as the latter
631 		 * balances the nodes as they are removed. We just want to
632 		 * tear the whole thing down fast.
633 		 */
634 		cookie = NULL;
635 		while ((lnp = avl_destroy_nodes(stp->st_lentree,
636 		    &cookie)) != NULL)
637 			free(lnp);
638 		avl_destroy(stp->st_lentree);
639 		free(stp->st_lentree);
640 		stp->st_lentree = 0;
641 	}
642 
643 	assert(stp->st_strsize > 0);
644 	assert(stp->st_fullstrsize >= stp->st_strsize);
645 
646 	return (stp->st_strsize);
647 }
648 
649 /*
650  * Associate a buffer with a string table.
651  */
652 const char *
653 st_getstrbuf(Str_tbl *stp)
654 {
655 	return (stp->st_strbuf);
656 }
657 
658 int
659 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
660 {
661 	assert(stp->st_flags & FLG_STTAB_COOKED);
662 
663 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
664 		if (bufsize < stp->st_fullstrsize)
665 			return (-1);
666 	} else {
667 		if (bufsize < stp->st_strsize)
668 			return (-1);
669 	}
670 
671 	stp->st_strbuf = stbuf;
672 #ifdef	DEBUG
673 	/*
674 	 * for debug builds - start with a stringtable filled in
675 	 * with '0xff'.  This makes it very easy to spot unfilled
676 	 * holes in the strtab.
677 	 */
678 	memset(stbuf, 0xff, bufsize);
679 	stbuf[0] = '\0';
680 #else
681 	memset(stbuf, 0x0, bufsize);
682 #endif
683 	return (0);
684 }
685