xref: /titanic_50/usr/src/cmd/sgs/tools/common/string_table.c (revision 25cf1a301a396c38e8adf52c15f537b80d2483f7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <string_table.h>
30 #include <strings.h>
31 #include <sgs.h>
32 #include <stdio.h>
33 
34 
35 
36 /*
37  * This file provides the interfaces to build a Str_tbl suitable
38  * for use by either the sgsmsg system or a standard ELF
39  * SHT_STRTAB.
40  *
41  * There are two modes which can be used when constructing a
42  * string table:
43  *
44  *	st_new(0)
45  *		standard string table - no compression.  This is the
46  *		traditional method and fast
47  *
48  *	st_new(FLG_STNEW_COMPRESS)
49  *		build a compressed string table which both
50  *		eliminates duplicate strings and permits
51  *		strings with common suffixes (atexit vs. exit) to
52  *		overlap in the table.  This provides space
53  *		savings for many string tables.
54  *
55  * These string tables are now built with a common interface in a
56  * two-pass manner, the first pass it to find all of the strings
57  * required for the string-table and to calculate the size that
58  * will be required for the final string table.
59  *
60  * The second pass allocates the string table and populates the
61  * strings into the table and returns the offsets the strings
62  * have been assigned.
63  *
64  * The calling sequence to build and populate a string table is:
65  *
66  *		st_new();		// initialize strtab
67  *
68  *		st_insert(st1);		// first pass of strings ...
69  *					// calculates size required for
70  *					// string table
71  *
72  *		st_delstring(st?);	// remove string previously
73  *					// inserted
74  *
75  *		st_insert(stN);
76  *
77  *		st_getstrtab_sz();	// freezes strtab and computes
78  *					// size of table.
79  *
80  *		st_setstrbuf();		// associates a final destination
81  *					// for the string table
82  *
83  *		st_setstring(st1);	// populate the string table
84  *		...			// offsets are based off of second
85  *					// pass	through the string table
86  *		st_setstring(stN);
87  *
88  *		st_destroy();		// tear down string table
89  *					// structures.
90  *
91  * String Suffix Compression Algorithm:
92  *
93  *   Here's a quick high level overview of the Suffix String
94  *   compression algorithm used.  First - the heart of the algorithm
95  *   is a Hash table list which represents a dictionary of all unique
96  *   strings inserted into the string table.  The hash function for
97  *   this table is a standard string hash except that the hash starts
98  *   at the last character in the string (&str[n - 1]) and works towards
99  *   the first character in the function (&str[0]).  As we compute the
100  *   HASH value for a given string, we also compute the hash values
101  *   for all of the possible suffix strings for that string.
102  *
103  *   As we compute the hash - at each character see if the current
104  *   suffix string for that hash is already present in the table.  If
105  *   it is, and the string is a master string.  Then change that
106  *   string to a suffix string of the new string being inserted.
107  *
108  *   When the final hash value is found (hash for str[0...n]), check
109  *   to see if it is in the hash table - if so increment the reference
110  *   count for the string.  If it is not yet in the table, insert a
111  *   new hash table entry for a master string.
112  *
113  *   The above method will find all suffixes of a given string given
114  *   that the strings are inserted from shortest to longest.  That is
115  *   why this is a two phase method, we first collect all of the
116  *   strings and store them based off of their length in a nice AVL tree.
117  *   Once all of the strings have been submitted we then start the
118  *   hash table build by traversing the AVL tree in order and
119  *   inserting the strings from shortest to longest as described
120  *   above.
121  *
122  */
123 
124 /* LINTLIBRARY */
125 
126 
127 int
128 strlen_compare(const void *elem1, const void *elem2)
129 {
130 	uint_t	l1, l2;
131 	l1 = ((Stringelem *)elem1)->se_stlen;
132 	l2 = ((Stringelem *)elem2)->se_stlen;
133 
134 	if (l1 == l2)
135 		return (0);
136 	if (l2 < l1)
137 		return (1);
138 
139 	return (-1);
140 }
141 
142 /*
143  * Return a initialized Str_tbl - returns NULL on failure.
144  *
145  * stflags:
146  *
147  *	FLG_STNEW_COMPRESS - build a compressed string table
148  *
149  */
150 Str_tbl *
151 st_new(uint_t stflags)
152 {
153 	Str_tbl	*stp;
154 
155 	if ((stp = calloc(sizeof (Str_tbl), 1)) == 0)
156 		return (0);
157 
158 	/*
159 	 * Start with a leading '\0' - it's tradition.
160 	 */
161 	stp->st_stringsize = stp->st_fullstringsize = stp->st_nextoff = 1;
162 
163 	/*
164 	 * Do we compress this string table
165 	 */
166 	if ((stflags & FLG_STNEW_COMPRESS) == 0)
167 		return (stp);
168 
169 	stp->st_flags |= FLG_STTAB_COMPRESS;
170 	if ((stp->st_strtree = calloc(sizeof (avl_tree_t), 1)) == 0) {
171 		return (0);
172 	}
173 
174 	avl_create(stp->st_strtree, &strlen_compare, sizeof (Stringelem),
175 		SGSOFFSETOF(Stringelem, se_avlnode));
176 
177 	return (stp);
178 }
179 
180 /*
181  * Tear down a String_Table structure.
182  */
183 void
184 st_destroy(Str_tbl *stp)
185 {
186 	Str_hash	*sthash, *psthash;
187 	Str_master	*mstr, *pmstr;
188 	uint_t		i;
189 
190 	/*
191 	 * cleanup the master strings
192 	 */
193 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
194 	    mstr = mstr->sm_next) {
195 		if (pmstr)
196 			free(pmstr);
197 		pmstr = mstr;
198 	}
199 	if (pmstr)
200 		free(pmstr);
201 
202 	if (stp->st_hashbcks) {
203 		for (i = 0; i < stp->st_hbckcnt; i++) {
204 			for (sthash = stp->st_hashbcks[i], psthash = 0;
205 			    sthash; sthash = sthash->hi_next) {
206 				if (psthash)
207 					free(psthash);
208 				psthash = sthash;
209 			}
210 			if (psthash)
211 				free(psthash);
212 		}
213 		free(stp->st_hashbcks);
214 	}
215 	free(stp);
216 }
217 
218 
219 
220 
221 /*
222  * Remove a previously inserted string from the Str_tbl
223  */
224 int
225 st_delstring(Str_tbl *stp, const char *str)
226 {
227 	uint_t		stlen;
228 	Stringelem	qstelem;
229 	Stringelem	*stelem;
230 	Stringlist	*stlist, *pstlist;
231 
232 	/*
233 	 * String table can't have been cooked
234 	 */
235 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
236 
237 	stlen = (uint_t)strlen(str);
238 	stp->st_fullstringsize -= stlen + 1;
239 
240 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
241 		return (0);
242 
243 	qstelem.se_stlen = stlen;
244 	if ((stelem = avl_find(stp->st_strtree, &qstelem, 0)) == NULL) {
245 		/*
246 		 * no strings of this length recorded, let alone
247 		 * this specific string - someone goofed.
248 		 */
249 		return (-1);
250 	}
251 
252 	pstlist = 0;
253 	for (stlist = stelem->se_strlist; stlist; stlist = stlist->sl_next) {
254 		if (strcmp(str, stlist->sl_string) == 0)
255 			break;
256 		pstlist = stlist;
257 	}
258 
259 	if (stlist == 0) {
260 		/*
261 		 * string was not found
262 		 */
263 		return (-1);
264 	}
265 
266 	if (pstlist == 0) {
267 		/*
268 		 * String is first on list.
269 		 */
270 		stelem->se_strlist = stlist->sl_next;
271 	} else {
272 		/*
273 		 * remove string from list.
274 		 */
275 		pstlist->sl_next = stlist->sl_next;
276 	}
277 
278 	free(stlist);
279 	return (0);
280 }
281 
282 
283 /*
284  * Insert a new string into the Str_tbl
285  */
286 int
287 st_insert(Str_tbl *stp, const char *str)
288 {
289 	uint_t	stlen;
290 	Stringelem	qstelem;
291 	Stringelem	*stelem;
292 	Stringlist	*strlist;
293 	avl_index_t	where;
294 
295 	/*
296 	 * String table can't have been cooked
297 	 */
298 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
299 	stlen = (uint_t)strlen(str);
300 	/*
301 	 * Null strings always point to the head of the string
302 	 * table - no reason to keep searching.
303 	 */
304 	if (stlen == 0)
305 		return (0);
306 
307 	stp->st_fullstringsize += stlen + 1;
308 	stp->st_stringcnt++;
309 
310 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
311 		return (0);
312 
313 	qstelem.se_stlen = strlen(str);
314 	if ((stelem = avl_find(stp->st_strtree, &qstelem,
315 	    &where)) == NULL) {
316 		if ((stelem = calloc(sizeof (Stringelem), 1)) == 0)
317 			return (-1);
318 		stelem->se_stlen = qstelem.se_stlen;
319 		avl_insert(stp->st_strtree, stelem, where);
320 	}
321 	if ((strlist = malloc(sizeof (Stringlist))) == 0)
322 		return (-1);
323 
324 	strlist->sl_string = str;
325 	strlist->sl_next = stelem->se_strlist;
326 	stelem->se_strlist = strlist;
327 
328 	return (0);
329 }
330 
331 
332 /*
333  * For a given string - copy it into the buffer associated with
334  * the string table - and return the offset it has been assigned.
335  *
336  * If a value of '-1' is returned - the string was not found in
337  * the Str_tbl.
338  */
339 int
340 st_setstring(Str_tbl *stp, const char *str, uint_t *stoff)
341 {
342 	uint_t		stlen;
343 	uint_t		hashval;
344 	Str_hash	*sthash;
345 	Str_master	*mstr;
346 	int		i;
347 
348 	/*
349 	 * String table *must* have been previously cooked
350 	 */
351 	assert(stp->st_strbuf);
352 
353 	assert(stp->st_flags & FLG_STTAB_COOKED);
354 	stlen = (uint_t)strlen(str);
355 	/*
356 	 * Null string always points to head of string table
357 	 */
358 	if (stlen == 0) {
359 		*stoff = 0;
360 		return (0);
361 	}
362 
363 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
364 		uint_t		_stoff;
365 
366 		stlen++;	/* count for trailing '\0' */
367 		_stoff = stp->st_nextoff;
368 		/*
369 		 * Have we overflowed our assigned buffer?
370 		 */
371 		if ((_stoff + stlen) > stp->st_fullstringsize)
372 			return (-1);
373 		memcpy(stp->st_strbuf + _stoff, str, stlen);
374 		*stoff = _stoff;
375 		stp->st_nextoff += stlen;
376 		return (0);
377 	}
378 
379 	/*
380 	 * Calculate reverse hash for string
381 	 */
382 	hashval = HASHSEED;
383 	for (i = stlen; i >= 0; i--) {
384 		hashval = ((hashval << 5) + hashval) +
385 			str[i];			/* h = ((h * 33) + c) */
386 	}
387 
388 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
389 	    sthash = sthash->hi_next) {
390 		if (sthash->hi_hashval == hashval) {
391 			const char	*hstr;
392 
393 			hstr = &sthash->hi_mstr->sm_str[
394 			    sthash->hi_mstr->sm_stlen -
395 			    sthash->hi_stlen];
396 			if (strcmp(str, hstr) == 0) {
397 				break;
398 			}
399 		}
400 	}
401 
402 	/*
403 	 * Did we find the string?
404 	 */
405 	if (sthash == 0)
406 		return (-1);
407 
408 	/*
409 	 * Has this string been copied into the string table?
410 	 */
411 	mstr = sthash->hi_mstr;
412 	if (mstr->sm_stoff == 0) {
413 		uint_t	mstlen = mstr->sm_stlen + 1;
414 		mstr->sm_stoff = stp->st_nextoff;
415 		/*
416 		 * Have we overflowed our assigned buffer?
417 		 */
418 		if ((mstr->sm_stoff + mstlen) > stp->st_fullstringsize)
419 			return (-1);
420 		memcpy(stp->st_strbuf + mstr->sm_stoff, mstr->sm_str,
421 			mstlen);
422 		stp->st_nextoff += mstlen;
423 	}
424 	/*
425 	 * Calculate offset of (sub)string
426 	 */
427 	*stoff = mstr->sm_stoff + mstr->sm_stlen - sthash->hi_stlen;
428 
429 	return (0);
430 }
431 
432 
433 static int
434 st_hash_insert(Str_tbl *stp, const char *str, uint_t stlen)
435 {
436 	int		i;
437 	uint_t		hashval = HASHSEED;
438 	uint_t		bckcnt = stp->st_hbckcnt;
439 	Str_hash	**hashbcks = stp->st_hashbcks;
440 	Str_hash	*sthash;
441 	Str_master	*mstr = 0;
442 
443 	/*
444 	 * We use a classic 'Bernstein k=33' hash function.  But
445 	 * instead of hashing from the start of the string to the
446 	 * end, we do it in reverse.
447 	 *
448 	 * This way - we are essentially building all of the
449 	 * suffix hashvalues as we go.  We can check to see if
450 	 * any suffixes already exist in the tree as we generate
451 	 * the hash.
452 	 */
453 	for (i = stlen; i >= 0; i--) {
454 
455 		hashval = ((hashval << 5) + hashval) +
456 			str[i];			/* h = ((h * 33) + c) */
457 		for (sthash = hashbcks[hashval % bckcnt];
458 		    sthash; sthash = sthash->hi_next) {
459 
460 			if (sthash->hi_hashval == hashval) {
461 				const char	*hstr;
462 				Str_master	*_mstr;
463 
464 				_mstr = sthash->hi_mstr;
465 				hstr = &_mstr->sm_str[_mstr->sm_stlen -
466 				    sthash->hi_stlen];
467 				if (strcmp(&str[i], hstr) == 0) {
468 					if (i == 0) {
469 						/*
470 						 * Entry already in table,
471 						 * increment refcnt and get
472 						 * out.
473 						 */
474 						sthash->hi_refcnt++;
475 						return (0);
476 					} else {
477 						/*
478 						 * If this 'suffix' is
479 						 * presently a 'master' string,
480 						 * then take over it's record.
481 						 */
482 						if (sthash->hi_stlen ==
483 						    _mstr->sm_stlen) {
484 							/*
485 							 * we should only do
486 							 * this once.
487 							 */
488 							assert(mstr == 0);
489 							mstr = _mstr;
490 						}
491 					}
492 				}
493 			}
494 		}
495 	}
496 
497 
498 	/*
499 	 * Do we need a new master string, or can we take over
500 	 * one we already found in the table?
501 	 */
502 	if (mstr == 0) {
503 		/*
504 		 * allocate a new master string
505 		 */
506 		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
507 			return (-1);
508 		mstr->sm_next = stp->st_mstrlist;
509 		stp->st_mstrlist = mstr;
510 		stp->st_stringsize += stlen + 1;
511 	} else {
512 		/*
513 		 * We are taking over a existing master string,
514 		 * the stringsize only increments by the
515 		 * difference between the currnet string and the
516 		 * previous master.
517 		 */
518 		assert(stlen > mstr->sm_stlen);
519 		stp->st_stringsize += stlen - mstr->sm_stlen;
520 	}
521 
522 	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
523 		return (-1);
524 
525 	mstr->sm_hashval = sthash->hi_hashval = hashval;
526 	mstr->sm_stlen = sthash->hi_stlen = stlen;
527 	mstr->sm_str = str;
528 	sthash->hi_refcnt = 1;
529 	sthash->hi_mstr = mstr;
530 
531 	/*
532 	 * Insert string element into head of hash list
533 	 */
534 	hashval = hashval % bckcnt;
535 	sthash->hi_next = hashbcks[hashval];
536 	hashbcks[hashval] = sthash;
537 	return (0);
538 }
539 
540 /*
541  * Return amount of space required for the string table.
542  */
543 uint_t
544 st_getstrtab_sz(Str_tbl *stp)
545 {
546 	assert(stp->st_fullstringsize > 0);
547 
548 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
549 		stp->st_flags |= FLG_STTAB_COOKED;
550 		return (stp->st_fullstringsize);
551 	}
552 
553 
554 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
555 		Stringelem	*stelem;
556 		void		*cookie;
557 
558 		stp->st_flags |= FLG_STTAB_COOKED;
559 		/*
560 		 * allocate a hash table about the size of # of
561 		 * strings input.
562 		 */
563 		stp->st_hbckcnt = findprime(stp->st_stringcnt);
564 		if ((stp->st_hashbcks =
565 		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
566 			return (0);
567 
568 		/*
569 		 * We now walk all of the strings in the list,
570 		 * from shortest to longest, and insert them into
571 		 * the hashtable.
572 		 */
573 		if ((stelem = avl_first(stp->st_strtree)) == NULL) {
574 			/*
575 			 * Is it possible we have a empty string table,
576 			 * if so - the table still conains '\0'
577 			 * so still return the size.
578 			 */
579 			if (avl_numnodes(stp->st_strtree) == 0) {
580 				assert(stp->st_stringsize == 1);
581 				return (stp->st_stringsize);
582 			}
583 			return (0);
584 		}
585 		while (stelem) {
586 			Stringlist	*strlist, *pstrlist;
587 
588 			/*
589 			 * Walk the string lists and insert them
590 			 * into the hash list.  Once a string is
591 			 * inserted we no longer need it's entry,
592 			 * so free it
593 			 */
594 			for (strlist = stelem->se_strlist, pstrlist = 0;
595 			    strlist; strlist = strlist->sl_next) {
596 				if (st_hash_insert(stp, strlist->sl_string,
597 				    stelem->se_stlen) == -1)
598 					return (0);
599 				if (pstrlist)
600 					free(pstrlist);
601 			}
602 			free(pstrlist);
603 			stelem->se_strlist = 0;
604 			stelem = AVL_NEXT(stp->st_strtree, stelem);
605 		}
606 
607 		/*
608 		 * Now that all of the strings have been freed,
609 		 * go ahead and quickly re-walk the AVL tree and
610 		 * free all of the AVL nodes.
611 		 *
612 		 * avl_destroy_nodes() beats avl_remove() because
613 		 * avl_remove will 'ballance' the tree as nodes
614 		 * are deleted - we just want to tear the whole
615 		 * thing down now.
616 		 */
617 		cookie = NULL;
618 		while ((stelem = avl_destroy_nodes(stp->st_strtree,
619 		    &cookie)) != NULL)
620 			free(stelem);
621 		avl_destroy(stp->st_strtree);
622 		free(stp->st_strtree);
623 		stp->st_strtree = 0;
624 	}
625 
626 	assert(stp->st_stringsize > 0);
627 	assert(stp->st_fullstringsize >= stp->st_stringsize);
628 
629 	return (stp->st_stringsize);
630 }
631 
632 /*
633  * Associate a buffer with the string table.
634  */
635 const char *
636 st_getstrbuf(Str_tbl *stp)
637 {
638 	return (stp->st_strbuf);
639 }
640 
641 int
642 st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize)
643 {
644 	assert(stp->st_flags & FLG_STTAB_COOKED);
645 
646 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
647 		if (bufsize < stp->st_fullstringsize)
648 			return (-1);
649 	} else {
650 		if (bufsize < stp->st_stringsize)
651 			return (-1);
652 	}
653 
654 	stp->st_strbuf = stbuf;
655 #ifdef	DEBUG
656 	/*
657 	 * for debug builds - start with a stringtable filled in
658 	 * with '0xff'.  This makes it very easy to find wholes
659 	 * which we failed to fill in - in the strtab.
660 	 */
661 	memset(stbuf, 0xff, bufsize);
662 	stbuf[0] = '\0';
663 #else
664 	memset(stbuf, 0x0, bufsize);
665 #endif
666 	return (0);
667 }
668