xref: /titanic_41/usr/src/cmd/sgs/tools/common/string_table.c (revision 70025d765b044c6d8594bb965a2247a61e991a99)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <string_table.h>
30 #include <stdlib.h>
31 #include <strings.h>
32 #include <sgs.h>
33 #include <stdio.h>
34 
35 
36 
37 /*
38  * This file provides the interfaces to build a Str_tbl suitable
39  * for use by either the sgsmsg system or a standard ELF
40  * SHT_STRTAB.
41  *
42  * There are two modes which can be used when constructing a
43  * string table:
44  *
45  *	st_new(0)
46  *		standard string table - no compression.  This is the
47  *		traditional method and fast
48  *
49  *	st_new(FLG_STNEW_COMPRESS)
50  *		build a compressed string table which both
51  *		eliminates duplicate strings and permits
52  *		strings with common suffixes (atexit vs. exit) to
53  *		overlap in the table.  This provides space
54  *		savings for many string tables.
55  *
56  * These string tables are now built with a common interface in a
57  * two-pass manner, the first pass it to find all of the strings
58  * required for the string-table and to calculate the size that
59  * will be required for the final string table.
60  *
61  * The second pass allocates the string table and populates the
62  * strings into the table and returns the offsets the strings
63  * have been assigned.
64  *
65  * The calling sequence to build and populate a string table is:
66  *
67  *		st_new();		// initialize strtab
68  *
69  *		st_insert(st1);		// first pass of strings ...
70  *					// calculates size required for
71  *					// string table
72  *
73  *		st_delstring(st?);	// remove string previously
74  *					// inserted
75  *
76  *		st_insert(stN);
77  *
78  *		st_getstrtab_sz();	// freezes strtab and computes
79  *					// size of table.
80  *
81  *		st_setstrbuf();		// associates a final destination
82  *					// for the string table
83  *
84  *		st_setstring(st1);	// populate the string table
85  *		...			// offsets are based off of second
86  *					// pass	through the string table
87  *		st_setstring(stN);
88  *
89  *		st_destroy();		// tear down string table
90  *					// structures.
91  *
92  * String Suffix Compression Algorithm:
93  *
94  *   Here's a quick high level overview of the Suffix String
95  *   compression algorithm used.  First - the heart of the algorithm
96  *   is a Hash table list which represents a dictionary of all unique
97  *   strings inserted into the string table.  The hash function for
98  *   this table is a standard string hash except that the hash starts
99  *   at the last character in the string (&str[n - 1]) and works towards
100  *   the first character in the function (&str[0]).  As we compute the
101  *   HASH value for a given string, we also compute the hash values
102  *   for all of the possible suffix strings for that string.
103  *
104  *   As we compute the hash - at each character see if the current
105  *   suffix string for that hash is already present in the table.  If
106  *   it is, and the string is a master string.  Then change that
107  *   string to a suffix string of the new string being inserted.
108  *
109  *   When the final hash value is found (hash for str[0...n]), check
110  *   to see if it is in the hash table - if so increment the reference
111  *   count for the string.  If it is not yet in the table, insert a
112  *   new hash table entry for a master string.
113  *
114  *   The above method will find all suffixes of a given string given
115  *   that the strings are inserted from shortest to longest.  That is
116  *   why this is a two phase method, we first collect all of the
117  *   strings and store them based off of their length in a nice AVL tree.
118  *   Once all of the strings have been submitted we then start the
119  *   hash table build by traversing the AVL tree in order and
120  *   inserting the strings from shortest to longest as described
121  *   above.
122  *
123  */
124 
125 /* LINTLIBRARY */
126 
127 
128 int
129 strlen_compare(const void *elem1, const void *elem2)
130 {
131 	uint_t	l1, l2;
132 	l1 = ((Stringelem *)elem1)->se_stlen;
133 	l2 = ((Stringelem *)elem2)->se_stlen;
134 
135 	if (l1 == l2)
136 		return (0);
137 	if (l2 < l1)
138 		return (1);
139 
140 	return (-1);
141 }
142 
143 /*
144  * Return a initialized Str_tbl - returns NULL on failure.
145  *
146  * stflags:
147  *
148  *	FLG_STNEW_COMPRESS - build a compressed string table
149  *
150  */
151 Str_tbl *
152 st_new(uint_t stflags)
153 {
154 	Str_tbl	*stp;
155 
156 	if ((stp = calloc(sizeof (Str_tbl), 1)) == 0)
157 		return (0);
158 
159 	/*
160 	 * Start with a leading '\0' - it's tradition.
161 	 */
162 	stp->st_stringsize = stp->st_fullstringsize = stp->st_nextoff = 1;
163 
164 	/*
165 	 * Do we compress this string table
166 	 */
167 	if ((stflags & FLG_STNEW_COMPRESS) == 0)
168 		return (stp);
169 
170 	stp->st_flags |= FLG_STTAB_COMPRESS;
171 	if ((stp->st_strtree = calloc(sizeof (avl_tree_t), 1)) == 0) {
172 		return (0);
173 	}
174 
175 	avl_create(stp->st_strtree, &strlen_compare, sizeof (Stringelem),
176 		SGSOFFSETOF(Stringelem, se_avlnode));
177 
178 	return (stp);
179 }
180 
181 /*
182  * Tear down a String_Table structure.
183  */
184 void
185 st_destroy(Str_tbl *stp)
186 {
187 	Str_hash	*sthash, *psthash;
188 	Str_master	*mstr, *pmstr;
189 	uint_t		i;
190 
191 	/*
192 	 * cleanup the master strings
193 	 */
194 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
195 	    mstr = mstr->sm_next) {
196 		if (pmstr)
197 			free(pmstr);
198 		pmstr = mstr;
199 	}
200 	if (pmstr)
201 		free(pmstr);
202 
203 	if (stp->st_hashbcks) {
204 		for (i = 0; i < stp->st_hbckcnt; i++) {
205 			for (sthash = stp->st_hashbcks[i], psthash = 0;
206 			    sthash; sthash = sthash->hi_next) {
207 				if (psthash)
208 					free(psthash);
209 				psthash = sthash;
210 			}
211 			if (psthash)
212 				free(psthash);
213 		}
214 		free(stp->st_hashbcks);
215 	}
216 	free(stp);
217 }
218 
219 
220 
221 
222 /*
223  * Remove a previously inserted string from the Str_tbl
224  */
225 int
226 st_delstring(Str_tbl *stp, const char *str)
227 {
228 	uint_t		stlen;
229 	Stringelem	qstelem;
230 	Stringelem	*stelem;
231 	Stringlist	*stlist, *pstlist;
232 
233 	/*
234 	 * String table can't have been cooked
235 	 */
236 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
237 
238 	stlen = (uint_t)strlen(str);
239 	stp->st_fullstringsize -= stlen + 1;
240 
241 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
242 		return (0);
243 
244 	qstelem.se_stlen = stlen;
245 	if ((stelem = avl_find(stp->st_strtree, &qstelem, 0)) == NULL) {
246 		/*
247 		 * no strings of this length recorded, let alone
248 		 * this specific string - someone goofed.
249 		 */
250 		return (-1);
251 	}
252 
253 	pstlist = 0;
254 	for (stlist = stelem->se_strlist; stlist; stlist = stlist->sl_next) {
255 		if (strcmp(str, stlist->sl_string) == 0)
256 			break;
257 		pstlist = stlist;
258 	}
259 
260 	if (stlist == 0) {
261 		/*
262 		 * string was not found
263 		 */
264 		return (-1);
265 	}
266 
267 	if (pstlist == 0) {
268 		/*
269 		 * String is first on list.
270 		 */
271 		stelem->se_strlist = stlist->sl_next;
272 	} else {
273 		/*
274 		 * remove string from list.
275 		 */
276 		pstlist->sl_next = stlist->sl_next;
277 	}
278 
279 	free(stlist);
280 	return (0);
281 }
282 
283 
284 /*
285  * Insert a new string into the Str_tbl
286  */
287 int
288 st_insert(Str_tbl *stp, const char *str)
289 {
290 	uint_t	stlen;
291 	Stringelem	qstelem;
292 	Stringelem	*stelem;
293 	Stringlist	*strlist;
294 	avl_index_t	where;
295 
296 	/*
297 	 * String table can't have been cooked
298 	 */
299 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
300 	stlen = (uint_t)strlen(str);
301 	/*
302 	 * Null strings always point to the head of the string
303 	 * table - no reason to keep searching.
304 	 */
305 	if (stlen == 0)
306 		return (0);
307 
308 	stp->st_fullstringsize += stlen + 1;
309 	stp->st_stringcnt++;
310 
311 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
312 		return (0);
313 
314 	qstelem.se_stlen = strlen(str);
315 	if ((stelem = avl_find(stp->st_strtree, &qstelem,
316 	    &where)) == NULL) {
317 		if ((stelem = calloc(sizeof (Stringelem), 1)) == 0)
318 			return (-1);
319 		stelem->se_stlen = qstelem.se_stlen;
320 		avl_insert(stp->st_strtree, stelem, where);
321 	}
322 	if ((strlist = malloc(sizeof (Stringlist))) == 0)
323 		return (-1);
324 
325 	strlist->sl_string = str;
326 	strlist->sl_next = stelem->se_strlist;
327 	stelem->se_strlist = strlist;
328 
329 	return (0);
330 }
331 
332 
333 /*
334  * For a given string - copy it into the buffer associated with
335  * the string table - and return the offset it has been assigned.
336  *
337  * If a value of '-1' is returned - the string was not found in
338  * the Str_tbl.
339  */
340 int
341 st_setstring(Str_tbl *stp, const char *str, uint_t *stoff)
342 {
343 	uint_t		stlen;
344 	uint_t		hashval;
345 	Str_hash	*sthash;
346 	Str_master	*mstr;
347 	int		i;
348 
349 	/*
350 	 * String table *must* have been previously cooked
351 	 */
352 	assert(stp->st_strbuf);
353 
354 	assert(stp->st_flags & FLG_STTAB_COOKED);
355 	stlen = (uint_t)strlen(str);
356 	/*
357 	 * Null string always points to head of string table
358 	 */
359 	if (stlen == 0) {
360 		*stoff = 0;
361 		return (0);
362 	}
363 
364 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
365 		uint_t		_stoff;
366 
367 		stlen++;	/* count for trailing '\0' */
368 		_stoff = stp->st_nextoff;
369 		/*
370 		 * Have we overflowed our assigned buffer?
371 		 */
372 		if ((_stoff + stlen) > stp->st_fullstringsize)
373 			return (-1);
374 		memcpy(stp->st_strbuf + _stoff, str, stlen);
375 		*stoff = _stoff;
376 		stp->st_nextoff += stlen;
377 		return (0);
378 	}
379 
380 	/*
381 	 * Calculate reverse hash for string
382 	 */
383 	hashval = HASHSEED;
384 	for (i = stlen; i >= 0; i--) {
385 		hashval = ((hashval << 5) + hashval) +
386 			str[i];			/* h = ((h * 33) + c) */
387 	}
388 
389 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
390 	    sthash = sthash->hi_next) {
391 		if (sthash->hi_hashval == hashval) {
392 			const char	*hstr;
393 
394 			hstr = &sthash->hi_mstr->sm_str[
395 			    sthash->hi_mstr->sm_stlen -
396 			    sthash->hi_stlen];
397 			if (strcmp(str, hstr) == 0) {
398 				break;
399 			}
400 		}
401 	}
402 
403 	/*
404 	 * Did we find the string?
405 	 */
406 	if (sthash == 0)
407 		return (-1);
408 
409 	/*
410 	 * Has this string been copied into the string table?
411 	 */
412 	mstr = sthash->hi_mstr;
413 	if (mstr->sm_stoff == 0) {
414 		uint_t	mstlen = mstr->sm_stlen + 1;
415 		mstr->sm_stoff = stp->st_nextoff;
416 		/*
417 		 * Have we overflowed our assigned buffer?
418 		 */
419 		if ((mstr->sm_stoff + mstlen) > stp->st_fullstringsize)
420 			return (-1);
421 		memcpy(stp->st_strbuf + mstr->sm_stoff, mstr->sm_str,
422 			mstlen);
423 		stp->st_nextoff += mstlen;
424 	}
425 	/*
426 	 * Calculate offset of (sub)string
427 	 */
428 	*stoff = mstr->sm_stoff + mstr->sm_stlen - sthash->hi_stlen;
429 
430 	return (0);
431 }
432 
433 
434 static int
435 st_hash_insert(Str_tbl *stp, const char *str, uint_t stlen)
436 {
437 	int		i;
438 	uint_t		hashval = HASHSEED;
439 	uint_t		bckcnt = stp->st_hbckcnt;
440 	Str_hash	**hashbcks = stp->st_hashbcks;
441 	Str_hash	*sthash;
442 	Str_master	*mstr = 0;
443 
444 	/*
445 	 * We use a classic 'Bernstein k=33' hash function.  But
446 	 * instead of hashing from the start of the string to the
447 	 * end, we do it in reverse.
448 	 *
449 	 * This way - we are essentially building all of the
450 	 * suffix hashvalues as we go.  We can check to see if
451 	 * any suffixes already exist in the tree as we generate
452 	 * the hash.
453 	 */
454 	for (i = stlen; i >= 0; i--) {
455 
456 		hashval = ((hashval << 5) + hashval) +
457 			str[i];			/* h = ((h * 33) + c) */
458 		for (sthash = hashbcks[hashval % bckcnt];
459 		    sthash; sthash = sthash->hi_next) {
460 
461 			if (sthash->hi_hashval == hashval) {
462 				const char	*hstr;
463 				Str_master	*_mstr;
464 
465 				_mstr = sthash->hi_mstr;
466 				hstr = &_mstr->sm_str[_mstr->sm_stlen -
467 				    sthash->hi_stlen];
468 				if (strcmp(&str[i], hstr) == 0) {
469 					if (i == 0) {
470 						/*
471 						 * Entry already in table,
472 						 * increment refcnt and get
473 						 * out.
474 						 */
475 						sthash->hi_refcnt++;
476 						return (0);
477 					} else {
478 						/*
479 						 * If this 'suffix' is
480 						 * presently a 'master' string,
481 						 * then take over it's record.
482 						 */
483 						if (sthash->hi_stlen ==
484 						    _mstr->sm_stlen) {
485 							/*
486 							 * we should only do
487 							 * this once.
488 							 */
489 							assert(mstr == 0);
490 							mstr = _mstr;
491 						}
492 					}
493 				}
494 			}
495 		}
496 	}
497 
498 
499 	/*
500 	 * Do we need a new master string, or can we take over
501 	 * one we already found in the table?
502 	 */
503 	if (mstr == 0) {
504 		/*
505 		 * allocate a new master string
506 		 */
507 		if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
508 			return (-1);
509 		mstr->sm_next = stp->st_mstrlist;
510 		stp->st_mstrlist = mstr;
511 		stp->st_stringsize += stlen + 1;
512 	} else {
513 		/*
514 		 * We are taking over a existing master string,
515 		 * the stringsize only increments by the
516 		 * difference between the currnet string and the
517 		 * previous master.
518 		 */
519 		assert(stlen > mstr->sm_stlen);
520 		stp->st_stringsize += stlen - mstr->sm_stlen;
521 	}
522 
523 	if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
524 		return (-1);
525 
526 	mstr->sm_hashval = sthash->hi_hashval = hashval;
527 	mstr->sm_stlen = sthash->hi_stlen = stlen;
528 	mstr->sm_str = str;
529 	sthash->hi_refcnt = 1;
530 	sthash->hi_mstr = mstr;
531 
532 	/*
533 	 * Insert string element into head of hash list
534 	 */
535 	hashval = hashval % bckcnt;
536 	sthash->hi_next = hashbcks[hashval];
537 	hashbcks[hashval] = sthash;
538 	return (0);
539 }
540 
541 /*
542  * Return amount of space required for the string table.
543  */
544 uint_t
545 st_getstrtab_sz(Str_tbl *stp)
546 {
547 	assert(stp->st_fullstringsize > 0);
548 
549 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
550 		stp->st_flags |= FLG_STTAB_COOKED;
551 		return (stp->st_fullstringsize);
552 	}
553 
554 
555 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
556 		Stringelem	*stelem;
557 		void		*cookie;
558 
559 		stp->st_flags |= FLG_STTAB_COOKED;
560 		/*
561 		 * allocate a hash table about the size of # of
562 		 * strings input.
563 		 */
564 		stp->st_hbckcnt = findprime(stp->st_stringcnt);
565 		if ((stp->st_hashbcks =
566 		    calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
567 			return (0);
568 
569 		/*
570 		 * We now walk all of the strings in the list,
571 		 * from shortest to longest, and insert them into
572 		 * the hashtable.
573 		 */
574 		if ((stelem = avl_first(stp->st_strtree)) == NULL) {
575 			/*
576 			 * Is it possible we have a empty string table,
577 			 * if so - the table still conains '\0'
578 			 * so still return the size.
579 			 */
580 			if (avl_numnodes(stp->st_strtree) == 0) {
581 				assert(stp->st_stringsize == 1);
582 				return (stp->st_stringsize);
583 			}
584 			return (0);
585 		}
586 		while (stelem) {
587 			Stringlist	*strlist, *pstrlist;
588 
589 			/*
590 			 * Walk the string lists and insert them
591 			 * into the hash list.  Once a string is
592 			 * inserted we no longer need it's entry,
593 			 * so free it
594 			 */
595 			for (strlist = stelem->se_strlist, pstrlist = 0;
596 			    strlist; strlist = strlist->sl_next) {
597 				if (st_hash_insert(stp, strlist->sl_string,
598 				    stelem->se_stlen) == -1)
599 					return (0);
600 				if (pstrlist)
601 					free(pstrlist);
602 			}
603 			free(pstrlist);
604 			stelem->se_strlist = 0;
605 			stelem = AVL_NEXT(stp->st_strtree, stelem);
606 		}
607 
608 		/*
609 		 * Now that all of the strings have been freed,
610 		 * go ahead and quickly re-walk the AVL tree and
611 		 * free all of the AVL nodes.
612 		 *
613 		 * avl_destroy_nodes() beats avl_remove() because
614 		 * avl_remove will 'ballance' the tree as nodes
615 		 * are deleted - we just want to tear the whole
616 		 * thing down now.
617 		 */
618 		cookie = NULL;
619 		while ((stelem = avl_destroy_nodes(stp->st_strtree,
620 		    &cookie)) != NULL)
621 			free(stelem);
622 		avl_destroy(stp->st_strtree);
623 		free(stp->st_strtree);
624 		stp->st_strtree = 0;
625 	}
626 
627 	assert(stp->st_stringsize > 0);
628 	assert(stp->st_fullstringsize >= stp->st_stringsize);
629 
630 	return (stp->st_stringsize);
631 }
632 
633 /*
634  * Associate a buffer with the string table.
635  */
636 const char *
637 st_getstrbuf(Str_tbl *stp)
638 {
639 	return (stp->st_strbuf);
640 }
641 
642 int
643 st_setstrbuf(Str_tbl *stp, char *stbuf, uint_t bufsize)
644 {
645 	assert(stp->st_flags & FLG_STTAB_COOKED);
646 
647 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
648 		if (bufsize < stp->st_fullstringsize)
649 			return (-1);
650 	} else {
651 		if (bufsize < stp->st_stringsize)
652 			return (-1);
653 	}
654 
655 	stp->st_strbuf = stbuf;
656 #ifdef	DEBUG
657 	/*
658 	 * for debug builds - start with a stringtable filled in
659 	 * with '0xff'.  This makes it very easy to find wholes
660 	 * which we failed to fill in - in the strtab.
661 	 */
662 	memset(stbuf, 0xff, bufsize);
663 	stbuf[0] = '\0';
664 #else
665 	memset(stbuf, 0x0, bufsize);
666 #endif
667 	return (0);
668 }
669