xref: /titanic_50/usr/src/cmd/mdb/common/modules/genunix/findstack.c (revision 9cd928fe5e3ea4e05f64cfb380beb54b2623e7dc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <mdb/mdb_modapi.h>
27 #include <mdb/mdb_ctf.h>
28 
29 #include <sys/types.h>
30 #include <sys/regset.h>
31 #include <sys/stack.h>
32 #include <sys/thread.h>
33 #include <sys/modctl.h>
34 
35 #include "findstack.h"
36 #include "thread.h"
37 #include "sobj.h"
38 
/*
 * Result/scratch state for a single stack search.  do_findstack() resets
 * and fills in the result fields; crawl() records the return addresses it
 * walks over into fsi_stack.  The caller owns fsi_stack and fsi_max_depth:
 * fsi_stack must have room for fsi_max_depth entries (a zero fsi_max_depth
 * means no frames are recorded and any frame found sets fsi_overflow).
 */
typedef struct findstack_info {
	uintptr_t	*fsi_stack;	/* place to record frames */

	uintptr_t	fsi_sp;		/* stack pointer */
	uintptr_t	fsi_pc;		/* pc */
	uintptr_t	fsi_sobj_ops;	/* sobj_ops */

	uint_t		fsi_tstate;	/* t_state */

	uchar_t		fsi_depth;	/* stack depth */
	uchar_t		fsi_failed;	/* search failed */
	uchar_t		fsi_overflow;	/* stack was deeper than max_depth */
	uchar_t		fsi_panic;	/* thread called panic() */

	uchar_t		fsi_max_depth;	/* stack frames available */
} findstack_info_t;
/* Values stored in fsi_failed; zero means the search did not fail. */
#define	FSI_FAIL_BADTHREAD	1
#define	FSI_FAIL_NOTINMEMORY	2
#define	FSI_FAIL_THREADCORRUPT	3
#define	FSI_FAIL_STACKNOTFOUND	4
59 
60 #ifndef STACK_BIAS
61 #define	STACK_BIAS	0
62 #endif
63 
/*
 * Emit a debug message, prefixed with "findstack debug: ", when
 * findstack_debug_on is set.  "x" is a complete, parenthesized
 * mdb_printf() argument list, e.g. fs_dprintf(("fp = %p\n", fp)).
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement and cannot capture a following "else" when used in an
 * unbraced if/else.
 */
#define	fs_dprintf(x)					\
	do {						\
		if (findstack_debug_on) {		\
			mdb_printf("findstack debug: ");	\
			/*CSTYLED*/			\
			mdb_printf x ;			\
		}					\
	} while (0)
70 
71 static int findstack_debug_on = 0;
72 
/*
 * On x86 a minimal "struct rwindow" is declared here, holding just the two
 * words crawl() looks at in each frame: the saved frame pointer and the
 * return address.
 * NOTE(review): other platforms presumably get struct rwindow from the
 * system headers included above -- confirm.
 */
#if defined(__i386) || defined(__amd64)
struct rwindow {
	uintptr_t rw_fp;
	uintptr_t rw_rtn;
};
#endif
79 
/* Stacks larger than this (1MB) are treated as bogus by do_findstack(). */
#define	TOO_BIG_FOR_A_STACK (1024 * 1024)

/*
 * Translate between a kernel stack address and the matching address in
 * MDB's private copy of that stack.  Both macros expect variables named
 * "kbase" and "ubase" to be in scope at the point of use.
 */
#define	KTOU(p) ((p) - kbase + ubase)
#define	UTOK(p) ((p) - ubase + kbase)

/* crawl() return value meaning "reached the bottom of the stack". */
#define	CRAWL_FOUNDALL	(-1)
86 
87 /*
88  * Given a stack pointer, try to crawl down it to the bottom.
89  * "frame" is a VA in MDB's address space.
90  *
91  * Returns the number of frames successfully crawled down, or
92  * CRAWL_FOUNDALL if it got to the bottom of the stack.
93  */
94 static int
95 crawl(uintptr_t frame, uintptr_t kbase, uintptr_t ktop, uintptr_t ubase,
96     int kill_fp, findstack_info_t *fsip)
97 {
98 	int levels = 0;
99 
100 	fsip->fsi_depth = 0;
101 	fsip->fsi_overflow = 0;
102 
103 	fs_dprintf(("<0> frame = %p, kbase = %p, ktop = %p, ubase = %p\n",
104 	    frame, kbase, ktop, ubase));
105 	for (;;) {
106 		uintptr_t fp;
107 		long *fpp = (long *)&((struct rwindow *)frame)->rw_fp;
108 
109 		fs_dprintf(("<1> fpp = %p, frame = %p\n", fpp, frame));
110 
111 		if ((frame & (STACK_ALIGN - 1)) != 0)
112 			break;
113 
114 		fp = ((struct rwindow *)frame)->rw_fp + STACK_BIAS;
115 		if (fsip->fsi_depth < fsip->fsi_max_depth)
116 			fsip->fsi_stack[fsip->fsi_depth++] =
117 			    ((struct rwindow *)frame)->rw_rtn;
118 		else
119 			fsip->fsi_overflow = 1;
120 
121 		fs_dprintf(("<2> fp = %p\n", fp));
122 
123 		if (fp == ktop)
124 			return (CRAWL_FOUNDALL);
125 		fs_dprintf(("<3> not at base\n"));
126 
127 #if defined(__i386) || defined(__amd64)
128 		if (ktop - fp == sizeof (struct rwindow)) {
129 			fs_dprintf(("<4> found base\n"));
130 			return (CRAWL_FOUNDALL);
131 		}
132 #endif
133 
134 		fs_dprintf(("<5> fp = %p, kbase = %p, ktop - size = %p\n",
135 		    fp, kbase, ktop - sizeof (struct rwindow)));
136 
137 		if (fp < kbase || fp >= (ktop - sizeof (struct rwindow)))
138 			break;
139 
140 		frame = KTOU(fp);
141 		fs_dprintf(("<6> frame = %p\n", frame));
142 
143 		/*
144 		 * NULL out the old %fp so we don't go down this stack
145 		 * more than once.
146 		 */
147 		if (kill_fp) {
148 			fs_dprintf(("<7> fpp = %p\n", fpp));
149 			*fpp = NULL;
150 		}
151 
152 		fs_dprintf(("<8> levels = %d\n", levels));
153 		levels++;
154 	}
155 
156 	return (levels);
157 }
158 
159 /*
160  * "sp" is a kernel VA.
161  */
162 static int
163 print_stack(uintptr_t sp, uintptr_t pc, uintptr_t addr,
164     int argc, const mdb_arg_t *argv, int free_state)
165 {
166 	int showargs = 0, count, err;
167 
168 	count = mdb_getopts(argc, argv,
169 	    'v', MDB_OPT_SETBITS, TRUE, &showargs, NULL);
170 	argc -= count;
171 	argv += count;
172 
173 	if (argc > 1 || (argc == 1 && argv->a_type != MDB_TYPE_STRING))
174 		return (DCMD_USAGE);
175 
176 	mdb_printf("stack pointer for thread %p%s: %p\n",
177 	    addr, (free_state ? " (TS_FREE)" : ""), sp);
178 	if (pc != 0)
179 		mdb_printf("[ %0?lr %a() ]\n", sp, pc);
180 
181 	mdb_inc_indent(2);
182 	mdb_set_dot(sp);
183 
184 	if (argc == 1)
185 		err = mdb_eval(argv->a_un.a_str);
186 	else if (showargs)
187 		err = mdb_eval("<.$C");
188 	else
189 		err = mdb_eval("<.$C0");
190 
191 	mdb_dec_indent(2);
192 
193 	return ((err == -1) ? DCMD_ABORT : DCMD_OK);
194 }
195 
196 /*ARGSUSED*/
197 static int
198 do_findstack(uintptr_t addr, findstack_info_t *fsip, uint_t print_warnings)
199 {
200 	kthread_t thr;
201 	size_t stksz;
202 	uintptr_t ubase, utop;
203 	uintptr_t kbase, ktop;
204 	uintptr_t win, sp;
205 
206 	fsip->fsi_failed = 0;
207 	fsip->fsi_pc = 0;
208 	fsip->fsi_sp = 0;
209 	fsip->fsi_depth = 0;
210 	fsip->fsi_overflow = 0;
211 
212 	bzero(&thr, sizeof (thr));
213 	if (mdb_ctf_vread(&thr, "kthread_t", addr,
214 	    MDB_CTF_VREAD_IGNORE_ALL) == -1) {
215 		if (print_warnings)
216 			mdb_warn("couldn't read thread at %p\n", addr);
217 		fsip->fsi_failed = FSI_FAIL_BADTHREAD;
218 		return (DCMD_ERR);
219 	}
220 
221 	fsip->fsi_sobj_ops = (uintptr_t)thr.t_sobj_ops;
222 	fsip->fsi_tstate = thr.t_state;
223 	fsip->fsi_panic = !!(thr.t_flag & T_PANIC);
224 
225 	if ((thr.t_schedflag & TS_LOAD) == 0) {
226 		if (print_warnings)
227 			mdb_warn("thread %p isn't in memory\n", addr);
228 		fsip->fsi_failed = FSI_FAIL_NOTINMEMORY;
229 		return (DCMD_ERR);
230 	}
231 
232 	if (thr.t_stk < thr.t_stkbase) {
233 		if (print_warnings)
234 			mdb_warn(
235 			    "stack base or stack top corrupt for thread %p\n",
236 			    addr);
237 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
238 		return (DCMD_ERR);
239 	}
240 
241 	kbase = (uintptr_t)thr.t_stkbase;
242 	ktop = (uintptr_t)thr.t_stk;
243 	stksz = ktop - kbase;
244 
245 #ifdef __amd64
246 	/*
247 	 * The stack on amd64 is intentionally misaligned, so ignore the top
248 	 * half-frame.  See thread_stk_init().  When handling traps, the frame
249 	 * is automatically aligned by the hardware, so we only alter ktop if
250 	 * needed.
251 	 */
252 	if ((ktop & (STACK_ALIGN - 1)) != 0)
253 		ktop -= STACK_ENTRY_ALIGN;
254 #endif
255 
256 	/*
257 	 * If the stack size is larger than a meg, assume that it's bogus.
258 	 */
259 	if (stksz > TOO_BIG_FOR_A_STACK) {
260 		if (print_warnings)
261 			mdb_warn("stack size for thread %p is too big to be "
262 			    "reasonable\n", addr);
263 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
264 		return (DCMD_ERR);
265 	}
266 
267 	/*
268 	 * This could be (and was) a UM_GC allocation.  Unfortunately,
269 	 * stksz tends to be very large.  As currently implemented, dcmds
270 	 * invoked as part of pipelines don't have their UM_GC-allocated
271 	 * memory freed until the pipeline completes.  With stksz in the
272 	 * neighborhood of 20k, the popular ::walk thread |::findstack
273 	 * pipeline can easily run memory-constrained debuggers (kmdb) out
274 	 * of memory.  This can be changed back to a gc-able allocation when
275 	 * the debugger is changed to free UM_GC memory more promptly.
276 	 */
277 	ubase = (uintptr_t)mdb_alloc(stksz, UM_SLEEP);
278 	utop = ubase + stksz;
279 	if (mdb_vread((caddr_t)ubase, stksz, kbase) != stksz) {
280 		mdb_free((void *)ubase, stksz);
281 		if (print_warnings)
282 			mdb_warn("couldn't read entire stack for thread %p\n",
283 			    addr);
284 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
285 		return (DCMD_ERR);
286 	}
287 
288 	/*
289 	 * Try the saved %sp first, if it looks reasonable.
290 	 */
291 	sp = KTOU((uintptr_t)thr.t_sp + STACK_BIAS);
292 	if (sp >= ubase && sp <= utop) {
293 		if (crawl(sp, kbase, ktop, ubase, 0, fsip) == CRAWL_FOUNDALL) {
294 			fsip->fsi_sp = (uintptr_t)thr.t_sp;
295 #if !defined(__i386)
296 			fsip->fsi_pc = (uintptr_t)thr.t_pc;
297 #endif
298 			goto found;
299 		}
300 	}
301 
302 	/*
303 	 * Now walk through the whole stack, starting at the base,
304 	 * trying every possible "window".
305 	 */
306 	for (win = ubase;
307 	    win + sizeof (struct rwindow) <= utop;
308 	    win += sizeof (struct rwindow *)) {
309 		if (crawl(win, kbase, ktop, ubase, 1, fsip) == CRAWL_FOUNDALL) {
310 			fsip->fsi_sp = UTOK(win) - STACK_BIAS;
311 			goto found;
312 		}
313 	}
314 
315 	/*
316 	 * We didn't conclusively find the stack.  So we'll take another lap,
317 	 * and print out anything that looks possible.
318 	 */
319 	if (print_warnings)
320 		mdb_printf("Possible stack pointers for thread %p:\n", addr);
321 	(void) mdb_vread((caddr_t)ubase, stksz, kbase);
322 
323 	for (win = ubase;
324 	    win + sizeof (struct rwindow) <= utop;
325 	    win += sizeof (struct rwindow *)) {
326 		uintptr_t fp = ((struct rwindow *)win)->rw_fp;
327 		int levels;
328 
329 		if ((levels = crawl(win, kbase, ktop, ubase, 1, fsip)) > 1) {
330 			if (print_warnings)
331 				mdb_printf("  %p (%d)\n", fp, levels);
332 		} else if (levels == CRAWL_FOUNDALL) {
333 			/*
334 			 * If this is a live system, the stack could change
335 			 * between the two mdb_vread(ubase, utop, kbase)'s,
336 			 * and we could have a fully valid stack here.
337 			 */
338 			fsip->fsi_sp = UTOK(win) - STACK_BIAS;
339 			goto found;
340 		}
341 	}
342 
343 	fsip->fsi_depth = 0;
344 	fsip->fsi_overflow = 0;
345 	fsip->fsi_failed = FSI_FAIL_STACKNOTFOUND;
346 
347 	mdb_free((void *)ubase, stksz);
348 	return (DCMD_ERR);
349 found:
350 	mdb_free((void *)ubase, stksz);
351 	return (DCMD_OK);
352 }
353 
354 int
355 findstack(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
356 {
357 	findstack_info_t fsi;
358 	int retval;
359 
360 	if (!(flags & DCMD_ADDRSPEC))
361 		return (DCMD_USAGE);
362 
363 	bzero(&fsi, sizeof (fsi));
364 
365 	if ((retval = do_findstack(addr, &fsi, 1)) != DCMD_OK ||
366 	    fsi.fsi_failed)
367 		return (retval);
368 
369 	return (print_stack(fsi.fsi_sp, fsi.fsi_pc, addr,
370 	    argc, argv, fsi.fsi_tstate == TS_FREE));
371 }
372 
373 /*ARGSUSED*/
374 int
375 findstack_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *av)
376 {
377 	findstack_debug_on ^= 1;
378 
379 	mdb_printf("findstack: debugging is now %s\n",
380 	    findstack_debug_on ? "on" : "off");
381 
382 	return (DCMD_OK);
383 }
384 
/*
 * Convert the ASCII lower-case letters of the NUL-terminated string "p"
 * to upper case, in place.  All other characters are left untouched.
 */
static void
uppercase(char *p)
{
	char c;

	while ((c = *p) != '\0') {
		if (c >= 'a' && c <= 'z')
			*p = c - ('a' - 'A');
		p++;
	}
}
393 
/*
 * Write the text produced by sobj_ops_to_text() for the sobj_ops address
 * "addr" into out (out_sz bytes), then upper-case it in place.
 */
static void
sobj_to_text(uintptr_t addr, char *out, size_t out_sz)
{
	sobj_ops_to_text(addr, out, out_sz);
	uppercase(out);
}
400 
/* Pseudo sobj_ops value stored in *out when the text is "ALL". */
#define	SOBJ_ALL	1
/*
 * Translate a synchronization object name into a value stored in *out.
 * "ALL" (compared case-insensitively) yields SOBJ_ALL; any other text is
 * handed to sobj_text_to_ops().  Returns 0 for "ALL", otherwise whatever
 * sobj_text_to_ops() returns.
 */
static int
text_to_sobj(const char *text, uintptr_t *out)
{
	if (strcasecmp(text, "ALL") != 0)
		return (sobj_text_to_ops(text, out));

	*out = SOBJ_ALL;
	return (0);
}
411 
412 #define	TSTATE_PANIC	-2U
413 static int
414 text_to_tstate(const char *text, uint_t *out)
415 {
416 	if (strcasecmp(text, "panic") == 0)
417 		*out = TSTATE_PANIC;
418 	else if (thread_text_to_state(text, out) != 0) {
419 		mdb_warn("tstate \"%s\" not recognized\n", text);
420 		return (-1);
421 	}
422 	return (0);
423 }
424 
425 static void
426 tstate_to_text(uint_t tstate, uint_t paniced, char *out, size_t out_sz)
427 {
428 	if (paniced)
429 		mdb_snprintf(out, out_sz, "panic");
430 	else
431 		thread_state_to_text(tstate, out, out_sz);
432 	uppercase(out);
433 }
434 
/*
 * One thread's stack as gathered by stacks_thread_cb().  Entries with
 * distinct stacks are chained through se_next within a hash bucket;
 * further threads whose stack compares equal are chained off the first
 * such entry through se_dup, and counted in that entry's se_count.
 *
 * se_stack is really se_depth entries long: entries are allocated with
 * STACKS_ENTRY_SIZE(depth) bytes and must be freed with the same size.
 */
typedef struct stacks_entry {
	struct stacks_entry	*se_next;
	struct stacks_entry	*se_dup;	/* dups of this stack */
	uintptr_t		se_thread;
	uintptr_t		se_sp;
	uintptr_t		se_sobj_ops;
	uint32_t		se_tstate;
	uint32_t		se_count;	/* # threads w/ this stack */
	uint8_t			se_overflow;
	uint8_t			se_depth;
	uint8_t			se_failed;	/* failure reason; FSI_FAIL_* */
	uint8_t			se_panic;
	uintptr_t		se_stack[1];
} stacks_entry_t;
/* Bytes needed for a stacks_entry_t holding x frames in se_stack. */
#define	STACKS_ENTRY_SIZE(x) OFFSETOF(stacks_entry_t, se_stack[(x)])
450 
451 #define	STACKS_HSIZE 127
452 
453 /* Maximum stack depth reported in stacks */
454 #define	STACKS_MAX_DEPTH	254
455 
/*
 * State shared between stacks_run() and its per-thread callback,
 * stacks_thread_cb(): the hash table being built, running totals, and the
 * findstack_info_t that is reused for every thread.
 */
typedef struct stacks_info {
	size_t		si_count;	/* total stacks_entry_ts (incl dups) */
	size_t		si_entries;	/* # entries in hash table */
	stacks_entry_t	**si_hash;	/* hash table */

	findstack_info_t si_fsi;	/* transient callback state */
} stacks_info_t;
463 
464 
/*
 * global state cached between invocations
 *
 * stacks_state is set to DIRTY when stacks_run() starts gathering, to DONE
 * when a run over all threads (no thread list) completes, and back to
 * CLEAN by stacks_cleanup().  stacks_hash is non-NULL only while the hash
 * table is still needed to reach the entries; stacks_array (with
 * stacks_array_size elements) holds the distinct stacks for display.
 */
#define	STACKS_STATE_CLEAN	0
#define	STACKS_STATE_DIRTY	1
#define	STACKS_STATE_DONE	2
static uint_t stacks_state = STACKS_STATE_CLEAN;
static stacks_entry_t **stacks_hash;
static stacks_entry_t **stacks_array;
static size_t stacks_array_size;
473 
474 size_t
475 stacks_hash_entry(stacks_entry_t *sep)
476 {
477 	size_t depth = sep->se_depth;
478 	uintptr_t *stack = sep->se_stack;
479 
480 	uint64_t total = depth;
481 
482 	while (depth > 0) {
483 		total += *stack;
484 		stack++; depth--;
485 	}
486 
487 	return (total % STACKS_HSIZE);
488 }
489 
490 /*
491  * This is used to both compare stacks for equality and to sort the final
492  * list of unique stacks.  forsort specifies the latter behavior, which
493  * additionally:
494  *	compares se_count, and
495  *	sorts the stacks by text function name.
496  *
497  * The equality test is independent of se_count, and doesn't care about
498  * relative ordering, so we don't do the extra work of looking up symbols
499  * for the stack addresses.
500  */
501 int
502 stacks_entry_comp_impl(stacks_entry_t *l, stacks_entry_t *r,
503     uint_t forsort)
504 {
505 	int idx;
506 
507 	int depth = MIN(l->se_depth, r->se_depth);
508 
509 	/* no matter what, panic stacks come last. */
510 	if (l->se_panic > r->se_panic)
511 		return (1);
512 	if (l->se_panic < r->se_panic)
513 		return (-1);
514 
515 	if (forsort) {
516 		/* put large counts earlier */
517 		if (l->se_count > r->se_count)
518 			return (-1);
519 		if (l->se_count < r->se_count)
520 			return (1);
521 	}
522 
523 	if (l->se_tstate > r->se_tstate)
524 		return (1);
525 	if (l->se_tstate < r->se_tstate)
526 		return (-1);
527 
528 	if (l->se_failed > r->se_failed)
529 		return (1);
530 	if (l->se_failed < r->se_failed)
531 		return (-1);
532 
533 	for (idx = 0; idx < depth; idx++) {
534 		char lbuf[MDB_SYM_NAMLEN];
535 		char rbuf[MDB_SYM_NAMLEN];
536 
537 		int rval;
538 		uintptr_t laddr = l->se_stack[idx];
539 		uintptr_t raddr = r->se_stack[idx];
540 
541 		if (laddr == raddr)
542 			continue;
543 
544 		if (forsort &&
545 		    mdb_lookup_by_addr(laddr, MDB_SYM_FUZZY,
546 		    lbuf, sizeof (lbuf), NULL) != -1 &&
547 		    mdb_lookup_by_addr(raddr, MDB_SYM_FUZZY,
548 		    rbuf, sizeof (rbuf), NULL) != -1 &&
549 		    (rval = strcmp(lbuf, rbuf)) != 0)
550 			return (rval);
551 
552 		if (laddr > raddr)
553 			return (1);
554 		return (-1);
555 	}
556 
557 	if (l->se_overflow > r->se_overflow)
558 		return (-1);
559 	if (l->se_overflow < r->se_overflow)
560 		return (1);
561 
562 	if (l->se_depth > r->se_depth)
563 		return (1);
564 	if (l->se_depth < r->se_depth)
565 		return (-1);
566 
567 	if (l->se_sobj_ops > r->se_sobj_ops)
568 		return (1);
569 	if (l->se_sobj_ops < r->se_sobj_ops)
570 		return (-1);
571 
572 	return (0);
573 }
574 
575 int
576 stacks_entry_comp(const void *l_arg, const void *r_arg)
577 {
578 	stacks_entry_t * const *lp = l_arg;
579 	stacks_entry_t * const *rp = r_arg;
580 
581 	return (stacks_entry_comp_impl(*lp, *rp, 1));
582 }
583 
584 void
585 stacks_cleanup(int force)
586 {
587 	int idx = 0;
588 	stacks_entry_t *cur, *next;
589 
590 	if (stacks_state == STACKS_STATE_CLEAN)
591 		return;
592 
593 	if (!force && stacks_state == STACKS_STATE_DONE)
594 		return;
595 
596 	/*
597 	 * Until the array is sorted and stable, stacks_hash will be non-NULL.
598 	 * This way, we can get at all of the data, even if qsort() was
599 	 * interrupted while mucking with the array.
600 	 */
601 	if (stacks_hash != NULL) {
602 		for (idx = 0; idx < STACKS_HSIZE; idx++) {
603 			while ((cur = stacks_hash[idx]) != NULL) {
604 				while ((next = cur->se_dup) != NULL) {
605 					cur->se_dup = next->se_dup;
606 					mdb_free(next,
607 					    STACKS_ENTRY_SIZE(next->se_depth));
608 				}
609 				next = cur->se_next;
610 				stacks_hash[idx] = next;
611 				mdb_free(cur, STACKS_ENTRY_SIZE(cur->se_depth));
612 			}
613 		}
614 		if (stacks_array != NULL)
615 			mdb_free(stacks_array,
616 			    stacks_array_size * sizeof (*stacks_array));
617 
618 	} else if (stacks_array != NULL) {
619 		for (idx = 0; idx < stacks_array_size; idx++) {
620 			if ((cur = stacks_array[idx]) != NULL) {
621 				while ((next = cur->se_dup) != NULL) {
622 					cur->se_dup = next->se_dup;
623 					mdb_free(next,
624 					    STACKS_ENTRY_SIZE(next->se_depth));
625 				}
626 				stacks_array[idx] = NULL;
627 				mdb_free(cur, STACKS_ENTRY_SIZE(cur->se_depth));
628 			}
629 		}
630 		mdb_free(stacks_array,
631 		    stacks_array_size * sizeof (*stacks_array));
632 	}
633 
634 	stacks_array_size = 0;
635 	stacks_state = STACKS_STATE_CLEAN;
636 }
637 
638 /*ARGSUSED*/
639 int
640 stacks_thread_cb(uintptr_t addr, const void *ignored, void *cbarg)
641 {
642 	stacks_info_t *sip = cbarg;
643 	findstack_info_t *fsip = &sip->si_fsi;
644 
645 	stacks_entry_t **sepp, *nsep, *sep;
646 	int idx;
647 	size_t depth;
648 
649 	if (do_findstack(addr, fsip, 0) != DCMD_OK &&
650 	    fsip->fsi_failed == FSI_FAIL_BADTHREAD) {
651 		mdb_warn("couldn't read thread at %p\n", addr);
652 		return (WALK_NEXT);
653 	}
654 
655 	sip->si_count++;
656 
657 	depth = fsip->fsi_depth;
658 	nsep = mdb_zalloc(STACKS_ENTRY_SIZE(depth), UM_SLEEP);
659 	nsep->se_thread = addr;
660 	nsep->se_sp = fsip->fsi_sp;
661 	nsep->se_sobj_ops = fsip->fsi_sobj_ops;
662 	nsep->se_tstate = fsip->fsi_tstate;
663 	nsep->se_count = 1;
664 	nsep->se_overflow = fsip->fsi_overflow;
665 	nsep->se_depth = depth;
666 	nsep->se_failed = fsip->fsi_failed;
667 	nsep->se_panic = fsip->fsi_panic;
668 
669 	for (idx = 0; idx < depth; idx++)
670 		nsep->se_stack[idx] = fsip->fsi_stack[idx];
671 
672 	for (sepp = &sip->si_hash[stacks_hash_entry(nsep)];
673 	    (sep = *sepp) != NULL;
674 	    sepp = &sep->se_next) {
675 
676 		if (stacks_entry_comp_impl(sep, nsep, 0) != 0)
677 			continue;
678 
679 		nsep->se_dup = sep->se_dup;
680 		sep->se_dup = nsep;
681 		sep->se_count++;
682 		return (WALK_NEXT);
683 	}
684 
685 	nsep->se_next = NULL;
686 	*sepp = nsep;
687 	sip->si_entries++;
688 
689 	return (WALK_NEXT);
690 }
691 
692 int
693 stacks_run_tlist(mdb_pipe_t *tlist, stacks_info_t *si)
694 {
695 	size_t idx;
696 	size_t found = 0;
697 	kthread_t kt;
698 	int ret;
699 
700 	for (idx = 0; idx < tlist->pipe_len; idx++) {
701 		uintptr_t addr = tlist->pipe_data[idx];
702 
703 		if (mdb_vread(&kt, sizeof (kt), addr) == -1) {
704 			mdb_warn("unable to read kthread_t at %p", addr);
705 			continue;
706 		}
707 		found++;
708 
709 		ret = stacks_thread_cb(addr, &kt, si);
710 		if (ret == WALK_DONE)
711 			break;
712 		if (ret != WALK_NEXT)
713 			return (-1);
714 	}
715 
716 	if (found)
717 		return (0);
718 	return (-1);
719 }
720 
721 int
722 stacks_run(int verbose, mdb_pipe_t *tlist)
723 {
724 	stacks_info_t si;
725 	findstack_info_t *fsip = &si.si_fsi;
726 	size_t idx;
727 	stacks_entry_t **cur;
728 
729 	bzero(&si, sizeof (si));
730 
731 	stacks_state = STACKS_STATE_DIRTY;
732 
733 	stacks_hash = si.si_hash =
734 	    mdb_zalloc(STACKS_HSIZE * sizeof (*si.si_hash), UM_SLEEP);
735 	si.si_entries = 0;
736 	si.si_count = 0;
737 
738 	fsip->fsi_max_depth = STACKS_MAX_DEPTH;
739 	fsip->fsi_stack =
740 	    mdb_alloc(fsip->fsi_max_depth * sizeof (*fsip->fsi_stack),
741 	    UM_SLEEP | UM_GC);
742 
743 	if (verbose)
744 		mdb_warn("stacks: processing kernel threads\n");
745 
746 	if (tlist != NULL) {
747 		if (stacks_run_tlist(tlist, &si))
748 			return (DCMD_ERR);
749 	} else {
750 		if (mdb_walk("thread", stacks_thread_cb, &si) != 0) {
751 			mdb_warn("cannot walk \"thread\"");
752 			return (DCMD_ERR);
753 		}
754 	}
755 
756 	if (verbose)
757 		mdb_warn("stacks: %d unique stacks / %d threads\n",
758 		    si.si_entries, si.si_count);
759 
760 	stacks_array_size = si.si_entries;
761 	stacks_array =
762 	    mdb_zalloc(si.si_entries * sizeof (*stacks_array), UM_SLEEP);
763 	cur = stacks_array;
764 	for (idx = 0; idx < STACKS_HSIZE; idx++) {
765 		stacks_entry_t *sep;
766 		for (sep = si.si_hash[idx]; sep != NULL; sep = sep->se_next)
767 			*(cur++) = sep;
768 	}
769 
770 	if (cur != stacks_array + si.si_entries) {
771 		mdb_warn("stacks: miscounted array size (%d != size: %d)\n",
772 		    (cur - stacks_array), stacks_array_size);
773 		return (DCMD_ERR);
774 	}
775 	qsort(stacks_array, si.si_entries, sizeof (*stacks_array),
776 	    stacks_entry_comp);
777 
778 	/* Now that we're done, free the hash table */
779 	stacks_hash = NULL;
780 	mdb_free(si.si_hash, STACKS_HSIZE * sizeof (*si.si_hash));
781 
782 	if (tlist == NULL)
783 		stacks_state = STACKS_STATE_DONE;
784 
785 	if (verbose)
786 		mdb_warn("stacks: done\n");
787 
788 	return (DCMD_OK);
789 }
790 
791 static int
792 stacks_has_caller(stacks_entry_t *sep, uintptr_t addr)
793 {
794 	uintptr_t laddr = addr;
795 	uintptr_t haddr = addr + 1;
796 	int idx;
797 	char c[MDB_SYM_NAMLEN];
798 	GElf_Sym sym;
799 
800 	if (mdb_lookup_by_addr(addr, MDB_SYM_FUZZY,
801 	    c, sizeof (c), &sym) != -1 &&
802 	    addr == (uintptr_t)sym.st_value) {
803 		laddr = (uintptr_t)sym.st_value;
804 		haddr = (uintptr_t)sym.st_value + sym.st_size;
805 	}
806 
807 	for (idx = 0; idx < sep->se_depth; idx++)
808 		if (sep->se_stack[idx] >= laddr && sep->se_stack[idx] < haddr)
809 			return (1);
810 
811 	return (0);
812 }
813 
/*
 * Search state for find_module(): "name" is the module name being looked
 * for, and "mcp" receives a copy of the matching modctl (it stays NULL if
 * no module matched).
 */
typedef struct find_module_struct {
	struct modctl *mcp;
	const char *name;
} find_module_struct_t;
818 
819 /*ARGSUSED*/
820 int
821 find_module_cb(uintptr_t addr, const void *modctl_arg, void *cbarg)
822 {
823 	find_module_struct_t *sp = cbarg;
824 	char mod_modname[MODMAXNAMELEN + 1];
825 	const struct modctl *mp = modctl_arg;
826 
827 	if (!mp->mod_modname)
828 		return (WALK_NEXT);
829 
830 	if (mdb_readstr(mod_modname, sizeof (mod_modname),
831 	    (uintptr_t)mp->mod_modname) == -1) {
832 		mdb_warn("failed to read mod_modname in \"modctl\" walk");
833 		return (WALK_ERR);
834 	}
835 
836 	if (strcmp(sp->name, mod_modname))
837 		return (WALK_NEXT);
838 
839 	sp->mcp = mdb_alloc(sizeof (*sp->mcp), UM_SLEEP | UM_GC);
840 	bcopy(mp, sp->mcp, sizeof (*sp->mcp));
841 	return (WALK_DONE);
842 }
843 
844 static struct modctl *
845 find_module(const char *name)
846 {
847 	find_module_struct_t mptr;
848 
849 	mptr.name = name;
850 	mptr.mcp = NULL;
851 
852 	if (mdb_walk("modctl", find_module_cb, &mptr) != 0)
853 		mdb_warn("cannot walk \"modctl\"");
854 	return (mptr.mcp);
855 }
856 
857 static int
858 stacks_has_module(stacks_entry_t *sep, struct modctl *mp)
859 {
860 	int idx;
861 
862 	if (mp == NULL)
863 		return (0);
864 
865 	for (idx = 0; idx < sep->se_depth; idx++)
866 		if (sep->se_stack[idx] >= (uintptr_t)mp->mod_text &&
867 		    sep->se_stack[idx] <
868 		    ((uintptr_t)mp->mod_text + mp->mod_text_size))
869 			return (1);
870 	return (0);
871 }
872 
873 
/*
 * qsort()/bsearch() comparator for uintptr_t values: returns -1, 0 or 1
 * as *lp is less than, equal to, or greater than *rp.
 */
static int
uintptrcomp(const void *lp, const void *rp)
{
	const uintptr_t a = *(const uintptr_t *)lp;
	const uintptr_t b = *(const uintptr_t *)rp;

	return ((a > b) - (a < b));
}
885 
/*
 * sobj_type_walk() callback used by stacks_help(): prints the name of one
 * synchronization object type, preceded by a space.  The other arguments
 * are unused.
 */
/*ARGSUSED*/
static void
print_sobj_help(int type, const char *name, const char *ops_name, void *ign)
{
	mdb_printf(" %s", name);
}
892 
/*
 * thread_walk_states() callback used by stacks_help(): prints the name of
 * one thread state, preceded by a space.  The other arguments are unused.
 */
/*ARGSUSED*/
static void
print_tstate_help(uint_t state, const char *name, void *ignored)
{
	mdb_printf(" %s", name);
}
899 
/*
 * Print the help text for ::stacks: an overview, the OPTIONS and FILTERS
 * sections, and finally the lists of SOBJ type names and thread state
 * names accepted by the filters (the latter with "panic" appended).
 */
void
stacks_help(void)
{
	/*
	 * The overview is passed as the format string itself, so the
	 * %<b>...%</b> sequences in it are processed by mdb_printf().
	 */
	mdb_printf(
"::stacks processes all of the thread stacks on the system, grouping\n"
"together threads which have the same:\n"
"\n"
"  * Thread state,\n"
"  * Sync object type, and\n"
"  * PCs in their stack trace.\n"
"\n"
"The default output (no address or options) is just a dump of the thread\n"
"groups in the system.  For a view of active threads, use \"::stacks -i\",\n"
"which filters out FREE threads (interrupt threads which are currently\n"
"inactive) and threads sleeping on a CV. (Note that those threads may still\n"
"be noteworthy; this is just for a first glance.)  More general filtering\n"
"options are described below, in the \"FILTERS\" section.\n"
"\n"
"::stacks can be used in a pipeline.  The input to ::stacks is one or more\n"
"thread pointers.  For example, to get a summary of threads in a process,\n"
"you can do:\n"
"\n"
"  %<b>procp%</b>::walk thread | ::stacks\n"
"\n"
"When output into a pipe, ::stacks prints all of the threads input,\n"
"filtered by the given filtering options.  This means that multiple\n"
"::stacks invocations can be piped together to achieve more complicated\n"
"filters.  For example, to get threads which have both 'fop_read' and\n"
"'cv_wait_sig_swap' in their stack trace, you could do:\n"
"\n"
"  ::stacks -c fop_read | ::stacks -c cv_wait_sig_swap_core\n"
"\n"
"To get the full list of threads in each group, use the '-a' flag:\n"
"\n"
"  ::stacks -a\n"
"\n");
	/* each section heading is printed with the indent reduced by 2 */
	mdb_dec_indent(2);
	mdb_printf("%<b>OPTIONS%</b>\n");
	mdb_inc_indent(2);
	mdb_printf("%s",
"  -a    Print all of the grouped threads, instead of just a count.\n"
"  -f    Force a re-run of the thread stack gathering.\n"
"  -v    Be verbose about thread stack gathering.\n"
"\n");
	mdb_dec_indent(2);
	mdb_printf("%<b>FILTERS%</b>\n");
	mdb_inc_indent(2);
	mdb_printf("%s",
"  -i    Show active threads; equivalent to '-S CV -T FREE'.\n"
"  -c func[+offset]\n"
"        Only print threads whose stacks contain func/func+offset.\n"
"  -C func[+offset]\n"
"        Only print threads whose stacks do not contain func/func+offset.\n"
"  -m module\n"
"        Only print threads whose stacks contain functions from module.\n"
"  -M module\n"
"        Only print threads whose stacks do not contain functions from\n"
"        module.\n"
"  -s {type | ALL}\n"
"        Only print threads which are on a 'type' synchronization object\n"
"        (SOBJ).\n"
"  -S {type | ALL}\n"
"        Only print threads which are not on a 'type' SOBJ.\n"
"  -t tstate\n"
"        Only print threads which are in thread state 'tstate'.\n"
"  -T tstate\n"
"        Only print threads which are not in thread state 'tstate'.\n"
"\n");
	/* the accepted names are listed by walking the tables themselves */
	mdb_printf("   SOBJ types:");
	sobj_type_walk(print_sobj_help, NULL);
	mdb_printf("\n");
	mdb_printf("Thread states:");
	thread_walk_states(print_tstate_help, NULL);
	/* "panic" is the extra pseudo-state accepted by text_to_tstate() */
	mdb_printf(" panic\n");
}
975 
976 /*ARGSUSED*/
977 int
978 stacks(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
979 {
980 	size_t idx;
981 
982 	char *seen = NULL;
983 
984 	const char *caller_str = NULL;
985 	const char *excl_caller_str = NULL;
986 	uintptr_t caller = 0, excl_caller = 0;
987 	const char *module_str = NULL;
988 	const char *excl_module_str = NULL;
989 	struct modctl *module = NULL, *excl_module = NULL;
990 	const char *sobj = NULL;
991 	const char *excl_sobj = NULL;
992 	uintptr_t sobj_ops = 0, excl_sobj_ops = 0;
993 	const char *tstate_str = NULL;
994 	const char *excl_tstate_str = NULL;
995 	uint_t tstate = -1U;
996 	uint_t excl_tstate = -1U;
997 	uint_t printed = 0;
998 
999 	uint_t all = 0;
1000 	uint_t force = 0;
1001 	uint_t interesting = 0;
1002 	uint_t verbose = 0;
1003 
1004 	/*
1005 	 * We have a slight behavior difference between having piped
1006 	 * input and 'addr::stacks'.  Without a pipe, we assume the
1007 	 * thread pointer given is a representative thread, and so
1008 	 * we include all similar threads in the system in our output.
1009 	 *
1010 	 * With a pipe, we filter down to just the threads in our
1011 	 * input.
1012 	 */
1013 	uint_t addrspec = (flags & DCMD_ADDRSPEC);
1014 	uint_t only_matching = addrspec && (flags & DCMD_PIPE);
1015 
1016 	mdb_pipe_t p;
1017 
1018 	if (mdb_getopts(argc, argv,
1019 	    'a', MDB_OPT_SETBITS, TRUE, &all,
1020 	    'f', MDB_OPT_SETBITS, TRUE, &force,
1021 	    'i', MDB_OPT_SETBITS, TRUE, &interesting,
1022 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
1023 	    'c', MDB_OPT_STR, &caller_str,
1024 	    'C', MDB_OPT_STR, &excl_caller_str,
1025 	    'm', MDB_OPT_STR, &module_str,
1026 	    'M', MDB_OPT_STR, &excl_module_str,
1027 	    's', MDB_OPT_STR, &sobj,
1028 	    'S', MDB_OPT_STR, &excl_sobj,
1029 	    't', MDB_OPT_STR, &tstate_str,
1030 	    'T', MDB_OPT_STR, &excl_tstate_str,
1031 	    NULL) != argc)
1032 		return (DCMD_USAGE);
1033 
1034 	if (interesting) {
1035 		if (sobj != NULL || excl_sobj != NULL ||
1036 		    tstate_str != NULL || excl_tstate_str != NULL) {
1037 			mdb_warn(
1038 			    "stacks: -i is incompatible with -[sStT]\n");
1039 			return (DCMD_USAGE);
1040 		}
1041 		excl_sobj = "CV";
1042 		excl_tstate_str = "FREE";
1043 	}
1044 
1045 	if (caller_str != NULL) {
1046 		mdb_set_dot(0);
1047 		if (mdb_eval(caller_str) != 0) {
1048 			mdb_warn("stacks: evaluation of \"%s\" failed",
1049 			    caller_str);
1050 			return (DCMD_ABORT);
1051 		}
1052 		caller = mdb_get_dot();
1053 	}
1054 
1055 	if (excl_caller_str != NULL) {
1056 		mdb_set_dot(0);
1057 		if (mdb_eval(excl_caller_str) != 0) {
1058 			mdb_warn("stacks: evaluation of \"%s\" failed",
1059 			    excl_caller_str);
1060 			return (DCMD_ABORT);
1061 		}
1062 		excl_caller = mdb_get_dot();
1063 	}
1064 	mdb_set_dot(addr);
1065 
1066 	if (module_str != NULL &&
1067 	    (module = find_module(module_str)) == NULL) {
1068 		mdb_warn("stacks: module \"%s\" is unknown", module_str);
1069 		return (DCMD_ABORT);
1070 	}
1071 
1072 	if (excl_module_str != NULL &&
1073 	    (excl_module = find_module(excl_module_str)) == NULL) {
1074 		mdb_warn("stacks: module \"%s\" is unknown", excl_module_str);
1075 		return (DCMD_ABORT);
1076 	}
1077 
1078 	if (sobj != NULL &&
1079 	    text_to_sobj(sobj, &sobj_ops) != 0)
1080 		return (DCMD_USAGE);
1081 
1082 	if (excl_sobj != NULL &&
1083 	    text_to_sobj(excl_sobj, &excl_sobj_ops) != 0)
1084 		return (DCMD_USAGE);
1085 
1086 	if (sobj_ops != 0 && excl_sobj_ops != 0) {
1087 		mdb_warn("stacks: only one of -s and -S can be specified\n");
1088 		return (DCMD_USAGE);
1089 	}
1090 
1091 	if (tstate_str != NULL &&
1092 	    text_to_tstate(tstate_str, &tstate) != 0)
1093 		return (DCMD_USAGE);
1094 
1095 	if (excl_tstate_str != NULL &&
1096 	    text_to_tstate(excl_tstate_str, &excl_tstate) != 0)
1097 		return (DCMD_USAGE);
1098 
1099 	if (tstate != -1U && excl_tstate != -1U) {
1100 		mdb_warn("stacks: only one of -t and -T can be specified\n");
1101 		return (DCMD_USAGE);
1102 	}
1103 
1104 	/*
1105 	 * If there's an address specified, we're going to further filter
1106 	 * to only entries which have an address in the input.  To reduce
1107 	 * overhead (and make the sorted output come out right), we
1108 	 * use mdb_get_pipe() to grab the entire pipeline of input, then
1109 	 * use qsort() and bsearch() to speed up the search.
1110 	 */
1111 	if (addrspec) {
1112 		mdb_get_pipe(&p);
1113 		if (p.pipe_data == NULL || p.pipe_len == 0) {
1114 			p.pipe_data = &addr;
1115 			p.pipe_len = 1;
1116 		}
1117 		qsort(p.pipe_data, p.pipe_len, sizeof (uintptr_t),
1118 		    uintptrcomp);
1119 
1120 		/* remove any duplicates in the data */
1121 		idx = 0;
1122 		while (idx < p.pipe_len - 1) {
1123 			uintptr_t *data = &p.pipe_data[idx];
1124 			size_t len = p.pipe_len - idx;
1125 
1126 			if (data[0] == data[1]) {
1127 				memmove(data, data + 1,
1128 				    (len - 1) * sizeof (*data));
1129 				p.pipe_len--;
1130 				continue; /* repeat without incrementing idx */
1131 			}
1132 			idx++;
1133 		}
1134 
1135 		seen = mdb_zalloc(p.pipe_len, UM_SLEEP | UM_GC);
1136 	}
1137 
1138 	/*
1139 	 * Force a cleanup if we're connected to a live system. Never
1140 	 * do a cleanup after the first invocation around the loop.
1141 	 */
1142 	force |= (mdb_get_state() == MDB_STATE_RUNNING);
1143 	if (force && (flags & (DCMD_LOOPFIRST|DCMD_LOOP)) == DCMD_LOOP)
1144 		force = 0;
1145 
1146 	stacks_cleanup(force);
1147 
1148 	if (stacks_state == STACKS_STATE_CLEAN) {
1149 		int res = stacks_run(verbose, addrspec ? &p : NULL);
1150 		if (res != DCMD_OK)
1151 			return (res);
1152 	}
1153 
1154 	for (idx = 0; idx < stacks_array_size; idx++) {
1155 		stacks_entry_t *sep = stacks_array[idx];
1156 		stacks_entry_t *cur = sep;
1157 		int frame;
1158 		size_t count = sep->se_count;
1159 
1160 		if (addrspec) {
1161 			stacks_entry_t *head = NULL, *tail = NULL, *sp;
1162 			size_t foundcount = 0;
1163 			/*
1164 			 * We use the now-unused hash chain field se_next to
1165 			 * link together the dups which match our list.
1166 			 */
1167 			for (sp = sep; sp != NULL; sp = sp->se_dup) {
1168 				uintptr_t *entry = bsearch(&sp->se_thread,
1169 				    p.pipe_data, p.pipe_len, sizeof (uintptr_t),
1170 				    uintptrcomp);
1171 				if (entry != NULL) {
1172 					foundcount++;
1173 					seen[entry - p.pipe_data]++;
1174 					if (head == NULL)
1175 						head = sp;
1176 					else
1177 						tail->se_next = sp;
1178 					tail = sp;
1179 					sp->se_next = NULL;
1180 				}
1181 			}
1182 			if (head == NULL)
1183 				continue;	/* no match, skip entry */
1184 
1185 			if (only_matching) {
1186 				cur = sep = head;
1187 				count = foundcount;
1188 			}
1189 		}
1190 
1191 		if (caller != 0 && !stacks_has_caller(sep, caller))
1192 			continue;
1193 		if (excl_caller != 0 && stacks_has_caller(sep, excl_caller))
1194 			continue;
1195 		if (module != 0 && !stacks_has_module(sep, module))
1196 			continue;
1197 		if (excl_module != 0 && stacks_has_module(sep, excl_module))
1198 			continue;
1199 
1200 		if (tstate != -1U) {
1201 			if (tstate == TSTATE_PANIC) {
1202 				if (!sep->se_panic)
1203 					continue;
1204 			} else if (sep->se_panic || sep->se_tstate != tstate)
1205 				continue;
1206 		}
1207 		if (excl_tstate != -1U) {
1208 			if (excl_tstate == TSTATE_PANIC) {
1209 				if (sep->se_panic)
1210 					continue;
1211 			} else if (!sep->se_panic &&
1212 			    sep->se_tstate == excl_tstate)
1213 				continue;
1214 		}
1215 
1216 		if (sobj_ops == SOBJ_ALL) {
1217 			if (sep->se_sobj_ops == 0)
1218 				continue;
1219 		} else if (sobj_ops != 0) {
1220 			if (sobj_ops != sep->se_sobj_ops)
1221 				continue;
1222 		}
1223 
1224 		if (!(interesting && sep->se_panic)) {
1225 			if (excl_sobj_ops == SOBJ_ALL) {
1226 				if (sep->se_sobj_ops != 0)
1227 					continue;
1228 			} else if (excl_sobj_ops != 0) {
1229 				if (excl_sobj_ops == sep->se_sobj_ops)
1230 					continue;
1231 			}
1232 		}
1233 
1234 		if (flags & DCMD_PIPE_OUT) {
1235 			while (sep != NULL) {
1236 				mdb_printf("%lr\n", sep->se_thread);
1237 				sep = only_matching ?
1238 				    sep->se_next : sep->se_dup;
1239 			}
1240 			continue;
1241 		}
1242 
1243 		if (all || !printed) {
1244 			mdb_printf("%<u>%-?s %-8s %-?s %8s%</u>\n",
1245 			    "THREAD", "STATE", "SOBJ", "COUNT");
1246 			printed = 1;
1247 		}
1248 
1249 		do {
1250 			char state[20];
1251 			char sobj[100];
1252 
1253 			tstate_to_text(cur->se_tstate, cur->se_panic,
1254 			    state, sizeof (state));
1255 			sobj_to_text(cur->se_sobj_ops,
1256 			    sobj, sizeof (sobj));
1257 
1258 			if (cur == sep)
1259 				mdb_printf("%?p %-8s %-?s %8d\n",
1260 				    cur->se_thread, state, sobj, count);
1261 			else
1262 				mdb_printf("%?p %-8s %-?s %8s\n",
1263 				    cur->se_thread, state, sobj, "-");
1264 
1265 			cur = only_matching ? cur->se_next : cur->se_dup;
1266 		} while (all && cur != NULL);
1267 
1268 		if (sep->se_failed != 0) {
1269 			char *reason;
1270 			switch (sep->se_failed) {
1271 			case FSI_FAIL_NOTINMEMORY:
1272 				reason = "thread not in memory";
1273 				break;
1274 			case FSI_FAIL_THREADCORRUPT:
1275 				reason = "thread structure stack info corrupt";
1276 				break;
1277 			case FSI_FAIL_STACKNOTFOUND:
1278 				reason = "no consistent stack found";
1279 				break;
1280 			default:
1281 				reason = "unknown failure";
1282 				break;
1283 			}
1284 			mdb_printf("%?s <%s>\n", "", reason);
1285 		}
1286 
1287 		for (frame = 0; frame < sep->se_depth; frame++)
1288 			mdb_printf("%?s %a\n", "", sep->se_stack[frame]);
1289 		if (sep->se_overflow)
1290 			mdb_printf("%?s ... truncated ...\n", "");
1291 		mdb_printf("\n");
1292 	}
1293 
1294 	if (flags & DCMD_ADDRSPEC) {
1295 		for (idx = 0; idx < p.pipe_len; idx++)
1296 			if (seen[idx] == 0)
1297 				mdb_warn("stacks: %p not in thread list\n",
1298 				    p.pipe_data[idx]);
1299 	}
1300 	return (DCMD_OK);
1301 }
1302