xref: /illumos-gate/usr/src/cmd/mdb/common/modules/genunix/findstack.c (revision 3d393ee6c37fa10ac512ed6d36109ad616dc7c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <mdb/mdb_modapi.h>
27 #include <mdb/mdb_ctf.h>
28 
29 #include <sys/types.h>
30 #include <sys/regset.h>
31 #include <sys/stack.h>
32 #include <sys/thread.h>
33 
34 #include "findstack.h"
35 #include "thread.h"
36 #include "sobj.h"
37 
38 typedef struct findstack_info {
39 	uintptr_t	*fsi_stack;	/* place to record frames */
40 
41 	uintptr_t	fsi_sp;		/* stack pointer */
42 	uintptr_t	fsi_pc;		/* pc */
43 	uintptr_t	fsi_sobj_ops;	/* sobj_ops */
44 
45 	uint_t		fsi_tstate;	/* t_state */
46 
47 	uchar_t		fsi_depth;	/* stack depth */
48 	uchar_t		fsi_failed;	/* search failed */
49 	uchar_t		fsi_overflow;	/* stack was deeper than max_depth */
50 	uchar_t		fsi_panic;	/* thread called panic() */
51 
52 	uchar_t		fsi_max_depth;	/* stack frames available */
53 } findstack_info_t;
54 #define	FSI_FAIL_BADTHREAD	1
55 #define	FSI_FAIL_NOTINMEMORY	2
56 #define	FSI_FAIL_THREADCORRUPT	3
57 #define	FSI_FAIL_STACKNOTFOUND	4
58 
59 #ifndef STACK_BIAS
60 #define	STACK_BIAS	0
61 #endif
62 
63 #define	fs_dprintf(x)					\
64 	if (findstack_debug_on) {			\
65 		mdb_printf("findstack debug: ");	\
66 		/*CSTYLED*/				\
67 		mdb_printf x ;				\
68 	}
69 
70 static int findstack_debug_on = 0;
71 
72 #if defined(__i386) || defined(__amd64)
73 struct rwindow {
74 	uintptr_t rw_fp;
75 	uintptr_t rw_rtn;
76 };
77 #endif
78 
79 #define	TOO_BIG_FOR_A_STACK (1024 * 1024)
80 
81 #define	KTOU(p) ((p) - kbase + ubase)
82 #define	UTOK(p) ((p) - ubase + kbase)
83 
84 #define	CRAWL_FOUNDALL	(-1)
85 
86 /*
87  * Given a stack pointer, try to crawl down it to the bottom.
88  * "frame" is a VA in MDB's address space.
89  *
90  * Returns the number of frames successfully crawled down, or
91  * CRAWL_FOUNDALL if it got to the bottom of the stack.
92  */
93 static int
94 crawl(uintptr_t frame, uintptr_t kbase, uintptr_t ktop, uintptr_t ubase,
95     int kill_fp, findstack_info_t *fsip)
96 {
97 	int levels = 0;
98 
99 	fsip->fsi_depth = 0;
100 	fsip->fsi_overflow = 0;
101 
102 	fs_dprintf(("<0> frame = %p, kbase = %p, ktop = %p, ubase = %p\n",
103 	    frame, kbase, ktop, ubase));
104 	for (;;) {
105 		uintptr_t fp;
106 		long *fpp = (long *)&((struct rwindow *)frame)->rw_fp;
107 
108 		fs_dprintf(("<1> fpp = %p, frame = %p\n", fpp, frame));
109 
110 		if ((frame & (STACK_ALIGN - 1)) != 0)
111 			break;
112 
113 		fp = ((struct rwindow *)frame)->rw_fp + STACK_BIAS;
114 		if (fsip->fsi_depth < fsip->fsi_max_depth)
115 			fsip->fsi_stack[fsip->fsi_depth++] =
116 			    ((struct rwindow *)frame)->rw_rtn;
117 		else
118 			fsip->fsi_overflow = 1;
119 
120 		fs_dprintf(("<2> fp = %p\n", fp));
121 
122 		if (fp == ktop)
123 			return (CRAWL_FOUNDALL);
124 		fs_dprintf(("<3> not at base\n"));
125 
126 #if defined(__i386) || defined(__amd64)
127 		if (ktop - fp == sizeof (struct rwindow)) {
128 			fs_dprintf(("<4> found base\n"));
129 			return (CRAWL_FOUNDALL);
130 		}
131 #endif
132 
133 		fs_dprintf(("<5> fp = %p, kbase = %p, ktop - size = %p\n",
134 		    fp, kbase, ktop - sizeof (struct rwindow)));
135 
136 		if (fp < kbase || fp >= (ktop - sizeof (struct rwindow)))
137 			break;
138 
139 		frame = KTOU(fp);
140 		fs_dprintf(("<6> frame = %p\n", frame));
141 
142 		/*
143 		 * NULL out the old %fp so we don't go down this stack
144 		 * more than once.
145 		 */
146 		if (kill_fp) {
147 			fs_dprintf(("<7> fpp = %p\n", fpp));
148 			*fpp = NULL;
149 		}
150 
151 		fs_dprintf(("<8> levels = %d\n", levels));
152 		levels++;
153 	}
154 
155 	return (levels);
156 }
157 
158 /*
159  * "sp" is a kernel VA.
160  */
161 static int
162 print_stack(uintptr_t sp, uintptr_t pc, uintptr_t addr,
163     int argc, const mdb_arg_t *argv, int free_state)
164 {
165 	int showargs = 0, count, err;
166 
167 	count = mdb_getopts(argc, argv,
168 	    'v', MDB_OPT_SETBITS, TRUE, &showargs, NULL);
169 	argc -= count;
170 	argv += count;
171 
172 	if (argc > 1 || (argc == 1 && argv->a_type != MDB_TYPE_STRING))
173 		return (DCMD_USAGE);
174 
175 	mdb_printf("stack pointer for thread %p%s: %p\n",
176 	    addr, (free_state ? " (TS_FREE)" : ""), sp);
177 	if (pc != 0)
178 		mdb_printf("[ %0?lr %a() ]\n", sp, pc);
179 
180 	mdb_inc_indent(2);
181 	mdb_set_dot(sp);
182 
183 	if (argc == 1)
184 		err = mdb_eval(argv->a_un.a_str);
185 	else if (showargs)
186 		err = mdb_eval("<.$C");
187 	else
188 		err = mdb_eval("<.$C0");
189 
190 	mdb_dec_indent(2);
191 
192 	return ((err == -1) ? DCMD_ABORT : DCMD_OK);
193 }
194 
195 /*ARGSUSED*/
196 static int
197 do_findstack(uintptr_t addr, findstack_info_t *fsip, uint_t print_warnings)
198 {
199 	kthread_t thr;
200 	size_t stksz;
201 	uintptr_t ubase, utop;
202 	uintptr_t kbase, ktop;
203 	uintptr_t win, sp;
204 
205 	fsip->fsi_failed = 0;
206 	fsip->fsi_pc = 0;
207 	fsip->fsi_sp = 0;
208 	fsip->fsi_depth = 0;
209 	fsip->fsi_overflow = 0;
210 
211 	bzero(&thr, sizeof (thr));
212 	if (mdb_ctf_vread(&thr, "kthread_t", addr,
213 	    MDB_CTF_VREAD_IGNORE_ALL) == -1) {
214 		if (print_warnings)
215 			mdb_warn("couldn't read thread at %p\n", addr);
216 		fsip->fsi_failed = FSI_FAIL_BADTHREAD;
217 		return (DCMD_ERR);
218 	}
219 
220 	fsip->fsi_sobj_ops = (uintptr_t)thr.t_sobj_ops;
221 	fsip->fsi_tstate = thr.t_state;
222 	fsip->fsi_panic = !!(thr.t_flag & T_PANIC);
223 
224 	if ((thr.t_schedflag & TS_LOAD) == 0) {
225 		if (print_warnings)
226 			mdb_warn("thread %p isn't in memory\n", addr);
227 		fsip->fsi_failed = FSI_FAIL_NOTINMEMORY;
228 		return (DCMD_ERR);
229 	}
230 
231 	if (thr.t_stk < thr.t_stkbase) {
232 		if (print_warnings)
233 			mdb_warn(
234 			    "stack base or stack top corrupt for thread %p\n",
235 			    addr);
236 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
237 		return (DCMD_ERR);
238 	}
239 
240 	kbase = (uintptr_t)thr.t_stkbase;
241 	ktop = (uintptr_t)thr.t_stk;
242 	stksz = ktop - kbase;
243 
244 #ifdef __amd64
245 	/*
246 	 * The stack on amd64 is intentionally misaligned, so ignore the top
247 	 * half-frame.  See thread_stk_init().  When handling traps, the frame
248 	 * is automatically aligned by the hardware, so we only alter ktop if
249 	 * needed.
250 	 */
251 	if ((ktop & (STACK_ALIGN - 1)) != 0)
252 		ktop -= STACK_ENTRY_ALIGN;
253 #endif
254 
255 	/*
256 	 * If the stack size is larger than a meg, assume that it's bogus.
257 	 */
258 	if (stksz > TOO_BIG_FOR_A_STACK) {
259 		if (print_warnings)
260 			mdb_warn("stack size for thread %p is too big to be "
261 			    "reasonable\n", addr);
262 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
263 		return (DCMD_ERR);
264 	}
265 
266 	/*
267 	 * This could be (and was) a UM_GC allocation.  Unfortunately,
268 	 * stksz tends to be very large.  As currently implemented, dcmds
269 	 * invoked as part of pipelines don't have their UM_GC-allocated
270 	 * memory freed until the pipeline completes.  With stksz in the
271 	 * neighborhood of 20k, the popular ::walk thread |::findstack
272 	 * pipeline can easily run memory-constrained debuggers (kmdb) out
273 	 * of memory.  This can be changed back to a gc-able allocation when
274 	 * the debugger is changed to free UM_GC memory more promptly.
275 	 */
276 	ubase = (uintptr_t)mdb_alloc(stksz, UM_SLEEP);
277 	utop = ubase + stksz;
278 	if (mdb_vread((caddr_t)ubase, stksz, kbase) != stksz) {
279 		mdb_free((void *)ubase, stksz);
280 		if (print_warnings)
281 			mdb_warn("couldn't read entire stack for thread %p\n",
282 			    addr);
283 		fsip->fsi_failed = FSI_FAIL_THREADCORRUPT;
284 		return (DCMD_ERR);
285 	}
286 
287 	/*
288 	 * Try the saved %sp first, if it looks reasonable.
289 	 */
290 	sp = KTOU((uintptr_t)thr.t_sp + STACK_BIAS);
291 	if (sp >= ubase && sp <= utop) {
292 		if (crawl(sp, kbase, ktop, ubase, 0, fsip) == CRAWL_FOUNDALL) {
293 			fsip->fsi_sp = (uintptr_t)thr.t_sp;
294 #if !defined(__i386)
295 			fsip->fsi_pc = (uintptr_t)thr.t_pc;
296 #endif
297 			goto found;
298 		}
299 	}
300 
301 	/*
302 	 * Now walk through the whole stack, starting at the base,
303 	 * trying every possible "window".
304 	 */
305 	for (win = ubase;
306 	    win + sizeof (struct rwindow) <= utop;
307 	    win += sizeof (struct rwindow *)) {
308 		if (crawl(win, kbase, ktop, ubase, 1, fsip) == CRAWL_FOUNDALL) {
309 			fsip->fsi_sp = UTOK(win) - STACK_BIAS;
310 			goto found;
311 		}
312 	}
313 
314 	/*
315 	 * We didn't conclusively find the stack.  So we'll take another lap,
316 	 * and print out anything that looks possible.
317 	 */
318 	if (print_warnings)
319 		mdb_printf("Possible stack pointers for thread %p:\n", addr);
320 	(void) mdb_vread((caddr_t)ubase, stksz, kbase);
321 
322 	for (win = ubase;
323 	    win + sizeof (struct rwindow) <= utop;
324 	    win += sizeof (struct rwindow *)) {
325 		uintptr_t fp = ((struct rwindow *)win)->rw_fp;
326 		int levels;
327 
328 		if ((levels = crawl(win, kbase, ktop, ubase, 1, fsip)) > 1) {
329 			if (print_warnings)
330 				mdb_printf("  %p (%d)\n", fp, levels);
331 		} else if (levels == CRAWL_FOUNDALL) {
332 			/*
333 			 * If this is a live system, the stack could change
334 			 * between the two mdb_vread(ubase, utop, kbase)'s,
335 			 * and we could have a fully valid stack here.
336 			 */
337 			fsip->fsi_sp = UTOK(win) - STACK_BIAS;
338 			goto found;
339 		}
340 	}
341 
342 	fsip->fsi_depth = 0;
343 	fsip->fsi_overflow = 0;
344 	fsip->fsi_failed = FSI_FAIL_STACKNOTFOUND;
345 
346 	mdb_free((void *)ubase, stksz);
347 	return (DCMD_ERR);
348 found:
349 	mdb_free((void *)ubase, stksz);
350 	return (DCMD_OK);
351 }
352 
353 int
354 findstack(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
355 {
356 	findstack_info_t fsi;
357 	int retval;
358 
359 	if (!(flags & DCMD_ADDRSPEC))
360 		return (DCMD_USAGE);
361 
362 	bzero(&fsi, sizeof (fsi));
363 
364 	if ((retval = do_findstack(addr, &fsi, 1)) != DCMD_OK ||
365 	    fsi.fsi_failed)
366 		return (retval);
367 
368 	return (print_stack(fsi.fsi_sp, fsi.fsi_pc, addr,
369 	    argc, argv, fsi.fsi_tstate == TS_FREE));
370 }
371 
372 /*ARGSUSED*/
373 int
374 findstack_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *av)
375 {
376 	findstack_debug_on ^= 1;
377 
378 	mdb_printf("findstack: debugging is now %s\n",
379 	    findstack_debug_on ? "on" : "off");
380 
381 	return (DCMD_OK);
382 }
383 
384 static void
385 uppercase(char *p)
386 {
387 	for (; *p != '\0'; p++) {
388 		if (*p >= 'a' && *p <= 'z')
389 			*p += 'A' - 'a';
390 	}
391 }
392 
393 static void
394 sobj_to_text(uintptr_t addr, char *out, size_t out_sz)
395 {
396 	sobj_ops_to_text(addr, out, out_sz);
397 	uppercase(out);
398 }
399 
400 #define	SOBJ_ALL	1
401 static int
402 text_to_sobj(const char *text, uintptr_t *out)
403 {
404 	if (strcasecmp(text, "ALL") == 0) {
405 		*out = SOBJ_ALL;
406 		return (0);
407 	}
408 	return (sobj_text_to_ops(text, out));
409 }
410 
411 #define	TSTATE_PANIC	-2U
412 static int
413 text_to_tstate(const char *text, uint_t *out)
414 {
415 	if (strcasecmp(text, "panic") == 0)
416 		*out = TSTATE_PANIC;
417 	else if (thread_text_to_state(text, out) != 0) {
418 		mdb_warn("tstate \"%s\" not recognized\n", text);
419 		return (-1);
420 	}
421 	return (0);
422 }
423 
424 static void
425 tstate_to_text(uint_t tstate, uint_t paniced, char *out, size_t out_sz)
426 {
427 	if (paniced)
428 		mdb_snprintf(out, out_sz, "panic");
429 	else
430 		thread_state_to_text(tstate, out, out_sz);
431 	uppercase(out);
432 }
433 
434 typedef struct stacks_entry {
435 	struct stacks_entry	*se_next;
436 	struct stacks_entry	*se_dup;	/* dups of this stack */
437 	uintptr_t		se_thread;
438 	uintptr_t		se_sp;
439 	uintptr_t		se_sobj_ops;
440 	uint32_t		se_tstate;
441 	uint32_t		se_count;	/* # threads w/ this stack */
442 	uint8_t			se_overflow;
443 	uint8_t			se_depth;
444 	uint8_t			se_failed;	/* failure reason; FSI_FAIL_* */
445 	uint8_t			se_panic;
446 	uintptr_t		se_stack[1];
447 } stacks_entry_t;
448 #define	STACKS_ENTRY_SIZE(x) OFFSETOF(stacks_entry_t, se_stack[(x)])
449 
450 #define	STACKS_HSIZE 127
451 
452 /* Maximum stack depth reported in stacks */
453 #define	STACKS_MAX_DEPTH	254
454 
455 typedef struct stacks_info {
456 	size_t		si_count;	/* total stacks_entry_ts (incl dups) */
457 	size_t		si_entries;	/* # entries in hash table */
458 	stacks_entry_t	**si_hash;	/* hash table */
459 
460 	findstack_info_t si_fsi;	/* transient callback state */
461 } stacks_info_t;
462 
463 
464 /* global state cached between invocations */
465 #define	STACKS_STATE_CLEAN	0
466 #define	STACKS_STATE_DIRTY	1
467 #define	STACKS_STATE_DONE	2
468 static uint_t stacks_state = STACKS_STATE_CLEAN;
469 static stacks_entry_t **stacks_hash;
470 static stacks_entry_t **stacks_array;
471 static size_t stacks_array_size;
472 
473 size_t
474 stacks_hash_entry(stacks_entry_t *sep)
475 {
476 	size_t depth = sep->se_depth;
477 	uintptr_t *stack = sep->se_stack;
478 
479 	uint64_t total = depth;
480 
481 	while (depth > 0) {
482 		total += *stack;
483 		stack++; depth--;
484 	}
485 
486 	return (total % STACKS_HSIZE);
487 }
488 
489 /*
490  * This is used to both compare stacks for equality and to sort the final
491  * list of unique stacks.  forsort specifies the latter behavior, which
492  * additionally:
493  *	compares se_count, and
494  *	sorts the stacks by text function name.
495  *
496  * The equality test is independent of se_count, and doesn't care about
497  * relative ordering, so we don't do the extra work of looking up symbols
498  * for the stack addresses.
499  */
500 int
501 stacks_entry_comp_impl(stacks_entry_t *l, stacks_entry_t *r,
502     uint_t forsort)
503 {
504 	int idx;
505 
506 	int depth = MIN(l->se_depth, r->se_depth);
507 
508 	/* no matter what, panic stacks come last. */
509 	if (l->se_panic > r->se_panic)
510 		return (1);
511 	if (l->se_panic < r->se_panic)
512 		return (-1);
513 
514 	if (forsort) {
515 		/* put large counts earlier */
516 		if (l->se_count > r->se_count)
517 			return (-1);
518 		if (l->se_count < r->se_count)
519 			return (1);
520 	}
521 
522 	if (l->se_tstate > r->se_tstate)
523 		return (1);
524 	if (l->se_tstate < r->se_tstate)
525 		return (-1);
526 
527 	if (l->se_failed > r->se_failed)
528 		return (1);
529 	if (l->se_failed < r->se_failed)
530 		return (-1);
531 
532 	for (idx = 0; idx < depth; idx++) {
533 		char lbuf[MDB_SYM_NAMLEN];
534 		char rbuf[MDB_SYM_NAMLEN];
535 
536 		int rval;
537 		uintptr_t laddr = l->se_stack[idx];
538 		uintptr_t raddr = r->se_stack[idx];
539 
540 		if (laddr == raddr)
541 			continue;
542 
543 		if (forsort &&
544 		    mdb_lookup_by_addr(laddr, MDB_SYM_FUZZY,
545 		    lbuf, sizeof (lbuf), NULL) != -1 &&
546 		    mdb_lookup_by_addr(raddr, MDB_SYM_FUZZY,
547 		    rbuf, sizeof (rbuf), NULL) != -1 &&
548 		    (rval = strcmp(lbuf, rbuf)) != 0)
549 			return (rval);
550 
551 		if (laddr > raddr)
552 			return (1);
553 		return (-1);
554 	}
555 
556 	if (l->se_overflow > r->se_overflow)
557 		return (-1);
558 	if (l->se_overflow < r->se_overflow)
559 		return (1);
560 
561 	if (l->se_depth > r->se_depth)
562 		return (1);
563 	if (l->se_depth < r->se_depth)
564 		return (-1);
565 
566 	if (l->se_sobj_ops > r->se_sobj_ops)
567 		return (1);
568 	if (l->se_sobj_ops < r->se_sobj_ops)
569 		return (-1);
570 
571 	return (0);
572 }
573 
574 int
575 stacks_entry_comp(const void *l_arg, const void *r_arg)
576 {
577 	stacks_entry_t * const *lp = l_arg;
578 	stacks_entry_t * const *rp = r_arg;
579 
580 	return (stacks_entry_comp_impl(*lp, *rp, 1));
581 }
582 
583 void
584 stacks_cleanup(int force)
585 {
586 	int idx = 0;
587 	stacks_entry_t *cur, *next;
588 
589 	if (stacks_state == STACKS_STATE_CLEAN)
590 		return;
591 
592 	if (!force && stacks_state == STACKS_STATE_DONE)
593 		return;
594 
595 	/*
596 	 * Until the array is sorted and stable, stacks_hash will be non-NULL.
597 	 * This way, we can get at all of the data, even if qsort() was
598 	 * interrupted while mucking with the array.
599 	 */
600 	if (stacks_hash != NULL) {
601 		for (idx = 0; idx < STACKS_HSIZE; idx++) {
602 			while ((cur = stacks_hash[idx]) != NULL) {
603 				while ((next = cur->se_dup) != NULL) {
604 					cur->se_dup = next->se_dup;
605 					mdb_free(next,
606 					    STACKS_ENTRY_SIZE(next->se_depth));
607 				}
608 				next = cur->se_next;
609 				stacks_hash[idx] = next;
610 				mdb_free(cur, STACKS_ENTRY_SIZE(cur->se_depth));
611 			}
612 		}
613 		if (stacks_array != NULL)
614 			mdb_free(stacks_array,
615 			    stacks_array_size * sizeof (*stacks_array));
616 
617 	} else if (stacks_array != NULL) {
618 		for (idx = 0; idx < stacks_array_size; idx++) {
619 			if ((cur = stacks_array[idx]) != NULL) {
620 				while ((next = cur->se_dup) != NULL) {
621 					cur->se_dup = next->se_dup;
622 					mdb_free(next,
623 					    STACKS_ENTRY_SIZE(next->se_depth));
624 				}
625 				stacks_array[idx] = NULL;
626 				mdb_free(cur, STACKS_ENTRY_SIZE(cur->se_depth));
627 			}
628 		}
629 		mdb_free(stacks_array,
630 		    stacks_array_size * sizeof (*stacks_array));
631 	}
632 
633 	stacks_array_size = 0;
634 	stacks_state = STACKS_STATE_CLEAN;
635 }
636 
637 /*ARGSUSED*/
638 int
639 stacks_thread_cb(uintptr_t addr, const void *ignored, void *cbarg)
640 {
641 	stacks_info_t *sip = cbarg;
642 	findstack_info_t *fsip = &sip->si_fsi;
643 
644 	stacks_entry_t **sepp, *nsep, *sep;
645 	int idx;
646 	size_t depth;
647 
648 	if (do_findstack(addr, fsip, 0) != DCMD_OK &&
649 	    fsip->fsi_failed == FSI_FAIL_BADTHREAD) {
650 		mdb_warn("couldn't read thread at %p\n", addr);
651 		return (WALK_NEXT);
652 	}
653 
654 	sip->si_count++;
655 
656 	depth = fsip->fsi_depth;
657 	nsep = mdb_zalloc(STACKS_ENTRY_SIZE(depth), UM_SLEEP);
658 	nsep->se_thread = addr;
659 	nsep->se_sp = fsip->fsi_sp;
660 	nsep->se_sobj_ops = fsip->fsi_sobj_ops;
661 	nsep->se_tstate = fsip->fsi_tstate;
662 	nsep->se_count = 1;
663 	nsep->se_overflow = fsip->fsi_overflow;
664 	nsep->se_depth = depth;
665 	nsep->se_failed = fsip->fsi_failed;
666 	nsep->se_panic = fsip->fsi_panic;
667 
668 	for (idx = 0; idx < depth; idx++)
669 		nsep->se_stack[idx] = fsip->fsi_stack[idx];
670 
671 	for (sepp = &sip->si_hash[stacks_hash_entry(nsep)];
672 	    (sep = *sepp) != NULL;
673 	    sepp = &sep->se_next) {
674 
675 		if (stacks_entry_comp_impl(sep, nsep, 0) != 0)
676 			continue;
677 
678 		nsep->se_dup = sep->se_dup;
679 		sep->se_dup = nsep;
680 		sep->se_count++;
681 		return (WALK_NEXT);
682 	}
683 
684 	nsep->se_next = NULL;
685 	*sepp = nsep;
686 	sip->si_entries++;
687 
688 	return (WALK_NEXT);
689 }
690 
691 int
692 stacks_run(int verbose)
693 {
694 	stacks_info_t si;
695 	findstack_info_t *fsip = &si.si_fsi;
696 	size_t idx;
697 	stacks_entry_t **cur;
698 
699 	bzero(&si, sizeof (si));
700 
701 	stacks_state = STACKS_STATE_DIRTY;
702 
703 	stacks_hash = si.si_hash =
704 	    mdb_zalloc(STACKS_HSIZE * sizeof (*si.si_hash), UM_SLEEP);
705 	si.si_entries = 0;
706 	si.si_count = 0;
707 
708 	fsip->fsi_max_depth = STACKS_MAX_DEPTH;
709 	fsip->fsi_stack =
710 	    mdb_alloc(fsip->fsi_max_depth * sizeof (*fsip->fsi_stack),
711 	    UM_SLEEP | UM_GC);
712 
713 	if (verbose)
714 		mdb_warn("stacks: processing kernel threads\n");
715 
716 	if (mdb_walk("thread", stacks_thread_cb, &si) != 0) {
717 		mdb_warn("cannot walk \"thread\"");
718 		return (DCMD_ERR);
719 	}
720 
721 	if (verbose)
722 		mdb_warn("stacks: %d unique stacks / %d threads\n",
723 		    si.si_entries, si.si_count);
724 
725 	stacks_array_size = si.si_entries;
726 	stacks_array =
727 	    mdb_zalloc(si.si_entries * sizeof (*stacks_array), UM_SLEEP);
728 	cur = stacks_array;
729 	for (idx = 0; idx < STACKS_HSIZE; idx++) {
730 		stacks_entry_t *sep;
731 		for (sep = si.si_hash[idx]; sep != NULL; sep = sep->se_next)
732 			*(cur++) = sep;
733 	}
734 
735 	if (cur != stacks_array + si.si_entries) {
736 		mdb_warn("stacks: miscounted array size (%d != size: %d)\n",
737 		    (cur - stacks_array), stacks_array_size);
738 		return (DCMD_ERR);
739 	}
740 	qsort(stacks_array, si.si_entries, sizeof (*stacks_array),
741 	    stacks_entry_comp);
742 
743 	/* Now that we're done, free the hash table */
744 	stacks_hash = NULL;
745 	mdb_free(si.si_hash, STACKS_HSIZE * sizeof (*si.si_hash));
746 
747 	stacks_state = STACKS_STATE_DONE;
748 
749 	if (verbose)
750 		mdb_warn("stacks: done\n");
751 
752 	return (DCMD_OK);
753 }
754 
755 static int
756 stacks_has_caller(stacks_entry_t *sep, uintptr_t addr)
757 {
758 	uintptr_t laddr = addr;
759 	uintptr_t haddr = addr + 1;
760 	int idx;
761 	char c[MDB_SYM_NAMLEN];
762 	GElf_Sym sym;
763 
764 	if (mdb_lookup_by_addr(addr, MDB_SYM_FUZZY,
765 	    c, sizeof (c), &sym) != -1 &&
766 	    addr == (uintptr_t)sym.st_value) {
767 		laddr = (uintptr_t)sym.st_value;
768 		haddr = (uintptr_t)sym.st_value + sym.st_size;
769 	}
770 
771 	for (idx = 0; idx < sep->se_depth; idx++)
772 		if (sep->se_stack[idx] >= laddr && sep->se_stack[idx] < haddr)
773 			return (1);
774 
775 	return (0);
776 }
777 
778 static int
779 uintptrcomp(const void *lp, const void *rp)
780 {
781 	uintptr_t lhs = *(const uintptr_t *)lp;
782 	uintptr_t rhs = *(const uintptr_t *)rp;
783 	if (lhs > rhs)
784 		return (1);
785 	if (lhs < rhs)
786 		return (-1);
787 	return (0);
788 }
789 
790 /*ARGSUSED*/
791 static void
792 print_sobj_help(int type, const char *name, const char *ops_name, void *ign)
793 {
794 	mdb_printf(" %s", name);
795 }
796 
797 /*ARGSUSED*/
798 static void
799 print_tstate_help(uint_t state, const char *name, void *ignored)
800 {
801 	mdb_printf(" %s", name);
802 }
803 
804 void
805 stacks_help(void)
806 {
807 	mdb_printf(
808 "::stacks processes all of the thread stacks on the system, grouping\n"
809 "together threads which have the same:\n"
810 "\n"
811 "  * Thread state,\n"
812 "  * Sync object type, and\n"
813 "  * PCs in their stack trace.\n"
814 "\n"
815 "The default output (no address or options) is just a dump of the thread\n"
816 "groups in the system.  For a view of active threads, use \"::stacks -i\",\n"
817 "which filters out FREE threads (interrupt threads which are currently\n"
818 "inactive) and threads sleeping on a CV. (Note that those threads may still\n"
819 "be noteworthy; this is just for a first glance.)  More general filtering\n"
820 "options are described below, in the \"FILTERS\" section.\n"
821 "\n"
822 "::stacks can be used in a pipeline.  The input to ::stacks is one or more\n"
823 "thread pointers.  For example, to get a summary of threads in a process,\n"
824 "you can do:\n"
825 "\n"
826 "  %<b>procp%</b>::walk thread | ::stacks\n"
827 "\n"
828 "When output into a pipe, ::stacks prints all of the threads input,\n"
829 "filtered by the given filtering options.  This means that multiple\n"
830 "::stacks invocations can be piped together to achieve more complicated\n"
831 "filters.  For example, to get threads which have both 'fop_read' and\n"
832 "'cv_wait_sig_swap' in their stack trace, you could do:\n"
833 "\n"
834 "  ::stacks -c fop_read | ::stacks -c cv_wait_sig_swap_core\n"
835 "\n"
836 "To get the full list of threads in each group, use the '-a' flag:\n"
837 "\n"
838 "  ::stacks -a\n"
839 "\n");
840 	mdb_dec_indent(2);
841 	mdb_printf("%<b>OPTIONS%</b>\n");
842 	mdb_inc_indent(2);
843 	mdb_printf("%s",
844 "  -a    Print all of the grouped threads, instead of just a count.\n"
845 "  -f    Force a re-run of the thread stack gathering.\n"
846 "  -v    Be verbose about thread stack gathering.\n"
847 "\n");
848 	mdb_dec_indent(2);
849 	mdb_printf("%<b>FILTERS%</b>\n");
850 	mdb_inc_indent(2);
851 	mdb_printf("%s",
852 "  -i    Show active threads; equivalent to '-S CV -T FREE'.\n"
853 "  -c func[+offset]\n"
854 "        Only print threads whose stacks contain func/func+offset.\n"
855 "  -C func[+offset]\n"
856 "        Only print threads whose stacks do not contain func/func+offset.\n"
857 "  -s {type | ALL}\n"
858 "        Only print threads which are on a 'type' synchronization object\n"
859 "        (SOBJ).\n"
860 "  -S {type | ALL}\n"
861 "        Only print threads which are not on a 'type' SOBJ.\n"
862 "  -t tstate\n"
863 "        Only print threads which are in thread state 'tstate'.\n"
864 "  -T tstate\n"
865 "        Only print threads which are not in thread state 'tstate'.\n"
866 "\n");
867 	mdb_printf("   SOBJ types:");
868 	sobj_type_walk(print_sobj_help, NULL);
869 	mdb_printf("\n");
870 	mdb_printf("Thread states:");
871 	thread_walk_states(print_tstate_help, NULL);
872 	mdb_printf(" panic\n");
873 }
874 
875 /*ARGSUSED*/
876 int
877 stacks(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
878 {
879 	size_t idx;
880 
881 	char *seen = NULL;
882 
883 	const char *caller_str = NULL;
884 	const char *excl_caller_str = NULL;
885 	uintptr_t caller = 0, excl_caller = 0;
886 	const char *sobj = NULL;
887 	const char *excl_sobj = NULL;
888 	uintptr_t sobj_ops = 0, excl_sobj_ops = 0;
889 	const char *tstate_str = NULL;
890 	const char *excl_tstate_str = NULL;
891 	uint_t tstate = -1U;
892 	uint_t excl_tstate = -1U;
893 
894 	uint_t all = 0;
895 	uint_t force = 0;
896 	uint_t interesting = 0;
897 	uint_t verbose = 0;
898 
899 	/*
900 	 * We have a slight behavior difference between having piped
901 	 * input and 'addr::stacks'.  Without a pipe, we assume the
902 	 * thread pointer given is a representative thread, and so
903 	 * we include all similar threads in the system in our output.
904 	 *
905 	 * With a pipe, we filter down to just the threads in our
906 	 * input.
907 	 */
908 	uint_t addrspec = (flags & DCMD_ADDRSPEC);
909 	uint_t only_matching = addrspec && (flags & DCMD_PIPE);
910 
911 	mdb_pipe_t p;
912 
913 	if (mdb_getopts(argc, argv,
914 	    'a', MDB_OPT_SETBITS, TRUE, &all,
915 	    'f', MDB_OPT_SETBITS, TRUE, &force,
916 	    'i', MDB_OPT_SETBITS, TRUE, &interesting,
917 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
918 	    'c', MDB_OPT_STR, &caller_str,
919 	    'C', MDB_OPT_STR, &excl_caller_str,
920 	    's', MDB_OPT_STR, &sobj,
921 	    'S', MDB_OPT_STR, &excl_sobj,
922 	    't', MDB_OPT_STR, &tstate_str,
923 	    'T', MDB_OPT_STR, &excl_tstate_str,
924 	    NULL) != argc)
925 		return (DCMD_USAGE);
926 
927 	if (interesting) {
928 		if (sobj != NULL || excl_sobj != NULL ||
929 		    tstate_str != NULL || excl_tstate_str != NULL) {
930 			mdb_warn(
931 			    "stacks: -i is incompatible with -[sStT]\n");
932 			return (DCMD_USAGE);
933 		}
934 		excl_sobj = "CV";
935 		excl_tstate_str = "FREE";
936 	}
937 
938 	if (caller_str != NULL) {
939 		mdb_set_dot(0);
940 		if (mdb_eval(caller_str) != 0) {
941 			mdb_warn("stacks: evaluation of \"%s\" failed",
942 			    caller_str);
943 			return (DCMD_ABORT);
944 		}
945 		caller = mdb_get_dot();
946 	}
947 
948 	if (excl_caller_str != NULL) {
949 		mdb_set_dot(0);
950 		if (mdb_eval(excl_caller_str) != 0) {
951 			mdb_warn("stacks: evaluation of \"%s\" failed",
952 			    excl_caller_str);
953 			return (DCMD_ABORT);
954 		}
955 		excl_caller = mdb_get_dot();
956 	}
957 	mdb_set_dot(addr);
958 
959 	if (sobj != NULL &&
960 	    text_to_sobj(sobj, &sobj_ops) != 0)
961 		return (DCMD_USAGE);
962 
963 	if (excl_sobj != NULL &&
964 	    text_to_sobj(excl_sobj, &excl_sobj_ops) != 0)
965 		return (DCMD_USAGE);
966 
967 	if (sobj_ops != 0 && excl_sobj_ops != 0) {
968 		mdb_warn("stacks: only one of -s and -S can be specified\n");
969 		return (DCMD_USAGE);
970 	}
971 
972 	if (tstate_str &&
973 	    text_to_tstate(tstate_str, &tstate) != 0)
974 		return (DCMD_USAGE);
975 	if (excl_tstate_str &&
976 	    text_to_tstate(excl_tstate_str, &excl_tstate) != 0)
977 		return (DCMD_USAGE);
978 
979 	if (tstate != -1U && excl_tstate != -1U) {
980 		mdb_warn("stacks: only one of -t and -T can be specified\n");
981 		return (DCMD_USAGE);
982 	}
983 
984 	/*
985 	 * Force a cleanup if we're connected to a live system. Never
986 	 * do a cleanup after the first invocation around the loop.
987 	 */
988 	force |= (mdb_get_state() == MDB_STATE_RUNNING);
989 	if (force && (flags & (DCMD_LOOPFIRST|DCMD_LOOP)) == DCMD_LOOP)
990 		force = 0;
991 
992 	stacks_cleanup(force);
993 
994 	if (stacks_state == STACKS_STATE_CLEAN) {
995 		int res = stacks_run(verbose);
996 		if (res != DCMD_OK)
997 			return (res);
998 	}
999 
1000 	if (!all && DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
1001 		mdb_printf("%<u>%-?s %-8s %-?s %8s%</u>\n",
1002 		    "THREAD", "STATE", "SOBJ", "COUNT");
1003 	}
1004 
1005 	/*
1006 	 * If there's an address specified, we're going to further filter
1007 	 * to only entries which have an address in the input.  To reduce
1008 	 * overhead (and make the sorted output come out right), we
1009 	 * use mdb_get_pipe() to grab the entire pipeline of input, then
1010 	 * use qsort() and bsearch() to speed up the search.
1011 	 */
1012 	if (addrspec) {
1013 		mdb_get_pipe(&p);
1014 		if (p.pipe_data == NULL || p.pipe_len == 0) {
1015 			p.pipe_data = &addr;
1016 			p.pipe_len = 1;
1017 		}
1018 		qsort(p.pipe_data, p.pipe_len, sizeof (uintptr_t),
1019 		    uintptrcomp);
1020 
1021 		/* remove any duplicates in the data */
1022 		idx = 0;
1023 		while (idx < p.pipe_len - 1) {
1024 			uintptr_t *data = &p.pipe_data[idx];
1025 			size_t len = p.pipe_len - idx;
1026 
1027 			if (data[0] == data[1]) {
1028 				memmove(data, data + 1,
1029 				    (len - 1) * sizeof (*data));
1030 				p.pipe_len--;
1031 				continue; /* repeat without incrementing idx */
1032 			}
1033 			idx++;
1034 		}
1035 
1036 		seen = mdb_zalloc(p.pipe_len, UM_SLEEP | UM_GC);
1037 	}
1038 
1039 	for (idx = 0; idx < stacks_array_size; idx++) {
1040 		stacks_entry_t *sep = stacks_array[idx];
1041 		stacks_entry_t *cur = sep;
1042 		int frame;
1043 		size_t count = sep->se_count;
1044 
1045 		if (addrspec) {
1046 			stacks_entry_t *head = NULL, *tail = NULL, *sp;
1047 			size_t foundcount = 0;
1048 			/*
1049 			 * We use the now-unused hash chain field se_next to
1050 			 * link together the dups which match our list.
1051 			 */
1052 			for (sp = sep; sp != NULL; sp = sp->se_dup) {
1053 				uintptr_t *entry = bsearch(&sp->se_thread,
1054 				    p.pipe_data, p.pipe_len, sizeof (uintptr_t),
1055 				    uintptrcomp);
1056 				if (entry != NULL) {
1057 					foundcount++;
1058 					seen[entry - p.pipe_data]++;
1059 					if (head == NULL)
1060 						head = sp;
1061 					else
1062 						tail->se_next = sp;
1063 					tail = sp;
1064 					sp->se_next = NULL;
1065 				}
1066 			}
1067 			if (head == NULL)
1068 				continue;	/* no match, skip entry */
1069 
1070 			if (only_matching) {
1071 				cur = sep = head;
1072 				count = foundcount;
1073 			}
1074 		}
1075 
1076 		if (caller != 0 && !stacks_has_caller(sep, caller))
1077 			continue;
1078 		if (excl_caller != 0 && stacks_has_caller(sep, excl_caller))
1079 			continue;
1080 
1081 		if (tstate != -1U) {
1082 			if (tstate == TSTATE_PANIC) {
1083 				if (!sep->se_panic)
1084 					continue;
1085 			} else if (sep->se_panic || sep->se_tstate != tstate)
1086 				continue;
1087 		}
1088 		if (excl_tstate != -1U) {
1089 			if (excl_tstate == TSTATE_PANIC) {
1090 				if (sep->se_panic)
1091 					continue;
1092 			} else if (!sep->se_panic &&
1093 			    sep->se_tstate == excl_tstate)
1094 				continue;
1095 		}
1096 
1097 		if (sobj_ops == SOBJ_ALL) {
1098 			if (sep->se_sobj_ops == 0)
1099 				continue;
1100 		} else if (sobj_ops != 0) {
1101 			if (sobj_ops != sep->se_sobj_ops)
1102 				continue;
1103 		}
1104 
1105 		if (!(interesting && sep->se_panic)) {
1106 			if (excl_sobj_ops == SOBJ_ALL) {
1107 				if (sep->se_sobj_ops != 0)
1108 					continue;
1109 			} else if (excl_sobj_ops != 0) {
1110 				if (excl_sobj_ops == sep->se_sobj_ops)
1111 					continue;
1112 			}
1113 		}
1114 
1115 		if (flags & DCMD_PIPE_OUT) {
1116 			while (sep != NULL) {
1117 				mdb_printf("%lr\n", sep->se_thread);
1118 				sep = only_matching ?
1119 				    sep->se_next : sep->se_dup;
1120 			}
1121 			continue;
1122 		}
1123 
1124 		if (all) {
1125 			mdb_printf("%<u>%-?s %-8s %-?s %8s%</u>\n",
1126 			    "THREAD", "STATE", "SOBJTYPE", "COUNT");
1127 		}
1128 
1129 		do {
1130 			char state[20];
1131 			char sobj[100];
1132 
1133 			tstate_to_text(cur->se_tstate, cur->se_panic,
1134 			    state, sizeof (state));
1135 			sobj_to_text(cur->se_sobj_ops,
1136 			    sobj, sizeof (sobj));
1137 
1138 			if (cur == sep)
1139 				mdb_printf("%?p %-8s %-?s %8d\n",
1140 				    cur->se_thread, state, sobj, count);
1141 			else
1142 				mdb_printf("%?p %-8s %-?s %8s\n",
1143 				    cur->se_thread, state, sobj, "-");
1144 
1145 			cur = only_matching ? cur->se_next : cur->se_dup;
1146 		} while (all && cur != NULL);
1147 
1148 		if (sep->se_failed != 0) {
1149 			char *reason;
1150 			switch (sep->se_failed) {
1151 			case FSI_FAIL_NOTINMEMORY:
1152 				reason = "thread not in memory";
1153 				break;
1154 			case FSI_FAIL_THREADCORRUPT:
1155 				reason = "thread structure stack info corrupt";
1156 				break;
1157 			case FSI_FAIL_STACKNOTFOUND:
1158 				reason = "no consistent stack found";
1159 				break;
1160 			default:
1161 				reason = "unknown failure";
1162 				break;
1163 			}
1164 			mdb_printf("%?s <%s>\n", "", reason);
1165 		}
1166 
1167 		for (frame = 0; frame < sep->se_depth; frame++)
1168 			mdb_printf("%?s %a\n", "", sep->se_stack[frame]);
1169 		if (sep->se_overflow)
1170 			mdb_printf("%?s ... truncated ...\n", "");
1171 		mdb_printf("\n");
1172 	}
1173 
1174 	if (flags & DCMD_ADDRSPEC) {
1175 		for (idx = 0; idx < p.pipe_len; idx++)
1176 			if (seen[idx] == 0)
1177 				mdb_warn("stacks: %p not in thread list\n",
1178 				    p.pipe_data[idx]);
1179 	}
1180 	return (DCMD_OK);
1181 }
1182