xref: /titanic_41/usr/src/cmd/mdb/common/modules/genunix/kmem.c (revision 86b2a80075734c8ecdc898b083ef73b81254096b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2018 Joyent, Inc.  All rights reserved.
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  */
30 
31 #include <mdb/mdb_param.h>
32 #include <mdb/mdb_modapi.h>
33 #include <mdb/mdb_ctf.h>
34 #include <mdb/mdb_whatis.h>
35 #include <sys/cpuvar.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/vmem_impl.h>
38 #include <sys/machelf.h>
39 #include <sys/modctl.h>
40 #include <sys/kobj.h>
41 #include <sys/panic.h>
42 #include <sys/stack.h>
43 #include <sys/sysmacros.h>
44 #include <vm/page.h>
45 
46 #include "avl.h"
47 #include "combined.h"
48 #include "dist.h"
49 #include "kmem.h"
50 #include "list.h"
51 
/*
 * Print a "kmem debug: "-prefixed message when mdb_debug_level is non-zero.
 * The argument is a complete, parenthesized mdb_printf() argument list, e.g.
 * dprintf(("read %p\n", addr)).
 *
 * The body is wrapped in do/while (0) so that the expansion is exactly one
 * statement: it needs its trailing semicolon and cannot capture an "else"
 * that follows the invocation.
 */
#define	dprintf(x) do { \
	if (mdb_debug_level) { \
		mdb_printf("kmem debug: ");  \
		/*CSTYLED*/\
		mdb_printf x ;\
	} \
} while (0)
57 
/*
 * Bit flags private to this file; each value is a distinct bit so they can
 * be OR'ed together.
 */
#define	KM_ALLOCATED		0x01
#define	KM_FREE			0x02
#define	KM_BUFCTL		0x04
#define	KM_CONSTRUCTED		0x08	/* only constructed free buffers */
#define	KM_HASH			0x10

/* Non-zero enables dprintf() output; toggled by kmem_debug(). */
static int mdb_debug_level = 0;
65 
66 /*ARGSUSED*/
67 static int
kmem_init_walkers(uintptr_t addr,const kmem_cache_t * c,void * ignored)68 kmem_init_walkers(uintptr_t addr, const kmem_cache_t *c, void *ignored)
69 {
70 	mdb_walker_t w;
71 	char descr[64];
72 
73 	(void) mdb_snprintf(descr, sizeof (descr),
74 	    "walk the %s cache", c->cache_name);
75 
76 	w.walk_name = c->cache_name;
77 	w.walk_descr = descr;
78 	w.walk_init = kmem_walk_init;
79 	w.walk_step = kmem_walk_step;
80 	w.walk_fini = kmem_walk_fini;
81 	w.walk_init_arg = (void *)addr;
82 
83 	if (mdb_add_walker(&w) == -1)
84 		mdb_warn("failed to add %s walker", c->cache_name);
85 
86 	return (WALK_NEXT);
87 }
88 
89 /*ARGSUSED*/
90 int
kmem_debug(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)91 kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
92 {
93 	mdb_debug_level ^= 1;
94 
95 	mdb_printf("kmem: debugging is now %s\n",
96 	    mdb_debug_level ? "on" : "off");
97 
98 	return (DCMD_OK);
99 }
100 
101 int
kmem_cache_walk_init(mdb_walk_state_t * wsp)102 kmem_cache_walk_init(mdb_walk_state_t *wsp)
103 {
104 	GElf_Sym sym;
105 
106 	if (mdb_lookup_by_name("kmem_caches", &sym) == -1) {
107 		mdb_warn("couldn't find kmem_caches");
108 		return (WALK_ERR);
109 	}
110 
111 	wsp->walk_addr = (uintptr_t)sym.st_value;
112 
113 	return (list_walk_init_named(wsp, "cache list", "cache"));
114 }
115 
116 int
kmem_cpu_cache_walk_init(mdb_walk_state_t * wsp)117 kmem_cpu_cache_walk_init(mdb_walk_state_t *wsp)
118 {
119 	if (wsp->walk_addr == NULL) {
120 		mdb_warn("kmem_cpu_cache doesn't support global walks");
121 		return (WALK_ERR);
122 	}
123 
124 	if (mdb_layered_walk("cpu", wsp) == -1) {
125 		mdb_warn("couldn't walk 'cpu'");
126 		return (WALK_ERR);
127 	}
128 
129 	wsp->walk_data = (void *)wsp->walk_addr;
130 
131 	return (WALK_NEXT);
132 }
133 
134 int
kmem_cpu_cache_walk_step(mdb_walk_state_t * wsp)135 kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp)
136 {
137 	uintptr_t caddr = (uintptr_t)wsp->walk_data;
138 	const cpu_t *cpu = wsp->walk_layer;
139 	kmem_cpu_cache_t cc;
140 
141 	caddr += OFFSETOF(kmem_cache_t, cache_cpu[cpu->cpu_seqid]);
142 
143 	if (mdb_vread(&cc, sizeof (kmem_cpu_cache_t), caddr) == -1) {
144 		mdb_warn("couldn't read kmem_cpu_cache at %p", caddr);
145 		return (WALK_ERR);
146 	}
147 
148 	return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata));
149 }
150 
151 static int
kmem_slab_check(void * p,uintptr_t saddr,void * arg)152 kmem_slab_check(void *p, uintptr_t saddr, void *arg)
153 {
154 	kmem_slab_t *sp = p;
155 	uintptr_t caddr = (uintptr_t)arg;
156 	if ((uintptr_t)sp->slab_cache != caddr) {
157 		mdb_warn("slab %p isn't in cache %p (in cache %p)\n",
158 		    saddr, caddr, sp->slab_cache);
159 		return (-1);
160 	}
161 
162 	return (0);
163 }
164 
165 static int
kmem_partial_slab_check(void * p,uintptr_t saddr,void * arg)166 kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg)
167 {
168 	kmem_slab_t *sp = p;
169 
170 	int rc = kmem_slab_check(p, saddr, arg);
171 	if (rc != 0) {
172 		return (rc);
173 	}
174 
175 	if (!KMEM_SLAB_IS_PARTIAL(sp)) {
176 		mdb_warn("slab %p is not a partial slab\n", saddr);
177 		return (-1);
178 	}
179 
180 	return (0);
181 }
182 
183 static int
kmem_complete_slab_check(void * p,uintptr_t saddr,void * arg)184 kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg)
185 {
186 	kmem_slab_t *sp = p;
187 
188 	int rc = kmem_slab_check(p, saddr, arg);
189 	if (rc != 0) {
190 		return (rc);
191 	}
192 
193 	if (!KMEM_SLAB_IS_ALL_USED(sp)) {
194 		mdb_warn("slab %p is not completely allocated\n", saddr);
195 		return (-1);
196 	}
197 
198 	return (0);
199 }
200 
201 typedef struct {
202 	uintptr_t kns_cache_addr;
203 	int kns_nslabs;
204 } kmem_nth_slab_t;
205 
206 static int
kmem_nth_slab_check(void * p,uintptr_t saddr,void * arg)207 kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg)
208 {
209 	kmem_nth_slab_t *chkp = arg;
210 
211 	int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr);
212 	if (rc != 0) {
213 		return (rc);
214 	}
215 
216 	return (chkp->kns_nslabs-- == 0 ? 1 : 0);
217 }
218 
219 static int
kmem_complete_slab_walk_init(mdb_walk_state_t * wsp)220 kmem_complete_slab_walk_init(mdb_walk_state_t *wsp)
221 {
222 	uintptr_t caddr = wsp->walk_addr;
223 
224 	wsp->walk_addr = (uintptr_t)(caddr +
225 	    offsetof(kmem_cache_t, cache_complete_slabs));
226 
227 	return (list_walk_init_checked(wsp, "slab list", "slab",
228 	    kmem_complete_slab_check, (void *)caddr));
229 }
230 
231 static int
kmem_partial_slab_walk_init(mdb_walk_state_t * wsp)232 kmem_partial_slab_walk_init(mdb_walk_state_t *wsp)
233 {
234 	uintptr_t caddr = wsp->walk_addr;
235 
236 	wsp->walk_addr = (uintptr_t)(caddr +
237 	    offsetof(kmem_cache_t, cache_partial_slabs));
238 
239 	return (avl_walk_init_checked(wsp, "slab list", "slab",
240 	    kmem_partial_slab_check, (void *)caddr));
241 }
242 
243 int
kmem_slab_walk_init(mdb_walk_state_t * wsp)244 kmem_slab_walk_init(mdb_walk_state_t *wsp)
245 {
246 	uintptr_t caddr = wsp->walk_addr;
247 
248 	if (caddr == NULL) {
249 		mdb_warn("kmem_slab doesn't support global walks\n");
250 		return (WALK_ERR);
251 	}
252 
253 	combined_walk_init(wsp);
254 	combined_walk_add(wsp,
255 	    kmem_complete_slab_walk_init, list_walk_step, list_walk_fini);
256 	combined_walk_add(wsp,
257 	    kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini);
258 
259 	return (WALK_NEXT);
260 }
261 
262 static int
kmem_first_complete_slab_walk_init(mdb_walk_state_t * wsp)263 kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp)
264 {
265 	uintptr_t caddr = wsp->walk_addr;
266 	kmem_nth_slab_t *chk;
267 
268 	chk = mdb_alloc(sizeof (kmem_nth_slab_t),
269 	    UM_SLEEP | UM_GC);
270 	chk->kns_cache_addr = caddr;
271 	chk->kns_nslabs = 1;
272 	wsp->walk_addr = (uintptr_t)(caddr +
273 	    offsetof(kmem_cache_t, cache_complete_slabs));
274 
275 	return (list_walk_init_checked(wsp, "slab list", "slab",
276 	    kmem_nth_slab_check, chk));
277 }
278 
279 int
kmem_slab_walk_partial_init(mdb_walk_state_t * wsp)280 kmem_slab_walk_partial_init(mdb_walk_state_t *wsp)
281 {
282 	uintptr_t caddr = wsp->walk_addr;
283 	kmem_cache_t c;
284 
285 	if (caddr == NULL) {
286 		mdb_warn("kmem_slab_partial doesn't support global walks\n");
287 		return (WALK_ERR);
288 	}
289 
290 	if (mdb_vread(&c, sizeof (c), caddr) == -1) {
291 		mdb_warn("couldn't read kmem_cache at %p", caddr);
292 		return (WALK_ERR);
293 	}
294 
295 	combined_walk_init(wsp);
296 
297 	/*
298 	 * Some consumers (umem_walk_step(), in particular) require at
299 	 * least one callback if there are any buffers in the cache.  So
300 	 * if there are *no* partial slabs, report the first full slab, if
301 	 * any.
302 	 *
303 	 * Yes, this is ugly, but it's cleaner than the other possibilities.
304 	 */
305 	if (c.cache_partial_slabs.avl_numnodes == 0) {
306 		combined_walk_add(wsp, kmem_first_complete_slab_walk_init,
307 		    list_walk_step, list_walk_fini);
308 	} else {
309 		combined_walk_add(wsp, kmem_partial_slab_walk_init,
310 		    avl_walk_step, avl_walk_fini);
311 	}
312 
313 	return (WALK_NEXT);
314 }
315 
316 int
kmem_cache(uintptr_t addr,uint_t flags,int ac,const mdb_arg_t * argv)317 kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv)
318 {
319 	kmem_cache_t c;
320 	const char *filter = NULL;
321 
322 	if (mdb_getopts(ac, argv,
323 	    'n', MDB_OPT_STR, &filter,
324 	    NULL) != ac) {
325 		return (DCMD_USAGE);
326 	}
327 
328 	if (!(flags & DCMD_ADDRSPEC)) {
329 		if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) {
330 			mdb_warn("can't walk kmem_cache");
331 			return (DCMD_ERR);
332 		}
333 		return (DCMD_OK);
334 	}
335 
336 	if (DCMD_HDRSPEC(flags))
337 		mdb_printf("%-?s %-25s %4s %6s %8s %8s\n", "ADDR", "NAME",
338 		    "FLAG", "CFLAG", "BUFSIZE", "BUFTOTL");
339 
340 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
341 		mdb_warn("couldn't read kmem_cache at %p", addr);
342 		return (DCMD_ERR);
343 	}
344 
345 	if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL))
346 		return (DCMD_OK);
347 
348 	mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name,
349 	    c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal);
350 
351 	return (DCMD_OK);
352 }
353 
/*
 * Print the help text for the kmem_cache dcmd: a summary line, the OPTIONS
 * heading (outdented by two columns), then the option and column
 * descriptions.
 */
void
kmem_cache_help(void)
{
	const char *summary = "Print kernel memory caches.\n\n";
	const char *details =
"  -n name\n"
"        name of kmem cache (or matching partial name)\n"
"\n"
"Column\tDescription\n"
"\n"
"ADDR\t\taddress of kmem cache\n"
"NAME\t\tname of kmem cache\n"
"FLAG\t\tvarious cache state flags\n"
"CFLAG\t\tcache creation flags\n"
"BUFSIZE\tobject size in bytes\n"
"BUFTOTL\tcurrent total buffers in cache (allocated and free)\n";

	mdb_printf("%s", summary);
	mdb_dec_indent(2);
	mdb_printf("%<b>OPTIONS%</b>\n");
	mdb_inc_indent(2);
	mdb_printf("%s", details);
}
374 
375 #define	LABEL_WIDTH	11
376 static void
kmem_slabs_print_dist(uint_t * ks_bucket,size_t buffers_per_slab,size_t maxbuckets,size_t minbucketsize)377 kmem_slabs_print_dist(uint_t *ks_bucket, size_t buffers_per_slab,
378     size_t maxbuckets, size_t minbucketsize)
379 {
380 	uint64_t total;
381 	int buckets;
382 	int i;
383 	const int *distarray;
384 	int complete[2];
385 
386 	buckets = buffers_per_slab;
387 
388 	total = 0;
389 	for (i = 0; i <= buffers_per_slab; i++)
390 		total += ks_bucket[i];
391 
392 	if (maxbuckets > 1)
393 		buckets = MIN(buckets, maxbuckets);
394 
395 	if (minbucketsize > 1) {
396 		/*
397 		 * minbucketsize does not apply to the first bucket reserved
398 		 * for completely allocated slabs
399 		 */
400 		buckets = MIN(buckets, 1 + ((buffers_per_slab - 1) /
401 		    minbucketsize));
402 		if ((buckets < 2) && (buffers_per_slab > 1)) {
403 			buckets = 2;
404 			minbucketsize = (buffers_per_slab - 1);
405 		}
406 	}
407 
408 	/*
409 	 * The first printed bucket is reserved for completely allocated slabs.
410 	 * Passing (buckets - 1) excludes that bucket from the generated
411 	 * distribution, since we're handling it as a special case.
412 	 */
413 	complete[0] = buffers_per_slab;
414 	complete[1] = buffers_per_slab + 1;
415 	distarray = dist_linear(buckets - 1, 1, buffers_per_slab - 1);
416 
417 	mdb_printf("%*s\n", LABEL_WIDTH, "Allocated");
418 	dist_print_header("Buffers", LABEL_WIDTH, "Slabs");
419 
420 	dist_print_bucket(complete, 0, ks_bucket, total, LABEL_WIDTH);
421 	/*
422 	 * Print bucket ranges in descending order after the first bucket for
423 	 * completely allocated slabs, so a person can see immediately whether
424 	 * or not there is fragmentation without having to scan possibly
425 	 * multiple screens of output. Starting at (buckets - 2) excludes the
426 	 * extra terminating bucket.
427 	 */
428 	for (i = buckets - 2; i >= 0; i--) {
429 		dist_print_bucket(distarray, i, ks_bucket, total, LABEL_WIDTH);
430 	}
431 	mdb_printf("\n");
432 }
433 #undef LABEL_WIDTH
434 
435 /*ARGSUSED*/
436 static int
kmem_first_slab(uintptr_t addr,const kmem_slab_t * sp,boolean_t * is_slab)437 kmem_first_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab)
438 {
439 	*is_slab = B_TRUE;
440 	return (WALK_DONE);
441 }
442 
443 /*ARGSUSED*/
444 static int
kmem_first_partial_slab(uintptr_t addr,const kmem_slab_t * sp,boolean_t * is_slab)445 kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp,
446     boolean_t *is_slab)
447 {
448 	/*
449 	 * The "kmem_partial_slab" walker reports the first full slab if there
450 	 * are no partial slabs (for the sake of consumers that require at least
451 	 * one callback if there are any buffers in the cache).
452 	 */
453 	*is_slab = KMEM_SLAB_IS_PARTIAL(sp);
454 	return (WALK_DONE);
455 }
456 
457 typedef struct kmem_slab_usage {
458 	int ksu_refcnt;			/* count of allocated buffers on slab */
459 	boolean_t ksu_nomove;		/* slab marked non-reclaimable */
460 } kmem_slab_usage_t;
461 
462 typedef struct kmem_slab_stats {
463 	const kmem_cache_t *ks_cp;
464 	int ks_slabs;			/* slabs in cache */
465 	int ks_partial_slabs;		/* partially allocated slabs in cache */
466 	uint64_t ks_unused_buffers;	/* total unused buffers in cache */
467 	int ks_max_buffers_per_slab;	/* max buffers per slab */
468 	int ks_usage_len;		/* ks_usage array length */
469 	kmem_slab_usage_t *ks_usage;	/* partial slab usage */
470 	uint_t *ks_bucket;		/* slab usage distribution */
471 } kmem_slab_stats_t;
472 
473 /*ARGSUSED*/
474 static int
kmem_slablist_stat(uintptr_t addr,const kmem_slab_t * sp,kmem_slab_stats_t * ks)475 kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp,
476     kmem_slab_stats_t *ks)
477 {
478 	kmem_slab_usage_t *ksu;
479 	long unused;
480 
481 	ks->ks_slabs++;
482 	ks->ks_bucket[sp->slab_refcnt]++;
483 
484 	unused = (sp->slab_chunks - sp->slab_refcnt);
485 	if (unused == 0) {
486 		return (WALK_NEXT);
487 	}
488 
489 	ks->ks_partial_slabs++;
490 	ks->ks_unused_buffers += unused;
491 
492 	if (ks->ks_partial_slabs > ks->ks_usage_len) {
493 		kmem_slab_usage_t *usage;
494 		int len = ks->ks_usage_len;
495 
496 		len = (len == 0 ? 16 : len * 2);
497 		usage = mdb_zalloc(len * sizeof (kmem_slab_usage_t), UM_SLEEP);
498 		if (ks->ks_usage != NULL) {
499 			bcopy(ks->ks_usage, usage,
500 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
501 			mdb_free(ks->ks_usage,
502 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
503 		}
504 		ks->ks_usage = usage;
505 		ks->ks_usage_len = len;
506 	}
507 
508 	ksu = &ks->ks_usage[ks->ks_partial_slabs - 1];
509 	ksu->ksu_refcnt = sp->slab_refcnt;
510 	ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
511 	return (WALK_NEXT);
512 }
513 
/*
 * Print the three-line column header used by the kmem_slabs dcmd.
 */
static void
kmem_slabs_header()
{
	const char *row = "%-25s %8s %8s %9s %9s %6s\n";

	mdb_printf(row, "", "", "Partial", "", "Unused", "");
	mdb_printf(row,
	    "Cache Name", "Slabs", "Slabs", "Buffers", "Buffers", "Waste");
	mdb_printf(row,
	    "-------------------------", "--------", "--------", "---------",
	    "---------", "------");
}
525 
526 int
kmem_slabs(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)527 kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
528 {
529 	kmem_cache_t c;
530 	kmem_slab_stats_t stats;
531 	mdb_walk_cb_t cb;
532 	int pct;
533 	int tenths_pct;
534 	size_t maxbuckets = 1;
535 	size_t minbucketsize = 0;
536 	const char *filter = NULL;
537 	const char *name = NULL;
538 	uint_t opt_v = FALSE;
539 	boolean_t buckets = B_FALSE;
540 	boolean_t skip = B_FALSE;
541 
542 	if (mdb_getopts(argc, argv,
543 	    'B', MDB_OPT_UINTPTR, &minbucketsize,
544 	    'b', MDB_OPT_UINTPTR, &maxbuckets,
545 	    'n', MDB_OPT_STR, &filter,
546 	    'N', MDB_OPT_STR, &name,
547 	    'v', MDB_OPT_SETBITS, TRUE, &opt_v,
548 	    NULL) != argc) {
549 		return (DCMD_USAGE);
550 	}
551 
552 	if ((maxbuckets != 1) || (minbucketsize != 0)) {
553 		buckets = B_TRUE;
554 	}
555 
556 	if (!(flags & DCMD_ADDRSPEC)) {
557 		if (mdb_walk_dcmd("kmem_cache", "kmem_slabs", argc,
558 		    argv) == -1) {
559 			mdb_warn("can't walk kmem_cache");
560 			return (DCMD_ERR);
561 		}
562 		return (DCMD_OK);
563 	}
564 
565 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
566 		mdb_warn("couldn't read kmem_cache at %p", addr);
567 		return (DCMD_ERR);
568 	}
569 
570 	if (name == NULL) {
571 		skip = ((filter != NULL) &&
572 		    (strstr(c.cache_name, filter) == NULL));
573 	} else if (filter == NULL) {
574 		skip = (strcmp(c.cache_name, name) != 0);
575 	} else {
576 		/* match either -n or -N */
577 		skip = ((strcmp(c.cache_name, name) != 0) &&
578 		    (strstr(c.cache_name, filter) == NULL));
579 	}
580 
581 	if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) {
582 		kmem_slabs_header();
583 	} else if ((opt_v || buckets) && !skip) {
584 		if (DCMD_HDRSPEC(flags)) {
585 			kmem_slabs_header();
586 		} else {
587 			boolean_t is_slab = B_FALSE;
588 			const char *walker_name;
589 			if (opt_v) {
590 				cb = (mdb_walk_cb_t)kmem_first_partial_slab;
591 				walker_name = "kmem_slab_partial";
592 			} else {
593 				cb = (mdb_walk_cb_t)kmem_first_slab;
594 				walker_name = "kmem_slab";
595 			}
596 			(void) mdb_pwalk(walker_name, cb, &is_slab, addr);
597 			if (is_slab) {
598 				kmem_slabs_header();
599 			}
600 		}
601 	}
602 
603 	if (skip) {
604 		return (DCMD_OK);
605 	}
606 
607 	bzero(&stats, sizeof (kmem_slab_stats_t));
608 	stats.ks_cp = &c;
609 	stats.ks_max_buffers_per_slab = c.cache_maxchunks;
610 	/* +1 to include a zero bucket */
611 	stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) *
612 	    sizeof (*stats.ks_bucket), UM_SLEEP);
613 	cb = (mdb_walk_cb_t)kmem_slablist_stat;
614 	(void) mdb_pwalk("kmem_slab", cb, &stats, addr);
615 
616 	if (c.cache_buftotal == 0) {
617 		pct = 0;
618 		tenths_pct = 0;
619 	} else {
620 		uint64_t n = stats.ks_unused_buffers * 10000;
621 		pct = (int)(n / c.cache_buftotal);
622 		tenths_pct = pct - ((pct / 100) * 100);
623 		tenths_pct = (tenths_pct + 5) / 10; /* round nearest tenth */
624 		if (tenths_pct == 10) {
625 			pct += 100;
626 			tenths_pct = 0;
627 		}
628 	}
629 
630 	pct /= 100;
631 	mdb_printf("%-25s %8d %8d %9lld %9lld %3d.%1d%%\n", c.cache_name,
632 	    stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal,
633 	    stats.ks_unused_buffers, pct, tenths_pct);
634 
635 	if (maxbuckets == 0) {
636 		maxbuckets = stats.ks_max_buffers_per_slab;
637 	}
638 
639 	if (((maxbuckets > 1) || (minbucketsize > 0)) &&
640 	    (stats.ks_slabs > 0)) {
641 		mdb_printf("\n");
642 		kmem_slabs_print_dist(stats.ks_bucket,
643 		    stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize);
644 	}
645 
646 	mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) *
647 	    sizeof (*stats.ks_bucket));
648 
649 	if (!opt_v) {
650 		return (DCMD_OK);
651 	}
652 
653 	if (opt_v && (stats.ks_partial_slabs > 0)) {
654 		int i;
655 		kmem_slab_usage_t *ksu;
656 
657 		mdb_printf("  %d complete (%d), %d partial:",
658 		    (stats.ks_slabs - stats.ks_partial_slabs),
659 		    stats.ks_max_buffers_per_slab,
660 		    stats.ks_partial_slabs);
661 
662 		for (i = 0; i < stats.ks_partial_slabs; i++) {
663 			ksu = &stats.ks_usage[i];
664 			mdb_printf(" %d%s", ksu->ksu_refcnt,
665 			    (ksu->ksu_nomove ? "*" : ""));
666 		}
667 		mdb_printf("\n\n");
668 	}
669 
670 	if (stats.ks_usage_len > 0) {
671 		mdb_free(stats.ks_usage,
672 		    stats.ks_usage_len * sizeof (kmem_slab_usage_t));
673 	}
674 
675 	return (DCMD_OK);
676 }
677 
/*
 * Print the help text for the kmem_slabs dcmd: a summary line, the OPTIONS
 * heading (outdented by two columns), then the option and column
 * descriptions.
 *
 * The -v example follows the layout the dcmd prints,
 * "N complete (buffers-per-slab), M partial: ...".
 */
void
kmem_slabs_help(void)
{
	mdb_printf("%s",
"Display slab usage per kmem cache.\n\n");
	mdb_dec_indent(2);
	mdb_printf("%<b>OPTIONS%</b>\n");
	mdb_inc_indent(2);
	mdb_printf("%s",
"  -n name\n"
"        name of kmem cache (or matching partial name)\n"
"  -N name\n"
"        exact name of kmem cache\n"
"  -b maxbins\n"
"        Print a distribution of allocated buffers per slab using at\n"
"        most maxbins bins. The first bin is reserved for completely\n"
"        allocated slabs. Setting maxbins to zero (-b 0) has the same\n"
"        effect as specifying the maximum allocated buffers per slab\n"
"        or setting minbinsize to 1 (-B 1).\n"
"  -B minbinsize\n"
"        Print a distribution of allocated buffers per slab, making\n"
"        all bins (except the first, reserved for completely allocated\n"
"        slabs) at least minbinsize buffers apart.\n"
"  -v    verbose output: List the allocated buffer count of each partial\n"
"        slab on the free list in order from front to back to show how\n"
"        closely the slabs are ordered by usage. For example\n"
"\n"
"          10 complete (8), 3 partial: 7 3 1\n"
"\n"
"        means there are thirteen slabs with eight buffers each, including\n"
"        three partially allocated slabs with less than all eight buffers\n"
"        allocated.\n"
"\n"
"        Buffer allocations are always from the front of the partial slab\n"
"        list. When a buffer is freed from a completely used slab, that\n"
"        slab is added to the front of the partial slab list. Assuming\n"
"        that all buffers are equally likely to be freed soon, the\n"
"        desired order of partial slabs is most-used at the front of the\n"
"        list and least-used at the back (as in the example above).\n"
"        However, if a slab contains an allocated buffer that will not\n"
"        soon be freed, it would be better for that slab to be at the\n"
"        front where all of its buffers can be allocated. Taking a slab\n"
"        off the partial slab list (either with all buffers freed or all\n"
"        buffers allocated) reduces cache fragmentation.\n"
"\n"
"        A slab's allocated buffer count representing a partial slab (9 in\n"
"        the example below) may be marked as follows:\n"
"\n"
"        9*   An asterisk indicates that kmem has marked the slab non-\n"
"        reclaimable because the kmem client refused to move one of the\n"
"        slab's buffers. Since kmem does not expect to completely free the\n"
"        slab, it moves it to the front of the list in the hope of\n"
"        completely allocating it instead. A slab marked with an asterisk\n"
"        stays marked for as long as it remains on the partial slab list.\n"
"\n"
"Column\t\tDescription\n"
"\n"
"Cache Name\t\tname of kmem cache\n"
"Slabs\t\t\ttotal slab count\n"
"Partial Slabs\t\tcount of partially allocated slabs on the free list\n"
"Buffers\t\ttotal buffer count (Slabs * (buffers per slab))\n"
"Unused Buffers\tcount of unallocated buffers across all partial slabs\n"
"Waste\t\t\t(Unused Buffers / Buffers) does not include space\n"
"\t\t\t  for accounting structures (debug mode), slab\n"
"\t\t\t  coloring (incremental small offsets to stagger\n"
"\t\t\t  buffer alignment), or the per-CPU magazine layer\n");
}
745 
/*
 * qsort(3C)-style comparator for arrays of uintptr_t: orders addresses
 * ascending.  Returns -1, 0 or 1.
 */
static int
addrcmp(const void *lhs, const void *rhs)
{
	uintptr_t a = *((uintptr_t *)lhs);
	uintptr_t b = *((uintptr_t *)rhs);

	if (a == b)
		return (0);

	return (a < b ? -1 : 1);
}
758 
759 static int
bufctlcmp(const kmem_bufctl_audit_t ** lhs,const kmem_bufctl_audit_t ** rhs)760 bufctlcmp(const kmem_bufctl_audit_t **lhs, const kmem_bufctl_audit_t **rhs)
761 {
762 	const kmem_bufctl_audit_t *bcp1 = *lhs;
763 	const kmem_bufctl_audit_t *bcp2 = *rhs;
764 
765 	if (bcp1->bc_timestamp > bcp2->bc_timestamp)
766 		return (-1);
767 
768 	if (bcp1->bc_timestamp < bcp2->bc_timestamp)
769 		return (1);
770 
771 	return (0);
772 }
773 
774 typedef struct kmem_hash_walk {
775 	uintptr_t *kmhw_table;
776 	size_t kmhw_nelems;
777 	size_t kmhw_pos;
778 	kmem_bufctl_t kmhw_cur;
779 } kmem_hash_walk_t;
780 
781 int
kmem_hash_walk_init(mdb_walk_state_t * wsp)782 kmem_hash_walk_init(mdb_walk_state_t *wsp)
783 {
784 	kmem_hash_walk_t *kmhw;
785 	uintptr_t *hash;
786 	kmem_cache_t c;
787 	uintptr_t haddr, addr = wsp->walk_addr;
788 	size_t nelems;
789 	size_t hsize;
790 
791 	if (addr == NULL) {
792 		mdb_warn("kmem_hash doesn't support global walks\n");
793 		return (WALK_ERR);
794 	}
795 
796 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
797 		mdb_warn("couldn't read cache at addr %p", addr);
798 		return (WALK_ERR);
799 	}
800 
801 	if (!(c.cache_flags & KMF_HASH)) {
802 		mdb_warn("cache %p doesn't have a hash table\n", addr);
803 		return (WALK_DONE);		/* nothing to do */
804 	}
805 
806 	kmhw = mdb_zalloc(sizeof (kmem_hash_walk_t), UM_SLEEP);
807 	kmhw->kmhw_cur.bc_next = NULL;
808 	kmhw->kmhw_pos = 0;
809 
810 	kmhw->kmhw_nelems = nelems = c.cache_hash_mask + 1;
811 	hsize = nelems * sizeof (uintptr_t);
812 	haddr = (uintptr_t)c.cache_hash_table;
813 
814 	kmhw->kmhw_table = hash = mdb_alloc(hsize, UM_SLEEP);
815 	if (mdb_vread(hash, hsize, haddr) == -1) {
816 		mdb_warn("failed to read hash table at %p", haddr);
817 		mdb_free(hash, hsize);
818 		mdb_free(kmhw, sizeof (kmem_hash_walk_t));
819 		return (WALK_ERR);
820 	}
821 
822 	wsp->walk_data = kmhw;
823 
824 	return (WALK_NEXT);
825 }
826 
827 int
kmem_hash_walk_step(mdb_walk_state_t * wsp)828 kmem_hash_walk_step(mdb_walk_state_t *wsp)
829 {
830 	kmem_hash_walk_t *kmhw = wsp->walk_data;
831 	uintptr_t addr = NULL;
832 
833 	if ((addr = (uintptr_t)kmhw->kmhw_cur.bc_next) == NULL) {
834 		while (kmhw->kmhw_pos < kmhw->kmhw_nelems) {
835 			if ((addr = kmhw->kmhw_table[kmhw->kmhw_pos++]) != NULL)
836 				break;
837 		}
838 	}
839 	if (addr == NULL)
840 		return (WALK_DONE);
841 
842 	if (mdb_vread(&kmhw->kmhw_cur, sizeof (kmem_bufctl_t), addr) == -1) {
843 		mdb_warn("couldn't read kmem_bufctl_t at addr %p", addr);
844 		return (WALK_ERR);
845 	}
846 
847 	return (wsp->walk_callback(addr, &kmhw->kmhw_cur, wsp->walk_cbdata));
848 }
849 
850 void
kmem_hash_walk_fini(mdb_walk_state_t * wsp)851 kmem_hash_walk_fini(mdb_walk_state_t *wsp)
852 {
853 	kmem_hash_walk_t *kmhw = wsp->walk_data;
854 
855 	if (kmhw == NULL)
856 		return;
857 
858 	mdb_free(kmhw->kmhw_table, kmhw->kmhw_nelems * sizeof (uintptr_t));
859 	mdb_free(kmhw, sizeof (kmem_hash_walk_t));
860 }
861 
862 /*
863  * Find the address of the bufctl structure for the address 'buf' in cache
864  * 'cp', which is at address caddr, and place it in *out.
865  */
866 static int
kmem_hash_lookup(kmem_cache_t * cp,uintptr_t caddr,void * buf,uintptr_t * out)867 kmem_hash_lookup(kmem_cache_t *cp, uintptr_t caddr, void *buf, uintptr_t *out)
868 {
869 	uintptr_t bucket = (uintptr_t)KMEM_HASH(cp, buf);
870 	kmem_bufctl_t *bcp;
871 	kmem_bufctl_t bc;
872 
873 	if (mdb_vread(&bcp, sizeof (kmem_bufctl_t *), bucket) == -1) {
874 		mdb_warn("unable to read hash bucket for %p in cache %p",
875 		    buf, caddr);
876 		return (-1);
877 	}
878 
879 	while (bcp != NULL) {
880 		if (mdb_vread(&bc, sizeof (kmem_bufctl_t),
881 		    (uintptr_t)bcp) == -1) {
882 			mdb_warn("unable to read bufctl at %p", bcp);
883 			return (-1);
884 		}
885 		if (bc.bc_addr == buf) {
886 			*out = (uintptr_t)bcp;
887 			return (0);
888 		}
889 		bcp = bc.bc_next;
890 	}
891 
892 	mdb_warn("unable to find bufctl for %p in cache %p\n", buf, caddr);
893 	return (-1);
894 }
895 
896 int
kmem_get_magsize(const kmem_cache_t * cp)897 kmem_get_magsize(const kmem_cache_t *cp)
898 {
899 	uintptr_t addr = (uintptr_t)cp->cache_magtype;
900 	GElf_Sym mt_sym;
901 	kmem_magtype_t mt;
902 	int res;
903 
904 	/*
905 	 * if cpu 0 has a non-zero magsize, it must be correct.  caches
906 	 * with KMF_NOMAGAZINE have disabled their magazine layers, so
907 	 * it is okay to return 0 for them.
908 	 */
909 	if ((res = cp->cache_cpu[0].cc_magsize) != 0 ||
910 	    (cp->cache_flags & KMF_NOMAGAZINE))
911 		return (res);
912 
913 	if (mdb_lookup_by_name("kmem_magtype", &mt_sym) == -1) {
914 		mdb_warn("unable to read 'kmem_magtype'");
915 	} else if (addr < mt_sym.st_value ||
916 	    addr + sizeof (mt) - 1 > mt_sym.st_value + mt_sym.st_size - 1 ||
917 	    ((addr - mt_sym.st_value) % sizeof (mt)) != 0) {
918 		mdb_warn("cache '%s' has invalid magtype pointer (%p)\n",
919 		    cp->cache_name, addr);
920 		return (0);
921 	}
922 	if (mdb_vread(&mt, sizeof (mt), addr) == -1) {
923 		mdb_warn("unable to read magtype at %a", addr);
924 		return (0);
925 	}
926 	return (mt.mt_magsize);
927 }
928 
929 /*ARGSUSED*/
930 static int
kmem_estimate_slab(uintptr_t addr,const kmem_slab_t * sp,size_t * est)931 kmem_estimate_slab(uintptr_t addr, const kmem_slab_t *sp, size_t *est)
932 {
933 	*est -= (sp->slab_chunks - sp->slab_refcnt);
934 
935 	return (WALK_NEXT);
936 }
937 
938 /*
939  * Returns an upper bound on the number of allocated buffers in a given
940  * cache.
941  */
942 size_t
kmem_estimate_allocated(uintptr_t addr,const kmem_cache_t * cp)943 kmem_estimate_allocated(uintptr_t addr, const kmem_cache_t *cp)
944 {
945 	int magsize;
946 	size_t cache_est;
947 
948 	cache_est = cp->cache_buftotal;
949 
950 	(void) mdb_pwalk("kmem_slab_partial",
951 	    (mdb_walk_cb_t)kmem_estimate_slab, &cache_est, addr);
952 
953 	if ((magsize = kmem_get_magsize(cp)) != 0) {
954 		size_t mag_est = cp->cache_full.ml_total * magsize;
955 
956 		if (cache_est >= mag_est) {
957 			cache_est -= mag_est;
958 		} else {
959 			mdb_warn("cache %p's magazine layer holds more buffers "
960 			    "than the slab layer.\n", addr);
961 		}
962 	}
963 	return (cache_est);
964 }
965 
/*
 * Read the magazine at target address 'kmp' into the local buffer 'mp' and
 * append its first 'rounds' entries to maglist[].  Jumps to the enclosing
 * function's "fail" label if the read fails or maglist[] becomes full.
 *
 * Relies on these names in the calling scope: mp, kmp, magbsize, maglist,
 * magcnt, magmax, i, and the label "fail".
 *
 * The body is wrapped in do/while (0) so the expansion is one statement,
 * and 'rounds' is parenthesized so any expression may be passed.
 */
#define	READMAG_ROUNDS(rounds) do { \
	if (mdb_vread(mp, magbsize, (uintptr_t)kmp) == -1) { \
		mdb_warn("couldn't read magazine at %p", kmp); \
		goto fail; \
	} \
	for (i = 0; i < (rounds); i++) { \
		maglist[magcnt++] = mp->mag_round[i]; \
		if (magcnt == magmax) { \
			mdb_warn("%d magazines exceeds fudge factor\n", \
			    magcnt); \
			goto fail; \
		} \
	} \
} while (0)
980 
981 int
kmem_read_magazines(kmem_cache_t * cp,uintptr_t addr,int ncpus,void *** maglistp,size_t * magcntp,size_t * magmaxp,int alloc_flags)982 kmem_read_magazines(kmem_cache_t *cp, uintptr_t addr, int ncpus,
983     void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
984 {
985 	kmem_magazine_t *kmp, *mp;
986 	void **maglist = NULL;
987 	int i, cpu;
988 	size_t magsize, magmax, magbsize;
989 	size_t magcnt = 0;
990 
991 	/*
992 	 * Read the magtype out of the cache, after verifying the pointer's
993 	 * correctness.
994 	 */
995 	magsize = kmem_get_magsize(cp);
996 	if (magsize == 0) {
997 		*maglistp = NULL;
998 		*magcntp = 0;
999 		*magmaxp = 0;
1000 		return (WALK_NEXT);
1001 	}
1002 
1003 	/*
1004 	 * There are several places where we need to go buffer hunting:
1005 	 * the per-CPU loaded magazine, the per-CPU spare full magazine,
1006 	 * and the full magazine list in the depot.
1007 	 *
1008 	 * For an upper bound on the number of buffers in the magazine
1009 	 * layer, we have the number of magazines on the cache_full
1010 	 * list plus at most two magazines per CPU (the loaded and the
1011 	 * spare).  Toss in 100 magazines as a fudge factor in case this
1012 	 * is live (the number "100" comes from the same fudge factor in
1013 	 * crash(1M)).
1014 	 */
1015 	magmax = (cp->cache_full.ml_total + 2 * ncpus + 100) * magsize;
1016 	magbsize = offsetof(kmem_magazine_t, mag_round[magsize]);
1017 
1018 	if (magbsize >= PAGESIZE / 2) {
1019 		mdb_warn("magazine size for cache %p unreasonable (%x)\n",
1020 		    addr, magbsize);
1021 		return (WALK_ERR);
1022 	}
1023 
1024 	maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
1025 	mp = mdb_alloc(magbsize, alloc_flags);
1026 	if (mp == NULL || maglist == NULL)
1027 		goto fail;
1028 
1029 	/*
1030 	 * First up: the magazines in the depot (i.e. on the cache_full list).
1031 	 */
1032 	for (kmp = cp->cache_full.ml_list; kmp != NULL; ) {
1033 		READMAG_ROUNDS(magsize);
1034 		kmp = mp->mag_next;
1035 
1036 		if (kmp == cp->cache_full.ml_list)
1037 			break; /* cache_full list loop detected */
1038 	}
1039 
1040 	dprintf(("cache_full list done\n"));
1041 
1042 	/*
1043 	 * Now whip through the CPUs, snagging the loaded magazines
1044 	 * and full spares.
1045 	 *
1046 	 * In order to prevent inconsistent dumps, rounds and prounds
1047 	 * are copied aside before dumping begins.
1048 	 */
1049 	for (cpu = 0; cpu < ncpus; cpu++) {
1050 		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu];
1051 		short rounds, prounds;
1052 
1053 		if (KMEM_DUMPCC(ccp)) {
1054 			rounds = ccp->cc_dump_rounds;
1055 			prounds = ccp->cc_dump_prounds;
1056 		} else {
1057 			rounds = ccp->cc_rounds;
1058 			prounds = ccp->cc_prounds;
1059 		}
1060 
1061 		dprintf(("reading cpu cache %p\n",
1062 		    (uintptr_t)ccp - (uintptr_t)cp + addr));
1063 
1064 		if (rounds > 0 &&
1065 		    (kmp = ccp->cc_loaded) != NULL) {
1066 			dprintf(("reading %d loaded rounds\n", rounds));
1067 			READMAG_ROUNDS(rounds);
1068 		}
1069 
1070 		if (prounds > 0 &&
1071 		    (kmp = ccp->cc_ploaded) != NULL) {
1072 			dprintf(("reading %d previously loaded rounds\n",
1073 			    prounds));
1074 			READMAG_ROUNDS(prounds);
1075 		}
1076 	}
1077 
1078 	dprintf(("magazine layer: %d buffers\n", magcnt));
1079 
1080 	if (!(alloc_flags & UM_GC))
1081 		mdb_free(mp, magbsize);
1082 
1083 	*maglistp = maglist;
1084 	*magcntp = magcnt;
1085 	*magmaxp = magmax;
1086 
1087 	return (WALK_NEXT);
1088 
1089 fail:
1090 	if (!(alloc_flags & UM_GC)) {
1091 		if (mp)
1092 			mdb_free(mp, magbsize);
1093 		if (maglist)
1094 			mdb_free(maglist, magmax * sizeof (void *));
1095 	}
1096 	return (WALK_ERR);
1097 }
1098 
1099 static int
kmem_walk_callback(mdb_walk_state_t * wsp,uintptr_t buf)1100 kmem_walk_callback(mdb_walk_state_t *wsp, uintptr_t buf)
1101 {
1102 	return (wsp->walk_callback(buf, NULL, wsp->walk_cbdata));
1103 }
1104 
1105 static int
bufctl_walk_callback(kmem_cache_t * cp,mdb_walk_state_t * wsp,uintptr_t buf)1106 bufctl_walk_callback(kmem_cache_t *cp, mdb_walk_state_t *wsp, uintptr_t buf)
1107 {
1108 	kmem_bufctl_audit_t b;
1109 
1110 	/*
1111 	 * if KMF_AUDIT is not set, we know that we're looking at a
1112 	 * kmem_bufctl_t.
1113 	 */
1114 	if (!(cp->cache_flags & KMF_AUDIT) ||
1115 	    mdb_vread(&b, sizeof (kmem_bufctl_audit_t), buf) == -1) {
1116 		(void) memset(&b, 0, sizeof (b));
1117 		if (mdb_vread(&b, sizeof (kmem_bufctl_t), buf) == -1) {
1118 			mdb_warn("unable to read bufctl at %p", buf);
1119 			return (WALK_ERR);
1120 		}
1121 	}
1122 
1123 	return (wsp->walk_callback(buf, &b, wsp->walk_cbdata));
1124 }
1125 
1126 typedef struct kmem_walk {
1127 	int kmw_type;
1128 
1129 	uintptr_t kmw_addr;		/* cache address */
1130 	kmem_cache_t *kmw_cp;
1131 	size_t kmw_csize;
1132 
1133 	/*
1134 	 * magazine layer
1135 	 */
1136 	void **kmw_maglist;
1137 	size_t kmw_max;
1138 	size_t kmw_count;
1139 	size_t kmw_pos;
1140 
1141 	/*
1142 	 * slab layer
1143 	 */
1144 	char *kmw_valid;	/* to keep track of freed buffers */
1145 	char *kmw_ubase;	/* buffer for slab data */
1146 } kmem_walk_t;
1147 
1148 static int
kmem_walk_init_common(mdb_walk_state_t * wsp,int type)1149 kmem_walk_init_common(mdb_walk_state_t *wsp, int type)
1150 {
1151 	kmem_walk_t *kmw;
1152 	int ncpus, csize;
1153 	kmem_cache_t *cp;
1154 	size_t vm_quantum;
1155 
1156 	size_t magmax, magcnt;
1157 	void **maglist = NULL;
1158 	uint_t chunksize, slabsize;
1159 	int status = WALK_ERR;
1160 	uintptr_t addr = wsp->walk_addr;
1161 	const char *layered;
1162 
1163 	type &= ~KM_HASH;
1164 
1165 	if (addr == NULL) {
1166 		mdb_warn("kmem walk doesn't support global walks\n");
1167 		return (WALK_ERR);
1168 	}
1169 
1170 	dprintf(("walking %p\n", addr));
1171 
1172 	/*
1173 	 * First we need to figure out how many CPUs are configured in the
1174 	 * system to know how much to slurp out.
1175 	 */
1176 	mdb_readvar(&ncpus, "max_ncpus");
1177 
1178 	csize = KMEM_CACHE_SIZE(ncpus);
1179 	cp = mdb_alloc(csize, UM_SLEEP);
1180 
1181 	if (mdb_vread(cp, csize, addr) == -1) {
1182 		mdb_warn("couldn't read cache at addr %p", addr);
1183 		goto out2;
1184 	}
1185 
1186 	/*
1187 	 * It's easy for someone to hand us an invalid cache address.
1188 	 * Unfortunately, it is hard for this walker to survive an
1189 	 * invalid cache cleanly.  So we make sure that:
1190 	 *
1191 	 *	1. the vmem arena for the cache is readable,
1192 	 *	2. the vmem arena's quantum is a power of 2,
1193 	 *	3. our slabsize is a multiple of the quantum, and
1194 	 *	4. our chunksize is >0 and less than our slabsize.
1195 	 */
1196 	if (mdb_vread(&vm_quantum, sizeof (vm_quantum),
1197 	    (uintptr_t)&cp->cache_arena->vm_quantum) == -1 ||
1198 	    vm_quantum == 0 ||
1199 	    (vm_quantum & (vm_quantum - 1)) != 0 ||
1200 	    cp->cache_slabsize < vm_quantum ||
1201 	    P2PHASE(cp->cache_slabsize, vm_quantum) != 0 ||
1202 	    cp->cache_chunksize == 0 ||
1203 	    cp->cache_chunksize > cp->cache_slabsize) {
1204 		mdb_warn("%p is not a valid kmem_cache_t\n", addr);
1205 		goto out2;
1206 	}
1207 
1208 	dprintf(("buf total is %d\n", cp->cache_buftotal));
1209 
1210 	if (cp->cache_buftotal == 0) {
1211 		mdb_free(cp, csize);
1212 		return (WALK_DONE);
1213 	}
1214 
1215 	/*
1216 	 * If they ask for bufctls, but it's a small-slab cache,
1217 	 * there is nothing to report.
1218 	 */
1219 	if ((type & KM_BUFCTL) && !(cp->cache_flags & KMF_HASH)) {
1220 		dprintf(("bufctl requested, not KMF_HASH (flags: %p)\n",
1221 		    cp->cache_flags));
1222 		mdb_free(cp, csize);
1223 		return (WALK_DONE);
1224 	}
1225 
1226 	/*
1227 	 * If they want constructed buffers, but there's no constructor or
1228 	 * the cache has DEADBEEF checking enabled, there is nothing to report.
1229 	 */
1230 	if ((type & KM_CONSTRUCTED) && (!(type & KM_FREE) ||
1231 	    cp->cache_constructor == NULL ||
1232 	    (cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) == KMF_DEADBEEF)) {
1233 		mdb_free(cp, csize);
1234 		return (WALK_DONE);
1235 	}
1236 
1237 	/*
1238 	 * Read in the contents of the magazine layer
1239 	 */
1240 	if (kmem_read_magazines(cp, addr, ncpus, &maglist, &magcnt,
1241 	    &magmax, UM_SLEEP) == WALK_ERR)
1242 		goto out2;
1243 
1244 	/*
1245 	 * We have all of the buffers from the magazines;  if we are walking
1246 	 * allocated buffers, sort them so we can bsearch them later.
1247 	 */
1248 	if (type & KM_ALLOCATED)
1249 		qsort(maglist, magcnt, sizeof (void *), addrcmp);
1250 
1251 	wsp->walk_data = kmw = mdb_zalloc(sizeof (kmem_walk_t), UM_SLEEP);
1252 
1253 	kmw->kmw_type = type;
1254 	kmw->kmw_addr = addr;
1255 	kmw->kmw_cp = cp;
1256 	kmw->kmw_csize = csize;
1257 	kmw->kmw_maglist = maglist;
1258 	kmw->kmw_max = magmax;
1259 	kmw->kmw_count = magcnt;
1260 	kmw->kmw_pos = 0;
1261 
1262 	/*
1263 	 * When walking allocated buffers in a KMF_HASH cache, we walk the
1264 	 * hash table instead of the slab layer.
1265 	 */
1266 	if ((cp->cache_flags & KMF_HASH) && (type & KM_ALLOCATED)) {
1267 		layered = "kmem_hash";
1268 
1269 		kmw->kmw_type |= KM_HASH;
1270 	} else {
1271 		/*
1272 		 * If we are walking freed buffers, we only need the
1273 		 * magazine layer plus the partially allocated slabs.
1274 		 * To walk allocated buffers, we need all of the slabs.
1275 		 */
1276 		if (type & KM_ALLOCATED)
1277 			layered = "kmem_slab";
1278 		else
1279 			layered = "kmem_slab_partial";
1280 
1281 		/*
1282 		 * for small-slab caches, we read in the entire slab.  For
1283 		 * freed buffers, we can just walk the freelist.  For
1284 		 * allocated buffers, we use a 'valid' array to track
1285 		 * the freed buffers.
1286 		 */
1287 		if (!(cp->cache_flags & KMF_HASH)) {
1288 			chunksize = cp->cache_chunksize;
1289 			slabsize = cp->cache_slabsize;
1290 
1291 			kmw->kmw_ubase = mdb_alloc(slabsize +
1292 			    sizeof (kmem_bufctl_t), UM_SLEEP);
1293 
1294 			if (type & KM_ALLOCATED)
1295 				kmw->kmw_valid =
1296 				    mdb_alloc(slabsize / chunksize, UM_SLEEP);
1297 		}
1298 	}
1299 
1300 	status = WALK_NEXT;
1301 
1302 	if (mdb_layered_walk(layered, wsp) == -1) {
1303 		mdb_warn("unable to start layered '%s' walk", layered);
1304 		status = WALK_ERR;
1305 	}
1306 
1307 out1:
1308 	if (status == WALK_ERR) {
1309 		if (kmw->kmw_valid)
1310 			mdb_free(kmw->kmw_valid, slabsize / chunksize);
1311 
1312 		if (kmw->kmw_ubase)
1313 			mdb_free(kmw->kmw_ubase, slabsize +
1314 			    sizeof (kmem_bufctl_t));
1315 
1316 		if (kmw->kmw_maglist)
1317 			mdb_free(kmw->kmw_maglist,
1318 			    kmw->kmw_max * sizeof (uintptr_t));
1319 
1320 		mdb_free(kmw, sizeof (kmem_walk_t));
1321 		wsp->walk_data = NULL;
1322 	}
1323 
1324 out2:
1325 	if (status == WALK_ERR)
1326 		mdb_free(cp, csize);
1327 
1328 	return (status);
1329 }
1330 
1331 int
kmem_walk_step(mdb_walk_state_t * wsp)1332 kmem_walk_step(mdb_walk_state_t *wsp)
1333 {
1334 	kmem_walk_t *kmw = wsp->walk_data;
1335 	int type = kmw->kmw_type;
1336 	kmem_cache_t *cp = kmw->kmw_cp;
1337 
1338 	void **maglist = kmw->kmw_maglist;
1339 	int magcnt = kmw->kmw_count;
1340 
1341 	uintptr_t chunksize, slabsize;
1342 	uintptr_t addr;
1343 	const kmem_slab_t *sp;
1344 	const kmem_bufctl_t *bcp;
1345 	kmem_bufctl_t bc;
1346 
1347 	int chunks;
1348 	char *kbase;
1349 	void *buf;
1350 	int i, ret;
1351 
1352 	char *valid, *ubase;
1353 
1354 	/*
1355 	 * first, handle the 'kmem_hash' layered walk case
1356 	 */
1357 	if (type & KM_HASH) {
1358 		/*
1359 		 * We have a buffer which has been allocated out of the
1360 		 * global layer. We need to make sure that it's not
1361 		 * actually sitting in a magazine before we report it as
1362 		 * an allocated buffer.
1363 		 */
1364 		buf = ((const kmem_bufctl_t *)wsp->walk_layer)->bc_addr;
1365 
1366 		if (magcnt > 0 &&
1367 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1368 		    addrcmp) != NULL)
1369 			return (WALK_NEXT);
1370 
1371 		if (type & KM_BUFCTL)
1372 			return (bufctl_walk_callback(cp, wsp, wsp->walk_addr));
1373 
1374 		return (kmem_walk_callback(wsp, (uintptr_t)buf));
1375 	}
1376 
1377 	ret = WALK_NEXT;
1378 
1379 	addr = kmw->kmw_addr;
1380 
1381 	/*
1382 	 * If we're walking freed buffers, report everything in the
1383 	 * magazine layer before processing the first slab.
1384 	 */
1385 	if ((type & KM_FREE) && magcnt != 0) {
1386 		kmw->kmw_count = 0;		/* only do this once */
1387 		for (i = 0; i < magcnt; i++) {
1388 			buf = maglist[i];
1389 
1390 			if (type & KM_BUFCTL) {
1391 				uintptr_t out;
1392 
1393 				if (cp->cache_flags & KMF_BUFTAG) {
1394 					kmem_buftag_t *btp;
1395 					kmem_buftag_t tag;
1396 
1397 					/* LINTED - alignment */
1398 					btp = KMEM_BUFTAG(cp, buf);
1399 					if (mdb_vread(&tag, sizeof (tag),
1400 					    (uintptr_t)btp) == -1) {
1401 						mdb_warn("reading buftag for "
1402 						    "%p at %p", buf, btp);
1403 						continue;
1404 					}
1405 					out = (uintptr_t)tag.bt_bufctl;
1406 				} else {
1407 					if (kmem_hash_lookup(cp, addr, buf,
1408 					    &out) == -1)
1409 						continue;
1410 				}
1411 				ret = bufctl_walk_callback(cp, wsp, out);
1412 			} else {
1413 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1414 			}
1415 
1416 			if (ret != WALK_NEXT)
1417 				return (ret);
1418 		}
1419 	}
1420 
1421 	/*
1422 	 * If they want constructed buffers, we're finished, since the
1423 	 * magazine layer holds them all.
1424 	 */
1425 	if (type & KM_CONSTRUCTED)
1426 		return (WALK_DONE);
1427 
1428 	/*
1429 	 * Handle the buffers in the current slab
1430 	 */
1431 	chunksize = cp->cache_chunksize;
1432 	slabsize = cp->cache_slabsize;
1433 
1434 	sp = wsp->walk_layer;
1435 	chunks = sp->slab_chunks;
1436 	kbase = sp->slab_base;
1437 
1438 	dprintf(("kbase is %p\n", kbase));
1439 
1440 	if (!(cp->cache_flags & KMF_HASH)) {
1441 		valid = kmw->kmw_valid;
1442 		ubase = kmw->kmw_ubase;
1443 
1444 		if (mdb_vread(ubase, chunks * chunksize,
1445 		    (uintptr_t)kbase) == -1) {
1446 			mdb_warn("failed to read slab contents at %p", kbase);
1447 			return (WALK_ERR);
1448 		}
1449 
1450 		/*
1451 		 * Set up the valid map as fully allocated -- we'll punch
1452 		 * out the freelist.
1453 		 */
1454 		if (type & KM_ALLOCATED)
1455 			(void) memset(valid, 1, chunks);
1456 	} else {
1457 		valid = NULL;
1458 		ubase = NULL;
1459 	}
1460 
1461 	/*
1462 	 * walk the slab's freelist
1463 	 */
1464 	bcp = sp->slab_head;
1465 
1466 	dprintf(("refcnt is %d; chunks is %d\n", sp->slab_refcnt, chunks));
1467 
1468 	/*
1469 	 * since we could be in the middle of allocating a buffer,
1470 	 * our refcnt could be one higher than it aught.  So we
1471 	 * check one further on the freelist than the count allows.
1472 	 */
1473 	for (i = sp->slab_refcnt; i <= chunks; i++) {
1474 		uint_t ndx;
1475 
1476 		dprintf(("bcp is %p\n", bcp));
1477 
1478 		if (bcp == NULL) {
1479 			if (i == chunks)
1480 				break;
1481 			mdb_warn(
1482 			    "slab %p in cache %p freelist too short by %d\n",
1483 			    sp, addr, chunks - i);
1484 			break;
1485 		}
1486 
1487 		if (cp->cache_flags & KMF_HASH) {
1488 			if (mdb_vread(&bc, sizeof (bc), (uintptr_t)bcp) == -1) {
1489 				mdb_warn("failed to read bufctl ptr at %p",
1490 				    bcp);
1491 				break;
1492 			}
1493 			buf = bc.bc_addr;
1494 		} else {
1495 			/*
1496 			 * Otherwise the buffer is (or should be) in the slab
1497 			 * that we've read in; determine its offset in the
1498 			 * slab, validate that it's not corrupt, and add to
1499 			 * our base address to find the umem_bufctl_t.  (Note
1500 			 * that we don't need to add the size of the bufctl
1501 			 * to our offset calculation because of the slop that's
1502 			 * allocated for the buffer at ubase.)
1503 			 */
1504 			uintptr_t offs = (uintptr_t)bcp - (uintptr_t)kbase;
1505 
1506 			if (offs > chunks * chunksize) {
1507 				mdb_warn("found corrupt bufctl ptr %p"
1508 				    " in slab %p in cache %p\n", bcp,
1509 				    wsp->walk_addr, addr);
1510 				break;
1511 			}
1512 
1513 			bc = *((kmem_bufctl_t *)((uintptr_t)ubase + offs));
1514 			buf = KMEM_BUF(cp, bcp);
1515 		}
1516 
1517 		ndx = ((uintptr_t)buf - (uintptr_t)kbase) / chunksize;
1518 
1519 		if (ndx > slabsize / cp->cache_bufsize) {
1520 			/*
1521 			 * This is very wrong; we have managed to find
1522 			 * a buffer in the slab which shouldn't
1523 			 * actually be here.  Emit a warning, and
1524 			 * try to continue.
1525 			 */
1526 			mdb_warn("buf %p is out of range for "
1527 			    "slab %p, cache %p\n", buf, sp, addr);
1528 		} else if (type & KM_ALLOCATED) {
1529 			/*
1530 			 * we have found a buffer on the slab's freelist;
1531 			 * clear its entry
1532 			 */
1533 			valid[ndx] = 0;
1534 		} else {
1535 			/*
1536 			 * Report this freed buffer
1537 			 */
1538 			if (type & KM_BUFCTL) {
1539 				ret = bufctl_walk_callback(cp, wsp,
1540 				    (uintptr_t)bcp);
1541 			} else {
1542 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1543 			}
1544 			if (ret != WALK_NEXT)
1545 				return (ret);
1546 		}
1547 
1548 		bcp = bc.bc_next;
1549 	}
1550 
1551 	if (bcp != NULL) {
1552 		dprintf(("slab %p in cache %p freelist too long (%p)\n",
1553 		    sp, addr, bcp));
1554 	}
1555 
1556 	/*
1557 	 * If we are walking freed buffers, the loop above handled reporting
1558 	 * them.
1559 	 */
1560 	if (type & KM_FREE)
1561 		return (WALK_NEXT);
1562 
1563 	if (type & KM_BUFCTL) {
1564 		mdb_warn("impossible situation: small-slab KM_BUFCTL walk for "
1565 		    "cache %p\n", addr);
1566 		return (WALK_ERR);
1567 	}
1568 
1569 	/*
1570 	 * Report allocated buffers, skipping buffers in the magazine layer.
1571 	 * We only get this far for small-slab caches.
1572 	 */
1573 	for (i = 0; ret == WALK_NEXT && i < chunks; i++) {
1574 		buf = (char *)kbase + i * chunksize;
1575 
1576 		if (!valid[i])
1577 			continue;		/* on slab freelist */
1578 
1579 		if (magcnt > 0 &&
1580 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1581 		    addrcmp) != NULL)
1582 			continue;		/* in magazine layer */
1583 
1584 		ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1585 	}
1586 	return (ret);
1587 }
1588 
1589 void
kmem_walk_fini(mdb_walk_state_t * wsp)1590 kmem_walk_fini(mdb_walk_state_t *wsp)
1591 {
1592 	kmem_walk_t *kmw = wsp->walk_data;
1593 	uintptr_t chunksize;
1594 	uintptr_t slabsize;
1595 
1596 	if (kmw == NULL)
1597 		return;
1598 
1599 	if (kmw->kmw_maglist != NULL)
1600 		mdb_free(kmw->kmw_maglist, kmw->kmw_max * sizeof (void *));
1601 
1602 	chunksize = kmw->kmw_cp->cache_chunksize;
1603 	slabsize = kmw->kmw_cp->cache_slabsize;
1604 
1605 	if (kmw->kmw_valid != NULL)
1606 		mdb_free(kmw->kmw_valid, slabsize / chunksize);
1607 	if (kmw->kmw_ubase != NULL)
1608 		mdb_free(kmw->kmw_ubase, slabsize + sizeof (kmem_bufctl_t));
1609 
1610 	mdb_free(kmw->kmw_cp, kmw->kmw_csize);
1611 	mdb_free(kmw, sizeof (kmem_walk_t));
1612 }
1613 
1614 /*ARGSUSED*/
1615 static int
kmem_walk_all(uintptr_t addr,const kmem_cache_t * c,mdb_walk_state_t * wsp)1616 kmem_walk_all(uintptr_t addr, const kmem_cache_t *c, mdb_walk_state_t *wsp)
1617 {
1618 	/*
1619 	 * Buffers allocated from NOTOUCH caches can also show up as freed
1620 	 * memory in other caches.  This can be a little confusing, so we
1621 	 * don't walk NOTOUCH caches when walking all caches (thereby assuring
1622 	 * that "::walk kmem" and "::walk freemem" yield disjoint output).
1623 	 */
1624 	if (c->cache_cflags & KMC_NOTOUCH)
1625 		return (WALK_NEXT);
1626 
1627 	if (mdb_pwalk(wsp->walk_data, wsp->walk_callback,
1628 	    wsp->walk_cbdata, addr) == -1)
1629 		return (WALK_DONE);
1630 
1631 	return (WALK_NEXT);
1632 }
1633 
/*
 * KMEM_WALK_ALL(name, wsp): perform a global walk by running the walker
 * 'name' over every cache via kmem_walk_all().  This macro always returns
 * from the enclosing function: WALK_ERR if the "kmem_cache" walk fails,
 * WALK_DONE otherwise.
 */
#define	KMEM_WALK_ALL(name, wsp) { \
	(wsp)->walk_data = (name); \
	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_walk_all, \
	    (wsp)) == -1) \
		return (WALK_ERR); \
	return (WALK_DONE); \
}
1640 
1641 int
kmem_walk_init(mdb_walk_state_t * wsp)1642 kmem_walk_init(mdb_walk_state_t *wsp)
1643 {
1644 	if (wsp->walk_arg != NULL)
1645 		wsp->walk_addr = (uintptr_t)wsp->walk_arg;
1646 
1647 	if (wsp->walk_addr == NULL)
1648 		KMEM_WALK_ALL("kmem", wsp);
1649 	return (kmem_walk_init_common(wsp, KM_ALLOCATED));
1650 }
1651 
1652 int
bufctl_walk_init(mdb_walk_state_t * wsp)1653 bufctl_walk_init(mdb_walk_state_t *wsp)
1654 {
1655 	if (wsp->walk_addr == NULL)
1656 		KMEM_WALK_ALL("bufctl", wsp);
1657 	return (kmem_walk_init_common(wsp, KM_ALLOCATED | KM_BUFCTL));
1658 }
1659 
1660 int
freemem_walk_init(mdb_walk_state_t * wsp)1661 freemem_walk_init(mdb_walk_state_t *wsp)
1662 {
1663 	if (wsp->walk_addr == NULL)
1664 		KMEM_WALK_ALL("freemem", wsp);
1665 	return (kmem_walk_init_common(wsp, KM_FREE));
1666 }
1667 
1668 int
freemem_constructed_walk_init(mdb_walk_state_t * wsp)1669 freemem_constructed_walk_init(mdb_walk_state_t *wsp)
1670 {
1671 	if (wsp->walk_addr == NULL)
1672 		KMEM_WALK_ALL("freemem_constructed", wsp);
1673 	return (kmem_walk_init_common(wsp, KM_FREE | KM_CONSTRUCTED));
1674 }
1675 
1676 int
freectl_walk_init(mdb_walk_state_t * wsp)1677 freectl_walk_init(mdb_walk_state_t *wsp)
1678 {
1679 	if (wsp->walk_addr == NULL)
1680 		KMEM_WALK_ALL("freectl", wsp);
1681 	return (kmem_walk_init_common(wsp, KM_FREE | KM_BUFCTL));
1682 }
1683 
1684 int
freectl_constructed_walk_init(mdb_walk_state_t * wsp)1685 freectl_constructed_walk_init(mdb_walk_state_t *wsp)
1686 {
1687 	if (wsp->walk_addr == NULL)
1688 		KMEM_WALK_ALL("freectl_constructed", wsp);
1689 	return (kmem_walk_init_common(wsp,
1690 	    KM_FREE | KM_BUFCTL | KM_CONSTRUCTED));
1691 }
1692 
1693 typedef struct bufctl_history_walk {
1694 	void		*bhw_next;
1695 	kmem_cache_t	*bhw_cache;
1696 	kmem_slab_t	*bhw_slab;
1697 	hrtime_t	bhw_timestamp;
1698 } bufctl_history_walk_t;
1699 
1700 int
bufctl_history_walk_init(mdb_walk_state_t * wsp)1701 bufctl_history_walk_init(mdb_walk_state_t *wsp)
1702 {
1703 	bufctl_history_walk_t *bhw;
1704 	kmem_bufctl_audit_t bc;
1705 	kmem_bufctl_audit_t bcn;
1706 
1707 	if (wsp->walk_addr == NULL) {
1708 		mdb_warn("bufctl_history walk doesn't support global walks\n");
1709 		return (WALK_ERR);
1710 	}
1711 
1712 	if (mdb_vread(&bc, sizeof (bc), wsp->walk_addr) == -1) {
1713 		mdb_warn("unable to read bufctl at %p", wsp->walk_addr);
1714 		return (WALK_ERR);
1715 	}
1716 
1717 	bhw = mdb_zalloc(sizeof (*bhw), UM_SLEEP);
1718 	bhw->bhw_timestamp = 0;
1719 	bhw->bhw_cache = bc.bc_cache;
1720 	bhw->bhw_slab = bc.bc_slab;
1721 
1722 	/*
1723 	 * sometimes the first log entry matches the base bufctl;  in that
1724 	 * case, skip the base bufctl.
1725 	 */
1726 	if (bc.bc_lastlog != NULL &&
1727 	    mdb_vread(&bcn, sizeof (bcn), (uintptr_t)bc.bc_lastlog) != -1 &&
1728 	    bc.bc_addr == bcn.bc_addr &&
1729 	    bc.bc_cache == bcn.bc_cache &&
1730 	    bc.bc_slab == bcn.bc_slab &&
1731 	    bc.bc_timestamp == bcn.bc_timestamp &&
1732 	    bc.bc_thread == bcn.bc_thread)
1733 		bhw->bhw_next = bc.bc_lastlog;
1734 	else
1735 		bhw->bhw_next = (void *)wsp->walk_addr;
1736 
1737 	wsp->walk_addr = (uintptr_t)bc.bc_addr;
1738 	wsp->walk_data = bhw;
1739 
1740 	return (WALK_NEXT);
1741 }
1742 
1743 int
bufctl_history_walk_step(mdb_walk_state_t * wsp)1744 bufctl_history_walk_step(mdb_walk_state_t *wsp)
1745 {
1746 	bufctl_history_walk_t *bhw = wsp->walk_data;
1747 	uintptr_t addr = (uintptr_t)bhw->bhw_next;
1748 	uintptr_t baseaddr = wsp->walk_addr;
1749 	kmem_bufctl_audit_t bc;
1750 
1751 	if (addr == NULL)
1752 		return (WALK_DONE);
1753 
1754 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1755 		mdb_warn("unable to read bufctl at %p", bhw->bhw_next);
1756 		return (WALK_ERR);
1757 	}
1758 
1759 	/*
1760 	 * The bufctl is only valid if the address, cache, and slab are
1761 	 * correct.  We also check that the timestamp is decreasing, to
1762 	 * prevent infinite loops.
1763 	 */
1764 	if ((uintptr_t)bc.bc_addr != baseaddr ||
1765 	    bc.bc_cache != bhw->bhw_cache ||
1766 	    bc.bc_slab != bhw->bhw_slab ||
1767 	    (bhw->bhw_timestamp != 0 && bc.bc_timestamp >= bhw->bhw_timestamp))
1768 		return (WALK_DONE);
1769 
1770 	bhw->bhw_next = bc.bc_lastlog;
1771 	bhw->bhw_timestamp = bc.bc_timestamp;
1772 
1773 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1774 }
1775 
1776 void
bufctl_history_walk_fini(mdb_walk_state_t * wsp)1777 bufctl_history_walk_fini(mdb_walk_state_t *wsp)
1778 {
1779 	bufctl_history_walk_t *bhw = wsp->walk_data;
1780 
1781 	mdb_free(bhw, sizeof (*bhw));
1782 }
1783 
1784 typedef struct kmem_log_walk {
1785 	kmem_bufctl_audit_t *klw_base;
1786 	kmem_bufctl_audit_t **klw_sorted;
1787 	kmem_log_header_t klw_lh;
1788 	size_t klw_size;
1789 	size_t klw_maxndx;
1790 	size_t klw_ndx;
1791 } kmem_log_walk_t;
1792 
1793 int
kmem_log_walk_init(mdb_walk_state_t * wsp)1794 kmem_log_walk_init(mdb_walk_state_t *wsp)
1795 {
1796 	uintptr_t lp = wsp->walk_addr;
1797 	kmem_log_walk_t *klw;
1798 	kmem_log_header_t *lhp;
1799 	int maxndx, i, j, k;
1800 
1801 	/*
1802 	 * By default (global walk), walk the kmem_transaction_log.  Otherwise
1803 	 * read the log whose kmem_log_header_t is stored at walk_addr.
1804 	 */
1805 	if (lp == NULL && mdb_readvar(&lp, "kmem_transaction_log") == -1) {
1806 		mdb_warn("failed to read 'kmem_transaction_log'");
1807 		return (WALK_ERR);
1808 	}
1809 
1810 	if (lp == NULL) {
1811 		mdb_warn("log is disabled\n");
1812 		return (WALK_ERR);
1813 	}
1814 
1815 	klw = mdb_zalloc(sizeof (kmem_log_walk_t), UM_SLEEP);
1816 	lhp = &klw->klw_lh;
1817 
1818 	if (mdb_vread(lhp, sizeof (kmem_log_header_t), lp) == -1) {
1819 		mdb_warn("failed to read log header at %p", lp);
1820 		mdb_free(klw, sizeof (kmem_log_walk_t));
1821 		return (WALK_ERR);
1822 	}
1823 
1824 	klw->klw_size = lhp->lh_chunksize * lhp->lh_nchunks;
1825 	klw->klw_base = mdb_alloc(klw->klw_size, UM_SLEEP);
1826 	maxndx = lhp->lh_chunksize / sizeof (kmem_bufctl_audit_t) - 1;
1827 
1828 	if (mdb_vread(klw->klw_base, klw->klw_size,
1829 	    (uintptr_t)lhp->lh_base) == -1) {
1830 		mdb_warn("failed to read log at base %p", lhp->lh_base);
1831 		mdb_free(klw->klw_base, klw->klw_size);
1832 		mdb_free(klw, sizeof (kmem_log_walk_t));
1833 		return (WALK_ERR);
1834 	}
1835 
1836 	klw->klw_sorted = mdb_alloc(maxndx * lhp->lh_nchunks *
1837 	    sizeof (kmem_bufctl_audit_t *), UM_SLEEP);
1838 
1839 	for (i = 0, k = 0; i < lhp->lh_nchunks; i++) {
1840 		kmem_bufctl_audit_t *chunk = (kmem_bufctl_audit_t *)
1841 		    ((uintptr_t)klw->klw_base + i * lhp->lh_chunksize);
1842 
1843 		for (j = 0; j < maxndx; j++)
1844 			klw->klw_sorted[k++] = &chunk[j];
1845 	}
1846 
1847 	qsort(klw->klw_sorted, k, sizeof (kmem_bufctl_audit_t *),
1848 	    (int(*)(const void *, const void *))bufctlcmp);
1849 
1850 	klw->klw_maxndx = k;
1851 	wsp->walk_data = klw;
1852 
1853 	return (WALK_NEXT);
1854 }
1855 
1856 int
kmem_log_walk_step(mdb_walk_state_t * wsp)1857 kmem_log_walk_step(mdb_walk_state_t *wsp)
1858 {
1859 	kmem_log_walk_t *klw = wsp->walk_data;
1860 	kmem_bufctl_audit_t *bcp;
1861 
1862 	if (klw->klw_ndx == klw->klw_maxndx)
1863 		return (WALK_DONE);
1864 
1865 	bcp = klw->klw_sorted[klw->klw_ndx++];
1866 
1867 	return (wsp->walk_callback((uintptr_t)bcp - (uintptr_t)klw->klw_base +
1868 	    (uintptr_t)klw->klw_lh.lh_base, bcp, wsp->walk_cbdata));
1869 }
1870 
1871 void
kmem_log_walk_fini(mdb_walk_state_t * wsp)1872 kmem_log_walk_fini(mdb_walk_state_t *wsp)
1873 {
1874 	kmem_log_walk_t *klw = wsp->walk_data;
1875 
1876 	mdb_free(klw->klw_base, klw->klw_size);
1877 	mdb_free(klw->klw_sorted, klw->klw_maxndx *
1878 	    sizeof (kmem_bufctl_audit_t *));
1879 	mdb_free(klw, sizeof (kmem_log_walk_t));
1880 }
1881 
1882 typedef struct allocdby_bufctl {
1883 	uintptr_t abb_addr;
1884 	hrtime_t abb_ts;
1885 } allocdby_bufctl_t;
1886 
1887 typedef struct allocdby_walk {
1888 	const char *abw_walk;
1889 	uintptr_t abw_thread;
1890 	size_t abw_nbufs;
1891 	size_t abw_size;
1892 	allocdby_bufctl_t *abw_buf;
1893 	size_t abw_ndx;
1894 } allocdby_walk_t;
1895 
1896 int
allocdby_walk_bufctl(uintptr_t addr,const kmem_bufctl_audit_t * bcp,allocdby_walk_t * abw)1897 allocdby_walk_bufctl(uintptr_t addr, const kmem_bufctl_audit_t *bcp,
1898     allocdby_walk_t *abw)
1899 {
1900 	if ((uintptr_t)bcp->bc_thread != abw->abw_thread)
1901 		return (WALK_NEXT);
1902 
1903 	if (abw->abw_nbufs == abw->abw_size) {
1904 		allocdby_bufctl_t *buf;
1905 		size_t oldsize = sizeof (allocdby_bufctl_t) * abw->abw_size;
1906 
1907 		buf = mdb_zalloc(oldsize << 1, UM_SLEEP);
1908 
1909 		bcopy(abw->abw_buf, buf, oldsize);
1910 		mdb_free(abw->abw_buf, oldsize);
1911 
1912 		abw->abw_size <<= 1;
1913 		abw->abw_buf = buf;
1914 	}
1915 
1916 	abw->abw_buf[abw->abw_nbufs].abb_addr = addr;
1917 	abw->abw_buf[abw->abw_nbufs].abb_ts = bcp->bc_timestamp;
1918 	abw->abw_nbufs++;
1919 
1920 	return (WALK_NEXT);
1921 }
1922 
1923 /*ARGSUSED*/
1924 int
allocdby_walk_cache(uintptr_t addr,const kmem_cache_t * c,allocdby_walk_t * abw)1925 allocdby_walk_cache(uintptr_t addr, const kmem_cache_t *c, allocdby_walk_t *abw)
1926 {
1927 	if (mdb_pwalk(abw->abw_walk, (mdb_walk_cb_t)allocdby_walk_bufctl,
1928 	    abw, addr) == -1) {
1929 		mdb_warn("couldn't walk bufctl for cache %p", addr);
1930 		return (WALK_DONE);
1931 	}
1932 
1933 	return (WALK_NEXT);
1934 }
1935 
1936 static int
allocdby_cmp(const allocdby_bufctl_t * lhs,const allocdby_bufctl_t * rhs)1937 allocdby_cmp(const allocdby_bufctl_t *lhs, const allocdby_bufctl_t *rhs)
1938 {
1939 	if (lhs->abb_ts < rhs->abb_ts)
1940 		return (1);
1941 	if (lhs->abb_ts > rhs->abb_ts)
1942 		return (-1);
1943 	return (0);
1944 }
1945 
1946 static int
allocdby_walk_init_common(mdb_walk_state_t * wsp,const char * walk)1947 allocdby_walk_init_common(mdb_walk_state_t *wsp, const char *walk)
1948 {
1949 	allocdby_walk_t *abw;
1950 
1951 	if (wsp->walk_addr == NULL) {
1952 		mdb_warn("allocdby walk doesn't support global walks\n");
1953 		return (WALK_ERR);
1954 	}
1955 
1956 	abw = mdb_zalloc(sizeof (allocdby_walk_t), UM_SLEEP);
1957 
1958 	abw->abw_thread = wsp->walk_addr;
1959 	abw->abw_walk = walk;
1960 	abw->abw_size = 128;	/* something reasonable */
1961 	abw->abw_buf =
1962 	    mdb_zalloc(abw->abw_size * sizeof (allocdby_bufctl_t), UM_SLEEP);
1963 
1964 	wsp->walk_data = abw;
1965 
1966 	if (mdb_walk("kmem_cache",
1967 	    (mdb_walk_cb_t)allocdby_walk_cache, abw) == -1) {
1968 		mdb_warn("couldn't walk kmem_cache");
1969 		allocdby_walk_fini(wsp);
1970 		return (WALK_ERR);
1971 	}
1972 
1973 	qsort(abw->abw_buf, abw->abw_nbufs, sizeof (allocdby_bufctl_t),
1974 	    (int(*)(const void *, const void *))allocdby_cmp);
1975 
1976 	return (WALK_NEXT);
1977 }
1978 
1979 int
allocdby_walk_init(mdb_walk_state_t * wsp)1980 allocdby_walk_init(mdb_walk_state_t *wsp)
1981 {
1982 	return (allocdby_walk_init_common(wsp, "bufctl"));
1983 }
1984 
1985 int
freedby_walk_init(mdb_walk_state_t * wsp)1986 freedby_walk_init(mdb_walk_state_t *wsp)
1987 {
1988 	return (allocdby_walk_init_common(wsp, "freectl"));
1989 }
1990 
1991 int
allocdby_walk_step(mdb_walk_state_t * wsp)1992 allocdby_walk_step(mdb_walk_state_t *wsp)
1993 {
1994 	allocdby_walk_t *abw = wsp->walk_data;
1995 	kmem_bufctl_audit_t bc;
1996 	uintptr_t addr;
1997 
1998 	if (abw->abw_ndx == abw->abw_nbufs)
1999 		return (WALK_DONE);
2000 
2001 	addr = abw->abw_buf[abw->abw_ndx++].abb_addr;
2002 
2003 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2004 		mdb_warn("couldn't read bufctl at %p", addr);
2005 		return (WALK_DONE);
2006 	}
2007 
2008 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
2009 }
2010 
2011 void
allocdby_walk_fini(mdb_walk_state_t * wsp)2012 allocdby_walk_fini(mdb_walk_state_t *wsp)
2013 {
2014 	allocdby_walk_t *abw = wsp->walk_data;
2015 
2016 	mdb_free(abw->abw_buf, sizeof (allocdby_bufctl_t) * abw->abw_size);
2017 	mdb_free(abw, sizeof (allocdby_walk_t));
2018 }
2019 
2020 /*ARGSUSED*/
2021 int
allocdby_walk(uintptr_t addr,const kmem_bufctl_audit_t * bcp,void * ignored)2022 allocdby_walk(uintptr_t addr, const kmem_bufctl_audit_t *bcp, void *ignored)
2023 {
2024 	char c[MDB_SYM_NAMLEN];
2025 	GElf_Sym sym;
2026 	int i;
2027 
2028 	mdb_printf("%0?p %12llx ", addr, bcp->bc_timestamp);
2029 	for (i = 0; i < bcp->bc_depth; i++) {
2030 		if (mdb_lookup_by_addr(bcp->bc_stack[i],
2031 		    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2032 			continue;
2033 		if (strncmp(c, "kmem_", 5) == 0)
2034 			continue;
2035 		mdb_printf("%s+0x%lx",
2036 		    c, bcp->bc_stack[i] - (uintptr_t)sym.st_value);
2037 		break;
2038 	}
2039 	mdb_printf("\n");
2040 
2041 	return (WALK_NEXT);
2042 }
2043 
2044 static int
allocdby_common(uintptr_t addr,uint_t flags,const char * w)2045 allocdby_common(uintptr_t addr, uint_t flags, const char *w)
2046 {
2047 	if (!(flags & DCMD_ADDRSPEC))
2048 		return (DCMD_USAGE);
2049 
2050 	mdb_printf("%-?s %12s %s\n", "BUFCTL", "TIMESTAMP", "CALLER");
2051 
2052 	if (mdb_pwalk(w, (mdb_walk_cb_t)allocdby_walk, NULL, addr) == -1) {
2053 		mdb_warn("can't walk '%s' for %p", w, addr);
2054 		return (DCMD_ERR);
2055 	}
2056 
2057 	return (DCMD_OK);
2058 }
2059 
2060 /*ARGSUSED*/
2061 int
allocdby(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)2062 allocdby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2063 {
2064 	return (allocdby_common(addr, flags, "allocdby"));
2065 }
2066 
2067 /*ARGSUSED*/
2068 int
freedby(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)2069 freedby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2070 {
2071 	return (allocdby_common(addr, flags, "freedby"));
2072 }
2073 
2074 /*
2075  * Return a string describing the address in relation to the given thread's
2076  * stack.
2077  *
2078  * - If the thread state is TS_FREE, return " (inactive interrupt thread)".
2079  *
2080  * - If the address is above the stack pointer, return an empty string
2081  *   signifying that the address is active.
2082  *
2083  * - If the address is below the stack pointer, and the thread is not on proc,
2084  *   return " (below sp)".
2085  *
2086  * - If the address is below the stack pointer, and the thread is on proc,
2087  *   return " (possibly below sp)".  Depending on context, we may or may not
2088  *   have an accurate t_sp.
2089  */
2090 static const char *
stack_active(const kthread_t * t,uintptr_t addr)2091 stack_active(const kthread_t *t, uintptr_t addr)
2092 {
2093 	uintptr_t panicstk;
2094 	GElf_Sym sym;
2095 
2096 	if (t->t_state == TS_FREE)
2097 		return (" (inactive interrupt thread)");
2098 
2099 	/*
2100 	 * Check to see if we're on the panic stack.  If so, ignore t_sp, as it
2101 	 * no longer relates to the thread's real stack.
2102 	 */
2103 	if (mdb_lookup_by_name("panic_stack", &sym) == 0) {
2104 		panicstk = (uintptr_t)sym.st_value;
2105 
2106 		if (t->t_sp >= panicstk && t->t_sp < panicstk + PANICSTKSIZE)
2107 			return ("");
2108 	}
2109 
2110 	if (addr >= t->t_sp + STACK_BIAS)
2111 		return ("");
2112 
2113 	if (t->t_state == TS_ONPROC)
2114 		return (" (possibly below sp)");
2115 
2116 	return (" (below sp)");
2117 }
2118 
2119 /*
2120  * Additional state for the kmem and vmem ::whatis handlers
2121  */
2122 typedef struct whatis_info {
2123 	mdb_whatis_t *wi_w;
2124 	const kmem_cache_t *wi_cache;
2125 	const vmem_t *wi_vmem;
2126 	vmem_t *wi_msb_arena;
2127 	size_t wi_slab_size;
2128 	uint_t wi_slab_found;
2129 	uint_t wi_kmem_lite_count;
2130 	uint_t wi_freemem;
2131 } whatis_info_t;
2132 
2133 /* call one of our dcmd functions with "-v" and the provided address */
2134 static void
whatis_call_printer(mdb_dcmd_f * dcmd,uintptr_t addr)2135 whatis_call_printer(mdb_dcmd_f *dcmd, uintptr_t addr)
2136 {
2137 	mdb_arg_t a;
2138 	a.a_type = MDB_TYPE_STRING;
2139 	a.a_un.a_str = "-v";
2140 
2141 	mdb_printf(":\n");
2142 	(void) (*dcmd)(addr, DCMD_ADDRSPEC, 1, &a);
2143 }
2144 
2145 static void
whatis_print_kmf_lite(uintptr_t btaddr,size_t count)2146 whatis_print_kmf_lite(uintptr_t btaddr, size_t count)
2147 {
2148 #define	KMEM_LITE_MAX	16
2149 	pc_t callers[KMEM_LITE_MAX];
2150 	pc_t uninit = (pc_t)KMEM_UNINITIALIZED_PATTERN;
2151 
2152 	kmem_buftag_t bt;
2153 	intptr_t stat;
2154 	const char *plural = "";
2155 	int i;
2156 
2157 	/* validate our arguments and read in the buftag */
2158 	if (count == 0 || count > KMEM_LITE_MAX ||
2159 	    mdb_vread(&bt, sizeof (bt), btaddr) == -1)
2160 		return;
2161 
2162 	/* validate the buffer state and read in the callers */
2163 	stat = (intptr_t)bt.bt_bufctl ^ bt.bt_bxstat;
2164 
2165 	if (stat != KMEM_BUFTAG_ALLOC && stat != KMEM_BUFTAG_FREE)
2166 		return;
2167 
2168 	if (mdb_vread(callers, count * sizeof (pc_t),
2169 	    btaddr + offsetof(kmem_buftag_lite_t, bt_history)) == -1)
2170 		return;
2171 
2172 	/* If there aren't any filled in callers, bail */
2173 	if (callers[0] == uninit)
2174 		return;
2175 
2176 	plural = (callers[1] == uninit) ? "" : "s";
2177 
2178 	/* Everything's done and checked; print them out */
2179 	mdb_printf(":\n");
2180 
2181 	mdb_inc_indent(8);
2182 	mdb_printf("recent caller%s: %a", plural, callers[0]);
2183 	for (i = 1; i < count; i++) {
2184 		if (callers[i] == uninit)
2185 			break;
2186 		mdb_printf(", %a", callers[i]);
2187 	}
2188 	mdb_dec_indent(8);
2189 }
2190 
2191 static void
whatis_print_kmem(whatis_info_t * wi,uintptr_t maddr,uintptr_t addr,uintptr_t baddr)2192 whatis_print_kmem(whatis_info_t *wi, uintptr_t maddr, uintptr_t addr,
2193     uintptr_t baddr)
2194 {
2195 	mdb_whatis_t *w = wi->wi_w;
2196 
2197 	const kmem_cache_t *cp = wi->wi_cache;
2198 	/* LINTED pointer cast may result in improper alignment */
2199 	uintptr_t btaddr = (uintptr_t)KMEM_BUFTAG(cp, addr);
2200 	int quiet = (mdb_whatis_flags(w) & WHATIS_QUIET);
2201 	int call_printer = (!quiet && (cp->cache_flags & KMF_AUDIT));
2202 
2203 	mdb_whatis_report_object(w, maddr, addr, "");
2204 
2205 	if (baddr != 0 && !call_printer)
2206 		mdb_printf("bufctl %p ", baddr);
2207 
2208 	mdb_printf("%s from %s",
2209 	    (wi->wi_freemem == FALSE) ? "allocated" : "freed", cp->cache_name);
2210 
2211 	if (baddr != 0 && call_printer) {
2212 		whatis_call_printer(bufctl, baddr);
2213 		return;
2214 	}
2215 
2216 	/* for KMF_LITE caches, try to print out the previous callers */
2217 	if (!quiet && (cp->cache_flags & KMF_LITE))
2218 		whatis_print_kmf_lite(btaddr, wi->wi_kmem_lite_count);
2219 
2220 	mdb_printf("\n");
2221 }
2222 
2223 /*ARGSUSED*/
2224 static int
whatis_walk_kmem(uintptr_t addr,void * ignored,whatis_info_t * wi)2225 whatis_walk_kmem(uintptr_t addr, void *ignored, whatis_info_t *wi)
2226 {
2227 	mdb_whatis_t *w = wi->wi_w;
2228 
2229 	uintptr_t cur;
2230 	size_t size = wi->wi_cache->cache_bufsize;
2231 
2232 	while (mdb_whatis_match(w, addr, size, &cur))
2233 		whatis_print_kmem(wi, cur, addr, NULL);
2234 
2235 	return (WHATIS_WALKRET(w));
2236 }
2237 
2238 /*ARGSUSED*/
2239 static int
whatis_walk_bufctl(uintptr_t baddr,const kmem_bufctl_t * bcp,whatis_info_t * wi)2240 whatis_walk_bufctl(uintptr_t baddr, const kmem_bufctl_t *bcp, whatis_info_t *wi)
2241 {
2242 	mdb_whatis_t *w = wi->wi_w;
2243 
2244 	uintptr_t cur;
2245 	uintptr_t addr = (uintptr_t)bcp->bc_addr;
2246 	size_t size = wi->wi_cache->cache_bufsize;
2247 
2248 	while (mdb_whatis_match(w, addr, size, &cur))
2249 		whatis_print_kmem(wi, cur, addr, baddr);
2250 
2251 	return (WHATIS_WALKRET(w));
2252 }
2253 
2254 static int
whatis_walk_seg(uintptr_t addr,const vmem_seg_t * vs,whatis_info_t * wi)2255 whatis_walk_seg(uintptr_t addr, const vmem_seg_t *vs, whatis_info_t *wi)
2256 {
2257 	mdb_whatis_t *w = wi->wi_w;
2258 
2259 	size_t size = vs->vs_end - vs->vs_start;
2260 	uintptr_t cur;
2261 
2262 	/* We're not interested in anything but alloc and free segments */
2263 	if (vs->vs_type != VMEM_ALLOC && vs->vs_type != VMEM_FREE)
2264 		return (WALK_NEXT);
2265 
2266 	while (mdb_whatis_match(w, vs->vs_start, size, &cur)) {
2267 		mdb_whatis_report_object(w, cur, vs->vs_start, "");
2268 
2269 		/*
2270 		 * If we're not printing it seperately, provide the vmem_seg
2271 		 * pointer if it has a stack trace.
2272 		 */
2273 		if ((mdb_whatis_flags(w) & WHATIS_QUIET) &&
2274 		    (!(mdb_whatis_flags(w) & WHATIS_BUFCTL) ||
2275 		    (vs->vs_type == VMEM_ALLOC && vs->vs_depth != 0))) {
2276 			mdb_printf("vmem_seg %p ", addr);
2277 		}
2278 
2279 		mdb_printf("%s from the %s vmem arena",
2280 		    (vs->vs_type == VMEM_ALLOC) ? "allocated" : "freed",
2281 		    wi->wi_vmem->vm_name);
2282 
2283 		if (!(mdb_whatis_flags(w) & WHATIS_QUIET))
2284 			whatis_call_printer(vmem_seg, addr);
2285 		else
2286 			mdb_printf("\n");
2287 	}
2288 
2289 	return (WHATIS_WALKRET(w));
2290 }
2291 
2292 static int
whatis_walk_vmem(uintptr_t addr,const vmem_t * vmem,whatis_info_t * wi)2293 whatis_walk_vmem(uintptr_t addr, const vmem_t *vmem, whatis_info_t *wi)
2294 {
2295 	mdb_whatis_t *w = wi->wi_w;
2296 	const char *nm = vmem->vm_name;
2297 
2298 	int identifier = ((vmem->vm_cflags & VMC_IDENTIFIER) != 0);
2299 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2300 
2301 	if (identifier != idspace)
2302 		return (WALK_NEXT);
2303 
2304 	wi->wi_vmem = vmem;
2305 
2306 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2307 		mdb_printf("Searching vmem arena %s...\n", nm);
2308 
2309 	if (mdb_pwalk("vmem_seg",
2310 	    (mdb_walk_cb_t)whatis_walk_seg, wi, addr) == -1) {
2311 		mdb_warn("can't walk vmem_seg for %p", addr);
2312 		return (WALK_NEXT);
2313 	}
2314 
2315 	return (WHATIS_WALKRET(w));
2316 }
2317 
2318 /*ARGSUSED*/
2319 static int
whatis_walk_slab(uintptr_t saddr,const kmem_slab_t * sp,whatis_info_t * wi)2320 whatis_walk_slab(uintptr_t saddr, const kmem_slab_t *sp, whatis_info_t *wi)
2321 {
2322 	mdb_whatis_t *w = wi->wi_w;
2323 
2324 	/* It must overlap with the slab data, or it's not interesting */
2325 	if (mdb_whatis_overlaps(w,
2326 	    (uintptr_t)sp->slab_base, wi->wi_slab_size)) {
2327 		wi->wi_slab_found++;
2328 		return (WALK_DONE);
2329 	}
2330 	return (WALK_NEXT);
2331 }
2332 
2333 static int
whatis_walk_cache(uintptr_t addr,const kmem_cache_t * c,whatis_info_t * wi)2334 whatis_walk_cache(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2335 {
2336 	mdb_whatis_t *w = wi->wi_w;
2337 
2338 	char *walk, *freewalk;
2339 	mdb_walk_cb_t func;
2340 	int do_bufctl;
2341 
2342 	int identifier = ((c->cache_flags & KMC_IDENTIFIER) != 0);
2343 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2344 
2345 	if (identifier != idspace)
2346 		return (WALK_NEXT);
2347 
2348 	/* Override the '-b' flag as necessary */
2349 	if (!(c->cache_flags & KMF_HASH))
2350 		do_bufctl = FALSE;	/* no bufctls to walk */
2351 	else if (c->cache_flags & KMF_AUDIT)
2352 		do_bufctl = TRUE;	/* we always want debugging info */
2353 	else
2354 		do_bufctl = ((mdb_whatis_flags(w) & WHATIS_BUFCTL) != 0);
2355 
2356 	if (do_bufctl) {
2357 		walk = "bufctl";
2358 		freewalk = "freectl";
2359 		func = (mdb_walk_cb_t)whatis_walk_bufctl;
2360 	} else {
2361 		walk = "kmem";
2362 		freewalk = "freemem";
2363 		func = (mdb_walk_cb_t)whatis_walk_kmem;
2364 	}
2365 
2366 	wi->wi_cache = c;
2367 
2368 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2369 		mdb_printf("Searching %s...\n", c->cache_name);
2370 
2371 	/*
2372 	 * If more then two buffers live on each slab, figure out if we're
2373 	 * interested in anything in any slab before doing the more expensive
2374 	 * kmem/freemem (bufctl/freectl) walkers.
2375 	 */
2376 	wi->wi_slab_size = c->cache_slabsize - c->cache_maxcolor;
2377 	if (!(c->cache_flags & KMF_HASH))
2378 		wi->wi_slab_size -= sizeof (kmem_slab_t);
2379 
2380 	if ((wi->wi_slab_size / c->cache_chunksize) > 2) {
2381 		wi->wi_slab_found = 0;
2382 		if (mdb_pwalk("kmem_slab", (mdb_walk_cb_t)whatis_walk_slab, wi,
2383 		    addr) == -1) {
2384 			mdb_warn("can't find kmem_slab walker");
2385 			return (WALK_DONE);
2386 		}
2387 		if (wi->wi_slab_found == 0)
2388 			return (WALK_NEXT);
2389 	}
2390 
2391 	wi->wi_freemem = FALSE;
2392 	if (mdb_pwalk(walk, func, wi, addr) == -1) {
2393 		mdb_warn("can't find %s walker", walk);
2394 		return (WALK_DONE);
2395 	}
2396 
2397 	if (mdb_whatis_done(w))
2398 		return (WALK_DONE);
2399 
2400 	/*
2401 	 * We have searched for allocated memory; now search for freed memory.
2402 	 */
2403 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2404 		mdb_printf("Searching %s for free memory...\n", c->cache_name);
2405 
2406 	wi->wi_freemem = TRUE;
2407 	if (mdb_pwalk(freewalk, func, wi, addr) == -1) {
2408 		mdb_warn("can't find %s walker", freewalk);
2409 		return (WALK_DONE);
2410 	}
2411 
2412 	return (WHATIS_WALKRET(w));
2413 }
2414 
2415 static int
whatis_walk_touch(uintptr_t addr,const kmem_cache_t * c,whatis_info_t * wi)2416 whatis_walk_touch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2417 {
2418 	if (c->cache_arena == wi->wi_msb_arena ||
2419 	    (c->cache_cflags & KMC_NOTOUCH))
2420 		return (WALK_NEXT);
2421 
2422 	return (whatis_walk_cache(addr, c, wi));
2423 }
2424 
2425 static int
whatis_walk_metadata(uintptr_t addr,const kmem_cache_t * c,whatis_info_t * wi)2426 whatis_walk_metadata(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2427 {
2428 	if (c->cache_arena != wi->wi_msb_arena)
2429 		return (WALK_NEXT);
2430 
2431 	return (whatis_walk_cache(addr, c, wi));
2432 }
2433 
2434 static int
whatis_walk_notouch(uintptr_t addr,const kmem_cache_t * c,whatis_info_t * wi)2435 whatis_walk_notouch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2436 {
2437 	if (c->cache_arena == wi->wi_msb_arena ||
2438 	    !(c->cache_cflags & KMC_NOTOUCH))
2439 		return (WALK_NEXT);
2440 
2441 	return (whatis_walk_cache(addr, c, wi));
2442 }
2443 
2444 static int
whatis_walk_thread(uintptr_t addr,const kthread_t * t,mdb_whatis_t * w)2445 whatis_walk_thread(uintptr_t addr, const kthread_t *t, mdb_whatis_t *w)
2446 {
2447 	uintptr_t cur;
2448 	uintptr_t saddr;
2449 	size_t size;
2450 
2451 	/*
2452 	 * Often, one calls ::whatis on an address from a thread structure.
2453 	 * We use this opportunity to short circuit this case...
2454 	 */
2455 	while (mdb_whatis_match(w, addr, sizeof (kthread_t), &cur))
2456 		mdb_whatis_report_object(w, cur, addr,
2457 		    "allocated as a thread structure\n");
2458 
2459 	/*
2460 	 * Now check the stack
2461 	 */
2462 	if (t->t_stkbase == NULL)
2463 		return (WALK_NEXT);
2464 
2465 	/*
2466 	 * This assumes that t_stk is the end of the stack, but it's really
2467 	 * only the initial stack pointer for the thread.  Arguments to the
2468 	 * initial procedure, SA(MINFRAME), etc. are all after t_stk.  So
2469 	 * that 't->t_stk::whatis' reports "part of t's stack", we include
2470 	 * t_stk in the range (the "+ 1", below), but the kernel should
2471 	 * really include the full stack bounds where we can find it.
2472 	 */
2473 	saddr = (uintptr_t)t->t_stkbase;
2474 	size = (uintptr_t)t->t_stk - saddr + 1;
2475 	while (mdb_whatis_match(w, saddr, size, &cur))
2476 		mdb_whatis_report_object(w, cur, cur,
2477 		    "in thread %p's stack%s\n", addr, stack_active(t, cur));
2478 
2479 	return (WHATIS_WALKRET(w));
2480 }
2481 
2482 static void
whatis_modctl_match(mdb_whatis_t * w,const char * name,uintptr_t base,size_t size,const char * where)2483 whatis_modctl_match(mdb_whatis_t *w, const char *name,
2484     uintptr_t base, size_t size, const char *where)
2485 {
2486 	uintptr_t cur;
2487 
2488 	/*
2489 	 * Since we're searching for addresses inside a module, we report
2490 	 * them as symbols.
2491 	 */
2492 	while (mdb_whatis_match(w, base, size, &cur))
2493 		mdb_whatis_report_address(w, cur, "in %s's %s\n", name, where);
2494 }
2495 
2496 static int
whatis_walk_modctl(uintptr_t addr,const struct modctl * m,mdb_whatis_t * w)2497 whatis_walk_modctl(uintptr_t addr, const struct modctl *m, mdb_whatis_t *w)
2498 {
2499 	char name[MODMAXNAMELEN];
2500 	struct module mod;
2501 	Shdr shdr;
2502 
2503 	if (m->mod_mp == NULL)
2504 		return (WALK_NEXT);
2505 
2506 	if (mdb_vread(&mod, sizeof (mod), (uintptr_t)m->mod_mp) == -1) {
2507 		mdb_warn("couldn't read modctl %p's module", addr);
2508 		return (WALK_NEXT);
2509 	}
2510 
2511 	if (mdb_readstr(name, sizeof (name), (uintptr_t)m->mod_modname) == -1)
2512 		(void) mdb_snprintf(name, sizeof (name), "0x%p", addr);
2513 
2514 	whatis_modctl_match(w, name,
2515 	    (uintptr_t)mod.text, mod.text_size, "text segment");
2516 	whatis_modctl_match(w, name,
2517 	    (uintptr_t)mod.data, mod.data_size, "data segment");
2518 	whatis_modctl_match(w, name,
2519 	    (uintptr_t)mod.bss, mod.bss_size, "bss segment");
2520 
2521 	if (mdb_vread(&shdr, sizeof (shdr), (uintptr_t)mod.symhdr) == -1) {
2522 		mdb_warn("couldn't read symbol header for %p's module", addr);
2523 		return (WALK_NEXT);
2524 	}
2525 
2526 	whatis_modctl_match(w, name,
2527 	    (uintptr_t)mod.symtbl, mod.nsyms * shdr.sh_entsize, "symtab");
2528 	whatis_modctl_match(w, name,
2529 	    (uintptr_t)mod.symspace, mod.symsize, "symtab");
2530 
2531 	return (WHATIS_WALKRET(w));
2532 }
2533 
2534 /*ARGSUSED*/
2535 static int
whatis_walk_memseg(uintptr_t addr,const struct memseg * seg,mdb_whatis_t * w)2536 whatis_walk_memseg(uintptr_t addr, const struct memseg *seg, mdb_whatis_t *w)
2537 {
2538 	uintptr_t cur;
2539 
2540 	uintptr_t base = (uintptr_t)seg->pages;
2541 	size_t size = (uintptr_t)seg->epages - base;
2542 
2543 	while (mdb_whatis_match(w, base, size, &cur)) {
2544 		/* round our found pointer down to the page_t base. */
2545 		size_t offset = (cur - base) % sizeof (page_t);
2546 
2547 		mdb_whatis_report_object(w, cur, cur - offset,
2548 		    "allocated as a page structure\n");
2549 	}
2550 
2551 	return (WHATIS_WALKRET(w));
2552 }
2553 
2554 /*ARGSUSED*/
2555 static int
whatis_run_modules(mdb_whatis_t * w,void * arg)2556 whatis_run_modules(mdb_whatis_t *w, void *arg)
2557 {
2558 	if (mdb_walk("modctl", (mdb_walk_cb_t)whatis_walk_modctl, w) == -1) {
2559 		mdb_warn("couldn't find modctl walker");
2560 		return (1);
2561 	}
2562 	return (0);
2563 }
2564 
2565 /*ARGSUSED*/
2566 static int
whatis_run_threads(mdb_whatis_t * w,void * ignored)2567 whatis_run_threads(mdb_whatis_t *w, void *ignored)
2568 {
2569 	/*
2570 	 * Now search all thread stacks.  Yes, this is a little weak; we
2571 	 * can save a lot of work by first checking to see if the
2572 	 * address is in segkp vs. segkmem.  But hey, computers are
2573 	 * fast.
2574 	 */
2575 	if (mdb_walk("thread", (mdb_walk_cb_t)whatis_walk_thread, w) == -1) {
2576 		mdb_warn("couldn't find thread walker");
2577 		return (1);
2578 	}
2579 	return (0);
2580 }
2581 
2582 /*ARGSUSED*/
2583 static int
whatis_run_pages(mdb_whatis_t * w,void * ignored)2584 whatis_run_pages(mdb_whatis_t *w, void *ignored)
2585 {
2586 	if (mdb_walk("memseg", (mdb_walk_cb_t)whatis_walk_memseg, w) == -1) {
2587 		mdb_warn("couldn't find memseg walker");
2588 		return (1);
2589 	}
2590 	return (0);
2591 }
2592 
2593 /*ARGSUSED*/
2594 static int
whatis_run_kmem(mdb_whatis_t * w,void * ignored)2595 whatis_run_kmem(mdb_whatis_t *w, void *ignored)
2596 {
2597 	whatis_info_t wi;
2598 
2599 	bzero(&wi, sizeof (wi));
2600 	wi.wi_w = w;
2601 
2602 	if (mdb_readvar(&wi.wi_msb_arena, "kmem_msb_arena") == -1)
2603 		mdb_warn("unable to readvar \"kmem_msb_arena\"");
2604 
2605 	if (mdb_readvar(&wi.wi_kmem_lite_count,
2606 	    "kmem_lite_count") == -1 || wi.wi_kmem_lite_count > 16)
2607 		wi.wi_kmem_lite_count = 0;
2608 
2609 	/*
2610 	 * We process kmem caches in the following order:
2611 	 *
2612 	 *	non-KMC_NOTOUCH, non-metadata	(typically the most interesting)
2613 	 *	metadata			(can be huge with KMF_AUDIT)
2614 	 *	KMC_NOTOUCH, non-metadata	(see kmem_walk_all())
2615 	 */
2616 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_touch,
2617 	    &wi) == -1 ||
2618 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_metadata,
2619 	    &wi) == -1 ||
2620 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_notouch,
2621 	    &wi) == -1) {
2622 		mdb_warn("couldn't find kmem_cache walker");
2623 		return (1);
2624 	}
2625 	return (0);
2626 }
2627 
2628 /*ARGSUSED*/
2629 static int
whatis_run_vmem(mdb_whatis_t * w,void * ignored)2630 whatis_run_vmem(mdb_whatis_t *w, void *ignored)
2631 {
2632 	whatis_info_t wi;
2633 
2634 	bzero(&wi, sizeof (wi));
2635 	wi.wi_w = w;
2636 
2637 	if (mdb_walk("vmem_postfix",
2638 	    (mdb_walk_cb_t)whatis_walk_vmem, &wi) == -1) {
2639 		mdb_warn("couldn't find vmem_postfix walker");
2640 		return (1);
2641 	}
2642 	return (0);
2643 }
2644 
/*
 * Address range [kmc_low, kmc_high) of one CPU's current transaction log
 * chunk, as computed by kmem_log().
 */
typedef struct kmem_log_cpu {
	uintptr_t kmc_low;	/* start of the CPU's current chunk */
	uintptr_t kmc_high;	/* the CPU's clh_current pointer */
} kmem_log_cpu_t;
2649 
2650 typedef struct kmem_log_data {
2651 	uintptr_t kmd_addr;
2652 	kmem_log_cpu_t *kmd_cpu;
2653 } kmem_log_data_t;
2654 
2655 int
kmem_log_walk(uintptr_t addr,const kmem_bufctl_audit_t * b,kmem_log_data_t * kmd)2656 kmem_log_walk(uintptr_t addr, const kmem_bufctl_audit_t *b,
2657     kmem_log_data_t *kmd)
2658 {
2659 	int i;
2660 	kmem_log_cpu_t *kmc = kmd->kmd_cpu;
2661 	size_t bufsize;
2662 
2663 	for (i = 0; i < NCPU; i++) {
2664 		if (addr >= kmc[i].kmc_low && addr < kmc[i].kmc_high)
2665 			break;
2666 	}
2667 
2668 	if (kmd->kmd_addr) {
2669 		if (b->bc_cache == NULL)
2670 			return (WALK_NEXT);
2671 
2672 		if (mdb_vread(&bufsize, sizeof (bufsize),
2673 		    (uintptr_t)&b->bc_cache->cache_bufsize) == -1) {
2674 			mdb_warn(
2675 			    "failed to read cache_bufsize for cache at %p",
2676 			    b->bc_cache);
2677 			return (WALK_ERR);
2678 		}
2679 
2680 		if (kmd->kmd_addr < (uintptr_t)b->bc_addr ||
2681 		    kmd->kmd_addr >= (uintptr_t)b->bc_addr + bufsize)
2682 			return (WALK_NEXT);
2683 	}
2684 
2685 	if (i == NCPU)
2686 		mdb_printf("   ");
2687 	else
2688 		mdb_printf("%3d", i);
2689 
2690 	mdb_printf(" %0?p %0?p %16llx %0?p\n", addr, b->bc_addr,
2691 	    b->bc_timestamp, b->bc_thread);
2692 
2693 	return (WALK_NEXT);
2694 }
2695 
2696 /*ARGSUSED*/
2697 int
kmem_log(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)2698 kmem_log(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2699 {
2700 	kmem_log_header_t lh;
2701 	kmem_cpu_log_header_t clh;
2702 	uintptr_t lhp, clhp;
2703 	int ncpus;
2704 	uintptr_t *cpu;
2705 	GElf_Sym sym;
2706 	kmem_log_cpu_t *kmc;
2707 	int i;
2708 	kmem_log_data_t kmd;
2709 	uint_t opt_b = FALSE;
2710 
2711 	if (mdb_getopts(argc, argv,
2712 	    'b', MDB_OPT_SETBITS, TRUE, &opt_b, NULL) != argc)
2713 		return (DCMD_USAGE);
2714 
2715 	if (mdb_readvar(&lhp, "kmem_transaction_log") == -1) {
2716 		mdb_warn("failed to read 'kmem_transaction_log'");
2717 		return (DCMD_ERR);
2718 	}
2719 
2720 	if (lhp == NULL) {
2721 		mdb_warn("no kmem transaction log\n");
2722 		return (DCMD_ERR);
2723 	}
2724 
2725 	mdb_readvar(&ncpus, "ncpus");
2726 
2727 	if (mdb_vread(&lh, sizeof (kmem_log_header_t), lhp) == -1) {
2728 		mdb_warn("failed to read log header at %p", lhp);
2729 		return (DCMD_ERR);
2730 	}
2731 
2732 	clhp = lhp + ((uintptr_t)&lh.lh_cpu[0] - (uintptr_t)&lh);
2733 
2734 	cpu = mdb_alloc(sizeof (uintptr_t) * NCPU, UM_SLEEP | UM_GC);
2735 
2736 	if (mdb_lookup_by_name("cpu", &sym) == -1) {
2737 		mdb_warn("couldn't find 'cpu' array");
2738 		return (DCMD_ERR);
2739 	}
2740 
2741 	if (sym.st_size != NCPU * sizeof (uintptr_t)) {
2742 		mdb_warn("expected 'cpu' to be of size %d; found %d\n",
2743 		    NCPU * sizeof (uintptr_t), sym.st_size);
2744 		return (DCMD_ERR);
2745 	}
2746 
2747 	if (mdb_vread(cpu, sym.st_size, (uintptr_t)sym.st_value) == -1) {
2748 		mdb_warn("failed to read cpu array at %p", sym.st_value);
2749 		return (DCMD_ERR);
2750 	}
2751 
2752 	kmc = mdb_zalloc(sizeof (kmem_log_cpu_t) * NCPU, UM_SLEEP | UM_GC);
2753 	kmd.kmd_addr = NULL;
2754 	kmd.kmd_cpu = kmc;
2755 
2756 	for (i = 0; i < NCPU; i++) {
2757 
2758 		if (cpu[i] == NULL)
2759 			continue;
2760 
2761 		if (mdb_vread(&clh, sizeof (clh), clhp) == -1) {
2762 			mdb_warn("cannot read cpu %d's log header at %p",
2763 			    i, clhp);
2764 			return (DCMD_ERR);
2765 		}
2766 
2767 		kmc[i].kmc_low = clh.clh_chunk * lh.lh_chunksize +
2768 		    (uintptr_t)lh.lh_base;
2769 		kmc[i].kmc_high = (uintptr_t)clh.clh_current;
2770 
2771 		clhp += sizeof (kmem_cpu_log_header_t);
2772 	}
2773 
2774 	mdb_printf("%3s %-?s %-?s %16s %-?s\n", "CPU", "ADDR", "BUFADDR",
2775 	    "TIMESTAMP", "THREAD");
2776 
2777 	/*
2778 	 * If we have been passed an address, print out only log entries
2779 	 * corresponding to that address.  If opt_b is specified, then interpret
2780 	 * the address as a bufctl.
2781 	 */
2782 	if (flags & DCMD_ADDRSPEC) {
2783 		kmem_bufctl_audit_t b;
2784 
2785 		if (opt_b) {
2786 			kmd.kmd_addr = addr;
2787 		} else {
2788 			if (mdb_vread(&b,
2789 			    sizeof (kmem_bufctl_audit_t), addr) == -1) {
2790 				mdb_warn("failed to read bufctl at %p", addr);
2791 				return (DCMD_ERR);
2792 			}
2793 
2794 			(void) kmem_log_walk(addr, &b, &kmd);
2795 
2796 			return (DCMD_OK);
2797 		}
2798 	}
2799 
2800 	if (mdb_walk("kmem_log", (mdb_walk_cb_t)kmem_log_walk, &kmd) == -1) {
2801 		mdb_warn("can't find kmem log walker");
2802 		return (DCMD_ERR);
2803 	}
2804 
2805 	return (DCMD_OK);
2806 }
2807 
2808 typedef struct bufctl_history_cb {
2809 	int		bhc_flags;
2810 	int		bhc_argc;
2811 	const mdb_arg_t	*bhc_argv;
2812 	int		bhc_ret;
2813 } bufctl_history_cb_t;
2814 
2815 /*ARGSUSED*/
2816 static int
bufctl_history_callback(uintptr_t addr,const void * ign,void * arg)2817 bufctl_history_callback(uintptr_t addr, const void *ign, void *arg)
2818 {
2819 	bufctl_history_cb_t *bhc = arg;
2820 
2821 	bhc->bhc_ret =
2822 	    bufctl(addr, bhc->bhc_flags, bhc->bhc_argc, bhc->bhc_argv);
2823 
2824 	bhc->bhc_flags &= ~DCMD_LOOPFIRST;
2825 
2826 	return ((bhc->bhc_ret == DCMD_OK)? WALK_NEXT : WALK_DONE);
2827 }
2828 
/*
 * Help text for the ::bufctl dcmd: a one-line summary followed by the
 * OPTIONS heading (printed two columns to the left of the body) and the
 * option descriptions.
 */
void
bufctl_help(void)
{
	static const char summary[] =
"Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n";
	static const char options[] =
"  -v    Display the full content of the bufctl, including its stack trace\n"
"  -h    retrieve the bufctl's transaction history, if available\n"
"  -a addr\n"
"        filter out bufctls not involving the buffer at addr\n"
"  -c caller\n"
"        filter out bufctls without the function/PC in their stack trace\n"
"  -e earliest\n"
"        filter out bufctls timestamped before earliest\n"
"  -l latest\n"
"        filter out bufctls timestamped after latest\n"
"  -t thread\n"
"        filter out bufctls not involving thread\n";

	mdb_printf("%s", summary);

	mdb_dec_indent(2);
	mdb_printf("%<b>OPTIONS%</b>\n");
	mdb_inc_indent(2);

	mdb_printf("%s", options);
}
2851 
2852 int
bufctl(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)2853 bufctl(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2854 {
2855 	kmem_bufctl_audit_t bc;
2856 	uint_t verbose = FALSE;
2857 	uint_t history = FALSE;
2858 	uint_t in_history = FALSE;
2859 	uintptr_t caller = NULL, thread = NULL;
2860 	uintptr_t laddr, haddr, baddr = NULL;
2861 	hrtime_t earliest = 0, latest = 0;
2862 	int i, depth;
2863 	char c[MDB_SYM_NAMLEN];
2864 	GElf_Sym sym;
2865 
2866 	if (mdb_getopts(argc, argv,
2867 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
2868 	    'h', MDB_OPT_SETBITS, TRUE, &history,
2869 	    'H', MDB_OPT_SETBITS, TRUE, &in_history,		/* internal */
2870 	    'c', MDB_OPT_UINTPTR, &caller,
2871 	    't', MDB_OPT_UINTPTR, &thread,
2872 	    'e', MDB_OPT_UINT64, &earliest,
2873 	    'l', MDB_OPT_UINT64, &latest,
2874 	    'a', MDB_OPT_UINTPTR, &baddr, NULL) != argc)
2875 		return (DCMD_USAGE);
2876 
2877 	if (!(flags & DCMD_ADDRSPEC))
2878 		return (DCMD_USAGE);
2879 
2880 	if (in_history && !history)
2881 		return (DCMD_USAGE);
2882 
2883 	if (history && !in_history) {
2884 		mdb_arg_t *nargv = mdb_zalloc(sizeof (*nargv) * (argc + 1),
2885 		    UM_SLEEP | UM_GC);
2886 		bufctl_history_cb_t bhc;
2887 
2888 		nargv[0].a_type = MDB_TYPE_STRING;
2889 		nargv[0].a_un.a_str = "-H";		/* prevent recursion */
2890 
2891 		for (i = 0; i < argc; i++)
2892 			nargv[i + 1] = argv[i];
2893 
2894 		/*
2895 		 * When in history mode, we treat each element as if it
2896 		 * were in a seperate loop, so that the headers group
2897 		 * bufctls with similar histories.
2898 		 */
2899 		bhc.bhc_flags = flags | DCMD_LOOP | DCMD_LOOPFIRST;
2900 		bhc.bhc_argc = argc + 1;
2901 		bhc.bhc_argv = nargv;
2902 		bhc.bhc_ret = DCMD_OK;
2903 
2904 		if (mdb_pwalk("bufctl_history", bufctl_history_callback, &bhc,
2905 		    addr) == -1) {
2906 			mdb_warn("unable to walk bufctl_history");
2907 			return (DCMD_ERR);
2908 		}
2909 
2910 		if (bhc.bhc_ret == DCMD_OK && !(flags & DCMD_PIPE_OUT))
2911 			mdb_printf("\n");
2912 
2913 		return (bhc.bhc_ret);
2914 	}
2915 
2916 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
2917 		if (verbose) {
2918 			mdb_printf("%16s %16s %16s %16s\n"
2919 			    "%<u>%16s %16s %16s %16s%</u>\n",
2920 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD",
2921 			    "", "CACHE", "LASTLOG", "CONTENTS");
2922 		} else {
2923 			mdb_printf("%<u>%-?s %-?s %-12s %-?s %s%</u>\n",
2924 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD", "CALLER");
2925 		}
2926 	}
2927 
2928 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2929 		mdb_warn("couldn't read bufctl at %p", addr);
2930 		return (DCMD_ERR);
2931 	}
2932 
2933 	/*
2934 	 * Guard against bogus bc_depth in case the bufctl is corrupt or
2935 	 * the address does not really refer to a bufctl.
2936 	 */
2937 	depth = MIN(bc.bc_depth, KMEM_STACK_DEPTH);
2938 
2939 	if (caller != NULL) {
2940 		laddr = caller;
2941 		haddr = caller + sizeof (caller);
2942 
2943 		if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c, sizeof (c),
2944 		    &sym) != -1 && caller == (uintptr_t)sym.st_value) {
2945 			/*
2946 			 * We were provided an exact symbol value; any
2947 			 * address in the function is valid.
2948 			 */
2949 			laddr = (uintptr_t)sym.st_value;
2950 			haddr = (uintptr_t)sym.st_value + sym.st_size;
2951 		}
2952 
2953 		for (i = 0; i < depth; i++)
2954 			if (bc.bc_stack[i] >= laddr && bc.bc_stack[i] < haddr)
2955 				break;
2956 
2957 		if (i == depth)
2958 			return (DCMD_OK);
2959 	}
2960 
2961 	if (thread != NULL && (uintptr_t)bc.bc_thread != thread)
2962 		return (DCMD_OK);
2963 
2964 	if (earliest != 0 && bc.bc_timestamp < earliest)
2965 		return (DCMD_OK);
2966 
2967 	if (latest != 0 && bc.bc_timestamp > latest)
2968 		return (DCMD_OK);
2969 
2970 	if (baddr != 0 && (uintptr_t)bc.bc_addr != baddr)
2971 		return (DCMD_OK);
2972 
2973 	if (flags & DCMD_PIPE_OUT) {
2974 		mdb_printf("%#lr\n", addr);
2975 		return (DCMD_OK);
2976 	}
2977 
2978 	if (verbose) {
2979 		mdb_printf(
2980 		    "%<b>%16p%</b> %16p %16llx %16p\n"
2981 		    "%16s %16p %16p %16p\n",
2982 		    addr, bc.bc_addr, bc.bc_timestamp, bc.bc_thread,
2983 		    "", bc.bc_cache, bc.bc_lastlog, bc.bc_contents);
2984 
2985 		mdb_inc_indent(17);
2986 		for (i = 0; i < depth; i++)
2987 			mdb_printf("%a\n", bc.bc_stack[i]);
2988 		mdb_dec_indent(17);
2989 		mdb_printf("\n");
2990 	} else {
2991 		mdb_printf("%0?p %0?p %12llx %0?p", addr, bc.bc_addr,
2992 		    bc.bc_timestamp, bc.bc_thread);
2993 
2994 		for (i = 0; i < depth; i++) {
2995 			if (mdb_lookup_by_addr(bc.bc_stack[i],
2996 			    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2997 				continue;
2998 			if (strncmp(c, "kmem_", 5) == 0)
2999 				continue;
3000 			mdb_printf(" %a\n", bc.bc_stack[i]);
3001 			break;
3002 		}
3003 
3004 		if (i >= depth)
3005 			mdb_printf("\n");
3006 	}
3007 
3008 	return (DCMD_OK);
3009 }
3010 
3011 typedef struct kmem_verify {
3012 	uint64_t *kmv_buf;		/* buffer to read cache contents into */
3013 	size_t kmv_size;		/* number of bytes in kmv_buf */
3014 	int kmv_corruption;		/* > 0 if corruption found. */
3015 	uint_t kmv_flags;		/* dcmd flags */
3016 	struct kmem_cache kmv_cache;	/* the cache we're operating on */
3017 } kmem_verify_t;
3018 
/*
 * verify_pattern()
 *	Check that the size bytes starting at buf_arg consist entirely of
 *	the 64-bit pattern pat, comparing one uint64_t at a time.
 *
 *	Returns the byte offset of the first word that differs from pat,
 *	or -1 if every word matches.
 */
static int64_t
verify_pattern(uint64_t *buf_arg, size_t size, uint64_t pat)
{
	/*LINTED*/
	uint64_t *end = (uint64_t *)((char *)buf_arg + size);
	uint64_t *cur = buf_arg;

	while (cur < end) {
		if (*cur != pat)
			return ((uintptr_t)cur - (uintptr_t)buf_arg);
		cur++;
	}

	return (-1);
}
3035 
3036 /*
3037  * verify_buftag()
3038  *	verify that btp->bt_bxstat == (bcp ^ pat)
3039  */
3040 static int
verify_buftag(kmem_buftag_t * btp,uintptr_t pat)3041 verify_buftag(kmem_buftag_t *btp, uintptr_t pat)
3042 {
3043 	return (btp->bt_bxstat == ((intptr_t)btp->bt_bufctl ^ pat) ? 0 : -1);
3044 }
3045 
3046 /*
3047  * verify_free()
3048  * 	verify the integrity of a free block of memory by checking
3049  * 	that it is filled with 0xdeadbeef and that its buftag is sane.
3050  */
3051 /*ARGSUSED1*/
3052 static int
verify_free(uintptr_t addr,const void * data,void * private)3053 verify_free(uintptr_t addr, const void *data, void *private)
3054 {
3055 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3056 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3057 	int64_t corrupt;		/* corruption offset */
3058 	kmem_buftag_t *buftagp;		/* ptr to buftag */
3059 	kmem_cache_t *cp = &kmv->kmv_cache;
3060 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3061 
3062 	/*LINTED*/
3063 	buftagp = KMEM_BUFTAG(cp, buf);
3064 
3065 	/*
3066 	 * Read the buffer to check.
3067 	 */
3068 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3069 		if (!besilent)
3070 			mdb_warn("couldn't read %p", addr);
3071 		return (WALK_NEXT);
3072 	}
3073 
3074 	if ((corrupt = verify_pattern(buf, cp->cache_verify,
3075 	    KMEM_FREE_PATTERN)) >= 0) {
3076 		if (!besilent)
3077 			mdb_printf("buffer %p (free) seems corrupted, at %p\n",
3078 			    addr, (uintptr_t)addr + corrupt);
3079 		goto corrupt;
3080 	}
3081 	/*
3082 	 * When KMF_LITE is set, buftagp->bt_redzone is used to hold
3083 	 * the first bytes of the buffer, hence we cannot check for red
3084 	 * zone corruption.
3085 	 */
3086 	if ((cp->cache_flags & (KMF_HASH | KMF_LITE)) == KMF_HASH &&
3087 	    buftagp->bt_redzone != KMEM_REDZONE_PATTERN) {
3088 		if (!besilent)
3089 			mdb_printf("buffer %p (free) seems to "
3090 			    "have a corrupt redzone pattern\n", addr);
3091 		goto corrupt;
3092 	}
3093 
3094 	/*
3095 	 * confirm bufctl pointer integrity.
3096 	 */
3097 	if (verify_buftag(buftagp, KMEM_BUFTAG_FREE) == -1) {
3098 		if (!besilent)
3099 			mdb_printf("buffer %p (free) has a corrupt "
3100 			    "buftag\n", addr);
3101 		goto corrupt;
3102 	}
3103 
3104 	return (WALK_NEXT);
3105 corrupt:
3106 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3107 		mdb_printf("%p\n", addr);
3108 	kmv->kmv_corruption++;
3109 	return (WALK_NEXT);
3110 }
3111 
3112 /*
3113  * verify_alloc()
3114  * 	Verify that the buftag of an allocated buffer makes sense with respect
3115  * 	to the buffer.
3116  */
3117 /*ARGSUSED1*/
3118 static int
verify_alloc(uintptr_t addr,const void * data,void * private)3119 verify_alloc(uintptr_t addr, const void *data, void *private)
3120 {
3121 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3122 	kmem_cache_t *cp = &kmv->kmv_cache;
3123 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3124 	/*LINTED*/
3125 	kmem_buftag_t *buftagp = KMEM_BUFTAG(cp, buf);
3126 	uint32_t *ip = (uint32_t *)buftagp;
3127 	uint8_t *bp = (uint8_t *)buf;
3128 	int looks_ok = 0, size_ok = 1;	/* flags for finding corruption */
3129 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3130 
3131 	/*
3132 	 * Read the buffer to check.
3133 	 */
3134 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3135 		if (!besilent)
3136 			mdb_warn("couldn't read %p", addr);
3137 		return (WALK_NEXT);
3138 	}
3139 
3140 	/*
3141 	 * There are two cases to handle:
3142 	 * 1. If the buf was alloc'd using kmem_cache_alloc, it will have
3143 	 *    0xfeedfacefeedface at the end of it
3144 	 * 2. If the buf was alloc'd using kmem_alloc, it will have
3145 	 *    0xbb just past the end of the region in use.  At the buftag,
3146 	 *    it will have 0xfeedface (or, if the whole buffer is in use,
3147 	 *    0xfeedface & bb000000 or 0xfeedfacf & 000000bb depending on
3148 	 *    endianness), followed by 32 bits containing the offset of the
3149 	 *    0xbb byte in the buffer.
3150 	 *
3151 	 * Finally, the two 32-bit words that comprise the second half of the
3152 	 * buftag should xor to KMEM_BUFTAG_ALLOC
3153 	 */
3154 
3155 	if (buftagp->bt_redzone == KMEM_REDZONE_PATTERN)
3156 		looks_ok = 1;
3157 	else if (!KMEM_SIZE_VALID(ip[1]))
3158 		size_ok = 0;
3159 	else if (bp[KMEM_SIZE_DECODE(ip[1])] == KMEM_REDZONE_BYTE)
3160 		looks_ok = 1;
3161 	else
3162 		size_ok = 0;
3163 
3164 	if (!size_ok) {
3165 		if (!besilent)
3166 			mdb_printf("buffer %p (allocated) has a corrupt "
3167 			    "redzone size encoding\n", addr);
3168 		goto corrupt;
3169 	}
3170 
3171 	if (!looks_ok) {
3172 		if (!besilent)
3173 			mdb_printf("buffer %p (allocated) has a corrupt "
3174 			    "redzone signature\n", addr);
3175 		goto corrupt;
3176 	}
3177 
3178 	if (verify_buftag(buftagp, KMEM_BUFTAG_ALLOC) == -1) {
3179 		if (!besilent)
3180 			mdb_printf("buffer %p (allocated) has a "
3181 			    "corrupt buftag\n", addr);
3182 		goto corrupt;
3183 	}
3184 
3185 	return (WALK_NEXT);
3186 corrupt:
3187 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3188 		mdb_printf("%p\n", addr);
3189 
3190 	kmv->kmv_corruption++;
3191 	return (WALK_NEXT);
3192 }
3193 
3194 /*ARGSUSED2*/
3195 int
kmem_verify(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)3196 kmem_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3197 {
3198 	if (flags & DCMD_ADDRSPEC) {
3199 		int check_alloc = 0, check_free = 0;
3200 		kmem_verify_t kmv;
3201 
3202 		if (mdb_vread(&kmv.kmv_cache, sizeof (kmv.kmv_cache),
3203 		    addr) == -1) {
3204 			mdb_warn("couldn't read kmem_cache %p", addr);
3205 			return (DCMD_ERR);
3206 		}
3207 
3208 		if ((kmv.kmv_cache.cache_dump.kd_unsafe ||
3209 		    kmv.kmv_cache.cache_dump.kd_alloc_fails) &&
3210 		    !(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3211 			mdb_warn("WARNING: cache was used during dump: "
3212 			    "corruption may be incorrectly reported\n");
3213 		}
3214 
3215 		kmv.kmv_size = kmv.kmv_cache.cache_buftag +
3216 		    sizeof (kmem_buftag_t);
3217 		kmv.kmv_buf = mdb_alloc(kmv.kmv_size, UM_SLEEP | UM_GC);
3218 		kmv.kmv_corruption = 0;
3219 		kmv.kmv_flags = flags;
3220 
3221 		if ((kmv.kmv_cache.cache_flags & KMF_REDZONE)) {
3222 			check_alloc = 1;
3223 			if (kmv.kmv_cache.cache_flags & KMF_DEADBEEF)
3224 				check_free = 1;
3225 		} else {
3226 			if (!(flags & DCMD_LOOP)) {
3227 				mdb_warn("cache %p (%s) does not have "
3228 				    "redzone checking enabled\n", addr,
3229 				    kmv.kmv_cache.cache_name);
3230 			}
3231 			return (DCMD_ERR);
3232 		}
3233 
3234 		if (!(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3235 			mdb_printf("Summary for cache '%s'\n",
3236 			    kmv.kmv_cache.cache_name);
3237 			mdb_inc_indent(2);
3238 		}
3239 
3240 		if (check_alloc)
3241 			(void) mdb_pwalk("kmem", verify_alloc, &kmv, addr);
3242 		if (check_free)
3243 			(void) mdb_pwalk("freemem", verify_free, &kmv, addr);
3244 
3245 		if (!(flags & DCMD_PIPE_OUT)) {
3246 			if (flags & DCMD_LOOP) {
3247 				if (kmv.kmv_corruption == 0) {
3248 					mdb_printf("%-*s %?p clean\n",
3249 					    KMEM_CACHE_NAMELEN,
3250 					    kmv.kmv_cache.cache_name, addr);
3251 				} else {
3252 					mdb_printf("%-*s %?p %d corrupt "
3253 					    "buffer%s\n", KMEM_CACHE_NAMELEN,
3254 					    kmv.kmv_cache.cache_name, addr,
3255 					    kmv.kmv_corruption,
3256 					    kmv.kmv_corruption > 1 ? "s" : "");
3257 				}
3258 			} else {
3259 				/*
3260 				 * This is the more verbose mode, when the user
3261 				 * typed addr::kmem_verify.  If the cache was
3262 				 * clean, nothing will have yet been printed. So
3263 				 * say something.
3264 				 */
3265 				if (kmv.kmv_corruption == 0)
3266 					mdb_printf("clean\n");
3267 
3268 				mdb_dec_indent(2);
3269 			}
3270 		}
3271 	} else {
3272 		/*
3273 		 * If the user didn't specify a cache to verify, we'll walk all
3274 		 * kmem_cache's, specifying ourself as a callback for each...
3275 		 * this is the equivalent of '::walk kmem_cache .::kmem_verify'
3276 		 */
3277 
3278 		if (!(flags & DCMD_PIPE_OUT)) {
3279 			uintptr_t dump_curr;
3280 			uintptr_t dump_end;
3281 
3282 			if (mdb_readvar(&dump_curr, "kmem_dump_curr") != -1 &&
3283 			    mdb_readvar(&dump_end, "kmem_dump_end") != -1 &&
3284 			    dump_curr == dump_end) {
3285 				mdb_warn("WARNING: exceeded kmem_dump_size; "
3286 				    "corruption may be incorrectly reported\n");
3287 			}
3288 
3289 			mdb_printf("%<u>%-*s %-?s %-20s%</b>\n",
3290 			    KMEM_CACHE_NAMELEN, "Cache Name", "Addr",
3291 			    "Cache Integrity");
3292 		}
3293 
3294 		(void) (mdb_walk_dcmd("kmem_cache", "kmem_verify", 0, NULL));
3295 	}
3296 
3297 	return (DCMD_OK);
3298 }
3299 
/*
 * One node per vmem arena, built by vmem_walk_init().  The nodes form both
 * a flat list (vn_next) and a tree in which an arena's parent is the arena
 * its vm_source points at.
 */
typedef struct vmem_node {
	struct vmem_node *vn_next;	/* next node in the flat list */
	struct vmem_node *vn_parent;	/* node for our source arena */
	struct vmem_node *vn_sibling;	/* next child of vn_parent */
	struct vmem_node *vn_children;	/* first arena importing from us */
	uintptr_t vn_addr;		/* target address of the vmem_t */
	int vn_marked;			/* set once visited by postfix walk */
	vmem_t vn_vmem;			/* local copy of the vmem_t */
} vmem_node_t;
3309 
/*
 * Per-walk state for the vmem arena walkers.
 */
typedef struct vmem_walk {
	vmem_node_t *vw_root;		/* first root node; freeing cursor */
	vmem_node_t *vw_current;	/* node the next step starts from */
} vmem_walk_t;
3314 
3315 int
vmem_walk_init(mdb_walk_state_t * wsp)3316 vmem_walk_init(mdb_walk_state_t *wsp)
3317 {
3318 	uintptr_t vaddr, paddr;
3319 	vmem_node_t *head = NULL, *root = NULL, *current = NULL, *parent, *vp;
3320 	vmem_walk_t *vw;
3321 
3322 	if (mdb_readvar(&vaddr, "vmem_list") == -1) {
3323 		mdb_warn("couldn't read 'vmem_list'");
3324 		return (WALK_ERR);
3325 	}
3326 
3327 	while (vaddr != NULL) {
3328 		vp = mdb_zalloc(sizeof (vmem_node_t), UM_SLEEP);
3329 		vp->vn_addr = vaddr;
3330 		vp->vn_next = head;
3331 		head = vp;
3332 
3333 		if (vaddr == wsp->walk_addr)
3334 			current = vp;
3335 
3336 		if (mdb_vread(&vp->vn_vmem, sizeof (vmem_t), vaddr) == -1) {
3337 			mdb_warn("couldn't read vmem_t at %p", vaddr);
3338 			goto err;
3339 		}
3340 
3341 		vaddr = (uintptr_t)vp->vn_vmem.vm_next;
3342 	}
3343 
3344 	for (vp = head; vp != NULL; vp = vp->vn_next) {
3345 
3346 		if ((paddr = (uintptr_t)vp->vn_vmem.vm_source) == NULL) {
3347 			vp->vn_sibling = root;
3348 			root = vp;
3349 			continue;
3350 		}
3351 
3352 		for (parent = head; parent != NULL; parent = parent->vn_next) {
3353 			if (parent->vn_addr != paddr)
3354 				continue;
3355 			vp->vn_sibling = parent->vn_children;
3356 			parent->vn_children = vp;
3357 			vp->vn_parent = parent;
3358 			break;
3359 		}
3360 
3361 		if (parent == NULL) {
3362 			mdb_warn("couldn't find %p's parent (%p)\n",
3363 			    vp->vn_addr, paddr);
3364 			goto err;
3365 		}
3366 	}
3367 
3368 	vw = mdb_zalloc(sizeof (vmem_walk_t), UM_SLEEP);
3369 	vw->vw_root = root;
3370 
3371 	if (current != NULL)
3372 		vw->vw_current = current;
3373 	else
3374 		vw->vw_current = root;
3375 
3376 	wsp->walk_data = vw;
3377 	return (WALK_NEXT);
3378 err:
3379 	for (vp = head; head != NULL; vp = head) {
3380 		head = vp->vn_next;
3381 		mdb_free(vp, sizeof (vmem_node_t));
3382 	}
3383 
3384 	return (WALK_ERR);
3385 }
3386 
3387 int
vmem_walk_step(mdb_walk_state_t * wsp)3388 vmem_walk_step(mdb_walk_state_t *wsp)
3389 {
3390 	vmem_walk_t *vw = wsp->walk_data;
3391 	vmem_node_t *vp;
3392 	int rval;
3393 
3394 	if ((vp = vw->vw_current) == NULL)
3395 		return (WALK_DONE);
3396 
3397 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3398 
3399 	if (vp->vn_children != NULL) {
3400 		vw->vw_current = vp->vn_children;
3401 		return (rval);
3402 	}
3403 
3404 	do {
3405 		vw->vw_current = vp->vn_sibling;
3406 		vp = vp->vn_parent;
3407 	} while (vw->vw_current == NULL && vp != NULL);
3408 
3409 	return (rval);
3410 }
3411 
3412 /*
3413  * The "vmem_postfix" walk walks the vmem arenas in post-fix order; all
3414  * children are visited before their parent.  We perform the postfix walk
3415  * iteratively (rather than recursively) to allow mdb to regain control
3416  * after each callback.
3417  */
3418 int
vmem_postfix_walk_step(mdb_walk_state_t * wsp)3419 vmem_postfix_walk_step(mdb_walk_state_t *wsp)
3420 {
3421 	vmem_walk_t *vw = wsp->walk_data;
3422 	vmem_node_t *vp = vw->vw_current;
3423 	int rval;
3424 
3425 	/*
3426 	 * If this node is marked, then we know that we have already visited
3427 	 * all of its children.  If the node has any siblings, they need to
3428 	 * be visited next; otherwise, we need to visit the parent.  Note
3429 	 * that vp->vn_marked will only be zero on the first invocation of
3430 	 * the step function.
3431 	 */
3432 	if (vp->vn_marked) {
3433 		if (vp->vn_sibling != NULL)
3434 			vp = vp->vn_sibling;
3435 		else if (vp->vn_parent != NULL)
3436 			vp = vp->vn_parent;
3437 		else {
3438 			/*
3439 			 * We have neither a parent, nor a sibling, and we
3440 			 * have already been visited; we're done.
3441 			 */
3442 			return (WALK_DONE);
3443 		}
3444 	}
3445 
3446 	/*
3447 	 * Before we visit this node, visit its children.
3448 	 */
3449 	while (vp->vn_children != NULL && !vp->vn_children->vn_marked)
3450 		vp = vp->vn_children;
3451 
3452 	vp->vn_marked = 1;
3453 	vw->vw_current = vp;
3454 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3455 
3456 	return (rval);
3457 }
3458 
/*
 * Walker fini: free every vmem_node_t in the tree hanging off vw->vw_root,
 * and finally the vmem_walk_t itself.
 *
 * The function recurses on wsp, using vw->vw_root as a cursor that names
 * the next node to free.  Each invocation frees the subtree below the
 * cursor, then the cursor node, then continues with that node's sibling.
 *
 * NOTE: if vw->vw_root is NULL on the outermost call, the function returns
 * immediately without freeing vw.
 */
void
vmem_walk_fini(mdb_walk_state_t *wsp)
{
	vmem_walk_t *vw = wsp->walk_data;
	vmem_node_t *root = vw->vw_root;
	int done;

	/* An empty cursor ends this branch of the recursion. */
	if (root == NULL)
		return;

	/* Free everything below this node first. */
	if ((vw->vw_root = root->vn_children) != NULL)
		vmem_walk_fini(wsp);

	/*
	 * Advance the cursor and decide whether this was the last node
	 * before freeing it: only a node with no parent and no remaining
	 * sibling ends the whole teardown.
	 */
	vw->vw_root = root->vn_sibling;
	done = (root->vn_sibling == NULL && root->vn_parent == NULL);
	mdb_free(root, sizeof (vmem_node_t));

	if (done) {
		mdb_free(vw, sizeof (vmem_walk_t));
	} else {
		vmem_walk_fini(wsp);
	}
}
3482 
/*
 * Per-walk state for the vmem segment walkers.
 */
typedef struct vmem_seg_walk {
	uint8_t vsw_type;	/* segment type to report, or VMEM_NONE */
	uintptr_t vsw_start;	/* address of the arena's vm_seg0 */
	uintptr_t vsw_current;	/* address of the next segment to read */
} vmem_seg_walk_t;
3488 
3489 /*ARGSUSED*/
3490 int
vmem_seg_walk_common_init(mdb_walk_state_t * wsp,uint8_t type,char * name)3491 vmem_seg_walk_common_init(mdb_walk_state_t *wsp, uint8_t type, char *name)
3492 {
3493 	vmem_seg_walk_t *vsw;
3494 
3495 	if (wsp->walk_addr == NULL) {
3496 		mdb_warn("vmem_%s does not support global walks\n", name);
3497 		return (WALK_ERR);
3498 	}
3499 
3500 	wsp->walk_data = vsw = mdb_alloc(sizeof (vmem_seg_walk_t), UM_SLEEP);
3501 
3502 	vsw->vsw_type = type;
3503 	vsw->vsw_start = wsp->walk_addr + offsetof(vmem_t, vm_seg0);
3504 	vsw->vsw_current = vsw->vsw_start;
3505 
3506 	return (WALK_NEXT);
3507 }
3508 
/*
 * vmem segments can't have type 0 (this should be added to vmem_impl.h).
 * The segment walkers use VMEM_NONE as the "no type filter" value: a walk
 * initialized with it reports segments of every type.
 */
#define	VMEM_NONE	0
3513 
3514 int
vmem_alloc_walk_init(mdb_walk_state_t * wsp)3515 vmem_alloc_walk_init(mdb_walk_state_t *wsp)
3516 {
3517 	return (vmem_seg_walk_common_init(wsp, VMEM_ALLOC, "alloc"));
3518 }
3519 
3520 int
vmem_free_walk_init(mdb_walk_state_t * wsp)3521 vmem_free_walk_init(mdb_walk_state_t *wsp)
3522 {
3523 	return (vmem_seg_walk_common_init(wsp, VMEM_FREE, "free"));
3524 }
3525 
3526 int
vmem_span_walk_init(mdb_walk_state_t * wsp)3527 vmem_span_walk_init(mdb_walk_state_t *wsp)
3528 {
3529 	return (vmem_seg_walk_common_init(wsp, VMEM_SPAN, "span"));
3530 }
3531 
3532 int
vmem_seg_walk_init(mdb_walk_state_t * wsp)3533 vmem_seg_walk_init(mdb_walk_state_t *wsp)
3534 {
3535 	return (vmem_seg_walk_common_init(wsp, VMEM_NONE, "seg"));
3536 }
3537 
3538 int
vmem_seg_walk_step(mdb_walk_state_t * wsp)3539 vmem_seg_walk_step(mdb_walk_state_t *wsp)
3540 {
3541 	vmem_seg_t seg;
3542 	vmem_seg_walk_t *vsw = wsp->walk_data;
3543 	uintptr_t addr = vsw->vsw_current;
3544 	static size_t seg_size = 0;
3545 	int rval;
3546 
3547 	if (!seg_size) {
3548 		if (mdb_readvar(&seg_size, "vmem_seg_size") == -1) {
3549 			mdb_warn("failed to read 'vmem_seg_size'");
3550 			seg_size = sizeof (vmem_seg_t);
3551 		}
3552 	}
3553 
3554 	if (seg_size < sizeof (seg))
3555 		bzero((caddr_t)&seg + seg_size, sizeof (seg) - seg_size);
3556 
3557 	if (mdb_vread(&seg, seg_size, addr) == -1) {
3558 		mdb_warn("couldn't read vmem_seg at %p", addr);
3559 		return (WALK_ERR);
3560 	}
3561 
3562 	vsw->vsw_current = (uintptr_t)seg.vs_anext;
3563 	if (vsw->vsw_type != VMEM_NONE && seg.vs_type != vsw->vsw_type) {
3564 		rval = WALK_NEXT;
3565 	} else {
3566 		rval = wsp->walk_callback(addr, &seg, wsp->walk_cbdata);
3567 	}
3568 
3569 	if (vsw->vsw_current == vsw->vsw_start)
3570 		return (WALK_DONE);
3571 
3572 	return (rval);
3573 }
3574 
3575 void
vmem_seg_walk_fini(mdb_walk_state_t * wsp)3576 vmem_seg_walk_fini(mdb_walk_state_t *wsp)
3577 {
3578 	vmem_seg_walk_t *vsw = wsp->walk_data;
3579 
3580 	mdb_free(vsw, sizeof (vmem_seg_walk_t));
3581 }
3582 
3583 #define	VMEM_NAMEWIDTH	22
3584 
3585 int
vmem(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)3586 vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3587 {
3588 	vmem_t v, parent;
3589 	vmem_kstat_t *vkp = &v.vm_kstat;
3590 	uintptr_t paddr;
3591 	int ident = 0;
3592 	char c[VMEM_NAMEWIDTH];
3593 
3594 	if (!(flags & DCMD_ADDRSPEC)) {
3595 		if (mdb_walk_dcmd("vmem", "vmem", argc, argv) == -1) {
3596 			mdb_warn("can't walk vmem");
3597 			return (DCMD_ERR);
3598 		}
3599 		return (DCMD_OK);
3600 	}
3601 
3602 	if (DCMD_HDRSPEC(flags))
3603 		mdb_printf("%-?s %-*s %10s %12s %9s %5s\n",
3604 		    "ADDR", VMEM_NAMEWIDTH, "NAME", "INUSE",
3605 		    "TOTAL", "SUCCEED", "FAIL");
3606 
3607 	if (mdb_vread(&v, sizeof (v), addr) == -1) {
3608 		mdb_warn("couldn't read vmem at %p", addr);
3609 		return (DCMD_ERR);
3610 	}
3611 
3612 	for (paddr = (uintptr_t)v.vm_source; paddr != NULL; ident += 2) {
3613 		if (mdb_vread(&parent, sizeof (parent), paddr) == -1) {
3614 			mdb_warn("couldn't trace %p's ancestry", addr);
3615 			ident = 0;
3616 			break;
3617 		}
3618 		paddr = (uintptr_t)parent.vm_source;
3619 	}
3620 
3621 	(void) mdb_snprintf(c, VMEM_NAMEWIDTH, "%*s%s", ident, "", v.vm_name);
3622 
3623 	mdb_printf("%0?p %-*s %10llu %12llu %9llu %5llu\n",
3624 	    addr, VMEM_NAMEWIDTH, c,
3625 	    vkp->vk_mem_inuse.value.ui64, vkp->vk_mem_total.value.ui64,
3626 	    vkp->vk_alloc.value.ui64, vkp->vk_fail.value.ui64);
3627 
3628 	return (DCMD_OK);
3629 }
3630 
/*
 * Help text for the vmem_seg dcmd: a short description followed by the
 * list of options accepted by vmem_seg().
 */
void
vmem_seg_help(void)
{
	mdb_printf("%s",
"Display the contents of vmem_seg_ts, with optional filtering.\n\n"
"\n"
"A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n"
"representing a single chunk of data.  Only ALLOC segments have debugging\n"
"information.\n");
	/* Outdent the OPTIONS heading relative to the surrounding text. */
	mdb_dec_indent(2);
	mdb_printf("%<b>OPTIONS%</b>\n");
	mdb_inc_indent(2);
	mdb_printf("%s",
"  -v    Display the full content of the vmem_seg, including its stack trace\n"
"  -s    report the size of the segment, instead of the end address\n"
"  -c caller\n"
"        filter out segments without the function/PC in their stack trace\n"
"  -e earliest\n"
"        filter out segments timestamped before earliest\n"
"  -l latest\n"
"        filter out segments timestamped after latest\n"
"  -m minsize\n"
"        filter out segments smaller than minsize\n"
"  -M maxsize\n"
"        filter out segments larger than maxsize\n"
"  -t thread\n"
"        filter out segments not involving thread\n"
"  -T type\n"
"        filter out segments not of type 'type'\n"
"        type is one of: ALLOC/FREE/SPAN/ROTOR/WALKER\n");
}
3662 
3663 /*ARGSUSED*/
3664 int
vmem_seg(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)3665 vmem_seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3666 {
3667 	vmem_seg_t vs;
3668 	pc_t *stk = vs.vs_stack;
3669 	uintptr_t sz;
3670 	uint8_t t;
3671 	const char *type = NULL;
3672 	GElf_Sym sym;
3673 	char c[MDB_SYM_NAMLEN];
3674 	int no_debug;
3675 	int i;
3676 	int depth;
3677 	uintptr_t laddr, haddr;
3678 
3679 	uintptr_t caller = NULL, thread = NULL;
3680 	uintptr_t minsize = 0, maxsize = 0;
3681 
3682 	hrtime_t earliest = 0, latest = 0;
3683 
3684 	uint_t size = 0;
3685 	uint_t verbose = 0;
3686 
3687 	if (!(flags & DCMD_ADDRSPEC))
3688 		return (DCMD_USAGE);
3689 
3690 	if (mdb_getopts(argc, argv,
3691 	    'c', MDB_OPT_UINTPTR, &caller,
3692 	    'e', MDB_OPT_UINT64, &earliest,
3693 	    'l', MDB_OPT_UINT64, &latest,
3694 	    's', MDB_OPT_SETBITS, TRUE, &size,
3695 	    'm', MDB_OPT_UINTPTR, &minsize,
3696 	    'M', MDB_OPT_UINTPTR, &maxsize,
3697 	    't', MDB_OPT_UINTPTR, &thread,
3698 	    'T', MDB_OPT_STR, &type,
3699 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
3700 	    NULL) != argc)
3701 		return (DCMD_USAGE);
3702 
3703 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
3704 		if (verbose) {
3705 			mdb_printf("%16s %4s %16s %16s %16s\n"
3706 			    "%<u>%16s %4s %16s %16s %16s%</u>\n",
3707 			    "ADDR", "TYPE", "START", "END", "SIZE",
3708 			    "", "", "THREAD", "TIMESTAMP", "");
3709 		} else {
3710 			mdb_printf("%?s %4s %?s %?s %s\n", "ADDR", "TYPE",
3711 			    "START", size? "SIZE" : "END", "WHO");
3712 		}
3713 	}
3714 
3715 	if (mdb_vread(&vs, sizeof (vs), addr) == -1) {
3716 		mdb_warn("couldn't read vmem_seg at %p", addr);
3717 		return (DCMD_ERR);
3718 	}
3719 
3720 	if (type != NULL) {
3721 		if (strcmp(type, "ALLC") == 0 || strcmp(type, "ALLOC") == 0)
3722 			t = VMEM_ALLOC;
3723 		else if (strcmp(type, "FREE") == 0)
3724 			t = VMEM_FREE;
3725 		else if (strcmp(type, "SPAN") == 0)
3726 			t = VMEM_SPAN;
3727 		else if (strcmp(type, "ROTR") == 0 ||
3728 		    strcmp(type, "ROTOR") == 0)
3729 			t = VMEM_ROTOR;
3730 		else if (strcmp(type, "WLKR") == 0 ||
3731 		    strcmp(type, "WALKER") == 0)
3732 			t = VMEM_WALKER;
3733 		else {
3734 			mdb_warn("\"%s\" is not a recognized vmem_seg type\n",
3735 			    type);
3736 			return (DCMD_ERR);
3737 		}
3738 
3739 		if (vs.vs_type != t)
3740 			return (DCMD_OK);
3741 	}
3742 
3743 	sz = vs.vs_end - vs.vs_start;
3744 
3745 	if (minsize != 0 && sz < minsize)
3746 		return (DCMD_OK);
3747 
3748 	if (maxsize != 0 && sz > maxsize)
3749 		return (DCMD_OK);
3750 
3751 	t = vs.vs_type;
3752 	depth = vs.vs_depth;
3753 
3754 	/*
3755 	 * debug info, when present, is only accurate for VMEM_ALLOC segments
3756 	 */
3757 	no_debug = (t != VMEM_ALLOC) ||
3758 	    (depth == 0 || depth > VMEM_STACK_DEPTH);
3759 
3760 	if (no_debug) {
3761 		if (caller != NULL || thread != NULL || earliest != 0 ||
3762 		    latest != 0)
3763 			return (DCMD_OK);		/* not enough info */
3764 	} else {
3765 		if (caller != NULL) {
3766 			laddr = caller;
3767 			haddr = caller + sizeof (caller);
3768 
3769 			if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c,
3770 			    sizeof (c), &sym) != -1 &&
3771 			    caller == (uintptr_t)sym.st_value) {
3772 				/*
3773 				 * We were provided an exact symbol value; any
3774 				 * address in the function is valid.
3775 				 */
3776 				laddr = (uintptr_t)sym.st_value;
3777 				haddr = (uintptr_t)sym.st_value + sym.st_size;
3778 			}
3779 
3780 			for (i = 0; i < depth; i++)
3781 				if (vs.vs_stack[i] >= laddr &&
3782 				    vs.vs_stack[i] < haddr)
3783 					break;
3784 
3785 			if (i == depth)
3786 				return (DCMD_OK);
3787 		}
3788 
3789 		if (thread != NULL && (uintptr_t)vs.vs_thread != thread)
3790 			return (DCMD_OK);
3791 
3792 		if (earliest != 0 && vs.vs_timestamp < earliest)
3793 			return (DCMD_OK);
3794 
3795 		if (latest != 0 && vs.vs_timestamp > latest)
3796 			return (DCMD_OK);
3797 	}
3798 
3799 	type = (t == VMEM_ALLOC ? "ALLC" :
3800 	    t == VMEM_FREE ? "FREE" :
3801 	    t == VMEM_SPAN ? "SPAN" :
3802 	    t == VMEM_ROTOR ? "ROTR" :
3803 	    t == VMEM_WALKER ? "WLKR" :
3804 	    "????");
3805 
3806 	if (flags & DCMD_PIPE_OUT) {
3807 		mdb_printf("%#lr\n", addr);
3808 		return (DCMD_OK);
3809 	}
3810 
3811 	if (verbose) {
3812 		mdb_printf("%<b>%16p%</b> %4s %16p %16p %16d\n",
3813 		    addr, type, vs.vs_start, vs.vs_end, sz);
3814 
3815 		if (no_debug)
3816 			return (DCMD_OK);
3817 
3818 		mdb_printf("%16s %4s %16p %16llx\n",
3819 		    "", "", vs.vs_thread, vs.vs_timestamp);
3820 
3821 		mdb_inc_indent(17);
3822 		for (i = 0; i < depth; i++) {
3823 			mdb_printf("%a\n", stk[i]);
3824 		}
3825 		mdb_dec_indent(17);
3826 		mdb_printf("\n");
3827 	} else {
3828 		mdb_printf("%0?p %4s %0?p %0?p", addr, type,
3829 		    vs.vs_start, size? sz : vs.vs_end);
3830 
3831 		if (no_debug) {
3832 			mdb_printf("\n");
3833 			return (DCMD_OK);
3834 		}
3835 
3836 		for (i = 0; i < depth; i++) {
3837 			if (mdb_lookup_by_addr(stk[i], MDB_SYM_FUZZY,
3838 			    c, sizeof (c), &sym) == -1)
3839 				continue;
3840 			if (strncmp(c, "vmem_", 5) == 0)
3841 				continue;
3842 			break;
3843 		}
3844 		mdb_printf(" %a\n", stk[i]);
3845 	}
3846 	return (DCMD_OK);
3847 }
3848 
/*
 * State passed from kmalog() to its showbc() walker callback.
 */
typedef struct kmalog_data {
	uintptr_t	kma_addr;	/* only show this address, if set */
	hrtime_t	kma_newest;	/* timestamp of first record seen */
} kmalog_data_t;
3853 
3854 /*ARGSUSED*/
3855 static int
showbc(uintptr_t addr,const kmem_bufctl_audit_t * bcp,kmalog_data_t * kma)3856 showbc(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmalog_data_t *kma)
3857 {
3858 	char name[KMEM_CACHE_NAMELEN + 1];
3859 	hrtime_t delta;
3860 	int i, depth;
3861 	size_t bufsize;
3862 
3863 	if (bcp->bc_timestamp == 0)
3864 		return (WALK_DONE);
3865 
3866 	if (kma->kma_newest == 0)
3867 		kma->kma_newest = bcp->bc_timestamp;
3868 
3869 	if (kma->kma_addr) {
3870 		if (mdb_vread(&bufsize, sizeof (bufsize),
3871 		    (uintptr_t)&bcp->bc_cache->cache_bufsize) == -1) {
3872 			mdb_warn(
3873 			    "failed to read cache_bufsize for cache at %p",
3874 			    bcp->bc_cache);
3875 			return (WALK_ERR);
3876 		}
3877 
3878 		if (kma->kma_addr < (uintptr_t)bcp->bc_addr ||
3879 		    kma->kma_addr >= (uintptr_t)bcp->bc_addr + bufsize)
3880 			return (WALK_NEXT);
3881 	}
3882 
3883 	delta = kma->kma_newest - bcp->bc_timestamp;
3884 	depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3885 
3886 	if (mdb_readstr(name, sizeof (name), (uintptr_t)
3887 	    &bcp->bc_cache->cache_name) <= 0)
3888 		(void) mdb_snprintf(name, sizeof (name), "%a", bcp->bc_cache);
3889 
3890 	mdb_printf("\nT-%lld.%09lld  addr=%p  %s\n",
3891 	    delta / NANOSEC, delta % NANOSEC, bcp->bc_addr, name);
3892 
3893 	for (i = 0; i < depth; i++)
3894 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
3895 
3896 	return (WALK_NEXT);
3897 }
3898 
3899 int
kmalog(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)3900 kmalog(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3901 {
3902 	const char *logname = "kmem_transaction_log";
3903 	kmalog_data_t kma;
3904 
3905 	if (argc > 1)
3906 		return (DCMD_USAGE);
3907 
3908 	kma.kma_newest = 0;
3909 	if (flags & DCMD_ADDRSPEC)
3910 		kma.kma_addr = addr;
3911 	else
3912 		kma.kma_addr = NULL;
3913 
3914 	if (argc > 0) {
3915 		if (argv->a_type != MDB_TYPE_STRING)
3916 			return (DCMD_USAGE);
3917 		if (strcmp(argv->a_un.a_str, "fail") == 0)
3918 			logname = "kmem_failure_log";
3919 		else if (strcmp(argv->a_un.a_str, "slab") == 0)
3920 			logname = "kmem_slab_log";
3921 		else
3922 			return (DCMD_USAGE);
3923 	}
3924 
3925 	if (mdb_readvar(&addr, logname) == -1) {
3926 		mdb_warn("failed to read %s log header pointer");
3927 		return (DCMD_ERR);
3928 	}
3929 
3930 	if (mdb_pwalk("kmem_log", (mdb_walk_cb_t)showbc, &kma, addr) == -1) {
3931 		mdb_warn("failed to walk kmem log");
3932 		return (DCMD_ERR);
3933 	}
3934 
3935 	return (DCMD_OK);
3936 }
3937 
3938 /*
3939  * As the final lure for die-hard crash(1M) users, we provide ::kmausers here.
3940  * The first piece is a structure which we use to accumulate kmem_cache_t
3941  * addresses of interest.  The kmc_add is used as a callback for the kmem_cache
3942  * walker; we either add all caches, or ones named explicitly as arguments.
3943  */
3944 
/*
 * Growable array of kmem_cache_t addresses, filled in by kmc_add().
 * kmc_nelems and kmc_size are both counted in array elements, not bytes.
 */
typedef struct kmclist {
	const char *kmc_name;			/* Name to match (or NULL) */
	uintptr_t *kmc_caches;			/* List of kmem_cache_t addrs */
	int kmc_nelems;				/* Num entries in kmc_caches */
	int kmc_size;				/* Size of kmc_caches array */
} kmclist_t;
3951 
3952 static int
kmc_add(uintptr_t addr,const kmem_cache_t * cp,kmclist_t * kmc)3953 kmc_add(uintptr_t addr, const kmem_cache_t *cp, kmclist_t *kmc)
3954 {
3955 	void *p;
3956 	int s;
3957 
3958 	if (kmc->kmc_name == NULL ||
3959 	    strcmp(cp->cache_name, kmc->kmc_name) == 0) {
3960 		/*
3961 		 * If we have a match, grow our array (if necessary), and then
3962 		 * add the virtual address of the matching cache to our list.
3963 		 */
3964 		if (kmc->kmc_nelems >= kmc->kmc_size) {
3965 			s = kmc->kmc_size ? kmc->kmc_size * 2 : 256;
3966 			p = mdb_alloc(sizeof (uintptr_t) * s, UM_SLEEP | UM_GC);
3967 
3968 			bcopy(kmc->kmc_caches, p,
3969 			    sizeof (uintptr_t) * kmc->kmc_size);
3970 
3971 			kmc->kmc_caches = p;
3972 			kmc->kmc_size = s;
3973 		}
3974 
3975 		kmc->kmc_caches[kmc->kmc_nelems++] = addr;
3976 		return (kmc->kmc_name ? WALK_DONE : WALK_NEXT);
3977 	}
3978 
3979 	return (WALK_NEXT);
3980 }
3981 
3982 /*
3983  * The second piece of ::kmausers is a hash table of allocations.  Each
3984  * allocation owner is identified by its stack trace and data_size.  We then
3985  * track the total bytes of all such allocations, and the number of allocations
3986  * to report at the end.  Once we have a list of caches, we walk through the
3987  * allocated bufctls of each, and update our hash table accordingly.
3988  */
3989 
/*
 * One entry of the kmausers hash table.  The same array serves as both the
 * bucket heads and the element storage: element [b].kmo_head starts the
 * chain for bucket b, while the remaining fields of element [n] describe
 * the n'th distinct owner recorded by kmu_add().
 */
typedef struct kmowner {
	struct kmowner *kmo_head;		/* First hash elt in bucket */
	struct kmowner *kmo_next;		/* Next hash elt in chain */
	size_t kmo_signature;			/* Hash table signature */
	uint_t kmo_num;				/* Number of allocations */
	size_t kmo_data_size;			/* Size of each allocation */
	size_t kmo_total_size;			/* Total bytes of allocation */
	int kmo_depth;				/* Depth of stack trace */
	uintptr_t kmo_stack[KMEM_STACK_DEPTH];	/* Stack trace */
} kmowner_t;
4000 
/*
 * State for a kmausers run: the cache currently being walked plus the hash
 * table of allocation owners maintained by kmu_add().
 */
typedef struct kmusers {
	uintptr_t kmu_addr;			/* address of interest */
	const kmem_cache_t *kmu_cache;		/* Current kmem cache */
	kmowner_t *kmu_hash;			/* Hash table of owners */
	int kmu_nelems;				/* Number of entries in use */
	int kmu_size;				/* Total number of entries */
} kmusers_t;
4008 
4009 static void
kmu_add(kmusers_t * kmu,const kmem_bufctl_audit_t * bcp,size_t size,size_t data_size)4010 kmu_add(kmusers_t *kmu, const kmem_bufctl_audit_t *bcp,
4011     size_t size, size_t data_size)
4012 {
4013 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4014 	size_t bucket, signature = data_size;
4015 	kmowner_t *kmo, *kmoend;
4016 
4017 	/*
4018 	 * If the hash table is full, double its size and rehash everything.
4019 	 */
4020 	if (kmu->kmu_nelems >= kmu->kmu_size) {
4021 		int s = kmu->kmu_size ? kmu->kmu_size * 2 : 1024;
4022 
4023 		kmo = mdb_alloc(sizeof (kmowner_t) * s, UM_SLEEP | UM_GC);
4024 		bcopy(kmu->kmu_hash, kmo, sizeof (kmowner_t) * kmu->kmu_size);
4025 		kmu->kmu_hash = kmo;
4026 		kmu->kmu_size = s;
4027 
4028 		kmoend = kmu->kmu_hash + kmu->kmu_size;
4029 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++)
4030 			kmo->kmo_head = NULL;
4031 
4032 		kmoend = kmu->kmu_hash + kmu->kmu_nelems;
4033 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++) {
4034 			bucket = kmo->kmo_signature & (kmu->kmu_size - 1);
4035 			kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4036 			kmu->kmu_hash[bucket].kmo_head = kmo;
4037 		}
4038 	}
4039 
4040 	/*
4041 	 * Finish computing the hash signature from the stack trace, and then
4042 	 * see if the owner is in the hash table.  If so, update our stats.
4043 	 */
4044 	for (i = 0; i < depth; i++)
4045 		signature += bcp->bc_stack[i];
4046 
4047 	bucket = signature & (kmu->kmu_size - 1);
4048 
4049 	for (kmo = kmu->kmu_hash[bucket].kmo_head; kmo; kmo = kmo->kmo_next) {
4050 		if (kmo->kmo_signature == signature) {
4051 			size_t difference = 0;
4052 
4053 			difference |= kmo->kmo_data_size - data_size;
4054 			difference |= kmo->kmo_depth - depth;
4055 
4056 			for (i = 0; i < depth; i++) {
4057 				difference |= kmo->kmo_stack[i] -
4058 				    bcp->bc_stack[i];
4059 			}
4060 
4061 			if (difference == 0) {
4062 				kmo->kmo_total_size += size;
4063 				kmo->kmo_num++;
4064 				return;
4065 			}
4066 		}
4067 	}
4068 
4069 	/*
4070 	 * If the owner is not yet hashed, grab the next element and fill it
4071 	 * in based on the allocation information.
4072 	 */
4073 	kmo = &kmu->kmu_hash[kmu->kmu_nelems++];
4074 	kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4075 	kmu->kmu_hash[bucket].kmo_head = kmo;
4076 
4077 	kmo->kmo_signature = signature;
4078 	kmo->kmo_num = 1;
4079 	kmo->kmo_data_size = data_size;
4080 	kmo->kmo_total_size = size;
4081 	kmo->kmo_depth = depth;
4082 
4083 	for (i = 0; i < depth; i++)
4084 		kmo->kmo_stack[i] = bcp->bc_stack[i];
4085 }
4086 
4087 /*
4088  * When ::kmausers is invoked without the -f flag, we simply update our hash
4089  * table with the information from each allocated bufctl.
4090  */
4091 /*ARGSUSED*/
4092 static int
kmause1(uintptr_t addr,const kmem_bufctl_audit_t * bcp,kmusers_t * kmu)4093 kmause1(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4094 {
4095 	const kmem_cache_t *cp = kmu->kmu_cache;
4096 
4097 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4098 	return (WALK_NEXT);
4099 }
4100 
4101 /*
4102  * When ::kmausers is invoked with the -f flag, we print out the information
4103  * for each bufctl as well as updating the hash table.
4104  */
4105 static int
kmause2(uintptr_t addr,const kmem_bufctl_audit_t * bcp,kmusers_t * kmu)4106 kmause2(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4107 {
4108 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4109 	const kmem_cache_t *cp = kmu->kmu_cache;
4110 	kmem_bufctl_t bufctl;
4111 
4112 	if (kmu->kmu_addr) {
4113 		if (mdb_vread(&bufctl, sizeof (bufctl),  addr) == -1)
4114 			mdb_warn("couldn't read bufctl at %p", addr);
4115 		else if (kmu->kmu_addr < (uintptr_t)bufctl.bc_addr ||
4116 		    kmu->kmu_addr >= (uintptr_t)bufctl.bc_addr +
4117 		    cp->cache_bufsize)
4118 			return (WALK_NEXT);
4119 	}
4120 
4121 	mdb_printf("size %d, addr %p, thread %p, cache %s\n",
4122 	    cp->cache_bufsize, addr, bcp->bc_thread, cp->cache_name);
4123 
4124 	for (i = 0; i < depth; i++)
4125 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
4126 
4127 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4128 	return (WALK_NEXT);
4129 }
4130 
4131 /*
4132  * We sort our results by allocation size before printing them.
4133  */
4134 static int
kmownercmp(const void * lp,const void * rp)4135 kmownercmp(const void *lp, const void *rp)
4136 {
4137 	const kmowner_t *lhs = lp;
4138 	const kmowner_t *rhs = rp;
4139 
4140 	return (rhs->kmo_total_size - lhs->kmo_total_size);
4141 }
4142 
4143 /*
4144  * The main engine of ::kmausers is relatively straightforward: First we
4145  * accumulate our list of kmem_cache_t addresses into the kmclist_t. Next we
4146  * iterate over the allocated bufctls of each cache in the list.  Finally,
4147  * we sort and print our results.
4148  */
4149 /*ARGSUSED*/
4150 int
kmausers(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)4151 kmausers(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4152 {
4153 	int mem_threshold = 8192;	/* Minimum # bytes for printing */
4154 	int cnt_threshold = 100;	/* Minimum # blocks for printing */
4155 	int audited_caches = 0;		/* Number of KMF_AUDIT caches found */
4156 	int do_all_caches = 1;		/* Do all caches (no arguments) */
4157 	int opt_e = FALSE;		/* Include "small" users */
4158 	int opt_f = FALSE;		/* Print stack traces */
4159 
4160 	mdb_walk_cb_t callback = (mdb_walk_cb_t)kmause1;
4161 	kmowner_t *kmo, *kmoend;
4162 	int i, oelems;
4163 
4164 	kmclist_t kmc;
4165 	kmusers_t kmu;
4166 
4167 	bzero(&kmc, sizeof (kmc));
4168 	bzero(&kmu, sizeof (kmu));
4169 
4170 	while ((i = mdb_getopts(argc, argv,
4171 	    'e', MDB_OPT_SETBITS, TRUE, &opt_e,
4172 	    'f', MDB_OPT_SETBITS, TRUE, &opt_f, NULL)) != argc) {
4173 
4174 		argv += i;	/* skip past options we just processed */
4175 		argc -= i;	/* adjust argc */
4176 
4177 		if (argv->a_type != MDB_TYPE_STRING || *argv->a_un.a_str == '-')
4178 			return (DCMD_USAGE);
4179 
4180 		oelems = kmc.kmc_nelems;
4181 		kmc.kmc_name = argv->a_un.a_str;
4182 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4183 
4184 		if (kmc.kmc_nelems == oelems) {
4185 			mdb_warn("unknown kmem cache: %s\n", kmc.kmc_name);
4186 			return (DCMD_ERR);
4187 		}
4188 
4189 		do_all_caches = 0;
4190 		argv++;
4191 		argc--;
4192 	}
4193 
4194 	if (flags & DCMD_ADDRSPEC) {
4195 		opt_f = TRUE;
4196 		kmu.kmu_addr = addr;
4197 	} else {
4198 		kmu.kmu_addr = NULL;
4199 	}
4200 
4201 	if (opt_e)
4202 		mem_threshold = cnt_threshold = 0;
4203 
4204 	if (opt_f)
4205 		callback = (mdb_walk_cb_t)kmause2;
4206 
4207 	if (do_all_caches) {
4208 		kmc.kmc_name = NULL; /* match all cache names */
4209 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4210 	}
4211 
4212 	for (i = 0; i < kmc.kmc_nelems; i++) {
4213 		uintptr_t cp = kmc.kmc_caches[i];
4214 		kmem_cache_t c;
4215 
4216 		if (mdb_vread(&c, sizeof (c), cp) == -1) {
4217 			mdb_warn("failed to read cache at %p", cp);
4218 			continue;
4219 		}
4220 
4221 		if (!(c.cache_flags & KMF_AUDIT)) {
4222 			if (!do_all_caches) {
4223 				mdb_warn("KMF_AUDIT is not enabled for %s\n",
4224 				    c.cache_name);
4225 			}
4226 			continue;
4227 		}
4228 
4229 		kmu.kmu_cache = &c;
4230 		(void) mdb_pwalk("bufctl", callback, &kmu, cp);
4231 		audited_caches++;
4232 	}
4233 
4234 	if (audited_caches == 0 && do_all_caches) {
4235 		mdb_warn("KMF_AUDIT is not enabled for any caches\n");
4236 		return (DCMD_ERR);
4237 	}
4238 
4239 	qsort(kmu.kmu_hash, kmu.kmu_nelems, sizeof (kmowner_t), kmownercmp);
4240 	kmoend = kmu.kmu_hash + kmu.kmu_nelems;
4241 
4242 	for (kmo = kmu.kmu_hash; kmo < kmoend; kmo++) {
4243 		if (kmo->kmo_total_size < mem_threshold &&
4244 		    kmo->kmo_num < cnt_threshold)
4245 			continue;
4246 		mdb_printf("%lu bytes for %u allocations with data size %lu:\n",
4247 		    kmo->kmo_total_size, kmo->kmo_num, kmo->kmo_data_size);
4248 		for (i = 0; i < kmo->kmo_depth; i++)
4249 			mdb_printf("\t %a\n", kmo->kmo_stack[i]);
4250 	}
4251 
4252 	return (DCMD_OK);
4253 }
4254 
/*
 * Print the help text for ::kmausers: a description of the dcmd followed
 * by a summary of its options.
 */
void
kmausers_help(void)
{
	/* What the dcmd does and how operands narrow the search. */
	mdb_printf(
	    "Displays the largest users of the kmem allocator, sorted by \n"
	    "trace.  If one or more caches is specified, only those caches\n"
	    "will be searched.  By default, all caches are searched.  If an\n"
	    "address is specified, then only those allocations which include\n"
	    "the given address are displayed.  Specifying an address implies\n"
	    "-f.\n"
	    "\n");

	/* Option summary. */
	mdb_printf(
	    "\t-e\tInclude all users, not just the largest\n"
	    "\t-f\tDisplay individual allocations.  By default, users are\n"
	    "\t\tgrouped by stack\n");
}
4270 
/*
 * Read the target's "kmem_ready" variable.
 *
 * Returns the value read, or -1 if the variable could not be read (in which
 * case mdb_readvar() has already set errno).
 */
static int
kmem_ready_check(void)
{
	int kmem_is_ready;

	if (mdb_readvar(&kmem_is_ready, "kmem_ready") >= 0)
		return (kmem_is_ready);

	return (-1);
}
4281 
4282 void
kmem_statechange(void)4283 kmem_statechange(void)
4284 {
4285 	static int been_ready = 0;
4286 
4287 	if (been_ready)
4288 		return;
4289 
4290 	if (kmem_ready_check() <= 0)
4291 		return;
4292 
4293 	been_ready = 1;
4294 	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_init_walkers, NULL);
4295 }
4296 
4297 void
kmem_init(void)4298 kmem_init(void)
4299 {
4300 	mdb_walker_t w = {
4301 		"kmem_cache", "walk list of kmem caches", kmem_cache_walk_init,
4302 		list_walk_step, list_walk_fini
4303 	};
4304 
4305 	/*
4306 	 * If kmem is ready, we'll need to invoke the kmem_cache walker
4307 	 * immediately.  Walkers in the linkage structure won't be ready until
4308 	 * _mdb_init returns, so we'll need to add this one manually.  If kmem
4309 	 * is ready, we'll use the walker to initialize the caches.  If kmem
4310 	 * isn't ready, we'll register a callback that will allow us to defer
4311 	 * cache walking until it is.
4312 	 */
4313 	if (mdb_add_walker(&w) != 0) {
4314 		mdb_warn("failed to add kmem_cache walker");
4315 		return;
4316 	}
4317 
4318 	kmem_statechange();
4319 
4320 	/* register our ::whatis handlers */
4321 	mdb_whatis_register("modules", whatis_run_modules, NULL,
4322 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4323 	mdb_whatis_register("threads", whatis_run_threads, NULL,
4324 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4325 	mdb_whatis_register("pages", whatis_run_pages, NULL,
4326 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4327 	mdb_whatis_register("kmem", whatis_run_kmem, NULL,
4328 	    WHATIS_PRIO_ALLOCATOR, 0);
4329 	mdb_whatis_register("vmem", whatis_run_vmem, NULL,
4330 	    WHATIS_PRIO_ALLOCATOR, 0);
4331 }
4332 
/*
 * Search state handed to whatthread_walk_thread() by ::whatthread.
 */
typedef struct whatthread {
	uintptr_t	wt_target;	/* value to look for in each stack */
	int		wt_verbose;	/* set by -v: report every match */
} whatthread_t;
4337 
4338 static int
whatthread_walk_thread(uintptr_t addr,const kthread_t * t,whatthread_t * w)4339 whatthread_walk_thread(uintptr_t addr, const kthread_t *t, whatthread_t *w)
4340 {
4341 	uintptr_t current, data;
4342 
4343 	if (t->t_stkbase == NULL)
4344 		return (WALK_NEXT);
4345 
4346 	/*
4347 	 * Warn about swapped out threads, but drive on anyway
4348 	 */
4349 	if (!(t->t_schedflag & TS_LOAD)) {
4350 		mdb_warn("thread %p's stack swapped out\n", addr);
4351 		return (WALK_NEXT);
4352 	}
4353 
4354 	/*
4355 	 * Search the thread's stack for the given pointer.  Note that it would
4356 	 * be more efficient to follow ::kgrep's lead and read in page-sized
4357 	 * chunks, but this routine is already fast and simple.
4358 	 */
4359 	for (current = (uintptr_t)t->t_stkbase; current < (uintptr_t)t->t_stk;
4360 	    current += sizeof (uintptr_t)) {
4361 		if (mdb_vread(&data, sizeof (data), current) == -1) {
4362 			mdb_warn("couldn't read thread %p's stack at %p",
4363 			    addr, current);
4364 			return (WALK_ERR);
4365 		}
4366 
4367 		if (data == w->wt_target) {
4368 			if (w->wt_verbose) {
4369 				mdb_printf("%p in thread %p's stack%s\n",
4370 				    current, addr, stack_active(t, current));
4371 			} else {
4372 				mdb_printf("%#lr\n", addr);
4373 				return (WALK_NEXT);
4374 			}
4375 		}
4376 	}
4377 
4378 	return (WALK_NEXT);
4379 }
4380 
4381 int
whatthread(uintptr_t addr,uint_t flags,int argc,const mdb_arg_t * argv)4382 whatthread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4383 {
4384 	whatthread_t w;
4385 
4386 	if (!(flags & DCMD_ADDRSPEC))
4387 		return (DCMD_USAGE);
4388 
4389 	w.wt_verbose = FALSE;
4390 	w.wt_target = addr;
4391 
4392 	if (mdb_getopts(argc, argv,
4393 	    'v', MDB_OPT_SETBITS, TRUE, &w.wt_verbose, NULL) != argc)
4394 		return (DCMD_USAGE);
4395 
4396 	if (mdb_walk("thread", (mdb_walk_cb_t)whatthread_walk_thread, &w)
4397 	    == -1) {
4398 		mdb_warn("couldn't walk threads");
4399 		return (DCMD_ERR);
4400 	}
4401 
4402 	return (DCMD_OK);
4403 }
4404