xref: /illumos-gate/usr/src/cmd/mdb/common/modules/genunix/kmem.c (revision f2ef24df256e7d9e85bd7f89b3b382a964414dbd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2018 Joyent, Inc.  All rights reserved.
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  * Copyright 2025 Oxide Computer Company
30  */
31 
32 #include <mdb/mdb_param.h>
33 #include <mdb/mdb_modapi.h>
34 #include <mdb/mdb_ctf.h>
35 #include <mdb/mdb_whatis.h>
36 #include <sys/cpuvar.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/vmem_impl.h>
39 #include <sys/machelf.h>
40 #include <sys/modctl.h>
41 #include <sys/kobj.h>
42 #include <sys/panic.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <vm/page.h>
46 
47 #include "avl.h"
48 #include "combined.h"
49 #include "dist.h"
50 #include "kmem.h"
51 #include "list.h"
52 
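/*
 * Debug tracing for this module: output is emitted only while
 * mdb_debug_level is nonzero (toggled by kmem_debug() below).  Note the
 * double-parenthesized usage, e.g. dprintf(("walking %p\n", addr)), since
 * the macro expands a single parenthesized argument list.
 */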
53 #define	dprintf(x) if (mdb_debug_level) { \
54 	mdb_printf("kmem debug: ");  \
55 	/*CSTYLED*/\
56 	mdb_printf x ;\
57 }
58 
59 #define	KM_ALLOCATED		0x01
60 #define	KM_FREE			0x02
61 #define	KM_BUFCTL		0x04
62 #define	KM_CONSTRUCTED		0x08	/* only constructed free buffers */
63 #define	KM_HASH			0x10
64 
65 static int mdb_debug_level = 0;
66 
67 /*ARGSUSED*/
68 static int
69 kmem_init_walkers(uintptr_t addr, const kmem_cache_t *c, void *ignored)
70 {
71 	mdb_walker_t w;
72 	char descr[64];
73 
74 	(void) mdb_snprintf(descr, sizeof (descr),
75 	    "walk the %s cache", c->cache_name);
76 
77 	w.walk_name = c->cache_name;
78 	w.walk_descr = descr;
79 	w.walk_init = kmem_walk_init;
80 	w.walk_step = kmem_walk_step;
81 	w.walk_fini = kmem_walk_fini;
82 	w.walk_init_arg = (void *)addr;
83 
84 	if (mdb_add_walker(&w) == -1)
85 		mdb_warn("failed to add %s walker", c->cache_name);
86 
87 	return (WALK_NEXT);
88 }
89 
90 /*ARGSUSED*/
91 int
92 kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
93 {
94 	mdb_debug_level ^= 1;
95 
96 	mdb_printf("kmem: debugging is now %s\n",
97 	    mdb_debug_level ? "on" : "off");
98 
99 	return (DCMD_OK);
100 }
101 
102 int
103 kmem_cache_walk_init(mdb_walk_state_t *wsp)
104 {
105 	GElf_Sym sym;
106 
107 	if (mdb_lookup_by_name("kmem_caches", &sym) == -1) {
108 		mdb_warn("couldn't find kmem_caches");
109 		return (WALK_ERR);
110 	}
111 
112 	wsp->walk_addr = (uintptr_t)sym.st_value;
113 
114 	return (list_walk_init_named(wsp, "cache list", "cache"));
115 }
116 
117 int
118 kmem_cpu_cache_walk_init(mdb_walk_state_t *wsp)
119 {
120 	if (wsp->walk_addr == 0) {
121 		mdb_warn("kmem_cpu_cache doesn't support global walks");
122 		return (WALK_ERR);
123 	}
124 
125 	if (mdb_layered_walk("cpu", wsp) == -1) {
126 		mdb_warn("couldn't walk 'cpu'");
127 		return (WALK_ERR);
128 	}
129 
130 	wsp->walk_data = (void *)wsp->walk_addr;
131 
132 	return (WALK_NEXT);
133 }
134 
135 int
136 kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp)
137 {
138 	uintptr_t caddr = (uintptr_t)wsp->walk_data;
139 	const cpu_t *cpu = wsp->walk_layer;
140 	kmem_cpu_cache_t cc;
141 
142 	caddr += OFFSETOF(kmem_cache_t, cache_cpu[cpu->cpu_seqid]);
143 
144 	if (mdb_vread(&cc, sizeof (kmem_cpu_cache_t), caddr) == -1) {
145 		mdb_warn("couldn't read kmem_cpu_cache at %p", caddr);
146 		return (WALK_ERR);
147 	}
148 
149 	return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata));
150 }
151 
152 static int
153 kmem_slab_check(void *p, uintptr_t saddr, void *arg)
154 {
155 	kmem_slab_t *sp = p;
156 	uintptr_t caddr = (uintptr_t)arg;
157 	if ((uintptr_t)sp->slab_cache != caddr) {
158 		mdb_warn("slab %p isn't in cache %p (in cache %p)\n",
159 		    saddr, caddr, sp->slab_cache);
160 		return (-1);
161 	}
162 
163 	return (0);
164 }
165 
166 static int
167 kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg)
168 {
169 	kmem_slab_t *sp = p;
170 
171 	int rc = kmem_slab_check(p, saddr, arg);
172 	if (rc != 0) {
173 		return (rc);
174 	}
175 
176 	if (!KMEM_SLAB_IS_PARTIAL(sp)) {
177 		mdb_warn("slab %p is not a partial slab\n", saddr);
178 		return (-1);
179 	}
180 
181 	return (0);
182 }
183 
184 static int
185 kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg)
186 {
187 	kmem_slab_t *sp = p;
188 
189 	int rc = kmem_slab_check(p, saddr, arg);
190 	if (rc != 0) {
191 		return (rc);
192 	}
193 
194 	if (!KMEM_SLAB_IS_ALL_USED(sp)) {
195 		mdb_warn("slab %p is not completely allocated\n", saddr);
196 		return (-1);
197 	}
198 
199 	return (0);
200 }
201 
202 typedef struct {
203 	uintptr_t kns_cache_addr;
204 	int kns_nslabs;
205 } kmem_nth_slab_t;
206 
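/*
 * Check callback used by kmem_first_complete_slab_walk_init(): verify that
 * each slab belongs to kns_cache_addr, accept the first kns_nslabs slabs,
 * and then return nonzero so the checked list walk stops.
 */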
207 static int
208 kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg)
209 {
210 	kmem_nth_slab_t *chkp = arg;
211 
212 	int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr);
213 	if (rc != 0) {
214 		return (rc);
215 	}
216 
217 	return (chkp->kns_nslabs-- == 0 ? 1 : 0);
218 }
219 
220 static int
221 kmem_complete_slab_walk_init(mdb_walk_state_t *wsp)
222 {
223 	uintptr_t caddr = wsp->walk_addr;
224 
225 	wsp->walk_addr = (uintptr_t)(caddr +
226 	    offsetof(kmem_cache_t, cache_complete_slabs));
227 
228 	return (list_walk_init_checked(wsp, "slab list", "slab",
229 	    kmem_complete_slab_check, (void *)caddr));
230 }
231 
232 static int
233 kmem_partial_slab_walk_init(mdb_walk_state_t *wsp)
234 {
235 	uintptr_t caddr = wsp->walk_addr;
236 
237 	wsp->walk_addr = (uintptr_t)(caddr +
238 	    offsetof(kmem_cache_t, cache_partial_slabs));
239 
240 	return (avl_walk_init_checked(wsp, "slab list", "slab",
241 	    kmem_partial_slab_check, (void *)caddr));
242 }
243 
244 int
245 kmem_slab_walk_init(mdb_walk_state_t *wsp)
246 {
247 	uintptr_t caddr = wsp->walk_addr;
248 
249 	if (caddr == 0) {
250 		mdb_warn("kmem_slab doesn't support global walks\n");
251 		return (WALK_ERR);
252 	}
253 
254 	combined_walk_init(wsp);
255 	combined_walk_add(wsp,
256 	    kmem_complete_slab_walk_init, list_walk_step, list_walk_fini);
257 	combined_walk_add(wsp,
258 	    kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini);
259 
260 	return (WALK_NEXT);
261 }
262 
263 static int
264 kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp)
265 {
266 	uintptr_t caddr = wsp->walk_addr;
267 	kmem_nth_slab_t *chk;
268 
269 	chk = mdb_alloc(sizeof (kmem_nth_slab_t),
270 	    UM_SLEEP | UM_GC);
271 	chk->kns_cache_addr = caddr;
272 	chk->kns_nslabs = 1;
273 	wsp->walk_addr = (uintptr_t)(caddr +
274 	    offsetof(kmem_cache_t, cache_complete_slabs));
275 
276 	return (list_walk_init_checked(wsp, "slab list", "slab",
277 	    kmem_nth_slab_check, chk));
278 }
279 
280 int
281 kmem_slab_walk_partial_init(mdb_walk_state_t *wsp)
282 {
283 	uintptr_t caddr = wsp->walk_addr;
284 	kmem_cache_t c;
285 
286 	if (caddr == 0) {
287 		mdb_warn("kmem_slab_partial doesn't support global walks\n");
288 		return (WALK_ERR);
289 	}
290 
291 	if (mdb_vread(&c, sizeof (c), caddr) == -1) {
292 		mdb_warn("couldn't read kmem_cache at %p", caddr);
293 		return (WALK_ERR);
294 	}
295 
296 	combined_walk_init(wsp);
297 
298 	/*
299 	 * Some consumers (kmem_walk_step(), in particular) require at
300 	 * least one callback if there are any buffers in the cache.  So
301 	 * if there are *no* partial slabs, report the first full slab, if
302 	 * any.
303 	 *
304 	 * Yes, this is ugly, but it's cleaner than the other possibilities.
305 	 */
306 	if (c.cache_partial_slabs.avl_numnodes == 0) {
307 		combined_walk_add(wsp, kmem_first_complete_slab_walk_init,
308 		    list_walk_step, list_walk_fini);
309 	} else {
310 		combined_walk_add(wsp, kmem_partial_slab_walk_init,
311 		    avl_walk_step, avl_walk_fini);
312 	}
313 
314 	return (WALK_NEXT);
315 }
316 
317 int
318 kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv)
319 {
320 	kmem_cache_t c;
321 	const char *filter = NULL;
322 
323 	if (mdb_getopts(ac, argv,
324 	    'n', MDB_OPT_STR, &filter,
325 	    NULL) != ac) {
326 		return (DCMD_USAGE);
327 	}
328 
329 	if (!(flags & DCMD_ADDRSPEC)) {
330 		if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) {
331 			mdb_warn("can't walk kmem_cache");
332 			return (DCMD_ERR);
333 		}
334 		return (DCMD_OK);
335 	}
336 
337 	if (DCMD_HDRSPEC(flags))
338 		mdb_printf("%-?s %-25s %4s %6s %8s %8s\n", "ADDR", "NAME",
339 		    "FLAG", "CFLAG", "BUFSIZE", "BUFTOTL");
340 
341 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
342 		mdb_warn("couldn't read kmem_cache at %p", addr);
343 		return (DCMD_ERR);
344 	}
345 
346 	if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL))
347 		return (DCMD_OK);
348 
349 	mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name,
350 	    c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal);
351 
352 	return (DCMD_OK);
353 }
354 
355 void
356 kmem_cache_help(void)
357 {
358 	mdb_printf("%s", "Print kernel memory caches.\n\n");
359 	mdb_dec_indent(2);
360 	mdb_printf("%<b>OPTIONS%</b>\n");
361 	mdb_inc_indent(2);
362 	mdb_printf("%s",
363 "  -n name\n"
364 "        name of kmem cache (or matching partial name)\n"
365 "\n"
366 "Column\tDescription\n"
367 "\n"
368 "ADDR\t\taddress of kmem cache\n"
369 "NAME\t\tname of kmem cache\n"
370 "FLAG\t\tvarious cache state flags\n"
371 "CFLAG\t\tcache creation flags\n"
372 "BUFSIZE\tobject size in bytes\n"
373 "BUFTOTL\tcurrent total buffers in cache (allocated and free)\n");
374 }
375 
376 #define	LABEL_WIDTH	11
377 static void
378 kmem_slabs_print_dist(uint_t *ks_bucket, size_t buffers_per_slab,
379     size_t maxbuckets, size_t minbucketsize)
380 {
381 	uint64_t total;
382 	int buckets;
383 	int i;
384 	const int *distarray;
385 	int complete[2];
386 
387 	buckets = buffers_per_slab;
388 
389 	total = 0;
390 	for (i = 0; i <= buffers_per_slab; i++)
391 		total += ks_bucket[i];
392 
393 	if (maxbuckets > 1)
394 		buckets = MIN(buckets, maxbuckets);
395 
396 	if (minbucketsize > 1) {
397 		/*
398 		 * minbucketsize does not apply to the first bucket reserved
399 		 * for completely allocated slabs
400 		 */
401 		buckets = MIN(buckets, 1 + ((buffers_per_slab - 1) /
402 		    minbucketsize));
403 		if ((buckets < 2) && (buffers_per_slab > 1)) {
404 			buckets = 2;
405 			minbucketsize = (buffers_per_slab - 1);
406 		}
407 	}
408 
409 	/*
410 	 * The first printed bucket is reserved for completely allocated slabs.
411 	 * Passing (buckets - 1) excludes that bucket from the generated
412 	 * distribution, since we're handling it as a special case.
413 	 */
414 	complete[0] = buffers_per_slab;
415 	complete[1] = buffers_per_slab + 1;
416 	distarray = dist_linear(buckets - 1, 1, buffers_per_slab - 1);
417 
418 	mdb_printf("%*s\n", LABEL_WIDTH, "Allocated");
419 	dist_print_header("Buffers", LABEL_WIDTH, "Slabs");
420 
421 	dist_print_bucket(complete, 0, ks_bucket, total, LABEL_WIDTH);
422 	/*
423 	 * Print bucket ranges in descending order after the first bucket for
424 	 * completely allocated slabs, so a person can see immediately whether
425 	 * or not there is fragmentation without having to scan possibly
426 	 * multiple screens of output. Starting at (buckets - 2) excludes the
427 	 * extra terminating bucket.
428 	 */
429 	for (i = buckets - 2; i >= 0; i--) {
430 		dist_print_bucket(distarray, i, ks_bucket, total, LABEL_WIDTH);
431 	}
432 	mdb_printf("\n");
433 }
434 #undef LABEL_WIDTH
435 
436 /*ARGSUSED*/
437 static int
438 kmem_first_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab)
439 {
440 	*is_slab = B_TRUE;
441 	return (WALK_DONE);
442 }
443 
444 /*ARGSUSED*/
445 static int
446 kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp,
447     boolean_t *is_slab)
448 {
449 	/*
450 	 * The "kmem_partial_slab" walker reports the first full slab if there
451 	 * are no partial slabs (for the sake of consumers that require at least
452 	 * one callback if there are any buffers in the cache).
453 	 */
454 	*is_slab = KMEM_SLAB_IS_PARTIAL(sp);
455 	return (WALK_DONE);
456 }
457 
458 typedef struct kmem_slab_usage {
459 	int ksu_refcnt;			/* count of allocated buffers on slab */
460 	boolean_t ksu_nomove;		/* slab marked non-reclaimable */
461 } kmem_slab_usage_t;
462 
463 typedef struct kmem_slab_stats {
464 	const kmem_cache_t *ks_cp;
465 	int ks_slabs;			/* slabs in cache */
466 	int ks_partial_slabs;		/* partially allocated slabs in cache */
467 	uint64_t ks_unused_buffers;	/* total unused buffers in cache */
468 	int ks_max_buffers_per_slab;	/* max buffers per slab */
469 	int ks_usage_len;		/* ks_usage array length */
470 	kmem_slab_usage_t *ks_usage;	/* partial slab usage */
471 	uint_t *ks_bucket;		/* slab usage distribution */
472 } kmem_slab_stats_t;
473 
474 /*ARGSUSED*/
475 static int
476 kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp,
477     kmem_slab_stats_t *ks)
478 {
479 	kmem_slab_usage_t *ksu;
480 	long unused;
481 
482 	ks->ks_slabs++;
483 	ks->ks_bucket[sp->slab_refcnt]++;
484 
485 	unused = (sp->slab_chunks - sp->slab_refcnt);
486 	if (unused == 0) {
487 		return (WALK_NEXT);
488 	}
489 
490 	ks->ks_partial_slabs++;
491 	ks->ks_unused_buffers += unused;
492 
493 	if (ks->ks_partial_slabs > ks->ks_usage_len) {
494 		kmem_slab_usage_t *usage;
495 		int len = ks->ks_usage_len;
496 
497 		len = (len == 0 ? 16 : len * 2);
498 		usage = mdb_zalloc(len * sizeof (kmem_slab_usage_t), UM_SLEEP);
499 		if (ks->ks_usage != NULL) {
500 			bcopy(ks->ks_usage, usage,
501 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
502 			mdb_free(ks->ks_usage,
503 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
504 		}
505 		ks->ks_usage = usage;
506 		ks->ks_usage_len = len;
507 	}
508 
509 	ksu = &ks->ks_usage[ks->ks_partial_slabs - 1];
510 	ksu->ksu_refcnt = sp->slab_refcnt;
511 	ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
512 	return (WALK_NEXT);
513 }
514 
515 static void
516 kmem_slabs_header()
517 {
518 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
519 	    "", "", "Partial", "", "Unused", "");
520 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
521 	    "Cache Name", "Slabs", "Slabs", "Buffers", "Buffers", "Waste");
522 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
523 	    "-------------------------", "--------", "--------", "---------",
524 	    "---------", "------");
525 }
526 
527 int
528 kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
529 {
530 	kmem_cache_t c;
531 	kmem_slab_stats_t stats;
532 	mdb_walk_cb_t cb;
533 	int pct;
534 	int tenths_pct;
535 	size_t maxbuckets = 1;
536 	size_t minbucketsize = 0;
537 	const char *filter = NULL;
538 	const char *name = NULL;
539 	uint_t opt_v = FALSE;
540 	boolean_t buckets = B_FALSE;
541 	boolean_t skip = B_FALSE;
542 
543 	if (mdb_getopts(argc, argv,
544 	    'B', MDB_OPT_UINTPTR, &minbucketsize,
545 	    'b', MDB_OPT_UINTPTR, &maxbuckets,
546 	    'n', MDB_OPT_STR, &filter,
547 	    'N', MDB_OPT_STR, &name,
548 	    'v', MDB_OPT_SETBITS, TRUE, &opt_v,
549 	    NULL) != argc) {
550 		return (DCMD_USAGE);
551 	}
552 
553 	if ((maxbuckets != 1) || (minbucketsize != 0)) {
554 		buckets = B_TRUE;
555 	}
556 
557 	if (!(flags & DCMD_ADDRSPEC)) {
558 		if (mdb_walk_dcmd("kmem_cache", "kmem_slabs", argc,
559 		    argv) == -1) {
560 			mdb_warn("can't walk kmem_cache");
561 			return (DCMD_ERR);
562 		}
563 		return (DCMD_OK);
564 	}
565 
566 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
567 		mdb_warn("couldn't read kmem_cache at %p", addr);
568 		return (DCMD_ERR);
569 	}
570 
571 	if (name == NULL) {
572 		skip = ((filter != NULL) &&
573 		    (strstr(c.cache_name, filter) == NULL));
574 	} else if (filter == NULL) {
575 		skip = (strcmp(c.cache_name, name) != 0);
576 	} else {
577 		/* match either -n or -N */
578 		skip = ((strcmp(c.cache_name, name) != 0) &&
579 		    (strstr(c.cache_name, filter) == NULL));
580 	}
581 
582 	if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) {
583 		kmem_slabs_header();
584 	} else if ((opt_v || buckets) && !skip) {
585 		if (DCMD_HDRSPEC(flags)) {
586 			kmem_slabs_header();
587 		} else {
588 			boolean_t is_slab = B_FALSE;
589 			const char *walker_name;
590 			if (opt_v) {
591 				cb = (mdb_walk_cb_t)kmem_first_partial_slab;
592 				walker_name = "kmem_slab_partial";
593 			} else {
594 				cb = (mdb_walk_cb_t)kmem_first_slab;
595 				walker_name = "kmem_slab";
596 			}
597 			(void) mdb_pwalk(walker_name, cb, &is_slab, addr);
598 			if (is_slab) {
599 				kmem_slabs_header();
600 			}
601 		}
602 	}
603 
604 	if (skip) {
605 		return (DCMD_OK);
606 	}
607 
608 	bzero(&stats, sizeof (kmem_slab_stats_t));
609 	stats.ks_cp = &c;
610 	stats.ks_max_buffers_per_slab = c.cache_maxchunks;
611 	/* +1 to include a zero bucket */
612 	stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) *
613 	    sizeof (*stats.ks_bucket), UM_SLEEP);
614 	cb = (mdb_walk_cb_t)kmem_slablist_stat;
615 	(void) mdb_pwalk("kmem_slab", cb, &stats, addr);
616 
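	/*
	 * Express the unused-buffer count as a percentage of the total,
	 * working in hundredths of a percent so that the fractional tenth
	 * can be rounded to the nearest value without floating point.
	 */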
617 	if (c.cache_buftotal == 0) {
618 		pct = 0;
619 		tenths_pct = 0;
620 	} else {
621 		uint64_t n = stats.ks_unused_buffers * 10000;
622 		pct = (int)(n / c.cache_buftotal);
623 		tenths_pct = pct - ((pct / 100) * 100);
624 		tenths_pct = (tenths_pct + 5) / 10; /* round nearest tenth */
625 		if (tenths_pct == 10) {
626 			pct += 100;
627 			tenths_pct = 0;
628 		}
629 	}
630 
631 	pct /= 100;
632 	mdb_printf("%-25s %8d %8d %9lld %9lld %3d.%1d%%\n", c.cache_name,
633 	    stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal,
634 	    stats.ks_unused_buffers, pct, tenths_pct);
635 
636 	if (maxbuckets == 0) {
637 		maxbuckets = stats.ks_max_buffers_per_slab;
638 	}
639 
640 	if (((maxbuckets > 1) || (minbucketsize > 0)) &&
641 	    (stats.ks_slabs > 0)) {
642 		mdb_printf("\n");
643 		kmem_slabs_print_dist(stats.ks_bucket,
644 		    stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize);
645 	}
646 
647 	mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) *
648 	    sizeof (*stats.ks_bucket));
649 
650 	if (!opt_v) {
651 		return (DCMD_OK);
652 	}
653 
654 	if (opt_v && (stats.ks_partial_slabs > 0)) {
655 		int i;
656 		kmem_slab_usage_t *ksu;
657 
658 		mdb_printf("  %d complete (%d), %d partial:",
659 		    (stats.ks_slabs - stats.ks_partial_slabs),
660 		    stats.ks_max_buffers_per_slab,
661 		    stats.ks_partial_slabs);
662 
663 		for (i = 0; i < stats.ks_partial_slabs; i++) {
664 			ksu = &stats.ks_usage[i];
665 			mdb_printf(" %d%s", ksu->ksu_refcnt,
666 			    (ksu->ksu_nomove ? "*" : ""));
667 		}
668 		mdb_printf("\n\n");
669 	}
670 
671 	if (stats.ks_usage_len > 0) {
672 		mdb_free(stats.ks_usage,
673 		    stats.ks_usage_len * sizeof (kmem_slab_usage_t));
674 	}
675 
676 	return (DCMD_OK);
677 }
678 
679 void
680 kmem_slabs_help(void)
681 {
682 	mdb_printf("%s",
683 "Display slab usage per kmem cache.\n\n");
684 	mdb_dec_indent(2);
685 	mdb_printf("%<b>OPTIONS%</b>\n");
686 	mdb_inc_indent(2);
687 	mdb_printf("%s",
688 "  -n name\n"
689 "        name of kmem cache (or matching partial name)\n"
690 "  -N name\n"
691 "        exact name of kmem cache\n"
692 "  -b maxbins\n"
693 "        Print a distribution of allocated buffers per slab using at\n"
694 "        most maxbins bins. The first bin is reserved for completely\n"
695 "        allocated slabs. Setting maxbins to zero (-b 0) has the same\n"
696 "        effect as specifying the maximum allocated buffers per slab\n"
697 "        or setting minbinsize to 1 (-B 1).\n"
698 "  -B minbinsize\n"
699 "        Print a distribution of allocated buffers per slab, making\n"
700 "        all bins (except the first, reserved for completely allocated\n"
701 "        slabs) at least minbinsize buffers apart.\n"
702 "  -v    verbose output: List the allocated buffer count of each partial\n"
703 "        slab on the free list in order from front to back to show how\n"
704 "        closely the slabs are ordered by usage. For example\n"
705 "\n"
706 "          10 complete (8), 3 partial: 7 3 1\n"
707 "\n"
708 "        means there are thirteen slabs with eight buffers each, including\n"
709 "        three partially allocated slabs with less than all eight buffers\n"
710 "        allocated.\n"
711 "\n"
712 "        Buffer allocations are always from the front of the partial slab\n"
713 "        list. When a buffer is freed from a completely used slab, that\n"
714 "        slab is added to the front of the partial slab list. Assuming\n"
715 "        that all buffers are equally likely to be freed soon, the\n"
716 "        desired order of partial slabs is most-used at the front of the\n"
717 "        list and least-used at the back (as in the example above).\n"
718 "        However, if a slab contains an allocated buffer that will not\n"
719 "        soon be freed, it would be better for that slab to be at the\n"
720 "        front where all of its buffers can be allocated. Taking a slab\n"
721 "        off the partial slab list (either with all buffers freed or all\n"
722 "        buffers allocated) reduces cache fragmentation.\n"
723 "\n"
724 "        A slab's allocated buffer count representing a partial slab (9 in\n"
725 "        the example below) may be marked as follows:\n"
726 "\n"
727 "        9*   An asterisk indicates that kmem has marked the slab non-\n"
728 "        reclaimable because the kmem client refused to move one of the\n"
729 "        slab's buffers. Since kmem does not expect to completely free the\n"
730 "        slab, it moves it to the front of the list in the hope of\n"
731 "        completely allocating it instead. A slab marked with an asterisk\n"
732 "        stays marked for as long as it remains on the partial slab list.\n"
733 "\n"
734 "Column\t\tDescription\n"
735 "\n"
736 "Cache Name\t\tname of kmem cache\n"
737 "Slabs\t\t\ttotal slab count\n"
738 "Partial Slabs\t\tcount of partially allocated slabs on the free list\n"
739 "Buffers\t\ttotal buffer count (Slabs * (buffers per slab))\n"
740 "Unused Buffers\tcount of unallocated buffers across all partial slabs\n"
741 "Waste\t\t\t(Unused Buffers / Buffers) does not include space\n"
742 "\t\t\t  for accounting structures (debug mode), slab\n"
743 "\t\t\t  coloring (incremental small offsets to stagger\n"
744 "\t\t\t  buffer alignment), or the per-CPU magazine layer\n");
745 }
746 
747 static int
748 addrcmp(const void *lhs, const void *rhs)
749 {
750 	uintptr_t p1 = *((uintptr_t *)lhs);
751 	uintptr_t p2 = *((uintptr_t *)rhs);
752 
753 	if (p1 < p2)
754 		return (-1);
755 	if (p1 > p2)
756 		return (1);
757 	return (0);
758 }
759 
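/*
 * Compare bufctl audit records by timestamp, newest first, so that a sorted
 * transaction log reports the most recent transactions at the front.
 */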
760 static int
761 bufctlcmp(const kmem_bufctl_audit_t **lhs, const kmem_bufctl_audit_t **rhs)
762 {
763 	const kmem_bufctl_audit_t *bcp1 = *lhs;
764 	const kmem_bufctl_audit_t *bcp2 = *rhs;
765 
766 	if (bcp1->bc_timestamp > bcp2->bc_timestamp)
767 		return (-1);
768 
769 	if (bcp1->bc_timestamp < bcp2->bc_timestamp)
770 		return (1);
771 
772 	return (0);
773 }
774 
775 typedef struct kmem_hash_walk {
776 	uintptr_t *kmhw_table;
777 	size_t kmhw_nelems;
778 	size_t kmhw_pos;
779 	kmem_bufctl_t kmhw_cur;
780 } kmem_hash_walk_t;
781 
782 int
783 kmem_hash_walk_init(mdb_walk_state_t *wsp)
784 {
785 	kmem_hash_walk_t *kmhw;
786 	uintptr_t *hash;
787 	kmem_cache_t c;
788 	uintptr_t haddr, addr = wsp->walk_addr;
789 	size_t nelems;
790 	size_t hsize;
791 
792 	if (addr == 0) {
793 		mdb_warn("kmem_hash doesn't support global walks\n");
794 		return (WALK_ERR);
795 	}
796 
797 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
798 		mdb_warn("couldn't read cache at addr %p", addr);
799 		return (WALK_ERR);
800 	}
801 
802 	if (!(c.cache_flags & KMF_HASH)) {
803 		mdb_warn("cache %p doesn't have a hash table\n", addr);
804 		return (WALK_DONE);		/* nothing to do */
805 	}
806 
807 	kmhw = mdb_zalloc(sizeof (kmem_hash_walk_t), UM_SLEEP);
808 	kmhw->kmhw_cur.bc_next = NULL;
809 	kmhw->kmhw_pos = 0;
810 
811 	kmhw->kmhw_nelems = nelems = c.cache_hash_mask + 1;
812 	hsize = nelems * sizeof (uintptr_t);
813 	haddr = (uintptr_t)c.cache_hash_table;
814 
815 	kmhw->kmhw_table = hash = mdb_alloc(hsize, UM_SLEEP);
816 	if (mdb_vread(hash, hsize, haddr) == -1) {
817 		mdb_warn("failed to read hash table at %p", haddr);
818 		mdb_free(hash, hsize);
819 		mdb_free(kmhw, sizeof (kmem_hash_walk_t));
820 		return (WALK_ERR);
821 	}
822 
823 	wsp->walk_data = kmhw;
824 
825 	return (WALK_NEXT);
826 }
827 
828 int
829 kmem_hash_walk_step(mdb_walk_state_t *wsp)
830 {
831 	kmem_hash_walk_t *kmhw = wsp->walk_data;
832 	uintptr_t addr = 0;
833 
834 	if ((addr = (uintptr_t)kmhw->kmhw_cur.bc_next) == 0) {
835 		while (kmhw->kmhw_pos < kmhw->kmhw_nelems) {
836 			if ((addr = kmhw->kmhw_table[kmhw->kmhw_pos++]) != 0)
837 				break;
838 		}
839 	}
840 	if (addr == 0)
841 		return (WALK_DONE);
842 
843 	if (mdb_vread(&kmhw->kmhw_cur, sizeof (kmem_bufctl_t), addr) == -1) {
844 		mdb_warn("couldn't read kmem_bufctl_t at addr %p", addr);
845 		return (WALK_ERR);
846 	}
847 
848 	return (wsp->walk_callback(addr, &kmhw->kmhw_cur, wsp->walk_cbdata));
849 }
850 
851 void
852 kmem_hash_walk_fini(mdb_walk_state_t *wsp)
853 {
854 	kmem_hash_walk_t *kmhw = wsp->walk_data;
855 
856 	if (kmhw == NULL)
857 		return;
858 
859 	mdb_free(kmhw->kmhw_table, kmhw->kmhw_nelems * sizeof (uintptr_t));
860 	mdb_free(kmhw, sizeof (kmem_hash_walk_t));
861 }
862 
863 /*
864  * Find the address of the bufctl structure for the address 'buf' in cache
865  * 'cp', which is at address caddr, and place it in *out.
866  */
867 static int
868 kmem_hash_lookup(kmem_cache_t *cp, uintptr_t caddr, void *buf, uintptr_t *out)
869 {
870 	uintptr_t bucket = (uintptr_t)KMEM_HASH(cp, buf);
871 	kmem_bufctl_t *bcp;
872 	kmem_bufctl_t bc;
873 
874 	if (mdb_vread(&bcp, sizeof (kmem_bufctl_t *), bucket) == -1) {
875 		mdb_warn("unable to read hash bucket for %p in cache %p",
876 		    buf, caddr);
877 		return (-1);
878 	}
879 
880 	while (bcp != NULL) {
881 		if (mdb_vread(&bc, sizeof (kmem_bufctl_t),
882 		    (uintptr_t)bcp) == -1) {
883 			mdb_warn("unable to read bufctl at %p", bcp);
884 			return (-1);
885 		}
886 		if (bc.bc_addr == buf) {
887 			*out = (uintptr_t)bcp;
888 			return (0);
889 		}
890 		bcp = bc.bc_next;
891 	}
892 
893 	mdb_warn("unable to find bufctl for %p in cache %p\n", buf, caddr);
894 	return (-1);
895 }
896 
897 int
898 kmem_get_magsize(const kmem_cache_t *cp)
899 {
900 	uintptr_t addr = (uintptr_t)cp->cache_magtype;
901 	GElf_Sym mt_sym;
902 	kmem_magtype_t mt;
903 	int res;
904 
905 	/*
906 	 * if cpu 0 has a non-zero magsize, it must be correct.  caches
907 	 * with KMF_NOMAGAZINE have disabled their magazine layers, so
908 	 * it is okay to return 0 for them.
909 	 */
910 	if ((res = cp->cache_cpu[0].cc_magsize) != 0 ||
911 	    (cp->cache_flags & KMF_NOMAGAZINE))
912 		return (res);
913 
914 	if (mdb_lookup_by_name("kmem_magtype", &mt_sym) == -1) {
915 		mdb_warn("unable to read 'kmem_magtype'");
916 	} else if (addr < mt_sym.st_value ||
917 	    addr + sizeof (mt) - 1 > mt_sym.st_value + mt_sym.st_size - 1 ||
918 	    ((addr - mt_sym.st_value) % sizeof (mt)) != 0) {
919 		mdb_warn("cache '%s' has invalid magtype pointer (%p)\n",
920 		    cp->cache_name, addr);
921 		return (0);
922 	}
923 	if (mdb_vread(&mt, sizeof (mt), addr) == -1) {
924 		mdb_warn("unable to read magtype at %a", addr);
925 		return (0);
926 	}
927 	return (mt.mt_magsize);
928 }
929 
930 /*ARGSUSED*/
931 static int
932 kmem_estimate_slab(uintptr_t addr, const kmem_slab_t *sp, size_t *est)
933 {
934 	*est -= (sp->slab_chunks - sp->slab_refcnt);
935 
936 	return (WALK_NEXT);
937 }
938 
939 /*
940  * Returns an upper bound on the number of allocated buffers in a given
941  * cache.
942  */
943 size_t
944 kmem_estimate_allocated(uintptr_t addr, const kmem_cache_t *cp)
945 {
946 	int magsize;
947 	size_t cache_est;
948 
949 	cache_est = cp->cache_buftotal;
950 
951 	(void) mdb_pwalk("kmem_slab_partial",
952 	    (mdb_walk_cb_t)kmem_estimate_slab, &cache_est, addr);
953 
954 	if ((magsize = kmem_get_magsize(cp)) != 0) {
955 		size_t mag_est = cp->cache_full.ml_total * magsize;
956 
957 		if (cache_est >= mag_est) {
958 			cache_est -= mag_est;
959 		} else {
960 			mdb_warn("cache %p's magazine layer holds more buffers "
961 			    "than the slab layer.\n", addr);
962 		}
963 	}
964 	return (cache_est);
965 }
966 
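/*
 * Read the magazine at 'kmp' into the scratch buffer 'mp' and append its
 * first 'rounds' buffer pointers to 'maglist', failing if the magmax
 * fudge-factor cap would be exceeded.
 */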
967 #define	READMAG_ROUNDS(rounds) { \
968 	if (mdb_vread(mp, magbsize, (uintptr_t)kmp) == -1) { \
969 		mdb_warn("couldn't read magazine at %p", kmp); \
970 		goto fail; \
971 	} \
972 	for (i = 0; i < rounds; i++) { \
973 		maglist[magcnt++] = mp->mag_round[i]; \
974 		if (magcnt == magmax) { \
975 			mdb_warn("%d magazines exceeds fudge factor\n", \
976 			    magcnt); \
977 			goto fail; \
978 		} \
979 	} \
980 }
981 
982 int
983 kmem_read_magazines(kmem_cache_t *cp, uintptr_t addr, int ncpus,
984     void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
985 {
986 	kmem_magazine_t *kmp, *mp;
987 	void **maglist = NULL;
988 	int i, cpu;
989 	size_t magsize, magmax, magbsize;
990 	size_t magcnt = 0;
991 
992 	/*
993 	 * Read the magtype out of the cache, after verifying the pointer's
994 	 * correctness.
995 	 */
996 	magsize = kmem_get_magsize(cp);
997 	if (magsize == 0) {
998 		*maglistp = NULL;
999 		*magcntp = 0;
1000 		*magmaxp = 0;
1001 		return (WALK_NEXT);
1002 	}
1003 
1004 	/*
1005 	 * There are several places where we need to go buffer hunting:
1006 	 * the per-CPU loaded magazine, the per-CPU spare full magazine,
1007 	 * and the full magazine list in the depot.
1008 	 *
1009 	 * For an upper bound on the number of buffers in the magazine
1010 	 * layer, we have the number of magazines on the cache_full
1011 	 * list plus at most two magazines per CPU (the loaded and the
1012 	 * spare).  Toss in 100 magazines as a fudge factor in case this
1013 	 * is live (the number "100" comes from the same fudge factor in
1014 	 * crash(8)).
1015 	 */
1016 	magmax = (cp->cache_full.ml_total + 2 * ncpus + 100) * magsize;
1017 	magbsize = offsetof(kmem_magazine_t, mag_round[magsize]);
1018 
1019 	if (magbsize >= PAGESIZE / 2) {
1020 		mdb_warn("magazine size for cache %p unreasonable (%x)\n",
1021 		    addr, magbsize);
1022 		return (WALK_ERR);
1023 	}
1024 
1025 	maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
1026 	mp = mdb_alloc(magbsize, alloc_flags);
1027 	if (mp == NULL || maglist == NULL)
1028 		goto fail;
1029 
1030 	/*
1031 	 * First up: the magazines in the depot (i.e. on the cache_full list).
1032 	 */
1033 	for (kmp = cp->cache_full.ml_list; kmp != NULL; ) {
1034 		READMAG_ROUNDS(magsize);
1035 		kmp = mp->mag_next;
1036 
1037 		if (kmp == cp->cache_full.ml_list)
1038 			break; /* cache_full list loop detected */
1039 	}
1040 
1041 	dprintf(("cache_full list done\n"));
1042 
1043 	/*
1044 	 * Now whip through the CPUs, snagging the loaded magazines
1045 	 * and full spares.
1046 	 *
1047 	 * In order to prevent inconsistent dumps, rounds and prounds
1048 	 * are copied aside before dumping begins.
1049 	 */
1050 	for (cpu = 0; cpu < ncpus; cpu++) {
1051 		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu];
1052 		short rounds, prounds;
1053 
1054 		if (KMEM_DUMPCC(ccp)) {
1055 			rounds = ccp->cc_dump_rounds;
1056 			prounds = ccp->cc_dump_prounds;
1057 		} else {
1058 			rounds = ccp->cc_rounds;
1059 			prounds = ccp->cc_prounds;
1060 		}
1061 
1062 		dprintf(("reading cpu cache %p\n",
1063 		    (uintptr_t)ccp - (uintptr_t)cp + addr));
1064 
1065 		if (rounds > 0 &&
1066 		    (kmp = ccp->cc_loaded) != NULL) {
1067 			dprintf(("reading %d loaded rounds\n", rounds));
1068 			READMAG_ROUNDS(rounds);
1069 		}
1070 
1071 		if (prounds > 0 &&
1072 		    (kmp = ccp->cc_ploaded) != NULL) {
1073 			dprintf(("reading %d previously loaded rounds\n",
1074 			    prounds));
1075 			READMAG_ROUNDS(prounds);
1076 		}
1077 	}
1078 
1079 	dprintf(("magazine layer: %d buffers\n", magcnt));
1080 
1081 	if (!(alloc_flags & UM_GC))
1082 		mdb_free(mp, magbsize);
1083 
1084 	*maglistp = maglist;
1085 	*magcntp = magcnt;
1086 	*magmaxp = magmax;
1087 
1088 	return (WALK_NEXT);
1089 
1090 fail:
1091 	if (!(alloc_flags & UM_GC)) {
1092 		if (mp)
1093 			mdb_free(mp, magbsize);
1094 		if (maglist)
1095 			mdb_free(maglist, magmax * sizeof (void *));
1096 	}
1097 	return (WALK_ERR);
1098 }
1099 
1100 static int
1101 kmem_walk_callback(mdb_walk_state_t *wsp, uintptr_t buf)
1102 {
1103 	return (wsp->walk_callback(buf, NULL, wsp->walk_cbdata));
1104 }
1105 
1106 static int
1107 bufctl_walk_callback(kmem_cache_t *cp, mdb_walk_state_t *wsp, uintptr_t buf)
1108 {
1109 	kmem_bufctl_audit_t b;
1110 
1111 	/*
1112 	 * if KMF_AUDIT is not set, we know that we're looking at a
1113 	 * kmem_bufctl_t.
1114 	 */
1115 	if (!(cp->cache_flags & KMF_AUDIT) ||
1116 	    mdb_vread(&b, sizeof (kmem_bufctl_audit_t), buf) == -1) {
1117 		(void) memset(&b, 0, sizeof (b));
1118 		if (mdb_vread(&b, sizeof (kmem_bufctl_t), buf) == -1) {
1119 			mdb_warn("unable to read bufctl at %p", buf);
1120 			return (WALK_ERR);
1121 		}
1122 	}
1123 
1124 	return (wsp->walk_callback(buf, &b, wsp->walk_cbdata));
1125 }
1126 
1127 typedef struct kmem_walk {
1128 	int kmw_type;
1129 
1130 	uintptr_t kmw_addr;		/* cache address */
1131 	kmem_cache_t *kmw_cp;
1132 	size_t kmw_csize;
1133 
1134 	/*
1135 	 * magazine layer
1136 	 */
1137 	void **kmw_maglist;
1138 	size_t kmw_max;
1139 	size_t kmw_count;
1140 	size_t kmw_pos;
1141 
1142 	/*
1143 	 * slab layer
1144 	 */
1145 	char *kmw_valid;	/* to keep track of freed buffers */
1146 	char *kmw_ubase;	/* buffer for slab data */
1147 } kmem_walk_t;
1148 
1149 static int
1150 kmem_walk_init_common(mdb_walk_state_t *wsp, int type)
1151 {
1152 	kmem_walk_t *kmw;
1153 	int ncpus, csize;
1154 	kmem_cache_t *cp;
1155 	size_t vm_quantum;
1156 
1157 	size_t magmax, magcnt;
1158 	void **maglist = NULL;
1159 	uint_t chunksize = 1, slabsize = 1;
1160 	int status = WALK_ERR;
1161 	uintptr_t addr = wsp->walk_addr;
1162 	const char *layered;
1163 
1164 	type &= ~KM_HASH;
1165 
1166 	if (addr == 0) {
1167 		mdb_warn("kmem walk doesn't support global walks\n");
1168 		return (WALK_ERR);
1169 	}
1170 
1171 	dprintf(("walking %p\n", addr));
1172 
1173 	/*
1174 	 * First we need to figure out how many CPUs are configured in the
1175 	 * system to know how much to slurp out.
1176 	 */
1177 	mdb_readvar(&ncpus, "max_ncpus");
1178 
1179 	csize = KMEM_CACHE_SIZE(ncpus);
1180 	cp = mdb_alloc(csize, UM_SLEEP);
1181 
1182 	if (mdb_vread(cp, csize, addr) == -1) {
1183 		mdb_warn("couldn't read cache at addr %p", addr);
1184 		goto out2;
1185 	}
1186 
1187 	/*
1188 	 * It's easy for someone to hand us an invalid cache address.
1189 	 * Unfortunately, it is hard for this walker to survive an
1190 	 * invalid cache cleanly.  So we make sure that:
1191 	 *
1192 	 *	1. the vmem arena for the cache is readable,
1193 	 *	2. the vmem arena's quantum is a power of 2,
1194 	 *	3. our slabsize is a multiple of the quantum, and
1195 	 *	4. our chunksize is >0 and less than our slabsize.
1196 	 */
1197 	if (mdb_vread(&vm_quantum, sizeof (vm_quantum),
1198 	    (uintptr_t)&cp->cache_arena->vm_quantum) == -1 ||
1199 	    vm_quantum == 0 ||
1200 	    (vm_quantum & (vm_quantum - 1)) != 0 ||
1201 	    cp->cache_slabsize < vm_quantum ||
1202 	    P2PHASE(cp->cache_slabsize, vm_quantum) != 0 ||
1203 	    cp->cache_chunksize == 0 ||
1204 	    cp->cache_chunksize > cp->cache_slabsize) {
1205 		mdb_warn("%p is not a valid kmem_cache_t\n", addr);
1206 		goto out2;
1207 	}
1208 
1209 	dprintf(("buf total is %d\n", cp->cache_buftotal));
1210 
1211 	if (cp->cache_buftotal == 0) {
1212 		mdb_free(cp, csize);
1213 		return (WALK_DONE);
1214 	}
1215 
1216 	/*
1217 	 * If they ask for bufctls, but it's a small-slab cache,
1218 	 * there is nothing to report.
1219 	 */
1220 	if ((type & KM_BUFCTL) && !(cp->cache_flags & KMF_HASH)) {
1221 		dprintf(("bufctl requested, not KMF_HASH (flags: %p)\n",
1222 		    cp->cache_flags));
1223 		mdb_free(cp, csize);
1224 		return (WALK_DONE);
1225 	}
1226 
1227 	/*
1228 	 * If they want constructed buffers, but there's no constructor or
1229 	 * the cache has DEADBEEF checking enabled, there is nothing to report.
1230 	 */
1231 	if ((type & KM_CONSTRUCTED) && (!(type & KM_FREE) ||
1232 	    cp->cache_constructor == NULL ||
1233 	    (cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) == KMF_DEADBEEF)) {
1234 		mdb_free(cp, csize);
1235 		return (WALK_DONE);
1236 	}
1237 
1238 	/*
1239 	 * Read in the contents of the magazine layer
1240 	 */
1241 	if (kmem_read_magazines(cp, addr, ncpus, &maglist, &magcnt,
1242 	    &magmax, UM_SLEEP) == WALK_ERR)
1243 		goto out2;
1244 
1245 	/*
1246 	 * We have all of the buffers from the magazines;  if we are walking
1247 	 * allocated buffers, sort them so we can bsearch them later.
1248 	 */
1249 	if (type & KM_ALLOCATED)
1250 		qsort(maglist, magcnt, sizeof (void *), addrcmp);
1251 
1252 	wsp->walk_data = kmw = mdb_zalloc(sizeof (kmem_walk_t), UM_SLEEP);
1253 
1254 	kmw->kmw_type = type;
1255 	kmw->kmw_addr = addr;
1256 	kmw->kmw_cp = cp;
1257 	kmw->kmw_csize = csize;
1258 	kmw->kmw_maglist = maglist;
1259 	kmw->kmw_max = magmax;
1260 	kmw->kmw_count = magcnt;
1261 	kmw->kmw_pos = 0;
1262 
1263 	/*
1264 	 * When walking allocated buffers in a KMF_HASH cache, we walk the
1265 	 * hash table instead of the slab layer.
1266 	 */
1267 	if ((cp->cache_flags & KMF_HASH) && (type & KM_ALLOCATED)) {
1268 		layered = "kmem_hash";
1269 
1270 		kmw->kmw_type |= KM_HASH;
1271 	} else {
1272 		/*
1273 		 * If we are walking freed buffers, we only need the
1274 		 * magazine layer plus the partially allocated slabs.
1275 		 * To walk allocated buffers, we need all of the slabs.
1276 		 */
1277 		if (type & KM_ALLOCATED)
1278 			layered = "kmem_slab";
1279 		else
1280 			layered = "kmem_slab_partial";
1281 
1282 		/*
1283 		 * for small-slab caches, we read in the entire slab.  For
1284 		 * freed buffers, we can just walk the freelist.  For
1285 		 * allocated buffers, we use a 'valid' array to track
1286 		 * the freed buffers.
1287 		 */
1288 		if (!(cp->cache_flags & KMF_HASH)) {
1289 			chunksize = cp->cache_chunksize;
1290 			slabsize = cp->cache_slabsize;
1291 
1292 			kmw->kmw_ubase = mdb_alloc(slabsize +
1293 			    sizeof (kmem_bufctl_t), UM_SLEEP);
1294 
1295 			if (type & KM_ALLOCATED)
1296 				kmw->kmw_valid =
1297 				    mdb_alloc(slabsize / chunksize, UM_SLEEP);
1298 		}
1299 	}
1300 
1301 	status = WALK_NEXT;
1302 
1303 	if (mdb_layered_walk(layered, wsp) == -1) {
1304 		mdb_warn("unable to start layered '%s' walk", layered);
1305 		status = WALK_ERR;
1306 	}
1307 
1308 out1:
1309 	if (status == WALK_ERR) {
1310 		if (kmw->kmw_valid)
1311 			mdb_free(kmw->kmw_valid, slabsize / chunksize);
1312 
1313 		if (kmw->kmw_ubase)
1314 			mdb_free(kmw->kmw_ubase, slabsize +
1315 			    sizeof (kmem_bufctl_t));
1316 
1317 		if (kmw->kmw_maglist)
1318 			mdb_free(kmw->kmw_maglist,
1319 			    kmw->kmw_max * sizeof (uintptr_t));
1320 
1321 		mdb_free(kmw, sizeof (kmem_walk_t));
1322 		wsp->walk_data = NULL;
1323 	}
1324 
1325 out2:
1326 	if (status == WALK_ERR)
1327 		mdb_free(cp, csize);
1328 
1329 	return (status);
1330 }
1331 
1332 int
1333 kmem_walk_step(mdb_walk_state_t *wsp)
1334 {
1335 	kmem_walk_t *kmw = wsp->walk_data;
1336 	int type = kmw->kmw_type;
1337 	kmem_cache_t *cp = kmw->kmw_cp;
1338 
1339 	void **maglist = kmw->kmw_maglist;
1340 	int magcnt = kmw->kmw_count;
1341 
1342 	uintptr_t chunksize, slabsize;
1343 	uintptr_t addr;
1344 	const kmem_slab_t *sp;
1345 	const kmem_bufctl_t *bcp;
1346 	kmem_bufctl_t bc;
1347 
1348 	int chunks;
1349 	char *kbase;
1350 	void *buf;
1351 	int i, ret;
1352 
1353 	char *valid, *ubase;
1354 
1355 	/*
1356 	 * first, handle the 'kmem_hash' layered walk case
1357 	 */
1358 	if (type & KM_HASH) {
1359 		/*
1360 		 * We have a buffer which has been allocated out of the
1361 		 * global layer. We need to make sure that it's not
1362 		 * actually sitting in a magazine before we report it as
1363 		 * an allocated buffer.
1364 		 */
1365 		buf = ((const kmem_bufctl_t *)wsp->walk_layer)->bc_addr;
1366 
1367 		if (magcnt > 0 &&
1368 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1369 		    addrcmp) != NULL)
1370 			return (WALK_NEXT);
1371 
1372 		if (type & KM_BUFCTL)
1373 			return (bufctl_walk_callback(cp, wsp, wsp->walk_addr));
1374 
1375 		return (kmem_walk_callback(wsp, (uintptr_t)buf));
1376 	}
1377 
1378 	ret = WALK_NEXT;
1379 
1380 	addr = kmw->kmw_addr;
1381 
1382 	/*
1383 	 * If we're walking freed buffers, report everything in the
1384 	 * magazine layer before processing the first slab.
1385 	 */
1386 	if ((type & KM_FREE) && magcnt != 0) {
1387 		kmw->kmw_count = 0;		/* only do this once */
1388 		for (i = 0; i < magcnt; i++) {
1389 			buf = maglist[i];
1390 
1391 			if (type & KM_BUFCTL) {
1392 				uintptr_t out;
1393 
1394 				if (cp->cache_flags & KMF_BUFTAG) {
1395 					kmem_buftag_t *btp;
1396 					kmem_buftag_t tag;
1397 
1398 					/* LINTED - alignment */
1399 					btp = KMEM_BUFTAG(cp, buf);
1400 					if (mdb_vread(&tag, sizeof (tag),
1401 					    (uintptr_t)btp) == -1) {
1402 						mdb_warn("reading buftag for "
1403 						    "%p at %p", buf, btp);
1404 						continue;
1405 					}
1406 					out = (uintptr_t)tag.bt_bufctl;
1407 				} else {
1408 					if (kmem_hash_lookup(cp, addr, buf,
1409 					    &out) == -1)
1410 						continue;
1411 				}
1412 				ret = bufctl_walk_callback(cp, wsp, out);
1413 			} else {
1414 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1415 			}
1416 
1417 			if (ret != WALK_NEXT)
1418 				return (ret);
1419 		}
1420 	}
1421 
1422 	/*
1423 	 * If they want constructed buffers, we're finished, since the
1424 	 * magazine layer holds them all.
1425 	 */
1426 	if (type & KM_CONSTRUCTED)
1427 		return (WALK_DONE);
1428 
1429 	/*
1430 	 * Handle the buffers in the current slab
1431 	 */
1432 	chunksize = cp->cache_chunksize;
1433 	slabsize = cp->cache_slabsize;
1434 
1435 	sp = wsp->walk_layer;
1436 	chunks = sp->slab_chunks;
1437 	kbase = sp->slab_base;
1438 
1439 	dprintf(("kbase is %p\n", kbase));
1440 
1441 	if (!(cp->cache_flags & KMF_HASH)) {
1442 		valid = kmw->kmw_valid;
1443 		ubase = kmw->kmw_ubase;
1444 
1445 		if (mdb_vread(ubase, chunks * chunksize,
1446 		    (uintptr_t)kbase) == -1) {
1447 			mdb_warn("failed to read slab contents at %p", kbase);
1448 			return (WALK_ERR);
1449 		}
1450 
1451 		/*
1452 		 * Set up the valid map as fully allocated -- we'll punch
1453 		 * out the freelist.
1454 		 */
1455 		if (type & KM_ALLOCATED)
1456 			(void) memset(valid, 1, chunks);
1457 	} else {
1458 		valid = NULL;
1459 		ubase = NULL;
1460 	}
1461 
1462 	/*
1463 	 * walk the slab's freelist
1464 	 */
1465 	bcp = sp->slab_head;
1466 
1467 	dprintf(("refcnt is %d; chunks is %d\n", sp->slab_refcnt, chunks));
1468 
1469 	/*
1470 	 * since we could be in the middle of allocating a buffer,
1471 	 * our refcnt could be one higher than it ought to be.  So we
1472 	 * check one further on the freelist than the count allows.
1473 	 */
1474 	for (i = sp->slab_refcnt; i <= chunks; i++) {
1475 		uint_t ndx;
1476 
1477 		dprintf(("bcp is %p\n", bcp));
1478 
1479 		if (bcp == NULL) {
1480 			if (i == chunks)
1481 				break;
1482 			mdb_warn(
1483 			    "slab %p in cache %p freelist too short by %d\n",
1484 			    sp, addr, chunks - i);
1485 			break;
1486 		}
1487 
1488 		if (cp->cache_flags & KMF_HASH) {
1489 			if (mdb_vread(&bc, sizeof (bc), (uintptr_t)bcp) == -1) {
1490 				mdb_warn("failed to read bufctl ptr at %p",
1491 				    bcp);
1492 				break;
1493 			}
1494 			buf = bc.bc_addr;
1495 		} else {
1496 			/*
1497 			 * Otherwise the buffer is (or should be) in the slab
1498 			 * that we've read in; determine its offset in the
1499 			 * slab, validate that it's not corrupt, and add to
1500 	 * our base address to find the kmem_bufctl_t.  (Note
1501 			 * that we don't need to add the size of the bufctl
1502 			 * to our offset calculation because of the slop that's
1503 			 * allocated for the buffer at ubase.)
1504 			 */
1505 			uintptr_t offs = (uintptr_t)bcp - (uintptr_t)kbase;
1506 
1507 			if (offs > chunks * chunksize) {
1508 				mdb_warn("found corrupt bufctl ptr %p"
1509 				    " in slab %p in cache %p\n", bcp,
1510 				    wsp->walk_addr, addr);
1511 				break;
1512 			}
1513 
1514 			bc = *((kmem_bufctl_t *)((uintptr_t)ubase + offs));
1515 			buf = KMEM_BUF(cp, bcp);
1516 		}
1517 
1518 		ndx = ((uintptr_t)buf - (uintptr_t)kbase) / chunksize;
1519 
1520 		if (ndx > slabsize / cp->cache_bufsize) {
1521 			/*
1522 			 * This is very wrong; we have managed to find
1523 			 * a buffer in the slab which shouldn't
1524 			 * actually be here.  Emit a warning, and
1525 			 * try to continue.
1526 			 */
1527 			mdb_warn("buf %p is out of range for "
1528 			    "slab %p, cache %p\n", buf, sp, addr);
1529 		} else if (type & KM_ALLOCATED) {
1530 			/*
1531 			 * we have found a buffer on the slab's freelist;
1532 			 * clear its entry
1533 			 */
1534 			valid[ndx] = 0;
1535 		} else {
1536 			/*
1537 			 * Report this freed buffer
1538 			 */
1539 			if (type & KM_BUFCTL) {
1540 				ret = bufctl_walk_callback(cp, wsp,
1541 				    (uintptr_t)bcp);
1542 			} else {
1543 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1544 			}
1545 			if (ret != WALK_NEXT)
1546 				return (ret);
1547 		}
1548 
1549 		bcp = bc.bc_next;
1550 	}
1551 
1552 	if (bcp != NULL) {
1553 		dprintf(("slab %p in cache %p freelist too long (%p)\n",
1554 		    sp, addr, bcp));
1555 	}
1556 
1557 	/*
1558 	 * If we are walking freed buffers, the loop above handled reporting
1559 	 * them.
1560 	 */
1561 	if (type & KM_FREE)
1562 		return (WALK_NEXT);
1563 
1564 	if (type & KM_BUFCTL) {
1565 		mdb_warn("impossible situation: small-slab KM_BUFCTL walk for "
1566 		    "cache %p\n", addr);
1567 		return (WALK_ERR);
1568 	}
1569 
1570 	/*
1571 	 * Report allocated buffers, skipping buffers in the magazine layer.
1572 	 * We only get this far for small-slab caches.
1573 	 */
1574 	for (i = 0; ret == WALK_NEXT && i < chunks; i++) {
1575 		buf = (char *)kbase + i * chunksize;
1576 
1577 		if (!valid[i])
1578 			continue;		/* on slab freelist */
1579 
1580 		if (magcnt > 0 &&
1581 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1582 		    addrcmp) != NULL)
1583 			continue;		/* in magazine layer */
1584 
1585 		ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1586 	}
1587 	return (ret);
1588 }
1589 
1590 void
1591 kmem_walk_fini(mdb_walk_state_t *wsp)
1592 {
1593 	kmem_walk_t *kmw = wsp->walk_data;
1594 	uintptr_t chunksize;
1595 	uintptr_t slabsize;
1596 
1597 	if (kmw == NULL)
1598 		return;
1599 
1600 	if (kmw->kmw_maglist != NULL)
1601 		mdb_free(kmw->kmw_maglist, kmw->kmw_max * sizeof (void *));
1602 
1603 	chunksize = kmw->kmw_cp->cache_chunksize;
1604 	slabsize = kmw->kmw_cp->cache_slabsize;
1605 
1606 	if (kmw->kmw_valid != NULL)
1607 		mdb_free(kmw->kmw_valid, slabsize / chunksize);
1608 	if (kmw->kmw_ubase != NULL)
1609 		mdb_free(kmw->kmw_ubase, slabsize + sizeof (kmem_bufctl_t));
1610 
1611 	mdb_free(kmw->kmw_cp, kmw->kmw_csize);
1612 	mdb_free(kmw, sizeof (kmem_walk_t));
1613 }
1614 
1615 /*ARGSUSED*/
1616 static int
1617 kmem_walk_all(uintptr_t addr, const kmem_cache_t *c, mdb_walk_state_t *wsp)
1618 {
1619 	/*
1620 	 * Buffers allocated from NOTOUCH caches can also show up as freed
1621 	 * memory in other caches.  This can be a little confusing, so we
1622 	 * don't walk NOTOUCH caches when walking all caches (thereby assuring
1623 	 * that "::walk kmem" and "::walk freemem" yield disjoint output).
1624 	 */
1625 	if (c->cache_cflags & KMC_NOTOUCH)
1626 		return (WALK_NEXT);
1627 
1628 	if (mdb_pwalk(wsp->walk_data, wsp->walk_callback,
1629 	    wsp->walk_cbdata, addr) == -1)
1630 		return (WALK_DONE);
1631 
1632 	return (WALK_NEXT);
1633 }
1634 
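/*
 * KMEM_WALK_ALL: a walk that was started without an address is restarted
 * as a walk of 'name' across every kmem cache, via kmem_walk_all() above.
 */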
1635 #define	KMEM_WALK_ALL(name, wsp) { \
1636 	wsp->walk_data = (name); \
1637 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_walk_all, wsp) == -1) \
1638 		return (WALK_ERR); \
1639 	return (WALK_DONE); \
1640 }
1641 
1642 int
1643 kmem_walk_init(mdb_walk_state_t *wsp)
1644 {
1645 	if (wsp->walk_arg != NULL)
1646 		wsp->walk_addr = (uintptr_t)wsp->walk_arg;
1647 
1648 	if (wsp->walk_addr == 0)
1649 		KMEM_WALK_ALL("kmem", wsp);
1650 	return (kmem_walk_init_common(wsp, KM_ALLOCATED));
1651 }
1652 
1653 int
1654 bufctl_walk_init(mdb_walk_state_t *wsp)
1655 {
1656 	if (wsp->walk_addr == 0)
1657 		KMEM_WALK_ALL("bufctl", wsp);
1658 	return (kmem_walk_init_common(wsp, KM_ALLOCATED | KM_BUFCTL));
1659 }
1660 
1661 int
1662 freemem_walk_init(mdb_walk_state_t *wsp)
1663 {
1664 	if (wsp->walk_addr == 0)
1665 		KMEM_WALK_ALL("freemem", wsp);
1666 	return (kmem_walk_init_common(wsp, KM_FREE));
1667 }
1668 
1669 int
1670 freemem_constructed_walk_init(mdb_walk_state_t *wsp)
1671 {
1672 	if (wsp->walk_addr == 0)
1673 		KMEM_WALK_ALL("freemem_constructed", wsp);
1674 	return (kmem_walk_init_common(wsp, KM_FREE | KM_CONSTRUCTED));
1675 }
1676 
1677 int
1678 freectl_walk_init(mdb_walk_state_t *wsp)
1679 {
1680 	if (wsp->walk_addr == 0)
1681 		KMEM_WALK_ALL("freectl", wsp);
1682 	return (kmem_walk_init_common(wsp, KM_FREE | KM_BUFCTL));
1683 }
1684 
1685 int
1686 freectl_constructed_walk_init(mdb_walk_state_t *wsp)
1687 {
1688 	if (wsp->walk_addr == 0)
1689 		KMEM_WALK_ALL("freectl_constructed", wsp);
1690 	return (kmem_walk_init_common(wsp,
1691 	    KM_FREE | KM_BUFCTL | KM_CONSTRUCTED));
1692 }
1693 
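/*
 * The bufctl_history walk starts from a bufctl and follows bc_lastlog
 * pointers backward through the transaction log, reporting older entries
 * for the same buffer, cache, and slab until the chain ends or the
 * timestamps stop decreasing.
 */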
1694 typedef struct bufctl_history_walk {
1695 	void		*bhw_next;
1696 	kmem_cache_t	*bhw_cache;
1697 	kmem_slab_t	*bhw_slab;
1698 	hrtime_t	bhw_timestamp;
1699 } bufctl_history_walk_t;
1700 
1701 int
1702 bufctl_history_walk_init(mdb_walk_state_t *wsp)
1703 {
1704 	bufctl_history_walk_t *bhw;
1705 	kmem_bufctl_audit_t bc;
1706 	kmem_bufctl_audit_t bcn;
1707 
1708 	if (wsp->walk_addr == 0) {
1709 		mdb_warn("bufctl_history walk doesn't support global walks\n");
1710 		return (WALK_ERR);
1711 	}
1712 
1713 	if (mdb_vread(&bc, sizeof (bc), wsp->walk_addr) == -1) {
1714 		mdb_warn("unable to read bufctl at %p", wsp->walk_addr);
1715 		return (WALK_ERR);
1716 	}
1717 
1718 	bhw = mdb_zalloc(sizeof (*bhw), UM_SLEEP);
1719 	bhw->bhw_timestamp = 0;
1720 	bhw->bhw_cache = bc.bc_cache;
1721 	bhw->bhw_slab = bc.bc_slab;
1722 
1723 	/*
1724 	 * sometimes the first log entry matches the base bufctl;  in that
1725 	 * case, skip the base bufctl.
1726 	 */
1727 	if (bc.bc_lastlog != NULL &&
1728 	    mdb_vread(&bcn, sizeof (bcn), (uintptr_t)bc.bc_lastlog) != -1 &&
1729 	    bc.bc_addr == bcn.bc_addr &&
1730 	    bc.bc_cache == bcn.bc_cache &&
1731 	    bc.bc_slab == bcn.bc_slab &&
1732 	    bc.bc_timestamp == bcn.bc_timestamp &&
1733 	    bc.bc_thread == bcn.bc_thread)
1734 		bhw->bhw_next = bc.bc_lastlog;
1735 	else
1736 		bhw->bhw_next = (void *)wsp->walk_addr;
1737 
1738 	wsp->walk_addr = (uintptr_t)bc.bc_addr;
1739 	wsp->walk_data = bhw;
1740 
1741 	return (WALK_NEXT);
1742 }
1743 
1744 int
1745 bufctl_history_walk_step(mdb_walk_state_t *wsp)
1746 {
1747 	bufctl_history_walk_t *bhw = wsp->walk_data;
1748 	uintptr_t addr = (uintptr_t)bhw->bhw_next;
1749 	uintptr_t baseaddr = wsp->walk_addr;
1750 	kmem_bufctl_audit_t bc;
1751 
1752 	if (addr == 0)
1753 		return (WALK_DONE);
1754 
1755 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1756 		mdb_warn("unable to read bufctl at %p", bhw->bhw_next);
1757 		return (WALK_ERR);
1758 	}
1759 
1760 	/*
1761 	 * The bufctl is only valid if the address, cache, and slab are
1762 	 * correct.  We also check that the timestamp is decreasing, to
1763 	 * prevent infinite loops.
1764 	 */
1765 	if ((uintptr_t)bc.bc_addr != baseaddr ||
1766 	    bc.bc_cache != bhw->bhw_cache ||
1767 	    bc.bc_slab != bhw->bhw_slab ||
1768 	    (bhw->bhw_timestamp != 0 && bc.bc_timestamp >= bhw->bhw_timestamp))
1769 		return (WALK_DONE);
1770 
1771 	bhw->bhw_next = bc.bc_lastlog;
1772 	bhw->bhw_timestamp = bc.bc_timestamp;
1773 
1774 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1775 }
1776 
1777 void
1778 bufctl_history_walk_fini(mdb_walk_state_t *wsp)
1779 {
1780 	bufctl_history_walk_t *bhw = wsp->walk_data;
1781 
1782 	mdb_free(bhw, sizeof (*bhw));
1783 }
1784 
1785 typedef struct kmem_log_walk {
1786 	kmem_bufctl_audit_t *klw_base;
1787 	kmem_bufctl_audit_t **klw_sorted;
1788 	kmem_log_header_t klw_lh;
1789 	size_t klw_size;
1790 	size_t klw_maxndx;
1791 	size_t klw_ndx;
1792 } kmem_log_walk_t;
1793 
1794 int
1795 kmem_log_walk_init(mdb_walk_state_t *wsp)
1796 {
1797 	uintptr_t lp = wsp->walk_addr;
1798 	kmem_log_walk_t *klw;
1799 	kmem_log_header_t *lhp;
1800 	int maxndx, i, j, k;
1801 
1802 	/*
1803 	 * By default (global walk), walk the kmem_transaction_log.  Otherwise
1804 	 * read the log whose kmem_log_header_t is stored at walk_addr.
1805 	 */
1806 	if (lp == 0 && mdb_readvar(&lp, "kmem_transaction_log") == -1) {
1807 		mdb_warn("failed to read 'kmem_transaction_log'");
1808 		return (WALK_ERR);
1809 	}
1810 
1811 	if (lp == 0) {
1812 		mdb_warn("log is disabled\n");
1813 		return (WALK_ERR);
1814 	}
1815 
1816 	klw = mdb_zalloc(sizeof (kmem_log_walk_t), UM_SLEEP);
1817 	lhp = &klw->klw_lh;
1818 
1819 	if (mdb_vread(lhp, sizeof (kmem_log_header_t), lp) == -1) {
1820 		mdb_warn("failed to read log header at %p", lp);
1821 		mdb_free(klw, sizeof (kmem_log_walk_t));
1822 		return (WALK_ERR);
1823 	}
1824 
1825 	klw->klw_size = lhp->lh_chunksize * lhp->lh_nchunks;
1826 	klw->klw_base = mdb_alloc(klw->klw_size, UM_SLEEP);
1827 	maxndx = lhp->lh_chunksize / sizeof (kmem_bufctl_audit_t) - 1;
1828 
1829 	if (mdb_vread(klw->klw_base, klw->klw_size,
1830 	    (uintptr_t)lhp->lh_base) == -1) {
1831 		mdb_warn("failed to read log at base %p", lhp->lh_base);
1832 		mdb_free(klw->klw_base, klw->klw_size);
1833 		mdb_free(klw, sizeof (kmem_log_walk_t));
1834 		return (WALK_ERR);
1835 	}
1836 
1837 	klw->klw_sorted = mdb_alloc(maxndx * lhp->lh_nchunks *
1838 	    sizeof (kmem_bufctl_audit_t *), UM_SLEEP);
1839 
1840 	for (i = 0, k = 0; i < lhp->lh_nchunks; i++) {
1841 		kmem_bufctl_audit_t *chunk = (kmem_bufctl_audit_t *)
1842 		    ((uintptr_t)klw->klw_base + i * lhp->lh_chunksize);
1843 
1844 		for (j = 0; j < maxndx; j++)
1845 			klw->klw_sorted[k++] = &chunk[j];
1846 	}
1847 
1848 	qsort(klw->klw_sorted, k, sizeof (kmem_bufctl_audit_t *),
1849 	    (int(*)(const void *, const void *))bufctlcmp);
1850 
1851 	klw->klw_maxndx = k;
1852 	wsp->walk_data = klw;
1853 
1854 	return (WALK_NEXT);
1855 }
1856 
1857 int
1858 kmem_log_walk_step(mdb_walk_state_t *wsp)
1859 {
1860 	kmem_log_walk_t *klw = wsp->walk_data;
1861 	kmem_bufctl_audit_t *bcp;
1862 
1863 	if (klw->klw_ndx == klw->klw_maxndx)
1864 		return (WALK_DONE);
1865 
1866 	bcp = klw->klw_sorted[klw->klw_ndx++];
1867 
1868 	return (wsp->walk_callback((uintptr_t)bcp - (uintptr_t)klw->klw_base +
1869 	    (uintptr_t)klw->klw_lh.lh_base, bcp, wsp->walk_cbdata));
1870 }
1871 
1872 void
1873 kmem_log_walk_fini(mdb_walk_state_t *wsp)
1874 {
1875 	kmem_log_walk_t *klw = wsp->walk_data;
1876 
1877 	mdb_free(klw->klw_base, klw->klw_size);
1878 	mdb_free(klw->klw_sorted, klw->klw_maxndx *
1879 	    sizeof (kmem_bufctl_audit_t *));
1880 	mdb_free(klw, sizeof (kmem_log_walk_t));
1881 }
1882 
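/*
 * Illustrative sketch (assumes auditing was enabled so that
 * kmem_transaction_log exists): the kmem_log walker hands each sorted log
 * entry to its callback, so it composes naturally with ::bufctl filtering.
 * The thread address below is hypothetical:
 *
 *	> ::walk kmem_log | ::bufctl -t 0xffffff01cc4e8c20
 */
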
1883 typedef struct allocdby_bufctl {
1884 	uintptr_t abb_addr;
1885 	hrtime_t abb_ts;
1886 } allocdby_bufctl_t;
1887 
1888 typedef struct allocdby_walk {
1889 	const char *abw_walk;
1890 	uintptr_t abw_thread;
1891 	size_t abw_nbufs;
1892 	size_t abw_size;
1893 	allocdby_bufctl_t *abw_buf;
1894 	size_t abw_ndx;
1895 } allocdby_walk_t;
1896 
1897 int
1898 allocdby_walk_bufctl(uintptr_t addr, const kmem_bufctl_audit_t *bcp,
1899     allocdby_walk_t *abw)
1900 {
1901 	if ((uintptr_t)bcp->bc_thread != abw->abw_thread)
1902 		return (WALK_NEXT);
1903 
1904 	if (abw->abw_nbufs == abw->abw_size) {
1905 		allocdby_bufctl_t *buf;
1906 		size_t oldsize = sizeof (allocdby_bufctl_t) * abw->abw_size;
1907 
1908 		buf = mdb_zalloc(oldsize << 1, UM_SLEEP);
1909 
1910 		bcopy(abw->abw_buf, buf, oldsize);
1911 		mdb_free(abw->abw_buf, oldsize);
1912 
1913 		abw->abw_size <<= 1;
1914 		abw->abw_buf = buf;
1915 	}
1916 
1917 	abw->abw_buf[abw->abw_nbufs].abb_addr = addr;
1918 	abw->abw_buf[abw->abw_nbufs].abb_ts = bcp->bc_timestamp;
1919 	abw->abw_nbufs++;
1920 
1921 	return (WALK_NEXT);
1922 }
1923 
1924 /*ARGSUSED*/
1925 int
1926 allocdby_walk_cache(uintptr_t addr, const kmem_cache_t *c, allocdby_walk_t *abw)
1927 {
1928 	if (mdb_pwalk(abw->abw_walk, (mdb_walk_cb_t)allocdby_walk_bufctl,
1929 	    abw, addr) == -1) {
1930 		mdb_warn("couldn't walk bufctl for cache %p", addr);
1931 		return (WALK_DONE);
1932 	}
1933 
1934 	return (WALK_NEXT);
1935 }
1936 
1937 static int
1938 allocdby_cmp(const allocdby_bufctl_t *lhs, const allocdby_bufctl_t *rhs)
1939 {
1940 	if (lhs->abb_ts < rhs->abb_ts)
1941 		return (1);
1942 	if (lhs->abb_ts > rhs->abb_ts)
1943 		return (-1);
1944 	return (0);
1945 }
1946 
1947 static int
1948 allocdby_walk_init_common(mdb_walk_state_t *wsp, const char *walk)
1949 {
1950 	allocdby_walk_t *abw;
1951 
1952 	if (wsp->walk_addr == 0) {
1953 		mdb_warn("allocdby walk doesn't support global walks\n");
1954 		return (WALK_ERR);
1955 	}
1956 
1957 	abw = mdb_zalloc(sizeof (allocdby_walk_t), UM_SLEEP);
1958 
1959 	abw->abw_thread = wsp->walk_addr;
1960 	abw->abw_walk = walk;
1961 	abw->abw_size = 128;	/* something reasonable */
1962 	abw->abw_buf =
1963 	    mdb_zalloc(abw->abw_size * sizeof (allocdby_bufctl_t), UM_SLEEP);
1964 
1965 	wsp->walk_data = abw;
1966 
1967 	if (mdb_walk("kmem_cache",
1968 	    (mdb_walk_cb_t)allocdby_walk_cache, abw) == -1) {
1969 		mdb_warn("couldn't walk kmem_cache");
1970 		allocdby_walk_fini(wsp);
1971 		return (WALK_ERR);
1972 	}
1973 
1974 	qsort(abw->abw_buf, abw->abw_nbufs, sizeof (allocdby_bufctl_t),
1975 	    (int(*)(const void *, const void *))allocdby_cmp);
1976 
1977 	return (WALK_NEXT);
1978 }
1979 
1980 int
1981 allocdby_walk_init(mdb_walk_state_t *wsp)
1982 {
1983 	return (allocdby_walk_init_common(wsp, "bufctl"));
1984 }
1985 
1986 int
1987 freedby_walk_init(mdb_walk_state_t *wsp)
1988 {
1989 	return (allocdby_walk_init_common(wsp, "freectl"));
1990 }
1991 
1992 int
1993 allocdby_walk_step(mdb_walk_state_t *wsp)
1994 {
1995 	allocdby_walk_t *abw = wsp->walk_data;
1996 	kmem_bufctl_audit_t bc;
1997 	uintptr_t addr;
1998 
1999 	if (abw->abw_ndx == abw->abw_nbufs)
2000 		return (WALK_DONE);
2001 
2002 	addr = abw->abw_buf[abw->abw_ndx++].abb_addr;
2003 
2004 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2005 		mdb_warn("couldn't read bufctl at %p", addr);
2006 		return (WALK_DONE);
2007 	}
2008 
2009 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
2010 }
2011 
2012 void
2013 allocdby_walk_fini(mdb_walk_state_t *wsp)
2014 {
2015 	allocdby_walk_t *abw = wsp->walk_data;
2016 
2017 	mdb_free(abw->abw_buf, sizeof (allocdby_bufctl_t) * abw->abw_size);
2018 	mdb_free(abw, sizeof (allocdby_walk_t));
2019 }
2020 
2021 /*ARGSUSED*/
2022 int
2023 allocdby_walk(uintptr_t addr, const kmem_bufctl_audit_t *bcp, void *ignored)
2024 {
2025 	char c[MDB_SYM_NAMLEN];
2026 	GElf_Sym sym;
2027 	int i;
2028 
2029 	mdb_printf("%0?p %12llx ", addr, bcp->bc_timestamp);
2030 	for (i = 0; i < bcp->bc_depth; i++) {
2031 		if (mdb_lookup_by_addr(bcp->bc_stack[i],
2032 		    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2033 			continue;
2034 		if (strncmp(c, "kmem_", 5) == 0)
2035 			continue;
2036 		mdb_printf("%s+0x%lx",
2037 		    c, bcp->bc_stack[i] - (uintptr_t)sym.st_value);
2038 		break;
2039 	}
2040 	mdb_printf("\n");
2041 
2042 	return (WALK_NEXT);
2043 }
2044 
2045 static int
2046 allocdby_common(uintptr_t addr, uint_t flags, const char *w)
2047 {
2048 	if (!(flags & DCMD_ADDRSPEC))
2049 		return (DCMD_USAGE);
2050 
2051 	mdb_printf("%-?s %12s %s\n", "BUFCTL", "TIMESTAMP", "CALLER");
2052 
2053 	if (mdb_pwalk(w, (mdb_walk_cb_t)allocdby_walk, NULL, addr) == -1) {
2054 		mdb_warn("can't walk '%s' for %p", w, addr);
2055 		return (DCMD_ERR);
2056 	}
2057 
2058 	return (DCMD_OK);
2059 }
2060 
2061 /*ARGSUSED*/
2062 int
2063 allocdby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2064 {
2065 	return (allocdby_common(addr, flags, "allocdby"));
2066 }
2067 
2068 /*ARGSUSED*/
2069 int
2070 freedby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2071 {
2072 	return (allocdby_common(addr, flags, "freedby"));
2073 }
2074 
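/*
 * Illustrative sketch: ::allocdby and ::freedby take a kthread_t address
 * and list, most recent first, the audit bufctls for buffers that thread
 * allocated or freed.  The thread address below is hypothetical:
 *
 *	> 0xffffff01cc4e8c20::allocdby
 *	> 0xffffff01cc4e8c20::freedby
 */
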
2075 /*
2076  * Return a string describing the address in relation to the given thread's
2077  * stack.
2078  *
2079  * - If the thread state is TS_FREE, return " (inactive interrupt thread)".
2080  *
2081  * - If the address is above the stack pointer, return an empty string
2082  *   signifying that the address is active.
2083  *
2084  * - If the address is below the stack pointer, and the thread is not on proc,
2085  *   return " (below sp)".
2086  *
2087  * - If the address is below the stack pointer, and the thread is on proc,
2088  *   return " (possibly below sp)".  Depending on context, we may or may not
2089  *   have an accurate t_sp.
2090  */
2091 static const char *
2092 stack_active(const kthread_t *t, uintptr_t addr)
2093 {
2094 	uintptr_t panicstk;
2095 	GElf_Sym sym;
2096 
2097 	if (t->t_state == TS_FREE)
2098 		return (" (inactive interrupt thread)");
2099 
2100 	/*
2101 	 * Check to see if we're on the panic stack.  If so, ignore t_sp, as it
2102 	 * no longer relates to the thread's real stack.
2103 	 */
2104 	if (mdb_lookup_by_name("panic_stack", &sym) == 0) {
2105 		panicstk = (uintptr_t)sym.st_value;
2106 
2107 		if (t->t_sp >= panicstk && t->t_sp < panicstk + PANICSTKSIZE)
2108 			return ("");
2109 	}
2110 
2111 	if (addr >= t->t_sp + STACK_BIAS)
2112 		return ("");
2113 
2114 	if (t->t_state == TS_ONPROC)
2115 		return (" (possibly below sp)");
2116 
2117 	return (" (below sp)");
2118 }
2119 
2120 /*
2121  * Additional state for the kmem and vmem ::whatis handlers
2122  */
2123 typedef struct whatis_info {
2124 	mdb_whatis_t *wi_w;
2125 	const kmem_cache_t *wi_cache;
2126 	const vmem_t *wi_vmem;
2127 	vmem_t *wi_msb_arena;
2128 	size_t wi_slab_size;
2129 	uint_t wi_slab_found;
2130 	uint_t wi_kmem_lite_count;
2131 	uint_t wi_freemem;
2132 } whatis_info_t;
2133 
2134 /* call one of our dcmd functions with "-v" and the provided address */
2135 static void
2136 whatis_call_printer(mdb_dcmd_f *dcmd, uintptr_t addr)
2137 {
2138 	mdb_arg_t a;
2139 	a.a_type = MDB_TYPE_STRING;
2140 	a.a_un.a_str = "-v";
2141 
2142 	mdb_printf(":\n");
2143 	(void) (*dcmd)(addr, DCMD_ADDRSPEC, 1, &a);
2144 }
2145 
2146 static void
2147 whatis_print_kmf_lite(uintptr_t btaddr, size_t count)
2148 {
2149 #define	KMEM_LITE_MAX	16
2150 	pc_t callers[KMEM_LITE_MAX];
2151 	pc_t uninit = (pc_t)KMEM_UNINITIALIZED_PATTERN;
2152 
2153 	kmem_buftag_t bt;
2154 	intptr_t stat;
2155 	const char *plural = "";
2156 	int i;
2157 
2158 	/* validate our arguments and read in the buftag */
2159 	if (count == 0 || count > KMEM_LITE_MAX ||
2160 	    mdb_vread(&bt, sizeof (bt), btaddr) == -1)
2161 		return;
2162 
2163 	/* validate the buffer state and read in the callers */
2164 	stat = (intptr_t)bt.bt_bufctl ^ bt.bt_bxstat;
2165 
2166 	if (stat != KMEM_BUFTAG_ALLOC && stat != KMEM_BUFTAG_FREE)
2167 		return;
2168 
2169 	if (mdb_vread(callers, count * sizeof (pc_t),
2170 	    btaddr + offsetof(kmem_buftag_lite_t, bt_history)) == -1)
2171 		return;
2172 
2173 	/* If there aren't any filled in callers, bail */
2174 	if (callers[0] == uninit)
2175 		return;
2176 
2177 	plural = (callers[1] == uninit) ? "" : "s";
2178 
2179 	/* Everything's done and checked; print them out */
2180 	mdb_printf(":\n");
2181 
2182 	mdb_inc_indent(8);
2183 	mdb_printf("recent caller%s: %a", plural, callers[0]);
2184 	for (i = 1; i < count; i++) {
2185 		if (callers[i] == uninit)
2186 			break;
2187 		mdb_printf(", %a", callers[i]);
2188 	}
2189 	mdb_dec_indent(8);
2190 }
2191 
2192 static void
2193 whatis_print_kmem(whatis_info_t *wi, uintptr_t maddr, uintptr_t addr,
2194     uintptr_t baddr)
2195 {
2196 	mdb_whatis_t *w = wi->wi_w;
2197 
2198 	const kmem_cache_t *cp = wi->wi_cache;
2199 	/* LINTED pointer cast may result in improper alignment */
2200 	uintptr_t btaddr = (uintptr_t)KMEM_BUFTAG(cp, addr);
2201 	int quiet = (mdb_whatis_flags(w) & WHATIS_QUIET);
2202 	int call_printer = (!quiet && (cp->cache_flags & KMF_AUDIT));
2203 
2204 	mdb_whatis_report_object(w, maddr, addr, "");
2205 
2206 	if (baddr != 0 && !call_printer)
2207 		mdb_printf("bufctl %p ", baddr);
2208 
2209 	mdb_printf("%s from %s",
2210 	    (wi->wi_freemem == FALSE) ? "allocated" : "freed", cp->cache_name);
2211 
2212 	if (baddr != 0 && call_printer) {
2213 		whatis_call_printer(bufctl, baddr);
2214 		return;
2215 	}
2216 
2217 	/* for KMF_LITE caches, try to print out the previous callers */
2218 	if (!quiet && (cp->cache_flags & KMF_LITE))
2219 		whatis_print_kmf_lite(btaddr, wi->wi_kmem_lite_count);
2220 
2221 	mdb_printf("\n");
2222 }
2223 
2224 /*ARGSUSED*/
2225 static int
2226 whatis_walk_kmem(uintptr_t addr, void *ignored, whatis_info_t *wi)
2227 {
2228 	mdb_whatis_t *w = wi->wi_w;
2229 
2230 	uintptr_t cur;
2231 	size_t size = wi->wi_cache->cache_bufsize;
2232 
2233 	while (mdb_whatis_match(w, addr, size, &cur))
2234 		whatis_print_kmem(wi, cur, addr, 0);
2235 
2236 	return (WHATIS_WALKRET(w));
2237 }
2238 
2239 /*ARGSUSED*/
2240 static int
2241 whatis_walk_bufctl(uintptr_t baddr, const kmem_bufctl_t *bcp, whatis_info_t *wi)
2242 {
2243 	mdb_whatis_t *w = wi->wi_w;
2244 
2245 	uintptr_t cur;
2246 	uintptr_t addr = (uintptr_t)bcp->bc_addr;
2247 	size_t size = wi->wi_cache->cache_bufsize;
2248 
2249 	while (mdb_whatis_match(w, addr, size, &cur))
2250 		whatis_print_kmem(wi, cur, addr, baddr);
2251 
2252 	return (WHATIS_WALKRET(w));
2253 }
2254 
2255 static int
2256 whatis_walk_seg(uintptr_t addr, const vmem_seg_t *vs, whatis_info_t *wi)
2257 {
2258 	mdb_whatis_t *w = wi->wi_w;
2259 
2260 	size_t size = vs->vs_end - vs->vs_start;
2261 	uintptr_t cur;
2262 
2263 	/* We're not interested in anything but alloc and free segments */
2264 	if (vs->vs_type != VMEM_ALLOC && vs->vs_type != VMEM_FREE)
2265 		return (WALK_NEXT);
2266 
2267 	while (mdb_whatis_match(w, vs->vs_start, size, &cur)) {
2268 		mdb_whatis_report_object(w, cur, vs->vs_start, "");
2269 
2270 		/*
2271 		 * If we're not printing it separately, provide the vmem_seg
2272 		 * pointer if it has a stack trace.
2273 		 */
2274 		if ((mdb_whatis_flags(w) & WHATIS_QUIET) &&
2275 		    (!(mdb_whatis_flags(w) & WHATIS_BUFCTL) ||
2276 		    (vs->vs_type == VMEM_ALLOC && vs->vs_depth != 0))) {
2277 			mdb_printf("vmem_seg %p ", addr);
2278 		}
2279 
2280 		mdb_printf("%s from the %s vmem arena",
2281 		    (vs->vs_type == VMEM_ALLOC) ? "allocated" : "freed",
2282 		    wi->wi_vmem->vm_name);
2283 
2284 		if (!(mdb_whatis_flags(w) & WHATIS_QUIET))
2285 			whatis_call_printer(vmem_seg, addr);
2286 		else
2287 			mdb_printf("\n");
2288 	}
2289 
2290 	return (WHATIS_WALKRET(w));
2291 }
2292 
2293 static int
2294 whatis_walk_vmem(uintptr_t addr, const vmem_t *vmem, whatis_info_t *wi)
2295 {
2296 	mdb_whatis_t *w = wi->wi_w;
2297 	const char *nm = vmem->vm_name;
2298 
2299 	int identifier = ((vmem->vm_cflags & VMC_IDENTIFIER) != 0);
2300 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2301 
2302 	if (identifier != idspace)
2303 		return (WALK_NEXT);
2304 
2305 	wi->wi_vmem = vmem;
2306 
2307 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2308 		mdb_printf("Searching vmem arena %s...\n", nm);
2309 
2310 	if (mdb_pwalk("vmem_seg",
2311 	    (mdb_walk_cb_t)whatis_walk_seg, wi, addr) == -1) {
2312 		mdb_warn("can't walk vmem_seg for %p", addr);
2313 		return (WALK_NEXT);
2314 	}
2315 
2316 	return (WHATIS_WALKRET(w));
2317 }
2318 
2319 /*ARGSUSED*/
2320 static int
2321 whatis_walk_slab(uintptr_t saddr, const kmem_slab_t *sp, whatis_info_t *wi)
2322 {
2323 	mdb_whatis_t *w = wi->wi_w;
2324 
2325 	/* It must overlap with the slab data, or it's not interesting */
2326 	if (mdb_whatis_overlaps(w,
2327 	    (uintptr_t)sp->slab_base, wi->wi_slab_size)) {
2328 		wi->wi_slab_found++;
2329 		return (WALK_DONE);
2330 	}
2331 	return (WALK_NEXT);
2332 }
2333 
2334 static int
2335 whatis_walk_cache(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2336 {
2337 	mdb_whatis_t *w = wi->wi_w;
2338 
2339 	char *walk, *freewalk;
2340 	mdb_walk_cb_t func;
2341 	int do_bufctl;
2342 
2343 	int identifier = ((c->cache_flags & KMC_IDENTIFIER) != 0);
2344 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2345 
2346 	if (identifier != idspace)
2347 		return (WALK_NEXT);
2348 
2349 	/* Override the '-b' flag as necessary */
2350 	if (!(c->cache_flags & KMF_HASH))
2351 		do_bufctl = FALSE;	/* no bufctls to walk */
2352 	else if (c->cache_flags & KMF_AUDIT)
2353 		do_bufctl = TRUE;	/* we always want debugging info */
2354 	else
2355 		do_bufctl = ((mdb_whatis_flags(w) & WHATIS_BUFCTL) != 0);
2356 
2357 	if (do_bufctl) {
2358 		walk = "bufctl";
2359 		freewalk = "freectl";
2360 		func = (mdb_walk_cb_t)whatis_walk_bufctl;
2361 	} else {
2362 		walk = "kmem";
2363 		freewalk = "freemem";
2364 		func = (mdb_walk_cb_t)whatis_walk_kmem;
2365 	}
2366 
2367 	wi->wi_cache = c;
2368 
2369 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2370 		mdb_printf("Searching %s...\n", c->cache_name);
2371 
2372 	/*
2373 	 * If more than two buffers live on each slab, figure out if we're
2374 	 * interested in anything in any slab before doing the more expensive
2375 	 * kmem/freemem (bufctl/freectl) walkers.
2376 	 */
2377 	wi->wi_slab_size = c->cache_slabsize - c->cache_maxcolor;
2378 	if (!(c->cache_flags & KMF_HASH))
2379 		wi->wi_slab_size -= sizeof (kmem_slab_t);
2380 
2381 	if ((wi->wi_slab_size / c->cache_chunksize) > 2) {
2382 		wi->wi_slab_found = 0;
2383 		if (mdb_pwalk("kmem_slab", (mdb_walk_cb_t)whatis_walk_slab, wi,
2384 		    addr) == -1) {
2385 			mdb_warn("can't find kmem_slab walker");
2386 			return (WALK_DONE);
2387 		}
2388 		if (wi->wi_slab_found == 0)
2389 			return (WALK_NEXT);
2390 	}
2391 
2392 	wi->wi_freemem = FALSE;
2393 	if (mdb_pwalk(walk, func, wi, addr) == -1) {
2394 		mdb_warn("can't find %s walker", walk);
2395 		return (WALK_DONE);
2396 	}
2397 
2398 	if (mdb_whatis_done(w))
2399 		return (WALK_DONE);
2400 
2401 	/*
2402 	 * We have searched for allocated memory; now search for freed memory.
2403 	 */
2404 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2405 		mdb_printf("Searching %s for free memory...\n", c->cache_name);
2406 
2407 	wi->wi_freemem = TRUE;
2408 	if (mdb_pwalk(freewalk, func, wi, addr) == -1) {
2409 		mdb_warn("can't find %s walker", freewalk);
2410 		return (WALK_DONE);
2411 	}
2412 
2413 	return (WHATIS_WALKRET(w));
2414 }
2415 
2416 static int
2417 whatis_walk_touch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2418 {
2419 	if (c->cache_arena == wi->wi_msb_arena ||
2420 	    (c->cache_cflags & KMC_NOTOUCH))
2421 		return (WALK_NEXT);
2422 
2423 	return (whatis_walk_cache(addr, c, wi));
2424 }
2425 
2426 static int
2427 whatis_walk_metadata(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2428 {
2429 	if (c->cache_arena != wi->wi_msb_arena)
2430 		return (WALK_NEXT);
2431 
2432 	return (whatis_walk_cache(addr, c, wi));
2433 }
2434 
2435 static int
2436 whatis_walk_notouch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2437 {
2438 	if (c->cache_arena == wi->wi_msb_arena ||
2439 	    !(c->cache_cflags & KMC_NOTOUCH))
2440 		return (WALK_NEXT);
2441 
2442 	return (whatis_walk_cache(addr, c, wi));
2443 }
2444 
2445 static int
2446 whatis_walk_thread(uintptr_t addr, const kthread_t *t, mdb_whatis_t *w)
2447 {
2448 	uintptr_t cur;
2449 	uintptr_t saddr;
2450 	size_t size;
2451 
2452 	/*
2453 	 * Often, one calls ::whatis on an address from a thread structure.
2454 	 * We use this opportunity to short circuit this case...
2455 	 */
2456 	while (mdb_whatis_match(w, addr, sizeof (kthread_t), &cur))
2457 		mdb_whatis_report_object(w, cur, addr,
2458 		    "allocated as a thread structure\n");
2459 
2460 	/*
2461 	 * Now check the stack
2462 	 */
2463 	if (t->t_stkbase == NULL)
2464 		return (WALK_NEXT);
2465 
2466 	/*
2467 	 * This assumes that t_stk is the end of the stack, but it's really
2468 	 * only the initial stack pointer for the thread.  Arguments to the
2469 	 * initial procedure, SA(MINFRAME), etc. are all after t_stk.  So
2470 	 * that 't->t_stk::whatis' reports "part of t's stack", we include
2471 	 * t_stk in the range (the "+ 1", below), but the kernel should
2472 	 * really include the full stack bounds where we can find it.
2473 	 */
2474 	saddr = (uintptr_t)t->t_stkbase;
2475 	size = (uintptr_t)t->t_stk - saddr + 1;
2476 	while (mdb_whatis_match(w, saddr, size, &cur))
2477 		mdb_whatis_report_object(w, cur, cur,
2478 		    "in thread %p's stack%s\n", addr, stack_active(t, cur));
2479 
2480 	return (WHATIS_WALKRET(w));
2481 }
2482 
2483 static void
2484 whatis_modctl_match(mdb_whatis_t *w, const char *name,
2485     uintptr_t base, size_t size, const char *where)
2486 {
2487 	uintptr_t cur;
2488 
2489 	/*
2490 	 * Since we're searching for addresses inside a module, we report
2491 	 * them as symbols.
2492 	 */
2493 	while (mdb_whatis_match(w, base, size, &cur))
2494 		mdb_whatis_report_address(w, cur, "in %s's %s\n", name, where);
2495 }
2496 
2497 struct kmem_ctf_module {
2498 	Shdr *symhdr;
2499 	char *symtbl;
2500 	unsigned int nsyms;
2501 	char *symspace;
2502 	size_t symsize;
2503 	char *text;
2504 	char *data;
2505 	uintptr_t bss;
2506 	size_t text_size;
2507 	size_t data_size;
2508 	size_t bss_size;
2509 };
2510 
2511 static int
2512 whatis_walk_modctl(uintptr_t addr, const struct modctl *m, mdb_whatis_t *w)
2513 {
2514 	char name[MODMAXNAMELEN];
2515 	struct kmem_ctf_module mod;
2516 	Shdr shdr;
2517 
2518 	if (m->mod_mp == NULL)
2519 		return (WALK_NEXT);
2520 
2521 	if (mdb_ctf_vread(&mod, "struct module", "struct kmem_ctf_module",
2522 	    (uintptr_t)m->mod_mp, 0) == -1) {
2523 		mdb_warn("couldn't read modctl %p's module", addr);
2524 		return (WALK_NEXT);
2525 	}
2526 
2527 	if (mdb_readstr(name, sizeof (name), (uintptr_t)m->mod_modname) == -1)
2528 		(void) mdb_snprintf(name, sizeof (name), "0x%p", addr);
2529 
2530 	whatis_modctl_match(w, name,
2531 	    (uintptr_t)mod.text, mod.text_size, "text segment");
2532 	whatis_modctl_match(w, name,
2533 	    (uintptr_t)mod.data, mod.data_size, "data segment");
2534 	whatis_modctl_match(w, name,
2535 	    (uintptr_t)mod.bss, mod.bss_size, "bss segment");
2536 
2537 	if (mdb_vread(&shdr, sizeof (shdr), (uintptr_t)mod.symhdr) == -1) {
2538 		mdb_warn("couldn't read symbol header for %p's module", addr);
2539 		return (WALK_NEXT);
2540 	}
2541 
2542 	whatis_modctl_match(w, name,
2543 	    (uintptr_t)mod.symtbl, mod.nsyms * shdr.sh_entsize, "symtab");
2544 	whatis_modctl_match(w, name,
2545 	    (uintptr_t)mod.symspace, mod.symsize, "symtab");
2546 
2547 	return (WHATIS_WALKRET(w));
2548 }
2549 
2550 /*ARGSUSED*/
2551 static int
2552 whatis_walk_memseg(uintptr_t addr, const struct memseg *seg, mdb_whatis_t *w)
2553 {
2554 	uintptr_t cur;
2555 
2556 	uintptr_t base = (uintptr_t)seg->pages;
2557 	size_t size = (uintptr_t)seg->epages - base;
2558 
2559 	while (mdb_whatis_match(w, base, size, &cur)) {
2560 		/* round our found pointer down to the page_t base. */
2561 		size_t offset = (cur - base) % sizeof (page_t);
2562 
2563 		mdb_whatis_report_object(w, cur, cur - offset,
2564 		    "allocated as a page structure\n");
2565 	}
2566 
2567 	return (WHATIS_WALKRET(w));
2568 }
2569 
2570 /*ARGSUSED*/
2571 static int
2572 whatis_run_modules(mdb_whatis_t *w, void *arg)
2573 {
2574 	if (mdb_walk("modctl", (mdb_walk_cb_t)whatis_walk_modctl, w) == -1) {
2575 		mdb_warn("couldn't find modctl walker");
2576 		return (1);
2577 	}
2578 	return (0);
2579 }
2580 
2581 /*ARGSUSED*/
2582 static int
2583 whatis_run_threads(mdb_whatis_t *w, void *ignored)
2584 {
2585 	/*
2586 	 * Now search all thread stacks.  Yes, this is a little weak; we
2587 	 * can save a lot of work by first checking to see if the
2588 	 * address is in segkp vs. segkmem.  But hey, computers are
2589 	 * fast.
2590 	 */
2591 	if (mdb_walk("thread", (mdb_walk_cb_t)whatis_walk_thread, w) == -1) {
2592 		mdb_warn("couldn't find thread walker");
2593 		return (1);
2594 	}
2595 	return (0);
2596 }
2597 
2598 /*ARGSUSED*/
2599 static int
2600 whatis_run_pages(mdb_whatis_t *w, void *ignored)
2601 {
2602 	if (mdb_walk("memseg", (mdb_walk_cb_t)whatis_walk_memseg, w) == -1) {
2603 		mdb_warn("couldn't find memseg walker");
2604 		return (1);
2605 	}
2606 	return (0);
2607 }
2608 
2609 /*ARGSUSED*/
2610 static int
2611 whatis_run_kmem(mdb_whatis_t *w, void *ignored)
2612 {
2613 	whatis_info_t wi;
2614 
2615 	bzero(&wi, sizeof (wi));
2616 	wi.wi_w = w;
2617 
2618 	if (mdb_readvar(&wi.wi_msb_arena, "kmem_msb_arena") == -1)
2619 		mdb_warn("unable to readvar \"kmem_msb_arena\"");
2620 
2621 	if (mdb_readvar(&wi.wi_kmem_lite_count,
2622 	    "kmem_lite_count") == -1 || wi.wi_kmem_lite_count > 16)
2623 		wi.wi_kmem_lite_count = 0;
2624 
2625 	/*
2626 	 * We process kmem caches in the following order:
2627 	 *
2628 	 *	non-KMC_NOTOUCH, non-metadata	(typically the most interesting)
2629 	 *	metadata			(can be huge with KMF_AUDIT)
2630 	 *	KMC_NOTOUCH, non-metadata	(see kmem_walk_all())
2631 	 */
2632 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_touch,
2633 	    &wi) == -1 ||
2634 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_metadata,
2635 	    &wi) == -1 ||
2636 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_notouch,
2637 	    &wi) == -1) {
2638 		mdb_warn("couldn't find kmem_cache walker");
2639 		return (1);
2640 	}
2641 	return (0);
2642 }
2643 
2644 /*ARGSUSED*/
2645 static int
2646 whatis_run_vmem(mdb_whatis_t *w, void *ignored)
2647 {
2648 	whatis_info_t wi;
2649 
2650 	bzero(&wi, sizeof (wi));
2651 	wi.wi_w = w;
2652 
2653 	if (mdb_walk("vmem_postfix",
2654 	    (mdb_walk_cb_t)whatis_walk_vmem, &wi) == -1) {
2655 		mdb_warn("couldn't find vmem_postfix walker");
2656 		return (1);
2657 	}
2658 	return (0);
2659 }
2660 
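/*
 * Illustrative sketch: the whatis_run_* callbacks above are invoked on
 * behalf of the generic ::whatis dcmd, which reports what a given address
 * refers to (a thread stack, a module segment, a page_t, a kmem buffer, a
 * vmem segment, and so on).  The address below is hypothetical:
 *
 *	> 0xffffff01d4c0a2c0::whatis
 */
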
2661 typedef struct kmem_log_cpu {
2662 	uintptr_t kmc_low;
2663 	uintptr_t kmc_high;
2664 } kmem_log_cpu_t;
2665 
2666 typedef struct kmem_log_data {
2667 	uintptr_t kmd_addr;
2668 	kmem_log_cpu_t *kmd_cpu;
2669 } kmem_log_data_t;
2670 
2671 int
2672 kmem_log_walk(uintptr_t addr, const kmem_bufctl_audit_t *b,
2673     kmem_log_data_t *kmd)
2674 {
2675 	int i;
2676 	kmem_log_cpu_t *kmc = kmd->kmd_cpu;
2677 	size_t bufsize;
2678 
2679 	for (i = 0; i < NCPU; i++) {
2680 		if (addr >= kmc[i].kmc_low && addr < kmc[i].kmc_high)
2681 			break;
2682 	}
2683 
2684 	if (kmd->kmd_addr) {
2685 		if (b->bc_cache == NULL)
2686 			return (WALK_NEXT);
2687 
2688 		if (mdb_vread(&bufsize, sizeof (bufsize),
2689 		    (uintptr_t)&b->bc_cache->cache_bufsize) == -1) {
2690 			mdb_warn(
2691 			    "failed to read cache_bufsize for cache at %p",
2692 			    b->bc_cache);
2693 			return (WALK_ERR);
2694 		}
2695 
2696 		if (kmd->kmd_addr < (uintptr_t)b->bc_addr ||
2697 		    kmd->kmd_addr >= (uintptr_t)b->bc_addr + bufsize)
2698 			return (WALK_NEXT);
2699 	}
2700 
2701 	if (i == NCPU)
2702 		mdb_printf("   ");
2703 	else
2704 		mdb_printf("%3d", i);
2705 
2706 	mdb_printf(" %0?p %0?p %16llx %0?p\n", addr, b->bc_addr,
2707 	    b->bc_timestamp, b->bc_thread);
2708 
2709 	return (WALK_NEXT);
2710 }
2711 
2712 /*ARGSUSED*/
2713 int
2714 kmem_log(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2715 {
2716 	kmem_log_header_t lh;
2717 	kmem_cpu_log_header_t clh;
2718 	uintptr_t lhp, clhp;
2719 	int ncpus;
2720 	uintptr_t *cpu;
2721 	GElf_Sym sym;
2722 	kmem_log_cpu_t *kmc;
2723 	int i;
2724 	kmem_log_data_t kmd;
2725 	uint_t opt_b = FALSE;
2726 
2727 	if (mdb_getopts(argc, argv,
2728 	    'b', MDB_OPT_SETBITS, TRUE, &opt_b, NULL) != argc)
2729 		return (DCMD_USAGE);
2730 
2731 	if (mdb_readvar(&lhp, "kmem_transaction_log") == -1) {
2732 		mdb_warn("failed to read 'kmem_transaction_log'");
2733 		return (DCMD_ERR);
2734 	}
2735 
2736 	if (lhp == 0) {
2737 		mdb_warn("no kmem transaction log\n");
2738 		return (DCMD_ERR);
2739 	}
2740 
2741 	mdb_readvar(&ncpus, "ncpus");
2742 
2743 	if (mdb_vread(&lh, sizeof (kmem_log_header_t), lhp) == -1) {
2744 		mdb_warn("failed to read log header at %p", lhp);
2745 		return (DCMD_ERR);
2746 	}
2747 
2748 	clhp = lhp + ((uintptr_t)&lh.lh_cpu[0] - (uintptr_t)&lh);
2749 
2750 	cpu = mdb_alloc(sizeof (uintptr_t) * NCPU, UM_SLEEP | UM_GC);
2751 
2752 	if (mdb_lookup_by_name("cpu", &sym) == -1) {
2753 		mdb_warn("couldn't find 'cpu' array");
2754 		return (DCMD_ERR);
2755 	}
2756 
2757 	if (sym.st_size != NCPU * sizeof (uintptr_t)) {
2758 		mdb_warn("expected 'cpu' to be of size %d; found %d\n",
2759 		    NCPU * sizeof (uintptr_t), sym.st_size);
2760 		return (DCMD_ERR);
2761 	}
2762 
2763 	if (mdb_vread(cpu, sym.st_size, (uintptr_t)sym.st_value) == -1) {
2764 		mdb_warn("failed to read cpu array at %p", sym.st_value);
2765 		return (DCMD_ERR);
2766 	}
2767 
2768 	kmc = mdb_zalloc(sizeof (kmem_log_cpu_t) * NCPU, UM_SLEEP | UM_GC);
2769 	kmd.kmd_addr = 0;
2770 	kmd.kmd_cpu = kmc;
2771 
2772 	for (i = 0; i < NCPU; i++) {
2773 
2774 		if (cpu[i] == 0)
2775 			continue;
2776 
2777 		if (mdb_vread(&clh, sizeof (clh), clhp) == -1) {
2778 			mdb_warn("cannot read cpu %d's log header at %p",
2779 			    i, clhp);
2780 			return (DCMD_ERR);
2781 		}
2782 
2783 		kmc[i].kmc_low = clh.clh_chunk * lh.lh_chunksize +
2784 		    (uintptr_t)lh.lh_base;
2785 		kmc[i].kmc_high = (uintptr_t)clh.clh_current;
2786 
2787 		clhp += sizeof (kmem_cpu_log_header_t);
2788 	}
2789 
2790 	mdb_printf("%3s %-?s %-?s %16s %-?s\n", "CPU", "ADDR", "BUFADDR",
2791 	    "TIMESTAMP", "THREAD");
2792 
2793 	/*
2794 	 * If we have been passed an address, print out only log entries
2795 	 * corresponding to that address.  With -b, the address is treated as a
2796 	 * buffer address and used to filter the log walk; otherwise it is read
2797 	 * directly as a bufctl.
2797 	 */
2798 	if (flags & DCMD_ADDRSPEC) {
2799 		kmem_bufctl_audit_t b;
2800 
2801 		if (opt_b) {
2802 			kmd.kmd_addr = addr;
2803 		} else {
2804 			if (mdb_vread(&b,
2805 			    sizeof (kmem_bufctl_audit_t), addr) == -1) {
2806 				mdb_warn("failed to read bufctl at %p", addr);
2807 				return (DCMD_ERR);
2808 			}
2809 
2810 			(void) kmem_log_walk(addr, &b, &kmd);
2811 
2812 			return (DCMD_OK);
2813 		}
2814 	}
2815 
2816 	if (mdb_walk("kmem_log", (mdb_walk_cb_t)kmem_log_walk, &kmd) == -1) {
2817 		mdb_warn("can't find kmem log walker");
2818 		return (DCMD_ERR);
2819 	}
2820 
2821 	return (DCMD_OK);
2822 }
2823 
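/*
 * Illustrative sketch (the buffer address is hypothetical): with no
 * address, ::kmem_log prints every entry in the transaction log along with
 * the CPU whose log chunk recorded it; an explicit address restricts the
 * output as handled above:
 *
 *	> ::kmem_log
 *	> 0xffffff01d4c0a000::kmem_log -b
 */
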
2824 typedef struct bufctl_history_cb {
2825 	int		bhc_flags;
2826 	int		bhc_argc;
2827 	const mdb_arg_t	*bhc_argv;
2828 	int		bhc_ret;
2829 } bufctl_history_cb_t;
2830 
2831 /*ARGSUSED*/
2832 static int
2833 bufctl_history_callback(uintptr_t addr, const void *ign, void *arg)
2834 {
2835 	bufctl_history_cb_t *bhc = arg;
2836 
2837 	bhc->bhc_ret =
2838 	    bufctl(addr, bhc->bhc_flags, bhc->bhc_argc, bhc->bhc_argv);
2839 
2840 	bhc->bhc_flags &= ~DCMD_LOOPFIRST;
2841 
2842 	return ((bhc->bhc_ret == DCMD_OK)? WALK_NEXT : WALK_DONE);
2843 }
2844 
2845 void
2846 bufctl_help(void)
2847 {
2848 	mdb_printf("%s",
2849 "Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n");
2850 	mdb_dec_indent(2);
2851 	mdb_printf("%<b>OPTIONS%</b>\n");
2852 	mdb_inc_indent(2);
2853 	mdb_printf("%s",
2854 "  -v    Display the full content of the bufctl, including its stack trace\n"
2855 "  -h    retrieve the bufctl's transaction history, if available\n"
2856 "  -a addr\n"
2857 "        filter out bufctls not involving the buffer at addr\n"
2858 "  -c caller\n"
2859 "        filter out bufctls without the function/PC in their stack trace\n"
2860 "  -e earliest\n"
2861 "        filter out bufctls timestamped before earliest\n"
2862 "  -l latest\n"
2863 "        filter out bufctls timestamped after latest\n"
2864 "  -t thread\n"
2865 "        filter out bufctls not involving thread\n");
2866 }
2867 
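/*
 * A hypothetical session sketch for the options documented above (all
 * addresses are made up):
 *
 *	> 0xffffff01d4c0a2c0::bufctl -v
 *	> 0xffffff01d4c0a2c0::bufctl -h -t 0xffffff01cc4e8c20
 */
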
2868 int
2869 bufctl(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2870 {
2871 	kmem_bufctl_audit_t bc;
2872 	uint_t verbose = FALSE;
2873 	uint_t history = FALSE;
2874 	uint_t in_history = FALSE;
2875 	uintptr_t caller = 0, thread = 0;
2876 	uintptr_t laddr, haddr, baddr = 0;
2877 	hrtime_t earliest = 0, latest = 0;
2878 	int i, depth;
2879 	char c[MDB_SYM_NAMLEN];
2880 	GElf_Sym sym;
2881 
2882 	if (mdb_getopts(argc, argv,
2883 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
2884 	    'h', MDB_OPT_SETBITS, TRUE, &history,
2885 	    'H', MDB_OPT_SETBITS, TRUE, &in_history,		/* internal */
2886 	    'c', MDB_OPT_UINTPTR, &caller,
2887 	    't', MDB_OPT_UINTPTR, &thread,
2888 	    'e', MDB_OPT_UINT64, &earliest,
2889 	    'l', MDB_OPT_UINT64, &latest,
2890 	    'a', MDB_OPT_UINTPTR, &baddr, NULL) != argc)
2891 		return (DCMD_USAGE);
2892 
2893 	if (!(flags & DCMD_ADDRSPEC))
2894 		return (DCMD_USAGE);
2895 
2896 	if (in_history && !history)
2897 		return (DCMD_USAGE);
2898 
2899 	if (history && !in_history) {
2900 		mdb_arg_t *nargv = mdb_zalloc(sizeof (*nargv) * (argc + 1),
2901 		    UM_SLEEP | UM_GC);
2902 		bufctl_history_cb_t bhc;
2903 
2904 		nargv[0].a_type = MDB_TYPE_STRING;
2905 		nargv[0].a_un.a_str = "-H";		/* prevent recursion */
2906 
2907 		for (i = 0; i < argc; i++)
2908 			nargv[i + 1] = argv[i];
2909 
2910 		/*
2911 		 * When in history mode, we treat each element as if it
2912 		 * were in a separate loop, so that the headers group
2913 		 * bufctls with similar histories.
2914 		 */
2915 		bhc.bhc_flags = flags | DCMD_LOOP | DCMD_LOOPFIRST;
2916 		bhc.bhc_argc = argc + 1;
2917 		bhc.bhc_argv = nargv;
2918 		bhc.bhc_ret = DCMD_OK;
2919 
2920 		if (mdb_pwalk("bufctl_history", bufctl_history_callback, &bhc,
2921 		    addr) == -1) {
2922 			mdb_warn("unable to walk bufctl_history");
2923 			return (DCMD_ERR);
2924 		}
2925 
2926 		if (bhc.bhc_ret == DCMD_OK && !(flags & DCMD_PIPE_OUT))
2927 			mdb_printf("\n");
2928 
2929 		return (bhc.bhc_ret);
2930 	}
2931 
2932 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
2933 		if (verbose) {
2934 			mdb_printf("%16s %16s %16s %16s\n"
2935 			    "%<u>%16s %16s %16s %16s%</u>\n",
2936 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD",
2937 			    "", "CACHE", "LASTLOG", "CONTENTS");
2938 		} else {
2939 			mdb_printf("%<u>%-?s %-?s %-12s %-?s %s%</u>\n",
2940 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD", "CALLER");
2941 		}
2942 	}
2943 
2944 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2945 		mdb_warn("couldn't read bufctl at %p", addr);
2946 		return (DCMD_ERR);
2947 	}
2948 
2949 	/*
2950 	 * Guard against bogus bc_depth in case the bufctl is corrupt or
2951 	 * the address does not really refer to a bufctl.
2952 	 */
2953 	depth = MIN(bc.bc_depth, KMEM_STACK_DEPTH);
2954 
2955 	if (caller != 0) {
2956 		laddr = caller;
2957 		haddr = caller + sizeof (caller);
2958 
2959 		if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c, sizeof (c),
2960 		    &sym) != -1 && caller == (uintptr_t)sym.st_value) {
2961 			/*
2962 			 * We were provided an exact symbol value; any
2963 			 * address in the function is valid.
2964 			 */
2965 			laddr = (uintptr_t)sym.st_value;
2966 			haddr = (uintptr_t)sym.st_value + sym.st_size;
2967 		}
2968 
2969 		for (i = 0; i < depth; i++)
2970 			if (bc.bc_stack[i] >= laddr && bc.bc_stack[i] < haddr)
2971 				break;
2972 
2973 		if (i == depth)
2974 			return (DCMD_OK);
2975 	}
2976 
2977 	if (thread != 0 && (uintptr_t)bc.bc_thread != thread)
2978 		return (DCMD_OK);
2979 
2980 	if (earliest != 0 && bc.bc_timestamp < earliest)
2981 		return (DCMD_OK);
2982 
2983 	if (latest != 0 && bc.bc_timestamp > latest)
2984 		return (DCMD_OK);
2985 
2986 	if (baddr != 0 && (uintptr_t)bc.bc_addr != baddr)
2987 		return (DCMD_OK);
2988 
2989 	if (flags & DCMD_PIPE_OUT) {
2990 		mdb_printf("%#lr\n", addr);
2991 		return (DCMD_OK);
2992 	}
2993 
2994 	if (verbose) {
2995 		mdb_printf(
2996 		    "%<b>%16p%</b> %16p %16llx %16p\n"
2997 		    "%16s %16p %16p %16p\n",
2998 		    addr, bc.bc_addr, bc.bc_timestamp, bc.bc_thread,
2999 		    "", bc.bc_cache, bc.bc_lastlog, bc.bc_contents);
3000 
3001 		mdb_inc_indent(17);
3002 		for (i = 0; i < depth; i++)
3003 			mdb_printf("%a\n", bc.bc_stack[i]);
3004 		mdb_dec_indent(17);
3005 		mdb_printf("\n");
3006 	} else {
3007 		mdb_printf("%0?p %0?p %12llx %0?p", addr, bc.bc_addr,
3008 		    bc.bc_timestamp, bc.bc_thread);
3009 
3010 		for (i = 0; i < depth; i++) {
3011 			if (mdb_lookup_by_addr(bc.bc_stack[i],
3012 			    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
3013 				continue;
3014 			if (strncmp(c, "kmem_", 5) == 0)
3015 				continue;
3016 			mdb_printf(" %a\n", bc.bc_stack[i]);
3017 			break;
3018 		}
3019 
3020 		if (i >= depth)
3021 			mdb_printf("\n");
3022 	}
3023 
3024 	return (DCMD_OK);
3025 }
3026 
3027 typedef struct kmem_verify {
3028 	uint64_t *kmv_buf;		/* buffer to read cache contents into */
3029 	size_t kmv_size;		/* number of bytes in kmv_buf */
3030 	int kmv_corruption;		/* > 0 if corruption found. */
3031 	uint_t kmv_flags;		/* dcmd flags */
3032 	struct kmem_cache kmv_cache;	/* the cache we're operating on */
3033 } kmem_verify_t;
3034 
3035 /*
3036  * verify_pattern()
3037  *	verify that buf is filled with the pattern pat.
3038  */
3039 static int64_t
3040 verify_pattern(uint64_t *buf_arg, size_t size, uint64_t pat)
3041 {
3042 	/*LINTED*/
3043 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
3044 	uint64_t *buf;
3045 
3046 	for (buf = buf_arg; buf < bufend; buf++)
3047 		if (*buf != pat)
3048 			return ((uintptr_t)buf - (uintptr_t)buf_arg);
3049 	return (-1);
3050 }
3051 
3052 /*
3053  * verify_buftag()
3054  *	verify that btp->bt_bxstat == (bcp ^ pat)
3055  */
3056 static int
3057 verify_buftag(kmem_buftag_t *btp, uintptr_t pat)
3058 {
3059 	return (btp->bt_bxstat == ((intptr_t)btp->bt_bufctl ^ pat) ? 0 : -1);
3060 }
3061 
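/*
 * For example (values hypothetical): with bt_bufctl == 0xffffff01d4c0a2c0,
 * an intact allocated buffer has
 * bt_bxstat == 0xffffff01d4c0a2c0 ^ KMEM_BUFTAG_ALLOC; any other value
 * means the buftag itself has been damaged.
 */
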
3062 /*
3063  * verify_free()
3064  *	verify the integrity of a free block of memory by checking
3065  *	that it is filled with 0xdeadbeef and that its buftag is sane.
3066  */
3067 /*ARGSUSED1*/
3068 static int
3069 verify_free(uintptr_t addr, const void *data, void *private)
3070 {
3071 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3072 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3073 	int64_t corrupt;		/* corruption offset */
3074 	kmem_buftag_t *buftagp;		/* ptr to buftag */
3075 	kmem_cache_t *cp = &kmv->kmv_cache;
3076 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3077 
3078 	/*LINTED*/
3079 	buftagp = KMEM_BUFTAG(cp, buf);
3080 
3081 	/*
3082 	 * Read the buffer to check.
3083 	 */
3084 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3085 		if (!besilent)
3086 			mdb_warn("couldn't read %p", addr);
3087 		return (WALK_NEXT);
3088 	}
3089 
3090 	if ((corrupt = verify_pattern(buf, cp->cache_verify,
3091 	    KMEM_FREE_PATTERN)) >= 0) {
3092 		if (!besilent)
3093 			mdb_printf("buffer %p (free) seems corrupted, at %p\n",
3094 			    addr, (uintptr_t)addr + corrupt);
3095 		goto corrupt;
3096 	}
3097 	/*
3098 	 * When KMF_LITE is set, buftagp->bt_redzone is used to hold
3099 	 * the first bytes of the buffer, hence we cannot check for red
3100 	 * zone corruption.
3101 	 */
3102 	if ((cp->cache_flags & (KMF_HASH | KMF_LITE)) == KMF_HASH &&
3103 	    buftagp->bt_redzone != KMEM_REDZONE_PATTERN) {
3104 		if (!besilent)
3105 			mdb_printf("buffer %p (free) seems to "
3106 			    "have a corrupt redzone pattern\n", addr);
3107 		goto corrupt;
3108 	}
3109 
3110 	/*
3111 	 * confirm bufctl pointer integrity.
3112 	 */
3113 	if (verify_buftag(buftagp, KMEM_BUFTAG_FREE) == -1) {
3114 		if (!besilent)
3115 			mdb_printf("buffer %p (free) has a corrupt "
3116 			    "buftag\n", addr);
3117 		goto corrupt;
3118 	}
3119 
3120 	return (WALK_NEXT);
3121 corrupt:
3122 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3123 		mdb_printf("%p\n", addr);
3124 	kmv->kmv_corruption++;
3125 	return (WALK_NEXT);
3126 }
3127 
3128 /*
3129  * verify_alloc()
3130  *	Verify that the buftag of an allocated buffer makes sense with respect
3131  *	to the buffer.
3132  */
3133 /*ARGSUSED1*/
3134 static int
3135 verify_alloc(uintptr_t addr, const void *data, void *private)
3136 {
3137 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3138 	kmem_cache_t *cp = &kmv->kmv_cache;
3139 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3140 	/*LINTED*/
3141 	kmem_buftag_t *buftagp = KMEM_BUFTAG(cp, buf);
3142 	uint32_t *ip = (uint32_t *)buftagp;
3143 	uint8_t *bp = (uint8_t *)buf;
3144 	int looks_ok = 0, size_ok = 1;	/* flags for finding corruption */
3145 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3146 
3147 	/*
3148 	 * Read the buffer to check.
3149 	 */
3150 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3151 		if (!besilent)
3152 			mdb_warn("couldn't read %p", addr);
3153 		return (WALK_NEXT);
3154 	}
3155 
3156 	/*
3157 	 * There are two cases to handle:
3158 	 * 1. If the buf was alloc'd using kmem_cache_alloc, it will have
3159 	 *    0xfeedfacefeedface at the end of it
3160 	 * 2. If the buf was alloc'd using kmem_alloc, it will have
3161 	 *    0xbb just past the end of the region in use.  At the buftag,
3162 	 *    it will have 0xfeedface (or, if the whole buffer is in use,
3163 	 *    0xfeedface & bb000000 or 0xfeedfacf & 000000bb depending on
3164 	 *    endianness), followed by 32 bits containing the offset of the
3165 	 *    0xbb byte in the buffer.
3166 	 *
3167 	 * Finally, the two 32-bit words that comprise the second half of the
3168 	 * buftag should xor to KMEM_BUFTAG_ALLOC
3169 	 */
3170 
3171 	if (buftagp->bt_redzone == KMEM_REDZONE_PATTERN)
3172 		looks_ok = 1;
3173 	else if (!KMEM_SIZE_VALID(ip[1]))
3174 		size_ok = 0;
3175 	else if (bp[KMEM_SIZE_DECODE(ip[1])] == KMEM_REDZONE_BYTE)
3176 		looks_ok = 1;
3177 	else
3178 		size_ok = 0;
3179 
3180 	if (!size_ok) {
3181 		if (!besilent)
3182 			mdb_printf("buffer %p (allocated) has a corrupt "
3183 			    "redzone size encoding\n", addr);
3184 		goto corrupt;
3185 	}
3186 
3187 	if (!looks_ok) {
3188 		if (!besilent)
3189 			mdb_printf("buffer %p (allocated) has a corrupt "
3190 			    "redzone signature\n", addr);
3191 		goto corrupt;
3192 	}
3193 
3194 	if (verify_buftag(buftagp, KMEM_BUFTAG_ALLOC) == -1) {
3195 		if (!besilent)
3196 			mdb_printf("buffer %p (allocated) has a "
3197 			    "corrupt buftag\n", addr);
3198 		goto corrupt;
3199 	}
3200 
3201 	return (WALK_NEXT);
3202 corrupt:
3203 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3204 		mdb_printf("%p\n", addr);
3205 
3206 	kmv->kmv_corruption++;
3207 	return (WALK_NEXT);
3208 }
3209 
3210 /*ARGSUSED2*/
3211 int
3212 kmem_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3213 {
3214 	if (flags & DCMD_ADDRSPEC) {
3215 		int check_alloc = 0, check_free = 0;
3216 		kmem_verify_t kmv;
3217 
3218 		if (mdb_vread(&kmv.kmv_cache, sizeof (kmv.kmv_cache),
3219 		    addr) == -1) {
3220 			mdb_warn("couldn't read kmem_cache %p", addr);
3221 			return (DCMD_ERR);
3222 		}
3223 
3224 		if ((kmv.kmv_cache.cache_dump.kd_unsafe ||
3225 		    kmv.kmv_cache.cache_dump.kd_alloc_fails) &&
3226 		    !(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3227 			mdb_warn("WARNING: cache was used during dump: "
3228 			    "corruption may be incorrectly reported\n");
3229 		}
3230 
3231 		kmv.kmv_size = kmv.kmv_cache.cache_buftag +
3232 		    sizeof (kmem_buftag_t);
3233 		kmv.kmv_buf = mdb_alloc(kmv.kmv_size, UM_SLEEP | UM_GC);
3234 		kmv.kmv_corruption = 0;
3235 		kmv.kmv_flags = flags;
3236 
3237 		if ((kmv.kmv_cache.cache_flags & KMF_REDZONE)) {
3238 			check_alloc = 1;
3239 			if (kmv.kmv_cache.cache_flags & KMF_DEADBEEF)
3240 				check_free = 1;
3241 		} else {
3242 			if (!(flags & DCMD_LOOP)) {
3243 				mdb_warn("cache %p (%s) does not have "
3244 				    "redzone checking enabled\n", addr,
3245 				    kmv.kmv_cache.cache_name);
3246 			}
3247 			return (DCMD_ERR);
3248 		}
3249 
3250 		if (!(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3251 			mdb_printf("Summary for cache '%s'\n",
3252 			    kmv.kmv_cache.cache_name);
3253 			mdb_inc_indent(2);
3254 		}
3255 
3256 		if (check_alloc)
3257 			(void) mdb_pwalk("kmem", verify_alloc, &kmv, addr);
3258 		if (check_free)
3259 			(void) mdb_pwalk("freemem", verify_free, &kmv, addr);
3260 
3261 		if (!(flags & DCMD_PIPE_OUT)) {
3262 			if (flags & DCMD_LOOP) {
3263 				if (kmv.kmv_corruption == 0) {
3264 					mdb_printf("%-*s %?p clean\n",
3265 					    KMEM_CACHE_NAMELEN,
3266 					    kmv.kmv_cache.cache_name, addr);
3267 				} else {
3268 					mdb_printf("%-*s %?p %d corrupt "
3269 					    "buffer%s\n", KMEM_CACHE_NAMELEN,
3270 					    kmv.kmv_cache.cache_name, addr,
3271 					    kmv.kmv_corruption,
3272 					    kmv.kmv_corruption > 1 ? "s" : "");
3273 				}
3274 			} else {
3275 				/*
3276 				 * This is the more verbose mode, when the user
3277 				 * typed addr::kmem_verify.  If the cache was
3278 				 * clean, nothing will have yet been printed. So
3279 				 * say something.
3280 				 */
3281 				if (kmv.kmv_corruption == 0)
3282 					mdb_printf("clean\n");
3283 
3284 				mdb_dec_indent(2);
3285 			}
3286 		}
3287 	} else {
3288 		/*
3289 		 * If the user didn't specify a cache to verify, we'll walk all
3290 		 * kmem_cache's, specifying ourself as a callback for each...
3291 		 * this is the equivalent of '::walk kmem_cache .::kmem_verify'
3292 		 */
3293 
3294 		if (!(flags & DCMD_PIPE_OUT)) {
3295 			uintptr_t dump_curr;
3296 			uintptr_t dump_end;
3297 
3298 			if (mdb_readvar(&dump_curr, "kmem_dump_curr") != -1 &&
3299 			    mdb_readvar(&dump_end, "kmem_dump_end") != -1 &&
3300 			    dump_curr == dump_end) {
3301 				mdb_warn("WARNING: exceeded kmem_dump_size; "
3302 				    "corruption may be incorrectly reported\n");
3303 			}
3304 
3305 			mdb_printf("%<u>%-*s %-?s %-20s%</u>\n",
3306 			    KMEM_CACHE_NAMELEN, "Cache Name", "Addr",
3307 			    "Cache Integrity");
3308 		}
3309 
3310 		(void) (mdb_walk_dcmd("kmem_cache", "kmem_verify", 0, NULL));
3311 	}
3312 
3313 	return (DCMD_OK);
3314 }
3315 
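/*
 * Illustrative sketch (assumes the caches were created with redzone
 * checking enabled, e.g. by booting with kmem_flags set; the cache address
 * below is hypothetical):
 *
 *	> ::kmem_verify
 *	> 0xffffff01cc840008::kmem_verify
 *
 * The first form prints a one-line summary per cache; the second prints
 * per-buffer detail for a single cache.
 */
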
3316 typedef struct vmem_node {
3317 	struct vmem_node *vn_next;
3318 	struct vmem_node *vn_parent;
3319 	struct vmem_node *vn_sibling;
3320 	struct vmem_node *vn_children;
3321 	uintptr_t vn_addr;
3322 	int vn_marked;
3323 	vmem_t vn_vmem;
3324 } vmem_node_t;
3325 
3326 typedef struct vmem_walk {
3327 	vmem_node_t *vw_root;
3328 	vmem_node_t *vw_current;
3329 } vmem_walk_t;
3330 
3331 int
3332 vmem_walk_init(mdb_walk_state_t *wsp)
3333 {
3334 	uintptr_t vaddr, paddr;
3335 	vmem_node_t *head = NULL, *root = NULL, *current = NULL, *parent, *vp;
3336 	vmem_walk_t *vw;
3337 
3338 	if (mdb_readvar(&vaddr, "vmem_list") == -1) {
3339 		mdb_warn("couldn't read 'vmem_list'");
3340 		return (WALK_ERR);
3341 	}
3342 
3343 	while (vaddr != 0) {
3344 		vp = mdb_zalloc(sizeof (vmem_node_t), UM_SLEEP);
3345 		vp->vn_addr = vaddr;
3346 		vp->vn_next = head;
3347 		head = vp;
3348 
3349 		if (vaddr == wsp->walk_addr)
3350 			current = vp;
3351 
3352 		if (mdb_vread(&vp->vn_vmem, sizeof (vmem_t), vaddr) == -1) {
3353 			mdb_warn("couldn't read vmem_t at %p", vaddr);
3354 			goto err;
3355 		}
3356 
3357 		vaddr = (uintptr_t)vp->vn_vmem.vm_next;
3358 	}
3359 
3360 	for (vp = head; vp != NULL; vp = vp->vn_next) {
3361 
3362 		if ((paddr = (uintptr_t)vp->vn_vmem.vm_source) == 0) {
3363 			vp->vn_sibling = root;
3364 			root = vp;
3365 			continue;
3366 		}
3367 
3368 		for (parent = head; parent != NULL; parent = parent->vn_next) {
3369 			if (parent->vn_addr != paddr)
3370 				continue;
3371 			vp->vn_sibling = parent->vn_children;
3372 			parent->vn_children = vp;
3373 			vp->vn_parent = parent;
3374 			break;
3375 		}
3376 
3377 		if (parent == NULL) {
3378 			mdb_warn("couldn't find %p's parent (%p)\n",
3379 			    vp->vn_addr, paddr);
3380 			goto err;
3381 		}
3382 	}
3383 
3384 	vw = mdb_zalloc(sizeof (vmem_walk_t), UM_SLEEP);
3385 	vw->vw_root = root;
3386 
3387 	if (current != NULL)
3388 		vw->vw_current = current;
3389 	else
3390 		vw->vw_current = root;
3391 
3392 	wsp->walk_data = vw;
3393 	return (WALK_NEXT);
3394 err:
3395 	for (vp = head; head != NULL; vp = head) {
3396 		head = vp->vn_next;
3397 		mdb_free(vp, sizeof (vmem_node_t));
3398 	}
3399 
3400 	return (WALK_ERR);
3401 }
3402 
3403 int
3404 vmem_walk_step(mdb_walk_state_t *wsp)
3405 {
3406 	vmem_walk_t *vw = wsp->walk_data;
3407 	vmem_node_t *vp;
3408 	int rval;
3409 
3410 	if ((vp = vw->vw_current) == NULL)
3411 		return (WALK_DONE);
3412 
3413 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3414 
3415 	if (vp->vn_children != NULL) {
3416 		vw->vw_current = vp->vn_children;
3417 		return (rval);
3418 	}
3419 
3420 	do {
3421 		vw->vw_current = vp->vn_sibling;
3422 		vp = vp->vn_parent;
3423 	} while (vw->vw_current == NULL && vp != NULL);
3424 
3425 	return (rval);
3426 }
3427 
3428 /*
3429  * The "vmem_postfix" walk walks the vmem arenas in post-fix order; all
3430  * children are visited before their parent.  We perform the postfix walk
3431  * iteratively (rather than recursively) to allow mdb to regain control
3432  * after each callback.
3433  */
3434 int
3435 vmem_postfix_walk_step(mdb_walk_state_t *wsp)
3436 {
3437 	vmem_walk_t *vw = wsp->walk_data;
3438 	vmem_node_t *vp = vw->vw_current;
3439 	int rval;
3440 
3441 	/*
3442 	 * If this node is marked, then we know that we have already visited
3443 	 * all of its children.  If the node has any siblings, they need to
3444 	 * be visited next; otherwise, we need to visit the parent.  Note
3445 	 * that vp->vn_marked will only be zero on the first invocation of
3446 	 * the step function.
3447 	 */
3448 	if (vp->vn_marked) {
3449 		if (vp->vn_sibling != NULL)
3450 			vp = vp->vn_sibling;
3451 		else if (vp->vn_parent != NULL)
3452 			vp = vp->vn_parent;
3453 		else {
3454 			/*
3455 			 * We have neither a parent, nor a sibling, and we
3456 			 * have already been visited; we're done.
3457 			 */
3458 			return (WALK_DONE);
3459 		}
3460 	}
3461 
3462 	/*
3463 	 * Before we visit this node, visit its children.
3464 	 */
3465 	while (vp->vn_children != NULL && !vp->vn_children->vn_marked)
3466 		vp = vp->vn_children;
3467 
3468 	vp->vn_marked = 1;
3469 	vw->vw_current = vp;
3470 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3471 
3472 	return (rval);
3473 }
3474 
3475 void
3476 vmem_walk_fini(mdb_walk_state_t *wsp)
3477 {
3478 	vmem_walk_t *vw = wsp->walk_data;
3479 	vmem_node_t *root = vw->vw_root;
3480 	int done;
3481 
3482 	if (root == NULL)
3483 		return;
3484 
3485 	if ((vw->vw_root = root->vn_children) != NULL)
3486 		vmem_walk_fini(wsp);
3487 
3488 	vw->vw_root = root->vn_sibling;
3489 	done = (root->vn_sibling == NULL && root->vn_parent == NULL);
3490 	mdb_free(root, sizeof (vmem_node_t));
3491 
3492 	if (done) {
3493 		mdb_free(vw, sizeof (vmem_walk_t));
3494 	} else {
3495 		vmem_walk_fini(wsp);
3496 	}
3497 }
3498 
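/*
 * Illustrative sketch: the vmem walker visits each arena before its
 * children (pre-order), while vmem_postfix visits children before their
 * parents, as described below.  Either composes with the ::vmem dcmd:
 *
 *	> ::walk vmem | ::vmem
 *	> ::walk vmem_postfix | ::vmem
 */
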
3499 typedef struct vmem_seg_walk {
3500 	uint8_t vsw_type;
3501 	uintptr_t vsw_start;
3502 	uintptr_t vsw_current;
3503 } vmem_seg_walk_t;
3504 
3505 /*ARGSUSED*/
3506 int
3507 vmem_seg_walk_common_init(mdb_walk_state_t *wsp, uint8_t type, char *name)
3508 {
3509 	vmem_seg_walk_t *vsw;
3510 
3511 	if (wsp->walk_addr == 0) {
3512 		mdb_warn("vmem_%s does not support global walks\n", name);
3513 		return (WALK_ERR);
3514 	}
3515 
3516 	wsp->walk_data = vsw = mdb_alloc(sizeof (vmem_seg_walk_t), UM_SLEEP);
3517 
3518 	vsw->vsw_type = type;
3519 	vsw->vsw_start = wsp->walk_addr + offsetof(vmem_t, vm_seg0);
3520 	vsw->vsw_current = vsw->vsw_start;
3521 
3522 	return (WALK_NEXT);
3523 }
3524 
3525 /*
3526  * vmem segments can't have type 0 (this should be added to vmem_impl.h).
3527  */
3528 #define	VMEM_NONE	0
3529 
3530 int
3531 vmem_alloc_walk_init(mdb_walk_state_t *wsp)
3532 {
3533 	return (vmem_seg_walk_common_init(wsp, VMEM_ALLOC, "alloc"));
3534 }
3535 
3536 int
3537 vmem_free_walk_init(mdb_walk_state_t *wsp)
3538 {
3539 	return (vmem_seg_walk_common_init(wsp, VMEM_FREE, "free"));
3540 }
3541 
3542 int
3543 vmem_span_walk_init(mdb_walk_state_t *wsp)
3544 {
3545 	return (vmem_seg_walk_common_init(wsp, VMEM_SPAN, "span"));
3546 }
3547 
3548 int
3549 vmem_seg_walk_init(mdb_walk_state_t *wsp)
3550 {
3551 	return (vmem_seg_walk_common_init(wsp, VMEM_NONE, "seg"));
3552 }
3553 
3554 int
3555 vmem_seg_walk_step(mdb_walk_state_t *wsp)
3556 {
3557 	vmem_seg_t seg;
3558 	vmem_seg_walk_t *vsw = wsp->walk_data;
3559 	uintptr_t addr = vsw->vsw_current;
3560 	static size_t seg_size = 0;
3561 	int rval;
3562 
3563 	if (!seg_size) {
3564 		if (mdb_readvar(&seg_size, "vmem_seg_size") == -1) {
3565 			mdb_warn("failed to read 'vmem_seg_size'");
3566 			seg_size = sizeof (vmem_seg_t);
3567 		}
3568 	}
3569 
3570 	if (seg_size < sizeof (seg))
3571 		bzero((caddr_t)&seg + seg_size, sizeof (seg) - seg_size);
3572 
3573 	if (mdb_vread(&seg, seg_size, addr) == -1) {
3574 		mdb_warn("couldn't read vmem_seg at %p", addr);
3575 		return (WALK_ERR);
3576 	}
3577 
3578 	vsw->vsw_current = (uintptr_t)seg.vs_anext;
3579 	if (vsw->vsw_type != VMEM_NONE && seg.vs_type != vsw->vsw_type) {
3580 		rval = WALK_NEXT;
3581 	} else {
3582 		rval = wsp->walk_callback(addr, &seg, wsp->walk_cbdata);
3583 	}
3584 
3585 	if (vsw->vsw_current == vsw->vsw_start)
3586 		return (WALK_DONE);
3587 
3588 	return (rval);
3589 }
3590 
3591 void
3592 vmem_seg_walk_fini(mdb_walk_state_t *wsp)
3593 {
3594 	vmem_seg_walk_t *vsw = wsp->walk_data;
3595 
3596 	mdb_free(vsw, sizeof (vmem_seg_walk_t));
3597 }
3598 
3599 #define	VMEM_NAMEWIDTH	22
3600 
3601 int
3602 vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3603 {
3604 	vmem_t v, parent;
3605 	vmem_kstat_t *vkp = &v.vm_kstat;
3606 	uintptr_t paddr;
3607 	int ident = 0;
3608 	char c[VMEM_NAMEWIDTH];
3609 
3610 	if (!(flags & DCMD_ADDRSPEC)) {
3611 		if (mdb_walk_dcmd("vmem", "vmem", argc, argv) == -1) {
3612 			mdb_warn("can't walk vmem");
3613 			return (DCMD_ERR);
3614 		}
3615 		return (DCMD_OK);
3616 	}
3617 
3618 	if (DCMD_HDRSPEC(flags))
3619 		mdb_printf("%-?s %-*s %10s %12s %9s %5s\n",
3620 		    "ADDR", VMEM_NAMEWIDTH, "NAME", "INUSE",
3621 		    "TOTAL", "SUCCEED", "FAIL");
3622 
3623 	if (mdb_vread(&v, sizeof (v), addr) == -1) {
3624 		mdb_warn("couldn't read vmem at %p", addr);
3625 		return (DCMD_ERR);
3626 	}
3627 
3628 	for (paddr = (uintptr_t)v.vm_source; paddr != 0; ident += 2) {
3629 		if (mdb_vread(&parent, sizeof (parent), paddr) == -1) {
3630 			mdb_warn("couldn't trace %p's ancestry", addr);
3631 			ident = 0;
3632 			break;
3633 		}
3634 		paddr = (uintptr_t)parent.vm_source;
3635 	}
3636 
3637 	(void) mdb_snprintf(c, VMEM_NAMEWIDTH, "%*s%s", ident, "", v.vm_name);
3638 
3639 	mdb_printf("%0?p %-*s %10llu %12llu %9llu %5llu\n",
3640 	    addr, VMEM_NAMEWIDTH, c,
3641 	    vkp->vk_mem_inuse.value.ui64, vkp->vk_mem_total.value.ui64,
3642 	    vkp->vk_alloc.value.ui64, vkp->vk_fail.value.ui64);
3643 
3644 	return (DCMD_OK);
3645 }
3646 
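/*
 * Illustrative sketch: with no address, ::vmem prints one row per arena,
 * indented by its depth in the source (parent) hierarchy; the arena
 * address below is hypothetical:
 *
 *	> ::vmem
 *	> fffffffffbc8cbd0::vmem
 */
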
3647 void
3648 vmem_seg_help(void)
3649 {
3650 	mdb_printf("%s",
3651 "Display the contents of vmem_seg_ts, with optional filtering.\n\n"
3652 "\n"
3653 "A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n"
3654 "representing a single chunk of data.  Only ALLOC segments have debugging\n"
3655 "information.\n");
3656 	mdb_dec_indent(2);
3657 	mdb_printf("%<b>OPTIONS%</b>\n");
3658 	mdb_inc_indent(2);
3659 	mdb_printf("%s",
3660 "  -v    Display the full content of the vmem_seg, including its stack trace\n"
3661 "  -s    report the size of the segment, instead of the end address\n"
3662 "  -c caller\n"
3663 "        filter out segments without the function/PC in their stack trace\n"
3664 "  -e earliest\n"
3665 "        filter out segments timestamped before earliest\n"
3666 "  -l latest\n"
3667 "        filter out segments timestamped after latest\n"
3668 "  -m minsize\n"
3669 "        filer out segments smaller than minsize\n"
3670 "  -M maxsize\n"
3671 "        filer out segments larger than maxsize\n"
3672 "  -t thread\n"
3673 "        filter out segments not involving thread\n"
3674 "  -T type\n"
3675 "        filter out segments not of type 'type'\n"
3676 "        type is one of: ALLOC/FREE/SPAN/ROTOR/WALKER\n");
3677 }
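
/*
 * Illustrative (not captured) invocations of ::vmem_seg, assuming `arena' is
 * the address of a vmem_t and `func' is a kernel function name:
 *
 *	> arena::walk vmem_alloc | ::vmem_seg -v
 *	> arena::walk vmem_seg | ::vmem_seg -T SPAN -s
 *	> arena::walk vmem_alloc | ::vmem_seg -c func -m 0t8192
 *
 * With -c, only segments whose saved stack trace includes the named
 * function (or PC) are printed; -m and -M filter on segment size.
 */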
3678 
3679 /*ARGSUSED*/
3680 int
3681 vmem_seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3682 {
3683 	vmem_seg_t vs;
3684 	pc_t *stk = vs.vs_stack;
3685 	uintptr_t sz;
3686 	uint8_t t;
3687 	const char *type = NULL;
3688 	GElf_Sym sym;
3689 	char c[MDB_SYM_NAMLEN];
3690 	int no_debug;
3691 	int i;
3692 	int depth;
3693 	uintptr_t laddr, haddr;
3694 
3695 	uintptr_t caller = 0, thread = 0;
3696 	uintptr_t minsize = 0, maxsize = 0;
3697 
3698 	hrtime_t earliest = 0, latest = 0;
3699 
3700 	uint_t size = 0;
3701 	uint_t verbose = 0;
3702 
3703 	if (!(flags & DCMD_ADDRSPEC))
3704 		return (DCMD_USAGE);
3705 
3706 	if (mdb_getopts(argc, argv,
3707 	    'c', MDB_OPT_UINTPTR, &caller,
3708 	    'e', MDB_OPT_UINT64, &earliest,
3709 	    'l', MDB_OPT_UINT64, &latest,
3710 	    's', MDB_OPT_SETBITS, TRUE, &size,
3711 	    'm', MDB_OPT_UINTPTR, &minsize,
3712 	    'M', MDB_OPT_UINTPTR, &maxsize,
3713 	    't', MDB_OPT_UINTPTR, &thread,
3714 	    'T', MDB_OPT_STR, &type,
3715 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
3716 	    NULL) != argc)
3717 		return (DCMD_USAGE);
3718 
3719 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
3720 		if (verbose) {
3721 			mdb_printf("%16s %4s %16s %16s %16s\n"
3722 			    "%<u>%16s %4s %16s %16s %16s%</u>\n",
3723 			    "ADDR", "TYPE", "START", "END", "SIZE",
3724 			    "", "", "THREAD", "TIMESTAMP", "");
3725 		} else {
3726 			mdb_printf("%?s %4s %?s %?s %s\n", "ADDR", "TYPE",
3727 			    "START", size? "SIZE" : "END", "WHO");
3728 		}
3729 	}
3730 
3731 	if (mdb_vread(&vs, sizeof (vs), addr) == -1) {
3732 		mdb_warn("couldn't read vmem_seg at %p", addr);
3733 		return (DCMD_ERR);
3734 	}
3735 
3736 	if (type != NULL) {
3737 		if (strcmp(type, "ALLC") == 0 || strcmp(type, "ALLOC") == 0)
3738 			t = VMEM_ALLOC;
3739 		else if (strcmp(type, "FREE") == 0)
3740 			t = VMEM_FREE;
3741 		else if (strcmp(type, "SPAN") == 0)
3742 			t = VMEM_SPAN;
3743 		else if (strcmp(type, "ROTR") == 0 ||
3744 		    strcmp(type, "ROTOR") == 0)
3745 			t = VMEM_ROTOR;
3746 		else if (strcmp(type, "WLKR") == 0 ||
3747 		    strcmp(type, "WALKER") == 0)
3748 			t = VMEM_WALKER;
3749 		else {
3750 			mdb_warn("\"%s\" is not a recognized vmem_seg type\n",
3751 			    type);
3752 			return (DCMD_ERR);
3753 		}
3754 
3755 		if (vs.vs_type != t)
3756 			return (DCMD_OK);
3757 	}
3758 
3759 	sz = vs.vs_end - vs.vs_start;
3760 
3761 	if (minsize != 0 && sz < minsize)
3762 		return (DCMD_OK);
3763 
3764 	if (maxsize != 0 && sz > maxsize)
3765 		return (DCMD_OK);
3766 
3767 	t = vs.vs_type;
3768 	depth = vs.vs_depth;
3769 
3770 	/*
3771 	 * debug info, when present, is only accurate for VMEM_ALLOC segments
3772 	 */
3773 	no_debug = (t != VMEM_ALLOC) ||
3774 	    (depth == 0 || depth > VMEM_STACK_DEPTH);
3775 
3776 	if (no_debug) {
3777 		if (caller != 0 || thread != 0 || earliest != 0 || latest != 0)
3778 			return (DCMD_OK);		/* not enough info */
3779 	} else {
3780 		if (caller != 0) {
3781 			laddr = caller;
3782 			haddr = caller + sizeof (caller);
3783 
3784 			if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c,
3785 			    sizeof (c), &sym) != -1 &&
3786 			    caller == (uintptr_t)sym.st_value) {
3787 				/*
3788 				 * We were provided an exact symbol value; any
3789 				 * address in the function is valid.
3790 				 */
3791 				laddr = (uintptr_t)sym.st_value;
3792 				haddr = (uintptr_t)sym.st_value + sym.st_size;
3793 			}
3794 
3795 			for (i = 0; i < depth; i++)
3796 				if (vs.vs_stack[i] >= laddr &&
3797 				    vs.vs_stack[i] < haddr)
3798 					break;
3799 
3800 			if (i == depth)
3801 				return (DCMD_OK);
3802 		}
3803 
3804 		if (thread != 0 && (uintptr_t)vs.vs_thread != thread)
3805 			return (DCMD_OK);
3806 
3807 		if (earliest != 0 && vs.vs_timestamp < earliest)
3808 			return (DCMD_OK);
3809 
3810 		if (latest != 0 && vs.vs_timestamp > latest)
3811 			return (DCMD_OK);
3812 	}
3813 
3814 	type = (t == VMEM_ALLOC ? "ALLC" :
3815 	    t == VMEM_FREE ? "FREE" :
3816 	    t == VMEM_SPAN ? "SPAN" :
3817 	    t == VMEM_ROTOR ? "ROTR" :
3818 	    t == VMEM_WALKER ? "WLKR" :
3819 	    "????");
3820 
3821 	if (flags & DCMD_PIPE_OUT) {
3822 		mdb_printf("%#lr\n", addr);
3823 		return (DCMD_OK);
3824 	}
3825 
3826 	if (verbose) {
3827 		mdb_printf("%<b>%16p%</b> %4s %16p %16p %16ld\n",
3828 		    addr, type, vs.vs_start, vs.vs_end, sz);
3829 
3830 		if (no_debug)
3831 			return (DCMD_OK);
3832 
3833 		mdb_printf("%16s %4s %16p %16llx\n",
3834 		    "", "", vs.vs_thread, vs.vs_timestamp);
3835 
3836 		mdb_inc_indent(17);
3837 		for (i = 0; i < depth; i++) {
3838 			mdb_printf("%a\n", stk[i]);
3839 		}
3840 		mdb_dec_indent(17);
3841 		mdb_printf("\n");
3842 	} else {
3843 		mdb_printf("%0?p %4s %0?p %0?p", addr, type,
3844 		    vs.vs_start, size? sz : vs.vs_end);
3845 
3846 		if (no_debug) {
3847 			mdb_printf("\n");
3848 			return (DCMD_OK);
3849 		}
3850 
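		/*
		 * Skip the allocator's own vmem_* frames so the single-line
		 * format reports the segment's actual caller.
		 */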
3851 		for (i = 0; i < depth; i++) {
3852 			if (mdb_lookup_by_addr(stk[i], MDB_SYM_FUZZY,
3853 			    c, sizeof (c), &sym) == -1)
3854 				continue;
3855 			if (strncmp(c, "vmem_", 5) == 0)
3856 				continue;
3857 			break;
3858 		}
3859 		mdb_printf(" %a\n", stk[i]);
3860 	}
3861 	return (DCMD_OK);
3862 }
3863 
3864 typedef struct kmalog_data {
3865 	uintptr_t	kma_addr;
3866 	hrtime_t	kma_newest;
3867 } kmalog_data_t;
3868 
3869 /*ARGSUSED*/
3870 static int
3871 showbc(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmalog_data_t *kma)
3872 {
3873 	char name[KMEM_CACHE_NAMELEN + 1];
3874 	hrtime_t delta;
3875 	int i, depth;
3876 	size_t bufsize;
3877 
3878 	if (bcp->bc_timestamp == 0)
3879 		return (WALK_DONE);
3880 
3881 	if (kma->kma_newest == 0)
3882 		kma->kma_newest = bcp->bc_timestamp;
3883 
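	/*
	 * If we were given an address, only report transactions whose
	 * buffer contains that address.
	 */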
3884 	if (kma->kma_addr) {
3885 		if (mdb_vread(&bufsize, sizeof (bufsize),
3886 		    (uintptr_t)&bcp->bc_cache->cache_bufsize) == -1) {
3887 			mdb_warn(
3888 			    "failed to read cache_bufsize for cache at %p",
3889 			    bcp->bc_cache);
3890 			return (WALK_ERR);
3891 		}
3892 
3893 		if (kma->kma_addr < (uintptr_t)bcp->bc_addr ||
3894 		    kma->kma_addr >= (uintptr_t)bcp->bc_addr + bufsize)
3895 			return (WALK_NEXT);
3896 	}
3897 
3898 	delta = kma->kma_newest - bcp->bc_timestamp;
3899 	depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3900 
3901 	if (mdb_readstr(name, sizeof (name), (uintptr_t)
3902 	    &bcp->bc_cache->cache_name) <= 0)
3903 		(void) mdb_snprintf(name, sizeof (name), "%a", bcp->bc_cache);
3904 
3905 	mdb_printf("\nT-%lld.%09lld  addr=%p  %s\n",
3906 	    delta / NANOSEC, delta % NANOSEC, bcp->bc_addr, name);
3907 
3908 	for (i = 0; i < depth; i++)
3909 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
3910 
3911 	return (WALK_NEXT);
3912 }
3913 
3914 int
3915 kmalog(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3916 {
3917 	const char *logname = "kmem_transaction_log";
3918 	kmalog_data_t kma;
3919 
3920 	if (argc > 1)
3921 		return (DCMD_USAGE);
3922 
3923 	kma.kma_newest = 0;
3924 	if (flags & DCMD_ADDRSPEC)
3925 		kma.kma_addr = addr;
3926 	else
3927 		kma.kma_addr = 0;
3928 
3929 	if (argc > 0) {
3930 		if (argv->a_type != MDB_TYPE_STRING)
3931 			return (DCMD_USAGE);
3932 		if (strcmp(argv->a_un.a_str, "fail") == 0)
3933 			logname = "kmem_failure_log";
3934 		else if (strcmp(argv->a_un.a_str, "slab") == 0)
3935 			logname = "kmem_slab_log";
3936 		else if (strcmp(argv->a_un.a_str, "zerosized") == 0)
3937 			logname = "kmem_zerosized_log";
3938 		else
3939 			return (DCMD_USAGE);
3940 	}
3941 
3942 	if (mdb_readvar(&addr, logname) == -1) {
3943 		mdb_warn("failed to read %s log header pointer", logname);
3944 		return (DCMD_ERR);
3945 	}
3946 
3947 	if (mdb_pwalk("kmem_log", (mdb_walk_cb_t)showbc, &kma, addr) == -1) {
3948 		mdb_warn("failed to walk kmem log");
3949 		return (DCMD_ERR);
3950 	}
3951 
3952 	return (DCMD_OK);
3953 }
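
/*
 * Illustrative usage (output not shown):
 *
 *	> ::kmalog			(recent allocation/free transactions)
 *	> ::kmalog fail			(failed allocations)
 *	> ::kmalog slab			(slab creations)
 *	> ::kmalog zerosized		(zero-sized allocations)
 *	> buf::kmalog			(transactions touching the buffer at buf)
 *
 * `buf' is a placeholder for a buffer address of interest.
 */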
3954 
3955 /*
3956  * As the final lure for die-hard crash(8) users, we provide ::kmausers here.
3957  * The first piece is a structure which we use to accumulate kmem_cache_t
3958  * addresses of interest.  The kmc_add is used as a callback for the kmem_cache
3959  * walker; we either add all caches, or ones named explicitly as arguments.
3960  */
3961 
3962 typedef struct kmclist {
3963 	const char *kmc_name;			/* Name to match (or NULL) */
3964 	uintptr_t *kmc_caches;			/* List of kmem_cache_t addrs */
3965 	int kmc_nelems;				/* Num entries in kmc_caches */
3966 	int kmc_size;				/* Size of kmc_caches array */
3967 } kmclist_t;
3968 
3969 static int
3970 kmc_add(uintptr_t addr, const kmem_cache_t *cp, kmclist_t *kmc)
3971 {
3972 	void *p;
3973 	int s;
3974 
3975 	if (kmc->kmc_name == NULL ||
3976 	    strcmp(cp->cache_name, kmc->kmc_name) == 0) {
3977 		/*
3978 		 * If we have a match, grow our array (if necessary), and then
3979 		 * add the virtual address of the matching cache to our list.
3980 		 */
3981 		if (kmc->kmc_nelems >= kmc->kmc_size) {
3982 			s = kmc->kmc_size ? kmc->kmc_size * 2 : 256;
3983 			p = mdb_alloc(sizeof (uintptr_t) * s, UM_SLEEP | UM_GC);
3984 
3985 			bcopy(kmc->kmc_caches, p,
3986 			    sizeof (uintptr_t) * kmc->kmc_size);
3987 
3988 			kmc->kmc_caches = p;
3989 			kmc->kmc_size = s;
3990 		}
3991 
3992 		kmc->kmc_caches[kmc->kmc_nelems++] = addr;
3993 		return (kmc->kmc_name ? WALK_DONE : WALK_NEXT);
3994 	}
3995 
3996 	return (WALK_NEXT);
3997 }
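
/*
 * kmc_add is meant to be driven by the kmem_cache walker; a minimal sketch
 * of its use (the ::kmausers dcmd below does the same with argument
 * handling around it):
 *
 *	kmclist_t kmc;
 *
 *	bzero(&kmc, sizeof (kmc));
 *	kmc.kmc_name = NULL;	(match every cache)
 *	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
 *
 * after which kmc.kmc_caches[0 .. kmc.kmc_nelems - 1] holds the addresses
 * of the matching caches.
 */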
3998 
3999 /*
4000  * The second piece of ::kmausers is a hash table of allocations.  Each
4001  * allocation owner is identified by its stack trace and data_size.  We then
4002  * track the total bytes of all such allocations, and the number of allocations
4003  * to report at the end.  Once we have a list of caches, we walk through the
4004  * allocated bufctls of each, and update our hash table accordingly.
4005  */
4006 
4007 typedef struct kmowner {
4008 	struct kmowner *kmo_head;		/* First hash elt in bucket */
4009 	struct kmowner *kmo_next;		/* Next hash elt in chain */
4010 	size_t kmo_signature;			/* Hash table signature */
4011 	uint_t kmo_num;				/* Number of allocations */
4012 	size_t kmo_data_size;			/* Size of each allocation */
4013 	size_t kmo_total_size;			/* Total bytes of allocation */
4014 	int kmo_depth;				/* Depth of stack trace */
4015 	uintptr_t kmo_stack[KMEM_STACK_DEPTH];	/* Stack trace */
4016 } kmowner_t;
4017 
4018 typedef struct kmusers {
4019 	uintptr_t kmu_addr;			/* address of interest */
4020 	const kmem_cache_t *kmu_cache;		/* Current kmem cache */
4021 	kmowner_t *kmu_hash;			/* Hash table of owners */
4022 	int kmu_nelems;				/* Number of entries in use */
4023 	int kmu_size;				/* Total number of entries */
4024 } kmusers_t;
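
/*
 * An owner's hash signature is simply its data size plus the sum of its
 * stack PCs; kmu_add below computes, in effect,
 *
 *	signature = data_size + stack[0] + stack[1] + ... + stack[depth - 1]
 *	bucket = signature & (kmu_size - 1)
 *
 * which is why kmu_size is always a power of two (1024, doubling on demand).
 */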
4025 
4026 static void
4027 kmu_add(kmusers_t *kmu, const kmem_bufctl_audit_t *bcp,
4028     size_t size, size_t data_size)
4029 {
4030 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4031 	size_t bucket, signature = data_size;
4032 	kmowner_t *kmo, *kmoend;
4033 
4034 	/*
4035 	 * If the hash table is full, double its size and rehash everything.
4036 	 */
4037 	if (kmu->kmu_nelems >= kmu->kmu_size) {
4038 		int s = kmu->kmu_size ? kmu->kmu_size * 2 : 1024;
4039 
4040 		kmo = mdb_alloc(sizeof (kmowner_t) * s, UM_SLEEP | UM_GC);
4041 		bcopy(kmu->kmu_hash, kmo, sizeof (kmowner_t) * kmu->kmu_size);
4042 		kmu->kmu_hash = kmo;
4043 		kmu->kmu_size = s;
4044 
4045 		kmoend = kmu->kmu_hash + kmu->kmu_size;
4046 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++)
4047 			kmo->kmo_head = NULL;
4048 
4049 		kmoend = kmu->kmu_hash + kmu->kmu_nelems;
4050 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++) {
4051 			bucket = kmo->kmo_signature & (kmu->kmu_size - 1);
4052 			kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4053 			kmu->kmu_hash[bucket].kmo_head = kmo;
4054 		}
4055 	}
4056 
4057 	/*
4058 	 * Finish computing the hash signature from the stack trace, and then
4059 	 * see if the owner is in the hash table.  If so, update our stats.
4060 	 */
4061 	for (i = 0; i < depth; i++)
4062 		signature += bcp->bc_stack[i];
4063 
4064 	bucket = signature & (kmu->kmu_size - 1);
4065 
4066 	for (kmo = kmu->kmu_hash[bucket].kmo_head; kmo; kmo = kmo->kmo_next) {
4067 		if (kmo->kmo_signature == signature) {
4068 			size_t difference = 0;
4069 
4070 			difference |= kmo->kmo_data_size - data_size;
4071 			difference |= kmo->kmo_depth - depth;
4072 
4073 			for (i = 0; i < depth; i++) {
4074 				difference |= kmo->kmo_stack[i] -
4075 				    bcp->bc_stack[i];
4076 			}
4077 
4078 			if (difference == 0) {
4079 				kmo->kmo_total_size += size;
4080 				kmo->kmo_num++;
4081 				return;
4082 			}
4083 		}
4084 	}
4085 
4086 	/*
4087 	 * If the owner is not yet hashed, grab the next element and fill it
4088 	 * in based on the allocation information.
4089 	 */
4090 	kmo = &kmu->kmu_hash[kmu->kmu_nelems++];
4091 	kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4092 	kmu->kmu_hash[bucket].kmo_head = kmo;
4093 
4094 	kmo->kmo_signature = signature;
4095 	kmo->kmo_num = 1;
4096 	kmo->kmo_data_size = data_size;
4097 	kmo->kmo_total_size = size;
4098 	kmo->kmo_depth = depth;
4099 
4100 	for (i = 0; i < depth; i++)
4101 		kmo->kmo_stack[i] = bcp->bc_stack[i];
4102 }
4103 
4104 /*
4105  * When ::kmausers is invoked without the -f flag, we simply update our hash
4106  * table with the information from each allocated bufctl.
4107  */
4108 /*ARGSUSED*/
4109 static int
4110 kmause1(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4111 {
4112 	const kmem_cache_t *cp = kmu->kmu_cache;
4113 
4114 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4115 	return (WALK_NEXT);
4116 }
4117 
4118 /*
4119  * When ::kmausers is invoked with the -f flag, we print out the information
4120  * for each bufctl as well as updating the hash table.
4121  */
4122 static int
4123 kmause2(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4124 {
4125 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4126 	const kmem_cache_t *cp = kmu->kmu_cache;
4127 	kmem_bufctl_t bufctl;
4128 
4129 	if (kmu->kmu_addr) {
4130 		if (mdb_vread(&bufctl, sizeof (bufctl), addr) == -1)
4131 			mdb_warn("couldn't read bufctl at %p", addr);
4132 		else if (kmu->kmu_addr < (uintptr_t)bufctl.bc_addr ||
4133 		    kmu->kmu_addr >= (uintptr_t)bufctl.bc_addr +
4134 		    cp->cache_bufsize)
4135 			return (WALK_NEXT);
4136 	}
4137 
4138 	mdb_printf("size %d, addr %p, thread %p, cache %s\n",
4139 	    cp->cache_bufsize, addr, bcp->bc_thread, cp->cache_name);
4140 
4141 	for (i = 0; i < depth; i++)
4142 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
4143 
4144 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4145 	return (WALK_NEXT);
4146 }
4147 
4148 /*
4149  * We sort our results by allocation size before printing them.
4150  */
4151 static int
4152 kmownercmp(const void *lp, const void *rp)
4153 {
4154 	const kmowner_t *lhs = lp;
4155 	const kmowner_t *rhs = rp;
4156 
4157 	return (rhs->kmo_total_size - lhs->kmo_total_size);
4158 }
4159 
4160 /*
4161  * The main engine of ::kmausers is relatively straightforward: First we
4162  * accumulate our list of kmem_cache_t addresses into the kmclist_t. Next we
4163  * iterate over the allocated bufctls of each cache in the list.  Finally,
4164  * we sort and print our results.
4165  */
4166 /*ARGSUSED*/
4167 int
4168 kmausers(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4169 {
4170 	int mem_threshold = 8192;	/* Minimum # bytes for printing */
4171 	int cnt_threshold = 100;	/* Minimum # blocks for printing */
4172 	int audited_caches = 0;		/* Number of KMF_AUDIT caches found */
4173 	int do_all_caches = 1;		/* Do all caches (no arguments) */
4174 	int opt_e = FALSE;		/* Include "small" users */
4175 	int opt_f = FALSE;		/* Print stack traces */
4176 
4177 	mdb_walk_cb_t callback = (mdb_walk_cb_t)kmause1;
4178 	kmowner_t *kmo, *kmoend;
4179 	int i, oelems;
4180 
4181 	kmclist_t kmc;
4182 	kmusers_t kmu;
4183 
4184 	bzero(&kmc, sizeof (kmc));
4185 	bzero(&kmu, sizeof (kmu));
4186 
4187 	while ((i = mdb_getopts(argc, argv,
4188 	    'e', MDB_OPT_SETBITS, TRUE, &opt_e,
4189 	    'f', MDB_OPT_SETBITS, TRUE, &opt_f, NULL)) != argc) {
4190 
4191 		argv += i;	/* skip past options we just processed */
4192 		argc -= i;	/* adjust argc */
4193 
4194 		if (argv->a_type != MDB_TYPE_STRING || *argv->a_un.a_str == '-')
4195 			return (DCMD_USAGE);
4196 
4197 		oelems = kmc.kmc_nelems;
4198 		kmc.kmc_name = argv->a_un.a_str;
4199 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4200 
4201 		if (kmc.kmc_nelems == oelems) {
4202 			mdb_warn("unknown kmem cache: %s\n", kmc.kmc_name);
4203 			return (DCMD_ERR);
4204 		}
4205 
4206 		do_all_caches = 0;
4207 		argv++;
4208 		argc--;
4209 	}
4210 
4211 	if (flags & DCMD_ADDRSPEC) {
4212 		opt_f = TRUE;
4213 		kmu.kmu_addr = addr;
4214 	} else {
4215 		kmu.kmu_addr = 0;
4216 	}
4217 
4218 	if (opt_e)
4219 		mem_threshold = cnt_threshold = 0;
4220 
4221 	if (opt_f)
4222 		callback = (mdb_walk_cb_t)kmause2;
4223 
4224 	if (do_all_caches) {
4225 		kmc.kmc_name = NULL; /* match all cache names */
4226 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4227 	}
4228 
4229 	for (i = 0; i < kmc.kmc_nelems; i++) {
4230 		uintptr_t cp = kmc.kmc_caches[i];
4231 		kmem_cache_t c;
4232 
4233 		if (mdb_vread(&c, sizeof (c), cp) == -1) {
4234 			mdb_warn("failed to read cache at %p", cp);
4235 			continue;
4236 		}
4237 
4238 		if (!(c.cache_flags & KMF_AUDIT)) {
4239 			if (!do_all_caches) {
4240 				mdb_warn("KMF_AUDIT is not enabled for %s\n",
4241 				    c.cache_name);
4242 			}
4243 			continue;
4244 		}
4245 
4246 		kmu.kmu_cache = &c;
4247 		(void) mdb_pwalk("bufctl", callback, &kmu, cp);
4248 		audited_caches++;
4249 	}
4250 
4251 	if (audited_caches == 0 && do_all_caches) {
4252 		mdb_warn("KMF_AUDIT is not enabled for any caches\n");
4253 		return (DCMD_ERR);
4254 	}
4255 
4256 	qsort(kmu.kmu_hash, kmu.kmu_nelems, sizeof (kmowner_t), kmownercmp);
4257 	kmoend = kmu.kmu_hash + kmu.kmu_nelems;
4258 
4259 	for (kmo = kmu.kmu_hash; kmo < kmoend; kmo++) {
4260 		if (kmo->kmo_total_size < mem_threshold &&
4261 		    kmo->kmo_num < cnt_threshold)
4262 			continue;
4263 		mdb_printf("%lu bytes for %u allocations with data size %lu:\n",
4264 		    kmo->kmo_total_size, kmo->kmo_num, kmo->kmo_data_size);
4265 		for (i = 0; i < kmo->kmo_depth; i++)
4266 			mdb_printf("\t %a\n", kmo->kmo_stack[i]);
4267 	}
4268 
4269 	return (DCMD_OK);
4270 }
4271 
4272 void
4273 kmausers_help(void)
4274 {
4275 	mdb_printf(
4276 	    "Displays the largest users of the kmem allocator, sorted by \n"
4277 	    "trace.  If one or more caches is specified, only those caches\n"
4278 	    "will be searched.  By default, all caches are searched.  If an\n"
4279 	    "address is specified, then only those allocations which include\n"
4280 	    "the given address are displayed.  Specifying an address implies\n"
4281 	    "-f.\n"
4282 	    "\n"
4283 	    "\t-e\tInclude all users, not just the largest\n"
4284 	    "\t-f\tDisplay individual allocations.  By default, users are\n"
4285 	    "\t\tgrouped by stack\n");
4286 }
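
/*
 * Illustrative invocations (output not shown):
 *
 *	> ::kmausers				(largest users, all caches)
 *	> ::kmausers -e -f kmem_alloc_256	(every user of one cache)
 *	> buf::kmausers				(users of buffers containing buf)
 *
 * `buf' is a placeholder for an address of interest; as noted above,
 * specifying an address implies -f.
 */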
4287 
4288 static int
4289 kmem_ready_check(void)
4290 {
4291 	int ready;
4292 
4293 	if (mdb_readvar(&ready, "kmem_ready") < 0)
4294 		return (-1); /* errno is set for us */
4295 
4296 	return (ready);
4297 }
4298 
4299 void
4300 kmem_statechange(void)
4301 {
4302 	static int been_ready = 0;
4303 
4304 	if (been_ready)
4305 		return;
4306 
4307 	if (kmem_ready_check() <= 0)
4308 		return;
4309 
4310 	been_ready = 1;
4311 	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_init_walkers, NULL);
4312 }
4313 
4314 void
4315 kmem_init(void)
4316 {
4317 	mdb_walker_t w = {
4318 		"kmem_cache", "walk list of kmem caches", kmem_cache_walk_init,
4319 		list_walk_step, list_walk_fini
4320 	};
4321 
4322 	/*
4323 	 * If kmem is ready, we'll need to invoke the kmem_cache walker
4324 	 * immediately.  Walkers in the linkage structure won't be ready until
4325 	 * _mdb_init returns, so we'll need to add this one manually.  If kmem
4326 	 * is ready, we'll use the walker to initialize the caches.  If kmem
4327 	 * isn't ready, we'll register a callback that will allow us to defer
4328 	 * cache walking until it is.
4329 	 */
4330 	if (mdb_add_walker(&w) != 0) {
4331 		mdb_warn("failed to add kmem_cache walker");
4332 		return;
4333 	}
4334 
4335 	kmem_statechange();
4336 
4337 	/* register our ::whatis handlers */
4338 	mdb_whatis_register("modules", whatis_run_modules, NULL,
4339 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4340 	mdb_whatis_register("threads", whatis_run_threads, NULL,
4341 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4342 	mdb_whatis_register("pages", whatis_run_pages, NULL,
4343 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4344 	mdb_whatis_register("kmem", whatis_run_kmem, NULL,
4345 	    WHATIS_PRIO_ALLOCATOR, 0);
4346 	mdb_whatis_register("vmem", whatis_run_vmem, NULL,
4347 	    WHATIS_PRIO_ALLOCATOR, 0);
4348 }
4349 
4350 typedef struct whatthread {
4351 	uintptr_t	wt_target;
4352 	int		wt_verbose;
4353 } whatthread_t;
4354 
4355 static int
4356 whatthread_walk_thread(uintptr_t addr, const kthread_t *t, whatthread_t *w)
4357 {
4358 	uintptr_t current, data;
4359 
4360 	if (t->t_stkbase == NULL)
4361 		return (WALK_NEXT);
4362 
4363 	/*
4364 	 * Warn about swapped out threads, but drive on anyway
4365 	 */
4366 	if (!(t->t_schedflag & TS_LOAD)) {
4367 		mdb_warn("thread %p's stack swapped out\n", addr);
4368 		return (WALK_NEXT);
4369 	}
4370 
4371 	/*
4372 	 * Search the thread's stack for the given pointer.  Note that it would
4373 	 * be more efficient to follow ::kgrep's lead and read in page-sized
4374 	 * chunks, but this routine is already fast and simple.
4375 	 */
4376 	for (current = (uintptr_t)t->t_stkbase; current < (uintptr_t)t->t_stk;
4377 	    current += sizeof (uintptr_t)) {
4378 		if (mdb_vread(&data, sizeof (data), current) == -1) {
4379 			mdb_warn("couldn't read thread %p's stack at %p",
4380 			    addr, current);
4381 			return (WALK_ERR);
4382 		}
4383 
4384 		if (data == w->wt_target) {
4385 			if (w->wt_verbose) {
4386 				mdb_printf("%p in thread %p's stack%s\n",
4387 				    current, addr, stack_active(t, current));
4388 			} else {
4389 				mdb_printf("%#lr\n", addr);
4390 				return (WALK_NEXT);
4391 			}
4392 		}
4393 	}
4394 
4395 	return (WALK_NEXT);
4396 }
4397 
4398 int
4399 whatthread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4400 {
4401 	whatthread_t w;
4402 
4403 	if (!(flags & DCMD_ADDRSPEC))
4404 		return (DCMD_USAGE);
4405 
4406 	w.wt_verbose = FALSE;
4407 	w.wt_target = addr;
4408 
4409 	if (mdb_getopts(argc, argv,
4410 	    'v', MDB_OPT_SETBITS, TRUE, &w.wt_verbose, NULL) != argc)
4411 		return (DCMD_USAGE);
4412 
4413 	if (mdb_walk("thread", (mdb_walk_cb_t)whatthread_walk_thread, &w)
4414 	    == -1) {
4415 		mdb_warn("couldn't walk threads");
4416 		return (DCMD_ERR);
4417 	}
4418 
4419 	return (DCMD_OK);
4420 }
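
/*
 * Illustrative usage (output not shown): to find which threads have a given
 * value on their stacks,
 *
 *	> addr::whatthread
 *	> addr::whatthread -v	(also print the matching stack locations)
 *
 * where `addr' is the value being searched for.
 */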
4421