xref: /illumos-gate/usr/src/cmd/mdb/common/modules/genunix/kmem.c (revision a48fdbef3a00fbebe91cb7211c789eff0e39c957)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2018 Joyent, Inc.  All rights reserved.
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  * Copyright 2024 Oxide Computer Company
30  */
31 
32 #include <mdb/mdb_param.h>
33 #include <mdb/mdb_modapi.h>
34 #include <mdb/mdb_ctf.h>
35 #include <mdb/mdb_whatis.h>
36 #include <sys/cpuvar.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/vmem_impl.h>
39 #include <sys/machelf.h>
40 #include <sys/modctl.h>
41 #include <sys/kobj.h>
42 #include <sys/panic.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <vm/page.h>
46 
47 #include "avl.h"
48 #include "combined.h"
49 #include "dist.h"
50 #include "kmem.h"
51 #include "list.h"
52 
53 #define	dprintf(x) if (mdb_debug_level) { \
54 	mdb_printf("kmem debug: ");  \
55 	/*CSTYLED*/\
56 	mdb_printf x ;\
57 }
58 
59 #define	KM_ALLOCATED		0x01
60 #define	KM_FREE			0x02
61 #define	KM_BUFCTL		0x04
62 #define	KM_CONSTRUCTED		0x08	/* only constructed free buffers */
63 #define	KM_HASH			0x10
64 
65 static int mdb_debug_level = 0;
66 
67 /*ARGSUSED*/
68 static int
69 kmem_init_walkers(uintptr_t addr, const kmem_cache_t *c, void *ignored)
70 {
71 	mdb_walker_t w;
72 	char descr[64];
73 
74 	(void) mdb_snprintf(descr, sizeof (descr),
75 	    "walk the %s cache", c->cache_name);
76 
77 	w.walk_name = c->cache_name;
78 	w.walk_descr = descr;
79 	w.walk_init = kmem_walk_init;
80 	w.walk_step = kmem_walk_step;
81 	w.walk_fini = kmem_walk_fini;
82 	w.walk_init_arg = (void *)addr;
83 
84 	if (mdb_add_walker(&w) == -1)
85 		mdb_warn("failed to add %s walker", c->cache_name);
86 
87 	return (WALK_NEXT);
88 }
89 
90 /*ARGSUSED*/
91 int
92 kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
93 {
94 	mdb_debug_level ^= 1;
95 
96 	mdb_printf("kmem: debugging is now %s\n",
97 	    mdb_debug_level ? "on" : "off");
98 
99 	return (DCMD_OK);
100 }
101 
102 int
103 kmem_cache_walk_init(mdb_walk_state_t *wsp)
104 {
105 	GElf_Sym sym;
106 
107 	if (mdb_lookup_by_name("kmem_caches", &sym) == -1) {
108 		mdb_warn("couldn't find kmem_caches");
109 		return (WALK_ERR);
110 	}
111 
112 	wsp->walk_addr = (uintptr_t)sym.st_value;
113 
114 	return (list_walk_init_named(wsp, "cache list", "cache"));
115 }
116 
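/*
 * The kmem_cpu_cache walker takes a kmem_cache_t address, layers itself on
 * top of the "cpu" walker, and for each CPU reads that cache's per-CPU
 * kmem_cpu_cache_t by adding the cache_cpu[cpu_seqid] offset to the cache
 * address stashed in walk_data.
 */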
117 int
118 kmem_cpu_cache_walk_init(mdb_walk_state_t *wsp)
119 {
120 	if (wsp->walk_addr == 0) {
121 		mdb_warn("kmem_cpu_cache doesn't support global walks");
122 		return (WALK_ERR);
123 	}
124 
125 	if (mdb_layered_walk("cpu", wsp) == -1) {
126 		mdb_warn("couldn't walk 'cpu'");
127 		return (WALK_ERR);
128 	}
129 
130 	wsp->walk_data = (void *)wsp->walk_addr;
131 
132 	return (WALK_NEXT);
133 }
134 
135 int
136 kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp)
137 {
138 	uintptr_t caddr = (uintptr_t)wsp->walk_data;
139 	const cpu_t *cpu = wsp->walk_layer;
140 	kmem_cpu_cache_t cc;
141 
142 	caddr += OFFSETOF(kmem_cache_t, cache_cpu[cpu->cpu_seqid]);
143 
144 	if (mdb_vread(&cc, sizeof (kmem_cpu_cache_t), caddr) == -1) {
145 		mdb_warn("couldn't read kmem_cpu_cache at %p", caddr);
146 		return (WALK_ERR);
147 	}
148 
149 	return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata));
150 }
151 
152 static int
153 kmem_slab_check(void *p, uintptr_t saddr, void *arg)
154 {
155 	kmem_slab_t *sp = p;
156 	uintptr_t caddr = (uintptr_t)arg;
157 	if ((uintptr_t)sp->slab_cache != caddr) {
158 		mdb_warn("slab %p isn't in cache %p (in cache %p)\n",
159 		    saddr, caddr, sp->slab_cache);
160 		return (-1);
161 	}
162 
163 	return (0);
164 }
165 
166 static int
167 kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg)
168 {
169 	kmem_slab_t *sp = p;
170 
171 	int rc = kmem_slab_check(p, saddr, arg);
172 	if (rc != 0) {
173 		return (rc);
174 	}
175 
176 	if (!KMEM_SLAB_IS_PARTIAL(sp)) {
177 		mdb_warn("slab %p is not a partial slab\n", saddr);
178 		return (-1);
179 	}
180 
181 	return (0);
182 }
183 
184 static int
185 kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg)
186 {
187 	kmem_slab_t *sp = p;
188 
189 	int rc = kmem_slab_check(p, saddr, arg);
190 	if (rc != 0) {
191 		return (rc);
192 	}
193 
194 	if (!KMEM_SLAB_IS_ALL_USED(sp)) {
195 		mdb_warn("slab %p is not completely allocated\n", saddr);
196 		return (-1);
197 	}
198 
199 	return (0);
200 }
201 
202 typedef struct {
203 	uintptr_t kns_cache_addr;
204 	int kns_nslabs;
205 } kmem_nth_slab_t;
206 
207 static int
208 kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg)
209 {
210 	kmem_nth_slab_t *chkp = arg;
211 
212 	int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr);
213 	if (rc != 0) {
214 		return (rc);
215 	}
216 
217 	return (chkp->kns_nslabs-- == 0 ? 1 : 0);
218 }
219 
220 static int
221 kmem_complete_slab_walk_init(mdb_walk_state_t *wsp)
222 {
223 	uintptr_t caddr = wsp->walk_addr;
224 
225 	wsp->walk_addr = (uintptr_t)(caddr +
226 	    offsetof(kmem_cache_t, cache_complete_slabs));
227 
228 	return (list_walk_init_checked(wsp, "slab list", "slab",
229 	    kmem_complete_slab_check, (void *)caddr));
230 }
231 
232 static int
233 kmem_partial_slab_walk_init(mdb_walk_state_t *wsp)
234 {
235 	uintptr_t caddr = wsp->walk_addr;
236 
237 	wsp->walk_addr = (uintptr_t)(caddr +
238 	    offsetof(kmem_cache_t, cache_partial_slabs));
239 
240 	return (avl_walk_init_checked(wsp, "slab list", "slab",
241 	    kmem_partial_slab_check, (void *)caddr));
242 }
243 
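/*
 * The kmem_slab walker visits every slab in a cache by combining two
 * sub-walks: the list of completely allocated slabs (cache_complete_slabs)
 * and the AVL tree of partially allocated slabs (cache_partial_slabs).
 */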
244 int
245 kmem_slab_walk_init(mdb_walk_state_t *wsp)
246 {
247 	uintptr_t caddr = wsp->walk_addr;
248 
249 	if (caddr == 0) {
250 		mdb_warn("kmem_slab doesn't support global walks\n");
251 		return (WALK_ERR);
252 	}
253 
254 	combined_walk_init(wsp);
255 	combined_walk_add(wsp,
256 	    kmem_complete_slab_walk_init, list_walk_step, list_walk_fini);
257 	combined_walk_add(wsp,
258 	    kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini);
259 
260 	return (WALK_NEXT);
261 }
262 
263 static int
264 kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp)
265 {
266 	uintptr_t caddr = wsp->walk_addr;
267 	kmem_nth_slab_t *chk;
268 
269 	chk = mdb_alloc(sizeof (kmem_nth_slab_t),
270 	    UM_SLEEP | UM_GC);
271 	chk->kns_cache_addr = caddr;
272 	chk->kns_nslabs = 1;
273 	wsp->walk_addr = (uintptr_t)(caddr +
274 	    offsetof(kmem_cache_t, cache_complete_slabs));
275 
276 	return (list_walk_init_checked(wsp, "slab list", "slab",
277 	    kmem_nth_slab_check, chk));
278 }
279 
280 int
281 kmem_slab_walk_partial_init(mdb_walk_state_t *wsp)
282 {
283 	uintptr_t caddr = wsp->walk_addr;
284 	kmem_cache_t c;
285 
286 	if (caddr == 0) {
287 		mdb_warn("kmem_slab_partial doesn't support global walks\n");
288 		return (WALK_ERR);
289 	}
290 
291 	if (mdb_vread(&c, sizeof (c), caddr) == -1) {
292 		mdb_warn("couldn't read kmem_cache at %p", caddr);
293 		return (WALK_ERR);
294 	}
295 
296 	combined_walk_init(wsp);
297 
298 	/*
299 	 * Some consumers (kmem_walk_step(), in particular) require at
300 	 * least one callback if there are any buffers in the cache.  So
301 	 * if there are *no* partial slabs, report the first full slab, if
302 	 * any.
303 	 *
304 	 * Yes, this is ugly, but it's cleaner than the other possibilities.
305 	 */
306 	if (c.cache_partial_slabs.avl_numnodes == 0) {
307 		combined_walk_add(wsp, kmem_first_complete_slab_walk_init,
308 		    list_walk_step, list_walk_fini);
309 	} else {
310 		combined_walk_add(wsp, kmem_partial_slab_walk_init,
311 		    avl_walk_step, avl_walk_fini);
312 	}
313 
314 	return (WALK_NEXT);
315 }
316 
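/*
 * Illustrative use of the ::kmem_cache dcmd from within mdb (the cache name
 * and address below are examples only):
 *
 *	> ::kmem_cache -n kmem_alloc
 *	> <cache address>::kmem_cache
 */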
317 int
318 kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv)
319 {
320 	kmem_cache_t c;
321 	const char *filter = NULL;
322 
323 	if (mdb_getopts(ac, argv,
324 	    'n', MDB_OPT_STR, &filter,
325 	    NULL) != ac) {
326 		return (DCMD_USAGE);
327 	}
328 
329 	if (!(flags & DCMD_ADDRSPEC)) {
330 		if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) {
331 			mdb_warn("can't walk kmem_cache");
332 			return (DCMD_ERR);
333 		}
334 		return (DCMD_OK);
335 	}
336 
337 	if (DCMD_HDRSPEC(flags))
338 		mdb_printf("%-?s %-25s %4s %6s %8s %8s\n", "ADDR", "NAME",
339 		    "FLAG", "CFLAG", "BUFSIZE", "BUFTOTL");
340 
341 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
342 		mdb_warn("couldn't read kmem_cache at %p", addr);
343 		return (DCMD_ERR);
344 	}
345 
346 	if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL))
347 		return (DCMD_OK);
348 
349 	mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name,
350 	    c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal);
351 
352 	return (DCMD_OK);
353 }
354 
355 void
356 kmem_cache_help(void)
357 {
358 	mdb_printf("%s", "Print kernel memory caches.\n\n");
359 	mdb_dec_indent(2);
360 	mdb_printf("%<b>OPTIONS%</b>\n");
361 	mdb_inc_indent(2);
362 	mdb_printf("%s",
363 "  -n name\n"
364 "        name of kmem cache (or matching partial name)\n"
365 "\n"
366 "Column\tDescription\n"
367 "\n"
368 "ADDR\t\taddress of kmem cache\n"
369 "NAME\t\tname of kmem cache\n"
370 "FLAG\t\tvarious cache state flags\n"
371 "CFLAG\t\tcache creation flags\n"
372 "BUFSIZE\tobject size in bytes\n"
373 "BUFTOTL\tcurrent total buffers in cache (allocated and free)\n");
374 }
375 
376 #define	LABEL_WIDTH	11
377 static void
378 kmem_slabs_print_dist(uint_t *ks_bucket, size_t buffers_per_slab,
379     size_t maxbuckets, size_t minbucketsize)
380 {
381 	uint64_t total;
382 	int buckets;
383 	int i;
384 	const int *distarray;
385 	int complete[2];
386 
387 	buckets = buffers_per_slab;
388 
389 	total = 0;
390 	for (i = 0; i <= buffers_per_slab; i++)
391 		total += ks_bucket[i];
392 
393 	if (maxbuckets > 1)
394 		buckets = MIN(buckets, maxbuckets);
395 
396 	if (minbucketsize > 1) {
397 		/*
398 		 * minbucketsize does not apply to the first bucket reserved
399 		 * for completely allocated slabs
400 		 */
401 		buckets = MIN(buckets, 1 + ((buffers_per_slab - 1) /
402 		    minbucketsize));
403 		if ((buckets < 2) && (buffers_per_slab > 1)) {
404 			buckets = 2;
405 			minbucketsize = (buffers_per_slab - 1);
406 		}
407 	}
408 
409 	/*
410 	 * The first printed bucket is reserved for completely allocated slabs.
411 	 * Passing (buckets - 1) excludes that bucket from the generated
412 	 * distribution, since we're handling it as a special case.
413 	 */
414 	complete[0] = buffers_per_slab;
415 	complete[1] = buffers_per_slab + 1;
416 	distarray = dist_linear(buckets - 1, 1, buffers_per_slab - 1);
417 
418 	mdb_printf("%*s\n", LABEL_WIDTH, "Allocated");
419 	dist_print_header("Buffers", LABEL_WIDTH, "Slabs");
420 
421 	dist_print_bucket(complete, 0, ks_bucket, total, LABEL_WIDTH);
422 	/*
423 	 * Print bucket ranges in descending order after the first bucket for
424 	 * completely allocated slabs, so a person can see immediately whether
425 	 * or not there is fragmentation without having to scan possibly
426 	 * multiple screens of output. Starting at (buckets - 2) excludes the
427 	 * extra terminating bucket.
428 	 */
429 	for (i = buckets - 2; i >= 0; i--) {
430 		dist_print_bucket(distarray, i, ks_bucket, total, LABEL_WIDTH);
431 	}
432 	mdb_printf("\n");
433 }
434 #undef LABEL_WIDTH
435 
436 /*ARGSUSED*/
437 static int
438 kmem_first_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab)
439 {
440 	*is_slab = B_TRUE;
441 	return (WALK_DONE);
442 }
443 
444 /*ARGSUSED*/
445 static int
446 kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp,
447     boolean_t *is_slab)
448 {
449 	/*
450 	 * The "kmem_partial_slab" walker reports the first full slab if there
451 	 * are no partial slabs (for the sake of consumers that require at least
452 	 * one callback if there are any buffers in the cache).
453 	 */
454 	*is_slab = KMEM_SLAB_IS_PARTIAL(sp);
455 	return (WALK_DONE);
456 }
457 
458 typedef struct kmem_slab_usage {
459 	int ksu_refcnt;			/* count of allocated buffers on slab */
460 	boolean_t ksu_nomove;		/* slab marked non-reclaimable */
461 } kmem_slab_usage_t;
462 
463 typedef struct kmem_slab_stats {
464 	const kmem_cache_t *ks_cp;
465 	int ks_slabs;			/* slabs in cache */
466 	int ks_partial_slabs;		/* partially allocated slabs in cache */
467 	uint64_t ks_unused_buffers;	/* total unused buffers in cache */
468 	int ks_max_buffers_per_slab;	/* max buffers per slab */
469 	int ks_usage_len;		/* ks_usage array length */
470 	kmem_slab_usage_t *ks_usage;	/* partial slab usage */
471 	uint_t *ks_bucket;		/* slab usage distribution */
472 } kmem_slab_stats_t;
473 
474 /*ARGSUSED*/
475 static int
476 kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp,
477     kmem_slab_stats_t *ks)
478 {
479 	kmem_slab_usage_t *ksu;
480 	long unused;
481 
482 	ks->ks_slabs++;
483 	ks->ks_bucket[sp->slab_refcnt]++;
484 
485 	unused = (sp->slab_chunks - sp->slab_refcnt);
486 	if (unused == 0) {
487 		return (WALK_NEXT);
488 	}
489 
490 	ks->ks_partial_slabs++;
491 	ks->ks_unused_buffers += unused;
492 
493 	if (ks->ks_partial_slabs > ks->ks_usage_len) {
494 		kmem_slab_usage_t *usage;
495 		int len = ks->ks_usage_len;
496 
497 		len = (len == 0 ? 16 : len * 2);
498 		usage = mdb_zalloc(len * sizeof (kmem_slab_usage_t), UM_SLEEP);
499 		if (ks->ks_usage != NULL) {
500 			bcopy(ks->ks_usage, usage,
501 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
502 			mdb_free(ks->ks_usage,
503 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
504 		}
505 		ks->ks_usage = usage;
506 		ks->ks_usage_len = len;
507 	}
508 
509 	ksu = &ks->ks_usage[ks->ks_partial_slabs - 1];
510 	ksu->ksu_refcnt = sp->slab_refcnt;
511 	ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
512 	return (WALK_NEXT);
513 }
514 
515 static void
516 kmem_slabs_header()
517 {
518 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
519 	    "", "", "Partial", "", "Unused", "");
520 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
521 	    "Cache Name", "Slabs", "Slabs", "Buffers", "Buffers", "Waste");
522 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
523 	    "-------------------------", "--------", "--------", "---------",
524 	    "---------", "------");
525 }
526 
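/*
 * Illustrative use of the ::kmem_slabs dcmd from within mdb (the cache
 * names below are examples only):
 *
 *	> ::kmem_slabs -n kmem_alloc
 *	> ::kmem_slabs -v -N kmem_alloc_8
 *	> ::kmem_slabs -b 10
 */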
527 int
528 kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
529 {
530 	kmem_cache_t c;
531 	kmem_slab_stats_t stats;
532 	mdb_walk_cb_t cb;
533 	int pct;
534 	int tenths_pct;
535 	size_t maxbuckets = 1;
536 	size_t minbucketsize = 0;
537 	const char *filter = NULL;
538 	const char *name = NULL;
539 	uint_t opt_v = FALSE;
540 	boolean_t buckets = B_FALSE;
541 	boolean_t skip = B_FALSE;
542 
543 	if (mdb_getopts(argc, argv,
544 	    'B', MDB_OPT_UINTPTR, &minbucketsize,
545 	    'b', MDB_OPT_UINTPTR, &maxbuckets,
546 	    'n', MDB_OPT_STR, &filter,
547 	    'N', MDB_OPT_STR, &name,
548 	    'v', MDB_OPT_SETBITS, TRUE, &opt_v,
549 	    NULL) != argc) {
550 		return (DCMD_USAGE);
551 	}
552 
553 	if ((maxbuckets != 1) || (minbucketsize != 0)) {
554 		buckets = B_TRUE;
555 	}
556 
557 	if (!(flags & DCMD_ADDRSPEC)) {
558 		if (mdb_walk_dcmd("kmem_cache", "kmem_slabs", argc,
559 		    argv) == -1) {
560 			mdb_warn("can't walk kmem_cache");
561 			return (DCMD_ERR);
562 		}
563 		return (DCMD_OK);
564 	}
565 
566 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
567 		mdb_warn("couldn't read kmem_cache at %p", addr);
568 		return (DCMD_ERR);
569 	}
570 
571 	if (name == NULL) {
572 		skip = ((filter != NULL) &&
573 		    (strstr(c.cache_name, filter) == NULL));
574 	} else if (filter == NULL) {
575 		skip = (strcmp(c.cache_name, name) != 0);
576 	} else {
577 		/* match either -n or -N */
578 		skip = ((strcmp(c.cache_name, name) != 0) &&
579 		    (strstr(c.cache_name, filter) == NULL));
580 	}
581 
582 	if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) {
583 		kmem_slabs_header();
584 	} else if ((opt_v || buckets) && !skip) {
585 		if (DCMD_HDRSPEC(flags)) {
586 			kmem_slabs_header();
587 		} else {
588 			boolean_t is_slab = B_FALSE;
589 			const char *walker_name;
590 			if (opt_v) {
591 				cb = (mdb_walk_cb_t)kmem_first_partial_slab;
592 				walker_name = "kmem_slab_partial";
593 			} else {
594 				cb = (mdb_walk_cb_t)kmem_first_slab;
595 				walker_name = "kmem_slab";
596 			}
597 			(void) mdb_pwalk(walker_name, cb, &is_slab, addr);
598 			if (is_slab) {
599 				kmem_slabs_header();
600 			}
601 		}
602 	}
603 
604 	if (skip) {
605 		return (DCMD_OK);
606 	}
607 
608 	bzero(&stats, sizeof (kmem_slab_stats_t));
609 	stats.ks_cp = &c;
610 	stats.ks_max_buffers_per_slab = c.cache_maxchunks;
611 	/* +1 to include a zero bucket */
612 	stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) *
613 	    sizeof (*stats.ks_bucket), UM_SLEEP);
614 	cb = (mdb_walk_cb_t)kmem_slablist_stat;
615 	(void) mdb_pwalk("kmem_slab", cb, &stats, addr);
616 
617 	if (c.cache_buftotal == 0) {
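	/*
	 * Compute the waste percentage in fixed point; pct is initially in
	 * hundredths of a percent.  A purely illustrative example: with 25
	 * unused buffers out of 300 total, n = 250000, pct = 833,
	 * tenths_pct = (33 + 5) / 10 = 3, and after the pct /= 100 below
	 * the line prints "8.3%".
	 */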
618 		pct = 0;
619 		tenths_pct = 0;
620 	} else {
621 		uint64_t n = stats.ks_unused_buffers * 10000;
622 		pct = (int)(n / c.cache_buftotal);
623 		tenths_pct = pct - ((pct / 100) * 100);
624 		tenths_pct = (tenths_pct + 5) / 10; /* round nearest tenth */
625 		if (tenths_pct == 10) {
626 			pct += 100;
627 			tenths_pct = 0;
628 		}
629 	}
630 
631 	pct /= 100;
632 	mdb_printf("%-25s %8d %8d %9lld %9lld %3d.%1d%%\n", c.cache_name,
633 	    stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal,
634 	    stats.ks_unused_buffers, pct, tenths_pct);
635 
636 	if (maxbuckets == 0) {
637 		maxbuckets = stats.ks_max_buffers_per_slab;
638 	}
639 
640 	if (((maxbuckets > 1) || (minbucketsize > 0)) &&
641 	    (stats.ks_slabs > 0)) {
642 		mdb_printf("\n");
643 		kmem_slabs_print_dist(stats.ks_bucket,
644 		    stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize);
645 	}
646 
647 	mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) *
648 	    sizeof (*stats.ks_bucket));
649 
650 	if (!opt_v) {
651 		return (DCMD_OK);
652 	}
653 
654 	if (opt_v && (stats.ks_partial_slabs > 0)) {
655 		int i;
656 		kmem_slab_usage_t *ksu;
657 
658 		mdb_printf("  %d complete (%d), %d partial:",
659 		    (stats.ks_slabs - stats.ks_partial_slabs),
660 		    stats.ks_max_buffers_per_slab,
661 		    stats.ks_partial_slabs);
662 
663 		for (i = 0; i < stats.ks_partial_slabs; i++) {
664 			ksu = &stats.ks_usage[i];
665 			mdb_printf(" %d%s", ksu->ksu_refcnt,
666 			    (ksu->ksu_nomove ? "*" : ""));
667 		}
668 		mdb_printf("\n\n");
669 	}
670 
671 	if (stats.ks_usage_len > 0) {
672 		mdb_free(stats.ks_usage,
673 		    stats.ks_usage_len * sizeof (kmem_slab_usage_t));
674 	}
675 
676 	return (DCMD_OK);
677 }
678 
679 void
680 kmem_slabs_help(void)
681 {
682 	mdb_printf("%s",
683 "Display slab usage per kmem cache.\n\n");
684 	mdb_dec_indent(2);
685 	mdb_printf("%<b>OPTIONS%</b>\n");
686 	mdb_inc_indent(2);
687 	mdb_printf("%s",
688 "  -n name\n"
689 "        name of kmem cache (or matching partial name)\n"
690 "  -N name\n"
691 "        exact name of kmem cache\n"
692 "  -b maxbins\n"
693 "        Print a distribution of allocated buffers per slab using at\n"
694 "        most maxbins bins. The first bin is reserved for completely\n"
695 "        allocated slabs. Setting maxbins to zero (-b 0) has the same\n"
696 "        effect as specifying the maximum allocated buffers per slab\n"
697 "        or setting minbinsize to 1 (-B 1).\n"
698 "  -B minbinsize\n"
699 "        Print a distribution of allocated buffers per slab, making\n"
700 "        all bins (except the first, reserved for completely allocated\n"
701 "        slabs) at least minbinsize buffers apart.\n"
702 "  -v    verbose output: List the allocated buffer count of each partial\n"
703 "        slab on the free list in order from front to back to show how\n"
704 "        closely the slabs are ordered by usage. For example\n"
705 "\n"
706 "          10 complete, 3 partial (8): 7 3 1\n"
707 "\n"
708 "        means there are thirteen slabs with eight buffers each, including\n"
709 "        three partially allocated slabs with less than all eight buffers\n"
710 "        allocated.\n"
711 "\n"
712 "        Buffer allocations are always from the front of the partial slab\n"
713 "        list. When a buffer is freed from a completely used slab, that\n"
714 "        slab is added to the front of the partial slab list. Assuming\n"
715 "        that all buffers are equally likely to be freed soon, the\n"
716 "        desired order of partial slabs is most-used at the front of the\n"
717 "        list and least-used at the back (as in the example above).\n"
718 "        However, if a slab contains an allocated buffer that will not\n"
719 "        soon be freed, it would be better for that slab to be at the\n"
720 "        front where all of its buffers can be allocated. Taking a slab\n"
721 "        off the partial slab list (either with all buffers freed or all\n"
722 "        buffers allocated) reduces cache fragmentation.\n"
723 "\n"
724 "        A slab's allocated buffer count representing a partial slab (9 in\n"
725 "        the example below) may be marked as follows:\n"
726 "\n"
727 "        9*   An asterisk indicates that kmem has marked the slab non-\n"
728 "        reclaimable because the kmem client refused to move one of the\n"
729 "        slab's buffers. Since kmem does not expect to completely free the\n"
730 "        slab, it moves it to the front of the list in the hope of\n"
731 "        completely allocating it instead. A slab marked with an asterisk\n"
732 "        stays marked for as long as it remains on the partial slab list.\n"
733 "\n"
734 "Column\t\tDescription\n"
735 "\n"
736 "Cache Name\t\tname of kmem cache\n"
737 "Slabs\t\t\ttotal slab count\n"
738 "Partial Slabs\t\tcount of partially allocated slabs on the free list\n"
739 "Buffers\t\ttotal buffer count (Slabs * (buffers per slab))\n"
740 "Unused Buffers\tcount of unallocated buffers across all partial slabs\n"
741 "Waste\t\t\t(Unused Buffers / Buffers) does not include space\n"
742 "\t\t\t  for accounting structures (debug mode), slab\n"
743 "\t\t\t  coloring (incremental small offsets to stagger\n"
744 "\t\t\t  buffer alignment), or the per-CPU magazine layer\n");
745 }
746 
747 static int
748 addrcmp(const void *lhs, const void *rhs)
749 {
750 	uintptr_t p1 = *((uintptr_t *)lhs);
751 	uintptr_t p2 = *((uintptr_t *)rhs);
752 
753 	if (p1 < p2)
754 		return (-1);
755 	if (p1 > p2)
756 		return (1);
757 	return (0);
758 }
759 
760 static int
761 bufctlcmp(const kmem_bufctl_audit_t **lhs, const kmem_bufctl_audit_t **rhs)
762 {
763 	const kmem_bufctl_audit_t *bcp1 = *lhs;
764 	const kmem_bufctl_audit_t *bcp2 = *rhs;
765 
766 	if (bcp1->bc_timestamp > bcp2->bc_timestamp)
767 		return (-1);
768 
769 	if (bcp1->bc_timestamp < bcp2->bc_timestamp)
770 		return (1);
771 
772 	return (0);
773 }
774 
775 typedef struct kmem_hash_walk {
776 	uintptr_t *kmhw_table;
777 	size_t kmhw_nelems;
778 	size_t kmhw_pos;
779 	kmem_bufctl_t kmhw_cur;
780 } kmem_hash_walk_t;
781 
782 int
783 kmem_hash_walk_init(mdb_walk_state_t *wsp)
784 {
785 	kmem_hash_walk_t *kmhw;
786 	uintptr_t *hash;
787 	kmem_cache_t c;
788 	uintptr_t haddr, addr = wsp->walk_addr;
789 	size_t nelems;
790 	size_t hsize;
791 
792 	if (addr == 0) {
793 		mdb_warn("kmem_hash doesn't support global walks\n");
794 		return (WALK_ERR);
795 	}
796 
797 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
798 		mdb_warn("couldn't read cache at addr %p", addr);
799 		return (WALK_ERR);
800 	}
801 
802 	if (!(c.cache_flags & KMF_HASH)) {
803 		mdb_warn("cache %p doesn't have a hash table\n", addr);
804 		return (WALK_DONE);		/* nothing to do */
805 	}
806 
807 	kmhw = mdb_zalloc(sizeof (kmem_hash_walk_t), UM_SLEEP);
808 	kmhw->kmhw_cur.bc_next = NULL;
809 	kmhw->kmhw_pos = 0;
810 
811 	kmhw->kmhw_nelems = nelems = c.cache_hash_mask + 1;
812 	hsize = nelems * sizeof (uintptr_t);
813 	haddr = (uintptr_t)c.cache_hash_table;
814 
815 	kmhw->kmhw_table = hash = mdb_alloc(hsize, UM_SLEEP);
816 	if (mdb_vread(hash, hsize, haddr) == -1) {
817 		mdb_warn("failed to read hash table at %p", haddr);
818 		mdb_free(hash, hsize);
819 		mdb_free(kmhw, sizeof (kmem_hash_walk_t));
820 		return (WALK_ERR);
821 	}
822 
823 	wsp->walk_data = kmhw;
824 
825 	return (WALK_NEXT);
826 }
827 
828 int
829 kmem_hash_walk_step(mdb_walk_state_t *wsp)
830 {
831 	kmem_hash_walk_t *kmhw = wsp->walk_data;
832 	uintptr_t addr = 0;
833 
834 	if ((addr = (uintptr_t)kmhw->kmhw_cur.bc_next) == 0) {
835 		while (kmhw->kmhw_pos < kmhw->kmhw_nelems) {
836 			if ((addr = kmhw->kmhw_table[kmhw->kmhw_pos++]) != 0)
837 				break;
838 		}
839 	}
840 	if (addr == 0)
841 		return (WALK_DONE);
842 
843 	if (mdb_vread(&kmhw->kmhw_cur, sizeof (kmem_bufctl_t), addr) == -1) {
844 		mdb_warn("couldn't read kmem_bufctl_t at addr %p", addr);
845 		return (WALK_ERR);
846 	}
847 
848 	return (wsp->walk_callback(addr, &kmhw->kmhw_cur, wsp->walk_cbdata));
849 }
850 
851 void
852 kmem_hash_walk_fini(mdb_walk_state_t *wsp)
853 {
854 	kmem_hash_walk_t *kmhw = wsp->walk_data;
855 
856 	if (kmhw == NULL)
857 		return;
858 
859 	mdb_free(kmhw->kmhw_table, kmhw->kmhw_nelems * sizeof (uintptr_t));
860 	mdb_free(kmhw, sizeof (kmem_hash_walk_t));
861 }
862 
863 /*
864  * Find the address of the bufctl structure for the address 'buf' in cache
865  * 'cp', which is at address caddr, and place it in *out.
866  */
867 static int
868 kmem_hash_lookup(kmem_cache_t *cp, uintptr_t caddr, void *buf, uintptr_t *out)
869 {
870 	uintptr_t bucket = (uintptr_t)KMEM_HASH(cp, buf);
871 	kmem_bufctl_t *bcp;
872 	kmem_bufctl_t bc;
873 
874 	if (mdb_vread(&bcp, sizeof (kmem_bufctl_t *), bucket) == -1) {
875 		mdb_warn("unable to read hash bucket for %p in cache %p",
876 		    buf, caddr);
877 		return (-1);
878 	}
879 
880 	while (bcp != NULL) {
881 		if (mdb_vread(&bc, sizeof (kmem_bufctl_t),
882 		    (uintptr_t)bcp) == -1) {
883 			mdb_warn("unable to read bufctl at %p", bcp);
884 			return (-1);
885 		}
886 		if (bc.bc_addr == buf) {
887 			*out = (uintptr_t)bcp;
888 			return (0);
889 		}
890 		bcp = bc.bc_next;
891 	}
892 
893 	mdb_warn("unable to find bufctl for %p in cache %p\n", buf, caddr);
894 	return (-1);
895 }
896 
897 int
898 kmem_get_magsize(const kmem_cache_t *cp)
899 {
900 	uintptr_t addr = (uintptr_t)cp->cache_magtype;
901 	GElf_Sym mt_sym;
902 	kmem_magtype_t mt;
903 	int res;
904 
905 	/*
906 	 * if cpu 0 has a non-zero magsize, it must be correct.  caches
907 	 * with KMF_NOMAGAZINE have disabled their magazine layers, so
908 	 * it is okay to return 0 for them.
909 	 */
910 	if ((res = cp->cache_cpu[0].cc_magsize) != 0 ||
911 	    (cp->cache_flags & KMF_NOMAGAZINE))
912 		return (res);
913 
914 	if (mdb_lookup_by_name("kmem_magtype", &mt_sym) == -1) {
915 		mdb_warn("unable to read 'kmem_magtype'");
916 	} else if (addr < mt_sym.st_value ||
917 	    addr + sizeof (mt) - 1 > mt_sym.st_value + mt_sym.st_size - 1 ||
918 	    ((addr - mt_sym.st_value) % sizeof (mt)) != 0) {
919 		mdb_warn("cache '%s' has invalid magtype pointer (%p)\n",
920 		    cp->cache_name, addr);
921 		return (0);
922 	}
923 	if (mdb_vread(&mt, sizeof (mt), addr) == -1) {
924 		mdb_warn("unable to read magtype at %a", addr);
925 		return (0);
926 	}
927 	return (mt.mt_magsize);
928 }
929 
930 /*ARGSUSED*/
931 static int
932 kmem_estimate_slab(uintptr_t addr, const kmem_slab_t *sp, size_t *est)
933 {
934 	*est -= (sp->slab_chunks - sp->slab_refcnt);
935 
936 	return (WALK_NEXT);
937 }
938 
939 /*
940  * Returns an upper bound on the number of allocated buffers in a given
941  * cache.
942  */
943 size_t
944 kmem_estimate_allocated(uintptr_t addr, const kmem_cache_t *cp)
945 {
946 	int magsize;
947 	size_t cache_est;
948 
949 	cache_est = cp->cache_buftotal;
950 
951 	(void) mdb_pwalk("kmem_slab_partial",
952 	    (mdb_walk_cb_t)kmem_estimate_slab, &cache_est, addr);
953 
954 	if ((magsize = kmem_get_magsize(cp)) != 0) {
955 		size_t mag_est = cp->cache_full.ml_total * magsize;
956 
957 		if (cache_est >= mag_est) {
958 			cache_est -= mag_est;
959 		} else {
960 			mdb_warn("cache %p's magazine layer holds more buffers "
961 			    "than the slab layer.\n", addr);
962 		}
963 	}
964 	return (cache_est);
965 }
966 
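/*
 * READMAG_ROUNDS() is a macro rather than a function because it relies on
 * the local variables (kmp, mp, maglist, magcnt, magmax, magbsize, i) and
 * the fail: label of kmem_read_magazines() below.  It reads one magazine
 * from the target and appends its rounds (buffer pointers) to maglist.
 */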
967 #define	READMAG_ROUNDS(rounds) { \
968 	if (mdb_vread(mp, magbsize, (uintptr_t)kmp) == -1) { \
969 		mdb_warn("couldn't read magazine at %p", kmp); \
970 		goto fail; \
971 	} \
972 	for (i = 0; i < rounds; i++) { \
973 		maglist[magcnt++] = mp->mag_round[i]; \
974 		if (magcnt == magmax) { \
975 			mdb_warn("%d magazines exceeds fudge factor\n", \
976 			    magcnt); \
977 			goto fail; \
978 		} \
979 	} \
980 }
981 
982 int
983 kmem_read_magazines(kmem_cache_t *cp, uintptr_t addr, int ncpus,
984     void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
985 {
986 	kmem_magazine_t *kmp, *mp;
987 	void **maglist = NULL;
988 	int i, cpu;
989 	size_t magsize, magmax, magbsize;
990 	size_t magcnt = 0;
991 
992 	/*
993 	 * Read the magtype out of the cache, after verifying the pointer's
994 	 * correctness.
995 	 */
996 	magsize = kmem_get_magsize(cp);
997 	if (magsize == 0) {
998 		*maglistp = NULL;
999 		*magcntp = 0;
1000 		*magmaxp = 0;
1001 		return (WALK_NEXT);
1002 	}
1003 
1004 	/*
1005 	 * There are several places where we need to go buffer hunting:
1006 	 * the per-CPU loaded magazine, the per-CPU spare full magazine,
1007 	 * and the full magazine list in the depot.
1008 	 *
1009 	 * For an upper bound on the number of buffers in the magazine
1010 	 * layer, we have the number of magazines on the cache_full
1011 	 * list plus at most two magazines per CPU (the loaded and the
1012 	 * spare).  Toss in 100 magazines as a fudge factor in case this
1013 	 * is live (the number "100" comes from the same fudge factor in
1014 	 * crash(8)).
1015 	 */
1016 	magmax = (cp->cache_full.ml_total + 2 * ncpus + 100) * magsize;
1017 	magbsize = offsetof(kmem_magazine_t, mag_round[magsize]);
1018 
1019 	if (magbsize >= PAGESIZE / 2) {
1020 		mdb_warn("magazine size for cache %p unreasonable (%x)\n",
1021 		    addr, magbsize);
1022 		return (WALK_ERR);
1023 	}
1024 
1025 	maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
1026 	mp = mdb_alloc(magbsize, alloc_flags);
1027 	if (mp == NULL || maglist == NULL)
1028 		goto fail;
1029 
1030 	/*
1031 	 * First up: the magazines in the depot (i.e. on the cache_full list).
1032 	 */
1033 	for (kmp = cp->cache_full.ml_list; kmp != NULL; ) {
1034 		READMAG_ROUNDS(magsize);
1035 		kmp = mp->mag_next;
1036 
1037 		if (kmp == cp->cache_full.ml_list)
1038 			break; /* cache_full list loop detected */
1039 	}
1040 
1041 	dprintf(("cache_full list done\n"));
1042 
1043 	/*
1044 	 * Now whip through the CPUs, snagging the loaded magazines
1045 	 * and full spares.
1046 	 *
1047 	 * In order to prevent inconsistent dumps, rounds and prounds
1048 	 * are copied aside before dumping begins.
1049 	 */
1050 	for (cpu = 0; cpu < ncpus; cpu++) {
1051 		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu];
1052 		short rounds, prounds;
1053 
1054 		if (KMEM_DUMPCC(ccp)) {
1055 			rounds = ccp->cc_dump_rounds;
1056 			prounds = ccp->cc_dump_prounds;
1057 		} else {
1058 			rounds = ccp->cc_rounds;
1059 			prounds = ccp->cc_prounds;
1060 		}
1061 
1062 		dprintf(("reading cpu cache %p\n",
1063 		    (uintptr_t)ccp - (uintptr_t)cp + addr));
1064 
1065 		if (rounds > 0 &&
1066 		    (kmp = ccp->cc_loaded) != NULL) {
1067 			dprintf(("reading %d loaded rounds\n", rounds));
1068 			READMAG_ROUNDS(rounds);
1069 		}
1070 
1071 		if (prounds > 0 &&
1072 		    (kmp = ccp->cc_ploaded) != NULL) {
1073 			dprintf(("reading %d previously loaded rounds\n",
1074 			    prounds));
1075 			READMAG_ROUNDS(prounds);
1076 		}
1077 	}
1078 
1079 	dprintf(("magazine layer: %d buffers\n", magcnt));
1080 
1081 	if (!(alloc_flags & UM_GC))
1082 		mdb_free(mp, magbsize);
1083 
1084 	*maglistp = maglist;
1085 	*magcntp = magcnt;
1086 	*magmaxp = magmax;
1087 
1088 	return (WALK_NEXT);
1089 
1090 fail:
1091 	if (!(alloc_flags & UM_GC)) {
1092 		if (mp)
1093 			mdb_free(mp, magbsize);
1094 		if (maglist)
1095 			mdb_free(maglist, magmax * sizeof (void *));
1096 	}
1097 	return (WALK_ERR);
1098 }
1099 
1100 static int
1101 kmem_walk_callback(mdb_walk_state_t *wsp, uintptr_t buf)
1102 {
1103 	return (wsp->walk_callback(buf, NULL, wsp->walk_cbdata));
1104 }
1105 
1106 static int
1107 bufctl_walk_callback(kmem_cache_t *cp, mdb_walk_state_t *wsp, uintptr_t buf)
1108 {
1109 	kmem_bufctl_audit_t b;
1110 
1111 	/*
1112 	 * if KMF_AUDIT is not set, we know that we're looking at a
1113 	 * kmem_bufctl_t.
1114 	 */
1115 	if (!(cp->cache_flags & KMF_AUDIT) ||
1116 	    mdb_vread(&b, sizeof (kmem_bufctl_audit_t), buf) == -1) {
1117 		(void) memset(&b, 0, sizeof (b));
1118 		if (mdb_vread(&b, sizeof (kmem_bufctl_t), buf) == -1) {
1119 			mdb_warn("unable to read bufctl at %p", buf);
1120 			return (WALK_ERR);
1121 		}
1122 	}
1123 
1124 	return (wsp->walk_callback(buf, &b, wsp->walk_cbdata));
1125 }
1126 
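/*
 * State for the kmem/bufctl/freemem/freectl walkers.  The common strategy
 * (see kmem_walk_init_common() and kmem_walk_step()) is to snapshot the
 * cache and its magazine layer up front, then do a layered walk over the
 * slabs -- or over the bufctl hash table when walking allocated buffers in
 * a KMF_HASH cache -- using the magazine list to decide which buffers are
 * actually free.
 */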
1127 typedef struct kmem_walk {
1128 	int kmw_type;
1129 
1130 	uintptr_t kmw_addr;		/* cache address */
1131 	kmem_cache_t *kmw_cp;
1132 	size_t kmw_csize;
1133 
1134 	/*
1135 	 * magazine layer
1136 	 */
1137 	void **kmw_maglist;
1138 	size_t kmw_max;
1139 	size_t kmw_count;
1140 	size_t kmw_pos;
1141 
1142 	/*
1143 	 * slab layer
1144 	 */
1145 	char *kmw_valid;	/* to keep track of freed buffers */
1146 	char *kmw_ubase;	/* buffer for slab data */
1147 } kmem_walk_t;
1148 
1149 static int
1150 kmem_walk_init_common(mdb_walk_state_t *wsp, int type)
1151 {
1152 	kmem_walk_t *kmw;
1153 	int ncpus, csize;
1154 	kmem_cache_t *cp;
1155 	size_t vm_quantum;
1156 
1157 	size_t magmax, magcnt;
1158 	void **maglist = NULL;
1159 	uint_t chunksize = 1, slabsize = 1;
1160 	int status = WALK_ERR;
1161 	uintptr_t addr = wsp->walk_addr;
1162 	const char *layered;
1163 
1164 	type &= ~KM_HASH;
1165 
1166 	if (addr == 0) {
1167 		mdb_warn("kmem walk doesn't support global walks\n");
1168 		return (WALK_ERR);
1169 	}
1170 
1171 	dprintf(("walking %p\n", addr));
1172 
1173 	/*
1174 	 * First we need to figure out how many CPUs are configured in the
1175 	 * system to know how much to slurp out.
1176 	 */
1177 	mdb_readvar(&ncpus, "max_ncpus");
1178 
1179 	csize = KMEM_CACHE_SIZE(ncpus);
1180 	cp = mdb_alloc(csize, UM_SLEEP);
1181 
1182 	if (mdb_vread(cp, csize, addr) == -1) {
1183 		mdb_warn("couldn't read cache at addr %p", addr);
1184 		goto out2;
1185 	}
1186 
1187 	/*
1188 	 * It's easy for someone to hand us an invalid cache address.
1189 	 * Unfortunately, it is hard for this walker to survive an
1190 	 * invalid cache cleanly.  So we make sure that:
1191 	 *
1192 	 *	1. the vmem arena for the cache is readable,
1193 	 *	2. the vmem arena's quantum is a power of 2,
1194 	 *	3. our slabsize is a multiple of the quantum, and
1195 	 *	4. our chunksize is >0 and less than our slabsize.
1196 	 */
1197 	if (mdb_vread(&vm_quantum, sizeof (vm_quantum),
1198 	    (uintptr_t)&cp->cache_arena->vm_quantum) == -1 ||
1199 	    vm_quantum == 0 ||
1200 	    (vm_quantum & (vm_quantum - 1)) != 0 ||
1201 	    cp->cache_slabsize < vm_quantum ||
1202 	    P2PHASE(cp->cache_slabsize, vm_quantum) != 0 ||
1203 	    cp->cache_chunksize == 0 ||
1204 	    cp->cache_chunksize > cp->cache_slabsize) {
1205 		mdb_warn("%p is not a valid kmem_cache_t\n", addr);
1206 		goto out2;
1207 	}
1208 
1209 	dprintf(("buf total is %d\n", cp->cache_buftotal));
1210 
1211 	if (cp->cache_buftotal == 0) {
1212 		mdb_free(cp, csize);
1213 		return (WALK_DONE);
1214 	}
1215 
1216 	/*
1217 	 * If they ask for bufctls, but it's a small-slab cache,
1218 	 * there is nothing to report.
1219 	 */
1220 	if ((type & KM_BUFCTL) && !(cp->cache_flags & KMF_HASH)) {
1221 		dprintf(("bufctl requested, not KMF_HASH (flags: %p)\n",
1222 		    cp->cache_flags));
1223 		mdb_free(cp, csize);
1224 		return (WALK_DONE);
1225 	}
1226 
1227 	/*
1228 	 * If they want constructed buffers, but there's no constructor or
1229 	 * the cache has DEADBEEF checking enabled, there is nothing to report.
1230 	 */
1231 	if ((type & KM_CONSTRUCTED) && (!(type & KM_FREE) ||
1232 	    cp->cache_constructor == NULL ||
1233 	    (cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) == KMF_DEADBEEF)) {
1234 		mdb_free(cp, csize);
1235 		return (WALK_DONE);
1236 	}
1237 
1238 	/*
1239 	 * Read in the contents of the magazine layer
1240 	 */
1241 	if (kmem_read_magazines(cp, addr, ncpus, &maglist, &magcnt,
1242 	    &magmax, UM_SLEEP) == WALK_ERR)
1243 		goto out2;
1244 
1245 	/*
1246 	 * We have all of the buffers from the magazines;  if we are walking
1247 	 * allocated buffers, sort them so we can bsearch them later.
1248 	 */
1249 	if (type & KM_ALLOCATED)
1250 		qsort(maglist, magcnt, sizeof (void *), addrcmp);
1251 
1252 	wsp->walk_data = kmw = mdb_zalloc(sizeof (kmem_walk_t), UM_SLEEP);
1253 
1254 	kmw->kmw_type = type;
1255 	kmw->kmw_addr = addr;
1256 	kmw->kmw_cp = cp;
1257 	kmw->kmw_csize = csize;
1258 	kmw->kmw_maglist = maglist;
1259 	kmw->kmw_max = magmax;
1260 	kmw->kmw_count = magcnt;
1261 	kmw->kmw_pos = 0;
1262 
1263 	/*
1264 	 * When walking allocated buffers in a KMF_HASH cache, we walk the
1265 	 * hash table instead of the slab layer.
1266 	 */
1267 	if ((cp->cache_flags & KMF_HASH) && (type & KM_ALLOCATED)) {
1268 		layered = "kmem_hash";
1269 
1270 		kmw->kmw_type |= KM_HASH;
1271 	} else {
1272 		/*
1273 		 * If we are walking freed buffers, we only need the
1274 		 * magazine layer plus the partially allocated slabs.
1275 		 * To walk allocated buffers, we need all of the slabs.
1276 		 */
1277 		if (type & KM_ALLOCATED)
1278 			layered = "kmem_slab";
1279 		else
1280 			layered = "kmem_slab_partial";
1281 
1282 		/*
1283 		 * for small-slab caches, we read in the entire slab.  For
1284 		 * freed buffers, we can just walk the freelist.  For
1285 		 * allocated buffers, we use a 'valid' array to track
1286 		 * the freed buffers.
1287 		 */
1288 		if (!(cp->cache_flags & KMF_HASH)) {
1289 			chunksize = cp->cache_chunksize;
1290 			slabsize = cp->cache_slabsize;
1291 
1292 			kmw->kmw_ubase = mdb_alloc(slabsize +
1293 			    sizeof (kmem_bufctl_t), UM_SLEEP);
1294 
1295 			if (type & KM_ALLOCATED)
1296 				kmw->kmw_valid =
1297 				    mdb_alloc(slabsize / chunksize, UM_SLEEP);
1298 		}
1299 	}
1300 
1301 	status = WALK_NEXT;
1302 
1303 	if (mdb_layered_walk(layered, wsp) == -1) {
1304 		mdb_warn("unable to start layered '%s' walk", layered);
1305 		status = WALK_ERR;
1306 	}
1307 
1308 out1:
1309 	if (status == WALK_ERR) {
1310 		if (kmw->kmw_valid)
1311 			mdb_free(kmw->kmw_valid, slabsize / chunksize);
1312 
1313 		if (kmw->kmw_ubase)
1314 			mdb_free(kmw->kmw_ubase, slabsize +
1315 			    sizeof (kmem_bufctl_t));
1316 
1317 		if (kmw->kmw_maglist)
1318 			mdb_free(kmw->kmw_maglist,
1319 			    kmw->kmw_max * sizeof (uintptr_t));
1320 
1321 		mdb_free(kmw, sizeof (kmem_walk_t));
1322 		wsp->walk_data = NULL;
1323 	}
1324 
1325 out2:
1326 	if (status == WALK_ERR)
1327 		mdb_free(cp, csize);
1328 
1329 	return (status);
1330 }
1331 
1332 int
1333 kmem_walk_step(mdb_walk_state_t *wsp)
1334 {
1335 	kmem_walk_t *kmw = wsp->walk_data;
1336 	int type = kmw->kmw_type;
1337 	kmem_cache_t *cp = kmw->kmw_cp;
1338 
1339 	void **maglist = kmw->kmw_maglist;
1340 	int magcnt = kmw->kmw_count;
1341 
1342 	uintptr_t chunksize, slabsize;
1343 	uintptr_t addr;
1344 	const kmem_slab_t *sp;
1345 	const kmem_bufctl_t *bcp;
1346 	kmem_bufctl_t bc;
1347 
1348 	int chunks;
1349 	char *kbase;
1350 	void *buf;
1351 	int i, ret;
1352 
1353 	char *valid, *ubase;
1354 
1355 	/*
1356 	 * first, handle the 'kmem_hash' layered walk case
1357 	 */
1358 	if (type & KM_HASH) {
1359 		/*
1360 		 * We have a buffer which has been allocated out of the
1361 		 * global layer. We need to make sure that it's not
1362 		 * actually sitting in a magazine before we report it as
1363 		 * an allocated buffer.
1364 		 */
1365 		buf = ((const kmem_bufctl_t *)wsp->walk_layer)->bc_addr;
1366 
1367 		if (magcnt > 0 &&
1368 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1369 		    addrcmp) != NULL)
1370 			return (WALK_NEXT);
1371 
1372 		if (type & KM_BUFCTL)
1373 			return (bufctl_walk_callback(cp, wsp, wsp->walk_addr));
1374 
1375 		return (kmem_walk_callback(wsp, (uintptr_t)buf));
1376 	}
1377 
1378 	ret = WALK_NEXT;
1379 
1380 	addr = kmw->kmw_addr;
1381 
1382 	/*
1383 	 * If we're walking freed buffers, report everything in the
1384 	 * magazine layer before processing the first slab.
1385 	 */
1386 	if ((type & KM_FREE) && magcnt != 0) {
1387 		kmw->kmw_count = 0;		/* only do this once */
1388 		for (i = 0; i < magcnt; i++) {
1389 			buf = maglist[i];
1390 
1391 			if (type & KM_BUFCTL) {
1392 				uintptr_t out;
1393 
1394 				if (cp->cache_flags & KMF_BUFTAG) {
1395 					kmem_buftag_t *btp;
1396 					kmem_buftag_t tag;
1397 
1398 					/* LINTED - alignment */
1399 					btp = KMEM_BUFTAG(cp, buf);
1400 					if (mdb_vread(&tag, sizeof (tag),
1401 					    (uintptr_t)btp) == -1) {
1402 						mdb_warn("reading buftag for "
1403 						    "%p at %p", buf, btp);
1404 						continue;
1405 					}
1406 					out = (uintptr_t)tag.bt_bufctl;
1407 				} else {
1408 					if (kmem_hash_lookup(cp, addr, buf,
1409 					    &out) == -1)
1410 						continue;
1411 				}
1412 				ret = bufctl_walk_callback(cp, wsp, out);
1413 			} else {
1414 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1415 			}
1416 
1417 			if (ret != WALK_NEXT)
1418 				return (ret);
1419 		}
1420 	}
1421 
1422 	/*
1423 	 * If they want constructed buffers, we're finished, since the
1424 	 * magazine layer holds them all.
1425 	 */
1426 	if (type & KM_CONSTRUCTED)
1427 		return (WALK_DONE);
1428 
1429 	/*
1430 	 * Handle the buffers in the current slab
1431 	 */
1432 	chunksize = cp->cache_chunksize;
1433 	slabsize = cp->cache_slabsize;
1434 
1435 	sp = wsp->walk_layer;
1436 	chunks = sp->slab_chunks;
1437 	kbase = sp->slab_base;
1438 
1439 	dprintf(("kbase is %p\n", kbase));
1440 
1441 	if (!(cp->cache_flags & KMF_HASH)) {
1442 		valid = kmw->kmw_valid;
1443 		ubase = kmw->kmw_ubase;
1444 
1445 		if (mdb_vread(ubase, chunks * chunksize,
1446 		    (uintptr_t)kbase) == -1) {
1447 			mdb_warn("failed to read slab contents at %p", kbase);
1448 			return (WALK_ERR);
1449 		}
1450 
1451 		/*
1452 		 * Set up the valid map as fully allocated -- we'll punch
1453 		 * out the freelist.
1454 		 */
1455 		if (type & KM_ALLOCATED)
1456 			(void) memset(valid, 1, chunks);
1457 	} else {
1458 		valid = NULL;
1459 		ubase = NULL;
1460 	}
1461 
1462 	/*
1463 	 * walk the slab's freelist
1464 	 */
1465 	bcp = sp->slab_head;
1466 
1467 	dprintf(("refcnt is %d; chunks is %d\n", sp->slab_refcnt, chunks));
1468 
1469 	/*
1470 	 * since we could be in the middle of allocating a buffer,
1471 	 * our refcnt could be one higher than it ought to be.  So we
1472 	 * check one further on the freelist than the count allows.
1473 	 */
1474 	for (i = sp->slab_refcnt; i <= chunks; i++) {
1475 		uint_t ndx;
1476 
1477 		dprintf(("bcp is %p\n", bcp));
1478 
1479 		if (bcp == NULL) {
1480 			if (i == chunks)
1481 				break;
1482 			mdb_warn(
1483 			    "slab %p in cache %p freelist too short by %d\n",
1484 			    sp, addr, chunks - i);
1485 			break;
1486 		}
1487 
1488 		if (cp->cache_flags & KMF_HASH) {
1489 			if (mdb_vread(&bc, sizeof (bc), (uintptr_t)bcp) == -1) {
1490 				mdb_warn("failed to read bufctl ptr at %p",
1491 				    bcp);
1492 				break;
1493 			}
1494 			buf = bc.bc_addr;
1495 		} else {
1496 			/*
1497 			 * Otherwise the buffer is (or should be) in the slab
1498 			 * that we've read in; determine its offset in the
1499 			 * slab, validate that it's not corrupt, and add to
1500 			 * our base address to find the kmem_bufctl_t.  (Note
1501 			 * that we don't need to add the size of the bufctl
1502 			 * to our offset calculation because of the slop that's
1503 			 * allocated for the buffer at ubase.)
1504 			 */
1505 			uintptr_t offs = (uintptr_t)bcp - (uintptr_t)kbase;
1506 
1507 			if (offs > chunks * chunksize) {
1508 				mdb_warn("found corrupt bufctl ptr %p"
1509 				    " in slab %p in cache %p\n", bcp,
1510 				    wsp->walk_addr, addr);
1511 				break;
1512 			}
1513 
1514 			bc = *((kmem_bufctl_t *)((uintptr_t)ubase + offs));
1515 			buf = KMEM_BUF(cp, bcp);
1516 		}
1517 
1518 		ndx = ((uintptr_t)buf - (uintptr_t)kbase) / chunksize;
1519 
1520 		if (ndx > slabsize / cp->cache_bufsize) {
1521 			/*
1522 			 * This is very wrong; we have managed to find
1523 			 * a buffer in the slab which shouldn't
1524 			 * actually be here.  Emit a warning, and
1525 			 * try to continue.
1526 			 */
1527 			mdb_warn("buf %p is out of range for "
1528 			    "slab %p, cache %p\n", buf, sp, addr);
1529 		} else if (type & KM_ALLOCATED) {
1530 			/*
1531 			 * we have found a buffer on the slab's freelist;
1532 			 * clear its entry
1533 			 */
1534 			valid[ndx] = 0;
1535 		} else {
1536 			/*
1537 			 * Report this freed buffer
1538 			 */
1539 			if (type & KM_BUFCTL) {
1540 				ret = bufctl_walk_callback(cp, wsp,
1541 				    (uintptr_t)bcp);
1542 			} else {
1543 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1544 			}
1545 			if (ret != WALK_NEXT)
1546 				return (ret);
1547 		}
1548 
1549 		bcp = bc.bc_next;
1550 	}
1551 
1552 	if (bcp != NULL) {
1553 		dprintf(("slab %p in cache %p freelist too long (%p)\n",
1554 		    sp, addr, bcp));
1555 	}
1556 
1557 	/*
1558 	 * If we are walking freed buffers, the loop above handled reporting
1559 	 * them.
1560 	 */
1561 	if (type & KM_FREE)
1562 		return (WALK_NEXT);
1563 
1564 	if (type & KM_BUFCTL) {
1565 		mdb_warn("impossible situation: small-slab KM_BUFCTL walk for "
1566 		    "cache %p\n", addr);
1567 		return (WALK_ERR);
1568 	}
1569 
1570 	/*
1571 	 * Report allocated buffers, skipping buffers in the magazine layer.
1572 	 * We only get this far for small-slab caches.
1573 	 */
1574 	for (i = 0; ret == WALK_NEXT && i < chunks; i++) {
1575 		buf = (char *)kbase + i * chunksize;
1576 
1577 		if (!valid[i])
1578 			continue;		/* on slab freelist */
1579 
1580 		if (magcnt > 0 &&
1581 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1582 		    addrcmp) != NULL)
1583 			continue;		/* in magazine layer */
1584 
1585 		ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1586 	}
1587 	return (ret);
1588 }
1589 
1590 void
1591 kmem_walk_fini(mdb_walk_state_t *wsp)
1592 {
1593 	kmem_walk_t *kmw = wsp->walk_data;
1594 	uintptr_t chunksize;
1595 	uintptr_t slabsize;
1596 
1597 	if (kmw == NULL)
1598 		return;
1599 
1600 	if (kmw->kmw_maglist != NULL)
1601 		mdb_free(kmw->kmw_maglist, kmw->kmw_max * sizeof (void *));
1602 
1603 	chunksize = kmw->kmw_cp->cache_chunksize;
1604 	slabsize = kmw->kmw_cp->cache_slabsize;
1605 
1606 	if (kmw->kmw_valid != NULL)
1607 		mdb_free(kmw->kmw_valid, slabsize / chunksize);
1608 	if (kmw->kmw_ubase != NULL)
1609 		mdb_free(kmw->kmw_ubase, slabsize + sizeof (kmem_bufctl_t));
1610 
1611 	mdb_free(kmw->kmw_cp, kmw->kmw_csize);
1612 	mdb_free(kmw, sizeof (kmem_walk_t));
1613 }
1614 
1615 /*ARGSUSED*/
1616 static int
1617 kmem_walk_all(uintptr_t addr, const kmem_cache_t *c, mdb_walk_state_t *wsp)
1618 {
1619 	/*
1620 	 * Buffers allocated from NOTOUCH caches can also show up as freed
1621 	 * memory in other caches.  This can be a little confusing, so we
1622 	 * don't walk NOTOUCH caches when walking all caches (thereby assuring
1623 	 * that "::walk kmem" and "::walk freemem" yield disjoint output).
1624 	 */
1625 	if (c->cache_cflags & KMC_NOTOUCH)
1626 		return (WALK_NEXT);
1627 
1628 	if (mdb_pwalk(wsp->walk_data, wsp->walk_callback,
1629 	    wsp->walk_cbdata, addr) == -1)
1630 		return (WALK_DONE);
1631 
1632 	return (WALK_NEXT);
1633 }
1634 
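/*
 * KMEM_WALK_ALL() handles the global (no address) case: it walks every
 * kmem_cache and re-invokes the named walker on each cache via
 * kmem_walk_all(), which receives the walker name through walk_data.
 */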
1635 #define	KMEM_WALK_ALL(name, wsp) { \
1636 	wsp->walk_data = (name); \
1637 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_walk_all, wsp) == -1) \
1638 		return (WALK_ERR); \
1639 	return (WALK_DONE); \
1640 }
1641 
1642 int
1643 kmem_walk_init(mdb_walk_state_t *wsp)
1644 {
1645 	if (wsp->walk_arg != NULL)
1646 		wsp->walk_addr = (uintptr_t)wsp->walk_arg;
1647 
1648 	if (wsp->walk_addr == 0)
1649 		KMEM_WALK_ALL("kmem", wsp);
1650 	return (kmem_walk_init_common(wsp, KM_ALLOCATED));
1651 }
1652 
1653 int
1654 bufctl_walk_init(mdb_walk_state_t *wsp)
1655 {
1656 	if (wsp->walk_addr == 0)
1657 		KMEM_WALK_ALL("bufctl", wsp);
1658 	return (kmem_walk_init_common(wsp, KM_ALLOCATED | KM_BUFCTL));
1659 }
1660 
1661 int
1662 freemem_walk_init(mdb_walk_state_t *wsp)
1663 {
1664 	if (wsp->walk_addr == 0)
1665 		KMEM_WALK_ALL("freemem", wsp);
1666 	return (kmem_walk_init_common(wsp, KM_FREE));
1667 }
1668 
1669 int
1670 freemem_constructed_walk_init(mdb_walk_state_t *wsp)
1671 {
1672 	if (wsp->walk_addr == 0)
1673 		KMEM_WALK_ALL("freemem_constructed", wsp);
1674 	return (kmem_walk_init_common(wsp, KM_FREE | KM_CONSTRUCTED));
1675 }
1676 
1677 int
1678 freectl_walk_init(mdb_walk_state_t *wsp)
1679 {
1680 	if (wsp->walk_addr == 0)
1681 		KMEM_WALK_ALL("freectl", wsp);
1682 	return (kmem_walk_init_common(wsp, KM_FREE | KM_BUFCTL));
1683 }
1684 
1685 int
1686 freectl_constructed_walk_init(mdb_walk_state_t *wsp)
1687 {
1688 	if (wsp->walk_addr == 0)
1689 		KMEM_WALK_ALL("freectl_constructed", wsp);
1690 	return (kmem_walk_init_common(wsp,
1691 	    KM_FREE | KM_BUFCTL | KM_CONSTRUCTED));
1692 }
1693 
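/*
 * The bufctl_history walker starts from a bufctl and follows its bc_lastlog
 * chain backwards through the transaction log, reporting each prior
 * transaction on the same buffer.  Each entry must match the original
 * buffer address, cache, and slab, and timestamps must strictly decrease,
 * which guards against reused log entries and infinite loops.
 */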
1694 typedef struct bufctl_history_walk {
1695 	void		*bhw_next;
1696 	kmem_cache_t	*bhw_cache;
1697 	kmem_slab_t	*bhw_slab;
1698 	hrtime_t	bhw_timestamp;
1699 } bufctl_history_walk_t;
1700 
1701 int
1702 bufctl_history_walk_init(mdb_walk_state_t *wsp)
1703 {
1704 	bufctl_history_walk_t *bhw;
1705 	kmem_bufctl_audit_t bc;
1706 	kmem_bufctl_audit_t bcn;
1707 
1708 	if (wsp->walk_addr == 0) {
1709 		mdb_warn("bufctl_history walk doesn't support global walks\n");
1710 		return (WALK_ERR);
1711 	}
1712 
1713 	if (mdb_vread(&bc, sizeof (bc), wsp->walk_addr) == -1) {
1714 		mdb_warn("unable to read bufctl at %p", wsp->walk_addr);
1715 		return (WALK_ERR);
1716 	}
1717 
1718 	bhw = mdb_zalloc(sizeof (*bhw), UM_SLEEP);
1719 	bhw->bhw_timestamp = 0;
1720 	bhw->bhw_cache = bc.bc_cache;
1721 	bhw->bhw_slab = bc.bc_slab;
1722 
1723 	/*
1724 	 * sometimes the first log entry matches the base bufctl;  in that
1725 	 * case, skip the base bufctl.
1726 	 */
1727 	if (bc.bc_lastlog != NULL &&
1728 	    mdb_vread(&bcn, sizeof (bcn), (uintptr_t)bc.bc_lastlog) != -1 &&
1729 	    bc.bc_addr == bcn.bc_addr &&
1730 	    bc.bc_cache == bcn.bc_cache &&
1731 	    bc.bc_slab == bcn.bc_slab &&
1732 	    bc.bc_timestamp == bcn.bc_timestamp &&
1733 	    bc.bc_thread == bcn.bc_thread)
1734 		bhw->bhw_next = bc.bc_lastlog;
1735 	else
1736 		bhw->bhw_next = (void *)wsp->walk_addr;
1737 
1738 	wsp->walk_addr = (uintptr_t)bc.bc_addr;
1739 	wsp->walk_data = bhw;
1740 
1741 	return (WALK_NEXT);
1742 }
1743 
1744 int
1745 bufctl_history_walk_step(mdb_walk_state_t *wsp)
1746 {
1747 	bufctl_history_walk_t *bhw = wsp->walk_data;
1748 	uintptr_t addr = (uintptr_t)bhw->bhw_next;
1749 	uintptr_t baseaddr = wsp->walk_addr;
1750 	kmem_bufctl_audit_t bc;
1751 
1752 	if (addr == 0)
1753 		return (WALK_DONE);
1754 
1755 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1756 		mdb_warn("unable to read bufctl at %p", bhw->bhw_next);
1757 		return (WALK_ERR);
1758 	}
1759 
1760 	/*
1761 	 * The bufctl is only valid if the address, cache, and slab are
1762 	 * correct.  We also check that the timestamp is decreasing, to
1763 	 * prevent infinite loops.
1764 	 */
1765 	if ((uintptr_t)bc.bc_addr != baseaddr ||
1766 	    bc.bc_cache != bhw->bhw_cache ||
1767 	    bc.bc_slab != bhw->bhw_slab ||
1768 	    (bhw->bhw_timestamp != 0 && bc.bc_timestamp >= bhw->bhw_timestamp))
1769 		return (WALK_DONE);
1770 
1771 	bhw->bhw_next = bc.bc_lastlog;
1772 	bhw->bhw_timestamp = bc.bc_timestamp;
1773 
1774 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1775 }
1776 
1777 void
1778 bufctl_history_walk_fini(mdb_walk_state_t *wsp)
1779 {
1780 	bufctl_history_walk_t *bhw = wsp->walk_data;
1781 
1782 	mdb_free(bhw, sizeof (*bhw));
1783 }
1784 
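/*
 * The kmem_log walker reads an entire kmem transaction log into the
 * debugger, sorts the bufctl audit records by timestamp (most recent
 * first), and hands each record back along with its address in the target.
 */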
1785 typedef struct kmem_log_walk {
1786 	kmem_bufctl_audit_t *klw_base;
1787 	kmem_bufctl_audit_t **klw_sorted;
1788 	kmem_log_header_t klw_lh;
1789 	size_t klw_size;
1790 	size_t klw_maxndx;
1791 	size_t klw_ndx;
1792 } kmem_log_walk_t;
1793 
1794 int
1795 kmem_log_walk_init(mdb_walk_state_t *wsp)
1796 {
1797 	uintptr_t lp = wsp->walk_addr;
1798 	kmem_log_walk_t *klw;
1799 	kmem_log_header_t *lhp;
1800 	int maxndx, i, j, k;
1801 
1802 	/*
1803 	 * By default (global walk), walk the kmem_transaction_log.  Otherwise
1804 	 * read the log whose kmem_log_header_t is stored at walk_addr.
1805 	 */
1806 	if (lp == 0 && mdb_readvar(&lp, "kmem_transaction_log") == -1) {
1807 		mdb_warn("failed to read 'kmem_transaction_log'");
1808 		return (WALK_ERR);
1809 	}
1810 
1811 	if (lp == 0) {
1812 		mdb_warn("log is disabled\n");
1813 		return (WALK_ERR);
1814 	}
1815 
1816 	klw = mdb_zalloc(sizeof (kmem_log_walk_t), UM_SLEEP);
1817 	lhp = &klw->klw_lh;
1818 
1819 	if (mdb_vread(lhp, sizeof (kmem_log_header_t), lp) == -1) {
1820 		mdb_warn("failed to read log header at %p", lp);
1821 		mdb_free(klw, sizeof (kmem_log_walk_t));
1822 		return (WALK_ERR);
1823 	}
1824 
1825 	klw->klw_size = lhp->lh_chunksize * lhp->lh_nchunks;
1826 	klw->klw_base = mdb_alloc(klw->klw_size, UM_SLEEP);
1827 	maxndx = lhp->lh_chunksize / sizeof (kmem_bufctl_audit_t) - 1;
1828 
1829 	if (mdb_vread(klw->klw_base, klw->klw_size,
1830 	    (uintptr_t)lhp->lh_base) == -1) {
1831 		mdb_warn("failed to read log at base %p", lhp->lh_base);
1832 		mdb_free(klw->klw_base, klw->klw_size);
1833 		mdb_free(klw, sizeof (kmem_log_walk_t));
1834 		return (WALK_ERR);
1835 	}
1836 
1837 	klw->klw_sorted = mdb_alloc(maxndx * lhp->lh_nchunks *
1838 	    sizeof (kmem_bufctl_audit_t *), UM_SLEEP);
1839 
1840 	for (i = 0, k = 0; i < lhp->lh_nchunks; i++) {
1841 		kmem_bufctl_audit_t *chunk = (kmem_bufctl_audit_t *)
1842 		    ((uintptr_t)klw->klw_base + i * lhp->lh_chunksize);
1843 
1844 		for (j = 0; j < maxndx; j++)
1845 			klw->klw_sorted[k++] = &chunk[j];
1846 	}
1847 
1848 	qsort(klw->klw_sorted, k, sizeof (kmem_bufctl_audit_t *),
1849 	    (int(*)(const void *, const void *))bufctlcmp);
1850 
1851 	klw->klw_maxndx = k;
1852 	wsp->walk_data = klw;
1853 
1854 	return (WALK_NEXT);
1855 }
1856 
1857 int
1858 kmem_log_walk_step(mdb_walk_state_t *wsp)
1859 {
1860 	kmem_log_walk_t *klw = wsp->walk_data;
1861 	kmem_bufctl_audit_t *bcp;
1862 
1863 	if (klw->klw_ndx == klw->klw_maxndx)
1864 		return (WALK_DONE);
1865 
1866 	bcp = klw->klw_sorted[klw->klw_ndx++];
1867 
1868 	return (wsp->walk_callback((uintptr_t)bcp - (uintptr_t)klw->klw_base +
1869 	    (uintptr_t)klw->klw_lh.lh_base, bcp, wsp->walk_cbdata));
1870 }
1871 
1872 void
1873 kmem_log_walk_fini(mdb_walk_state_t *wsp)
1874 {
1875 	kmem_log_walk_t *klw = wsp->walk_data;
1876 
1877 	mdb_free(klw->klw_base, klw->klw_size);
1878 	mdb_free(klw->klw_sorted, klw->klw_maxndx *
1879 	    sizeof (kmem_bufctl_audit_t *));
1880 	mdb_free(klw, sizeof (kmem_log_walk_t));
1881 }
1882 
1883 typedef struct allocdby_bufctl {
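/*
 * Illustrative usage (not part of the module): the kmem_log walker visits
 * every entry of the transaction log in the order established by
 * bufctlcmp(), reporting each entry's address within the target's log.
 * A hypothetical pipeline:
 *
 *	> ::walk kmem_log | ::bufctl
 */
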
1884 	uintptr_t abb_addr;
1885 	hrtime_t abb_ts;
1886 } allocdby_bufctl_t;
1887 
1888 typedef struct allocdby_walk {
1889 	const char *abw_walk;
1890 	uintptr_t abw_thread;
1891 	size_t abw_nbufs;
1892 	size_t abw_size;
1893 	allocdby_bufctl_t *abw_buf;
1894 	size_t abw_ndx;
1895 } allocdby_walk_t;
1896 
1897 int
1898 allocdby_walk_bufctl(uintptr_t addr, const kmem_bufctl_audit_t *bcp,
1899     allocdby_walk_t *abw)
1900 {
1901 	if ((uintptr_t)bcp->bc_thread != abw->abw_thread)
1902 		return (WALK_NEXT);
1903 
1904 	if (abw->abw_nbufs == abw->abw_size) {
1905 		allocdby_bufctl_t *buf;
1906 		size_t oldsize = sizeof (allocdby_bufctl_t) * abw->abw_size;
1907 
1908 		buf = mdb_zalloc(oldsize << 1, UM_SLEEP);
1909 
1910 		bcopy(abw->abw_buf, buf, oldsize);
1911 		mdb_free(abw->abw_buf, oldsize);
1912 
1913 		abw->abw_size <<= 1;
1914 		abw->abw_buf = buf;
1915 	}
1916 
1917 	abw->abw_buf[abw->abw_nbufs].abb_addr = addr;
1918 	abw->abw_buf[abw->abw_nbufs].abb_ts = bcp->bc_timestamp;
1919 	abw->abw_nbufs++;
1920 
1921 	return (WALK_NEXT);
1922 }
1923 
1924 /*ARGSUSED*/
1925 int
1926 allocdby_walk_cache(uintptr_t addr, const kmem_cache_t *c, allocdby_walk_t *abw)
1927 {
1928 	if (mdb_pwalk(abw->abw_walk, (mdb_walk_cb_t)allocdby_walk_bufctl,
1929 	    abw, addr) == -1) {
1930 		mdb_warn("couldn't walk bufctl for cache %p", addr);
1931 		return (WALK_DONE);
1932 	}
1933 
1934 	return (WALK_NEXT);
1935 }
1936 
1937 static int
1938 allocdby_cmp(const allocdby_bufctl_t *lhs, const allocdby_bufctl_t *rhs)
1939 {
1940 	if (lhs->abb_ts < rhs->abb_ts)
1941 		return (1);
1942 	if (lhs->abb_ts > rhs->abb_ts)
1943 		return (-1);
1944 	return (0);
1945 }
1946 
1947 static int
1948 allocdby_walk_init_common(mdb_walk_state_t *wsp, const char *walk)
1949 {
1950 	allocdby_walk_t *abw;
1951 
1952 	if (wsp->walk_addr == 0) {
1953 		mdb_warn("allocdby walk doesn't support global walks\n");
1954 		return (WALK_ERR);
1955 	}
1956 
1957 	abw = mdb_zalloc(sizeof (allocdby_walk_t), UM_SLEEP);
1958 
1959 	abw->abw_thread = wsp->walk_addr;
1960 	abw->abw_walk = walk;
1961 	abw->abw_size = 128;	/* something reasonable */
1962 	abw->abw_buf =
1963 	    mdb_zalloc(abw->abw_size * sizeof (allocdby_bufctl_t), UM_SLEEP);
1964 
1965 	wsp->walk_data = abw;
1966 
1967 	if (mdb_walk("kmem_cache",
1968 	    (mdb_walk_cb_t)allocdby_walk_cache, abw) == -1) {
1969 		mdb_warn("couldn't walk kmem_cache");
1970 		allocdby_walk_fini(wsp);
1971 		return (WALK_ERR);
1972 	}
1973 
1974 	qsort(abw->abw_buf, abw->abw_nbufs, sizeof (allocdby_bufctl_t),
1975 	    (int(*)(const void *, const void *))allocdby_cmp);
1976 
1977 	return (WALK_NEXT);
1978 }
1979 
1980 int
1981 allocdby_walk_init(mdb_walk_state_t *wsp)
1982 {
1983 	return (allocdby_walk_init_common(wsp, "bufctl"));
1984 }
1985 
1986 int
1987 freedby_walk_init(mdb_walk_state_t *wsp)
1988 {
1989 	return (allocdby_walk_init_common(wsp, "freectl"));
1990 }
1991 
1992 int
1993 allocdby_walk_step(mdb_walk_state_t *wsp)
1994 {
1995 	allocdby_walk_t *abw = wsp->walk_data;
1996 	kmem_bufctl_audit_t bc;
1997 	uintptr_t addr;
1998 
1999 	if (abw->abw_ndx == abw->abw_nbufs)
2000 		return (WALK_DONE);
2001 
2002 	addr = abw->abw_buf[abw->abw_ndx++].abb_addr;
2003 
2004 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2005 		mdb_warn("couldn't read bufctl at %p", addr);
2006 		return (WALK_DONE);
2007 	}
2008 
2009 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
2010 }
2011 
2012 void
2013 allocdby_walk_fini(mdb_walk_state_t *wsp)
2014 {
2015 	allocdby_walk_t *abw = wsp->walk_data;
2016 
2017 	mdb_free(abw->abw_buf, sizeof (allocdby_bufctl_t) * abw->abw_size);
2018 	mdb_free(abw, sizeof (allocdby_walk_t));
2019 }
2020 
2021 /*ARGSUSED*/
2022 int
2023 allocdby_walk(uintptr_t addr, const kmem_bufctl_audit_t *bcp, void *ignored)
2024 {
2025 	char c[MDB_SYM_NAMLEN];
2026 	GElf_Sym sym;
2027 	int i;
2028 
2029 	mdb_printf("%0?p %12llx ", addr, bcp->bc_timestamp);
2030 	for (i = 0; i < bcp->bc_depth; i++) {
2031 		if (mdb_lookup_by_addr(bcp->bc_stack[i],
2032 		    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2033 			continue;
2034 		if (strncmp(c, "kmem_", 5) == 0)
2035 			continue;
2036 		mdb_printf("%s+0x%lx",
2037 		    c, bcp->bc_stack[i] - (uintptr_t)sym.st_value);
2038 		break;
2039 	}
2040 	mdb_printf("\n");
2041 
2042 	return (WALK_NEXT);
2043 }
2044 
2045 static int
2046 allocdby_common(uintptr_t addr, uint_t flags, const char *w)
2047 {
2048 	if (!(flags & DCMD_ADDRSPEC))
2049 		return (DCMD_USAGE);
2050 
2051 	mdb_printf("%-?s %12s %s\n", "BUFCTL", "TIMESTAMP", "CALLER");
2052 
2053 	if (mdb_pwalk(w, (mdb_walk_cb_t)allocdby_walk, NULL, addr) == -1) {
2054 		mdb_warn("can't walk '%s' for %p", w, addr);
2055 		return (DCMD_ERR);
2056 	}
2057 
2058 	return (DCMD_OK);
2059 }
2060 
2061 /*ARGSUSED*/
2062 int
2063 allocdby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2064 {
2065 	return (allocdby_common(addr, flags, "allocdby"));
2066 }
2067 
2068 /*ARGSUSED*/
2069 int
2070 freedby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2071 {
2072 	return (allocdby_common(addr, flags, "freedby"));
2073 }
2074 
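/*
 * Illustrative usage (not part of the module): ::allocdby and ::freedby
 * take a kthread_t address and list the bufctls recording allocations or
 * frees by that thread, most recent first (hypothetical address):
 *
 *	> 0xffffff01d6aa0c00::allocdby
 */
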
2075 /*
2076  * Return a string describing the address in relation to the given thread's
2077  * stack.
2078  *
2079  * - If the thread state is TS_FREE, return " (inactive interrupt thread)".
2080  *
2081  * - If the address is above the stack pointer, return an empty string
2082  *   signifying that the address is active.
2083  *
2084  * - If the address is below the stack pointer, and the thread is not on proc,
2085  *   return " (below sp)".
2086  *
2087  * - If the address is below the stack pointer, and the thread is on proc,
2088  *   return " (possibly below sp)".  Depending on context, we may or may not
2089  *   have an accurate t_sp.
2090  */
2091 static const char *
2092 stack_active(const kthread_t *t, uintptr_t addr)
2093 {
2094 	uintptr_t panicstk;
2095 	GElf_Sym sym;
2096 
2097 	if (t->t_state == TS_FREE)
2098 		return (" (inactive interrupt thread)");
2099 
2100 	/*
2101 	 * Check to see if we're on the panic stack.  If so, ignore t_sp, as it
2102 	 * no longer relates to the thread's real stack.
2103 	 */
2104 	if (mdb_lookup_by_name("panic_stack", &sym) == 0) {
2105 		panicstk = (uintptr_t)sym.st_value;
2106 
2107 		if (t->t_sp >= panicstk && t->t_sp < panicstk + PANICSTKSIZE)
2108 			return ("");
2109 	}
2110 
2111 	if (addr >= t->t_sp + STACK_BIAS)
2112 		return ("");
2113 
2114 	if (t->t_state == TS_ONPROC)
2115 		return (" (possibly below sp)");
2116 
2117 	return (" (below sp)");
2118 }
2119 
2120 /*
2121  * Additional state for the kmem and vmem ::whatis handlers
2122  */
2123 typedef struct whatis_info {
2124 	mdb_whatis_t *wi_w;
2125 	const kmem_cache_t *wi_cache;
2126 	const vmem_t *wi_vmem;
2127 	vmem_t *wi_msb_arena;
2128 	size_t wi_slab_size;
2129 	uint_t wi_slab_found;
2130 	uint_t wi_kmem_lite_count;
2131 	uint_t wi_freemem;
2132 } whatis_info_t;
2133 
2134 /* call one of our dcmd functions with "-v" and the provided address */
2135 static void
2136 whatis_call_printer(mdb_dcmd_f *dcmd, uintptr_t addr)
2137 {
2138 	mdb_arg_t a;
2139 	a.a_type = MDB_TYPE_STRING;
2140 	a.a_un.a_str = "-v";
2141 
2142 	mdb_printf(":\n");
2143 	(void) (*dcmd)(addr, DCMD_ADDRSPEC, 1, &a);
2144 }
2145 
2146 static void
2147 whatis_print_kmf_lite(uintptr_t btaddr, size_t count)
2148 {
2149 #define	KMEM_LITE_MAX	16
2150 	pc_t callers[KMEM_LITE_MAX];
2151 	pc_t uninit = (pc_t)KMEM_UNINITIALIZED_PATTERN;
2152 
2153 	kmem_buftag_t bt;
2154 	intptr_t stat;
2155 	const char *plural = "";
2156 	int i;
2157 
2158 	/* validate our arguments and read in the buftag */
2159 	if (count == 0 || count > KMEM_LITE_MAX ||
2160 	    mdb_vread(&bt, sizeof (bt), btaddr) == -1)
2161 		return;
2162 
2163 	/* validate the buffer state and read in the callers */
2164 	stat = (intptr_t)bt.bt_bufctl ^ bt.bt_bxstat;
2165 
2166 	if (stat != KMEM_BUFTAG_ALLOC && stat != KMEM_BUFTAG_FREE)
2167 		return;
2168 
2169 	if (mdb_vread(callers, count * sizeof (pc_t),
2170 	    btaddr + offsetof(kmem_buftag_lite_t, bt_history)) == -1)
2171 		return;
2172 
2173 	/* If there aren't any filled in callers, bail */
2174 	if (callers[0] == uninit)
2175 		return;
2176 
2177 	plural = (callers[1] == uninit) ? "" : "s";
2178 
2179 	/* Everything's done and checked; print them out */
2180 	mdb_printf(":\n");
2181 
2182 	mdb_inc_indent(8);
2183 	mdb_printf("recent caller%s: %a", plural, callers[0]);
2184 	for (i = 1; i < count; i++) {
2185 		if (callers[i] == uninit)
2186 			break;
2187 		mdb_printf(", %a", callers[i]);
2188 	}
2189 	mdb_dec_indent(8);
2190 }
2191 
2192 static void
2193 whatis_print_kmem(whatis_info_t *wi, uintptr_t maddr, uintptr_t addr,
2194     uintptr_t baddr)
2195 {
2196 	mdb_whatis_t *w = wi->wi_w;
2197 
2198 	const kmem_cache_t *cp = wi->wi_cache;
2199 	/* LINTED pointer cast may result in improper alignment */
2200 	uintptr_t btaddr = (uintptr_t)KMEM_BUFTAG(cp, addr);
2201 	int quiet = (mdb_whatis_flags(w) & WHATIS_QUIET);
2202 	int call_printer = (!quiet && (cp->cache_flags & KMF_AUDIT));
2203 
2204 	mdb_whatis_report_object(w, maddr, addr, "");
2205 
2206 	if (baddr != 0 && !call_printer)
2207 		mdb_printf("bufctl %p ", baddr);
2208 
2209 	mdb_printf("%s from %s",
2210 	    (wi->wi_freemem == FALSE) ? "allocated" : "freed", cp->cache_name);
2211 
2212 	if (baddr != 0 && call_printer) {
2213 		whatis_call_printer(bufctl, baddr);
2214 		return;
2215 	}
2216 
2217 	/* for KMF_LITE caches, try to print out the previous callers */
2218 	if (!quiet && (cp->cache_flags & KMF_LITE))
2219 		whatis_print_kmf_lite(btaddr, wi->wi_kmem_lite_count);
2220 
2221 	mdb_printf("\n");
2222 }
2223 
2224 /*ARGSUSED*/
2225 static int
2226 whatis_walk_kmem(uintptr_t addr, void *ignored, whatis_info_t *wi)
2227 {
2228 	mdb_whatis_t *w = wi->wi_w;
2229 
2230 	uintptr_t cur;
2231 	size_t size = wi->wi_cache->cache_bufsize;
2232 
2233 	while (mdb_whatis_match(w, addr, size, &cur))
2234 		whatis_print_kmem(wi, cur, addr, 0);
2235 
2236 	return (WHATIS_WALKRET(w));
2237 }
2238 
2239 /*ARGSUSED*/
2240 static int
2241 whatis_walk_bufctl(uintptr_t baddr, const kmem_bufctl_t *bcp, whatis_info_t *wi)
2242 {
2243 	mdb_whatis_t *w = wi->wi_w;
2244 
2245 	uintptr_t cur;
2246 	uintptr_t addr = (uintptr_t)bcp->bc_addr;
2247 	size_t size = wi->wi_cache->cache_bufsize;
2248 
2249 	while (mdb_whatis_match(w, addr, size, &cur))
2250 		whatis_print_kmem(wi, cur, addr, baddr);
2251 
2252 	return (WHATIS_WALKRET(w));
2253 }
2254 
2255 static int
2256 whatis_walk_seg(uintptr_t addr, const vmem_seg_t *vs, whatis_info_t *wi)
2257 {
2258 	mdb_whatis_t *w = wi->wi_w;
2259 
2260 	size_t size = vs->vs_end - vs->vs_start;
2261 	uintptr_t cur;
2262 
2263 	/* We're not interested in anything but alloc and free segments */
2264 	if (vs->vs_type != VMEM_ALLOC && vs->vs_type != VMEM_FREE)
2265 		return (WALK_NEXT);
2266 
2267 	while (mdb_whatis_match(w, vs->vs_start, size, &cur)) {
2268 		mdb_whatis_report_object(w, cur, vs->vs_start, "");
2269 
2270 		/*
2271 		 * If we're not printing it separately, provide the vmem_seg
2272 		 * pointer if it has a stack trace.
2273 		 */
2274 		if ((mdb_whatis_flags(w) & WHATIS_QUIET) &&
2275 		    (!(mdb_whatis_flags(w) & WHATIS_BUFCTL) ||
2276 		    (vs->vs_type == VMEM_ALLOC && vs->vs_depth != 0))) {
2277 			mdb_printf("vmem_seg %p ", addr);
2278 		}
2279 
2280 		mdb_printf("%s from the %s vmem arena",
2281 		    (vs->vs_type == VMEM_ALLOC) ? "allocated" : "freed",
2282 		    wi->wi_vmem->vm_name);
2283 
2284 		if (!(mdb_whatis_flags(w) & WHATIS_QUIET))
2285 			whatis_call_printer(vmem_seg, addr);
2286 		else
2287 			mdb_printf("\n");
2288 	}
2289 
2290 	return (WHATIS_WALKRET(w));
2291 }
2292 
2293 static int
2294 whatis_walk_vmem(uintptr_t addr, const vmem_t *vmem, whatis_info_t *wi)
2295 {
2296 	mdb_whatis_t *w = wi->wi_w;
2297 	const char *nm = vmem->vm_name;
2298 
2299 	int identifier = ((vmem->vm_cflags & VMC_IDENTIFIER) != 0);
2300 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2301 
2302 	if (identifier != idspace)
2303 		return (WALK_NEXT);
2304 
2305 	wi->wi_vmem = vmem;
2306 
2307 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2308 		mdb_printf("Searching vmem arena %s...\n", nm);
2309 
2310 	if (mdb_pwalk("vmem_seg",
2311 	    (mdb_walk_cb_t)whatis_walk_seg, wi, addr) == -1) {
2312 		mdb_warn("can't walk vmem_seg for %p", addr);
2313 		return (WALK_NEXT);
2314 	}
2315 
2316 	return (WHATIS_WALKRET(w));
2317 }
2318 
2319 /*ARGSUSED*/
2320 static int
2321 whatis_walk_slab(uintptr_t saddr, const kmem_slab_t *sp, whatis_info_t *wi)
2322 {
2323 	mdb_whatis_t *w = wi->wi_w;
2324 
2325 	/* It must overlap with the slab data, or it's not interesting */
2326 	if (mdb_whatis_overlaps(w,
2327 	    (uintptr_t)sp->slab_base, wi->wi_slab_size)) {
2328 		wi->wi_slab_found++;
2329 		return (WALK_DONE);
2330 	}
2331 	return (WALK_NEXT);
2332 }
2333 
2334 static int
2335 whatis_walk_cache(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2336 {
2337 	mdb_whatis_t *w = wi->wi_w;
2338 
2339 	char *walk, *freewalk;
2340 	mdb_walk_cb_t func;
2341 	int do_bufctl;
2342 
2343 	int identifier = ((c->cache_flags & KMC_IDENTIFIER) != 0);
2344 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2345 
2346 	if (identifier != idspace)
2347 		return (WALK_NEXT);
2348 
2349 	/* Override the '-b' flag as necessary */
2350 	if (!(c->cache_flags & KMF_HASH))
2351 		do_bufctl = FALSE;	/* no bufctls to walk */
2352 	else if (c->cache_flags & KMF_AUDIT)
2353 		do_bufctl = TRUE;	/* we always want debugging info */
2354 	else
2355 		do_bufctl = ((mdb_whatis_flags(w) & WHATIS_BUFCTL) != 0);
2356 
2357 	if (do_bufctl) {
2358 		walk = "bufctl";
2359 		freewalk = "freectl";
2360 		func = (mdb_walk_cb_t)whatis_walk_bufctl;
2361 	} else {
2362 		walk = "kmem";
2363 		freewalk = "freemem";
2364 		func = (mdb_walk_cb_t)whatis_walk_kmem;
2365 	}
2366 
2367 	wi->wi_cache = c;
2368 
2369 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2370 		mdb_printf("Searching %s...\n", c->cache_name);
2371 
2372 	/*
2373 	 * If more than two buffers live on each slab, figure out if we're
2374 	 * interested in anything in any slab before doing the more expensive
2375 	 * kmem/freemem (bufctl/freectl) walkers.
2376 	 */
2377 	wi->wi_slab_size = c->cache_slabsize - c->cache_maxcolor;
2378 	if (!(c->cache_flags & KMF_HASH))
2379 		wi->wi_slab_size -= sizeof (kmem_slab_t);
2380 
2381 	if ((wi->wi_slab_size / c->cache_chunksize) > 2) {
2382 		wi->wi_slab_found = 0;
2383 		if (mdb_pwalk("kmem_slab", (mdb_walk_cb_t)whatis_walk_slab, wi,
2384 		    addr) == -1) {
2385 			mdb_warn("can't find kmem_slab walker");
2386 			return (WALK_DONE);
2387 		}
2388 		if (wi->wi_slab_found == 0)
2389 			return (WALK_NEXT);
2390 	}
2391 
2392 	wi->wi_freemem = FALSE;
2393 	if (mdb_pwalk(walk, func, wi, addr) == -1) {
2394 		mdb_warn("can't find %s walker", walk);
2395 		return (WALK_DONE);
2396 	}
2397 
2398 	if (mdb_whatis_done(w))
2399 		return (WALK_DONE);
2400 
2401 	/*
2402 	 * We have searched for allocated memory; now search for freed memory.
2403 	 */
2404 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2405 		mdb_printf("Searching %s for free memory...\n", c->cache_name);
2406 
2407 	wi->wi_freemem = TRUE;
2408 	if (mdb_pwalk(freewalk, func, wi, addr) == -1) {
2409 		mdb_warn("can't find %s walker", freewalk);
2410 		return (WALK_DONE);
2411 	}
2412 
2413 	return (WHATIS_WALKRET(w));
2414 }
2415 
2416 static int
2417 whatis_walk_touch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2418 {
2419 	if (c->cache_arena == wi->wi_msb_arena ||
2420 	    (c->cache_cflags & KMC_NOTOUCH))
2421 		return (WALK_NEXT);
2422 
2423 	return (whatis_walk_cache(addr, c, wi));
2424 }
2425 
2426 static int
2427 whatis_walk_metadata(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2428 {
2429 	if (c->cache_arena != wi->wi_msb_arena)
2430 		return (WALK_NEXT);
2431 
2432 	return (whatis_walk_cache(addr, c, wi));
2433 }
2434 
2435 static int
2436 whatis_walk_notouch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2437 {
2438 	if (c->cache_arena == wi->wi_msb_arena ||
2439 	    !(c->cache_cflags & KMC_NOTOUCH))
2440 		return (WALK_NEXT);
2441 
2442 	return (whatis_walk_cache(addr, c, wi));
2443 }
2444 
2445 static int
2446 whatis_walk_thread(uintptr_t addr, const kthread_t *t, mdb_whatis_t *w)
2447 {
2448 	uintptr_t cur;
2449 	uintptr_t saddr;
2450 	size_t size;
2451 
2452 	/*
2453 	 * Often, one calls ::whatis on an address from a thread structure.
2454 	 * We use this opportunity to short circuit this case...
2455 	 */
2456 	while (mdb_whatis_match(w, addr, sizeof (kthread_t), &cur))
2457 		mdb_whatis_report_object(w, cur, addr,
2458 		    "allocated as a thread structure\n");
2459 
2460 	/*
2461 	 * Now check the stack
2462 	 */
2463 	if (t->t_stkbase == NULL)
2464 		return (WALK_NEXT);
2465 
2466 	/*
2467 	 * This assumes that t_stk is the end of the stack, but it's really
2468 	 * only the initial stack pointer for the thread.  Arguments to the
2469 	 * initial procedure, SA(MINFRAME), etc. are all after t_stk.  So
2470 	 * that 't->t_stk::whatis' reports "part of t's stack", we include
2471 	 * t_stk in the range (the "+ 1", below), but the kernel should
2472 	 * really include the full stack bounds where we can find it.
2473 	 */
2474 	saddr = (uintptr_t)t->t_stkbase;
2475 	size = (uintptr_t)t->t_stk - saddr + 1;
2476 	while (mdb_whatis_match(w, saddr, size, &cur))
2477 		mdb_whatis_report_object(w, cur, cur,
2478 		    "in thread %p's stack%s\n", addr, stack_active(t, cur));
2479 
2480 	return (WHATIS_WALKRET(w));
2481 }
2482 
2483 static void
2484 whatis_modctl_match(mdb_whatis_t *w, const char *name,
2485     uintptr_t base, size_t size, const char *where)
2486 {
2487 	uintptr_t cur;
2488 
2489 	/*
2490 	 * Since we're searching for addresses inside a module, we report
2491 	 * them as symbols.
2492 	 */
2493 	while (mdb_whatis_match(w, base, size, &cur))
2494 		mdb_whatis_report_address(w, cur, "in %s's %s\n", name, where);
2495 }
2496 
2497 static int
2498 whatis_walk_modctl(uintptr_t addr, const struct modctl *m, mdb_whatis_t *w)
2499 {
2500 	char name[MODMAXNAMELEN];
2501 	struct module mod;
2502 	Shdr shdr;
2503 
2504 	if (m->mod_mp == NULL)
2505 		return (WALK_NEXT);
2506 
2507 	if (mdb_vread(&mod, sizeof (mod), (uintptr_t)m->mod_mp) == -1) {
2508 		mdb_warn("couldn't read modctl %p's module", addr);
2509 		return (WALK_NEXT);
2510 	}
2511 
2512 	if (mdb_readstr(name, sizeof (name), (uintptr_t)m->mod_modname) == -1)
2513 		(void) mdb_snprintf(name, sizeof (name), "0x%p", addr);
2514 
2515 	whatis_modctl_match(w, name,
2516 	    (uintptr_t)mod.text, mod.text_size, "text segment");
2517 	whatis_modctl_match(w, name,
2518 	    (uintptr_t)mod.data, mod.data_size, "data segment");
2519 	whatis_modctl_match(w, name,
2520 	    (uintptr_t)mod.bss, mod.bss_size, "bss segment");
2521 
2522 	if (mdb_vread(&shdr, sizeof (shdr), (uintptr_t)mod.symhdr) == -1) {
2523 		mdb_warn("couldn't read symbol header for %p's module", addr);
2524 		return (WALK_NEXT);
2525 	}
2526 
2527 	whatis_modctl_match(w, name,
2528 	    (uintptr_t)mod.symtbl, mod.nsyms * shdr.sh_entsize, "symtab");
2529 	whatis_modctl_match(w, name,
2530 	    (uintptr_t)mod.symspace, mod.symsize, "symtab");
2531 
2532 	return (WHATIS_WALKRET(w));
2533 }
2534 
2535 /*ARGSUSED*/
2536 static int
2537 whatis_walk_memseg(uintptr_t addr, const struct memseg *seg, mdb_whatis_t *w)
2538 {
2539 	uintptr_t cur;
2540 
2541 	uintptr_t base = (uintptr_t)seg->pages;
2542 	size_t size = (uintptr_t)seg->epages - base;
2543 
2544 	while (mdb_whatis_match(w, base, size, &cur)) {
2545 		/* round our found pointer down to the page_t base. */
2546 		size_t offset = (cur - base) % sizeof (page_t);
2547 
2548 		mdb_whatis_report_object(w, cur, cur - offset,
2549 		    "allocated as a page structure\n");
2550 	}
2551 
2552 	return (WHATIS_WALKRET(w));
2553 }
2554 
2555 /*ARGSUSED*/
2556 static int
2557 whatis_run_modules(mdb_whatis_t *w, void *arg)
2558 {
2559 	if (mdb_walk("modctl", (mdb_walk_cb_t)whatis_walk_modctl, w) == -1) {
2560 		mdb_warn("couldn't find modctl walker");
2561 		return (1);
2562 	}
2563 	return (0);
2564 }
2565 
2566 /*ARGSUSED*/
2567 static int
2568 whatis_run_threads(mdb_whatis_t *w, void *ignored)
2569 {
2570 	/*
2571 	 * Now search all thread stacks.  Yes, this is a little weak; we
2572 	 * can save a lot of work by first checking to see if the
2573 	 * address is in segkp vs. segkmem.  But hey, computers are
2574 	 * fast.
2575 	 */
2576 	if (mdb_walk("thread", (mdb_walk_cb_t)whatis_walk_thread, w) == -1) {
2577 		mdb_warn("couldn't find thread walker");
2578 		return (1);
2579 	}
2580 	return (0);
2581 }
2582 
2583 /*ARGSUSED*/
2584 static int
2585 whatis_run_pages(mdb_whatis_t *w, void *ignored)
2586 {
2587 	if (mdb_walk("memseg", (mdb_walk_cb_t)whatis_walk_memseg, w) == -1) {
2588 		mdb_warn("couldn't find memseg walker");
2589 		return (1);
2590 	}
2591 	return (0);
2592 }
2593 
2594 /*ARGSUSED*/
2595 static int
2596 whatis_run_kmem(mdb_whatis_t *w, void *ignored)
2597 {
2598 	whatis_info_t wi;
2599 
2600 	bzero(&wi, sizeof (wi));
2601 	wi.wi_w = w;
2602 
2603 	if (mdb_readvar(&wi.wi_msb_arena, "kmem_msb_arena") == -1)
2604 		mdb_warn("unable to readvar \"kmem_msb_arena\"");
2605 
2606 	if (mdb_readvar(&wi.wi_kmem_lite_count,
2607 	    "kmem_lite_count") == -1 || wi.wi_kmem_lite_count > 16)
2608 		wi.wi_kmem_lite_count = 0;
2609 
2610 	/*
2611 	 * We process kmem caches in the following order:
2612 	 *
2613 	 *	non-KMC_NOTOUCH, non-metadata	(typically the most interesting)
2614 	 *	metadata			(can be huge with KMF_AUDIT)
2615 	 *	KMC_NOTOUCH, non-metadata	(see kmem_walk_all())
2616 	 */
2617 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_touch,
2618 	    &wi) == -1 ||
2619 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_metadata,
2620 	    &wi) == -1 ||
2621 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_notouch,
2622 	    &wi) == -1) {
2623 		mdb_warn("couldn't find kmem_cache walker");
2624 		return (1);
2625 	}
2626 	return (0);
2627 }
2628 
2629 /*ARGSUSED*/
2630 static int
2631 whatis_run_vmem(mdb_whatis_t *w, void *ignored)
2632 {
2633 	whatis_info_t wi;
2634 
2635 	bzero(&wi, sizeof (wi));
2636 	wi.wi_w = w;
2637 
2638 	if (mdb_walk("vmem_postfix",
2639 	    (mdb_walk_cb_t)whatis_walk_vmem, &wi) == -1) {
2640 		mdb_warn("couldn't find vmem_postfix walker");
2641 		return (1);
2642 	}
2643 	return (0);
2644 }
2645 
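/*
 * Illustrative usage (not part of the module): the whatis_run_* handlers
 * above back the generic ::whatis dcmd, so a single query consults the
 * kmem caches, vmem arenas, thread stacks, modules, and page structures
 * (hypothetical address):
 *
 *	> 0xffffff01d7c3a000::whatis -v
 */
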
2646 typedef struct kmem_log_cpu {
2647 	uintptr_t kmc_low;
2648 	uintptr_t kmc_high;
2649 } kmem_log_cpu_t;
2650 
2651 typedef struct kmem_log_data {
2652 	uintptr_t kmd_addr;
2653 	kmem_log_cpu_t *kmd_cpu;
2654 } kmem_log_data_t;
2655 
2656 int
2657 kmem_log_walk(uintptr_t addr, const kmem_bufctl_audit_t *b,
2658     kmem_log_data_t *kmd)
2659 {
2660 	int i;
2661 	kmem_log_cpu_t *kmc = kmd->kmd_cpu;
2662 	size_t bufsize;
2663 
2664 	for (i = 0; i < NCPU; i++) {
2665 		if (addr >= kmc[i].kmc_low && addr < kmc[i].kmc_high)
2666 			break;
2667 	}
2668 
2669 	if (kmd->kmd_addr) {
2670 		if (b->bc_cache == NULL)
2671 			return (WALK_NEXT);
2672 
2673 		if (mdb_vread(&bufsize, sizeof (bufsize),
2674 		    (uintptr_t)&b->bc_cache->cache_bufsize) == -1) {
2675 			mdb_warn(
2676 			    "failed to read cache_bufsize for cache at %p",
2677 			    b->bc_cache);
2678 			return (WALK_ERR);
2679 		}
2680 
2681 		if (kmd->kmd_addr < (uintptr_t)b->bc_addr ||
2682 		    kmd->kmd_addr >= (uintptr_t)b->bc_addr + bufsize)
2683 			return (WALK_NEXT);
2684 	}
2685 
2686 	if (i == NCPU)
2687 		mdb_printf("   ");
2688 	else
2689 		mdb_printf("%3d", i);
2690 
2691 	mdb_printf(" %0?p %0?p %16llx %0?p\n", addr, b->bc_addr,
2692 	    b->bc_timestamp, b->bc_thread);
2693 
2694 	return (WALK_NEXT);
2695 }
2696 
2697 /*ARGSUSED*/
2698 int
2699 kmem_log(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2700 {
2701 	kmem_log_header_t lh;
2702 	kmem_cpu_log_header_t clh;
2703 	uintptr_t lhp, clhp;
2704 	int ncpus;
2705 	uintptr_t *cpu;
2706 	GElf_Sym sym;
2707 	kmem_log_cpu_t *kmc;
2708 	int i;
2709 	kmem_log_data_t kmd;
2710 	uint_t opt_b = FALSE;
2711 
2712 	if (mdb_getopts(argc, argv,
2713 	    'b', MDB_OPT_SETBITS, TRUE, &opt_b, NULL) != argc)
2714 		return (DCMD_USAGE);
2715 
2716 	if (mdb_readvar(&lhp, "kmem_transaction_log") == -1) {
2717 		mdb_warn("failed to read 'kmem_transaction_log'");
2718 		return (DCMD_ERR);
2719 	}
2720 
2721 	if (lhp == 0) {
2722 		mdb_warn("no kmem transaction log\n");
2723 		return (DCMD_ERR);
2724 	}
2725 
2726 	mdb_readvar(&ncpus, "ncpus");
2727 
2728 	if (mdb_vread(&lh, sizeof (kmem_log_header_t), lhp) == -1) {
2729 		mdb_warn("failed to read log header at %p", lhp);
2730 		return (DCMD_ERR);
2731 	}
2732 
2733 	clhp = lhp + ((uintptr_t)&lh.lh_cpu[0] - (uintptr_t)&lh);
2734 
2735 	cpu = mdb_alloc(sizeof (uintptr_t) * NCPU, UM_SLEEP | UM_GC);
2736 
2737 	if (mdb_lookup_by_name("cpu", &sym) == -1) {
2738 		mdb_warn("couldn't find 'cpu' array");
2739 		return (DCMD_ERR);
2740 	}
2741 
2742 	if (sym.st_size != NCPU * sizeof (uintptr_t)) {
2743 		mdb_warn("expected 'cpu' to be of size %d; found %d\n",
2744 		    NCPU * sizeof (uintptr_t), sym.st_size);
2745 		return (DCMD_ERR);
2746 	}
2747 
2748 	if (mdb_vread(cpu, sym.st_size, (uintptr_t)sym.st_value) == -1) {
2749 		mdb_warn("failed to read cpu array at %p", sym.st_value);
2750 		return (DCMD_ERR);
2751 	}
2752 
2753 	kmc = mdb_zalloc(sizeof (kmem_log_cpu_t) * NCPU, UM_SLEEP | UM_GC);
2754 	kmd.kmd_addr = 0;
2755 	kmd.kmd_cpu = kmc;
2756 
2757 	for (i = 0; i < NCPU; i++) {
2758 
2759 		if (cpu[i] == 0)
2760 			continue;
2761 
2762 		if (mdb_vread(&clh, sizeof (clh), clhp) == -1) {
2763 			mdb_warn("cannot read cpu %d's log header at %p",
2764 			    i, clhp);
2765 			return (DCMD_ERR);
2766 		}
2767 
2768 		kmc[i].kmc_low = clh.clh_chunk * lh.lh_chunksize +
2769 		    (uintptr_t)lh.lh_base;
2770 		kmc[i].kmc_high = (uintptr_t)clh.clh_current;
2771 
2772 		clhp += sizeof (kmem_cpu_log_header_t);
2773 	}
2774 
2775 	mdb_printf("%3s %-?s %-?s %16s %-?s\n", "CPU", "ADDR", "BUFADDR",
2776 	    "TIMESTAMP", "THREAD");
2777 
2778 	/*
2779 	 * If we have been passed an address, print out only log entries
2780 	 * corresponding to that address: by default the address is taken to
2781 	 * be a bufctl; with -b, it is interpreted as a buffer address.
2782 	 */
2783 	if (flags & DCMD_ADDRSPEC) {
2784 		kmem_bufctl_audit_t b;
2785 
2786 		if (opt_b) {
2787 			kmd.kmd_addr = addr;
2788 		} else {
2789 			if (mdb_vread(&b,
2790 			    sizeof (kmem_bufctl_audit_t), addr) == -1) {
2791 				mdb_warn("failed to read bufctl at %p", addr);
2792 				return (DCMD_ERR);
2793 			}
2794 
2795 			(void) kmem_log_walk(addr, &b, &kmd);
2796 
2797 			return (DCMD_OK);
2798 		}
2799 	}
2800 
2801 	if (mdb_walk("kmem_log", (mdb_walk_cb_t)kmem_log_walk, &kmd) == -1) {
2802 		mdb_warn("can't find kmem log walker");
2803 		return (DCMD_ERR);
2804 	}
2805 
2806 	return (DCMD_OK);
2807 }
2808 
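/*
 * Illustrative usage (not part of the module), with made-up addresses:
 * "::kmem_log" dumps the entire transaction log; "addr::kmem_log" prints
 * the single entry for the bufctl at addr; "addr::kmem_log -b" prints the
 * entries whose buffer contains addr:
 *
 *	> 0xffffff01d8f1b000::kmem_log -b
 */
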
2809 typedef struct bufctl_history_cb {
2810 	int		bhc_flags;
2811 	int		bhc_argc;
2812 	const mdb_arg_t	*bhc_argv;
2813 	int		bhc_ret;
2814 } bufctl_history_cb_t;
2815 
2816 /*ARGSUSED*/
2817 static int
2818 bufctl_history_callback(uintptr_t addr, const void *ign, void *arg)
2819 {
2820 	bufctl_history_cb_t *bhc = arg;
2821 
2822 	bhc->bhc_ret =
2823 	    bufctl(addr, bhc->bhc_flags, bhc->bhc_argc, bhc->bhc_argv);
2824 
2825 	bhc->bhc_flags &= ~DCMD_LOOPFIRST;
2826 
2827 	return ((bhc->bhc_ret == DCMD_OK)? WALK_NEXT : WALK_DONE);
2828 }
2829 
2830 void
2831 bufctl_help(void)
2832 {
2833 	mdb_printf("%s",
2834 "Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n");
2835 	mdb_dec_indent(2);
2836 	mdb_printf("%<b>OPTIONS%</b>\n");
2837 	mdb_inc_indent(2);
2838 	mdb_printf("%s",
2839 "  -v    Display the full content of the bufctl, including its stack trace\n"
2840 "  -h    retrieve the bufctl's transaction history, if available\n"
2841 "  -a addr\n"
2842 "        filter out bufctls not involving the buffer at addr\n"
2843 "  -c caller\n"
2844 "        filter out bufctls without the function/PC in their stack trace\n"
2845 "  -e earliest\n"
2846 "        filter out bufctls timestamped before earliest\n"
2847 "  -l latest\n"
2848 "        filter out bufctls timestamped after latest\n"
2849 "  -t thread\n"
2850 "        filter out bufctls not involving thread\n");
2851 }
2852 
2853 int
2854 bufctl(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2855 {
2856 	kmem_bufctl_audit_t bc;
2857 	uint_t verbose = FALSE;
2858 	uint_t history = FALSE;
2859 	uint_t in_history = FALSE;
2860 	uintptr_t caller = 0, thread = 0;
2861 	uintptr_t laddr, haddr, baddr = 0;
2862 	hrtime_t earliest = 0, latest = 0;
2863 	int i, depth;
2864 	char c[MDB_SYM_NAMLEN];
2865 	GElf_Sym sym;
2866 
2867 	if (mdb_getopts(argc, argv,
2868 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
2869 	    'h', MDB_OPT_SETBITS, TRUE, &history,
2870 	    'H', MDB_OPT_SETBITS, TRUE, &in_history,		/* internal */
2871 	    'c', MDB_OPT_UINTPTR, &caller,
2872 	    't', MDB_OPT_UINTPTR, &thread,
2873 	    'e', MDB_OPT_UINT64, &earliest,
2874 	    'l', MDB_OPT_UINT64, &latest,
2875 	    'a', MDB_OPT_UINTPTR, &baddr, NULL) != argc)
2876 		return (DCMD_USAGE);
2877 
2878 	if (!(flags & DCMD_ADDRSPEC))
2879 		return (DCMD_USAGE);
2880 
2881 	if (in_history && !history)
2882 		return (DCMD_USAGE);
2883 
2884 	if (history && !in_history) {
2885 		mdb_arg_t *nargv = mdb_zalloc(sizeof (*nargv) * (argc + 1),
2886 		    UM_SLEEP | UM_GC);
2887 		bufctl_history_cb_t bhc;
2888 
2889 		nargv[0].a_type = MDB_TYPE_STRING;
2890 		nargv[0].a_un.a_str = "-H";		/* prevent recursion */
2891 
2892 		for (i = 0; i < argc; i++)
2893 			nargv[i + 1] = argv[i];
2894 
2895 		/*
2896 		 * When in history mode, we treat each element as if it
2897 		 * were in a separate loop, so that the headers group
2898 		 * bufctls with similar histories.
2899 		 */
2900 		bhc.bhc_flags = flags | DCMD_LOOP | DCMD_LOOPFIRST;
2901 		bhc.bhc_argc = argc + 1;
2902 		bhc.bhc_argv = nargv;
2903 		bhc.bhc_ret = DCMD_OK;
2904 
2905 		if (mdb_pwalk("bufctl_history", bufctl_history_callback, &bhc,
2906 		    addr) == -1) {
2907 			mdb_warn("unable to walk bufctl_history");
2908 			return (DCMD_ERR);
2909 		}
2910 
2911 		if (bhc.bhc_ret == DCMD_OK && !(flags & DCMD_PIPE_OUT))
2912 			mdb_printf("\n");
2913 
2914 		return (bhc.bhc_ret);
2915 	}
2916 
2917 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
2918 		if (verbose) {
2919 			mdb_printf("%16s %16s %16s %16s\n"
2920 			    "%<u>%16s %16s %16s %16s%</u>\n",
2921 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD",
2922 			    "", "CACHE", "LASTLOG", "CONTENTS");
2923 		} else {
2924 			mdb_printf("%<u>%-?s %-?s %-12s %-?s %s%</u>\n",
2925 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD", "CALLER");
2926 		}
2927 	}
2928 
2929 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2930 		mdb_warn("couldn't read bufctl at %p", addr);
2931 		return (DCMD_ERR);
2932 	}
2933 
2934 	/*
2935 	 * Guard against bogus bc_depth in case the bufctl is corrupt or
2936 	 * the address does not really refer to a bufctl.
2937 	 */
2938 	depth = MIN(bc.bc_depth, KMEM_STACK_DEPTH);
2939 
2940 	if (caller != 0) {
2941 		laddr = caller;
2942 		haddr = caller + sizeof (caller);
2943 
2944 		if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c, sizeof (c),
2945 		    &sym) != -1 && caller == (uintptr_t)sym.st_value) {
2946 			/*
2947 			 * We were provided an exact symbol value; any
2948 			 * address in the function is valid.
2949 			 */
2950 			laddr = (uintptr_t)sym.st_value;
2951 			haddr = (uintptr_t)sym.st_value + sym.st_size;
2952 		}
2953 
2954 		for (i = 0; i < depth; i++)
2955 			if (bc.bc_stack[i] >= laddr && bc.bc_stack[i] < haddr)
2956 				break;
2957 
2958 		if (i == depth)
2959 			return (DCMD_OK);
2960 	}
2961 
2962 	if (thread != 0 && (uintptr_t)bc.bc_thread != thread)
2963 		return (DCMD_OK);
2964 
2965 	if (earliest != 0 && bc.bc_timestamp < earliest)
2966 		return (DCMD_OK);
2967 
2968 	if (latest != 0 && bc.bc_timestamp > latest)
2969 		return (DCMD_OK);
2970 
2971 	if (baddr != 0 && (uintptr_t)bc.bc_addr != baddr)
2972 		return (DCMD_OK);
2973 
2974 	if (flags & DCMD_PIPE_OUT) {
2975 		mdb_printf("%#lr\n", addr);
2976 		return (DCMD_OK);
2977 	}
2978 
2979 	if (verbose) {
2980 		mdb_printf(
2981 		    "%<b>%16p%</b> %16p %16llx %16p\n"
2982 		    "%16s %16p %16p %16p\n",
2983 		    addr, bc.bc_addr, bc.bc_timestamp, bc.bc_thread,
2984 		    "", bc.bc_cache, bc.bc_lastlog, bc.bc_contents);
2985 
2986 		mdb_inc_indent(17);
2987 		for (i = 0; i < depth; i++)
2988 			mdb_printf("%a\n", bc.bc_stack[i]);
2989 		mdb_dec_indent(17);
2990 		mdb_printf("\n");
2991 	} else {
2992 		mdb_printf("%0?p %0?p %12llx %0?p", addr, bc.bc_addr,
2993 		    bc.bc_timestamp, bc.bc_thread);
2994 
2995 		for (i = 0; i < depth; i++) {
2996 			if (mdb_lookup_by_addr(bc.bc_stack[i],
2997 			    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2998 				continue;
2999 			if (strncmp(c, "kmem_", 5) == 0)
3000 				continue;
3001 			mdb_printf(" %a\n", bc.bc_stack[i]);
3002 			break;
3003 		}
3004 
3005 		if (i >= depth)
3006 			mdb_printf("\n");
3007 	}
3008 
3009 	return (DCMD_OK);
3010 }
3011 
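/*
 * Illustrative usage (not part of the module): ::bufctl is typically fed
 * from a walker and filtered, e.g. restricting the transaction log to a
 * single thread (hypothetical thread pointer):
 *
 *	> ::walk kmem_log | ::bufctl -t 0xffffff01d6aa0c00
 */
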
3012 typedef struct kmem_verify {
3013 	uint64_t *kmv_buf;		/* buffer to read cache contents into */
3014 	size_t kmv_size;		/* number of bytes in kmv_buf */
3015 	int kmv_corruption;		/* > 0 if corruption found. */
3016 	uint_t kmv_flags;		/* dcmd flags */
3017 	struct kmem_cache kmv_cache;	/* the cache we're operating on */
3018 } kmem_verify_t;
3019 
3020 /*
3021  * verify_pattern()
3022  *	verify that buf is filled with the pattern pat.
3023  */
3024 static int64_t
3025 verify_pattern(uint64_t *buf_arg, size_t size, uint64_t pat)
3026 {
3027 	/*LINTED*/
3028 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
3029 	uint64_t *buf;
3030 
3031 	for (buf = buf_arg; buf < bufend; buf++)
3032 		if (*buf != pat)
3033 			return ((uintptr_t)buf - (uintptr_t)buf_arg);
3034 	return (-1);
3035 }
3036 
3037 /*
3038  * verify_buftag()
3039  *	verify that btp->bt_bxstat == (bcp ^ pat)
3040  */
3041 static int
3042 verify_buftag(kmem_buftag_t *btp, uintptr_t pat)
3043 {
3044 	return (btp->bt_bxstat == ((intptr_t)btp->bt_bufctl ^ pat) ? 0 : -1);
3045 }
3046 
3047 /*
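/*
 * Worked example (values are hypothetical): for an allocated buffer whose
 * buftag has bt_bufctl == 0xffffff01deadb100, a consistent bt_bxstat is
 * 0xffffff01deadb100 ^ KMEM_BUFTAG_ALLOC; anything else makes
 * verify_buftag() return -1 and the buffer is counted as corrupt.
 */
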
3048  * verify_free()
3049  *	verify the integrity of a free block of memory by checking
3050  *	that it is filled with 0xdeadbeef and that its buftag is sane.
3051  */
3052 /*ARGSUSED1*/
3053 static int
3054 verify_free(uintptr_t addr, const void *data, void *private)
3055 {
3056 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3057 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3058 	int64_t corrupt;		/* corruption offset */
3059 	kmem_buftag_t *buftagp;		/* ptr to buftag */
3060 	kmem_cache_t *cp = &kmv->kmv_cache;
3061 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3062 
3063 	/*LINTED*/
3064 	buftagp = KMEM_BUFTAG(cp, buf);
3065 
3066 	/*
3067 	 * Read the buffer to check.
3068 	 */
3069 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3070 		if (!besilent)
3071 			mdb_warn("couldn't read %p", addr);
3072 		return (WALK_NEXT);
3073 	}
3074 
3075 	if ((corrupt = verify_pattern(buf, cp->cache_verify,
3076 	    KMEM_FREE_PATTERN)) >= 0) {
3077 		if (!besilent)
3078 			mdb_printf("buffer %p (free) seems corrupted, at %p\n",
3079 			    addr, (uintptr_t)addr + corrupt);
3080 		goto corrupt;
3081 	}
3082 	/*
3083 	 * When KMF_LITE is set, buftagp->bt_redzone is used to hold
3084 	 * the first bytes of the buffer, hence we cannot check for red
3085 	 * zone corruption.
3086 	 */
3087 	if ((cp->cache_flags & (KMF_HASH | KMF_LITE)) == KMF_HASH &&
3088 	    buftagp->bt_redzone != KMEM_REDZONE_PATTERN) {
3089 		if (!besilent)
3090 			mdb_printf("buffer %p (free) seems to "
3091 			    "have a corrupt redzone pattern\n", addr);
3092 		goto corrupt;
3093 	}
3094 
3095 	/*
3096 	 * confirm bufctl pointer integrity.
3097 	 */
3098 	if (verify_buftag(buftagp, KMEM_BUFTAG_FREE) == -1) {
3099 		if (!besilent)
3100 			mdb_printf("buffer %p (free) has a corrupt "
3101 			    "buftag\n", addr);
3102 		goto corrupt;
3103 	}
3104 
3105 	return (WALK_NEXT);
3106 corrupt:
3107 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3108 		mdb_printf("%p\n", addr);
3109 	kmv->kmv_corruption++;
3110 	return (WALK_NEXT);
3111 }
3112 
3113 /*
3114  * verify_alloc()
3115  *	Verify that the buftag of an allocated buffer makes sense with respect
3116  *	to the buffer.
3117  */
3118 /*ARGSUSED1*/
3119 static int
3120 verify_alloc(uintptr_t addr, const void *data, void *private)
3121 {
3122 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3123 	kmem_cache_t *cp = &kmv->kmv_cache;
3124 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3125 	/*LINTED*/
3126 	kmem_buftag_t *buftagp = KMEM_BUFTAG(cp, buf);
3127 	uint32_t *ip = (uint32_t *)buftagp;
3128 	uint8_t *bp = (uint8_t *)buf;
3129 	int looks_ok = 0, size_ok = 1;	/* flags for finding corruption */
3130 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3131 
3132 	/*
3133 	 * Read the buffer to check.
3134 	 */
3135 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3136 		if (!besilent)
3137 			mdb_warn("couldn't read %p", addr);
3138 		return (WALK_NEXT);
3139 	}
3140 
3141 	/*
3142 	 * There are two cases to handle:
3143 	 * 1. If the buf was alloc'd using kmem_cache_alloc, it will have
3144 	 *    0xfeedfacefeedface at the end of it
3145 	 * 2. If the buf was alloc'd using kmem_alloc, it will have
3146 	 *    0xbb just past the end of the region in use.  At the buftag,
3147 	 *    it will have 0xfeedface (or, if the whole buffer is in use,
3148 	 *    0xfeedface & bb000000 or 0xfeedfacf & 000000bb depending on
3149 	 *    endianness), followed by 32 bits containing the offset of the
3150 	 *    0xbb byte in the buffer.
3151 	 *
3152 	 * Finally, the two 32-bit words that comprise the second half of the
3153 	 * buftag should xor to KMEM_BUFTAG_ALLOC
3154 	 */
3155 
3156 	if (buftagp->bt_redzone == KMEM_REDZONE_PATTERN)
3157 		looks_ok = 1;
3158 	else if (!KMEM_SIZE_VALID(ip[1]))
3159 		size_ok = 0;
3160 	else if (bp[KMEM_SIZE_DECODE(ip[1])] == KMEM_REDZONE_BYTE)
3161 		looks_ok = 1;
3162 	else
3163 		size_ok = 0;
3164 
3165 	if (!size_ok) {
3166 		if (!besilent)
3167 			mdb_printf("buffer %p (allocated) has a corrupt "
3168 			    "redzone size encoding\n", addr);
3169 		goto corrupt;
3170 	}
3171 
3172 	if (!looks_ok) {
3173 		if (!besilent)
3174 			mdb_printf("buffer %p (allocated) has a corrupt "
3175 			    "redzone signature\n", addr);
3176 		goto corrupt;
3177 	}
3178 
3179 	if (verify_buftag(buftagp, KMEM_BUFTAG_ALLOC) == -1) {
3180 		if (!besilent)
3181 			mdb_printf("buffer %p (allocated) has a "
3182 			    "corrupt buftag\n", addr);
3183 		goto corrupt;
3184 	}
3185 
3186 	return (WALK_NEXT);
3187 corrupt:
3188 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3189 		mdb_printf("%p\n", addr);
3190 
3191 	kmv->kmv_corruption++;
3192 	return (WALK_NEXT);
3193 }
3194 
3195 /*ARGSUSED2*/
3196 int
3197 kmem_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3198 {
3199 	if (flags & DCMD_ADDRSPEC) {
3200 		int check_alloc = 0, check_free = 0;
3201 		kmem_verify_t kmv;
3202 
3203 		if (mdb_vread(&kmv.kmv_cache, sizeof (kmv.kmv_cache),
3204 		    addr) == -1) {
3205 			mdb_warn("couldn't read kmem_cache %p", addr);
3206 			return (DCMD_ERR);
3207 		}
3208 
3209 		if ((kmv.kmv_cache.cache_dump.kd_unsafe ||
3210 		    kmv.kmv_cache.cache_dump.kd_alloc_fails) &&
3211 		    !(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3212 			mdb_warn("WARNING: cache was used during dump: "
3213 			    "corruption may be incorrectly reported\n");
3214 		}
3215 
3216 		kmv.kmv_size = kmv.kmv_cache.cache_buftag +
3217 		    sizeof (kmem_buftag_t);
3218 		kmv.kmv_buf = mdb_alloc(kmv.kmv_size, UM_SLEEP | UM_GC);
3219 		kmv.kmv_corruption = 0;
3220 		kmv.kmv_flags = flags;
3221 
3222 		if ((kmv.kmv_cache.cache_flags & KMF_REDZONE)) {
3223 			check_alloc = 1;
3224 			if (kmv.kmv_cache.cache_flags & KMF_DEADBEEF)
3225 				check_free = 1;
3226 		} else {
3227 			if (!(flags & DCMD_LOOP)) {
3228 				mdb_warn("cache %p (%s) does not have "
3229 				    "redzone checking enabled\n", addr,
3230 				    kmv.kmv_cache.cache_name);
3231 			}
3232 			return (DCMD_ERR);
3233 		}
3234 
3235 		if (!(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3236 			mdb_printf("Summary for cache '%s'\n",
3237 			    kmv.kmv_cache.cache_name);
3238 			mdb_inc_indent(2);
3239 		}
3240 
3241 		if (check_alloc)
3242 			(void) mdb_pwalk("kmem", verify_alloc, &kmv, addr);
3243 		if (check_free)
3244 			(void) mdb_pwalk("freemem", verify_free, &kmv, addr);
3245 
3246 		if (!(flags & DCMD_PIPE_OUT)) {
3247 			if (flags & DCMD_LOOP) {
3248 				if (kmv.kmv_corruption == 0) {
3249 					mdb_printf("%-*s %?p clean\n",
3250 					    KMEM_CACHE_NAMELEN,
3251 					    kmv.kmv_cache.cache_name, addr);
3252 				} else {
3253 					mdb_printf("%-*s %?p %d corrupt "
3254 					    "buffer%s\n", KMEM_CACHE_NAMELEN,
3255 					    kmv.kmv_cache.cache_name, addr,
3256 					    kmv.kmv_corruption,
3257 					    kmv.kmv_corruption > 1 ? "s" : "");
3258 				}
3259 			} else {
3260 				/*
3261 				 * This is the more verbose mode, when the user
3262 				 * typed addr::kmem_verify.  If the cache was
3263 				 * clean, nothing will have yet been printed. So
3264 				 * say something.
3265 				 */
3266 				if (kmv.kmv_corruption == 0)
3267 					mdb_printf("clean\n");
3268 
3269 				mdb_dec_indent(2);
3270 			}
3271 		}
3272 	} else {
3273 		/*
3274 		 * If the user didn't specify a cache to verify, we'll walk all
3275 		 * kmem_cache's, specifying ourself as a callback for each...
3276 		 * this is the equivalent of '::walk kmem_cache .::kmem_verify'
3277 		 */
3278 
3279 		if (!(flags & DCMD_PIPE_OUT)) {
3280 			uintptr_t dump_curr;
3281 			uintptr_t dump_end;
3282 
3283 			if (mdb_readvar(&dump_curr, "kmem_dump_curr") != -1 &&
3284 			    mdb_readvar(&dump_end, "kmem_dump_end") != -1 &&
3285 			    dump_curr == dump_end) {
3286 				mdb_warn("WARNING: exceeded kmem_dump_size; "
3287 				    "corruption may be incorrectly reported\n");
3288 			}
3289 
3290 			mdb_printf("%<u>%-*s %-?s %-20s%</u>\n",
3291 			    KMEM_CACHE_NAMELEN, "Cache Name", "Addr",
3292 			    "Cache Integrity");
3293 		}
3294 
3295 		(void) (mdb_walk_dcmd("kmem_cache", "kmem_verify", 0, NULL));
3296 	}
3297 
3298 	return (DCMD_OK);
3299 }
3300 
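/*
 * Illustrative usage (not part of the module): "::kmem_verify" with no
 * address checks every cache and prints a one-line verdict per cache,
 * while "addr::kmem_verify" examines a single cache in detail
 * (hypothetical cache address and name):
 *
 *	> 0xffffff01d5a8a008::kmem_verify
 *	Summary for cache 'kmem_alloc_64'
 *	  clean
 */
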
3301 typedef struct vmem_node {
3302 	struct vmem_node *vn_next;
3303 	struct vmem_node *vn_parent;
3304 	struct vmem_node *vn_sibling;
3305 	struct vmem_node *vn_children;
3306 	uintptr_t vn_addr;
3307 	int vn_marked;
3308 	vmem_t vn_vmem;
3309 } vmem_node_t;
3310 
3311 typedef struct vmem_walk {
3312 	vmem_node_t *vw_root;
3313 	vmem_node_t *vw_current;
3314 } vmem_walk_t;
3315 
3316 int
3317 vmem_walk_init(mdb_walk_state_t *wsp)
3318 {
3319 	uintptr_t vaddr, paddr;
3320 	vmem_node_t *head = NULL, *root = NULL, *current = NULL, *parent, *vp;
3321 	vmem_walk_t *vw;
3322 
3323 	if (mdb_readvar(&vaddr, "vmem_list") == -1) {
3324 		mdb_warn("couldn't read 'vmem_list'");
3325 		return (WALK_ERR);
3326 	}
3327 
3328 	while (vaddr != 0) {
3329 		vp = mdb_zalloc(sizeof (vmem_node_t), UM_SLEEP);
3330 		vp->vn_addr = vaddr;
3331 		vp->vn_next = head;
3332 		head = vp;
3333 
3334 		if (vaddr == wsp->walk_addr)
3335 			current = vp;
3336 
3337 		if (mdb_vread(&vp->vn_vmem, sizeof (vmem_t), vaddr) == -1) {
3338 			mdb_warn("couldn't read vmem_t at %p", vaddr);
3339 			goto err;
3340 		}
3341 
3342 		vaddr = (uintptr_t)vp->vn_vmem.vm_next;
3343 	}
3344 
3345 	for (vp = head; vp != NULL; vp = vp->vn_next) {
3346 
3347 		if ((paddr = (uintptr_t)vp->vn_vmem.vm_source) == 0) {
3348 			vp->vn_sibling = root;
3349 			root = vp;
3350 			continue;
3351 		}
3352 
3353 		for (parent = head; parent != NULL; parent = parent->vn_next) {
3354 			if (parent->vn_addr != paddr)
3355 				continue;
3356 			vp->vn_sibling = parent->vn_children;
3357 			parent->vn_children = vp;
3358 			vp->vn_parent = parent;
3359 			break;
3360 		}
3361 
3362 		if (parent == NULL) {
3363 			mdb_warn("couldn't find %p's parent (%p)\n",
3364 			    vp->vn_addr, paddr);
3365 			goto err;
3366 		}
3367 	}
3368 
3369 	vw = mdb_zalloc(sizeof (vmem_walk_t), UM_SLEEP);
3370 	vw->vw_root = root;
3371 
3372 	if (current != NULL)
3373 		vw->vw_current = current;
3374 	else
3375 		vw->vw_current = root;
3376 
3377 	wsp->walk_data = vw;
3378 	return (WALK_NEXT);
3379 err:
3380 	for (vp = head; head != NULL; vp = head) {
3381 		head = vp->vn_next;
3382 		mdb_free(vp, sizeof (vmem_node_t));
3383 	}
3384 
3385 	return (WALK_ERR);
3386 }
3387 
3388 int
3389 vmem_walk_step(mdb_walk_state_t *wsp)
3390 {
3391 	vmem_walk_t *vw = wsp->walk_data;
3392 	vmem_node_t *vp;
3393 	int rval;
3394 
3395 	if ((vp = vw->vw_current) == NULL)
3396 		return (WALK_DONE);
3397 
3398 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3399 
3400 	if (vp->vn_children != NULL) {
3401 		vw->vw_current = vp->vn_children;
3402 		return (rval);
3403 	}
3404 
3405 	do {
3406 		vw->vw_current = vp->vn_sibling;
3407 		vp = vp->vn_parent;
3408 	} while (vw->vw_current == NULL && vp != NULL);
3409 
3410 	return (rval);
3411 }
3412 
3413 /*
3414  * The "vmem_postfix" walk walks the vmem arenas in post-fix order; all
3415  * children are visited before their parent.  We perform the postfix walk
3416  * iteratively (rather than recursively) to allow mdb to regain control
3417  * after each callback.
3418  */
3419 int
3420 vmem_postfix_walk_step(mdb_walk_state_t *wsp)
3421 {
3422 	vmem_walk_t *vw = wsp->walk_data;
3423 	vmem_node_t *vp = vw->vw_current;
3424 	int rval;
3425 
3426 	/*
3427 	 * If this node is marked, then we know that we have already visited
3428 	 * all of its children.  If the node has any siblings, they need to
3429 	 * be visited next; otherwise, we need to visit the parent.  Note
3430 	 * that vp->vn_marked will only be zero on the first invocation of
3431 	 * the step function.
3432 	 */
3433 	if (vp->vn_marked) {
3434 		if (vp->vn_sibling != NULL)
3435 			vp = vp->vn_sibling;
3436 		else if (vp->vn_parent != NULL)
3437 			vp = vp->vn_parent;
3438 		else {
3439 			/*
3440 			 * We have neither a parent, nor a sibling, and we
3441 			 * have already been visited; we're done.
3442 			 */
3443 			return (WALK_DONE);
3444 		}
3445 	}
3446 
3447 	/*
3448 	 * Before we visit this node, visit its children.
3449 	 */
3450 	while (vp->vn_children != NULL && !vp->vn_children->vn_marked)
3451 		vp = vp->vn_children;
3452 
3453 	vp->vn_marked = 1;
3454 	vw->vw_current = vp;
3455 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3456 
3457 	return (rval);
3458 }
3459 
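/*
 * Illustrative usage (not part of the module): the vmem_postfix walker
 * visits every arena, children before their parents, e.g.:
 *
 *	> ::walk vmem_postfix | ::vmem
 */
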
3460 void
3461 vmem_walk_fini(mdb_walk_state_t *wsp)
3462 {
3463 	vmem_walk_t *vw = wsp->walk_data;
3464 	vmem_node_t *root = vw->vw_root;
3465 	int done;
3466 
3467 	if (root == NULL)
3468 		return;
3469 
3470 	if ((vw->vw_root = root->vn_children) != NULL)
3471 		vmem_walk_fini(wsp);
3472 
3473 	vw->vw_root = root->vn_sibling;
3474 	done = (root->vn_sibling == NULL && root->vn_parent == NULL);
3475 	mdb_free(root, sizeof (vmem_node_t));
3476 
3477 	if (done) {
3478 		mdb_free(vw, sizeof (vmem_walk_t));
3479 	} else {
3480 		vmem_walk_fini(wsp);
3481 	}
3482 }
3483 
3484 typedef struct vmem_seg_walk {
3485 	uint8_t vsw_type;
3486 	uintptr_t vsw_start;
3487 	uintptr_t vsw_current;
3488 } vmem_seg_walk_t;
3489 
3490 /*ARGSUSED*/
3491 int
3492 vmem_seg_walk_common_init(mdb_walk_state_t *wsp, uint8_t type, char *name)
3493 {
3494 	vmem_seg_walk_t *vsw;
3495 
3496 	if (wsp->walk_addr == 0) {
3497 		mdb_warn("vmem_%s does not support global walks\n", name);
3498 		return (WALK_ERR);
3499 	}
3500 
3501 	wsp->walk_data = vsw = mdb_alloc(sizeof (vmem_seg_walk_t), UM_SLEEP);
3502 
3503 	vsw->vsw_type = type;
3504 	vsw->vsw_start = wsp->walk_addr + offsetof(vmem_t, vm_seg0);
3505 	vsw->vsw_current = vsw->vsw_start;
3506 
3507 	return (WALK_NEXT);
3508 }
3509 
3510 /*
3511  * vmem segments can't have type 0 (this should be added to vmem_impl.h).
3512  */
3513 #define	VMEM_NONE	0
3514 
3515 int
3516 vmem_alloc_walk_init(mdb_walk_state_t *wsp)
3517 {
3518 	return (vmem_seg_walk_common_init(wsp, VMEM_ALLOC, "alloc"));
3519 }
3520 
3521 int
3522 vmem_free_walk_init(mdb_walk_state_t *wsp)
3523 {
3524 	return (vmem_seg_walk_common_init(wsp, VMEM_FREE, "free"));
3525 }
3526 
3527 int
3528 vmem_span_walk_init(mdb_walk_state_t *wsp)
3529 {
3530 	return (vmem_seg_walk_common_init(wsp, VMEM_SPAN, "span"));
3531 }
3532 
3533 int
3534 vmem_seg_walk_init(mdb_walk_state_t *wsp)
3535 {
3536 	return (vmem_seg_walk_common_init(wsp, VMEM_NONE, "seg"));
3537 }
3538 
3539 int
3540 vmem_seg_walk_step(mdb_walk_state_t *wsp)
3541 {
3542 	vmem_seg_t seg;
3543 	vmem_seg_walk_t *vsw = wsp->walk_data;
3544 	uintptr_t addr = vsw->vsw_current;
3545 	static size_t seg_size = 0;
3546 	int rval;
3547 
3548 	if (!seg_size) {
3549 		if (mdb_readvar(&seg_size, "vmem_seg_size") == -1) {
3550 			mdb_warn("failed to read 'vmem_seg_size'");
3551 			seg_size = sizeof (vmem_seg_t);
3552 		}
3553 	}
3554 
3555 	if (seg_size < sizeof (seg))
3556 		bzero((caddr_t)&seg + seg_size, sizeof (seg) - seg_size);
3557 
3558 	if (mdb_vread(&seg, seg_size, addr) == -1) {
3559 		mdb_warn("couldn't read vmem_seg at %p", addr);
3560 		return (WALK_ERR);
3561 	}
3562 
3563 	vsw->vsw_current = (uintptr_t)seg.vs_anext;
3564 	if (vsw->vsw_type != VMEM_NONE && seg.vs_type != vsw->vsw_type) {
3565 		rval = WALK_NEXT;
3566 	} else {
3567 		rval = wsp->walk_callback(addr, &seg, wsp->walk_cbdata);
3568 	}
3569 
3570 	if (vsw->vsw_current == vsw->vsw_start)
3571 		return (WALK_DONE);
3572 
3573 	return (rval);
3574 }
3575 
3576 void
3577 vmem_seg_walk_fini(mdb_walk_state_t *wsp)
3578 {
3579 	vmem_seg_walk_t *vsw = wsp->walk_data;
3580 
3581 	mdb_free(vsw, sizeof (vmem_seg_walk_t));
3582 }
3583 
3584 #define	VMEM_NAMEWIDTH	22
3585 
3586 int
3587 vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3588 {
3589 	vmem_t v, parent;
3590 	vmem_kstat_t *vkp = &v.vm_kstat;
3591 	uintptr_t paddr;
3592 	int ident = 0;
3593 	char c[VMEM_NAMEWIDTH];
3594 
3595 	if (!(flags & DCMD_ADDRSPEC)) {
3596 		if (mdb_walk_dcmd("vmem", "vmem", argc, argv) == -1) {
3597 			mdb_warn("can't walk vmem");
3598 			return (DCMD_ERR);
3599 		}
3600 		return (DCMD_OK);
3601 	}
3602 
3603 	if (DCMD_HDRSPEC(flags))
3604 		mdb_printf("%-?s %-*s %10s %12s %9s %5s\n",
3605 		    "ADDR", VMEM_NAMEWIDTH, "NAME", "INUSE",
3606 		    "TOTAL", "SUCCEED", "FAIL");
3607 
3608 	if (mdb_vread(&v, sizeof (v), addr) == -1) {
3609 		mdb_warn("couldn't read vmem at %p", addr);
3610 		return (DCMD_ERR);
3611 	}
3612 
3613 	for (paddr = (uintptr_t)v.vm_source; paddr != 0; ident += 2) {
3614 		if (mdb_vread(&parent, sizeof (parent), paddr) == -1) {
3615 			mdb_warn("couldn't trace %p's ancestry", addr);
3616 			ident = 0;
3617 			break;
3618 		}
3619 		paddr = (uintptr_t)parent.vm_source;
3620 	}
3621 
3622 	(void) mdb_snprintf(c, VMEM_NAMEWIDTH, "%*s%s", ident, "", v.vm_name);
3623 
3624 	mdb_printf("%0?p %-*s %10llu %12llu %9llu %5llu\n",
3625 	    addr, VMEM_NAMEWIDTH, c,
3626 	    vkp->vk_mem_inuse.value.ui64, vkp->vk_mem_total.value.ui64,
3627 	    vkp->vk_alloc.value.ui64, vkp->vk_fail.value.ui64);
3628 
3629 	return (DCMD_OK);
3630 }
3631 
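/*
 * Illustrative usage (not part of the module): with no address, ::vmem
 * walks every arena; each arena's name is indented two spaces per level
 * of vm_source ancestry, so the output reads as a tree:
 *
 *	> ::vmem
 */
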
3632 void
3633 vmem_seg_help(void)
3634 {
3635 	mdb_printf("%s",
3636 "Display the contents of vmem_seg_ts, with optional filtering.\n\n"
3637 "\n"
3638 "A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n"
3639 "representing a single chunk of data.  Only ALLOC segments have debugging\n"
3640 "information.\n");
3641 	mdb_dec_indent(2);
3642 	mdb_printf("%<b>OPTIONS%</b>\n");
3643 	mdb_inc_indent(2);
3644 	mdb_printf("%s",
3645 "  -v    Display the full content of the vmem_seg, including its stack trace\n"
3646 "  -s    report the size of the segment, instead of the end address\n"
3647 "  -c caller\n"
3648 "        filter out segments without the function/PC in their stack trace\n"
3649 "  -e earliest\n"
3650 "        filter out segments timestamped before earliest\n"
3651 "  -l latest\n"
3652 "        filter out segments timestamped after latest\n"
3653 "  -m minsize\n"
3654 "        filer out segments smaller than minsize\n"
3655 "  -M maxsize\n"
3656 "        filer out segments larger than maxsize\n"
3657 "  -t thread\n"
3658 "        filter out segments not involving thread\n"
3659 "  -T type\n"
3660 "        filter out segments not of type 'type'\n"
3661 "        type is one of: ALLOC/FREE/SPAN/ROTOR/WALKER\n");
3662 }
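/*
 * A sketch of how the ::vmem_seg filters compose; the arena address and
 * function name below are hypothetical:
 *
 *	> fffffffffbc30000::walk vmem_seg | ::vmem_seg -T ALLOC -m 0x1000
 *		only ALLOC segments of at least 0x1000 bytes in that arena
 *	> fffffffffbc30000::walk vmem_seg | ::vmem_seg -c segkmem_alloc -v
 *		only segments with segkmem_alloc() in their stack trace,
 *		in the verbose format
 */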
3663 
3664 /*ARGSUSED*/
3665 int
3666 vmem_seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3667 {
3668 	vmem_seg_t vs;
3669 	pc_t *stk = vs.vs_stack;
3670 	uintptr_t sz;
3671 	uint8_t t;
3672 	const char *type = NULL;
3673 	GElf_Sym sym;
3674 	char c[MDB_SYM_NAMLEN];
3675 	int no_debug;
3676 	int i;
3677 	int depth;
3678 	uintptr_t laddr, haddr;
3679 
3680 	uintptr_t caller = 0, thread = 0;
3681 	uintptr_t minsize = 0, maxsize = 0;
3682 
3683 	hrtime_t earliest = 0, latest = 0;
3684 
3685 	uint_t size = 0;
3686 	uint_t verbose = 0;
3687 
3688 	if (!(flags & DCMD_ADDRSPEC))
3689 		return (DCMD_USAGE);
3690 
3691 	if (mdb_getopts(argc, argv,
3692 	    'c', MDB_OPT_UINTPTR, &caller,
3693 	    'e', MDB_OPT_UINT64, &earliest,
3694 	    'l', MDB_OPT_UINT64, &latest,
3695 	    's', MDB_OPT_SETBITS, TRUE, &size,
3696 	    'm', MDB_OPT_UINTPTR, &minsize,
3697 	    'M', MDB_OPT_UINTPTR, &maxsize,
3698 	    't', MDB_OPT_UINTPTR, &thread,
3699 	    'T', MDB_OPT_STR, &type,
3700 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
3701 	    NULL) != argc)
3702 		return (DCMD_USAGE);
3703 
3704 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
3705 		if (verbose) {
3706 			mdb_printf("%16s %4s %16s %16s %16s\n"
3707 			    "%<u>%16s %4s %16s %16s %16s%</u>\n",
3708 			    "ADDR", "TYPE", "START", "END", "SIZE",
3709 			    "", "", "THREAD", "TIMESTAMP", "");
3710 		} else {
3711 			mdb_printf("%?s %4s %?s %?s %s\n", "ADDR", "TYPE",
3712 			    "START", size? "SIZE" : "END", "WHO");
3713 		}
3714 	}
3715 
3716 	if (mdb_vread(&vs, sizeof (vs), addr) == -1) {
3717 		mdb_warn("couldn't read vmem_seg at %p", addr);
3718 		return (DCMD_ERR);
3719 	}
3720 
3721 	if (type != NULL) {
3722 		if (strcmp(type, "ALLC") == 0 || strcmp(type, "ALLOC") == 0)
3723 			t = VMEM_ALLOC;
3724 		else if (strcmp(type, "FREE") == 0)
3725 			t = VMEM_FREE;
3726 		else if (strcmp(type, "SPAN") == 0)
3727 			t = VMEM_SPAN;
3728 		else if (strcmp(type, "ROTR") == 0 ||
3729 		    strcmp(type, "ROTOR") == 0)
3730 			t = VMEM_ROTOR;
3731 		else if (strcmp(type, "WLKR") == 0 ||
3732 		    strcmp(type, "WALKER") == 0)
3733 			t = VMEM_WALKER;
3734 		else {
3735 			mdb_warn("\"%s\" is not a recognized vmem_seg type\n",
3736 			    type);
3737 			return (DCMD_ERR);
3738 		}
3739 
3740 		if (vs.vs_type != t)
3741 			return (DCMD_OK);
3742 	}
3743 
3744 	sz = vs.vs_end - vs.vs_start;
3745 
3746 	if (minsize != 0 && sz < minsize)
3747 		return (DCMD_OK);
3748 
3749 	if (maxsize != 0 && sz > maxsize)
3750 		return (DCMD_OK);
3751 
3752 	t = vs.vs_type;
3753 	depth = vs.vs_depth;
3754 
3755 	/*
3756 	 * debug info, when present, is only accurate for VMEM_ALLOC segments
3757 	 */
3758 	no_debug = (t != VMEM_ALLOC) ||
3759 	    (depth == 0 || depth > VMEM_STACK_DEPTH);
3760 
3761 	if (no_debug) {
3762 		if (caller != 0 || thread != 0 || earliest != 0 || latest != 0)
3763 			return (DCMD_OK);		/* not enough info */
3764 	} else {
3765 		if (caller != 0) {
3766 			laddr = caller;
3767 			haddr = caller + sizeof (caller);
3768 
3769 			if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c,
3770 			    sizeof (c), &sym) != -1 &&
3771 			    caller == (uintptr_t)sym.st_value) {
3772 				/*
3773 				 * We were provided an exact symbol value; any
3774 				 * address in the function is valid.
3775 				 */
3776 				laddr = (uintptr_t)sym.st_value;
3777 				haddr = (uintptr_t)sym.st_value + sym.st_size;
3778 			}
3779 
3780 			for (i = 0; i < depth; i++)
3781 				if (vs.vs_stack[i] >= laddr &&
3782 				    vs.vs_stack[i] < haddr)
3783 					break;
3784 
3785 			if (i == depth)
3786 				return (DCMD_OK);
3787 		}
3788 
3789 		if (thread != 0 && (uintptr_t)vs.vs_thread != thread)
3790 			return (DCMD_OK);
3791 
3792 		if (earliest != 0 && vs.vs_timestamp < earliest)
3793 			return (DCMD_OK);
3794 
3795 		if (latest != 0 && vs.vs_timestamp > latest)
3796 			return (DCMD_OK);
3797 	}
3798 
3799 	type = (t == VMEM_ALLOC ? "ALLC" :
3800 	    t == VMEM_FREE ? "FREE" :
3801 	    t == VMEM_SPAN ? "SPAN" :
3802 	    t == VMEM_ROTOR ? "ROTR" :
3803 	    t == VMEM_WALKER ? "WLKR" :
3804 	    "????");
3805 
3806 	if (flags & DCMD_PIPE_OUT) {
3807 		mdb_printf("%#lr\n", addr);
3808 		return (DCMD_OK);
3809 	}
3810 
3811 	if (verbose) {
3812 		mdb_printf("%<b>%16p%</b> %4s %16p %16p %16ld\n",
3813 		    addr, type, vs.vs_start, vs.vs_end, sz);
3814 
3815 		if (no_debug)
3816 			return (DCMD_OK);
3817 
3818 		mdb_printf("%16s %4s %16p %16llx\n",
3819 		    "", "", vs.vs_thread, vs.vs_timestamp);
3820 
3821 		mdb_inc_indent(17);
3822 		for (i = 0; i < depth; i++) {
3823 			mdb_printf("%a\n", stk[i]);
3824 		}
3825 		mdb_dec_indent(17);
3826 		mdb_printf("\n");
3827 	} else {
3828 		mdb_printf("%0?p %4s %0?p %0?p", addr, type,
3829 		    vs.vs_start, size? sz : vs.vs_end);
3830 
3831 		if (no_debug) {
3832 			mdb_printf("\n");
3833 			return (DCMD_OK);
3834 		}
3835 
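		/*
		 * For the one-line format, show the first frame of the
		 * stack trace that is not part of the vmem implementation
		 * itself; that is usually the real caller.
		 */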
3836 		for (i = 0; i < depth; i++) {
3837 			if (mdb_lookup_by_addr(stk[i], MDB_SYM_FUZZY,
3838 			    c, sizeof (c), &sym) == -1)
3839 				continue;
3840 			if (strncmp(c, "vmem_", 5) == 0)
3841 				continue;
3842 			break;
3843 		}
3844 		mdb_printf(" %a\n", stk[i]);
3845 	}
3846 	return (DCMD_OK);
3847 }
3848 
3849 typedef struct kmalog_data {
3850 	uintptr_t	kma_addr;
3851 	hrtime_t	kma_newest;
3852 } kmalog_data_t;
3853 
3854 /*ARGSUSED*/
3855 static int
3856 showbc(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmalog_data_t *kma)
3857 {
3858 	char name[KMEM_CACHE_NAMELEN + 1];
3859 	hrtime_t delta;
3860 	int i, depth;
3861 	size_t bufsize;
3862 
3863 	if (bcp->bc_timestamp == 0)
3864 		return (WALK_DONE);
3865 
3866 	if (kma->kma_newest == 0)
3867 		kma->kma_newest = bcp->bc_timestamp;
3868 
3869 	if (kma->kma_addr) {
3870 		if (mdb_vread(&bufsize, sizeof (bufsize),
3871 		    (uintptr_t)&bcp->bc_cache->cache_bufsize) == -1) {
3872 			mdb_warn(
3873 			    "failed to read cache_bufsize for cache at %p",
3874 			    bcp->bc_cache);
3875 			return (WALK_ERR);
3876 		}
3877 
3878 		if (kma->kma_addr < (uintptr_t)bcp->bc_addr ||
3879 		    kma->kma_addr >= (uintptr_t)bcp->bc_addr + bufsize)
3880 			return (WALK_NEXT);
3881 	}
3882 
3883 	delta = kma->kma_newest - bcp->bc_timestamp;
3884 	depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3885 
3886 	if (mdb_readstr(name, sizeof (name), (uintptr_t)
3887 	    &bcp->bc_cache->cache_name) <= 0)
3888 		(void) mdb_snprintf(name, sizeof (name), "%a", bcp->bc_cache);
3889 
3890 	mdb_printf("\nT-%lld.%09lld  addr=%p  %s\n",
3891 	    delta / NANOSEC, delta % NANOSEC, bcp->bc_addr, name);
3892 
3893 	for (i = 0; i < depth; i++)
3894 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
3895 
3896 	return (WALK_NEXT);
3897 }
3898 
3899 int
3900 kmalog(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3901 {
3902 	const char *logname = "kmem_transaction_log";
3903 	kmalog_data_t kma;
3904 
3905 	if (argc > 1)
3906 		return (DCMD_USAGE);
3907 
3908 	kma.kma_newest = 0;
3909 	if (flags & DCMD_ADDRSPEC)
3910 		kma.kma_addr = addr;
3911 	else
3912 		kma.kma_addr = 0;
3913 
3914 	if (argc > 0) {
3915 		if (argv->a_type != MDB_TYPE_STRING)
3916 			return (DCMD_USAGE);
3917 		if (strcmp(argv->a_un.a_str, "fail") == 0)
3918 			logname = "kmem_failure_log";
3919 		else if (strcmp(argv->a_un.a_str, "slab") == 0)
3920 			logname = "kmem_slab_log";
3921 		else if (strcmp(argv->a_un.a_str, "zerosized") == 0)
3922 			logname = "kmem_zerosized_log";
3923 		else
3924 			return (DCMD_USAGE);
3925 	}
3926 
3927 	if (mdb_readvar(&addr, logname) == -1) {
3928 		mdb_warn("failed to read %s log header pointer", logname);
3929 		return (DCMD_ERR);
3930 	}
3931 
3932 	if (mdb_pwalk("kmem_log", (mdb_walk_cb_t)showbc, &kma, addr) == -1) {
3933 		mdb_warn("failed to walk kmem log");
3934 		return (DCMD_ERR);
3935 	}
3936 
3937 	return (DCMD_OK);
3938 }
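/*
 * A sketch of the supported ::kmalog forms (the transaction log is only
 * kept when KMF_AUDIT is set in kmem_flags):
 *
 *	> ::kmalog			transaction log
 *	> ::kmalog fail			allocation-failure log
 *	> ::kmalog slab			slab-create log
 *	> ::kmalog zerosized		zero-sized-allocation log
 *	> addr::kmalog			only entries whose buffer contains addr
 */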
3939 
3940 /*
3941  * As the final lure for die-hard crash(8) users, we provide ::kmausers here.
3942  * The first piece is a structure which we use to accumulate kmem_cache_t
3943  * addresses of interest.  kmc_add() is used as a callback for the kmem_cache
3944  * walker; we either add all caches, or ones named explicitly as arguments.
3945  */
3946 
3947 typedef struct kmclist {
3948 	const char *kmc_name;			/* Name to match (or NULL) */
3949 	uintptr_t *kmc_caches;			/* List of kmem_cache_t addrs */
3950 	int kmc_nelems;				/* Num entries in kmc_caches */
3951 	int kmc_size;				/* Size of kmc_caches array */
3952 } kmclist_t;
3953 
3954 static int
3955 kmc_add(uintptr_t addr, const kmem_cache_t *cp, kmclist_t *kmc)
3956 {
3957 	void *p;
3958 	int s;
3959 
3960 	if (kmc->kmc_name == NULL ||
3961 	    strcmp(cp->cache_name, kmc->kmc_name) == 0) {
3962 		/*
3963 		 * If we have a match, grow our array (if necessary), and then
3964 		 * add the virtual address of the matching cache to our list.
3965 		 */
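		/*
		 * The array is UM_GC memory, so mdb reclaims the old
		 * copy once the current dcmd completes; no explicit free
		 * is needed here.
		 */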
3966 		if (kmc->kmc_nelems >= kmc->kmc_size) {
3967 			s = kmc->kmc_size ? kmc->kmc_size * 2 : 256;
3968 			p = mdb_alloc(sizeof (uintptr_t) * s, UM_SLEEP | UM_GC);
3969 
3970 			bcopy(kmc->kmc_caches, p,
3971 			    sizeof (uintptr_t) * kmc->kmc_size);
3972 
3973 			kmc->kmc_caches = p;
3974 			kmc->kmc_size = s;
3975 		}
3976 
3977 		kmc->kmc_caches[kmc->kmc_nelems++] = addr;
3978 		return (kmc->kmc_name ? WALK_DONE : WALK_NEXT);
3979 	}
3980 
3981 	return (WALK_NEXT);
3982 }
3983 
3984 /*
3985  * The second piece of ::kmausers is a hash table of allocations.  Each
3986  * allocation owner is identified by its stack trace and data_size.  We then
3987  * track the total bytes of all such allocations, and the number of allocations
3988  * to report at the end.  Once we have a list of caches, we walk through the
3989  * allocated bufctls of each, and update our hash table accordingly.
3990  */
3991 
3992 typedef struct kmowner {
3993 	struct kmowner *kmo_head;		/* First hash elt in bucket */
3994 	struct kmowner *kmo_next;		/* Next hash elt in chain */
3995 	size_t kmo_signature;			/* Hash table signature */
3996 	uint_t kmo_num;				/* Number of allocations */
3997 	size_t kmo_data_size;			/* Size of each allocation */
3998 	size_t kmo_total_size;			/* Total bytes of allocation */
3999 	int kmo_depth;				/* Depth of stack trace */
4000 	uintptr_t kmo_stack[KMEM_STACK_DEPTH];	/* Stack trace */
4001 } kmowner_t;
4002 
4003 typedef struct kmusers {
4004 	uintptr_t kmu_addr;			/* address of interest */
4005 	const kmem_cache_t *kmu_cache;		/* Current kmem cache */
4006 	kmowner_t *kmu_hash;			/* Hash table of owners */
4007 	int kmu_nelems;				/* Number of entries in use */
4008 	int kmu_size;				/* Total number of entries */
4009 } kmusers_t;
4010 
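/*
 * An owner's hash signature is its data_size plus the sum of its stack
 * PCs.  The table is open-chained: kmu_hash[bucket].kmo_head points at
 * the first owner in a bucket and kmo_next links the rest; since
 * kmu_size is always a power of two, the bucket is simply
 * (signature & (kmu_size - 1)).
 */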
4011 static void
4012 kmu_add(kmusers_t *kmu, const kmem_bufctl_audit_t *bcp,
4013     size_t size, size_t data_size)
4014 {
4015 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4016 	size_t bucket, signature = data_size;
4017 	kmowner_t *kmo, *kmoend;
4018 
4019 	/*
4020 	 * If the hash table is full, double its size and rehash everything.
4021 	 */
4022 	if (kmu->kmu_nelems >= kmu->kmu_size) {
4023 		int s = kmu->kmu_size ? kmu->kmu_size * 2 : 1024;
4024 
4025 		kmo = mdb_alloc(sizeof (kmowner_t) * s, UM_SLEEP | UM_GC);
4026 		bcopy(kmu->kmu_hash, kmo, sizeof (kmowner_t) * kmu->kmu_size);
4027 		kmu->kmu_hash = kmo;
4028 		kmu->kmu_size = s;
4029 
4030 		kmoend = kmu->kmu_hash + kmu->kmu_size;
4031 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++)
4032 			kmo->kmo_head = NULL;
4033 
4034 		kmoend = kmu->kmu_hash + kmu->kmu_nelems;
4035 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++) {
4036 			bucket = kmo->kmo_signature & (kmu->kmu_size - 1);
4037 			kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4038 			kmu->kmu_hash[bucket].kmo_head = kmo;
4039 		}
4040 	}
4041 
4042 	/*
4043 	 * Finish computing the hash signature from the stack trace, and then
4044 	 * see if the owner is in the hash table.  If so, update our stats.
4045 	 */
4046 	for (i = 0; i < depth; i++)
4047 		signature += bcp->bc_stack[i];
4048 
4049 	bucket = signature & (kmu->kmu_size - 1);
4050 
4051 	for (kmo = kmu->kmu_hash[bucket].kmo_head; kmo; kmo = kmo->kmo_next) {
4052 		if (kmo->kmo_signature == signature) {
4053 			size_t difference = 0;
4054 
4055 			difference |= kmo->kmo_data_size - data_size;
4056 			difference |= kmo->kmo_depth - depth;
4057 
4058 			for (i = 0; i < depth; i++) {
4059 				difference |= kmo->kmo_stack[i] -
4060 				    bcp->bc_stack[i];
4061 			}
4062 
4063 			if (difference == 0) {
4064 				kmo->kmo_total_size += size;
4065 				kmo->kmo_num++;
4066 				return;
4067 			}
4068 		}
4069 	}
4070 
4071 	/*
4072 	 * If the owner is not yet hashed, grab the next element and fill it
4073 	 * in based on the allocation information.
4074 	 */
4075 	kmo = &kmu->kmu_hash[kmu->kmu_nelems++];
4076 	kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4077 	kmu->kmu_hash[bucket].kmo_head = kmo;
4078 
4079 	kmo->kmo_signature = signature;
4080 	kmo->kmo_num = 1;
4081 	kmo->kmo_data_size = data_size;
4082 	kmo->kmo_total_size = size;
4083 	kmo->kmo_depth = depth;
4084 
4085 	for (i = 0; i < depth; i++)
4086 		kmo->kmo_stack[i] = bcp->bc_stack[i];
4087 }
4088 
4089 /*
4090  * When ::kmausers is invoked without the -f flag, we simply update our hash
4091  * table with the information from each allocated bufctl.
4092  */
4093 /*ARGSUSED*/
4094 static int
4095 kmause1(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4096 {
4097 	const kmem_cache_t *cp = kmu->kmu_cache;
4098 
4099 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4100 	return (WALK_NEXT);
4101 }
4102 
4103 /*
4104  * When ::kmausers is invoked with the -f flag, we print out the information
4105  * for each bufctl as well as updating the hash table.
4106  */
4107 static int
4108 kmause2(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4109 {
4110 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4111 	const kmem_cache_t *cp = kmu->kmu_cache;
4112 	kmem_bufctl_t bufctl;
4113 
4114 	if (kmu->kmu_addr) {
4115 		if (mdb_vread(&bufctl, sizeof (bufctl), addr) == -1)
4116 			mdb_warn("couldn't read bufctl at %p", addr);
4117 		else if (kmu->kmu_addr < (uintptr_t)bufctl.bc_addr ||
4118 		    kmu->kmu_addr >= (uintptr_t)bufctl.bc_addr +
4119 		    cp->cache_bufsize)
4120 			return (WALK_NEXT);
4121 	}
4122 
4123 	mdb_printf("size %d, addr %p, thread %p, cache %s\n",
4124 	    cp->cache_bufsize, addr, bcp->bc_thread, cp->cache_name);
4125 
4126 	for (i = 0; i < depth; i++)
4127 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
4128 
4129 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4130 	return (WALK_NEXT);
4131 }
4132 
4133 /*
4134  * We sort our results by allocation size before printing them.
4135  */
4136 static int
4137 kmownercmp(const void *lp, const void *rp)
4138 {
4139 	const kmowner_t *lhs = lp;
4140 	const kmowner_t *rhs = rp;
4141 
4142 	return (rhs->kmo_total_size - lhs->kmo_total_size);
4143 }
4144 
4145 /*
4146  * The main engine of ::kmausers is relatively straightforward: First we
4147  * accumulate our list of kmem_cache_t addresses into the kmclist_t. Next we
4148  * iterate over the allocated bufctls of each cache in the list.  Finally,
4149  * we sort and print our results.
4150  */
4151 /*ARGSUSED*/
4152 int
4153 kmausers(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4154 {
4155 	int mem_threshold = 8192;	/* Minimum # bytes for printing */
4156 	int cnt_threshold = 100;	/* Minimum # blocks for printing */
4157 	int audited_caches = 0;		/* Number of KMF_AUDIT caches found */
4158 	int do_all_caches = 1;		/* Do all caches (no arguments) */
4159 	int opt_e = FALSE;		/* Include "small" users */
4160 	int opt_f = FALSE;		/* Print stack traces */
4161 
4162 	mdb_walk_cb_t callback = (mdb_walk_cb_t)kmause1;
4163 	kmowner_t *kmo, *kmoend;
4164 	int i, oelems;
4165 
4166 	kmclist_t kmc;
4167 	kmusers_t kmu;
4168 
4169 	bzero(&kmc, sizeof (kmc));
4170 	bzero(&kmu, sizeof (kmu));
4171 
4172 	while ((i = mdb_getopts(argc, argv,
4173 	    'e', MDB_OPT_SETBITS, TRUE, &opt_e,
4174 	    'f', MDB_OPT_SETBITS, TRUE, &opt_f, NULL)) != argc) {
4175 
4176 		argv += i;	/* skip past options we just processed */
4177 		argc -= i;	/* adjust argc */
4178 
4179 		if (argv->a_type != MDB_TYPE_STRING || *argv->a_un.a_str == '-')
4180 			return (DCMD_USAGE);
4181 
4182 		oelems = kmc.kmc_nelems;
4183 		kmc.kmc_name = argv->a_un.a_str;
4184 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4185 
4186 		if (kmc.kmc_nelems == oelems) {
4187 			mdb_warn("unknown kmem cache: %s\n", kmc.kmc_name);
4188 			return (DCMD_ERR);
4189 		}
4190 
4191 		do_all_caches = 0;
4192 		argv++;
4193 		argc--;
4194 	}
4195 
4196 	if (flags & DCMD_ADDRSPEC) {
4197 		opt_f = TRUE;
4198 		kmu.kmu_addr = addr;
4199 	} else {
4200 		kmu.kmu_addr = 0;
4201 	}
4202 
4203 	if (opt_e)
4204 		mem_threshold = cnt_threshold = 0;
4205 
4206 	if (opt_f)
4207 		callback = (mdb_walk_cb_t)kmause2;
4208 
4209 	if (do_all_caches) {
4210 		kmc.kmc_name = NULL; /* match all cache names */
4211 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4212 	}
4213 
4214 	for (i = 0; i < kmc.kmc_nelems; i++) {
4215 		uintptr_t cp = kmc.kmc_caches[i];
4216 		kmem_cache_t c;
4217 
4218 		if (mdb_vread(&c, sizeof (c), cp) == -1) {
4219 			mdb_warn("failed to read cache at %p", cp);
4220 			continue;
4221 		}
4222 
4223 		if (!(c.cache_flags & KMF_AUDIT)) {
4224 			if (!do_all_caches) {
4225 				mdb_warn("KMF_AUDIT is not enabled for %s\n",
4226 				    c.cache_name);
4227 			}
4228 			continue;
4229 		}
4230 
4231 		kmu.kmu_cache = &c;
4232 		(void) mdb_pwalk("bufctl", callback, &kmu, cp);
4233 		audited_caches++;
4234 	}
4235 
4236 	if (audited_caches == 0 && do_all_caches) {
4237 		mdb_warn("KMF_AUDIT is not enabled for any caches\n");
4238 		return (DCMD_ERR);
4239 	}
4240 
4241 	qsort(kmu.kmu_hash, kmu.kmu_nelems, sizeof (kmowner_t), kmownercmp);
4242 	kmoend = kmu.kmu_hash + kmu.kmu_nelems;
4243 
4244 	for (kmo = kmu.kmu_hash; kmo < kmoend; kmo++) {
4245 		if (kmo->kmo_total_size < mem_threshold &&
4246 		    kmo->kmo_num < cnt_threshold)
4247 			continue;
4248 		mdb_printf("%lu bytes for %u allocations with data size %lu:\n",
4249 		    kmo->kmo_total_size, kmo->kmo_num, kmo->kmo_data_size);
4250 		for (i = 0; i < kmo->kmo_depth; i++)
4251 			mdb_printf("\t %a\n", kmo->kmo_stack[i]);
4252 	}
4253 
4254 	return (DCMD_OK);
4255 }
4256 
4257 void
4258 kmausers_help(void)
4259 {
4260 	mdb_printf(
4261 	    "Displays the largest users of the kmem allocator, grouped by\n"
4262 	    "stack trace and sorted by total bytes allocated.  If one or more\n"
4263 	    "caches are specified, only those caches are searched; otherwise,\n"
4264 	    "all caches are searched.  If an address is specified, only the\n"
4265 	    "allocations which include the given address are displayed.\n"
4266 	    "Specifying an address implies -f.\n"
4267 	    "\n"
4268 	    "\t-e\tInclude all users, not just the largest\n"
4269 	    "\t-f\tDisplay individual allocations.  By default, users are\n"
4270 	    "\t\tgrouped by stack\n");
4271 }
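/*
 * A sketch of common ::kmausers invocations (the cache name below is
 * only an example):
 *
 *	> ::kmausers
 *		largest owners across every KMF_AUDIT cache
 *	> ::kmausers -e kmem_alloc_256
 *		every owner, large or small, in the kmem_alloc_256 cache
 *	> addr::kmausers -f
 *		individual audited allocations whose buffer contains addr
 *
 * A cache is only searched if it was created with KMF_AUDIT in effect,
 * e.g. by booting with kmem_flags set appropriately.
 */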
4272 
4273 static int
4274 kmem_ready_check(void)
4275 {
4276 	int ready;
4277 
4278 	if (mdb_readvar(&ready, "kmem_ready") < 0)
4279 		return (-1); /* errno is set for us */
4280 
4281 	return (ready);
4282 }
4283 
4284 void
4285 kmem_statechange(void)
4286 {
4287 	static int been_ready = 0;
4288 
4289 	if (been_ready)
4290 		return;
4291 
4292 	if (kmem_ready_check() <= 0)
4293 		return;
4294 
4295 	been_ready = 1;
4296 	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_init_walkers, NULL);
4297 }
4298 
4299 void
4300 kmem_init(void)
4301 {
4302 	mdb_walker_t w = {
4303 		"kmem_cache", "walk list of kmem caches", kmem_cache_walk_init,
4304 		list_walk_step, list_walk_fini
4305 	};
4306 
4307 	/*
4308 	 * If kmem is ready, we'll need to invoke the kmem_cache walker
4309 	 * immediately.  Walkers in the linkage structure won't be ready until
4310 	 * _mdb_init returns, so we'll need to add this one manually.  If kmem
4311 	 * is ready, we'll use the walker to initialize the caches.  If kmem
4312 	 * isn't ready, we'll register a callback that will allow us to defer
4313 	 * cache walking until it is.
4314 	 */
4315 	if (mdb_add_walker(&w) != 0) {
4316 		mdb_warn("failed to add kmem_cache walker");
4317 		return;
4318 	}
4319 
4320 	kmem_statechange();
4321 
4322 	/* register our ::whatis handlers */
4323 	mdb_whatis_register("modules", whatis_run_modules, NULL,
4324 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4325 	mdb_whatis_register("threads", whatis_run_threads, NULL,
4326 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4327 	mdb_whatis_register("pages", whatis_run_pages, NULL,
4328 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4329 	mdb_whatis_register("kmem", whatis_run_kmem, NULL,
4330 	    WHATIS_PRIO_ALLOCATOR, 0);
4331 	mdb_whatis_register("vmem", whatis_run_vmem, NULL,
4332 	    WHATIS_PRIO_ALLOCATOR, 0);
4333 }
4334 
4335 typedef struct whatthread {
4336 	uintptr_t	wt_target;
4337 	int		wt_verbose;
4338 } whatthread_t;
4339 
4340 static int
4341 whatthread_walk_thread(uintptr_t addr, const kthread_t *t, whatthread_t *w)
4342 {
4343 	uintptr_t current, data;
4344 
4345 	if (t->t_stkbase == NULL)
4346 		return (WALK_NEXT);
4347 
4348 	/*
4349 	 * Warn about swapped out threads, but drive on anyway
4350 	 */
4351 	if (!(t->t_schedflag & TS_LOAD)) {
4352 		mdb_warn("thread %p's stack swapped out\n", addr);
4353 		return (WALK_NEXT);
4354 	}
4355 
4356 	/*
4357 	 * Search the thread's stack for the given pointer.  Note that it would
4358 	 * be more efficient to follow ::kgrep's lead and read in page-sized
4359 	 * chunks, but this routine is already fast and simple.
4360 	 */
4361 	for (current = (uintptr_t)t->t_stkbase; current < (uintptr_t)t->t_stk;
4362 	    current += sizeof (uintptr_t)) {
4363 		if (mdb_vread(&data, sizeof (data), current) == -1) {
4364 			mdb_warn("couldn't read thread %p's stack at %p",
4365 			    addr, current);
4366 			return (WALK_ERR);
4367 		}
4368 
4369 		if (data == w->wt_target) {
4370 			if (w->wt_verbose) {
4371 				mdb_printf("%p in thread %p's stack%s\n",
4372 				    current, addr, stack_active(t, current));
4373 			} else {
4374 				mdb_printf("%#lr\n", addr);
4375 				return (WALK_NEXT);
4376 			}
4377 		}
4378 	}
4379 
4380 	return (WALK_NEXT);
4381 }
4382 
4383 int
4384 whatthread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4385 {
4386 	whatthread_t w;
4387 
4388 	if (!(flags & DCMD_ADDRSPEC))
4389 		return (DCMD_USAGE);
4390 
4391 	w.wt_verbose = FALSE;
4392 	w.wt_target = addr;
4393 
4394 	if (mdb_getopts(argc, argv,
4395 	    'v', MDB_OPT_SETBITS, TRUE, &w.wt_verbose, NULL) != argc)
4396 		return (DCMD_USAGE);
4397 
4398 	if (mdb_walk("thread", (mdb_walk_cb_t)whatthread_walk_thread, &w)
4399 	    == -1) {
4400 		mdb_warn("couldn't walk threads");
4401 		return (DCMD_ERR);
4402 	}
4403 
4404 	return (DCMD_OK);
4405 }
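/*
 * A sketch of ::whatthread usage (the value below is hypothetical):
 *
 *	> fffffe0123456780::whatthread
 *		print the address of each thread whose stack contains the
 *		value, one per line (suitable for piping to other dcmds)
 *	> fffffe0123456780::whatthread -v
 *		also print where on each stack the value was found and
 *		whether that slot lies within the thread's active stack
 */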
4406