xref: /illumos-gate/usr/src/cmd/mdb/common/modules/genunix/kmem.c (revision 33efde4275d24731ef87927237b0ffb0630b6b2d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2018 Joyent, Inc.  All rights reserved.
28  * Copyright (c) 2012 by Delphix. All rights reserved.
29  * Copyright 2025 Oxide Computer Company
30  */
31 
32 #include <mdb/mdb_param.h>
33 #include <mdb/mdb_modapi.h>
34 #include <mdb/mdb_ctf.h>
35 #include <mdb/mdb_whatis.h>
36 #include <sys/cpuvar.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/vmem_impl.h>
39 #include <sys/machelf.h>
40 #include <sys/modctl.h>
41 #include <sys/kobj.h>
42 #include <sys/panic.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <vm/page.h>
46 
47 #include "avl.h"
48 #include "combined.h"
49 #include "dist.h"
50 #include "kmem.h"
51 #include "list.h"
52 
53 #define	dprintf(x) if (mdb_debug_level) { \
54 	mdb_printf("kmem debug: ");  \
55 	/*CSTYLED*/\
56 	mdb_printf x ;\
57 }
58 
59 #define	KM_ALLOCATED		0x01
60 #define	KM_FREE			0x02
61 #define	KM_BUFCTL		0x04
62 #define	KM_CONSTRUCTED		0x08	/* only constructed free buffers */
63 #define	KM_HASH			0x10
64 
65 static int mdb_debug_level = 0;
66 
67 /*ARGSUSED*/
68 static int
69 kmem_init_walkers(uintptr_t addr, const kmem_cache_t *c, void *ignored)
70 {
71 	mdb_walker_t w;
72 	char descr[64];
73 
74 	(void) mdb_snprintf(descr, sizeof (descr),
75 	    "walk the %s cache", c->cache_name);
76 
77 	w.walk_name = c->cache_name;
78 	w.walk_descr = descr;
79 	w.walk_init = kmem_walk_init;
80 	w.walk_step = kmem_walk_step;
81 	w.walk_fini = kmem_walk_fini;
82 	w.walk_init_arg = (void *)addr;
83 
84 	if (mdb_add_walker(&w) == -1)
85 		mdb_warn("failed to add %s walker", c->cache_name);
86 
87 	return (WALK_NEXT);
88 }
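/*
 * Illustrative usage (a sketch, not part of this file): because each walker
 * registered above is named after its cache, individual kmem caches can be
 * walked directly from mdb, e.g. "> ::walk thread_cache" or
 * "> ::walk kmem_alloc_32", assuming caches with those names exist on the
 * system being examined.
 */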
89 
90 /*ARGSUSED*/
91 int
92 kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
93 {
94 	mdb_debug_level ^= 1;
95 
96 	mdb_printf("kmem: debugging is now %s\n",
97 	    mdb_debug_level ? "on" : "off");
98 
99 	return (DCMD_OK);
100 }
101 
102 int
103 kmem_cache_walk_init(mdb_walk_state_t *wsp)
104 {
105 	GElf_Sym sym;
106 
107 	if (mdb_lookup_by_name("kmem_caches", &sym) == -1) {
108 		mdb_warn("couldn't find kmem_caches");
109 		return (WALK_ERR);
110 	}
111 
112 	wsp->walk_addr = (uintptr_t)sym.st_value;
113 
114 	return (list_walk_init_named(wsp, "cache list", "cache"));
115 }
116 
117 int
118 kmem_cpu_cache_walk_init(mdb_walk_state_t *wsp)
119 {
120 	if (wsp->walk_addr == 0) {
121 		mdb_warn("kmem_cpu_cache doesn't support global walks");
122 		return (WALK_ERR);
123 	}
124 
125 	if (mdb_layered_walk("cpu", wsp) == -1) {
126 		mdb_warn("couldn't walk 'cpu'");
127 		return (WALK_ERR);
128 	}
129 
130 	wsp->walk_data = (void *)wsp->walk_addr;
131 
132 	return (WALK_NEXT);
133 }
134 
135 int
136 kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp)
137 {
138 	uintptr_t caddr = (uintptr_t)wsp->walk_data;
139 	const cpu_t *cpu = wsp->walk_layer;
140 	kmem_cpu_cache_t cc;
141 
142 	caddr += OFFSETOF(kmem_cache_t, cache_cpu[cpu->cpu_seqid]);
143 
144 	if (mdb_vread(&cc, sizeof (kmem_cpu_cache_t), caddr) == -1) {
145 		mdb_warn("couldn't read kmem_cpu_cache at %p", caddr);
146 		return (WALK_ERR);
147 	}
148 
149 	return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata));
150 }
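/*
 * Illustrative usage (a sketch): given a kmem_cache_t address, the layered
 * walk above visits one kmem_cpu_cache_t per CPU, e.g.
 *   > <cache addr>::walk kmem_cpu_cache
 * where each reported address is computed from the cache address plus the
 * offset of cache_cpu[cpu_seqid].
 */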
151 
152 static int
153 kmem_slab_check(void *p, uintptr_t saddr, void *arg)
154 {
155 	kmem_slab_t *sp = p;
156 	uintptr_t caddr = (uintptr_t)arg;
157 	if ((uintptr_t)sp->slab_cache != caddr) {
158 		mdb_warn("slab %p isn't in cache %p (in cache %p)\n",
159 		    saddr, caddr, sp->slab_cache);
160 		return (-1);
161 	}
162 
163 	return (0);
164 }
165 
166 static int
167 kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg)
168 {
169 	kmem_slab_t *sp = p;
170 
171 	int rc = kmem_slab_check(p, saddr, arg);
172 	if (rc != 0) {
173 		return (rc);
174 	}
175 
176 	if (!KMEM_SLAB_IS_PARTIAL(sp)) {
177 		mdb_warn("slab %p is not a partial slab\n", saddr);
178 		return (-1);
179 	}
180 
181 	return (0);
182 }
183 
184 static int
185 kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg)
186 {
187 	kmem_slab_t *sp = p;
188 
189 	int rc = kmem_slab_check(p, saddr, arg);
190 	if (rc != 0) {
191 		return (rc);
192 	}
193 
194 	if (!KMEM_SLAB_IS_ALL_USED(sp)) {
195 		mdb_warn("slab %p is not completely allocated\n", saddr);
196 		return (-1);
197 	}
198 
199 	return (0);
200 }
201 
202 typedef struct {
203 	uintptr_t kns_cache_addr;
204 	int kns_nslabs;
205 } kmem_nth_slab_t;
206 
207 static int
208 kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg)
209 {
210 	kmem_nth_slab_t *chkp = arg;
211 
212 	int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr);
213 	if (rc != 0) {
214 		return (rc);
215 	}
216 
217 	return (chkp->kns_nslabs-- == 0 ? 1 : 0);
218 }
219 
220 static int
221 kmem_complete_slab_walk_init(mdb_walk_state_t *wsp)
222 {
223 	uintptr_t caddr = wsp->walk_addr;
224 
225 	wsp->walk_addr = (uintptr_t)(caddr +
226 	    offsetof(kmem_cache_t, cache_complete_slabs));
227 
228 	return (list_walk_init_checked(wsp, "slab list", "slab",
229 	    kmem_complete_slab_check, (void *)caddr));
230 }
231 
232 static int
233 kmem_partial_slab_walk_init(mdb_walk_state_t *wsp)
234 {
235 	uintptr_t caddr = wsp->walk_addr;
236 
237 	wsp->walk_addr = (uintptr_t)(caddr +
238 	    offsetof(kmem_cache_t, cache_partial_slabs));
239 
240 	return (avl_walk_init_checked(wsp, "slab list", "slab",
241 	    kmem_partial_slab_check, (void *)caddr));
242 }
243 
244 int
245 kmem_slab_walk_init(mdb_walk_state_t *wsp)
246 {
247 	uintptr_t caddr = wsp->walk_addr;
248 
249 	if (caddr == 0) {
250 		mdb_warn("kmem_slab doesn't support global walks\n");
251 		return (WALK_ERR);
252 	}
253 
254 	combined_walk_init(wsp);
255 	combined_walk_add(wsp,
256 	    kmem_complete_slab_walk_init, list_walk_step, list_walk_fini);
257 	combined_walk_add(wsp,
258 	    kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini);
259 
260 	return (WALK_NEXT);
261 }
262 
263 static int
264 kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp)
265 {
266 	uintptr_t caddr = wsp->walk_addr;
267 	kmem_nth_slab_t *chk;
268 
269 	chk = mdb_alloc(sizeof (kmem_nth_slab_t),
270 	    UM_SLEEP | UM_GC);
271 	chk->kns_cache_addr = caddr;
272 	chk->kns_nslabs = 1;
273 	wsp->walk_addr = (uintptr_t)(caddr +
274 	    offsetof(kmem_cache_t, cache_complete_slabs));
275 
276 	return (list_walk_init_checked(wsp, "slab list", "slab",
277 	    kmem_nth_slab_check, chk));
278 }
279 
280 int
281 kmem_slab_walk_partial_init(mdb_walk_state_t *wsp)
282 {
283 	uintptr_t caddr = wsp->walk_addr;
284 	kmem_cache_t c;
285 
286 	if (caddr == 0) {
287 		mdb_warn("kmem_slab_partial doesn't support global walks\n");
288 		return (WALK_ERR);
289 	}
290 
291 	if (mdb_vread(&c, sizeof (c), caddr) == -1) {
292 		mdb_warn("couldn't read kmem_cache at %p", caddr);
293 		return (WALK_ERR);
294 	}
295 
296 	combined_walk_init(wsp);
297 
298 	/*
299 	 * Some consumers (umem_walk_step(), in particular) require at
300 	 * least one callback if there are any buffers in the cache.  So
301 	 * if there are *no* partial slabs, report the first full slab, if
302 	 * any.
303 	 *
304 	 * Yes, this is ugly, but it's cleaner than the other possibilities.
305 	 */
306 	if (c.cache_partial_slabs.avl_numnodes == 0) {
307 		combined_walk_add(wsp, kmem_first_complete_slab_walk_init,
308 		    list_walk_step, list_walk_fini);
309 	} else {
310 		combined_walk_add(wsp, kmem_partial_slab_walk_init,
311 		    avl_walk_step, avl_walk_fini);
312 	}
313 
314 	return (WALK_NEXT);
315 }
316 
317 int
318 kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv)
319 {
320 	kmem_cache_t c;
321 	const char *filter = NULL;
322 
323 	if (mdb_getopts(ac, argv,
324 	    'n', MDB_OPT_STR, &filter,
325 	    NULL) != ac) {
326 		return (DCMD_USAGE);
327 	}
328 
329 	if (!(flags & DCMD_ADDRSPEC)) {
330 		if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) {
331 			mdb_warn("can't walk kmem_cache");
332 			return (DCMD_ERR);
333 		}
334 		return (DCMD_OK);
335 	}
336 
337 	if (DCMD_HDRSPEC(flags))
338 		mdb_printf("%-?s %-25s %4s %6s %8s %8s\n", "ADDR", "NAME",
339 		    "FLAG", "CFLAG", "BUFSIZE", "BUFTOTL");
340 
341 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
342 		mdb_warn("couldn't read kmem_cache at %p", addr);
343 		return (DCMD_ERR);
344 	}
345 
346 	if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL))
347 		return (DCMD_OK);
348 
349 	mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name,
350 	    c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal);
351 
352 	return (DCMD_OK);
353 }
354 
355 void
356 kmem_cache_help(void)
357 {
358 	mdb_printf("%s", "Print kernel memory caches.\n\n");
359 	mdb_dec_indent(2);
360 	mdb_printf("%<b>OPTIONS%</b>\n");
361 	mdb_inc_indent(2);
362 	mdb_printf("%s",
363 "  -n name\n"
364 "        name of kmem cache (or matching partial name)\n"
365 "\n"
366 "Column\tDescription\n"
367 "\n"
368 "ADDR\t\taddress of kmem cache\n"
369 "NAME\t\tname of kmem cache\n"
370 "FLAG\t\tvarious cache state flags\n"
371 "CFLAG\t\tcache creation flags\n"
372 "BUFSIZE\tobject size in bytes\n"
373 "BUFTOTL\tcurrent total buffers in cache (allocated and free)\n");
374 }
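/*
 * Example invocations (illustrative only; output depends on the target):
 * "::kmem_cache" summarizes every cache, "::kmem_cache -n kmem_alloc"
 * limits output to caches whose name contains "kmem_alloc", and
 * "<cache addr>::kmem_cache" reports a single cache by address.
 */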
375 
376 #define	LABEL_WIDTH	11
377 static void
378 kmem_slabs_print_dist(uint_t *ks_bucket, size_t buffers_per_slab,
379     size_t maxbuckets, size_t minbucketsize)
380 {
381 	uint64_t total;
382 	int buckets;
383 	int i;
384 	const int *distarray;
385 	int complete[2];
386 
387 	buckets = buffers_per_slab;
388 
389 	total = 0;
390 	for (i = 0; i <= buffers_per_slab; i++)
391 		total += ks_bucket[i];
392 
393 	if (maxbuckets > 1)
394 		buckets = MIN(buckets, maxbuckets);
395 
396 	if (minbucketsize > 1) {
397 		/*
398 		 * minbucketsize does not apply to the first bucket reserved
399 		 * for completely allocated slabs
400 		 */
401 		buckets = MIN(buckets, 1 + ((buffers_per_slab - 1) /
402 		    minbucketsize));
403 		if ((buckets < 2) && (buffers_per_slab > 1)) {
404 			buckets = 2;
405 			minbucketsize = (buffers_per_slab - 1);
406 		}
407 	}
408 
409 	/*
410 	 * The first printed bucket is reserved for completely allocated slabs.
411 	 * Passing (buckets - 1) excludes that bucket from the generated
412 	 * distribution, since we're handling it as a special case.
413 	 */
414 	complete[0] = buffers_per_slab;
415 	complete[1] = buffers_per_slab + 1;
416 	distarray = dist_linear(buckets - 1, 1, buffers_per_slab - 1);
417 
418 	mdb_printf("%*s\n", LABEL_WIDTH, "Allocated");
419 	dist_print_header("Buffers", LABEL_WIDTH, "Slabs");
420 
421 	dist_print_bucket(complete, 0, ks_bucket, total, LABEL_WIDTH);
422 	/*
423 	 * Print bucket ranges in descending order after the first bucket for
424 	 * completely allocated slabs, so a person can see immediately whether
425 	 * or not there is fragmentation without having to scan possibly
426 	 * multiple screens of output. Starting at (buckets - 2) excludes the
427 	 * extra terminating bucket.
428 	 */
429 	for (i = buckets - 2; i >= 0; i--) {
430 		dist_print_bucket(distarray, i, ks_bucket, total, LABEL_WIDTH);
431 	}
432 	mdb_printf("\n");
433 }
434 #undef LABEL_WIDTH
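/*
 * Worked example of the bucket clamping above (numbers are hypothetical):
 * with buffers_per_slab = 126 and minbucketsize = 25, buckets is clamped to
 * MIN(buckets, 1 + (126 - 1) / 25) = 6, i.e. one bucket reserved for
 * completely allocated slabs plus five ranges of at least 25 buffers each.
 */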
435 
436 /*ARGSUSED*/
437 static int
438 kmem_first_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab)
439 {
440 	*is_slab = B_TRUE;
441 	return (WALK_DONE);
442 }
443 
444 /*ARGSUSED*/
445 static int
446 kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp,
447     boolean_t *is_slab)
448 {
449 	/*
450 	 * The "kmem_partial_slab" walker reports the first full slab if there
451 	 * are no partial slabs (for the sake of consumers that require at least
452 	 * one callback if there are any buffers in the cache).
453 	 */
454 	*is_slab = KMEM_SLAB_IS_PARTIAL(sp);
455 	return (WALK_DONE);
456 }
457 
458 typedef struct kmem_slab_usage {
459 	int ksu_refcnt;			/* count of allocated buffers on slab */
460 	boolean_t ksu_nomove;		/* slab marked non-reclaimable */
461 } kmem_slab_usage_t;
462 
463 typedef struct kmem_slab_stats {
464 	const kmem_cache_t *ks_cp;
465 	int ks_slabs;			/* slabs in cache */
466 	int ks_partial_slabs;		/* partially allocated slabs in cache */
467 	uint64_t ks_unused_buffers;	/* total unused buffers in cache */
468 	int ks_max_buffers_per_slab;	/* max buffers per slab */
469 	int ks_usage_len;		/* ks_usage array length */
470 	kmem_slab_usage_t *ks_usage;	/* partial slab usage */
471 	uint_t *ks_bucket;		/* slab usage distribution */
472 } kmem_slab_stats_t;
473 
474 /*ARGSUSED*/
475 static int
476 kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp,
477     kmem_slab_stats_t *ks)
478 {
479 	kmem_slab_usage_t *ksu;
480 	long unused;
481 
482 	ks->ks_slabs++;
483 	ks->ks_bucket[sp->slab_refcnt]++;
484 
485 	unused = (sp->slab_chunks - sp->slab_refcnt);
486 	if (unused == 0) {
487 		return (WALK_NEXT);
488 	}
489 
490 	ks->ks_partial_slabs++;
491 	ks->ks_unused_buffers += unused;
492 
493 	if (ks->ks_partial_slabs > ks->ks_usage_len) {
494 		kmem_slab_usage_t *usage;
495 		int len = ks->ks_usage_len;
496 
497 		len = (len == 0 ? 16 : len * 2);
498 		usage = mdb_zalloc(len * sizeof (kmem_slab_usage_t), UM_SLEEP);
499 		if (ks->ks_usage != NULL) {
500 			bcopy(ks->ks_usage, usage,
501 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
502 			mdb_free(ks->ks_usage,
503 			    ks->ks_usage_len * sizeof (kmem_slab_usage_t));
504 		}
505 		ks->ks_usage = usage;
506 		ks->ks_usage_len = len;
507 	}
508 
509 	ksu = &ks->ks_usage[ks->ks_partial_slabs - 1];
510 	ksu->ksu_refcnt = sp->slab_refcnt;
511 	ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
512 	return (WALK_NEXT);
513 }
514 
515 static void
516 kmem_slabs_header()
517 {
518 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
519 	    "", "", "Partial", "", "Unused", "");
520 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
521 	    "Cache Name", "Slabs", "Slabs", "Buffers", "Buffers", "Waste");
522 	mdb_printf("%-25s %8s %8s %9s %9s %6s\n",
523 	    "-------------------------", "--------", "--------", "---------",
524 	    "---------", "------");
525 }
526 
527 int
528 kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
529 {
530 	kmem_cache_t c;
531 	kmem_slab_stats_t stats;
532 	mdb_walk_cb_t cb;
533 	int pct;
534 	int tenths_pct;
535 	size_t maxbuckets = 1;
536 	size_t minbucketsize = 0;
537 	const char *filter = NULL;
538 	const char *name = NULL;
539 	uint_t opt_v = FALSE;
540 	boolean_t buckets = B_FALSE;
541 	boolean_t skip = B_FALSE;
542 
543 	if (mdb_getopts(argc, argv,
544 	    'B', MDB_OPT_UINTPTR, &minbucketsize,
545 	    'b', MDB_OPT_UINTPTR, &maxbuckets,
546 	    'n', MDB_OPT_STR, &filter,
547 	    'N', MDB_OPT_STR, &name,
548 	    'v', MDB_OPT_SETBITS, TRUE, &opt_v,
549 	    NULL) != argc) {
550 		return (DCMD_USAGE);
551 	}
552 
553 	if ((maxbuckets != 1) || (minbucketsize != 0)) {
554 		buckets = B_TRUE;
555 	}
556 
557 	if (!(flags & DCMD_ADDRSPEC)) {
558 		if (mdb_walk_dcmd("kmem_cache", "kmem_slabs", argc,
559 		    argv) == -1) {
560 			mdb_warn("can't walk kmem_cache");
561 			return (DCMD_ERR);
562 		}
563 		return (DCMD_OK);
564 	}
565 
566 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
567 		mdb_warn("couldn't read kmem_cache at %p", addr);
568 		return (DCMD_ERR);
569 	}
570 
571 	if (name == NULL) {
572 		skip = ((filter != NULL) &&
573 		    (strstr(c.cache_name, filter) == NULL));
574 	} else if (filter == NULL) {
575 		skip = (strcmp(c.cache_name, name) != 0);
576 	} else {
577 		/* match either -n or -N */
578 		skip = ((strcmp(c.cache_name, name) != 0) &&
579 		    (strstr(c.cache_name, filter) == NULL));
580 	}
581 
582 	if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) {
583 		kmem_slabs_header();
584 	} else if ((opt_v || buckets) && !skip) {
585 		if (DCMD_HDRSPEC(flags)) {
586 			kmem_slabs_header();
587 		} else {
588 			boolean_t is_slab = B_FALSE;
589 			const char *walker_name;
590 			if (opt_v) {
591 				cb = (mdb_walk_cb_t)kmem_first_partial_slab;
592 				walker_name = "kmem_slab_partial";
593 			} else {
594 				cb = (mdb_walk_cb_t)kmem_first_slab;
595 				walker_name = "kmem_slab";
596 			}
597 			(void) mdb_pwalk(walker_name, cb, &is_slab, addr);
598 			if (is_slab) {
599 				kmem_slabs_header();
600 			}
601 		}
602 	}
603 
604 	if (skip) {
605 		return (DCMD_OK);
606 	}
607 
608 	bzero(&stats, sizeof (kmem_slab_stats_t));
609 	stats.ks_cp = &c;
610 	stats.ks_max_buffers_per_slab = c.cache_maxchunks;
611 	/* +1 to include a zero bucket */
612 	stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) *
613 	    sizeof (*stats.ks_bucket), UM_SLEEP);
614 	cb = (mdb_walk_cb_t)kmem_slablist_stat;
615 	(void) mdb_pwalk("kmem_slab", cb, &stats, addr);
616 
617 	if (c.cache_buftotal == 0) {
618 		pct = 0;
619 		tenths_pct = 0;
620 	} else {
621 		uint64_t n = stats.ks_unused_buffers * 10000;
622 		pct = (int)(n / c.cache_buftotal);
623 		tenths_pct = pct - ((pct / 100) * 100);
624 		tenths_pct = (tenths_pct + 5) / 10; /* round nearest tenth */
625 		if (tenths_pct == 10) {
626 			pct += 100;
627 			tenths_pct = 0;
628 		}
629 	}
630 
631 	pct /= 100;
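	/*
	 * Worked example of the fixed-point math above (hypothetical values):
	 * 300 unused buffers out of 4096 total gives n = 3000000,
	 * pct = 732 (hundredths of a percent), tenths_pct = (32 + 5) / 10 = 3,
	 * and pct / 100 = 7, so the Waste column reads "7.3%".
	 */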
632 	mdb_printf("%-25s %8d %8d %9lld %9lld %3d.%1d%%\n", c.cache_name,
633 	    stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal,
634 	    stats.ks_unused_buffers, pct, tenths_pct);
635 
636 	if (maxbuckets == 0) {
637 		maxbuckets = stats.ks_max_buffers_per_slab;
638 	}
639 
640 	if (((maxbuckets > 1) || (minbucketsize > 0)) &&
641 	    (stats.ks_slabs > 0)) {
642 		mdb_printf("\n");
643 		kmem_slabs_print_dist(stats.ks_bucket,
644 		    stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize);
645 	}
646 
647 	mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) *
648 	    sizeof (*stats.ks_bucket));
649 
650 	if (!opt_v) {
651 		return (DCMD_OK);
652 	}
653 
654 	if (opt_v && (stats.ks_partial_slabs > 0)) {
655 		int i;
656 		kmem_slab_usage_t *ksu;
657 
658 		mdb_printf("  %d complete (%d), %d partial:",
659 		    (stats.ks_slabs - stats.ks_partial_slabs),
660 		    stats.ks_max_buffers_per_slab,
661 		    stats.ks_partial_slabs);
662 
663 		for (i = 0; i < stats.ks_partial_slabs; i++) {
664 			ksu = &stats.ks_usage[i];
665 			mdb_printf(" %d%s", ksu->ksu_refcnt,
666 			    (ksu->ksu_nomove ? "*" : ""));
667 		}
668 		mdb_printf("\n\n");
669 	}
670 
671 	if (stats.ks_usage_len > 0) {
672 		mdb_free(stats.ks_usage,
673 		    stats.ks_usage_len * sizeof (kmem_slab_usage_t));
674 	}
675 
676 	return (DCMD_OK);
677 }
678 
679 void
680 kmem_slabs_help(void)
681 {
682 	mdb_printf("%s",
683 "Display slab usage per kmem cache.\n\n");
684 	mdb_dec_indent(2);
685 	mdb_printf("%<b>OPTIONS%</b>\n");
686 	mdb_inc_indent(2);
687 	mdb_printf("%s",
688 "  -n name\n"
689 "        name of kmem cache (or matching partial name)\n"
690 "  -N name\n"
691 "        exact name of kmem cache\n"
692 "  -b maxbins\n"
693 "        Print a distribution of allocated buffers per slab using at\n"
694 "        most maxbins bins. The first bin is reserved for completely\n"
695 "        allocated slabs. Setting maxbins to zero (-b 0) has the same\n"
696 "        effect as specifying the maximum allocated buffers per slab\n"
697 "        or setting minbinsize to 1 (-B 1).\n"
698 "  -B minbinsize\n"
699 "        Print a distribution of allocated buffers per slab, making\n"
700 "        all bins (except the first, reserved for completely allocated\n"
701 "        slabs) at least minbinsize buffers apart.\n"
702 "  -v    verbose output: List the allocated buffer count of each partial\n"
703 "        slab on the free list in order from front to back to show how\n"
704 "        closely the slabs are ordered by usage. For example\n"
705 "\n"
706 "          10 complete, 3 partial (8): 7 3 1\n"
707 "\n"
708 "        means there are thirteen slabs with eight buffers each, including\n"
709 "        three partially allocated slabs with less than all eight buffers\n"
710 "        allocated.\n"
711 "\n"
712 "        Buffer allocations are always from the front of the partial slab\n"
713 "        list. When a buffer is freed from a completely used slab, that\n"
714 "        slab is added to the front of the partial slab list. Assuming\n"
715 "        that all buffers are equally likely to be freed soon, the\n"
716 "        desired order of partial slabs is most-used at the front of the\n"
717 "        list and least-used at the back (as in the example above).\n"
718 "        However, if a slab contains an allocated buffer that will not\n"
719 "        soon be freed, it would be better for that slab to be at the\n"
720 "        front where all of its buffers can be allocated. Taking a slab\n"
721 "        off the partial slab list (either with all buffers freed or all\n"
722 "        buffers allocated) reduces cache fragmentation.\n"
723 "\n"
724 "        A slab's allocated buffer count representing a partial slab (9 in\n"
725 "        the example below) may be marked as follows:\n"
726 "\n"
727 "        9*   An asterisk indicates that kmem has marked the slab non-\n"
728 "        reclaimable because the kmem client refused to move one of the\n"
729 "        slab's buffers. Since kmem does not expect to completely free the\n"
730 "        slab, it moves it to the front of the list in the hope of\n"
731 "        completely allocating it instead. A slab marked with an asterisk\n"
732 "        stays marked for as long as it remains on the partial slab list.\n"
733 "\n"
734 "Column\t\tDescription\n"
735 "\n"
736 "Cache Name\t\tname of kmem cache\n"
737 "Slabs\t\t\ttotal slab count\n"
738 "Partial Slabs\t\tcount of partially allocated slabs on the free list\n"
739 "Buffers\t\ttotal buffer count (Slabs * (buffers per slab))\n"
740 "Unused Buffers\tcount of unallocated buffers across all partial slabs\n"
741 "Waste\t\t\t(Unused Buffers / Buffers) does not include space\n"
742 "\t\t\t  for accounting structures (debug mode), slab\n"
743 "\t\t\t  coloring (incremental small offsets to stagger\n"
744 "\t\t\t  buffer alignment), or the per-CPU magazine layer\n");
745 }
746 
747 static int
748 addrcmp(const void *lhs, const void *rhs)
749 {
750 	uintptr_t p1 = *((uintptr_t *)lhs);
751 	uintptr_t p2 = *((uintptr_t *)rhs);
752 
753 	if (p1 < p2)
754 		return (-1);
755 	if (p1 > p2)
756 		return (1);
757 	return (0);
758 }
759 
760 static int
761 bufctlcmp(const kmem_bufctl_audit_t **lhs, const kmem_bufctl_audit_t **rhs)
762 {
763 	const kmem_bufctl_audit_t *bcp1 = *lhs;
764 	const kmem_bufctl_audit_t *bcp2 = *rhs;
765 
766 	if (bcp1->bc_timestamp > bcp2->bc_timestamp)
767 		return (-1);
768 
769 	if (bcp1->bc_timestamp < bcp2->bc_timestamp)
770 		return (1);
771 
772 	return (0);
773 }
774 
775 typedef struct kmem_hash_walk {
776 	uintptr_t *kmhw_table;
777 	size_t kmhw_nelems;
778 	size_t kmhw_pos;
779 	kmem_bufctl_t kmhw_cur;
780 } kmem_hash_walk_t;
781 
782 int
783 kmem_hash_walk_init(mdb_walk_state_t *wsp)
784 {
785 	kmem_hash_walk_t *kmhw;
786 	uintptr_t *hash;
787 	kmem_cache_t c;
788 	uintptr_t haddr, addr = wsp->walk_addr;
789 	size_t nelems;
790 	size_t hsize;
791 
792 	if (addr == 0) {
793 		mdb_warn("kmem_hash doesn't support global walks\n");
794 		return (WALK_ERR);
795 	}
796 
797 	if (mdb_vread(&c, sizeof (c), addr) == -1) {
798 		mdb_warn("couldn't read cache at addr %p", addr);
799 		return (WALK_ERR);
800 	}
801 
802 	if (!(c.cache_flags & KMF_HASH)) {
803 		mdb_warn("cache %p doesn't have a hash table\n", addr);
804 		return (WALK_DONE);		/* nothing to do */
805 	}
806 
807 	kmhw = mdb_zalloc(sizeof (kmem_hash_walk_t), UM_SLEEP);
808 	kmhw->kmhw_cur.bc_next = NULL;
809 	kmhw->kmhw_pos = 0;
810 
811 	kmhw->kmhw_nelems = nelems = c.cache_hash_mask + 1;
812 	hsize = nelems * sizeof (uintptr_t);
813 	haddr = (uintptr_t)c.cache_hash_table;
814 
815 	kmhw->kmhw_table = hash = mdb_alloc(hsize, UM_SLEEP);
816 	if (mdb_vread(hash, hsize, haddr) == -1) {
817 		mdb_warn("failed to read hash table at %p", haddr);
818 		mdb_free(hash, hsize);
819 		mdb_free(kmhw, sizeof (kmem_hash_walk_t));
820 		return (WALK_ERR);
821 	}
822 
823 	wsp->walk_data = kmhw;
824 
825 	return (WALK_NEXT);
826 }
827 
828 int
829 kmem_hash_walk_step(mdb_walk_state_t *wsp)
830 {
831 	kmem_hash_walk_t *kmhw = wsp->walk_data;
832 	uintptr_t addr = 0;
833 
834 	if ((addr = (uintptr_t)kmhw->kmhw_cur.bc_next) == 0) {
835 		while (kmhw->kmhw_pos < kmhw->kmhw_nelems) {
836 			if ((addr = kmhw->kmhw_table[kmhw->kmhw_pos++]) != 0)
837 				break;
838 		}
839 	}
840 	if (addr == 0)
841 		return (WALK_DONE);
842 
843 	if (mdb_vread(&kmhw->kmhw_cur, sizeof (kmem_bufctl_t), addr) == -1) {
844 		mdb_warn("couldn't read kmem_bufctl_t at addr %p", addr);
845 		return (WALK_ERR);
846 	}
847 
848 	return (wsp->walk_callback(addr, &kmhw->kmhw_cur, wsp->walk_cbdata));
849 }
850 
851 void
852 kmem_hash_walk_fini(mdb_walk_state_t *wsp)
853 {
854 	kmem_hash_walk_t *kmhw = wsp->walk_data;
855 
856 	if (kmhw == NULL)
857 		return;
858 
859 	mdb_free(kmhw->kmhw_table, kmhw->kmhw_nelems * sizeof (uintptr_t));
860 	mdb_free(kmhw, sizeof (kmem_hash_walk_t));
861 }
862 
863 /*
864  * Find the address of the bufctl structure for the address 'buf' in cache
865  * 'cp', which is at address caddr, and place it in *out.
866  */
867 static int
868 kmem_hash_lookup(kmem_cache_t *cp, uintptr_t caddr, void *buf, uintptr_t *out)
869 {
870 	uintptr_t bucket = (uintptr_t)KMEM_HASH(cp, buf);
871 	kmem_bufctl_t *bcp;
872 	kmem_bufctl_t bc;
873 
874 	if (mdb_vread(&bcp, sizeof (kmem_bufctl_t *), bucket) == -1) {
875 		mdb_warn("unable to read hash bucket for %p in cache %p",
876 		    buf, caddr);
877 		return (-1);
878 	}
879 
880 	while (bcp != NULL) {
881 		if (mdb_vread(&bc, sizeof (kmem_bufctl_t),
882 		    (uintptr_t)bcp) == -1) {
883 			mdb_warn("unable to read bufctl at %p", bcp);
884 			return (-1);
885 		}
886 		if (bc.bc_addr == buf) {
887 			*out = (uintptr_t)bcp;
888 			return (0);
889 		}
890 		bcp = bc.bc_next;
891 	}
892 
893 	mdb_warn("unable to find bufctl for %p in cache %p\n", buf, caddr);
894 	return (-1);
895 }
896 
897 int
898 kmem_get_magsize(const kmem_cache_t *cp)
899 {
900 	uintptr_t addr = (uintptr_t)cp->cache_magtype;
901 	GElf_Sym mt_sym;
902 	kmem_magtype_t mt;
903 	int res;
904 
905 	/*
906 	 * if cpu 0 has a non-zero magsize, it must be correct.  caches
907 	 * with KMF_NOMAGAZINE have disabled their magazine layers, so
908 	 * it is okay to return 0 for them.
909 	 */
910 	if ((res = cp->cache_cpu[0].cc_magsize) != 0 ||
911 	    (cp->cache_flags & KMF_NOMAGAZINE))
912 		return (res);
913 
914 	if (mdb_lookup_by_name("kmem_magtype", &mt_sym) == -1) {
915 		mdb_warn("unable to read 'kmem_magtype'");
916 	} else if (addr < mt_sym.st_value ||
917 	    addr + sizeof (mt) - 1 > mt_sym.st_value + mt_sym.st_size - 1 ||
918 	    ((addr - mt_sym.st_value) % sizeof (mt)) != 0) {
919 		mdb_warn("cache '%s' has invalid magtype pointer (%p)\n",
920 		    cp->cache_name, addr);
921 		return (0);
922 	}
923 	if (mdb_vread(&mt, sizeof (mt), addr) == -1) {
924 		mdb_warn("unable to read magtype at %a", addr);
925 		return (0);
926 	}
927 	return (mt.mt_magsize);
928 }
929 
930 /*ARGSUSED*/
931 static int
932 kmem_estimate_slab(uintptr_t addr, const kmem_slab_t *sp, size_t *est)
933 {
934 	*est -= (sp->slab_chunks - sp->slab_refcnt);
935 
936 	return (WALK_NEXT);
937 }
938 
939 /*
940  * Returns an upper bound on the number of allocated buffers in a given
941  * cache.
942  */
943 size_t
944 kmem_estimate_allocated(uintptr_t addr, const kmem_cache_t *cp)
945 {
946 	int magsize;
947 	size_t cache_est;
948 
949 	cache_est = cp->cache_buftotal;
950 
951 	(void) mdb_pwalk("kmem_slab_partial",
952 	    (mdb_walk_cb_t)kmem_estimate_slab, &cache_est, addr);
953 
954 	if ((magsize = kmem_get_magsize(cp)) != 0) {
955 		size_t mag_est = cp->cache_full.ml_total * magsize;
956 
957 		if (cache_est >= mag_est) {
958 			cache_est -= mag_est;
959 		} else {
960 			mdb_warn("cache %p's magazine layer holds more buffers "
961 			    "than the slab layer.\n", addr);
962 		}
963 	}
964 	return (cache_est);
965 }
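/*
 * Illustrative arithmetic (hypothetical numbers): for a cache with
 * cache_buftotal = 1000, partial slabs holding 120 unallocated chunks, and
 * 5 full depot magazines of 15 rounds each, the estimate above is
 * 1000 - 120 - (5 * 15) = 805 allocated buffers.  It remains an upper bound
 * because buffers sitting in loaded per-CPU magazines are not subtracted.
 */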
966 
967 #define	READMAG_ROUNDS(rounds) { \
968 	if (mdb_vread(mp, magbsize, (uintptr_t)kmp) == -1) { \
969 		mdb_warn("couldn't read magazine at %p", kmp); \
970 		goto fail; \
971 	} \
972 	for (i = 0; i < rounds; i++) { \
973 		maglist[magcnt++] = mp->mag_round[i]; \
974 		if (magcnt == magmax) { \
975 			mdb_warn("%d magazines exceeds fudge factor\n", \
976 			    magcnt); \
977 			goto fail; \
978 		} \
979 	} \
980 }
981 
982 int
983 kmem_read_magazines(kmem_cache_t *cp, uintptr_t addr, int ncpus,
984     void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
985 {
986 	kmem_magazine_t *kmp, *mp;
987 	void **maglist = NULL;
988 	int i, cpu;
989 	size_t magsize, magmax, magbsize;
990 	size_t magcnt = 0;
991 
992 	/*
993 	 * Read the magtype out of the cache, after verifying the pointer's
994 	 * correctness.
995 	 */
996 	magsize = kmem_get_magsize(cp);
997 	if (magsize == 0) {
998 		*maglistp = NULL;
999 		*magcntp = 0;
1000 		*magmaxp = 0;
1001 		return (WALK_NEXT);
1002 	}
1003 
1004 	/*
1005 	 * There are several places where we need to go buffer hunting:
1006 	 * the per-CPU loaded magazine, the per-CPU spare full magazine,
1007 	 * and the full magazine list in the depot.
1008 	 *
1009 	 * For an upper bound on the number of buffers in the magazine
1010 	 * layer, we have the number of magazines on the cache_full
1011 	 * list plus at most two magazines per CPU (the loaded and the
1012 	 * spare).  Toss in 100 magazines as a fudge factor in case this
1013 	 * is live (the number "100" comes from the same fudge factor in
1014 	 * crash(8)).
1015 	 */
1016 	magmax = (cp->cache_full.ml_total + 2 * ncpus + 100) * magsize;
1017 	magbsize = offsetof(kmem_magazine_t, mag_round[magsize]);
1018 
1019 	if (magbsize >= PAGESIZE / 2) {
1020 		mdb_warn("magazine size for cache %p unreasonable (%x)\n",
1021 		    addr, magbsize);
1022 		return (WALK_ERR);
1023 	}
1024 
1025 	maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
1026 	mp = mdb_alloc(magbsize, alloc_flags);
1027 	if (mp == NULL || maglist == NULL)
1028 		goto fail;
1029 
1030 	/*
1031 	 * First up: the magazines in the depot (i.e. on the cache_full list).
1032 	 */
1033 	for (kmp = cp->cache_full.ml_list; kmp != NULL; ) {
1034 		READMAG_ROUNDS(magsize);
1035 		kmp = mp->mag_next;
1036 
1037 		if (kmp == cp->cache_full.ml_list)
1038 			break; /* cache_full list loop detected */
1039 	}
1040 
1041 	dprintf(("cache_full list done\n"));
1042 
1043 	/*
1044 	 * Now whip through the CPUs, snagging the loaded magazines
1045 	 * and full spares.
1046 	 *
1047 	 * In order to prevent inconsistent dumps, rounds and prounds
1048 	 * are copied aside before dumping begins.
1049 	 */
1050 	for (cpu = 0; cpu < ncpus; cpu++) {
1051 		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu];
1052 		short rounds, prounds;
1053 
1054 		if (KMEM_DUMPCC(ccp)) {
1055 			rounds = ccp->cc_dump_rounds;
1056 			prounds = ccp->cc_dump_prounds;
1057 		} else {
1058 			rounds = ccp->cc_rounds;
1059 			prounds = ccp->cc_prounds;
1060 		}
1061 
1062 		dprintf(("reading cpu cache %p\n",
1063 		    (uintptr_t)ccp - (uintptr_t)cp + addr));
1064 
1065 		if (rounds > 0 &&
1066 		    (kmp = ccp->cc_loaded) != NULL) {
1067 			dprintf(("reading %d loaded rounds\n", rounds));
1068 			READMAG_ROUNDS(rounds);
1069 		}
1070 
1071 		if (prounds > 0 &&
1072 		    (kmp = ccp->cc_ploaded) != NULL) {
1073 			dprintf(("reading %d previously loaded rounds\n",
1074 			    prounds));
1075 			READMAG_ROUNDS(prounds);
1076 		}
1077 	}
1078 
1079 	dprintf(("magazine layer: %d buffers\n", magcnt));
1080 
1081 	if (!(alloc_flags & UM_GC))
1082 		mdb_free(mp, magbsize);
1083 
1084 	*maglistp = maglist;
1085 	*magcntp = magcnt;
1086 	*magmaxp = magmax;
1087 
1088 	return (WALK_NEXT);
1089 
1090 fail:
1091 	if (!(alloc_flags & UM_GC)) {
1092 		if (mp)
1093 			mdb_free(mp, magbsize);
1094 		if (maglist)
1095 			mdb_free(maglist, magmax * sizeof (void *));
1096 	}
1097 	return (WALK_ERR);
1098 }
1099 
1100 static int
1101 kmem_walk_callback(mdb_walk_state_t *wsp, uintptr_t buf)
1102 {
1103 	return (wsp->walk_callback(buf, NULL, wsp->walk_cbdata));
1104 }
1105 
1106 static int
1107 bufctl_walk_callback(kmem_cache_t *cp, mdb_walk_state_t *wsp, uintptr_t buf)
1108 {
1109 	kmem_bufctl_audit_t b;
1110 
1111 	/*
1112 	 * if KMF_AUDIT is not set, we know that we're looking at a
1113 	 * kmem_bufctl_t.
1114 	 */
1115 	if (!(cp->cache_flags & KMF_AUDIT) ||
1116 	    mdb_vread(&b, sizeof (kmem_bufctl_audit_t), buf) == -1) {
1117 		(void) memset(&b, 0, sizeof (b));
1118 		if (mdb_vread(&b, sizeof (kmem_bufctl_t), buf) == -1) {
1119 			mdb_warn("unable to read bufctl at %p", buf);
1120 			return (WALK_ERR);
1121 		}
1122 	}
1123 
1124 	return (wsp->walk_callback(buf, &b, wsp->walk_cbdata));
1125 }
1126 
1127 typedef struct kmem_walk {
1128 	int kmw_type;
1129 
1130 	uintptr_t kmw_addr;		/* cache address */
1131 	kmem_cache_t *kmw_cp;
1132 	size_t kmw_csize;
1133 
1134 	/*
1135 	 * magazine layer
1136 	 */
1137 	void **kmw_maglist;
1138 	size_t kmw_max;
1139 	size_t kmw_count;
1140 	size_t kmw_pos;
1141 
1142 	/*
1143 	 * slab layer
1144 	 */
1145 	char *kmw_valid;	/* to keep track of freed buffers */
1146 	char *kmw_ubase;	/* buffer for slab data */
1147 } kmem_walk_t;
1148 
1149 static int
1150 kmem_walk_init_common(mdb_walk_state_t *wsp, int type)
1151 {
1152 	kmem_walk_t *kmw;
1153 	int ncpus, csize;
1154 	kmem_cache_t *cp;
1155 	size_t vm_quantum;
1156 
1157 	size_t magmax, magcnt;
1158 	void **maglist = NULL;
1159 	uint_t chunksize = 1, slabsize = 1;
1160 	int status = WALK_ERR;
1161 	uintptr_t addr = wsp->walk_addr;
1162 	const char *layered;
1163 
1164 	type &= ~KM_HASH;
1165 
1166 	if (addr == 0) {
1167 		mdb_warn("kmem walk doesn't support global walks\n");
1168 		return (WALK_ERR);
1169 	}
1170 
1171 	dprintf(("walking %p\n", addr));
1172 
1173 	/*
1174 	 * First we need to figure out how many CPUs are configured in the
1175 	 * system to know how much to slurp out.
1176 	 */
1177 	mdb_readvar(&ncpus, "max_ncpus");
1178 
1179 	csize = KMEM_CACHE_SIZE(ncpus);
1180 	cp = mdb_alloc(csize, UM_SLEEP);
1181 
1182 	if (mdb_vread(cp, csize, addr) == -1) {
1183 		mdb_warn("couldn't read cache at addr %p", addr);
1184 		goto out2;
1185 	}
1186 
1187 	/*
1188 	 * It's easy for someone to hand us an invalid cache address.
1189 	 * Unfortunately, it is hard for this walker to survive an
1190 	 * invalid cache cleanly.  So we make sure that:
1191 	 *
1192 	 *	1. the vmem arena for the cache is readable,
1193 	 *	2. the vmem arena's quantum is a power of 2,
1194 	 *	3. our slabsize is a multiple of the quantum, and
1195 	 *	4. our chunksize is >0 and less than our slabsize.
1196 	 */
1197 	if (mdb_vread(&vm_quantum, sizeof (vm_quantum),
1198 	    (uintptr_t)&cp->cache_arena->vm_quantum) == -1 ||
1199 	    vm_quantum == 0 ||
1200 	    (vm_quantum & (vm_quantum - 1)) != 0 ||
1201 	    cp->cache_slabsize < vm_quantum ||
1202 	    P2PHASE(cp->cache_slabsize, vm_quantum) != 0 ||
1203 	    cp->cache_chunksize == 0 ||
1204 	    cp->cache_chunksize > cp->cache_slabsize) {
1205 		mdb_warn("%p is not a valid kmem_cache_t\n", addr);
1206 		goto out2;
1207 	}
1208 
1209 	dprintf(("buf total is %d\n", cp->cache_buftotal));
1210 
1211 	if (cp->cache_buftotal == 0) {
1212 		mdb_free(cp, csize);
1213 		return (WALK_DONE);
1214 	}
1215 
1216 	/*
1217 	 * If they ask for bufctls, but it's a small-slab cache,
1218 	 * there is nothing to report.
1219 	 */
1220 	if ((type & KM_BUFCTL) && !(cp->cache_flags & KMF_HASH)) {
1221 		dprintf(("bufctl requested, not KMF_HASH (flags: %p)\n",
1222 		    cp->cache_flags));
1223 		mdb_free(cp, csize);
1224 		return (WALK_DONE);
1225 	}
1226 
1227 	/*
1228 	 * If they want constructed buffers, but there's no constructor or
1229 	 * the cache has DEADBEEF checking enabled, there is nothing to report.
1230 	 */
1231 	if ((type & KM_CONSTRUCTED) && (!(type & KM_FREE) ||
1232 	    cp->cache_constructor == NULL ||
1233 	    (cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) == KMF_DEADBEEF)) {
1234 		mdb_free(cp, csize);
1235 		return (WALK_DONE);
1236 	}
1237 
1238 	/*
1239 	 * Read in the contents of the magazine layer
1240 	 */
1241 	if (kmem_read_magazines(cp, addr, ncpus, &maglist, &magcnt,
1242 	    &magmax, UM_SLEEP) == WALK_ERR)
1243 		goto out2;
1244 
1245 	/*
1246 	 * We have all of the buffers from the magazines;  if we are walking
1247 	 * allocated buffers, sort them so we can bsearch them later.
1248 	 */
1249 	if (type & KM_ALLOCATED)
1250 		qsort(maglist, magcnt, sizeof (void *), addrcmp);
1251 
1252 	wsp->walk_data = kmw = mdb_zalloc(sizeof (kmem_walk_t), UM_SLEEP);
1253 
1254 	kmw->kmw_type = type;
1255 	kmw->kmw_addr = addr;
1256 	kmw->kmw_cp = cp;
1257 	kmw->kmw_csize = csize;
1258 	kmw->kmw_maglist = maglist;
1259 	kmw->kmw_max = magmax;
1260 	kmw->kmw_count = magcnt;
1261 	kmw->kmw_pos = 0;
1262 
1263 	/*
1264 	 * When walking allocated buffers in a KMF_HASH cache, we walk the
1265 	 * hash table instead of the slab layer.
1266 	 */
1267 	if ((cp->cache_flags & KMF_HASH) && (type & KM_ALLOCATED)) {
1268 		layered = "kmem_hash";
1269 
1270 		kmw->kmw_type |= KM_HASH;
1271 	} else {
1272 		/*
1273 		 * If we are walking freed buffers, we only need the
1274 		 * magazine layer plus the partially allocated slabs.
1275 		 * To walk allocated buffers, we need all of the slabs.
1276 		 */
1277 		if (type & KM_ALLOCATED)
1278 			layered = "kmem_slab";
1279 		else
1280 			layered = "kmem_slab_partial";
1281 
1282 		/*
1283 		 * for small-slab caches, we read in the entire slab.  For
1284 		 * freed buffers, we can just walk the freelist.  For
1285 		 * allocated buffers, we use a 'valid' array to track
1286 		 * the freed buffers.
1287 		 */
1288 		if (!(cp->cache_flags & KMF_HASH)) {
1289 			chunksize = cp->cache_chunksize;
1290 			slabsize = cp->cache_slabsize;
1291 
1292 			kmw->kmw_ubase = mdb_alloc(slabsize +
1293 			    sizeof (kmem_bufctl_t), UM_SLEEP);
1294 
1295 			if (type & KM_ALLOCATED)
1296 				kmw->kmw_valid =
1297 				    mdb_alloc(slabsize / chunksize, UM_SLEEP);
1298 		}
1299 	}
1300 
1301 	status = WALK_NEXT;
1302 
1303 	if (mdb_layered_walk(layered, wsp) == -1) {
1304 		mdb_warn("unable to start layered '%s' walk", layered);
1305 		status = WALK_ERR;
1306 	}
1307 
1308 	if (status == WALK_ERR) {
1309 		if (kmw->kmw_valid)
1310 			mdb_free(kmw->kmw_valid, slabsize / chunksize);
1311 
1312 		if (kmw->kmw_ubase)
1313 			mdb_free(kmw->kmw_ubase, slabsize +
1314 			    sizeof (kmem_bufctl_t));
1315 
1316 		if (kmw->kmw_maglist)
1317 			mdb_free(kmw->kmw_maglist,
1318 			    kmw->kmw_max * sizeof (uintptr_t));
1319 
1320 		mdb_free(kmw, sizeof (kmem_walk_t));
1321 		wsp->walk_data = NULL;
1322 	}
1323 
1324 out2:
1325 	if (status == WALK_ERR)
1326 		mdb_free(cp, csize);
1327 
1328 	return (status);
1329 }
1330 
1331 int
1332 kmem_walk_step(mdb_walk_state_t *wsp)
1333 {
1334 	kmem_walk_t *kmw = wsp->walk_data;
1335 	int type = kmw->kmw_type;
1336 	kmem_cache_t *cp = kmw->kmw_cp;
1337 
1338 	void **maglist = kmw->kmw_maglist;
1339 	int magcnt = kmw->kmw_count;
1340 
1341 	uintptr_t chunksize, slabsize;
1342 	uintptr_t addr;
1343 	const kmem_slab_t *sp;
1344 	const kmem_bufctl_t *bcp;
1345 	kmem_bufctl_t bc;
1346 
1347 	int chunks;
1348 	char *kbase;
1349 	void *buf;
1350 	int i, ret;
1351 
1352 	char *valid, *ubase;
1353 
1354 	/*
1355 	 * first, handle the 'kmem_hash' layered walk case
1356 	 */
1357 	if (type & KM_HASH) {
1358 		/*
1359 		 * We have a buffer which has been allocated out of the
1360 		 * global layer. We need to make sure that it's not
1361 		 * actually sitting in a magazine before we report it as
1362 		 * an allocated buffer.
1363 		 */
1364 		buf = ((const kmem_bufctl_t *)wsp->walk_layer)->bc_addr;
1365 
1366 		if (magcnt > 0 &&
1367 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1368 		    addrcmp) != NULL)
1369 			return (WALK_NEXT);
1370 
1371 		if (type & KM_BUFCTL)
1372 			return (bufctl_walk_callback(cp, wsp, wsp->walk_addr));
1373 
1374 		return (kmem_walk_callback(wsp, (uintptr_t)buf));
1375 	}
1376 
1377 	ret = WALK_NEXT;
1378 
1379 	addr = kmw->kmw_addr;
1380 
1381 	/*
1382 	 * If we're walking freed buffers, report everything in the
1383 	 * magazine layer before processing the first slab.
1384 	 */
1385 	if ((type & KM_FREE) && magcnt != 0) {
1386 		kmw->kmw_count = 0;		/* only do this once */
1387 		for (i = 0; i < magcnt; i++) {
1388 			buf = maglist[i];
1389 
1390 			if (type & KM_BUFCTL) {
1391 				uintptr_t out;
1392 
1393 				if (cp->cache_flags & KMF_BUFTAG) {
1394 					kmem_buftag_t *btp;
1395 					kmem_buftag_t tag;
1396 
1397 					/* LINTED - alignment */
1398 					btp = KMEM_BUFTAG(cp, buf);
1399 					if (mdb_vread(&tag, sizeof (tag),
1400 					    (uintptr_t)btp) == -1) {
1401 						mdb_warn("reading buftag for "
1402 						    "%p at %p", buf, btp);
1403 						continue;
1404 					}
1405 					out = (uintptr_t)tag.bt_bufctl;
1406 				} else {
1407 					if (kmem_hash_lookup(cp, addr, buf,
1408 					    &out) == -1)
1409 						continue;
1410 				}
1411 				ret = bufctl_walk_callback(cp, wsp, out);
1412 			} else {
1413 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1414 			}
1415 
1416 			if (ret != WALK_NEXT)
1417 				return (ret);
1418 		}
1419 	}
1420 
1421 	/*
1422 	 * If they want constructed buffers, we're finished, since the
1423 	 * magazine layer holds them all.
1424 	 */
1425 	if (type & KM_CONSTRUCTED)
1426 		return (WALK_DONE);
1427 
1428 	/*
1429 	 * Handle the buffers in the current slab
1430 	 */
1431 	chunksize = cp->cache_chunksize;
1432 	slabsize = cp->cache_slabsize;
1433 
1434 	sp = wsp->walk_layer;
1435 	chunks = sp->slab_chunks;
1436 	kbase = sp->slab_base;
1437 
1438 	dprintf(("kbase is %p\n", kbase));
1439 
1440 	if (!(cp->cache_flags & KMF_HASH)) {
1441 		valid = kmw->kmw_valid;
1442 		ubase = kmw->kmw_ubase;
1443 
1444 		if (mdb_vread(ubase, chunks * chunksize,
1445 		    (uintptr_t)kbase) == -1) {
1446 			mdb_warn("failed to read slab contents at %p", kbase);
1447 			return (WALK_ERR);
1448 		}
1449 
1450 		/*
1451 		 * Set up the valid map as fully allocated -- we'll punch
1452 		 * out the freelist.
1453 		 */
1454 		if (type & KM_ALLOCATED)
1455 			(void) memset(valid, 1, chunks);
1456 	} else {
1457 		valid = NULL;
1458 		ubase = NULL;
1459 	}
1460 
1461 	/*
1462 	 * walk the slab's freelist
1463 	 */
1464 	bcp = sp->slab_head;
1465 
1466 	dprintf(("refcnt is %d; chunks is %d\n", sp->slab_refcnt, chunks));
1467 
1468 	/*
1469 	 * since we could be in the middle of allocating a buffer,
1470 	 * our refcnt could be one higher than it ought to be.  So we
1471 	 * check one further on the freelist than the count allows.
1472 	 */
1473 	for (i = sp->slab_refcnt; i <= chunks; i++) {
1474 		uint_t ndx;
1475 
1476 		dprintf(("bcp is %p\n", bcp));
1477 
1478 		if (bcp == NULL) {
1479 			if (i == chunks)
1480 				break;
1481 			mdb_warn(
1482 			    "slab %p in cache %p freelist too short by %d\n",
1483 			    sp, addr, chunks - i);
1484 			break;
1485 		}
1486 
1487 		if (cp->cache_flags & KMF_HASH) {
1488 			if (mdb_vread(&bc, sizeof (bc), (uintptr_t)bcp) == -1) {
1489 				mdb_warn("failed to read bufctl ptr at %p",
1490 				    bcp);
1491 				break;
1492 			}
1493 			buf = bc.bc_addr;
1494 		} else {
1495 			/*
1496 			 * Otherwise the buffer is (or should be) in the slab
1497 			 * that we've read in; determine its offset in the
1498 			 * slab, validate that it's not corrupt, and add to
1499 			 * our base address to find the umem_bufctl_t.  (Note
1500 			 * our base address to find the kmem_bufctl_t.  (Note
1501 			 * to our offset calculation because of the slop that's
1502 			 * allocated for the buffer at ubase.)
1503 			 */
1504 			uintptr_t offs = (uintptr_t)bcp - (uintptr_t)kbase;
1505 
1506 			if (offs > chunks * chunksize) {
1507 				mdb_warn("found corrupt bufctl ptr %p"
1508 				    " in slab %p in cache %p\n", bcp,
1509 				    wsp->walk_addr, addr);
1510 				break;
1511 			}
1512 
1513 			bc = *((kmem_bufctl_t *)((uintptr_t)ubase + offs));
1514 			buf = KMEM_BUF(cp, bcp);
1515 		}
1516 
1517 		ndx = ((uintptr_t)buf - (uintptr_t)kbase) / chunksize;
1518 
1519 		if (ndx > slabsize / cp->cache_bufsize) {
1520 			/*
1521 			 * This is very wrong; we have managed to find
1522 			 * a buffer in the slab which shouldn't
1523 			 * actually be here.  Emit a warning, and
1524 			 * try to continue.
1525 			 */
1526 			mdb_warn("buf %p is out of range for "
1527 			    "slab %p, cache %p\n", buf, sp, addr);
1528 		} else if (type & KM_ALLOCATED) {
1529 			/*
1530 			 * we have found a buffer on the slab's freelist;
1531 			 * clear its entry
1532 			 */
1533 			valid[ndx] = 0;
1534 		} else {
1535 			/*
1536 			 * Report this freed buffer
1537 			 */
1538 			if (type & KM_BUFCTL) {
1539 				ret = bufctl_walk_callback(cp, wsp,
1540 				    (uintptr_t)bcp);
1541 			} else {
1542 				ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1543 			}
1544 			if (ret != WALK_NEXT)
1545 				return (ret);
1546 		}
1547 
1548 		bcp = bc.bc_next;
1549 	}
1550 
1551 	if (bcp != NULL) {
1552 		dprintf(("slab %p in cache %p freelist too long (%p)\n",
1553 		    sp, addr, bcp));
1554 	}
1555 
1556 	/*
1557 	 * If we are walking freed buffers, the loop above handled reporting
1558 	 * them.
1559 	 */
1560 	if (type & KM_FREE)
1561 		return (WALK_NEXT);
1562 
1563 	if (type & KM_BUFCTL) {
1564 		mdb_warn("impossible situation: small-slab KM_BUFCTL walk for "
1565 		    "cache %p\n", addr);
1566 		return (WALK_ERR);
1567 	}
1568 
1569 	/*
1570 	 * Report allocated buffers, skipping buffers in the magazine layer.
1571 	 * We only get this far for small-slab caches.
1572 	 */
1573 	for (i = 0; ret == WALK_NEXT && i < chunks; i++) {
1574 		buf = (char *)kbase + i * chunksize;
1575 
1576 		if (!valid[i])
1577 			continue;		/* on slab freelist */
1578 
1579 		if (magcnt > 0 &&
1580 		    bsearch(&buf, maglist, magcnt, sizeof (void *),
1581 		    addrcmp) != NULL)
1582 			continue;		/* in magazine layer */
1583 
1584 		ret = kmem_walk_callback(wsp, (uintptr_t)buf);
1585 	}
1586 	return (ret);
1587 }
1588 
1589 void
1590 kmem_walk_fini(mdb_walk_state_t *wsp)
1591 {
1592 	kmem_walk_t *kmw = wsp->walk_data;
1593 	uintptr_t chunksize;
1594 	uintptr_t slabsize;
1595 
1596 	if (kmw == NULL)
1597 		return;
1598 
1599 	if (kmw->kmw_maglist != NULL)
1600 		mdb_free(kmw->kmw_maglist, kmw->kmw_max * sizeof (void *));
1601 
1602 	chunksize = kmw->kmw_cp->cache_chunksize;
1603 	slabsize = kmw->kmw_cp->cache_slabsize;
1604 
1605 	if (kmw->kmw_valid != NULL)
1606 		mdb_free(kmw->kmw_valid, slabsize / chunksize);
1607 	if (kmw->kmw_ubase != NULL)
1608 		mdb_free(kmw->kmw_ubase, slabsize + sizeof (kmem_bufctl_t));
1609 
1610 	mdb_free(kmw->kmw_cp, kmw->kmw_csize);
1611 	mdb_free(kmw, sizeof (kmem_walk_t));
1612 }
1613 
1614 /*ARGSUSED*/
1615 static int
1616 kmem_walk_all(uintptr_t addr, const kmem_cache_t *c, mdb_walk_state_t *wsp)
1617 {
1618 	/*
1619 	 * Buffers allocated from NOTOUCH caches can also show up as freed
1620 	 * memory in other caches.  This can be a little confusing, so we
1621 	 * don't walk NOTOUCH caches when walking all caches (thereby assuring
1622 	 * that "::walk kmem" and "::walk freemem" yield disjoint output).
1623 	 */
1624 	if (c->cache_cflags & KMC_NOTOUCH)
1625 		return (WALK_NEXT);
1626 
1627 	if (mdb_pwalk(wsp->walk_data, wsp->walk_callback,
1628 	    wsp->walk_cbdata, addr) == -1)
1629 		return (WALK_DONE);
1630 
1631 	return (WALK_NEXT);
1632 }
1633 
1634 #define	KMEM_WALK_ALL(name, wsp) { \
1635 	wsp->walk_data = (name); \
1636 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_walk_all, wsp) == -1) \
1637 		return (WALK_ERR); \
1638 	return (WALK_DONE); \
1639 }
1640 
1641 int
1642 kmem_walk_init(mdb_walk_state_t *wsp)
1643 {
1644 	if (wsp->walk_arg != NULL)
1645 		wsp->walk_addr = (uintptr_t)wsp->walk_arg;
1646 
1647 	if (wsp->walk_addr == 0)
1648 		KMEM_WALK_ALL("kmem", wsp);
1649 	return (kmem_walk_init_common(wsp, KM_ALLOCATED));
1650 }
1651 
1652 int
1653 bufctl_walk_init(mdb_walk_state_t *wsp)
1654 {
1655 	if (wsp->walk_addr == 0)
1656 		KMEM_WALK_ALL("bufctl", wsp);
1657 	return (kmem_walk_init_common(wsp, KM_ALLOCATED | KM_BUFCTL));
1658 }
1659 
1660 int
1661 freemem_walk_init(mdb_walk_state_t *wsp)
1662 {
1663 	if (wsp->walk_addr == 0)
1664 		KMEM_WALK_ALL("freemem", wsp);
1665 	return (kmem_walk_init_common(wsp, KM_FREE));
1666 }
1667 
1668 int
1669 freemem_constructed_walk_init(mdb_walk_state_t *wsp)
1670 {
1671 	if (wsp->walk_addr == 0)
1672 		KMEM_WALK_ALL("freemem_constructed", wsp);
1673 	return (kmem_walk_init_common(wsp, KM_FREE | KM_CONSTRUCTED));
1674 }
1675 
1676 int
1677 freectl_walk_init(mdb_walk_state_t *wsp)
1678 {
1679 	if (wsp->walk_addr == 0)
1680 		KMEM_WALK_ALL("freectl", wsp);
1681 	return (kmem_walk_init_common(wsp, KM_FREE | KM_BUFCTL));
1682 }
1683 
1684 int
1685 freectl_constructed_walk_init(mdb_walk_state_t *wsp)
1686 {
1687 	if (wsp->walk_addr == 0)
1688 		KMEM_WALK_ALL("freectl_constructed", wsp);
1689 	return (kmem_walk_init_common(wsp,
1690 	    KM_FREE | KM_BUFCTL | KM_CONSTRUCTED));
1691 }
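/*
 * Illustrative usage of the walkers defined above (output depends on the
 * target): "::walk kmem" and "::walk freemem" visit allocated and freed
 * buffers across all caches (skipping KMC_NOTOUCH caches), while
 * "<cache addr>::walk bufctl" or "<cache addr>::walk freectl" visit the
 * corresponding bufctls for a single cache.
 */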
1692 
1693 typedef struct bufctl_history_walk {
1694 	void		*bhw_next;
1695 	kmem_cache_t	*bhw_cache;
1696 	kmem_slab_t	*bhw_slab;
1697 	hrtime_t	bhw_timestamp;
1698 } bufctl_history_walk_t;
1699 
1700 int
1701 bufctl_history_walk_init(mdb_walk_state_t *wsp)
1702 {
1703 	bufctl_history_walk_t *bhw;
1704 	kmem_bufctl_audit_t bc;
1705 	kmem_bufctl_audit_t bcn;
1706 
1707 	if (wsp->walk_addr == 0) {
1708 		mdb_warn("bufctl_history walk doesn't support global walks\n");
1709 		return (WALK_ERR);
1710 	}
1711 
1712 	if (mdb_vread(&bc, sizeof (bc), wsp->walk_addr) == -1) {
1713 		mdb_warn("unable to read bufctl at %p", wsp->walk_addr);
1714 		return (WALK_ERR);
1715 	}
1716 
1717 	bhw = mdb_zalloc(sizeof (*bhw), UM_SLEEP);
1718 	bhw->bhw_timestamp = 0;
1719 	bhw->bhw_cache = bc.bc_cache;
1720 	bhw->bhw_slab = bc.bc_slab;
1721 
1722 	/*
1723 	 * sometimes the first log entry matches the base bufctl;  in that
1724 	 * case, skip the base bufctl.
1725 	 */
1726 	if (bc.bc_lastlog != NULL &&
1727 	    mdb_vread(&bcn, sizeof (bcn), (uintptr_t)bc.bc_lastlog) != -1 &&
1728 	    bc.bc_addr == bcn.bc_addr &&
1729 	    bc.bc_cache == bcn.bc_cache &&
1730 	    bc.bc_slab == bcn.bc_slab &&
1731 	    bc.bc_timestamp == bcn.bc_timestamp &&
1732 	    bc.bc_thread == bcn.bc_thread)
1733 		bhw->bhw_next = bc.bc_lastlog;
1734 	else
1735 		bhw->bhw_next = (void *)wsp->walk_addr;
1736 
1737 	wsp->walk_addr = (uintptr_t)bc.bc_addr;
1738 	wsp->walk_data = bhw;
1739 
1740 	return (WALK_NEXT);
1741 }
1742 
1743 int
1744 bufctl_history_walk_step(mdb_walk_state_t *wsp)
1745 {
1746 	bufctl_history_walk_t *bhw = wsp->walk_data;
1747 	uintptr_t addr = (uintptr_t)bhw->bhw_next;
1748 	uintptr_t baseaddr = wsp->walk_addr;
1749 	kmem_bufctl_audit_t bc;
1750 
1751 	if (addr == 0)
1752 		return (WALK_DONE);
1753 
1754 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
1755 		mdb_warn("unable to read bufctl at %p", bhw->bhw_next);
1756 		return (WALK_ERR);
1757 	}
1758 
1759 	/*
1760 	 * The bufctl is only valid if the address, cache, and slab are
1761 	 * correct.  We also check that the timestamp is decreasing, to
1762 	 * prevent infinite loops.
1763 	 */
1764 	if ((uintptr_t)bc.bc_addr != baseaddr ||
1765 	    bc.bc_cache != bhw->bhw_cache ||
1766 	    bc.bc_slab != bhw->bhw_slab ||
1767 	    (bhw->bhw_timestamp != 0 && bc.bc_timestamp >= bhw->bhw_timestamp))
1768 		return (WALK_DONE);
1769 
1770 	bhw->bhw_next = bc.bc_lastlog;
1771 	bhw->bhw_timestamp = bc.bc_timestamp;
1772 
1773 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
1774 }
1775 
1776 void
1777 bufctl_history_walk_fini(mdb_walk_state_t *wsp)
1778 {
1779 	bufctl_history_walk_t *bhw = wsp->walk_data;
1780 
1781 	mdb_free(bhw, sizeof (*bhw));
1782 }
1783 
1784 typedef struct kmem_log_walk {
1785 	kmem_bufctl_audit_t *klw_base;
1786 	kmem_bufctl_audit_t **klw_sorted;
1787 	kmem_log_header_t klw_lh;
1788 	size_t klw_size;
1789 	size_t klw_maxndx;
1790 	size_t klw_ndx;
1791 } kmem_log_walk_t;
1792 
1793 int
1794 kmem_log_walk_init(mdb_walk_state_t *wsp)
1795 {
1796 	uintptr_t lp = wsp->walk_addr;
1797 	kmem_log_walk_t *klw;
1798 	kmem_log_header_t *lhp;
1799 	int maxndx, i, j, k;
1800 
1801 	/*
1802 	 * By default (global walk), walk the kmem_transaction_log.  Otherwise
1803 	 * read the log whose kmem_log_header_t is stored at walk_addr.
1804 	 */
1805 	if (lp == 0 && mdb_readvar(&lp, "kmem_transaction_log") == -1) {
1806 		mdb_warn("failed to read 'kmem_transaction_log'");
1807 		return (WALK_ERR);
1808 	}
1809 
1810 	if (lp == 0) {
1811 		mdb_warn("log is disabled\n");
1812 		return (WALK_ERR);
1813 	}
1814 
1815 	klw = mdb_zalloc(sizeof (kmem_log_walk_t), UM_SLEEP);
1816 	lhp = &klw->klw_lh;
1817 
1818 	if (mdb_vread(lhp, sizeof (kmem_log_header_t), lp) == -1) {
1819 		mdb_warn("failed to read log header at %p", lp);
1820 		mdb_free(klw, sizeof (kmem_log_walk_t));
1821 		return (WALK_ERR);
1822 	}
1823 
1824 	klw->klw_size = lhp->lh_chunksize * lhp->lh_nchunks;
1825 	klw->klw_base = mdb_alloc(klw->klw_size, UM_SLEEP);
1826 	maxndx = lhp->lh_chunksize / sizeof (kmem_bufctl_audit_t) - 1;
1827 
1828 	if (mdb_vread(klw->klw_base, klw->klw_size,
1829 	    (uintptr_t)lhp->lh_base) == -1) {
1830 		mdb_warn("failed to read log at base %p", lhp->lh_base);
1831 		mdb_free(klw->klw_base, klw->klw_size);
1832 		mdb_free(klw, sizeof (kmem_log_walk_t));
1833 		return (WALK_ERR);
1834 	}
1835 
1836 	klw->klw_sorted = mdb_alloc(maxndx * lhp->lh_nchunks *
1837 	    sizeof (kmem_bufctl_audit_t *), UM_SLEEP);
1838 
1839 	for (i = 0, k = 0; i < lhp->lh_nchunks; i++) {
1840 		kmem_bufctl_audit_t *chunk = (kmem_bufctl_audit_t *)
1841 		    ((uintptr_t)klw->klw_base + i * lhp->lh_chunksize);
1842 
1843 		for (j = 0; j < maxndx; j++)
1844 			klw->klw_sorted[k++] = &chunk[j];
1845 	}
1846 
1847 	qsort(klw->klw_sorted, k, sizeof (kmem_bufctl_audit_t *),
1848 	    (int(*)(const void *, const void *))bufctlcmp);
1849 
1850 	klw->klw_maxndx = k;
1851 	wsp->walk_data = klw;
1852 
1853 	return (WALK_NEXT);
1854 }
1855 
1856 int
1857 kmem_log_walk_step(mdb_walk_state_t *wsp)
1858 {
1859 	kmem_log_walk_t *klw = wsp->walk_data;
1860 	kmem_bufctl_audit_t *bcp;
1861 
1862 	if (klw->klw_ndx == klw->klw_maxndx)
1863 		return (WALK_DONE);
1864 
1865 	bcp = klw->klw_sorted[klw->klw_ndx++];
1866 
1867 	return (wsp->walk_callback((uintptr_t)bcp - (uintptr_t)klw->klw_base +
1868 	    (uintptr_t)klw->klw_lh.lh_base, bcp, wsp->walk_cbdata));
1869 }
1870 
1871 void
1872 kmem_log_walk_fini(mdb_walk_state_t *wsp)
1873 {
1874 	kmem_log_walk_t *klw = wsp->walk_data;
1875 
1876 	mdb_free(klw->klw_base, klw->klw_size);
1877 	mdb_free(klw->klw_sorted, klw->klw_maxndx *
1878 	    sizeof (kmem_bufctl_audit_t *));
1879 	mdb_free(klw, sizeof (kmem_log_walk_t));
1880 }
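
/*
 * A minimal consumer of the "kmem_log" walker above (a sketch only: the
 * helper and the surrounding dcmd are hypothetical, but the callback
 * signature and the mdb_walk() call are the standard mdb interfaces used
 * elsewhere in this module):
 *
 *	static int
 *	count_log_entry(uintptr_t addr, const void *data, void *arg)
 *	{
 *		(*(size_t *)arg)++;
 *		return (WALK_NEXT);
 *	}
 *
 *	...
 *	size_t n = 0;
 *
 *	if (mdb_walk("kmem_log", count_log_entry, &n) == -1) {
 *		mdb_warn("couldn't walk kmem_log");
 *		return (DCMD_ERR);
 *	}
 *	mdb_printf("%lu transaction log records\n", n);
 */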
1881 
1882 typedef struct allocdby_bufctl {
1883 	uintptr_t abb_addr;
1884 	hrtime_t abb_ts;
1885 } allocdby_bufctl_t;
1886 
1887 typedef struct allocdby_walk {
1888 	const char *abw_walk;
1889 	uintptr_t abw_thread;
1890 	size_t abw_nbufs;
1891 	size_t abw_size;
1892 	allocdby_bufctl_t *abw_buf;
1893 	size_t abw_ndx;
1894 } allocdby_walk_t;
1895 
1896 int
1897 allocdby_walk_bufctl(uintptr_t addr, const kmem_bufctl_audit_t *bcp,
1898     allocdby_walk_t *abw)
1899 {
1900 	if ((uintptr_t)bcp->bc_thread != abw->abw_thread)
1901 		return (WALK_NEXT);
1902 
1903 	if (abw->abw_nbufs == abw->abw_size) {
1904 		allocdby_bufctl_t *buf;
1905 		size_t oldsize = sizeof (allocdby_bufctl_t) * abw->abw_size;
1906 
1907 		buf = mdb_zalloc(oldsize << 1, UM_SLEEP);
1908 
1909 		bcopy(abw->abw_buf, buf, oldsize);
1910 		mdb_free(abw->abw_buf, oldsize);
1911 
1912 		abw->abw_size <<= 1;
1913 		abw->abw_buf = buf;
1914 	}
1915 
1916 	abw->abw_buf[abw->abw_nbufs].abb_addr = addr;
1917 	abw->abw_buf[abw->abw_nbufs].abb_ts = bcp->bc_timestamp;
1918 	abw->abw_nbufs++;
1919 
1920 	return (WALK_NEXT);
1921 }
1922 
1923 /*ARGSUSED*/
1924 int
1925 allocdby_walk_cache(uintptr_t addr, const kmem_cache_t *c, allocdby_walk_t *abw)
1926 {
1927 	if (mdb_pwalk(abw->abw_walk, (mdb_walk_cb_t)allocdby_walk_bufctl,
1928 	    abw, addr) == -1) {
1929 		mdb_warn("couldn't walk bufctl for cache %p", addr);
1930 		return (WALK_DONE);
1931 	}
1932 
1933 	return (WALK_NEXT);
1934 }
1935 
1936 static int
1937 allocdby_cmp(const allocdby_bufctl_t *lhs, const allocdby_bufctl_t *rhs)
1938 {
1939 	if (lhs->abb_ts < rhs->abb_ts)
1940 		return (1);
1941 	if (lhs->abb_ts > rhs->abb_ts)
1942 		return (-1);
1943 	return (0);
1944 }
1945 
1946 static int
1947 allocdby_walk_init_common(mdb_walk_state_t *wsp, const char *walk)
1948 {
1949 	allocdby_walk_t *abw;
1950 
1951 	if (wsp->walk_addr == 0) {
1952 		mdb_warn("allocdby walk doesn't support global walks\n");
1953 		return (WALK_ERR);
1954 	}
1955 
1956 	abw = mdb_zalloc(sizeof (allocdby_walk_t), UM_SLEEP);
1957 
1958 	abw->abw_thread = wsp->walk_addr;
1959 	abw->abw_walk = walk;
1960 	abw->abw_size = 128;	/* something reasonable */
1961 	abw->abw_buf =
1962 	    mdb_zalloc(abw->abw_size * sizeof (allocdby_bufctl_t), UM_SLEEP);
1963 
1964 	wsp->walk_data = abw;
1965 
1966 	if (mdb_walk("kmem_cache",
1967 	    (mdb_walk_cb_t)allocdby_walk_cache, abw) == -1) {
1968 		mdb_warn("couldn't walk kmem_cache");
1969 		allocdby_walk_fini(wsp);
1970 		return (WALK_ERR);
1971 	}
1972 
1973 	qsort(abw->abw_buf, abw->abw_nbufs, sizeof (allocdby_bufctl_t),
1974 	    (int(*)(const void *, const void *))allocdby_cmp);
1975 
1976 	return (WALK_NEXT);
1977 }
1978 
1979 int
1980 allocdby_walk_init(mdb_walk_state_t *wsp)
1981 {
1982 	return (allocdby_walk_init_common(wsp, "bufctl"));
1983 }
1984 
1985 int
1986 freedby_walk_init(mdb_walk_state_t *wsp)
1987 {
1988 	return (allocdby_walk_init_common(wsp, "freectl"));
1989 }
1990 
1991 int
1992 allocdby_walk_step(mdb_walk_state_t *wsp)
1993 {
1994 	allocdby_walk_t *abw = wsp->walk_data;
1995 	kmem_bufctl_audit_t bc;
1996 	uintptr_t addr;
1997 
1998 	if (abw->abw_ndx == abw->abw_nbufs)
1999 		return (WALK_DONE);
2000 
2001 	addr = abw->abw_buf[abw->abw_ndx++].abb_addr;
2002 
2003 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2004 		mdb_warn("couldn't read bufctl at %p", addr);
2005 		return (WALK_DONE);
2006 	}
2007 
2008 	return (wsp->walk_callback(addr, &bc, wsp->walk_cbdata));
2009 }
2010 
2011 void
2012 allocdby_walk_fini(mdb_walk_state_t *wsp)
2013 {
2014 	allocdby_walk_t *abw = wsp->walk_data;
2015 
2016 	mdb_free(abw->abw_buf, sizeof (allocdby_bufctl_t) * abw->abw_size);
2017 	mdb_free(abw, sizeof (allocdby_walk_t));
2018 }
2019 
2020 /*ARGSUSED*/
2021 int
2022 allocdby_walk(uintptr_t addr, const kmem_bufctl_audit_t *bcp, void *ignored)
2023 {
2024 	char c[MDB_SYM_NAMLEN];
2025 	GElf_Sym sym;
2026 	int i;
2027 
2028 	mdb_printf("%0?p %12llx ", addr, bcp->bc_timestamp);
2029 	for (i = 0; i < bcp->bc_depth; i++) {
2030 		if (mdb_lookup_by_addr(bcp->bc_stack[i],
2031 		    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
2032 			continue;
2033 		if (strncmp(c, "kmem_", 5) == 0)
2034 			continue;
2035 		mdb_printf("%s+0x%lx",
2036 		    c, bcp->bc_stack[i] - (uintptr_t)sym.st_value);
2037 		break;
2038 	}
2039 	mdb_printf("\n");
2040 
2041 	return (WALK_NEXT);
2042 }
2043 
2044 static int
2045 allocdby_common(uintptr_t addr, uint_t flags, const char *w)
2046 {
2047 	if (!(flags & DCMD_ADDRSPEC))
2048 		return (DCMD_USAGE);
2049 
2050 	mdb_printf("%-?s %12s %s\n", "BUFCTL", "TIMESTAMP", "CALLER");
2051 
2052 	if (mdb_pwalk(w, (mdb_walk_cb_t)allocdby_walk, NULL, addr) == -1) {
2053 		mdb_warn("can't walk '%s' for %p", w, addr);
2054 		return (DCMD_ERR);
2055 	}
2056 
2057 	return (DCMD_OK);
2058 }
2059 
2060 /*ARGSUSED*/
2061 int
2062 allocdby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2063 {
2064 	return (allocdby_common(addr, flags, "allocdby"));
2065 }
2066 
2067 /*ARGSUSED*/
2068 int
2069 freedby(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2070 {
2071 	return (allocdby_common(addr, flags, "freedby"));
2072 }
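
/*
 * Example usage of the dcmds above (a sketch; the thread address, bufctl
 * addresses, timestamps, and callers are all hypothetical):
 *
 *	> ffffff01d2f0c0a0::allocdby
 *	BUFCTL            TIMESTAMP CALLER
 *	ffffff01cc9e2d10  ce7ff0956351 allocb+0x6b
 *	ffffff01cc9e2c88  ce7ff0948aa0 kobj_alloc+0x42
 *
 * ::freedby is identical in form, but reports the buffers most recently
 * freed by the given thread.
 */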
2073 
2074 /*
2075  * Return a string describing the address in relation to the given thread's
2076  * stack.
2077  *
2078  * - If the thread state is TS_FREE, return " (inactive interrupt thread)".
2079  *
2080  * - If the address is above the stack pointer, return an empty string
2081  *   signifying that the address is active.
2082  *
2083  * - If the address is below the stack pointer, and the thread is not on proc,
2084  *   return " (below sp)".
2085  *
2086  * - If the address is below the stack pointer, and the thread is on proc,
2087  *   return " (possibly below sp)".  Depending on context, we may or may not
2088  *   have an accurate t_sp.
2089  */
2090 static const char *
2091 stack_active(const kthread_t *t, uintptr_t addr)
2092 {
2093 	uintptr_t panicstk;
2094 	GElf_Sym sym;
2095 
2096 	if (t->t_state == TS_FREE)
2097 		return (" (inactive interrupt thread)");
2098 
2099 	/*
2100 	 * Check to see if we're on the panic stack.  If so, ignore t_sp, as it
2101 	 * no longer relates to the thread's real stack.
2102 	 */
2103 	if (mdb_lookup_by_name("panic_stack", &sym) == 0) {
2104 		panicstk = (uintptr_t)sym.st_value;
2105 
2106 		if (t->t_sp >= panicstk && t->t_sp < panicstk + PANICSTKSIZE)
2107 			return ("");
2108 	}
2109 
2110 	if (addr >= t->t_sp + STACK_BIAS)
2111 		return ("");
2112 
2113 	if (t->t_state == TS_ONPROC)
2114 		return (" (possibly below sp)");
2115 
2116 	return (" (below sp)");
2117 }
2118 
2119 /*
2120  * Additional state for the kmem and vmem ::whatis handlers
2121  */
2122 typedef struct whatis_info {
2123 	mdb_whatis_t *wi_w;
2124 	const kmem_cache_t *wi_cache;
2125 	const vmem_t *wi_vmem;
2126 	vmem_t *wi_msb_arena;
2127 	size_t wi_slab_size;
2128 	uint_t wi_slab_found;
2129 	uint_t wi_kmem_lite_count;
2130 	uint_t wi_freemem;
2131 } whatis_info_t;
2132 
2133 /* call one of our dcmd functions with "-v" and the provided address */
2134 static void
2135 whatis_call_printer(mdb_dcmd_f *dcmd, uintptr_t addr)
2136 {
2137 	mdb_arg_t a;
2138 	a.a_type = MDB_TYPE_STRING;
2139 	a.a_un.a_str = "-v";
2140 
2141 	mdb_printf(":\n");
2142 	(void) (*dcmd)(addr, DCMD_ADDRSPEC, 1, &a);
2143 }
2144 
2145 static void
2146 whatis_print_kmf_lite(uintptr_t btaddr, size_t count)
2147 {
2148 #define	KMEM_LITE_MAX	16
2149 	pc_t callers[KMEM_LITE_MAX];
2150 	pc_t uninit = (pc_t)KMEM_UNINITIALIZED_PATTERN;
2151 
2152 	kmem_buftag_t bt;
2153 	intptr_t stat;
2154 	const char *plural = "";
2155 	int i;
2156 
2157 	/* validate our arguments and read in the buftag */
2158 	if (count == 0 || count > KMEM_LITE_MAX ||
2159 	    mdb_vread(&bt, sizeof (bt), btaddr) == -1)
2160 		return;
2161 
2162 	/* validate the buffer state and read in the callers */
2163 	stat = (intptr_t)bt.bt_bufctl ^ bt.bt_bxstat;
2164 
2165 	if (stat != KMEM_BUFTAG_ALLOC && stat != KMEM_BUFTAG_FREE)
2166 		return;
2167 
2168 	if (mdb_vread(callers, count * sizeof (pc_t),
2169 	    btaddr + offsetof(kmem_buftag_lite_t, bt_history)) == -1)
2170 		return;
2171 
2172 	/* If there aren't any filled in callers, bail */
2173 	if (callers[0] == uninit)
2174 		return;
2175 
2176 	plural = (callers[1] == uninit) ? "" : "s";
2177 
2178 	/* Everything's done and checked; print them out */
2179 	mdb_printf(":\n");
2180 
2181 	mdb_inc_indent(8);
2182 	mdb_printf("recent caller%s: %a", plural, callers[0]);
2183 	for (i = 1; i < count; i++) {
2184 		if (callers[i] == uninit)
2185 			break;
2186 		mdb_printf(", %a", callers[i]);
2187 	}
2188 	mdb_dec_indent(8);
2189 }
2190 
2191 static void
2192 whatis_print_kmem(whatis_info_t *wi, uintptr_t maddr, uintptr_t addr,
2193     uintptr_t baddr)
2194 {
2195 	mdb_whatis_t *w = wi->wi_w;
2196 
2197 	const kmem_cache_t *cp = wi->wi_cache;
2198 	/* LINTED pointer cast may result in improper alignment */
2199 	uintptr_t btaddr = (uintptr_t)KMEM_BUFTAG(cp, addr);
2200 	int quiet = (mdb_whatis_flags(w) & WHATIS_QUIET);
2201 	int call_printer = (!quiet && (cp->cache_flags & KMF_AUDIT));
2202 
2203 	mdb_whatis_report_object(w, maddr, addr, "");
2204 
2205 	if (baddr != 0 && !call_printer)
2206 		mdb_printf("bufctl %p ", baddr);
2207 
2208 	mdb_printf("%s from %s",
2209 	    (wi->wi_freemem == FALSE) ? "allocated" : "freed", cp->cache_name);
2210 
2211 	if (baddr != 0 && call_printer) {
2212 		whatis_call_printer(bufctl, baddr);
2213 		return;
2214 	}
2215 
2216 	/* for KMF_LITE caches, try to print out the previous callers */
2217 	if (!quiet && (cp->cache_flags & KMF_LITE))
2218 		whatis_print_kmf_lite(btaddr, wi->wi_kmem_lite_count);
2219 
2220 	mdb_printf("\n");
2221 }
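
/*
 * Putting the pieces above together, a ::whatis hit on a kmem-managed
 * buffer reads roughly as follows (the address and cache name are
 * illustrative only):
 *
 *	> ffffff01cc9e2c88::whatis
 *	ffffff01cc9e2c88 is allocated from kmem_alloc_64
 *
 * For KMF_AUDIT caches the matching bufctl is printed via '::bufctl -v'
 * instead, and for KMF_LITE caches the recent callers are appended.
 */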
2222 
2223 /*ARGSUSED*/
2224 static int
2225 whatis_walk_kmem(uintptr_t addr, void *ignored, whatis_info_t *wi)
2226 {
2227 	mdb_whatis_t *w = wi->wi_w;
2228 
2229 	uintptr_t cur;
2230 	size_t size = wi->wi_cache->cache_bufsize;
2231 
2232 	while (mdb_whatis_match(w, addr, size, &cur))
2233 		whatis_print_kmem(wi, cur, addr, 0);
2234 
2235 	return (WHATIS_WALKRET(w));
2236 }
2237 
2238 /*ARGSUSED*/
2239 static int
2240 whatis_walk_bufctl(uintptr_t baddr, const kmem_bufctl_t *bcp, whatis_info_t *wi)
2241 {
2242 	mdb_whatis_t *w = wi->wi_w;
2243 
2244 	uintptr_t cur;
2245 	uintptr_t addr = (uintptr_t)bcp->bc_addr;
2246 	size_t size = wi->wi_cache->cache_bufsize;
2247 
2248 	while (mdb_whatis_match(w, addr, size, &cur))
2249 		whatis_print_kmem(wi, cur, addr, baddr);
2250 
2251 	return (WHATIS_WALKRET(w));
2252 }
2253 
2254 static int
2255 whatis_walk_seg(uintptr_t addr, const vmem_seg_t *vs, whatis_info_t *wi)
2256 {
2257 	mdb_whatis_t *w = wi->wi_w;
2258 
2259 	size_t size = vs->vs_end - vs->vs_start;
2260 	uintptr_t cur;
2261 
2262 	/* We're not interested in anything but alloc and free segments */
2263 	if (vs->vs_type != VMEM_ALLOC && vs->vs_type != VMEM_FREE)
2264 		return (WALK_NEXT);
2265 
2266 	while (mdb_whatis_match(w, vs->vs_start, size, &cur)) {
2267 		mdb_whatis_report_object(w, cur, vs->vs_start, "");
2268 
2269 		/*
2270 		 * If we're not printing it separately, provide the vmem_seg
2271 		 * pointer if it has a stack trace.
2272 		 */
2273 		if ((mdb_whatis_flags(w) & WHATIS_QUIET) &&
2274 		    (!(mdb_whatis_flags(w) & WHATIS_BUFCTL) ||
2275 		    (vs->vs_type == VMEM_ALLOC && vs->vs_depth != 0))) {
2276 			mdb_printf("vmem_seg %p ", addr);
2277 		}
2278 
2279 		mdb_printf("%s from the %s vmem arena",
2280 		    (vs->vs_type == VMEM_ALLOC) ? "allocated" : "freed",
2281 		    wi->wi_vmem->vm_name);
2282 
2283 		if (!(mdb_whatis_flags(w) & WHATIS_QUIET))
2284 			whatis_call_printer(vmem_seg, addr);
2285 		else
2286 			mdb_printf("\n");
2287 	}
2288 
2289 	return (WHATIS_WALKRET(w));
2290 }
2291 
2292 static int
2293 whatis_walk_vmem(uintptr_t addr, const vmem_t *vmem, whatis_info_t *wi)
2294 {
2295 	mdb_whatis_t *w = wi->wi_w;
2296 	const char *nm = vmem->vm_name;
2297 
2298 	int identifier = ((vmem->vm_cflags & VMC_IDENTIFIER) != 0);
2299 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2300 
2301 	if (identifier != idspace)
2302 		return (WALK_NEXT);
2303 
2304 	wi->wi_vmem = vmem;
2305 
2306 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2307 		mdb_printf("Searching vmem arena %s...\n", nm);
2308 
2309 	if (mdb_pwalk("vmem_seg",
2310 	    (mdb_walk_cb_t)whatis_walk_seg, wi, addr) == -1) {
2311 		mdb_warn("can't walk vmem_seg for %p", addr);
2312 		return (WALK_NEXT);
2313 	}
2314 
2315 	return (WHATIS_WALKRET(w));
2316 }
2317 
2318 /*ARGSUSED*/
2319 static int
2320 whatis_walk_slab(uintptr_t saddr, const kmem_slab_t *sp, whatis_info_t *wi)
2321 {
2322 	mdb_whatis_t *w = wi->wi_w;
2323 
2324 	/* It must overlap with the slab data, or it's not interesting */
2325 	if (mdb_whatis_overlaps(w,
2326 	    (uintptr_t)sp->slab_base, wi->wi_slab_size)) {
2327 		wi->wi_slab_found++;
2328 		return (WALK_DONE);
2329 	}
2330 	return (WALK_NEXT);
2331 }
2332 
2333 static int
2334 whatis_walk_cache(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2335 {
2336 	mdb_whatis_t *w = wi->wi_w;
2337 
2338 	char *walk, *freewalk;
2339 	mdb_walk_cb_t func;
2340 	int do_bufctl;
2341 
2342 	int identifier = ((c->cache_flags & KMC_IDENTIFIER) != 0);
2343 	int idspace = ((mdb_whatis_flags(w) & WHATIS_IDSPACE) != 0);
2344 
2345 	if (identifier != idspace)
2346 		return (WALK_NEXT);
2347 
2348 	/* Override the '-b' flag as necessary */
2349 	if (!(c->cache_flags & KMF_HASH))
2350 		do_bufctl = FALSE;	/* no bufctls to walk */
2351 	else if (c->cache_flags & KMF_AUDIT)
2352 		do_bufctl = TRUE;	/* we always want debugging info */
2353 	else
2354 		do_bufctl = ((mdb_whatis_flags(w) & WHATIS_BUFCTL) != 0);
2355 
2356 	if (do_bufctl) {
2357 		walk = "bufctl";
2358 		freewalk = "freectl";
2359 		func = (mdb_walk_cb_t)whatis_walk_bufctl;
2360 	} else {
2361 		walk = "kmem";
2362 		freewalk = "freemem";
2363 		func = (mdb_walk_cb_t)whatis_walk_kmem;
2364 	}
2365 
2366 	wi->wi_cache = c;
2367 
2368 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2369 		mdb_printf("Searching %s...\n", c->cache_name);
2370 
2371 	/*
2372 	 * If more than two buffers live on each slab, figure out if we're
2373 	 * interested in anything in any slab before doing the more expensive
2374 	 * kmem/freemem (bufctl/freectl) walkers.
2375 	 */
2376 	wi->wi_slab_size = c->cache_slabsize - c->cache_maxcolor;
2377 	if (!(c->cache_flags & KMF_HASH))
2378 		wi->wi_slab_size -= sizeof (kmem_slab_t);
2379 
2380 	if ((wi->wi_slab_size / c->cache_chunksize) > 2) {
2381 		wi->wi_slab_found = 0;
2382 		if (mdb_pwalk("kmem_slab", (mdb_walk_cb_t)whatis_walk_slab, wi,
2383 		    addr) == -1) {
2384 			mdb_warn("can't find kmem_slab walker");
2385 			return (WALK_DONE);
2386 		}
2387 		if (wi->wi_slab_found == 0)
2388 			return (WALK_NEXT);
2389 	}
2390 
2391 	wi->wi_freemem = FALSE;
2392 	if (mdb_pwalk(walk, func, wi, addr) == -1) {
2393 		mdb_warn("can't find %s walker", walk);
2394 		return (WALK_DONE);
2395 	}
2396 
2397 	if (mdb_whatis_done(w))
2398 		return (WALK_DONE);
2399 
2400 	/*
2401 	 * We have searched for allocated memory; now search for freed memory.
2402 	 */
2403 	if (mdb_whatis_flags(w) & WHATIS_VERBOSE)
2404 		mdb_printf("Searching %s for free memory...\n", c->cache_name);
2405 
2406 	wi->wi_freemem = TRUE;
2407 	if (mdb_pwalk(freewalk, func, wi, addr) == -1) {
2408 		mdb_warn("can't find %s walker", freewalk);
2409 		return (WALK_DONE);
2410 	}
2411 
2412 	return (WHATIS_WALKRET(w));
2413 }
2414 
2415 static int
2416 whatis_walk_touch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2417 {
2418 	if (c->cache_arena == wi->wi_msb_arena ||
2419 	    (c->cache_cflags & KMC_NOTOUCH))
2420 		return (WALK_NEXT);
2421 
2422 	return (whatis_walk_cache(addr, c, wi));
2423 }
2424 
2425 static int
2426 whatis_walk_metadata(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2427 {
2428 	if (c->cache_arena != wi->wi_msb_arena)
2429 		return (WALK_NEXT);
2430 
2431 	return (whatis_walk_cache(addr, c, wi));
2432 }
2433 
2434 static int
2435 whatis_walk_notouch(uintptr_t addr, const kmem_cache_t *c, whatis_info_t *wi)
2436 {
2437 	if (c->cache_arena == wi->wi_msb_arena ||
2438 	    !(c->cache_cflags & KMC_NOTOUCH))
2439 		return (WALK_NEXT);
2440 
2441 	return (whatis_walk_cache(addr, c, wi));
2442 }
2443 
2444 static int
2445 whatis_walk_thread(uintptr_t addr, const kthread_t *t, mdb_whatis_t *w)
2446 {
2447 	uintptr_t cur;
2448 	uintptr_t saddr;
2449 	size_t size;
2450 
2451 	/*
2452 	 * Often, one calls ::whatis on an address from a thread structure.
2453 	 * We use this opportunity to short circuit this case...
2454 	 */
2455 	while (mdb_whatis_match(w, addr, sizeof (kthread_t), &cur))
2456 		mdb_whatis_report_object(w, cur, addr,
2457 		    "allocated as a thread structure\n");
2458 
2459 	/*
2460 	 * Now check the stack
2461 	 */
2462 	if (t->t_stkbase == NULL)
2463 		return (WALK_NEXT);
2464 
2465 	/*
2466 	 * This assumes that t_stk is the end of the stack, but it's really
2467 	 * only the initial stack pointer for the thread.  Arguments to the
2468 	 * initial procedure, SA(MINFRAME), etc. are all after t_stk.  So
2469 	 * that 't->t_stk::whatis' reports "part of t's stack", we include
2470 	 * t_stk in the range (the "+ 1", below), but the kernel should
2471 	 * really include the full stack bounds where we can find it.
2472 	 */
2473 	saddr = (uintptr_t)t->t_stkbase;
2474 	size = (uintptr_t)t->t_stk - saddr + 1;
2475 	while (mdb_whatis_match(w, saddr, size, &cur))
2476 		mdb_whatis_report_object(w, cur, cur,
2477 		    "in thread %p's stack%s\n", addr, stack_active(t, cur));
2478 
2479 	return (WHATIS_WALKRET(w));
2480 }
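
/*
 * A stack hit from the walker above reads roughly like this (addresses are
 * hypothetical):
 *
 *	> fffffe8000d1f7e0::whatis
 *	fffffe8000d1f7e0 is in thread ffffff01d2f0c0a0's stack
 *
 * with " (below sp)" or " (possibly below sp)" appended when stack_active()
 * decides the address lies beyond the saved stack pointer.
 */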
2481 
2482 static void
2483 whatis_modctl_match(mdb_whatis_t *w, const char *name,
2484     uintptr_t base, size_t size, const char *where)
2485 {
2486 	uintptr_t cur;
2487 
2488 	/*
2489 	 * Since we're searching for addresses inside a module, we report
2490 	 * them as symbols.
2491 	 */
2492 	while (mdb_whatis_match(w, base, size, &cur))
2493 		mdb_whatis_report_address(w, cur, "in %s's %s\n", name, where);
2494 }
2495 
2496 struct kmem_ctf_module {
2497 	Shdr *symhdr;
2498 	char *symtbl;
2499 	unsigned int nsyms;
2500 	char *symspace;
2501 	size_t symsize;
2502 	char *text;
2503 	char *data;
2504 	uintptr_t bss;
2505 	size_t text_size;
2506 	size_t data_size;
2507 	size_t bss_size;
2508 };
2509 
2510 static int
2511 whatis_walk_modctl(uintptr_t addr, const struct modctl *m, mdb_whatis_t *w)
2512 {
2513 	char name[MODMAXNAMELEN];
2514 	struct kmem_ctf_module mod;
2515 	Shdr shdr;
2516 
2517 	if (m->mod_mp == NULL)
2518 		return (WALK_NEXT);
2519 
2520 	if (mdb_ctf_vread(&mod, "struct module", "struct kmem_ctf_module",
2521 	    (uintptr_t)m->mod_mp, 0) == -1) {
2522 		mdb_warn("couldn't read modctl %p's module", addr);
2523 		return (WALK_NEXT);
2524 	}
2525 
2526 	if (mdb_readstr(name, sizeof (name), (uintptr_t)m->mod_modname) == -1)
2527 		(void) mdb_snprintf(name, sizeof (name), "0x%p", addr);
2528 
2529 	whatis_modctl_match(w, name,
2530 	    (uintptr_t)mod.text, mod.text_size, "text segment");
2531 	whatis_modctl_match(w, name,
2532 	    (uintptr_t)mod.data, mod.data_size, "data segment");
2533 	whatis_modctl_match(w, name,
2534 	    (uintptr_t)mod.bss, mod.bss_size, "bss segment");
2535 
2536 	if (mdb_vread(&shdr, sizeof (shdr), (uintptr_t)mod.symhdr) == -1) {
2537 		mdb_warn("couldn't read symbol header for %p's module", addr);
2538 		return (WALK_NEXT);
2539 	}
2540 
2541 	whatis_modctl_match(w, name,
2542 	    (uintptr_t)mod.symtbl, mod.nsyms * shdr.sh_entsize, "symtab");
2543 	whatis_modctl_match(w, name,
2544 	    (uintptr_t)mod.symspace, mod.symsize, "symtab");
2545 
2546 	return (WHATIS_WALKRET(w));
2547 }
2548 
2549 /*ARGSUSED*/
2550 static int
2551 whatis_walk_memseg(uintptr_t addr, const struct memseg *seg, mdb_whatis_t *w)
2552 {
2553 	uintptr_t cur;
2554 
2555 	uintptr_t base = (uintptr_t)seg->pages;
2556 	size_t size = (uintptr_t)seg->epages - base;
2557 
2558 	while (mdb_whatis_match(w, base, size, &cur)) {
2559 		/* round our found pointer down to the page_t base. */
2560 		size_t offset = (cur - base) % sizeof (page_t);
2561 
2562 		mdb_whatis_report_object(w, cur, cur - offset,
2563 		    "allocated as a page structure\n");
2564 	}
2565 
2566 	return (WHATIS_WALKRET(w));
2567 }
2568 
2569 /*ARGSUSED*/
2570 static int
2571 whatis_run_modules(mdb_whatis_t *w, void *arg)
2572 {
2573 	if (mdb_walk("modctl", (mdb_walk_cb_t)whatis_walk_modctl, w) == -1) {
2574 		mdb_warn("couldn't find modctl walker");
2575 		return (1);
2576 	}
2577 	return (0);
2578 }
2579 
2580 /*ARGSUSED*/
2581 static int
2582 whatis_run_threads(mdb_whatis_t *w, void *ignored)
2583 {
2584 	/*
2585 	 * Now search all thread stacks.  Yes, this is a little weak; we
2586 	 * can save a lot of work by first checking to see if the
2587 	 * address is in segkp vs. segkmem.  But hey, computers are
2588 	 * fast.
2589 	 */
2590 	if (mdb_walk("thread", (mdb_walk_cb_t)whatis_walk_thread, w) == -1) {
2591 		mdb_warn("couldn't find thread walker");
2592 		return (1);
2593 	}
2594 	return (0);
2595 }
2596 
2597 /*ARGSUSED*/
2598 static int
2599 whatis_run_pages(mdb_whatis_t *w, void *ignored)
2600 {
2601 	if (mdb_walk("memseg", (mdb_walk_cb_t)whatis_walk_memseg, w) == -1) {
2602 		mdb_warn("couldn't find memseg walker");
2603 		return (1);
2604 	}
2605 	return (0);
2606 }
2607 
2608 /*ARGSUSED*/
2609 static int
2610 whatis_run_kmem(mdb_whatis_t *w, void *ignored)
2611 {
2612 	whatis_info_t wi;
2613 
2614 	bzero(&wi, sizeof (wi));
2615 	wi.wi_w = w;
2616 
2617 	if (mdb_readvar(&wi.wi_msb_arena, "kmem_msb_arena") == -1)
2618 		mdb_warn("unable to readvar \"kmem_msb_arena\"");
2619 
2620 	if (mdb_readvar(&wi.wi_kmem_lite_count,
2621 	    "kmem_lite_count") == -1 || wi.wi_kmem_lite_count > 16)
2622 		wi.wi_kmem_lite_count = 0;
2623 
2624 	/*
2625 	 * We process kmem caches in the following order:
2626 	 *
2627 	 *	non-KMC_NOTOUCH, non-metadata	(typically the most interesting)
2628 	 *	metadata			(can be huge with KMF_AUDIT)
2629 	 *	KMC_NOTOUCH, non-metadata	(see kmem_walk_all())
2630 	 */
2631 	if (mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_touch,
2632 	    &wi) == -1 ||
2633 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_metadata,
2634 	    &wi) == -1 ||
2635 	    mdb_walk("kmem_cache", (mdb_walk_cb_t)whatis_walk_notouch,
2636 	    &wi) == -1) {
2637 		mdb_warn("couldn't find kmem_cache walker");
2638 		return (1);
2639 	}
2640 	return (0);
2641 }
2642 
2643 /*ARGSUSED*/
2644 static int
2645 whatis_run_vmem(mdb_whatis_t *w, void *ignored)
2646 {
2647 	whatis_info_t wi;
2648 
2649 	bzero(&wi, sizeof (wi));
2650 	wi.wi_w = w;
2651 
2652 	if (mdb_walk("vmem_postfix",
2653 	    (mdb_walk_cb_t)whatis_walk_vmem, &wi) == -1) {
2654 		mdb_warn("couldn't find vmem_postfix walker");
2655 		return (1);
2656 	}
2657 	return (0);
2658 }
2659 
2660 typedef struct kmem_log_cpu {
2661 	uintptr_t kmc_low;
2662 	uintptr_t kmc_high;
2663 } kmem_log_cpu_t;
2664 
2665 typedef struct kmem_log_data {
2666 	uintptr_t kmd_addr;
2667 	kmem_log_cpu_t *kmd_cpu;
2668 } kmem_log_data_t;
2669 
2670 int
2671 kmem_log_walk(uintptr_t addr, const kmem_bufctl_audit_t *b,
2672     kmem_log_data_t *kmd)
2673 {
2674 	int i;
2675 	kmem_log_cpu_t *kmc = kmd->kmd_cpu;
2676 	size_t bufsize;
2677 
2678 	for (i = 0; i < NCPU; i++) {
2679 		if (addr >= kmc[i].kmc_low && addr < kmc[i].kmc_high)
2680 			break;
2681 	}
2682 
2683 	if (kmd->kmd_addr) {
2684 		if (b->bc_cache == NULL)
2685 			return (WALK_NEXT);
2686 
2687 		if (mdb_vread(&bufsize, sizeof (bufsize),
2688 		    (uintptr_t)&b->bc_cache->cache_bufsize) == -1) {
2689 			mdb_warn(
2690 			    "failed to read cache_bufsize for cache at %p",
2691 			    b->bc_cache);
2692 			return (WALK_ERR);
2693 		}
2694 
2695 		if (kmd->kmd_addr < (uintptr_t)b->bc_addr ||
2696 		    kmd->kmd_addr >= (uintptr_t)b->bc_addr + bufsize)
2697 			return (WALK_NEXT);
2698 	}
2699 
2700 	if (i == NCPU)
2701 		mdb_printf("   ");
2702 	else
2703 		mdb_printf("%3d", i);
2704 
2705 	mdb_printf(" %0?p %0?p %16llx %0?p\n", addr, b->bc_addr,
2706 	    b->bc_timestamp, b->bc_thread);
2707 
2708 	return (WALK_NEXT);
2709 }
2710 
2711 /*ARGSUSED*/
2712 int
2713 kmem_log(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2714 {
2715 	kmem_log_header_t lh;
2716 	kmem_cpu_log_header_t clh;
2717 	uintptr_t lhp, clhp;
2718 	int ncpus;
2719 	uintptr_t *cpu;
2720 	GElf_Sym sym;
2721 	kmem_log_cpu_t *kmc;
2722 	int i;
2723 	kmem_log_data_t kmd;
2724 	uint_t opt_b = FALSE;
2725 
2726 	if (mdb_getopts(argc, argv,
2727 	    'b', MDB_OPT_SETBITS, TRUE, &opt_b, NULL) != argc)
2728 		return (DCMD_USAGE);
2729 
2730 	if (mdb_readvar(&lhp, "kmem_transaction_log") == -1) {
2731 		mdb_warn("failed to read 'kmem_transaction_log'");
2732 		return (DCMD_ERR);
2733 	}
2734 
2735 	if (lhp == 0) {
2736 		mdb_warn("no kmem transaction log\n");
2737 		return (DCMD_ERR);
2738 	}
2739 
2740 	mdb_readvar(&ncpus, "ncpus");
2741 
2742 	if (mdb_vread(&lh, sizeof (kmem_log_header_t), lhp) == -1) {
2743 		mdb_warn("failed to read log header at %p", lhp);
2744 		return (DCMD_ERR);
2745 	}
2746 
2747 	clhp = lhp + ((uintptr_t)&lh.lh_cpu[0] - (uintptr_t)&lh);
2748 
2749 	cpu = mdb_alloc(sizeof (uintptr_t) * NCPU, UM_SLEEP | UM_GC);
2750 
2751 	if (mdb_lookup_by_name("cpu", &sym) == -1) {
2752 		mdb_warn("couldn't find 'cpu' array");
2753 		return (DCMD_ERR);
2754 	}
2755 
2756 	if (sym.st_size != NCPU * sizeof (uintptr_t)) {
2757 		mdb_warn("expected 'cpu' to be of size %d; found %d\n",
2758 		    NCPU * sizeof (uintptr_t), sym.st_size);
2759 		return (DCMD_ERR);
2760 	}
2761 
2762 	if (mdb_vread(cpu, sym.st_size, (uintptr_t)sym.st_value) == -1) {
2763 		mdb_warn("failed to read cpu array at %p", sym.st_value);
2764 		return (DCMD_ERR);
2765 	}
2766 
2767 	kmc = mdb_zalloc(sizeof (kmem_log_cpu_t) * NCPU, UM_SLEEP | UM_GC);
2768 	kmd.kmd_addr = 0;
2769 	kmd.kmd_cpu = kmc;
2770 
2771 	for (i = 0; i < NCPU; i++) {
2772 
2773 		if (cpu[i] == 0)
2774 			continue;
2775 
2776 		if (mdb_vread(&clh, sizeof (clh), clhp) == -1) {
2777 			mdb_warn("cannot read cpu %d's log header at %p",
2778 			    i, clhp);
2779 			return (DCMD_ERR);
2780 		}
2781 
2782 		kmc[i].kmc_low = clh.clh_chunk * lh.lh_chunksize +
2783 		    (uintptr_t)lh.lh_base;
2784 		kmc[i].kmc_high = (uintptr_t)clh.clh_current;
2785 
2786 		clhp += sizeof (kmem_cpu_log_header_t);
2787 	}
2788 
2789 	mdb_printf("%3s %-?s %-?s %16s %-?s\n", "CPU", "ADDR", "BUFADDR",
2790 	    "TIMESTAMP", "THREAD");
2791 
2792 	/*
2793 	 * If we have been passed an address, print only matching log entries:
2794 	 * with -b, the address is a buffer address used to filter the whole
2795 	 * log; without -b, it is read as a bufctl and that one entry is shown.
2796 	 */
2797 	if (flags & DCMD_ADDRSPEC) {
2798 		kmem_bufctl_audit_t b;
2799 
2800 		if (opt_b) {
2801 			kmd.kmd_addr = addr;
2802 		} else {
2803 			if (mdb_vread(&b,
2804 			    sizeof (kmem_bufctl_audit_t), addr) == -1) {
2805 				mdb_warn("failed to read bufctl at %p", addr);
2806 				return (DCMD_ERR);
2807 			}
2808 
2809 			(void) kmem_log_walk(addr, &b, &kmd);
2810 
2811 			return (DCMD_OK);
2812 		}
2813 	}
2814 
2815 	if (mdb_walk("kmem_log", (mdb_walk_cb_t)kmem_log_walk, &kmd) == -1) {
2816 		mdb_warn("can't find kmem log walker");
2817 		return (DCMD_ERR);
2818 	}
2819 
2820 	return (DCMD_OK);
2821 }
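
/*
 * Example invocations of ::kmem_log (a sketch; every value shown is
 * hypothetical):
 *
 *	> ::kmem_log
 *	CPU ADDR             BUFADDR          TIMESTAMP        THREAD
 *	  0 ffffff01cd041a40 ffffff01cc9e2c88     ce7ff0956351 ffffff01d2f0c0a0
 *	...
 *
 * "bufaddr::kmem_log -b" limits the output to entries for that buffer;
 * "bufctladdr::kmem_log" (without -b) prints just that one log record.
 */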
2822 
2823 typedef struct bufctl_history_cb {
2824 	int		bhc_flags;
2825 	int		bhc_argc;
2826 	const mdb_arg_t	*bhc_argv;
2827 	int		bhc_ret;
2828 } bufctl_history_cb_t;
2829 
2830 /*ARGSUSED*/
2831 static int
2832 bufctl_history_callback(uintptr_t addr, const void *ign, void *arg)
2833 {
2834 	bufctl_history_cb_t *bhc = arg;
2835 
2836 	bhc->bhc_ret =
2837 	    bufctl(addr, bhc->bhc_flags, bhc->bhc_argc, bhc->bhc_argv);
2838 
2839 	bhc->bhc_flags &= ~DCMD_LOOPFIRST;
2840 
2841 	return ((bhc->bhc_ret == DCMD_OK)? WALK_NEXT : WALK_DONE);
2842 }
2843 
2844 void
2845 bufctl_help(void)
2846 {
2847 	mdb_printf("%s",
2848 "Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n");
2849 	mdb_dec_indent(2);
2850 	mdb_printf("%<b>OPTIONS%</b>\n");
2851 	mdb_inc_indent(2);
2852 	mdb_printf("%s",
2853 "  -v    Display the full content of the bufctl, including its stack trace\n"
2854 "  -h    retrieve the bufctl's transaction history, if available\n"
2855 "  -a addr\n"
2856 "        filter out bufctls not involving the buffer at addr\n"
2857 "  -c caller\n"
2858 "        filter out bufctls without the function/PC in their stack trace\n"
2859 "  -e earliest\n"
2860 "        filter out bufctls timestamped before earliest\n"
2861 "  -l latest\n"
2862 "        filter out bufctls timestamped after latest\n"
2863 "  -t thread\n"
2864 "        filter out bufctls not involving thread\n");
2865 }
2866 
2867 int
2868 bufctl(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
2869 {
2870 	kmem_bufctl_audit_t bc;
2871 	uint_t verbose = FALSE;
2872 	uint_t history = FALSE;
2873 	uint_t in_history = FALSE;
2874 	uintptr_t caller = 0, thread = 0;
2875 	uintptr_t laddr, haddr, baddr = 0;
2876 	hrtime_t earliest = 0, latest = 0;
2877 	int i, depth;
2878 	char c[MDB_SYM_NAMLEN];
2879 	GElf_Sym sym;
2880 
2881 	if (mdb_getopts(argc, argv,
2882 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
2883 	    'h', MDB_OPT_SETBITS, TRUE, &history,
2884 	    'H', MDB_OPT_SETBITS, TRUE, &in_history,		/* internal */
2885 	    'c', MDB_OPT_UINTPTR, &caller,
2886 	    't', MDB_OPT_UINTPTR, &thread,
2887 	    'e', MDB_OPT_UINT64, &earliest,
2888 	    'l', MDB_OPT_UINT64, &latest,
2889 	    'a', MDB_OPT_UINTPTR, &baddr, NULL) != argc)
2890 		return (DCMD_USAGE);
2891 
2892 	if (!(flags & DCMD_ADDRSPEC))
2893 		return (DCMD_USAGE);
2894 
2895 	if (in_history && !history)
2896 		return (DCMD_USAGE);
2897 
2898 	if (history && !in_history) {
2899 		mdb_arg_t *nargv = mdb_zalloc(sizeof (*nargv) * (argc + 1),
2900 		    UM_SLEEP | UM_GC);
2901 		bufctl_history_cb_t bhc;
2902 
2903 		nargv[0].a_type = MDB_TYPE_STRING;
2904 		nargv[0].a_un.a_str = "-H";		/* prevent recursion */
2905 
2906 		for (i = 0; i < argc; i++)
2907 			nargv[i + 1] = argv[i];
2908 
2909 		/*
2910 		 * When in history mode, we treat each element as if it
2911 		 * were in a separate loop, so that the headers group
2912 		 * bufctls with similar histories.
2913 		 */
2914 		bhc.bhc_flags = flags | DCMD_LOOP | DCMD_LOOPFIRST;
2915 		bhc.bhc_argc = argc + 1;
2916 		bhc.bhc_argv = nargv;
2917 		bhc.bhc_ret = DCMD_OK;
2918 
2919 		if (mdb_pwalk("bufctl_history", bufctl_history_callback, &bhc,
2920 		    addr) == -1) {
2921 			mdb_warn("unable to walk bufctl_history");
2922 			return (DCMD_ERR);
2923 		}
2924 
2925 		if (bhc.bhc_ret == DCMD_OK && !(flags & DCMD_PIPE_OUT))
2926 			mdb_printf("\n");
2927 
2928 		return (bhc.bhc_ret);
2929 	}
2930 
2931 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
2932 		if (verbose) {
2933 			mdb_printf("%16s %16s %16s %16s\n"
2934 			    "%<u>%16s %16s %16s %16s%</u>\n",
2935 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD",
2936 			    "", "CACHE", "LASTLOG", "CONTENTS");
2937 		} else {
2938 			mdb_printf("%<u>%-?s %-?s %-12s %-?s %s%</u>\n",
2939 			    "ADDR", "BUFADDR", "TIMESTAMP", "THREAD", "CALLER");
2940 		}
2941 	}
2942 
2943 	if (mdb_vread(&bc, sizeof (bc), addr) == -1) {
2944 		mdb_warn("couldn't read bufctl at %p", addr);
2945 		return (DCMD_ERR);
2946 	}
2947 
2948 	/*
2949 	 * Guard against bogus bc_depth in case the bufctl is corrupt or
2950 	 * the address does not really refer to a bufctl.
2951 	 */
2952 	depth = MIN(bc.bc_depth, KMEM_STACK_DEPTH);
2953 
2954 	if (caller != 0) {
2955 		laddr = caller;
2956 		haddr = caller + sizeof (caller);
2957 
2958 		if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c, sizeof (c),
2959 		    &sym) != -1 && caller == (uintptr_t)sym.st_value) {
2960 			/*
2961 			 * We were provided an exact symbol value; any
2962 			 * address in the function is valid.
2963 			 */
2964 			laddr = (uintptr_t)sym.st_value;
2965 			haddr = (uintptr_t)sym.st_value + sym.st_size;
2966 		}
2967 
2968 		for (i = 0; i < depth; i++)
2969 			if (bc.bc_stack[i] >= laddr && bc.bc_stack[i] < haddr)
2970 				break;
2971 
2972 		if (i == depth)
2973 			return (DCMD_OK);
2974 	}
2975 
2976 	if (thread != 0 && (uintptr_t)bc.bc_thread != thread)
2977 		return (DCMD_OK);
2978 
2979 	if (earliest != 0 && bc.bc_timestamp < earliest)
2980 		return (DCMD_OK);
2981 
2982 	if (latest != 0 && bc.bc_timestamp > latest)
2983 		return (DCMD_OK);
2984 
2985 	if (baddr != 0 && (uintptr_t)bc.bc_addr != baddr)
2986 		return (DCMD_OK);
2987 
2988 	if (flags & DCMD_PIPE_OUT) {
2989 		mdb_printf("%#lr\n", addr);
2990 		return (DCMD_OK);
2991 	}
2992 
2993 	if (verbose) {
2994 		mdb_printf(
2995 		    "%<b>%16p%</b> %16p %16llx %16p\n"
2996 		    "%16s %16p %16p %16p\n",
2997 		    addr, bc.bc_addr, bc.bc_timestamp, bc.bc_thread,
2998 		    "", bc.bc_cache, bc.bc_lastlog, bc.bc_contents);
2999 
3000 		mdb_inc_indent(17);
3001 		for (i = 0; i < depth; i++)
3002 			mdb_printf("%a\n", bc.bc_stack[i]);
3003 		mdb_dec_indent(17);
3004 		mdb_printf("\n");
3005 	} else {
3006 		mdb_printf("%0?p %0?p %12llx %0?p", addr, bc.bc_addr,
3007 		    bc.bc_timestamp, bc.bc_thread);
3008 
3009 		for (i = 0; i < depth; i++) {
3010 			if (mdb_lookup_by_addr(bc.bc_stack[i],
3011 			    MDB_SYM_FUZZY, c, sizeof (c), &sym) == -1)
3012 				continue;
3013 			if (strncmp(c, "kmem_", 5) == 0)
3014 				continue;
3015 			mdb_printf(" %a\n", bc.bc_stack[i]);
3016 			break;
3017 		}
3018 
3019 		if (i >= depth)
3020 			mdb_printf("\n");
3021 	}
3022 
3023 	return (DCMD_OK);
3024 }
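
/*
 * Typical ::bufctl invocations (sketches; the addresses and the caller
 * symbol are hypothetical):
 *
 *	> ffffff01cd041a40::bufctl -v
 *	> ffffff01cd041a40::bufctl -h
 *	> ffffff01cd041a40::bufctl -c biodone
 *
 * In a pipeline (DCMD_PIPE_OUT) only the addresses of bufctls that pass the
 * filters are emitted, so the -a/-c/-e/-l/-t options work well as a pipe
 * stage, e.g. 'cachep::walk bufctl | ::bufctl -c biodone' where cachep is a
 * kmem_cache address.
 */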
3025 
3026 typedef struct kmem_verify {
3027 	uint64_t *kmv_buf;		/* buffer to read cache contents into */
3028 	size_t kmv_size;		/* number of bytes in kmv_buf */
3029 	int kmv_corruption;		/* > 0 if corruption found. */
3030 	uint_t kmv_flags;		/* dcmd flags */
3031 	struct kmem_cache kmv_cache;	/* the cache we're operating on */
3032 } kmem_verify_t;
3033 
3034 /*
3035  * verify_pattern()
3036  *	verify that buf is filled with the pattern pat.
3037  */
3038 static int64_t
3039 verify_pattern(uint64_t *buf_arg, size_t size, uint64_t pat)
3040 {
3041 	/*LINTED*/
3042 	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
3043 	uint64_t *buf;
3044 
3045 	for (buf = buf_arg; buf < bufend; buf++)
3046 		if (*buf != pat)
3047 			return ((uintptr_t)buf - (uintptr_t)buf_arg);
3048 	return (-1);
3049 }
3050 
3051 /*
3052  * verify_buftag()
3053  *	verify that btp->bt_bxstat == (bcp ^ pat)
3054  */
3055 static int
3056 verify_buftag(kmem_buftag_t *btp, uintptr_t pat)
3057 {
3058 	return (btp->bt_bxstat == ((intptr_t)btp->bt_bufctl ^ pat) ? 0 : -1);
3059 }
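
/*
 * For example (hypothetical value): a free buffer whose bt_bufctl is
 * 0xffffff01cd041a40 should have bt_bxstat equal to
 * 0xffffff01cd041a40 ^ KMEM_BUFTAG_FREE; any other value means the buftag
 * has been damaged, and verify_buftag() returns -1.
 */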
3060 
3061 /*
3062  * verify_free()
3063  *	verify the integrity of a free block of memory by checking
3064  *	that it is filled with 0xdeadbeef and that its buftag is sane.
3065  */
3066 /*ARGSUSED1*/
3067 static int
3068 verify_free(uintptr_t addr, const void *data, void *private)
3069 {
3070 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3071 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3072 	int64_t corrupt;		/* corruption offset */
3073 	kmem_buftag_t *buftagp;		/* ptr to buftag */
3074 	kmem_cache_t *cp = &kmv->kmv_cache;
3075 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3076 
3077 	/*LINTED*/
3078 	buftagp = KMEM_BUFTAG(cp, buf);
3079 
3080 	/*
3081 	 * Read the buffer to check.
3082 	 */
3083 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3084 		if (!besilent)
3085 			mdb_warn("couldn't read %p", addr);
3086 		return (WALK_NEXT);
3087 	}
3088 
3089 	if ((corrupt = verify_pattern(buf, cp->cache_verify,
3090 	    KMEM_FREE_PATTERN)) >= 0) {
3091 		if (!besilent)
3092 			mdb_printf("buffer %p (free) seems corrupted, at %p\n",
3093 			    addr, (uintptr_t)addr + corrupt);
3094 		goto corrupt;
3095 	}
3096 	/*
3097 	 * When KMF_LITE is set, buftagp->bt_redzone is used to hold
3098 	 * the first bytes of the buffer, hence we cannot check for red
3099 	 * zone corruption.
3100 	 */
3101 	if ((cp->cache_flags & (KMF_HASH | KMF_LITE)) == KMF_HASH &&
3102 	    buftagp->bt_redzone != KMEM_REDZONE_PATTERN) {
3103 		if (!besilent)
3104 			mdb_printf("buffer %p (free) seems to "
3105 			    "have a corrupt redzone pattern\n", addr);
3106 		goto corrupt;
3107 	}
3108 
3109 	/*
3110 	 * confirm bufctl pointer integrity.
3111 	 */
3112 	if (verify_buftag(buftagp, KMEM_BUFTAG_FREE) == -1) {
3113 		if (!besilent)
3114 			mdb_printf("buffer %p (free) has a corrupt "
3115 			    "buftag\n", addr);
3116 		goto corrupt;
3117 	}
3118 
3119 	return (WALK_NEXT);
3120 corrupt:
3121 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3122 		mdb_printf("%p\n", addr);
3123 	kmv->kmv_corruption++;
3124 	return (WALK_NEXT);
3125 }
3126 
3127 /*
3128  * verify_alloc()
3129  *	Verify that the buftag of an allocated buffer makes sense with respect
3130  *	to the buffer.
3131  */
3132 /*ARGSUSED1*/
3133 static int
3134 verify_alloc(uintptr_t addr, const void *data, void *private)
3135 {
3136 	kmem_verify_t *kmv = (kmem_verify_t *)private;
3137 	kmem_cache_t *cp = &kmv->kmv_cache;
3138 	uint64_t *buf = kmv->kmv_buf;	/* buf to validate */
3139 	/*LINTED*/
3140 	kmem_buftag_t *buftagp = KMEM_BUFTAG(cp, buf);
3141 	uint32_t *ip = (uint32_t *)buftagp;
3142 	uint8_t *bp = (uint8_t *)buf;
3143 	int looks_ok = 0, size_ok = 1;	/* flags for finding corruption */
3144 	boolean_t besilent = !!(kmv->kmv_flags & (DCMD_LOOP | DCMD_PIPE_OUT));
3145 
3146 	/*
3147 	 * Read the buffer to check.
3148 	 */
3149 	if (mdb_vread(buf, kmv->kmv_size, addr) == -1) {
3150 		if (!besilent)
3151 			mdb_warn("couldn't read %p", addr);
3152 		return (WALK_NEXT);
3153 	}
3154 
3155 	/*
3156 	 * There are two cases to handle:
3157 	 * 1. If the buf was alloc'd using kmem_cache_alloc, it will have
3158 	 *    0xfeedfacefeedface at the end of it
3159 	 * 2. If the buf was alloc'd using kmem_alloc, it will have
3160 	 *    0xbb just past the end of the region in use.  At the buftag,
3161 	 *    it will have 0xfeedface (or, if the whole buffer is in use,
3162 	 *    0xfeedface & bb000000 or 0xfeedfacf & 000000bb depending on
3163 	 *    endianness), followed by 32 bits containing the offset of the
3164 	 *    0xbb byte in the buffer.
3165 	 *
3166 	 * Finally, the two 32-bit words that comprise the second half of the
3167 	 * buftag should xor to KMEM_BUFTAG_ALLOC
3168 	 */
3169 
3170 	if (buftagp->bt_redzone == KMEM_REDZONE_PATTERN)
3171 		looks_ok = 1;
3172 	else if (!KMEM_SIZE_VALID(ip[1]))
3173 		size_ok = 0;
3174 	else if (bp[KMEM_SIZE_DECODE(ip[1])] == KMEM_REDZONE_BYTE)
3175 		looks_ok = 1;
3176 	else
3177 		size_ok = 0;
3178 
3179 	if (!size_ok) {
3180 		if (!besilent)
3181 			mdb_printf("buffer %p (allocated) has a corrupt "
3182 			    "redzone size encoding\n", addr);
3183 		goto corrupt;
3184 	}
3185 
3186 	if (!looks_ok) {
3187 		if (!besilent)
3188 			mdb_printf("buffer %p (allocated) has a corrupt "
3189 			    "redzone signature\n", addr);
3190 		goto corrupt;
3191 	}
3192 
3193 	if (verify_buftag(buftagp, KMEM_BUFTAG_ALLOC) == -1) {
3194 		if (!besilent)
3195 			mdb_printf("buffer %p (allocated) has a "
3196 			    "corrupt buftag\n", addr);
3197 		goto corrupt;
3198 	}
3199 
3200 	return (WALK_NEXT);
3201 corrupt:
3202 	if (kmv->kmv_flags & DCMD_PIPE_OUT)
3203 		mdb_printf("%p\n", addr);
3204 
3205 	kmv->kmv_corruption++;
3206 	return (WALK_NEXT);
3207 }
3208 
3209 /*ARGSUSED2*/
3210 int
3211 kmem_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3212 {
3213 	if (flags & DCMD_ADDRSPEC) {
3214 		int check_alloc = 0, check_free = 0;
3215 		kmem_verify_t kmv;
3216 
3217 		if (mdb_vread(&kmv.kmv_cache, sizeof (kmv.kmv_cache),
3218 		    addr) == -1) {
3219 			mdb_warn("couldn't read kmem_cache %p", addr);
3220 			return (DCMD_ERR);
3221 		}
3222 
3223 		if ((kmv.kmv_cache.cache_dump.kd_unsafe ||
3224 		    kmv.kmv_cache.cache_dump.kd_alloc_fails) &&
3225 		    !(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3226 			mdb_warn("WARNING: cache was used during dump: "
3227 			    "corruption may be incorrectly reported\n");
3228 		}
3229 
3230 		kmv.kmv_size = kmv.kmv_cache.cache_buftag +
3231 		    sizeof (kmem_buftag_t);
3232 		kmv.kmv_buf = mdb_alloc(kmv.kmv_size, UM_SLEEP | UM_GC);
3233 		kmv.kmv_corruption = 0;
3234 		kmv.kmv_flags = flags;
3235 
3236 		if ((kmv.kmv_cache.cache_flags & KMF_REDZONE)) {
3237 			check_alloc = 1;
3238 			if (kmv.kmv_cache.cache_flags & KMF_DEADBEEF)
3239 				check_free = 1;
3240 		} else {
3241 			if (!(flags & DCMD_LOOP)) {
3242 				mdb_warn("cache %p (%s) does not have "
3243 				    "redzone checking enabled\n", addr,
3244 				    kmv.kmv_cache.cache_name);
3245 			}
3246 			return (DCMD_ERR);
3247 		}
3248 
3249 		if (!(flags & (DCMD_LOOP | DCMD_PIPE_OUT))) {
3250 			mdb_printf("Summary for cache '%s'\n",
3251 			    kmv.kmv_cache.cache_name);
3252 			mdb_inc_indent(2);
3253 		}
3254 
3255 		if (check_alloc)
3256 			(void) mdb_pwalk("kmem", verify_alloc, &kmv, addr);
3257 		if (check_free)
3258 			(void) mdb_pwalk("freemem", verify_free, &kmv, addr);
3259 
3260 		if (!(flags & DCMD_PIPE_OUT)) {
3261 			if (flags & DCMD_LOOP) {
3262 				if (kmv.kmv_corruption == 0) {
3263 					mdb_printf("%-*s %?p clean\n",
3264 					    KMEM_CACHE_NAMELEN,
3265 					    kmv.kmv_cache.cache_name, addr);
3266 				} else {
3267 					mdb_printf("%-*s %?p %d corrupt "
3268 					    "buffer%s\n", KMEM_CACHE_NAMELEN,
3269 					    kmv.kmv_cache.cache_name, addr,
3270 					    kmv.kmv_corruption,
3271 					    kmv.kmv_corruption > 1 ? "s" : "");
3272 				}
3273 			} else {
3274 				/*
3275 				 * This is the more verbose mode, when the user
3276 				 * typed addr::kmem_verify.  If the cache was
3277 				 * clean, nothing will have yet been printed. So
3278 				 * say something.
3279 				 */
3280 				if (kmv.kmv_corruption == 0)
3281 					mdb_printf("clean\n");
3282 
3283 				mdb_dec_indent(2);
3284 			}
3285 		}
3286 	} else {
3287 		/*
3288 		 * If the user didn't specify a cache to verify, we'll walk all
3289 		 * kmem_cache's, specifying ourself as a callback for each...
3290 		 * this is the equivalent of '::walk kmem_cache .::kmem_verify'
3291 		 */
3292 
3293 		if (!(flags & DCMD_PIPE_OUT)) {
3294 			uintptr_t dump_curr;
3295 			uintptr_t dump_end;
3296 
3297 			if (mdb_readvar(&dump_curr, "kmem_dump_curr") != -1 &&
3298 			    mdb_readvar(&dump_end, "kmem_dump_end") != -1 &&
3299 			    dump_curr == dump_end) {
3300 				mdb_warn("WARNING: exceeded kmem_dump_size; "
3301 				    "corruption may be incorrectly reported\n");
3302 			}
3303 
3304 			mdb_printf("%<u>%-*s %-?s %-20s%</u>\n",
3305 			    KMEM_CACHE_NAMELEN, "Cache Name", "Addr",
3306 			    "Cache Integrity");
3307 		}
3308 
3309 		(void) (mdb_walk_dcmd("kmem_cache", "kmem_verify", 0, NULL));
3310 	}
3311 
3312 	return (DCMD_OK);
3313 }
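
/*
 * Example ::kmem_verify sessions (a sketch; the cache addresses, names, and
 * the corrupt buffer address are hypothetical):
 *
 *	> ::kmem_verify
 *	Cache Name                      Addr             Cache Integrity
 *	kmem_alloc_8                    ffffff01cc002008 clean
 *	kmem_alloc_16                   ffffff01cc003008 1 corrupt buffer
 *	...
 *
 *	> ffffff01cc003008::kmem_verify
 *	Summary for cache 'kmem_alloc_16'
 *	  buffer ffffff01cd1a2e40 (free) seems corrupted, at ffffff01cd1a2e40
 *
 * Per-cache checking requires the cache to have KMF_REDZONE set (and
 * KMF_DEADBEEF as well for free-buffer checks), i.e. the cache was created
 * with kmem debugging enabled.
 */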
3314 
3315 typedef struct vmem_node {
3316 	struct vmem_node *vn_next;
3317 	struct vmem_node *vn_parent;
3318 	struct vmem_node *vn_sibling;
3319 	struct vmem_node *vn_children;
3320 	uintptr_t vn_addr;
3321 	int vn_marked;
3322 	vmem_t vn_vmem;
3323 } vmem_node_t;
3324 
3325 typedef struct vmem_walk {
3326 	vmem_node_t *vw_root;
3327 	vmem_node_t *vw_current;
3328 } vmem_walk_t;
3329 
3330 int
3331 vmem_walk_init(mdb_walk_state_t *wsp)
3332 {
3333 	uintptr_t vaddr, paddr;
3334 	vmem_node_t *head = NULL, *root = NULL, *current = NULL, *parent, *vp;
3335 	vmem_walk_t *vw;
3336 
3337 	if (mdb_readvar(&vaddr, "vmem_list") == -1) {
3338 		mdb_warn("couldn't read 'vmem_list'");
3339 		return (WALK_ERR);
3340 	}
3341 
3342 	while (vaddr != 0) {
3343 		vp = mdb_zalloc(sizeof (vmem_node_t), UM_SLEEP);
3344 		vp->vn_addr = vaddr;
3345 		vp->vn_next = head;
3346 		head = vp;
3347 
3348 		if (vaddr == wsp->walk_addr)
3349 			current = vp;
3350 
3351 		if (mdb_vread(&vp->vn_vmem, sizeof (vmem_t), vaddr) == -1) {
3352 			mdb_warn("couldn't read vmem_t at %p", vaddr);
3353 			goto err;
3354 		}
3355 
3356 		vaddr = (uintptr_t)vp->vn_vmem.vm_next;
3357 	}
3358 
3359 	for (vp = head; vp != NULL; vp = vp->vn_next) {
3360 
3361 		if ((paddr = (uintptr_t)vp->vn_vmem.vm_source) == 0) {
3362 			vp->vn_sibling = root;
3363 			root = vp;
3364 			continue;
3365 		}
3366 
3367 		for (parent = head; parent != NULL; parent = parent->vn_next) {
3368 			if (parent->vn_addr != paddr)
3369 				continue;
3370 			vp->vn_sibling = parent->vn_children;
3371 			parent->vn_children = vp;
3372 			vp->vn_parent = parent;
3373 			break;
3374 		}
3375 
3376 		if (parent == NULL) {
3377 			mdb_warn("couldn't find %p's parent (%p)\n",
3378 			    vp->vn_addr, paddr);
3379 			goto err;
3380 		}
3381 	}
3382 
3383 	vw = mdb_zalloc(sizeof (vmem_walk_t), UM_SLEEP);
3384 	vw->vw_root = root;
3385 
3386 	if (current != NULL)
3387 		vw->vw_current = current;
3388 	else
3389 		vw->vw_current = root;
3390 
3391 	wsp->walk_data = vw;
3392 	return (WALK_NEXT);
3393 err:
3394 	for (vp = head; head != NULL; vp = head) {
3395 		head = vp->vn_next;
3396 		mdb_free(vp, sizeof (vmem_node_t));
3397 	}
3398 
3399 	return (WALK_ERR);
3400 }
3401 
3402 int
3403 vmem_walk_step(mdb_walk_state_t *wsp)
3404 {
3405 	vmem_walk_t *vw = wsp->walk_data;
3406 	vmem_node_t *vp;
3407 	int rval;
3408 
3409 	if ((vp = vw->vw_current) == NULL)
3410 		return (WALK_DONE);
3411 
3412 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3413 
3414 	if (vp->vn_children != NULL) {
3415 		vw->vw_current = vp->vn_children;
3416 		return (rval);
3417 	}
3418 
3419 	do {
3420 		vw->vw_current = vp->vn_sibling;
3421 		vp = vp->vn_parent;
3422 	} while (vw->vw_current == NULL && vp != NULL);
3423 
3424 	return (rval);
3425 }
3426 
3427 /*
3428  * The "vmem_postfix" walk walks the vmem arenas in post-fix order; all
3429  * children are visited before their parent.  We perform the postfix walk
3430  * iteratively (rather than recursively) to allow mdb to regain control
3431  * after each callback.
3432  */
3433 int
3434 vmem_postfix_walk_step(mdb_walk_state_t *wsp)
3435 {
3436 	vmem_walk_t *vw = wsp->walk_data;
3437 	vmem_node_t *vp = vw->vw_current;
3438 	int rval;
3439 
3440 	/*
3441 	 * If this node is marked, then we know that we have already visited
3442 	 * all of its children.  If the node has any siblings, they need to
3443 	 * be visited next; otherwise, we need to visit the parent.  Note
3444 	 * that vp->vn_marked will only be zero on the first invocation of
3445 	 * the step function.
3446 	 */
3447 	if (vp->vn_marked) {
3448 		if (vp->vn_sibling != NULL)
3449 			vp = vp->vn_sibling;
3450 		else if (vp->vn_parent != NULL)
3451 			vp = vp->vn_parent;
3452 		else {
3453 			/*
3454 			 * We have neither a parent, nor a sibling, and we
3455 			 * have already been visited; we're done.
3456 			 */
3457 			return (WALK_DONE);
3458 		}
3459 	}
3460 
3461 	/*
3462 	 * Before we visit this node, visit its children.
3463 	 */
3464 	while (vp->vn_children != NULL && !vp->vn_children->vn_marked)
3465 		vp = vp->vn_children;
3466 
3467 	vp->vn_marked = 1;
3468 	vw->vw_current = vp;
3469 	rval = wsp->walk_callback(vp->vn_addr, &vp->vn_vmem, wsp->walk_cbdata);
3470 
3471 	return (rval);
3472 }
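
/*
 * As an illustration: given an arena A with children B and C, where B in
 * turn has a child D, the postfix walk visits D before B, and visits both
 * B and C before A; every arena is visited only after all of its children.
 */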
3473 
3474 void
3475 vmem_walk_fini(mdb_walk_state_t *wsp)
3476 {
3477 	vmem_walk_t *vw = wsp->walk_data;
3478 	vmem_node_t *root = vw->vw_root;
3479 	int done;
3480 
3481 	if (root == NULL)
3482 		return;
3483 
3484 	if ((vw->vw_root = root->vn_children) != NULL)
3485 		vmem_walk_fini(wsp);
3486 
3487 	vw->vw_root = root->vn_sibling;
3488 	done = (root->vn_sibling == NULL && root->vn_parent == NULL);
3489 	mdb_free(root, sizeof (vmem_node_t));
3490 
3491 	if (done) {
3492 		mdb_free(vw, sizeof (vmem_walk_t));
3493 	} else {
3494 		vmem_walk_fini(wsp);
3495 	}
3496 }
3497 
3498 typedef struct vmem_seg_walk {
3499 	uint8_t vsw_type;
3500 	uintptr_t vsw_start;
3501 	uintptr_t vsw_current;
3502 } vmem_seg_walk_t;
3503 
3504 /*ARGSUSED*/
3505 int
3506 vmem_seg_walk_common_init(mdb_walk_state_t *wsp, uint8_t type, char *name)
3507 {
3508 	vmem_seg_walk_t *vsw;
3509 
3510 	if (wsp->walk_addr == 0) {
3511 		mdb_warn("vmem_%s does not support global walks\n", name);
3512 		return (WALK_ERR);
3513 	}
3514 
3515 	wsp->walk_data = vsw = mdb_alloc(sizeof (vmem_seg_walk_t), UM_SLEEP);
3516 
3517 	vsw->vsw_type = type;
3518 	vsw->vsw_start = wsp->walk_addr + offsetof(vmem_t, vm_seg0);
3519 	vsw->vsw_current = vsw->vsw_start;
3520 
3521 	return (WALK_NEXT);
3522 }
3523 
3524 /*
3525  * vmem segments can't have type 0 (this should be added to vmem_impl.h).
3526  */
3527 #define	VMEM_NONE	0
3528 
3529 int
3530 vmem_alloc_walk_init(mdb_walk_state_t *wsp)
3531 {
3532 	return (vmem_seg_walk_common_init(wsp, VMEM_ALLOC, "alloc"));
3533 }
3534 
3535 int
3536 vmem_free_walk_init(mdb_walk_state_t *wsp)
3537 {
3538 	return (vmem_seg_walk_common_init(wsp, VMEM_FREE, "free"));
3539 }
3540 
3541 int
3542 vmem_span_walk_init(mdb_walk_state_t *wsp)
3543 {
3544 	return (vmem_seg_walk_common_init(wsp, VMEM_SPAN, "span"));
3545 }
3546 
3547 int
3548 vmem_seg_walk_init(mdb_walk_state_t *wsp)
3549 {
3550 	return (vmem_seg_walk_common_init(wsp, VMEM_NONE, "seg"));
3551 }
3552 
3553 int
3554 vmem_seg_walk_step(mdb_walk_state_t *wsp)
3555 {
3556 	vmem_seg_t seg;
3557 	vmem_seg_walk_t *vsw = wsp->walk_data;
3558 	uintptr_t addr = vsw->vsw_current;
3559 	static size_t seg_size = 0;
3560 	int rval;
3561 
3562 	if (!seg_size) {
3563 		if (mdb_readvar(&seg_size, "vmem_seg_size") == -1) {
3564 			mdb_warn("failed to read 'vmem_seg_size'");
3565 			seg_size = sizeof (vmem_seg_t);
3566 		}
3567 	}
3568 
3569 	if (seg_size < sizeof (seg))
3570 		bzero((caddr_t)&seg + seg_size, sizeof (seg) - seg_size);
3571 
3572 	if (mdb_vread(&seg, seg_size, addr) == -1) {
3573 		mdb_warn("couldn't read vmem_seg at %p", addr);
3574 		return (WALK_ERR);
3575 	}
3576 
3577 	vsw->vsw_current = (uintptr_t)seg.vs_anext;
3578 	if (vsw->vsw_type != VMEM_NONE && seg.vs_type != vsw->vsw_type) {
3579 		rval = WALK_NEXT;
3580 	} else {
3581 		rval = wsp->walk_callback(addr, &seg, wsp->walk_cbdata);
3582 	}
3583 
3584 	if (vsw->vsw_current == vsw->vsw_start)
3585 		return (WALK_DONE);
3586 
3587 	return (rval);
3588 }
3589 
3590 void
3591 vmem_seg_walk_fini(mdb_walk_state_t *wsp)
3592 {
3593 	vmem_seg_walk_t *vsw = wsp->walk_data;
3594 
3595 	mdb_free(vsw, sizeof (vmem_seg_walk_t));
3596 }
3597 
3598 #define	VMEM_NAMEWIDTH	22
3599 
3600 int
3601 vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3602 {
3603 	vmem_t v, parent;
3604 	vmem_kstat_t *vkp = &v.vm_kstat;
3605 	uintptr_t paddr;
3606 	int ident = 0;
3607 	char c[VMEM_NAMEWIDTH];
3608 
3609 	if (!(flags & DCMD_ADDRSPEC)) {
3610 		if (mdb_walk_dcmd("vmem", "vmem", argc, argv) == -1) {
3611 			mdb_warn("can't walk vmem");
3612 			return (DCMD_ERR);
3613 		}
3614 		return (DCMD_OK);
3615 	}
3616 
3617 	if (DCMD_HDRSPEC(flags))
3618 		mdb_printf("%-?s %-*s %10s %12s %9s %5s\n",
3619 		    "ADDR", VMEM_NAMEWIDTH, "NAME", "INUSE",
3620 		    "TOTAL", "SUCCEED", "FAIL");
3621 
3622 	if (mdb_vread(&v, sizeof (v), addr) == -1) {
3623 		mdb_warn("couldn't read vmem at %p", addr);
3624 		return (DCMD_ERR);
3625 	}
3626 
3627 	for (paddr = (uintptr_t)v.vm_source; paddr != 0; ident += 2) {
3628 		if (mdb_vread(&parent, sizeof (parent), paddr) == -1) {
3629 			mdb_warn("couldn't trace %p's ancestry", addr);
3630 			ident = 0;
3631 			break;
3632 		}
3633 		paddr = (uintptr_t)parent.vm_source;
3634 	}
3635 
3636 	(void) mdb_snprintf(c, VMEM_NAMEWIDTH, "%*s%s", ident, "", v.vm_name);
3637 
3638 	mdb_printf("%0?p %-*s %10llu %12llu %9llu %5llu\n",
3639 	    addr, VMEM_NAMEWIDTH, c,
3640 	    vkp->vk_mem_inuse.value.ui64, vkp->vk_mem_total.value.ui64,
3641 	    vkp->vk_alloc.value.ui64, vkp->vk_fail.value.ui64);
3642 
3643 	return (DCMD_OK);
3644 }
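
/*
 * Sample ::vmem output (a sketch; addresses and counters are made up).
 * Child arenas are indented two spaces per level beneath their source
 * arena:
 *
 *	> ::vmem
 *	ADDR             NAME                      INUSE        TOTAL  SUCCEED  FAIL
 *	fffffffffbc30a20 heap                 1630769152 103079215104   103680     0
 *	fffffffffbc31220   kmem_metadata        90365952     94371840    20352     0
 *	...
 */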
3645 
3646 void
3647 vmem_seg_help(void)
3648 {
3649 	mdb_printf("%s",
3650 "Display the contents of vmem_seg_ts, with optional filtering.\n\n"
3651 "\n"
3652 "A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n"
3653 "representing a single chunk of data.  Only ALLOC segments have debugging\n"
3654 "information.\n");
3655 	mdb_dec_indent(2);
3656 	mdb_printf("%<b>OPTIONS%</b>\n");
3657 	mdb_inc_indent(2);
3658 	mdb_printf("%s",
3659 "  -v    Display the full content of the vmem_seg, including its stack trace\n"
3660 "  -s    report the size of the segment, instead of the end address\n"
3661 "  -c caller\n"
3662 "        filter out segments without the function/PC in their stack trace\n"
3663 "  -e earliest\n"
3664 "        filter out segments timestamped before earliest\n"
3665 "  -l latest\n"
3666 "        filter out segments timestamped after latest\n"
3667 "  -m minsize\n"
3668 "        filer out segments smaller than minsize\n"
3669 "  -M maxsize\n"
3670 "        filer out segments larger than maxsize\n"
3671 "  -t thread\n"
3672 "        filter out segments not involving thread\n"
3673 "  -T type\n"
3674 "        filter out segments not of type 'type'\n"
3675 "        type is one of: ALLOC/FREE/SPAN/ROTOR/WALKER\n");
3676 }
3677 
3678 /*ARGSUSED*/
3679 int
3680 vmem_seg(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3681 {
3682 	vmem_seg_t vs;
3683 	pc_t *stk = vs.vs_stack;
3684 	uintptr_t sz;
3685 	uint8_t t;
3686 	const char *type = NULL;
3687 	GElf_Sym sym;
3688 	char c[MDB_SYM_NAMLEN];
3689 	int no_debug;
3690 	int i;
3691 	int depth;
3692 	uintptr_t laddr, haddr;
3693 
3694 	uintptr_t caller = 0, thread = 0;
3695 	uintptr_t minsize = 0, maxsize = 0;
3696 
3697 	hrtime_t earliest = 0, latest = 0;
3698 
3699 	uint_t size = 0;
3700 	uint_t verbose = 0;
3701 
3702 	if (!(flags & DCMD_ADDRSPEC))
3703 		return (DCMD_USAGE);
3704 
3705 	if (mdb_getopts(argc, argv,
3706 	    'c', MDB_OPT_UINTPTR, &caller,
3707 	    'e', MDB_OPT_UINT64, &earliest,
3708 	    'l', MDB_OPT_UINT64, &latest,
3709 	    's', MDB_OPT_SETBITS, TRUE, &size,
3710 	    'm', MDB_OPT_UINTPTR, &minsize,
3711 	    'M', MDB_OPT_UINTPTR, &maxsize,
3712 	    't', MDB_OPT_UINTPTR, &thread,
3713 	    'T', MDB_OPT_STR, &type,
3714 	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
3715 	    NULL) != argc)
3716 		return (DCMD_USAGE);
3717 
3718 	if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
3719 		if (verbose) {
3720 			mdb_printf("%16s %4s %16s %16s %16s\n"
3721 			    "%<u>%16s %4s %16s %16s %16s%</u>\n",
3722 			    "ADDR", "TYPE", "START", "END", "SIZE",
3723 			    "", "", "THREAD", "TIMESTAMP", "");
3724 		} else {
3725 			mdb_printf("%?s %4s %?s %?s %s\n", "ADDR", "TYPE",
3726 			    "START", size? "SIZE" : "END", "WHO");
3727 		}
3728 	}
3729 
3730 	if (mdb_vread(&vs, sizeof (vs), addr) == -1) {
3731 		mdb_warn("couldn't read vmem_seg at %p", addr);
3732 		return (DCMD_ERR);
3733 	}
3734 
3735 	if (type != NULL) {
3736 		if (strcmp(type, "ALLC") == 0 || strcmp(type, "ALLOC") == 0)
3737 			t = VMEM_ALLOC;
3738 		else if (strcmp(type, "FREE") == 0)
3739 			t = VMEM_FREE;
3740 		else if (strcmp(type, "SPAN") == 0)
3741 			t = VMEM_SPAN;
3742 		else if (strcmp(type, "ROTR") == 0 ||
3743 		    strcmp(type, "ROTOR") == 0)
3744 			t = VMEM_ROTOR;
3745 		else if (strcmp(type, "WLKR") == 0 ||
3746 		    strcmp(type, "WALKER") == 0)
3747 			t = VMEM_WALKER;
3748 		else {
3749 			mdb_warn("\"%s\" is not a recognized vmem_seg type\n",
3750 			    type);
3751 			return (DCMD_ERR);
3752 		}
3753 
3754 		if (vs.vs_type != t)
3755 			return (DCMD_OK);
3756 	}
3757 
3758 	sz = vs.vs_end - vs.vs_start;
3759 
3760 	if (minsize != 0 && sz < minsize)
3761 		return (DCMD_OK);
3762 
3763 	if (maxsize != 0 && sz > maxsize)
3764 		return (DCMD_OK);
3765 
3766 	t = vs.vs_type;
3767 	depth = vs.vs_depth;
3768 
3769 	/*
3770 	 * debug info, when present, is only accurate for VMEM_ALLOC segments
3771 	 */
3772 	no_debug = (t != VMEM_ALLOC) ||
3773 	    (depth == 0 || depth > VMEM_STACK_DEPTH);
3774 
3775 	if (no_debug) {
3776 		if (caller != 0 || thread != 0 || earliest != 0 || latest != 0)
3777 			return (DCMD_OK);		/* not enough info */
3778 	} else {
3779 		if (caller != 0) {
3780 			laddr = caller;
3781 			haddr = caller + sizeof (caller);
3782 
3783 			if (mdb_lookup_by_addr(caller, MDB_SYM_FUZZY, c,
3784 			    sizeof (c), &sym) != -1 &&
3785 			    caller == (uintptr_t)sym.st_value) {
3786 				/*
3787 				 * We were provided an exact symbol value; any
3788 				 * address in the function is valid.
3789 				 */
3790 				laddr = (uintptr_t)sym.st_value;
3791 				haddr = (uintptr_t)sym.st_value + sym.st_size;
3792 			}
3793 
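			/*
			 * Keep this segment only if some frame of its saved
			 * stack trace falls within [laddr, haddr).
			 */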
3794 			for (i = 0; i < depth; i++)
3795 				if (vs.vs_stack[i] >= laddr &&
3796 				    vs.vs_stack[i] < haddr)
3797 					break;
3798 
3799 			if (i == depth)
3800 				return (DCMD_OK);
3801 		}
3802 
3803 		if (thread != 0 && (uintptr_t)vs.vs_thread != thread)
3804 			return (DCMD_OK);
3805 
3806 		if (earliest != 0 && vs.vs_timestamp < earliest)
3807 			return (DCMD_OK);
3808 
3809 		if (latest != 0 && vs.vs_timestamp > latest)
3810 			return (DCMD_OK);
3811 	}
3812 
3813 	type = (t == VMEM_ALLOC ? "ALLC" :
3814 	    t == VMEM_FREE ? "FREE" :
3815 	    t == VMEM_SPAN ? "SPAN" :
3816 	    t == VMEM_ROTOR ? "ROTR" :
3817 	    t == VMEM_WALKER ? "WLKR" :
3818 	    "????");
3819 
3820 	if (flags & DCMD_PIPE_OUT) {
3821 		mdb_printf("%#lr\n", addr);
3822 		return (DCMD_OK);
3823 	}
3824 
3825 	if (verbose) {
3826 		mdb_printf("%<b>%16p%</b> %4s %16p %16p %16ld\n",
3827 		    addr, type, vs.vs_start, vs.vs_end, sz);
3828 
3829 		if (no_debug)
3830 			return (DCMD_OK);
3831 
3832 		mdb_printf("%16s %4s %16p %16llx\n",
3833 		    "", "", vs.vs_thread, vs.vs_timestamp);
3834 
3835 		mdb_inc_indent(17);
3836 		for (i = 0; i < depth; i++) {
3837 			mdb_printf("%a\n", stk[i]);
3838 		}
3839 		mdb_dec_indent(17);
3840 		mdb_printf("\n");
3841 	} else {
3842 		mdb_printf("%0?p %4s %0?p %0?p", addr, type,
3843 		    vs.vs_start, size? sz : vs.vs_end);
3844 
3845 		if (no_debug) {
3846 			mdb_printf("\n");
3847 			return (DCMD_OK);
3848 		}
3849 
3850 		for (i = 0; i < depth; i++) {
3851 			if (mdb_lookup_by_addr(stk[i], MDB_SYM_FUZZY,
3852 			    c, sizeof (c), &sym) == -1)
3853 				continue;
3854 			if (strncmp(c, "vmem_", 5) == 0)
3855 				continue;
3856 			break;
3857 		}
3858 		mdb_printf(" %a\n", stk[i]);
3859 	}
3860 	return (DCMD_OK);
3861 }
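/*
 * Illustrative ::vmem_seg usage (the arena address below is hypothetical):
 *
 *	> fffffffffbc30000::walk vmem_alloc | ::vmem_seg -s -m 0t8192
 *
 * shows the ALLOC segments of the given arena that are at least 8192 bytes,
 * reporting each segment's size rather than its end address.
 */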
3862 
3863 typedef struct kmalog_data {
3864 	uintptr_t	kma_addr;
3865 	hrtime_t	kma_newest;
3866 } kmalog_data_t;
3867 
3868 /*ARGSUSED*/
3869 static int
3870 showbc(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmalog_data_t *kma)
3871 {
3872 	char name[KMEM_CACHE_NAMELEN + 1];
3873 	hrtime_t delta;
3874 	int i, depth;
3875 	size_t bufsize;
3876 
3877 	if (bcp->bc_timestamp == 0)
3878 		return (WALK_DONE);
3879 
3880 	if (kma->kma_newest == 0)
3881 		kma->kma_newest = bcp->bc_timestamp;
3882 
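	/*
	 * If the caller supplied an address, skip transactions whose buffer
	 * does not contain it.
	 */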
3883 	if (kma->kma_addr) {
3884 		if (mdb_vread(&bufsize, sizeof (bufsize),
3885 		    (uintptr_t)&bcp->bc_cache->cache_bufsize) == -1) {
3886 			mdb_warn(
3887 			    "failed to read cache_bufsize for cache at %p",
3888 			    bcp->bc_cache);
3889 			return (WALK_ERR);
3890 		}
3891 
3892 		if (kma->kma_addr < (uintptr_t)bcp->bc_addr ||
3893 		    kma->kma_addr >= (uintptr_t)bcp->bc_addr + bufsize)
3894 			return (WALK_NEXT);
3895 	}
3896 
3897 	delta = kma->kma_newest - bcp->bc_timestamp;
3898 	depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
3899 
3900 	if (mdb_readstr(name, sizeof (name), (uintptr_t)
3901 	    &bcp->bc_cache->cache_name) <= 0)
3902 		(void) mdb_snprintf(name, sizeof (name), "%a", bcp->bc_cache);
3903 
3904 	mdb_printf("\nT-%lld.%09lld  addr=%p  %s\n",
3905 	    delta / NANOSEC, delta % NANOSEC, bcp->bc_addr, name);
3906 
3907 	for (i = 0; i < depth; i++)
3908 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
3909 
3910 	return (WALK_NEXT);
3911 }
3912 
3913 int
3914 kmalog(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
3915 {
3916 	const char *logname = "kmem_transaction_log";
3917 	kmalog_data_t kma;
3918 
3919 	if (argc > 1)
3920 		return (DCMD_USAGE);
3921 
3922 	kma.kma_newest = 0;
3923 	if (flags & DCMD_ADDRSPEC)
3924 		kma.kma_addr = addr;
3925 	else
3926 		kma.kma_addr = 0;
3927 
3928 	if (argc > 0) {
3929 		if (argv->a_type != MDB_TYPE_STRING)
3930 			return (DCMD_USAGE);
3931 		if (strcmp(argv->a_un.a_str, "fail") == 0)
3932 			logname = "kmem_failure_log";
3933 		else if (strcmp(argv->a_un.a_str, "slab") == 0)
3934 			logname = "kmem_slab_log";
3935 		else if (strcmp(argv->a_un.a_str, "zerosized") == 0)
3936 			logname = "kmem_zerosized_log";
3937 		else
3938 			return (DCMD_USAGE);
3939 	}
3940 
3941 	if (mdb_readvar(&addr, logname) == -1) {
3942 		mdb_warn("failed to read %s log header pointer", logname);
3943 		return (DCMD_ERR);
3944 	}
3945 
3946 	if (mdb_pwalk("kmem_log", (mdb_walk_cb_t)showbc, &kma, addr) == -1) {
3947 		mdb_warn("failed to walk kmem log");
3948 		return (DCMD_ERR);
3949 	}
3950 
3951 	return (DCMD_OK);
3952 }
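/*
 * Illustrative ::kmalog usage ("addr" below is a placeholder for a buffer
 * address of interest):
 *
 *	> ::kmalog		walk the kmem transaction log
 *	> ::kmalog fail		walk the allocation-failure log
 *	> addr::kmalog		show only transactions touching addr
 */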
3953 
3954 /*
3955  * As the final lure for die-hard crash(8) users, we provide ::kmausers here.
3956  * The first piece is a structure which we use to accumulate kmem_cache_t
3957  * addresses of interest.  kmc_add() is used as a callback for the kmem_cache
3958  * walker; we either add all caches, or only those named explicitly as arguments.
3959  */
3960 
3961 typedef struct kmclist {
3962 	const char *kmc_name;			/* Name to match (or NULL) */
3963 	uintptr_t *kmc_caches;			/* List of kmem_cache_t addrs */
3964 	int kmc_nelems;				/* Num entries in kmc_caches */
3965 	int kmc_size;				/* Size of kmc_caches array */
3966 } kmclist_t;
3967 
3968 static int
3969 kmc_add(uintptr_t addr, const kmem_cache_t *cp, kmclist_t *kmc)
3970 {
3971 	void *p;
3972 	int s;
3973 
3974 	if (kmc->kmc_name == NULL ||
3975 	    strcmp(cp->cache_name, kmc->kmc_name) == 0) {
3976 		/*
3977 		 * If we have a match, grow our array (if necessary), and then
3978 		 * add the virtual address of the matching cache to our list.
3979 		 */
3980 		if (kmc->kmc_nelems >= kmc->kmc_size) {
3981 			s = kmc->kmc_size ? kmc->kmc_size * 2 : 256;
3982 			p = mdb_alloc(sizeof (uintptr_t) * s, UM_SLEEP | UM_GC);
3983 
3984 			bcopy(kmc->kmc_caches, p,
3985 			    sizeof (uintptr_t) * kmc->kmc_size);
3986 
3987 			kmc->kmc_caches = p;
3988 			kmc->kmc_size = s;
3989 		}
3990 
3991 		kmc->kmc_caches[kmc->kmc_nelems++] = addr;
3992 		return (kmc->kmc_name ? WALK_DONE : WALK_NEXT);
3993 	}
3994 
3995 	return (WALK_NEXT);
3996 }
3997 
3998 /*
3999  * The second piece of ::kmausers is a hash table of allocations.  Each
4000  * allocation owner is identified by its stack trace and data_size.  We then
4001  * track the total bytes of all such allocations, and the number of allocations
4002  * to report at the end.  Once we have a list of caches, we walk through the
4003  * allocated bufctls of each, and update our hash table accordingly.
4004  */
4005 
4006 typedef struct kmowner {
4007 	struct kmowner *kmo_head;		/* First hash elt in bucket */
4008 	struct kmowner *kmo_next;		/* Next hash elt in chain */
4009 	size_t kmo_signature;			/* Hash table signature */
4010 	uint_t kmo_num;				/* Number of allocations */
4011 	size_t kmo_data_size;			/* Size of each allocation */
4012 	size_t kmo_total_size;			/* Total bytes of allocation */
4013 	int kmo_depth;				/* Depth of stack trace */
4014 	uintptr_t kmo_stack[KMEM_STACK_DEPTH];	/* Stack trace */
4015 } kmowner_t;
4016 
4017 typedef struct kmusers {
4018 	uintptr_t kmu_addr;			/* address of interest */
4019 	const kmem_cache_t *kmu_cache;		/* Current kmem cache */
4020 	kmowner_t *kmu_hash;			/* Hash table of owners */
4021 	int kmu_nelems;				/* Number of entries in use */
4022 	int kmu_size;				/* Total number of entries */
4023 } kmusers_t;
4024 
4025 static void
4026 kmu_add(kmusers_t *kmu, const kmem_bufctl_audit_t *bcp,
4027     size_t size, size_t data_size)
4028 {
4029 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4030 	size_t bucket, signature = data_size;
4031 	kmowner_t *kmo, *kmoend;
4032 
4033 	/*
4034 	 * If the hash table is full, double its size and rehash everything.
4035 	 */
4036 	if (kmu->kmu_nelems >= kmu->kmu_size) {
4037 		int s = kmu->kmu_size ? kmu->kmu_size * 2 : 1024;
4038 
4039 		kmo = mdb_alloc(sizeof (kmowner_t) * s, UM_SLEEP | UM_GC);
4040 		bcopy(kmu->kmu_hash, kmo, sizeof (kmowner_t) * kmu->kmu_size);
4041 		kmu->kmu_hash = kmo;
4042 		kmu->kmu_size = s;
4043 
4044 		kmoend = kmu->kmu_hash + kmu->kmu_size;
4045 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++)
4046 			kmo->kmo_head = NULL;
4047 
4048 		kmoend = kmu->kmu_hash + kmu->kmu_nelems;
4049 		for (kmo = kmu->kmu_hash; kmo < kmoend; kmo++) {
4050 			bucket = kmo->kmo_signature & (kmu->kmu_size - 1);
4051 			kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4052 			kmu->kmu_hash[bucket].kmo_head = kmo;
4053 		}
4054 	}
4055 
4056 	/*
4057 	 * Finish computing the hash signature from the stack trace, and then
4058 	 * see if the owner is in the hash table.  If so, update our stats.
4059 	 */
4060 	for (i = 0; i < depth; i++)
4061 		signature += bcp->bc_stack[i];
4062 
4063 	bucket = signature & (kmu->kmu_size - 1);
4064 
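	/*
	 * Walk the chain for this bucket.  A signature match alone is not
	 * sufficient: confirm that the data size, stack depth, and every
	 * stack frame are identical before merging into an existing owner.
	 */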
4065 	for (kmo = kmu->kmu_hash[bucket].kmo_head; kmo; kmo = kmo->kmo_next) {
4066 		if (kmo->kmo_signature == signature) {
4067 			size_t difference = 0;
4068 
4069 			difference |= kmo->kmo_data_size - data_size;
4070 			difference |= kmo->kmo_depth - depth;
4071 
4072 			for (i = 0; i < depth; i++) {
4073 				difference |= kmo->kmo_stack[i] -
4074 				    bcp->bc_stack[i];
4075 			}
4076 
4077 			if (difference == 0) {
4078 				kmo->kmo_total_size += size;
4079 				kmo->kmo_num++;
4080 				return;
4081 			}
4082 		}
4083 	}
4084 
4085 	/*
4086 	 * If the owner is not yet hashed, grab the next element and fill it
4087 	 * in based on the allocation information.
4088 	 */
4089 	kmo = &kmu->kmu_hash[kmu->kmu_nelems++];
4090 	kmo->kmo_next = kmu->kmu_hash[bucket].kmo_head;
4091 	kmu->kmu_hash[bucket].kmo_head = kmo;
4092 
4093 	kmo->kmo_signature = signature;
4094 	kmo->kmo_num = 1;
4095 	kmo->kmo_data_size = data_size;
4096 	kmo->kmo_total_size = size;
4097 	kmo->kmo_depth = depth;
4098 
4099 	for (i = 0; i < depth; i++)
4100 		kmo->kmo_stack[i] = bcp->bc_stack[i];
4101 }
4102 
4103 /*
4104  * When ::kmausers is invoked without the -f flag, we simply update our hash
4105  * table with the information from each allocated bufctl.
4106  */
4107 /*ARGSUSED*/
4108 static int
4109 kmause1(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4110 {
4111 	const kmem_cache_t *cp = kmu->kmu_cache;
4112 
4113 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4114 	return (WALK_NEXT);
4115 }
4116 
4117 /*
4118  * When ::kmausers is invoked with the -f flag, we print out the information
4119  * for each bufctl as well as updating the hash table.
4120  */
4121 static int
4122 kmause2(uintptr_t addr, const kmem_bufctl_audit_t *bcp, kmusers_t *kmu)
4123 {
4124 	int i, depth = MIN(bcp->bc_depth, KMEM_STACK_DEPTH);
4125 	const kmem_cache_t *cp = kmu->kmu_cache;
4126 	kmem_bufctl_t bufctl;
4127 
4128 	if (kmu->kmu_addr) {
4129 		if (mdb_vread(&bufctl, sizeof (bufctl),  addr) == -1)
4130 			mdb_warn("couldn't read bufctl at %p", addr);
4131 		else if (kmu->kmu_addr < (uintptr_t)bufctl.bc_addr ||
4132 		    kmu->kmu_addr >= (uintptr_t)bufctl.bc_addr +
4133 		    cp->cache_bufsize)
4134 			return (WALK_NEXT);
4135 	}
4136 
4137 	mdb_printf("size %d, addr %p, thread %p, cache %s\n",
4138 	    cp->cache_bufsize, addr, bcp->bc_thread, cp->cache_name);
4139 
4140 	for (i = 0; i < depth; i++)
4141 		mdb_printf("\t %a\n", bcp->bc_stack[i]);
4142 
4143 	kmu_add(kmu, bcp, cp->cache_bufsize, cp->cache_bufsize);
4144 	return (WALK_NEXT);
4145 }
4146 
4147 /*
4148  * We sort our results by allocation size before printing them.
4149  */
4150 static int
4151 kmownercmp(const void *lp, const void *rp)
4152 {
4153 	const kmowner_t *lhs = lp;
4154 	const kmowner_t *rhs = rp;
4155 
4156 	return (rhs->kmo_total_size - lhs->kmo_total_size);
4157 }
4158 
4159 /*
4160  * The main engine of ::kmausers is relatively straightforward: First we
4161  * accumulate our list of kmem_cache_t addresses into the kmclist_t. Next we
4162  * iterate over the allocated bufctls of each cache in the list.  Finally,
4163  * we sort and print our results.
4164  */
4165 /*ARGSUSED*/
4166 int
4167 kmausers(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4168 {
4169 	int mem_threshold = 8192;	/* Minimum # bytes for printing */
4170 	int cnt_threshold = 100;	/* Minimum # blocks for printing */
4171 	int audited_caches = 0;		/* Number of KMF_AUDIT caches found */
4172 	int do_all_caches = 1;		/* Do all caches (no arguments) */
4173 	int opt_e = FALSE;		/* Include "small" users */
4174 	int opt_f = FALSE;		/* Print stack traces */
4175 
4176 	mdb_walk_cb_t callback = (mdb_walk_cb_t)kmause1;
4177 	kmowner_t *kmo, *kmoend;
4178 	int i, oelems;
4179 
4180 	kmclist_t kmc;
4181 	kmusers_t kmu;
4182 
4183 	bzero(&kmc, sizeof (kmc));
4184 	bzero(&kmu, sizeof (kmu));
4185 
4186 	while ((i = mdb_getopts(argc, argv,
4187 	    'e', MDB_OPT_SETBITS, TRUE, &opt_e,
4188 	    'f', MDB_OPT_SETBITS, TRUE, &opt_f, NULL)) != argc) {
4189 
4190 		argv += i;	/* skip past options we just processed */
4191 		argc -= i;	/* adjust argc */
4192 
4193 		if (argv->a_type != MDB_TYPE_STRING || *argv->a_un.a_str == '-')
4194 			return (DCMD_USAGE);
4195 
4196 		oelems = kmc.kmc_nelems;
4197 		kmc.kmc_name = argv->a_un.a_str;
4198 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4199 
4200 		if (kmc.kmc_nelems == oelems) {
4201 			mdb_warn("unknown kmem cache: %s\n", kmc.kmc_name);
4202 			return (DCMD_ERR);
4203 		}
4204 
4205 		do_all_caches = 0;
4206 		argv++;
4207 		argc--;
4208 	}
4209 
4210 	if (flags & DCMD_ADDRSPEC) {
4211 		opt_f = TRUE;
4212 		kmu.kmu_addr = addr;
4213 	} else {
4214 		kmu.kmu_addr = 0;
4215 	}
4216 
4217 	if (opt_e)
4218 		mem_threshold = cnt_threshold = 0;
4219 
4220 	if (opt_f)
4221 		callback = (mdb_walk_cb_t)kmause2;
4222 
4223 	if (do_all_caches) {
4224 		kmc.kmc_name = NULL; /* match all cache names */
4225 		(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmc_add, &kmc);
4226 	}
4227 
4228 	for (i = 0; i < kmc.kmc_nelems; i++) {
4229 		uintptr_t cp = kmc.kmc_caches[i];
4230 		kmem_cache_t c;
4231 
4232 		if (mdb_vread(&c, sizeof (c), cp) == -1) {
4233 			mdb_warn("failed to read cache at %p", cp);
4234 			continue;
4235 		}
4236 
4237 		if (!(c.cache_flags & KMF_AUDIT)) {
4238 			if (!do_all_caches) {
4239 				mdb_warn("KMF_AUDIT is not enabled for %s\n",
4240 				    c.cache_name);
4241 			}
4242 			continue;
4243 		}
4244 
4245 		kmu.kmu_cache = &c;
4246 		(void) mdb_pwalk("bufctl", callback, &kmu, cp);
4247 		audited_caches++;
4248 	}
4249 
4250 	if (audited_caches == 0 && do_all_caches) {
4251 		mdb_warn("KMF_AUDIT is not enabled for any caches\n");
4252 		return (DCMD_ERR);
4253 	}
4254 
4255 	qsort(kmu.kmu_hash, kmu.kmu_nelems, sizeof (kmowner_t), kmownercmp);
4256 	kmoend = kmu.kmu_hash + kmu.kmu_nelems;
4257 
4258 	for (kmo = kmu.kmu_hash; kmo < kmoend; kmo++) {
4259 		if (kmo->kmo_total_size < mem_threshold &&
4260 		    kmo->kmo_num < cnt_threshold)
4261 			continue;
4262 		mdb_printf("%lu bytes for %u allocations with data size %lu:\n",
4263 		    kmo->kmo_total_size, kmo->kmo_num, kmo->kmo_data_size);
4264 		for (i = 0; i < kmo->kmo_depth; i++)
4265 			mdb_printf("\t %a\n", kmo->kmo_stack[i]);
4266 	}
4267 
4268 	return (DCMD_OK);
4269 }
4270 
4271 void
4272 kmausers_help(void)
4273 {
4274 	mdb_printf(
4275 	    "Displays the largest users of the kmem allocator, grouped by stack\n"
4276 	    "trace.  If one or more caches is specified, only those caches\n"
4277 	    "will be searched.  By default, all caches are searched.  If an\n"
4278 	    "address is specified, then only those allocations which include\n"
4279 	    "the given address are displayed.  Specifying an address implies\n"
4280 	    "-f.\n"
4281 	    "\n"
4282 	    "\t-e\tInclude all users, not just the largest\n"
4283 	    "\t-f\tDisplay individual allocations.  By default, users are\n"
4284 	    "\t\tgrouped by stack\n");
4285 }
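/*
 * Illustrative ::kmausers usage (the cache name below is just an example):
 *
 *	> ::kmausers			largest users across all audited caches
 *	> ::kmausers kmem_alloc_256	restrict the search to one cache
 *	> ::kmausers -ef		include small users and print each
 *					individual allocation
 *
 * KMF_AUDIT must be enabled (e.g. by setting kmem_flags at boot) for the
 * bufctl audit records to exist.
 */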
4286 
4287 static int
4288 kmem_ready_check(void)
4289 {
4290 	int ready;
4291 
4292 	if (mdb_readvar(&ready, "kmem_ready") < 0)
4293 		return (-1); /* errno is set for us */
4294 
4295 	return (ready);
4296 }
4297 
4298 void
4299 kmem_statechange(void)
4300 {
4301 	static int been_ready = 0;
4302 
4303 	if (been_ready)
4304 		return;
4305 
4306 	if (kmem_ready_check() <= 0)
4307 		return;
4308 
4309 	been_ready = 1;
4310 	(void) mdb_walk("kmem_cache", (mdb_walk_cb_t)kmem_init_walkers, NULL);
4311 }
4312 
4313 void
4314 kmem_init(void)
4315 {
4316 	mdb_walker_t w = {
4317 		"kmem_cache", "walk list of kmem caches", kmem_cache_walk_init,
4318 		list_walk_step, list_walk_fini
4319 	};
4320 
4321 	/*
4322 	 * If kmem is ready, we'll need to invoke the kmem_cache walker
4323 	 * immediately.  Walkers in the linkage structure won't be ready until
4324 	 * _mdb_init returns, so we'll need to add this one manually.  If kmem
4325 	 * is ready, we'll use the walker to initialize the caches.  If kmem
4326 	 * isn't ready, we'll register a callback that will allow us to defer
4327 	 * cache walking until it is.
4328 	 */
4329 	if (mdb_add_walker(&w) != 0) {
4330 		mdb_warn("failed to add kmem_cache walker");
4331 		return;
4332 	}
4333 
4334 	kmem_statechange();
4335 
4336 	/* register our ::whatis handlers */
4337 	mdb_whatis_register("modules", whatis_run_modules, NULL,
4338 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4339 	mdb_whatis_register("threads", whatis_run_threads, NULL,
4340 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4341 	mdb_whatis_register("pages", whatis_run_pages, NULL,
4342 	    WHATIS_PRIO_EARLY, WHATIS_REG_NO_ID);
4343 	mdb_whatis_register("kmem", whatis_run_kmem, NULL,
4344 	    WHATIS_PRIO_ALLOCATOR, 0);
4345 	mdb_whatis_register("vmem", whatis_run_vmem, NULL,
4346 	    WHATIS_PRIO_ALLOCATOR, 0);
4347 }
4348 
4349 typedef struct whatthread {
4350 	uintptr_t	wt_target;
4351 	int		wt_verbose;
4352 } whatthread_t;
4353 
4354 static int
4355 whatthread_walk_thread(uintptr_t addr, const kthread_t *t, whatthread_t *w)
4356 {
4357 	uintptr_t current, data;
4358 
4359 	if (t->t_stkbase == NULL)
4360 		return (WALK_NEXT);
4361 
4362 	/*
4363 	 * Warn about swapped out threads, but drive on anyway
4364 	 */
4365 	if (!(t->t_schedflag & TS_LOAD)) {
4366 		mdb_warn("thread %p's stack swapped out\n", addr);
4367 		return (WALK_NEXT);
4368 	}
4369 
4370 	/*
4371 	 * Search the thread's stack for the given pointer.  Note that it would
4372 	 * be more efficient to follow ::kgrep's lead and read in page-sized
4373 	 * chunks, but this routine is already fast and simple.
4374 	 */
4375 	for (current = (uintptr_t)t->t_stkbase; current < (uintptr_t)t->t_stk;
4376 	    current += sizeof (uintptr_t)) {
4377 		if (mdb_vread(&data, sizeof (data), current) == -1) {
4378 			mdb_warn("couldn't read thread %p's stack at %p",
4379 			    addr, current);
4380 			return (WALK_ERR);
4381 		}
4382 
4383 		if (data == w->wt_target) {
4384 			if (w->wt_verbose) {
4385 				mdb_printf("%p in thread %p's stack%s\n",
4386 				    current, addr, stack_active(t, current));
4387 			} else {
4388 				mdb_printf("%#lr\n", addr);
4389 				return (WALK_NEXT);
4390 			}
4391 		}
4392 	}
4393 
4394 	return (WALK_NEXT);
4395 }
4396 
4397 int
4398 whatthread(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
4399 {
4400 	whatthread_t w;
4401 
4402 	if (!(flags & DCMD_ADDRSPEC))
4403 		return (DCMD_USAGE);
4404 
4405 	w.wt_verbose = FALSE;
4406 	w.wt_target = addr;
4407 
4408 	if (mdb_getopts(argc, argv,
4409 	    'v', MDB_OPT_SETBITS, TRUE, &w.wt_verbose, NULL) != argc)
4410 		return (DCMD_USAGE);
4411 
4412 	if (mdb_walk("thread", (mdb_walk_cb_t)whatthread_walk_thread, &w)
4413 	    == -1) {
4414 		mdb_warn("couldn't walk threads");
4415 		return (DCMD_ERR);
4416 	}
4417 
4418 	return (DCMD_OK);
4419 }
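/*
 * Illustrative ::whatthread usage (the address below is hypothetical):
 *
 *	> ffffff01d3c0a000::whatthread
 *
 * prints each thread whose stack contains a word equal to the given value;
 * with -v, the stack location of each match is reported as well.
 */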
4420