xref: /freebsd/sys/vm/uma_core.c (revision 6b2c1e49da284f28ec7b52f7c031474087e37104)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory allocator
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/domainset.h>
63 #include <sys/eventhandler.h>
64 #include <sys/kernel.h>
65 #include <sys/types.h>
66 #include <sys/limits.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/random.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vmmeter.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_domainset.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_phys.h>
89 #include <vm/vm_pagequeue.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94 #include <vm/uma_int.h>
95 #include <vm/uma_dbg.h>
96 
97 #include <ddb/ddb.h>
98 
99 #ifdef DEBUG_MEMGUARD
100 #include <vm/memguard.h>
101 #endif
102 
103 /*
104  * This is the zone and keg from which all zones are spawned.
105  */
106 static uma_zone_t kegs;
107 static uma_zone_t zones;
108 
109 /* This is the zone from which all offpage uma_slab_ts are allocated. */
110 static uma_zone_t slabzone;
111 
112 /*
113  * The initial hash tables come out of this zone so they can be allocated
114  * prior to malloc coming up.
115  */
116 static uma_zone_t hashzone;
117 
118 /* The boot-time adjusted value for cache line alignment. */
119 int uma_align_cache = 64 - 1;
120 
121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122 
123 /*
124  * Are we allowed to allocate buckets?
125  */
126 static int bucketdisable = 1;
127 
128 /* Linked list of all kegs in the system */
129 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
130 
131 /* Linked list of all cache-only zones in the system */
132 static LIST_HEAD(,uma_zone) uma_cachezones =
133     LIST_HEAD_INITIALIZER(uma_cachezones);
134 
135 /* This RW lock protects the keg list */
136 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
137 
138 /*
139  * Pointer and counter to pool of pages, that is preallocated at
140  * startup to bootstrap UMA.
141  */
142 static char *bootmem;
143 static int boot_pages;
144 
145 static struct sx uma_reclaim_lock;
146 
147 /*
148  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
149  * allocations don't trigger a wakeup of the reclaim thread.
150  */
151 static unsigned long uma_kmem_limit = LONG_MAX;
152 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
153     "UMA kernel memory soft limit");
154 static unsigned long uma_kmem_total;
155 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
156     "UMA kernel memory usage");
157 
158 /* Is the VM done starting up? */
159 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
160     BOOT_RUNNING } booted = BOOT_COLD;
161 
162 /*
163  * This is the handle used to schedule events that need to happen
164  * outside of the allocation fast path.
165  */
166 static struct callout uma_callout;
167 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
168 
169 /*
170  * This structure is passed as the zone ctor arg so that I don't have to create
171  * a special allocation function just for zones.
172  */
173 struct uma_zctor_args {
174 	const char *name;
175 	size_t size;
176 	uma_ctor ctor;
177 	uma_dtor dtor;
178 	uma_init uminit;
179 	uma_fini fini;
180 	uma_import import;
181 	uma_release release;
182 	void *arg;
183 	uma_keg_t keg;
184 	int align;
185 	uint32_t flags;
186 };
187 
188 struct uma_kctor_args {
189 	uma_zone_t zone;
190 	size_t size;
191 	uma_init uminit;
192 	uma_fini fini;
193 	int align;
194 	uint32_t flags;
195 };
196 
197 struct uma_bucket_zone {
198 	uma_zone_t	ubz_zone;
199 	char		*ubz_name;
200 	int		ubz_entries;	/* Number of items it can hold. */
201 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
202 };
203 
204 /*
205  * Compute the actual number of bucket entries to pack them in power
206  * of two sizes for more efficient space utilization.
207  */
208 #define	BUCKET_SIZE(n)						\
209     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
210 
211 #define	BUCKET_MAX	BUCKET_SIZE(256)
212 #define	BUCKET_MIN	BUCKET_SIZE(4)
213 
214 struct uma_bucket_zone bucket_zones[] = {
215 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
216 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
217 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
218 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
219 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
220 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
221 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
222 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
223 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
224 	{ NULL, NULL, 0}
225 };
226 
227 /*
228  * Flags and enumerations to be passed to internal functions.
229  */
230 enum zfreeskip {
231 	SKIP_NONE =	0,
232 	SKIP_CNT =	0x00000001,
233 	SKIP_DTOR =	0x00010000,
234 	SKIP_FINI =	0x00020000,
235 };
236 
237 /* Prototypes.. */
238 
239 int	uma_startup_count(int);
240 void	uma_startup(void *, int);
241 void	uma_startup1(void);
242 void	uma_startup2(void);
243 
244 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
245 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
246 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248 static void page_free(void *, vm_size_t, uint8_t);
249 static void pcpu_page_free(void *, vm_size_t, uint8_t);
250 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
251 static void cache_drain(uma_zone_t);
252 static void bucket_drain(uma_zone_t, uma_bucket_t);
253 static void bucket_cache_reclaim(uma_zone_t zone, bool);
254 static int keg_ctor(void *, int, void *, int);
255 static void keg_dtor(void *, int, void *);
256 static int zone_ctor(void *, int, void *, int);
257 static void zone_dtor(void *, int, void *);
258 static int zero_init(void *, int, int);
259 static void keg_small_init(uma_keg_t keg);
260 static void keg_large_init(uma_keg_t keg);
261 static void zone_foreach(void (*zfunc)(uma_zone_t));
262 static void zone_timeout(uma_zone_t zone);
263 static int hash_alloc(struct uma_hash *, u_int);
264 static int hash_expand(struct uma_hash *, struct uma_hash *);
265 static void hash_free(struct uma_hash *hash);
266 static void uma_timeout(void *);
267 static void uma_startup3(void);
268 static void *zone_alloc_item(uma_zone_t, void *, int, int);
269 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
270 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
271 static void bucket_enable(void);
272 static void bucket_init(void);
273 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
274 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
275 static void bucket_zone_drain(void);
276 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
277 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
278 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
279 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
280 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
281     uma_fini fini, int align, uint32_t flags);
282 static int zone_import(uma_zone_t, void **, int, int, int);
283 static void zone_release(uma_zone_t, void **, int);
284 static void uma_zero_item(void *, uma_zone_t);
285 
286 void uma_print_zone(uma_zone_t);
287 void uma_print_stats(void);
288 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
289 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
290 
291 #ifdef INVARIANTS
292 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
293 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
294 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
295 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
296 
297 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
298     "Memory allocation debugging");
299 
300 static u_int dbg_divisor = 1;
301 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
302     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
303     "Debug & thrash every this item in memory allocator");
304 
305 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
306 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
307 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
308     &uma_dbg_cnt, "memory items debugged");
309 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
310     &uma_skip_cnt, "memory items skipped, not debugged");
311 #endif
312 
313 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
314 
315 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
316     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
317 
318 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
319     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
320 
321 static int zone_warnings = 1;
322 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
323     "Warn when UMA zones becomes full");
324 
325 /* Adjust bytes under management by UMA. */
326 static inline void
327 uma_total_dec(unsigned long size)
328 {
329 
330 	atomic_subtract_long(&uma_kmem_total, size);
331 }
332 
333 static inline void
334 uma_total_inc(unsigned long size)
335 {
336 
337 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
338 		uma_reclaim_wakeup();
339 }
340 
341 /*
342  * This routine checks to see whether or not it's safe to enable buckets.
343  */
344 static void
345 bucket_enable(void)
346 {
347 	bucketdisable = vm_page_count_min();
348 }
349 
350 /*
351  * Initialize bucket_zones, the array of zones of buckets of various sizes.
352  *
353  * For each zone, calculate the memory required for each bucket, consisting
354  * of the header and an array of pointers.
355  */
356 static void
357 bucket_init(void)
358 {
359 	struct uma_bucket_zone *ubz;
360 	int size;
361 
362 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
363 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
364 		size += sizeof(void *) * ubz->ubz_entries;
365 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
366 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
367 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
368 	}
369 }
370 
371 /*
372  * Given a desired number of entries for a bucket, return the zone from which
373  * to allocate the bucket.
374  */
375 static struct uma_bucket_zone *
376 bucket_zone_lookup(int entries)
377 {
378 	struct uma_bucket_zone *ubz;
379 
380 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
381 		if (ubz->ubz_entries >= entries)
382 			return (ubz);
383 	ubz--;
384 	return (ubz);
385 }
386 
387 static struct uma_bucket_zone *
388 bucket_zone_max(uma_zone_t zone, int nitems)
389 {
390 	struct uma_bucket_zone *ubz;
391 	int bpcpu;
392 
393 	bpcpu = 2;
394 #ifdef UMA_XDOMAIN
395 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
396 		/* Count the cross-domain bucket. */
397 		bpcpu++;
398 #endif
399 
400 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
401 		if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
402 			break;
403 	if (ubz == &bucket_zones[0])
404 		ubz = NULL;
405 	else
406 		ubz--;
407 	return (ubz);
408 }
409 
410 static int
411 bucket_select(int size)
412 {
413 	struct uma_bucket_zone *ubz;
414 
415 	ubz = &bucket_zones[0];
416 	if (size > ubz->ubz_maxsize)
417 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
418 
419 	for (; ubz->ubz_entries != 0; ubz++)
420 		if (ubz->ubz_maxsize < size)
421 			break;
422 	ubz--;
423 	return (ubz->ubz_entries);
424 }
425 
426 static uma_bucket_t
427 bucket_alloc(uma_zone_t zone, void *udata, int flags)
428 {
429 	struct uma_bucket_zone *ubz;
430 	uma_bucket_t bucket;
431 
432 	/*
433 	 * This is to stop us from allocating per cpu buckets while we're
434 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
435 	 * boot pages.  This also prevents us from allocating buckets in
436 	 * low memory situations.
437 	 */
438 	if (bucketdisable)
439 		return (NULL);
440 	/*
441 	 * To limit bucket recursion we store the original zone flags
442 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
443 	 * NOVM flag to persist even through deep recursions.  We also
444 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
445 	 * a bucket for a bucket zone so we do not allow infinite bucket
446 	 * recursion.  This cookie will even persist to frees of unused
447 	 * buckets via the allocation path or bucket allocations in the
448 	 * free path.
449 	 */
450 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
451 		udata = (void *)(uintptr_t)zone->uz_flags;
452 	else {
453 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
454 			return (NULL);
455 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
456 	}
457 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
458 		flags |= M_NOVM;
459 	ubz = bucket_zone_lookup(zone->uz_count);
460 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
461 		ubz++;
462 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
463 	if (bucket) {
464 #ifdef INVARIANTS
465 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
466 #endif
467 		bucket->ub_cnt = 0;
468 		bucket->ub_entries = ubz->ubz_entries;
469 	}
470 
471 	return (bucket);
472 }
473 
474 static void
475 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
476 {
477 	struct uma_bucket_zone *ubz;
478 
479 	KASSERT(bucket->ub_cnt == 0,
480 	    ("bucket_free: Freeing a non free bucket."));
481 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
482 		udata = (void *)(uintptr_t)zone->uz_flags;
483 	ubz = bucket_zone_lookup(bucket->ub_entries);
484 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
485 }
486 
487 static void
488 bucket_zone_drain(void)
489 {
490 	struct uma_bucket_zone *ubz;
491 
492 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
493 		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
494 }
495 
496 /*
497  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
498  * zone's caches.
499  */
500 static uma_bucket_t
501 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
502 {
503 	uma_bucket_t bucket;
504 
505 	ZONE_LOCK_ASSERT(zone);
506 
507 	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
508 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
509 		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
510 		zdom->uzd_nitems -= bucket->ub_cnt;
511 		if (zdom->uzd_imin > zdom->uzd_nitems)
512 			zdom->uzd_imin = zdom->uzd_nitems;
513 		zone->uz_bkt_count -= bucket->ub_cnt;
514 	}
515 	return (bucket);
516 }
517 
518 /*
519  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
520  * whether the bucket's contents should be counted as part of the zone's working
521  * set.
522  */
523 static void
524 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
525     const bool ws)
526 {
527 
528 	ZONE_LOCK_ASSERT(zone);
529 	KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max,
530 	    ("%s: zone %p overflow", __func__, zone));
531 
532 	if (ws)
533 		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
534 	else
535 		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
536 	zdom->uzd_nitems += bucket->ub_cnt;
537 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
538 		zdom->uzd_imax = zdom->uzd_nitems;
539 	zone->uz_bkt_count += bucket->ub_cnt;
540 }
541 
542 static void
543 zone_log_warning(uma_zone_t zone)
544 {
545 	static const struct timeval warninterval = { 300, 0 };
546 
547 	if (!zone_warnings || zone->uz_warning == NULL)
548 		return;
549 
550 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
551 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
552 }
553 
554 static inline void
555 zone_maxaction(uma_zone_t zone)
556 {
557 
558 	if (zone->uz_maxaction.ta_func != NULL)
559 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
560 }
561 
562 /*
563  * Routine called by timeout which is used to fire off some time interval
564  * based calculations.  (stats, hash size, etc.)
565  *
566  * Arguments:
567  *	arg   Unused
568  *
569  * Returns:
570  *	Nothing
571  */
572 static void
573 uma_timeout(void *unused)
574 {
575 	bucket_enable();
576 	zone_foreach(zone_timeout);
577 
578 	/* Reschedule this event */
579 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
580 }
581 
582 /*
583  * Update the working set size estimate for the zone's bucket cache.
584  * The constants chosen here are somewhat arbitrary.  With an update period of
585  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
586  * last 100s.
587  */
588 static void
589 zone_domain_update_wss(uma_zone_domain_t zdom)
590 {
591 	long wss;
592 
593 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
594 	wss = zdom->uzd_imax - zdom->uzd_imin;
595 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
596 	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
597 }
598 
599 /*
600  * Routine to perform timeout driven calculations.  This expands the
601  * hashes and does per cpu statistics aggregation.
602  *
603  *  Returns nothing.
604  */
605 static void
606 zone_timeout(uma_zone_t zone)
607 {
608 	uma_keg_t keg;
609 	u_int slabs;
610 
611 	if ((zone->uz_flags & UMA_ZONE_HASH) == 0)
612 		goto update_wss;
613 
614 	keg = zone->uz_keg;
615 	KEG_LOCK(keg);
616 	/*
617 	 * Expand the keg hash table.
618 	 *
619 	 * This is done if the number of slabs is larger than the hash size.
620 	 * What I'm trying to do here is completely reduce collisions.  This
621 	 * may be a little aggressive.  Should I allow for two collisions max?
622 	 */
623 	if (keg->uk_flags & UMA_ZONE_HASH &&
624 	    (slabs = keg->uk_pages / keg->uk_ppera) >
625 	     keg->uk_hash.uh_hashsize) {
626 		struct uma_hash newhash;
627 		struct uma_hash oldhash;
628 		int ret;
629 
630 		/*
631 		 * This is so involved because allocating and freeing
632 		 * while the keg lock is held will lead to deadlock.
633 		 * I have to do everything in stages and check for
634 		 * races.
635 		 */
636 		KEG_UNLOCK(keg);
637 		ret = hash_alloc(&newhash, 1 << fls(slabs));
638 		KEG_LOCK(keg);
639 		if (ret) {
640 			if (hash_expand(&keg->uk_hash, &newhash)) {
641 				oldhash = keg->uk_hash;
642 				keg->uk_hash = newhash;
643 			} else
644 				oldhash = newhash;
645 
646 			KEG_UNLOCK(keg);
647 			hash_free(&oldhash);
648 			return;
649 		}
650 	}
651 	KEG_UNLOCK(keg);
652 
653 update_wss:
654 	ZONE_LOCK(zone);
655 	for (int i = 0; i < vm_ndomains; i++)
656 		zone_domain_update_wss(&zone->uz_domain[i]);
657 	ZONE_UNLOCK(zone);
658 }
659 
660 /*
661  * Allocate and zero fill the next sized hash table from the appropriate
662  * backing store.
663  *
664  * Arguments:
665  *	hash  A new hash structure with the old hash size in uh_hashsize
666  *
667  * Returns:
668  *	1 on success and 0 on failure.
669  */
670 static int
671 hash_alloc(struct uma_hash *hash, u_int size)
672 {
673 	size_t alloc;
674 
675 	KASSERT(powerof2(size), ("hash size must be power of 2"));
676 	if (size > UMA_HASH_SIZE_INIT)  {
677 		hash->uh_hashsize = size;
678 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
679 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
680 		    M_UMAHASH, M_NOWAIT);
681 	} else {
682 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
683 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
684 		    UMA_ANYDOMAIN, M_WAITOK);
685 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
686 	}
687 	if (hash->uh_slab_hash) {
688 		bzero(hash->uh_slab_hash, alloc);
689 		hash->uh_hashmask = hash->uh_hashsize - 1;
690 		return (1);
691 	}
692 
693 	return (0);
694 }
695 
696 /*
697  * Expands the hash table for HASH zones.  This is done from zone_timeout
698  * to reduce collisions.  This must not be done in the regular allocation
699  * path, otherwise, we can recurse on the vm while allocating pages.
700  *
701  * Arguments:
702  *	oldhash  The hash you want to expand
703  *	newhash  The hash structure for the new table
704  *
705  * Returns:
706  *	Nothing
707  *
708  * Discussion:
709  */
710 static int
711 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
712 {
713 	uma_slab_t slab;
714 	u_int hval;
715 	u_int idx;
716 
717 	if (!newhash->uh_slab_hash)
718 		return (0);
719 
720 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
721 		return (0);
722 
723 	/*
724 	 * I need to investigate hash algorithms for resizing without a
725 	 * full rehash.
726 	 */
727 
728 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
729 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
730 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
731 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
732 			hval = UMA_HASH(newhash, slab->us_data);
733 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
734 			    slab, us_hlink);
735 		}
736 
737 	return (1);
738 }
739 
740 /*
741  * Free the hash bucket to the appropriate backing store.
742  *
743  * Arguments:
744  *	slab_hash  The hash bucket we're freeing
745  *	hashsize   The number of entries in that hash bucket
746  *
747  * Returns:
748  *	Nothing
749  */
750 static void
751 hash_free(struct uma_hash *hash)
752 {
753 	if (hash->uh_slab_hash == NULL)
754 		return;
755 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
756 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
757 	else
758 		free(hash->uh_slab_hash, M_UMAHASH);
759 }
760 
761 /*
762  * Frees all outstanding items in a bucket
763  *
764  * Arguments:
765  *	zone   The zone to free to, must be unlocked.
766  *	bucket The free/alloc bucket with items, cpu queue must be locked.
767  *
768  * Returns:
769  *	Nothing
770  */
771 
772 static void
773 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
774 {
775 	int i;
776 
777 	if (bucket == NULL)
778 		return;
779 
780 	if (zone->uz_fini)
781 		for (i = 0; i < bucket->ub_cnt; i++)
782 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
783 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
784 	if (zone->uz_max_items > 0) {
785 		ZONE_LOCK(zone);
786 		zone->uz_items -= bucket->ub_cnt;
787 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
788 			wakeup_one(zone);
789 		ZONE_UNLOCK(zone);
790 	}
791 	bucket->ub_cnt = 0;
792 }
793 
794 /*
795  * Drains the per cpu caches for a zone.
796  *
797  * NOTE: This may only be called while the zone is being turn down, and not
798  * during normal operation.  This is necessary in order that we do not have
799  * to migrate CPUs to drain the per-CPU caches.
800  *
801  * Arguments:
802  *	zone     The zone to drain, must be unlocked.
803  *
804  * Returns:
805  *	Nothing
806  */
807 static void
808 cache_drain(uma_zone_t zone)
809 {
810 	uma_cache_t cache;
811 	int cpu;
812 
813 	/*
814 	 * XXX: It is safe to not lock the per-CPU caches, because we're
815 	 * tearing down the zone anyway.  I.e., there will be no further use
816 	 * of the caches at this point.
817 	 *
818 	 * XXX: It would good to be able to assert that the zone is being
819 	 * torn down to prevent improper use of cache_drain().
820 	 *
821 	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
822 	 * it is used elsewhere.  Should the tear-down path be made special
823 	 * there in some form?
824 	 */
825 	CPU_FOREACH(cpu) {
826 		cache = &zone->uz_cpu[cpu];
827 		bucket_drain(zone, cache->uc_allocbucket);
828 		if (cache->uc_allocbucket != NULL)
829 			bucket_free(zone, cache->uc_allocbucket, NULL);
830 		cache->uc_allocbucket = NULL;
831 		bucket_drain(zone, cache->uc_freebucket);
832 		if (cache->uc_freebucket != NULL)
833 			bucket_free(zone, cache->uc_freebucket, NULL);
834 		cache->uc_freebucket = NULL;
835 		bucket_drain(zone, cache->uc_crossbucket);
836 		if (cache->uc_crossbucket != NULL)
837 			bucket_free(zone, cache->uc_crossbucket, NULL);
838 		cache->uc_crossbucket = NULL;
839 	}
840 	ZONE_LOCK(zone);
841 	bucket_cache_reclaim(zone, true);
842 	ZONE_UNLOCK(zone);
843 }
844 
845 static void
846 cache_shrink(uma_zone_t zone)
847 {
848 
849 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
850 		return;
851 
852 	ZONE_LOCK(zone);
853 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
854 	ZONE_UNLOCK(zone);
855 }
856 
857 static void
858 cache_drain_safe_cpu(uma_zone_t zone)
859 {
860 	uma_cache_t cache;
861 	uma_bucket_t b1, b2, b3;
862 	int domain;
863 
864 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
865 		return;
866 
867 	b1 = b2 = b3 = NULL;
868 	ZONE_LOCK(zone);
869 	critical_enter();
870 	if (zone->uz_flags & UMA_ZONE_NUMA)
871 		domain = PCPU_GET(domain);
872 	else
873 		domain = 0;
874 	cache = &zone->uz_cpu[curcpu];
875 	if (cache->uc_allocbucket) {
876 		if (cache->uc_allocbucket->ub_cnt != 0)
877 			zone_put_bucket(zone, &zone->uz_domain[domain],
878 			    cache->uc_allocbucket, false);
879 		else
880 			b1 = cache->uc_allocbucket;
881 		cache->uc_allocbucket = NULL;
882 	}
883 	if (cache->uc_freebucket) {
884 		if (cache->uc_freebucket->ub_cnt != 0)
885 			zone_put_bucket(zone, &zone->uz_domain[domain],
886 			    cache->uc_freebucket, false);
887 		else
888 			b2 = cache->uc_freebucket;
889 		cache->uc_freebucket = NULL;
890 	}
891 	b3 = cache->uc_crossbucket;
892 	cache->uc_crossbucket = NULL;
893 	critical_exit();
894 	ZONE_UNLOCK(zone);
895 	if (b1)
896 		bucket_free(zone, b1, NULL);
897 	if (b2)
898 		bucket_free(zone, b2, NULL);
899 	if (b3) {
900 		bucket_drain(zone, b3);
901 		bucket_free(zone, b3, NULL);
902 	}
903 }
904 
905 /*
906  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
907  * This is an expensive call because it needs to bind to all CPUs
908  * one by one and enter a critical section on each of them in order
909  * to safely access their cache buckets.
910  * Zone lock must not be held on call this function.
911  */
912 static void
913 pcpu_cache_drain_safe(uma_zone_t zone)
914 {
915 	int cpu;
916 
917 	/*
918 	 * Polite bucket sizes shrinking was not enouth, shrink aggressively.
919 	 */
920 	if (zone)
921 		cache_shrink(zone);
922 	else
923 		zone_foreach(cache_shrink);
924 
925 	CPU_FOREACH(cpu) {
926 		thread_lock(curthread);
927 		sched_bind(curthread, cpu);
928 		thread_unlock(curthread);
929 
930 		if (zone)
931 			cache_drain_safe_cpu(zone);
932 		else
933 			zone_foreach(cache_drain_safe_cpu);
934 	}
935 	thread_lock(curthread);
936 	sched_unbind(curthread);
937 	thread_unlock(curthread);
938 }
939 
940 /*
941  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
942  * requested a drain, otherwise the per-domain caches are trimmed to either
943  * estimated working set size.
944  */
945 static void
946 bucket_cache_reclaim(uma_zone_t zone, bool drain)
947 {
948 	uma_zone_domain_t zdom;
949 	uma_bucket_t bucket;
950 	long target, tofree;
951 	int i;
952 
953 	for (i = 0; i < vm_ndomains; i++) {
954 		zdom = &zone->uz_domain[i];
955 
956 		/*
957 		 * If we were asked to drain the zone, we are done only once
958 		 * this bucket cache is empty.  Otherwise, we reclaim items in
959 		 * excess of the zone's estimated working set size.  If the
960 		 * difference nitems - imin is larger than the WSS estimate,
961 		 * then the estimate will grow at the end of this interval and
962 		 * we ignore the historical average.
963 		 */
964 		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
965 		    zdom->uzd_imin);
966 		while (zdom->uzd_nitems > target) {
967 			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
968 			if (bucket == NULL)
969 				break;
970 			tofree = bucket->ub_cnt;
971 			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
972 			zdom->uzd_nitems -= tofree;
973 
974 			/*
975 			 * Shift the bounds of the current WSS interval to avoid
976 			 * perturbing the estimate.
977 			 */
978 			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
979 			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
980 
981 			ZONE_UNLOCK(zone);
982 			bucket_drain(zone, bucket);
983 			bucket_free(zone, bucket, NULL);
984 			ZONE_LOCK(zone);
985 		}
986 	}
987 
988 	/*
989 	 * Shrink the zone bucket size to ensure that the per-CPU caches
990 	 * don't grow too large.
991 	 */
992 	if (zone->uz_count > zone->uz_count_min)
993 		zone->uz_count--;
994 }
995 
996 static void
997 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
998 {
999 	uint8_t *mem;
1000 	int i;
1001 	uint8_t flags;
1002 
1003 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
1004 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
1005 
1006 	mem = slab->us_data;
1007 	flags = slab->us_flags;
1008 	i = start;
1009 	if (keg->uk_fini != NULL) {
1010 		for (i--; i > -1; i--)
1011 #ifdef INVARIANTS
1012 		/*
1013 		 * trash_fini implies that dtor was trash_dtor. trash_fini
1014 		 * would check that memory hasn't been modified since free,
1015 		 * which executed trash_dtor.
1016 		 * That's why we need to run uma_dbg_kskip() check here,
1017 		 * albeit we don't make skip check for other init/fini
1018 		 * invocations.
1019 		 */
1020 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
1021 		    keg->uk_fini != trash_fini)
1022 #endif
1023 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
1024 			    keg->uk_size);
1025 	}
1026 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1027 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1028 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1029 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1030 }
1031 
1032 /*
1033  * Frees pages from a keg back to the system.  This is done on demand from
1034  * the pageout daemon.
1035  *
1036  * Returns nothing.
1037  */
1038 static void
1039 keg_drain(uma_keg_t keg)
1040 {
1041 	struct slabhead freeslabs = { 0 };
1042 	uma_domain_t dom;
1043 	uma_slab_t slab, tmp;
1044 	int i;
1045 
1046 	/*
1047 	 * We don't want to take pages from statically allocated kegs at this
1048 	 * time
1049 	 */
1050 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1051 		return;
1052 
1053 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
1054 	    keg->uk_name, keg, keg->uk_free);
1055 	KEG_LOCK(keg);
1056 	if (keg->uk_free == 0)
1057 		goto finished;
1058 
1059 	for (i = 0; i < vm_ndomains; i++) {
1060 		dom = &keg->uk_domain[i];
1061 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
1062 			/* We have nowhere to free these to. */
1063 			if (slab->us_flags & UMA_SLAB_BOOT)
1064 				continue;
1065 
1066 			LIST_REMOVE(slab, us_link);
1067 			keg->uk_pages -= keg->uk_ppera;
1068 			keg->uk_free -= keg->uk_ipers;
1069 
1070 			if (keg->uk_flags & UMA_ZONE_HASH)
1071 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
1072 				    slab->us_data);
1073 
1074 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1075 		}
1076 	}
1077 
1078 finished:
1079 	KEG_UNLOCK(keg);
1080 
1081 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1082 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1083 		keg_free_slab(keg, slab, keg->uk_ipers);
1084 	}
1085 }
1086 
1087 static void
1088 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1089 {
1090 
1091 	/*
1092 	 * Set draining to interlock with zone_dtor() so we can release our
1093 	 * locks as we go.  Only dtor() should do a WAITOK call since it
1094 	 * is the only call that knows the structure will still be available
1095 	 * when it wakes up.
1096 	 */
1097 	ZONE_LOCK(zone);
1098 	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1099 		if (waitok == M_NOWAIT)
1100 			goto out;
1101 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1102 	}
1103 	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1104 	bucket_cache_reclaim(zone, drain);
1105 	ZONE_UNLOCK(zone);
1106 
1107 	/*
1108 	 * The DRAINING flag protects us from being freed while
1109 	 * we're running.  Normally the uma_rwlock would protect us but we
1110 	 * must be able to release and acquire the right lock for each keg.
1111 	 */
1112 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
1113 		keg_drain(zone->uz_keg);
1114 	ZONE_LOCK(zone);
1115 	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1116 	wakeup(zone);
1117 out:
1118 	ZONE_UNLOCK(zone);
1119 }
1120 
1121 static void
1122 zone_drain(uma_zone_t zone)
1123 {
1124 
1125 	zone_reclaim(zone, M_NOWAIT, true);
1126 }
1127 
1128 static void
1129 zone_trim(uma_zone_t zone)
1130 {
1131 
1132 	zone_reclaim(zone, M_NOWAIT, false);
1133 }
1134 
1135 /*
1136  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1137  * If the allocation was successful, the keg lock will be held upon return,
1138  * otherwise the keg will be left unlocked.
1139  *
1140  * Arguments:
1141  *	flags   Wait flags for the item initialization routine
1142  *	aflags  Wait flags for the slab allocation
1143  *
1144  * Returns:
1145  *	The slab that was allocated or NULL if there is no memory and the
1146  *	caller specified M_NOWAIT.
1147  */
1148 static uma_slab_t
1149 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1150     int aflags)
1151 {
1152 	uma_alloc allocf;
1153 	uma_slab_t slab;
1154 	unsigned long size;
1155 	uint8_t *mem;
1156 	uint8_t sflags;
1157 	int i;
1158 
1159 	KASSERT(domain >= 0 && domain < vm_ndomains,
1160 	    ("keg_alloc_slab: domain %d out of range", domain));
1161 	KEG_LOCK_ASSERT(keg);
1162 	MPASS(zone->uz_lockptr == &keg->uk_lock);
1163 
1164 	allocf = keg->uk_allocf;
1165 	KEG_UNLOCK(keg);
1166 
1167 	slab = NULL;
1168 	mem = NULL;
1169 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1170 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1171 		if (slab == NULL)
1172 			goto out;
1173 	}
1174 
1175 	/*
1176 	 * This reproduces the old vm_zone behavior of zero filling pages the
1177 	 * first time they are added to a zone.
1178 	 *
1179 	 * Malloced items are zeroed in uma_zalloc.
1180 	 */
1181 
1182 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1183 		aflags |= M_ZERO;
1184 	else
1185 		aflags &= ~M_ZERO;
1186 
1187 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1188 		aflags |= M_NODUMP;
1189 
1190 	/* zone is passed for legacy reasons. */
1191 	size = keg->uk_ppera * PAGE_SIZE;
1192 	mem = allocf(zone, size, domain, &sflags, aflags);
1193 	if (mem == NULL) {
1194 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1195 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1196 		slab = NULL;
1197 		goto out;
1198 	}
1199 	uma_total_inc(size);
1200 
1201 	/* Point the slab into the allocated memory */
1202 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1203 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1204 
1205 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1206 		for (i = 0; i < keg->uk_ppera; i++)
1207 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1208 
1209 	slab->us_keg = keg;
1210 	slab->us_data = mem;
1211 	slab->us_freecount = keg->uk_ipers;
1212 	slab->us_flags = sflags;
1213 	slab->us_domain = domain;
1214 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1215 #ifdef INVARIANTS
1216 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1217 #endif
1218 
1219 	if (keg->uk_init != NULL) {
1220 		for (i = 0; i < keg->uk_ipers; i++)
1221 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1222 			    keg->uk_size, flags) != 0)
1223 				break;
1224 		if (i != keg->uk_ipers) {
1225 			keg_free_slab(keg, slab, i);
1226 			slab = NULL;
1227 			goto out;
1228 		}
1229 	}
1230 	KEG_LOCK(keg);
1231 
1232 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1233 	    slab, keg->uk_name, keg);
1234 
1235 	if (keg->uk_flags & UMA_ZONE_HASH)
1236 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1237 
1238 	keg->uk_pages += keg->uk_ppera;
1239 	keg->uk_free += keg->uk_ipers;
1240 
1241 out:
1242 	return (slab);
1243 }
1244 
1245 /*
1246  * This function is intended to be used early on in place of page_alloc() so
1247  * that we may use the boot time page cache to satisfy allocations before
1248  * the VM is ready.
1249  */
1250 static void *
1251 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1252     int wait)
1253 {
1254 	uma_keg_t keg;
1255 	void *mem;
1256 	int pages;
1257 
1258 	keg = zone->uz_keg;
1259 	/*
1260 	 * If we are in BOOT_BUCKETS or higher, than switch to real
1261 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1262 	 */
1263 	switch (booted) {
1264 		case BOOT_COLD:
1265 		case BOOT_STRAPPED:
1266 			break;
1267 		case BOOT_PAGEALLOC:
1268 			if (keg->uk_ppera > 1)
1269 				break;
1270 		case BOOT_BUCKETS:
1271 		case BOOT_RUNNING:
1272 #ifdef UMA_MD_SMALL_ALLOC
1273 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1274 			    page_alloc : uma_small_alloc;
1275 #else
1276 			keg->uk_allocf = page_alloc;
1277 #endif
1278 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1279 	}
1280 
1281 	/*
1282 	 * Check our small startup cache to see if it has pages remaining.
1283 	 */
1284 	pages = howmany(bytes, PAGE_SIZE);
1285 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1286 	if (pages > boot_pages)
1287 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1288 #ifdef DIAGNOSTIC
1289 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1290 	    boot_pages);
1291 #endif
1292 	mem = bootmem;
1293 	boot_pages -= pages;
1294 	bootmem += pages * PAGE_SIZE;
1295 	*pflag = UMA_SLAB_BOOT;
1296 
1297 	return (mem);
1298 }
1299 
1300 /*
1301  * Allocates a number of pages from the system
1302  *
1303  * Arguments:
1304  *	bytes  The number of bytes requested
1305  *	wait  Shall we wait?
1306  *
1307  * Returns:
1308  *	A pointer to the alloced memory or possibly
1309  *	NULL if M_NOWAIT is set.
1310  */
1311 static void *
1312 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1313     int wait)
1314 {
1315 	void *p;	/* Returned page */
1316 
1317 	*pflag = UMA_SLAB_KERNEL;
1318 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1319 
1320 	return (p);
1321 }
1322 
1323 static void *
1324 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1325     int wait)
1326 {
1327 	struct pglist alloctail;
1328 	vm_offset_t addr, zkva;
1329 	int cpu, flags;
1330 	vm_page_t p, p_next;
1331 #ifdef NUMA
1332 	struct pcpu *pc;
1333 #endif
1334 
1335 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1336 
1337 	TAILQ_INIT(&alloctail);
1338 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1339 	    malloc2vm_flags(wait);
1340 	*pflag = UMA_SLAB_KERNEL;
1341 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1342 		if (CPU_ABSENT(cpu)) {
1343 			p = vm_page_alloc(NULL, 0, flags);
1344 		} else {
1345 #ifndef NUMA
1346 			p = vm_page_alloc(NULL, 0, flags);
1347 #else
1348 			pc = pcpu_find(cpu);
1349 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1350 			if (__predict_false(p == NULL))
1351 				p = vm_page_alloc(NULL, 0, flags);
1352 #endif
1353 		}
1354 		if (__predict_false(p == NULL))
1355 			goto fail;
1356 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1357 	}
1358 	if ((addr = kva_alloc(bytes)) == 0)
1359 		goto fail;
1360 	zkva = addr;
1361 	TAILQ_FOREACH(p, &alloctail, listq) {
1362 		pmap_qenter(zkva, &p, 1);
1363 		zkva += PAGE_SIZE;
1364 	}
1365 	return ((void*)addr);
1366 fail:
1367 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1368 		vm_page_unwire_noq(p);
1369 		vm_page_free(p);
1370 	}
1371 	return (NULL);
1372 }
1373 
1374 /*
1375  * Allocates a number of pages from within an object
1376  *
1377  * Arguments:
1378  *	bytes  The number of bytes requested
1379  *	wait   Shall we wait?
1380  *
1381  * Returns:
1382  *	A pointer to the alloced memory or possibly
1383  *	NULL if M_NOWAIT is set.
1384  */
1385 static void *
1386 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1387     int wait)
1388 {
1389 	TAILQ_HEAD(, vm_page) alloctail;
1390 	u_long npages;
1391 	vm_offset_t retkva, zkva;
1392 	vm_page_t p, p_next;
1393 	uma_keg_t keg;
1394 
1395 	TAILQ_INIT(&alloctail);
1396 	keg = zone->uz_keg;
1397 
1398 	npages = howmany(bytes, PAGE_SIZE);
1399 	while (npages > 0) {
1400 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1401 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1402 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1403 		    VM_ALLOC_NOWAIT));
1404 		if (p != NULL) {
1405 			/*
1406 			 * Since the page does not belong to an object, its
1407 			 * listq is unused.
1408 			 */
1409 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1410 			npages--;
1411 			continue;
1412 		}
1413 		/*
1414 		 * Page allocation failed, free intermediate pages and
1415 		 * exit.
1416 		 */
1417 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1418 			vm_page_unwire_noq(p);
1419 			vm_page_free(p);
1420 		}
1421 		return (NULL);
1422 	}
1423 	*flags = UMA_SLAB_PRIV;
1424 	zkva = keg->uk_kva +
1425 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1426 	retkva = zkva;
1427 	TAILQ_FOREACH(p, &alloctail, listq) {
1428 		pmap_qenter(zkva, &p, 1);
1429 		zkva += PAGE_SIZE;
1430 	}
1431 
1432 	return ((void *)retkva);
1433 }
1434 
1435 /*
1436  * Frees a number of pages to the system
1437  *
1438  * Arguments:
1439  *	mem   A pointer to the memory to be freed
1440  *	size  The size of the memory being freed
1441  *	flags The original p->us_flags field
1442  *
1443  * Returns:
1444  *	Nothing
1445  */
1446 static void
1447 page_free(void *mem, vm_size_t size, uint8_t flags)
1448 {
1449 
1450 	if ((flags & UMA_SLAB_KERNEL) == 0)
1451 		panic("UMA: page_free used with invalid flags %x", flags);
1452 
1453 	kmem_free((vm_offset_t)mem, size);
1454 }
1455 
1456 /*
1457  * Frees pcpu zone allocations
1458  *
1459  * Arguments:
1460  *	mem   A pointer to the memory to be freed
1461  *	size  The size of the memory being freed
1462  *	flags The original p->us_flags field
1463  *
1464  * Returns:
1465  *	Nothing
1466  */
1467 static void
1468 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1469 {
1470 	vm_offset_t sva, curva;
1471 	vm_paddr_t paddr;
1472 	vm_page_t m;
1473 
1474 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1475 	sva = (vm_offset_t)mem;
1476 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1477 		paddr = pmap_kextract(curva);
1478 		m = PHYS_TO_VM_PAGE(paddr);
1479 		vm_page_unwire_noq(m);
1480 		vm_page_free(m);
1481 	}
1482 	pmap_qremove(sva, size >> PAGE_SHIFT);
1483 	kva_free(sva, size);
1484 }
1485 
1486 
1487 /*
1488  * Zero fill initializer
1489  *
1490  * Arguments/Returns follow uma_init specifications
1491  */
1492 static int
1493 zero_init(void *mem, int size, int flags)
1494 {
1495 	bzero(mem, size);
1496 	return (0);
1497 }
1498 
1499 /*
1500  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1501  *
1502  * Arguments
1503  *	keg  The zone we should initialize
1504  *
1505  * Returns
1506  *	Nothing
1507  */
1508 static void
1509 keg_small_init(uma_keg_t keg)
1510 {
1511 	u_int rsize;
1512 	u_int memused;
1513 	u_int wastedspace;
1514 	u_int shsize;
1515 	u_int slabsize;
1516 
1517 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1518 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1519 
1520 		slabsize = UMA_PCPU_ALLOC_SIZE;
1521 		keg->uk_ppera = ncpus;
1522 	} else {
1523 		slabsize = UMA_SLAB_SIZE;
1524 		keg->uk_ppera = 1;
1525 	}
1526 
1527 	/*
1528 	 * Calculate the size of each allocation (rsize) according to
1529 	 * alignment.  If the requested size is smaller than we have
1530 	 * allocation bits for we round it up.
1531 	 */
1532 	rsize = keg->uk_size;
1533 	if (rsize < slabsize / SLAB_SETSIZE)
1534 		rsize = slabsize / SLAB_SETSIZE;
1535 	if (rsize & keg->uk_align)
1536 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1537 	keg->uk_rsize = rsize;
1538 
1539 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1540 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1541 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1542 
1543 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1544 		shsize = 0;
1545 	else
1546 		shsize = SIZEOF_UMA_SLAB;
1547 
1548 	if (rsize <= slabsize - shsize)
1549 		keg->uk_ipers = (slabsize - shsize) / rsize;
1550 	else {
1551 		/* Handle special case when we have 1 item per slab, so
1552 		 * alignment requirement can be relaxed. */
1553 		KASSERT(keg->uk_size <= slabsize - shsize,
1554 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1555 		keg->uk_ipers = 1;
1556 	}
1557 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1558 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1559 
1560 	memused = keg->uk_ipers * rsize + shsize;
1561 	wastedspace = slabsize - memused;
1562 
1563 	/*
1564 	 * We can't do OFFPAGE if we're internal or if we've been
1565 	 * asked to not go to the VM for buckets.  If we do this we
1566 	 * may end up going to the VM  for slabs which we do not
1567 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1568 	 * of UMA_ZONE_VM, which clearly forbids it.
1569 	 */
1570 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1571 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1572 		return;
1573 
1574 	/*
1575 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1576 	 * this if it permits more items per-slab.
1577 	 *
1578 	 * XXX We could try growing slabsize to limit max waste as well.
1579 	 * Historically this was not done because the VM could not
1580 	 * efficiently handle contiguous allocations.
1581 	 */
1582 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1583 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1584 		keg->uk_ipers = slabsize / keg->uk_rsize;
1585 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1586 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1587 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1588 		    "keg: %s(%p), calculated wastedspace = %d, "
1589 		    "maximum wasted space allowed = %d, "
1590 		    "calculated ipers = %d, "
1591 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1592 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1593 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1594 		/*
1595 		 * If we had access to memory to embed a slab header we
1596 		 * also have a page structure to use vtoslab() instead of
1597 		 * hash to find slabs.  If the zone was explicitly created
1598 		 * OFFPAGE we can't necessarily touch the memory.
1599 		 */
1600 		if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0)
1601 			keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1602 	}
1603 
1604 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1605 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1606 		keg->uk_flags |= UMA_ZONE_HASH;
1607 }
1608 
1609 /*
1610  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1611  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1612  * more complicated.
1613  *
1614  * Arguments
1615  *	keg  The keg we should initialize
1616  *
1617  * Returns
1618  *	Nothing
1619  */
1620 static void
1621 keg_large_init(uma_keg_t keg)
1622 {
1623 
1624 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1625 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1626 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1627 
1628 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1629 	keg->uk_ipers = 1;
1630 	keg->uk_rsize = keg->uk_size;
1631 
1632 	/* Check whether we have enough space to not do OFFPAGE. */
1633 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1634 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
1635 		/*
1636 		 * We can't do OFFPAGE if we're internal, in which case
1637 		 * we need an extra page per allocation to contain the
1638 		 * slab header.
1639 		 */
1640 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1641 			keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1642 		else
1643 			keg->uk_ppera++;
1644 	}
1645 
1646 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1647 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1648 		keg->uk_flags |= UMA_ZONE_HASH;
1649 }
1650 
1651 static void
1652 keg_cachespread_init(uma_keg_t keg)
1653 {
1654 	int alignsize;
1655 	int trailer;
1656 	int pages;
1657 	int rsize;
1658 
1659 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1660 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1661 
1662 	alignsize = keg->uk_align + 1;
1663 	rsize = keg->uk_size;
1664 	/*
1665 	 * We want one item to start on every align boundary in a page.  To
1666 	 * do this we will span pages.  We will also extend the item by the
1667 	 * size of align if it is an even multiple of align.  Otherwise, it
1668 	 * would fall on the same boundary every time.
1669 	 */
1670 	if (rsize & keg->uk_align)
1671 		rsize = (rsize & ~keg->uk_align) + alignsize;
1672 	if ((rsize & alignsize) == 0)
1673 		rsize += alignsize;
1674 	trailer = rsize - keg->uk_size;
1675 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1676 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1677 	keg->uk_rsize = rsize;
1678 	keg->uk_ppera = pages;
1679 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1680 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1681 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1682 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1683 	    keg->uk_ipers));
1684 }
1685 
1686 /*
1687  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1688  * the keg onto the global keg list.
1689  *
1690  * Arguments/Returns follow uma_ctor specifications
1691  *	udata  Actually uma_kctor_args
1692  */
1693 static int
1694 keg_ctor(void *mem, int size, void *udata, int flags)
1695 {
1696 	struct uma_kctor_args *arg = udata;
1697 	uma_keg_t keg = mem;
1698 	uma_zone_t zone;
1699 
1700 	bzero(keg, size);
1701 	keg->uk_size = arg->size;
1702 	keg->uk_init = arg->uminit;
1703 	keg->uk_fini = arg->fini;
1704 	keg->uk_align = arg->align;
1705 	keg->uk_free = 0;
1706 	keg->uk_reserve = 0;
1707 	keg->uk_pages = 0;
1708 	keg->uk_flags = arg->flags;
1709 	keg->uk_slabzone = NULL;
1710 
1711 	/*
1712 	 * We use a global round-robin policy by default.  Zones with
1713 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1714 	 * iterator is never run.
1715 	 */
1716 	keg->uk_dr.dr_policy = DOMAINSET_RR();
1717 	keg->uk_dr.dr_iter = 0;
1718 
1719 	/*
1720 	 * The master zone is passed to us at keg-creation time.
1721 	 */
1722 	zone = arg->zone;
1723 	keg->uk_name = zone->uz_name;
1724 
1725 	if (arg->flags & UMA_ZONE_VM)
1726 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1727 
1728 	if (arg->flags & UMA_ZONE_ZINIT)
1729 		keg->uk_init = zero_init;
1730 
1731 	if (arg->flags & UMA_ZONE_MALLOC)
1732 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1733 
1734 	if (arg->flags & UMA_ZONE_PCPU)
1735 #ifdef SMP
1736 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1737 #else
1738 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1739 #endif
1740 
1741 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1742 		keg_cachespread_init(keg);
1743 	} else {
1744 		if (keg->uk_size > UMA_SLAB_SPACE)
1745 			keg_large_init(keg);
1746 		else
1747 			keg_small_init(keg);
1748 	}
1749 
1750 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1751 		keg->uk_slabzone = slabzone;
1752 
1753 	/*
1754 	 * If we haven't booted yet we need allocations to go through the
1755 	 * startup cache until the vm is ready.
1756 	 */
1757 	if (booted < BOOT_PAGEALLOC)
1758 		keg->uk_allocf = startup_alloc;
1759 #ifdef UMA_MD_SMALL_ALLOC
1760 	else if (keg->uk_ppera == 1)
1761 		keg->uk_allocf = uma_small_alloc;
1762 #endif
1763 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1764 		keg->uk_allocf = pcpu_page_alloc;
1765 	else
1766 		keg->uk_allocf = page_alloc;
1767 #ifdef UMA_MD_SMALL_ALLOC
1768 	if (keg->uk_ppera == 1)
1769 		keg->uk_freef = uma_small_free;
1770 	else
1771 #endif
1772 	if (keg->uk_flags & UMA_ZONE_PCPU)
1773 		keg->uk_freef = pcpu_page_free;
1774 	else
1775 		keg->uk_freef = page_free;
1776 
1777 	/*
1778 	 * Initialize keg's lock
1779 	 */
1780 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1781 
1782 	/*
1783 	 * If we're putting the slab header in the actual page we need to
1784 	 * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
1785 	 * macro definition.
1786 	 */
1787 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1788 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
1789 		/*
1790 		 * The only way the following is possible is if with our
1791 		 * UMA_ALIGN_PTR adjustments we are now bigger than
1792 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1793 		 * mathematically possible for all cases, so we make
1794 		 * sure here anyway.
1795 		 */
1796 		KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
1797 		    PAGE_SIZE * keg->uk_ppera,
1798 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
1799 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1800 	}
1801 
1802 	if (keg->uk_flags & UMA_ZONE_HASH)
1803 		hash_alloc(&keg->uk_hash, 0);
1804 
1805 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1806 	    keg, zone->uz_name, zone,
1807 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1808 	    keg->uk_free);
1809 
1810 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1811 
1812 	rw_wlock(&uma_rwlock);
1813 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1814 	rw_wunlock(&uma_rwlock);
1815 	return (0);
1816 }
1817 
1818 static void
1819 zone_alloc_counters(uma_zone_t zone)
1820 {
1821 
1822 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
1823 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
1824 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
1825 }
1826 
1827 /*
1828  * Zone header ctor.  This initializes all fields, locks, etc.
1829  *
1830  * Arguments/Returns follow uma_ctor specifications
1831  *	udata  Actually uma_zctor_args
1832  */
1833 static int
1834 zone_ctor(void *mem, int size, void *udata, int flags)
1835 {
1836 	struct uma_zctor_args *arg = udata;
1837 	uma_zone_t zone = mem;
1838 	uma_zone_t z;
1839 	uma_keg_t keg;
1840 	int i;
1841 
1842 	bzero(zone, size);
1843 	zone->uz_name = arg->name;
1844 	zone->uz_ctor = arg->ctor;
1845 	zone->uz_dtor = arg->dtor;
1846 	zone->uz_init = NULL;
1847 	zone->uz_fini = NULL;
1848 	zone->uz_sleeps = 0;
1849 	zone->uz_xdomain = 0;
1850 	zone->uz_count = 0;
1851 	zone->uz_count_min = 0;
1852 	zone->uz_count_max = BUCKET_MAX;
1853 	zone->uz_flags = 0;
1854 	zone->uz_warning = NULL;
1855 	/* The domain structures follow the cpu structures. */
1856 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1857 	zone->uz_bkt_max = ULONG_MAX;
1858 	timevalclear(&zone->uz_ratecheck);
1859 
1860 	if (__predict_true(booted == BOOT_RUNNING))
1861 		zone_alloc_counters(zone);
1862 	else {
1863 		zone->uz_allocs = EARLY_COUNTER;
1864 		zone->uz_frees = EARLY_COUNTER;
1865 		zone->uz_fails = EARLY_COUNTER;
1866 	}
1867 
1868 	for (i = 0; i < vm_ndomains; i++)
1869 		TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
1870 
1871 	/*
1872 	 * This is a pure cache zone, no kegs.
1873 	 */
1874 	if (arg->import) {
1875 		if (arg->flags & UMA_ZONE_VM)
1876 			arg->flags |= UMA_ZFLAG_CACHEONLY;
1877 		zone->uz_flags = arg->flags;
1878 		zone->uz_size = arg->size;
1879 		zone->uz_import = arg->import;
1880 		zone->uz_release = arg->release;
1881 		zone->uz_arg = arg->arg;
1882 		zone->uz_lockptr = &zone->uz_lock;
1883 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1884 		rw_wlock(&uma_rwlock);
1885 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1886 		rw_wunlock(&uma_rwlock);
1887 		goto out;
1888 	}
1889 
1890 	/*
1891 	 * Use the regular zone/keg/slab allocator.
1892 	 */
1893 	zone->uz_import = (uma_import)zone_import;
1894 	zone->uz_release = (uma_release)zone_release;
1895 	zone->uz_arg = zone;
1896 	keg = arg->keg;
1897 
1898 	if (arg->flags & UMA_ZONE_SECONDARY) {
1899 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1900 		zone->uz_init = arg->uminit;
1901 		zone->uz_fini = arg->fini;
1902 		zone->uz_lockptr = &keg->uk_lock;
1903 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1904 		rw_wlock(&uma_rwlock);
1905 		ZONE_LOCK(zone);
1906 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1907 			if (LIST_NEXT(z, uz_link) == NULL) {
1908 				LIST_INSERT_AFTER(z, zone, uz_link);
1909 				break;
1910 			}
1911 		}
1912 		ZONE_UNLOCK(zone);
1913 		rw_wunlock(&uma_rwlock);
1914 	} else if (keg == NULL) {
1915 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1916 		    arg->align, arg->flags)) == NULL)
1917 			return (ENOMEM);
1918 	} else {
1919 		struct uma_kctor_args karg;
1920 		int error;
1921 
1922 		/* We should only be here from uma_startup() */
1923 		karg.size = arg->size;
1924 		karg.uminit = arg->uminit;
1925 		karg.fini = arg->fini;
1926 		karg.align = arg->align;
1927 		karg.flags = arg->flags;
1928 		karg.zone = zone;
1929 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1930 		    flags);
1931 		if (error)
1932 			return (error);
1933 	}
1934 
1935 	zone->uz_keg = keg;
1936 	zone->uz_size = keg->uk_size;
1937 	zone->uz_flags |= (keg->uk_flags &
1938 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1939 
1940 	/*
1941 	 * Some internal zones don't have room allocated for the per cpu
1942 	 * caches.  If we're internal, bail out here.
1943 	 */
1944 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1945 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1946 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1947 		return (0);
1948 	}
1949 
1950 out:
1951 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1952 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1953 	    ("Invalid zone flag combination"));
1954 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) {
1955 		zone->uz_count = BUCKET_MAX;
1956 	} else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) {
1957 		zone->uz_count = BUCKET_MIN;
1958 		zone->uz_count_max = BUCKET_MIN;
1959 	} else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1960 		zone->uz_count = 0;
1961 	else
1962 		zone->uz_count = bucket_select(zone->uz_size);
1963 	zone->uz_count_min = zone->uz_count;
1964 
1965 	return (0);
1966 }
1967 
1968 /*
1969  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1970  * table and removes the keg from the global list.
1971  *
1972  * Arguments/Returns follow uma_dtor specifications
1973  *	udata  unused
1974  */
1975 static void
1976 keg_dtor(void *arg, int size, void *udata)
1977 {
1978 	uma_keg_t keg;
1979 
1980 	keg = (uma_keg_t)arg;
1981 	KEG_LOCK(keg);
1982 	if (keg->uk_free != 0) {
1983 		printf("Freed UMA keg (%s) was not empty (%d items). "
1984 		    " Lost %d pages of memory.\n",
1985 		    keg->uk_name ? keg->uk_name : "",
1986 		    keg->uk_free, keg->uk_pages);
1987 	}
1988 	KEG_UNLOCK(keg);
1989 
1990 	hash_free(&keg->uk_hash);
1991 
1992 	KEG_LOCK_FINI(keg);
1993 }
1994 
1995 /*
1996  * Zone header dtor.
1997  *
1998  * Arguments/Returns follow uma_dtor specifications
1999  *	udata  unused
2000  */
2001 static void
2002 zone_dtor(void *arg, int size, void *udata)
2003 {
2004 	uma_zone_t zone;
2005 	uma_keg_t keg;
2006 
2007 	zone = (uma_zone_t)arg;
2008 
2009 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
2010 		cache_drain(zone);
2011 
2012 	rw_wlock(&uma_rwlock);
2013 	LIST_REMOVE(zone, uz_link);
2014 	rw_wunlock(&uma_rwlock);
2015 	/*
2016 	 * XXX there are some races here where
2017 	 * the zone can be drained but zone lock
2018 	 * released and then refilled before we
2019 	 * remove it... we dont care for now
2020 	 */
2021 	zone_reclaim(zone, M_WAITOK, true);
2022 	/*
2023 	 * We only destroy kegs from non secondary/non cache zones.
2024 	 */
2025 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
2026 		keg = zone->uz_keg;
2027 		rw_wlock(&uma_rwlock);
2028 		LIST_REMOVE(keg, uk_link);
2029 		rw_wunlock(&uma_rwlock);
2030 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
2031 	}
2032 	counter_u64_free(zone->uz_allocs);
2033 	counter_u64_free(zone->uz_frees);
2034 	counter_u64_free(zone->uz_fails);
2035 	if (zone->uz_lockptr == &zone->uz_lock)
2036 		ZONE_LOCK_FINI(zone);
2037 }
2038 
2039 /*
2040  * Traverses every zone in the system and calls a callback
2041  *
2042  * Arguments:
2043  *	zfunc  A pointer to a function which accepts a zone
2044  *		as an argument.
2045  *
2046  * Returns:
2047  *	Nothing
2048  */
2049 static void
2050 zone_foreach(void (*zfunc)(uma_zone_t))
2051 {
2052 	uma_keg_t keg;
2053 	uma_zone_t zone;
2054 
2055 	/*
2056 	 * Before BOOT_RUNNING we are guaranteed to be single
2057 	 * threaded, so locking isn't needed. Startup functions
2058 	 * are allowed to use M_WAITOK.
2059 	 */
2060 	if (__predict_true(booted == BOOT_RUNNING))
2061 		rw_rlock(&uma_rwlock);
2062 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
2063 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2064 			zfunc(zone);
2065 	}
2066 	LIST_FOREACH(zone, &uma_cachezones, uz_link)
2067 		zfunc(zone);
2068 	if (__predict_true(booted == BOOT_RUNNING))
2069 		rw_runlock(&uma_rwlock);
2070 }
2071 
2072 /*
2073  * Count how many pages do we need to bootstrap.  VM supplies
2074  * its need in early zones in the argument, we add up our zones,
2075  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The
2076  * zone of zones and zone of kegs are accounted separately.
2077  */
2078 #define	UMA_BOOT_ZONES	11
2079 /* Zone of zones and zone of kegs have arbitrary alignment. */
2080 #define	UMA_BOOT_ALIGN	32
2081 static int zsize, ksize;
2082 int
2083 uma_startup_count(int vm_zones)
2084 {
2085 	int zones, pages;
2086 
2087 	ksize = sizeof(struct uma_keg) +
2088 	    (sizeof(struct uma_domain) * vm_ndomains);
2089 	zsize = sizeof(struct uma_zone) +
2090 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2091 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2092 
2093 	/*
2094 	 * Memory for the zone of kegs and its keg,
2095 	 * and for zone of zones.
2096 	 */
2097 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2098 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2099 
2100 #ifdef	UMA_MD_SMALL_ALLOC
2101 	zones = UMA_BOOT_ZONES;
2102 #else
2103 	zones = UMA_BOOT_ZONES + vm_zones;
2104 	vm_zones = 0;
2105 #endif
2106 
2107 	/* Memory for the rest of startup zones, UMA and VM, ... */
2108 	if (zsize > UMA_SLAB_SPACE) {
2109 		/* See keg_large_init(). */
2110 		u_int ppera;
2111 
2112 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2113 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
2114 		    SIZEOF_UMA_SLAB)
2115 			ppera++;
2116 		pages += (zones + vm_zones) * ppera;
2117 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2118 		/* See keg_small_init() special case for uk_ppera = 1. */
2119 		pages += zones;
2120 	else
2121 		pages += howmany(zones,
2122 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2123 
2124 	/* ... and their kegs. Note that zone of zones allocates a keg! */
2125 	pages += howmany(zones + 1,
2126 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2127 
2128 	/*
2129 	 * Most of startup zones are not going to be offpages, that's
2130 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
2131 	 * calculations.  Some large bucket zones will be offpage, and
2132 	 * thus will allocate hashes.  We take conservative approach
2133 	 * and assume that all zones may allocate hash.  This may give
2134 	 * us some positive inaccuracy, usually an extra single page.
2135 	 */
2136 	pages += howmany(zones, UMA_SLAB_SPACE /
2137 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2138 
2139 	return (pages);
2140 }
2141 
2142 void
2143 uma_startup(void *mem, int npages)
2144 {
2145 	struct uma_zctor_args args;
2146 	uma_keg_t masterkeg;
2147 	uintptr_t m;
2148 
2149 #ifdef DIAGNOSTIC
2150 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2151 #endif
2152 
2153 	rw_init(&uma_rwlock, "UMA lock");
2154 
2155 	/* Use bootpages memory for the zone of zones and zone of kegs. */
2156 	m = (uintptr_t)mem;
2157 	zones = (uma_zone_t)m;
2158 	m += roundup(zsize, CACHE_LINE_SIZE);
2159 	kegs = (uma_zone_t)m;
2160 	m += roundup(zsize, CACHE_LINE_SIZE);
2161 	masterkeg = (uma_keg_t)m;
2162 	m += roundup(ksize, CACHE_LINE_SIZE);
2163 	m = roundup(m, PAGE_SIZE);
2164 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2165 	mem = (void *)m;
2166 
2167 	/* "manually" create the initial zone */
2168 	memset(&args, 0, sizeof(args));
2169 	args.name = "UMA Kegs";
2170 	args.size = ksize;
2171 	args.ctor = keg_ctor;
2172 	args.dtor = keg_dtor;
2173 	args.uminit = zero_init;
2174 	args.fini = NULL;
2175 	args.keg = masterkeg;
2176 	args.align = UMA_BOOT_ALIGN - 1;
2177 	args.flags = UMA_ZFLAG_INTERNAL;
2178 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2179 
2180 	bootmem = mem;
2181 	boot_pages = npages;
2182 
2183 	args.name = "UMA Zones";
2184 	args.size = zsize;
2185 	args.ctor = zone_ctor;
2186 	args.dtor = zone_dtor;
2187 	args.uminit = zero_init;
2188 	args.fini = NULL;
2189 	args.keg = NULL;
2190 	args.align = UMA_BOOT_ALIGN - 1;
2191 	args.flags = UMA_ZFLAG_INTERNAL;
2192 	zone_ctor(zones, zsize, &args, M_WAITOK);
2193 
2194 	/* Now make a zone for slab headers */
2195 	slabzone = uma_zcreate("UMA Slabs",
2196 				sizeof(struct uma_slab),
2197 				NULL, NULL, NULL, NULL,
2198 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2199 
2200 	hashzone = uma_zcreate("UMA Hash",
2201 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2202 	    NULL, NULL, NULL, NULL,
2203 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2204 
2205 	bucket_init();
2206 
2207 	booted = BOOT_STRAPPED;
2208 }
2209 
2210 void
2211 uma_startup1(void)
2212 {
2213 
2214 #ifdef DIAGNOSTIC
2215 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2216 #endif
2217 	booted = BOOT_PAGEALLOC;
2218 }
2219 
2220 void
2221 uma_startup2(void)
2222 {
2223 
2224 #ifdef DIAGNOSTIC
2225 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2226 #endif
2227 	booted = BOOT_BUCKETS;
2228 	sx_init(&uma_reclaim_lock, "umareclaim");
2229 	bucket_enable();
2230 }
2231 
2232 /*
2233  * Initialize our callout handle
2234  *
2235  */
2236 static void
2237 uma_startup3(void)
2238 {
2239 
2240 #ifdef INVARIANTS
2241 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2242 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2243 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2244 #endif
2245 	zone_foreach(zone_alloc_counters);
2246 	callout_init(&uma_callout, 1);
2247 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2248 	booted = BOOT_RUNNING;
2249 }
2250 
2251 static uma_keg_t
2252 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2253 		int align, uint32_t flags)
2254 {
2255 	struct uma_kctor_args args;
2256 
2257 	args.size = size;
2258 	args.uminit = uminit;
2259 	args.fini = fini;
2260 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2261 	args.flags = flags;
2262 	args.zone = zone;
2263 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2264 }
2265 
2266 /* Public functions */
2267 /* See uma.h */
2268 void
2269 uma_set_align(int align)
2270 {
2271 
2272 	if (align != UMA_ALIGN_CACHE)
2273 		uma_align_cache = align;
2274 }
2275 
2276 /* See uma.h */
2277 uma_zone_t
2278 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2279 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2280 
2281 {
2282 	struct uma_zctor_args args;
2283 	uma_zone_t res;
2284 	bool locked;
2285 
2286 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2287 	    align, name));
2288 
2289 	/* Sets all zones to a first-touch domain policy. */
2290 #ifdef UMA_FIRSTTOUCH
2291 	flags |= UMA_ZONE_NUMA;
2292 #endif
2293 
2294 	/* This stuff is essential for the zone ctor */
2295 	memset(&args, 0, sizeof(args));
2296 	args.name = name;
2297 	args.size = size;
2298 	args.ctor = ctor;
2299 	args.dtor = dtor;
2300 	args.uminit = uminit;
2301 	args.fini = fini;
2302 #ifdef  INVARIANTS
2303 	/*
2304 	 * If a zone is being created with an empty constructor and
2305 	 * destructor, pass UMA constructor/destructor which checks for
2306 	 * memory use after free.
2307 	 */
2308 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2309 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2310 		args.ctor = trash_ctor;
2311 		args.dtor = trash_dtor;
2312 		args.uminit = trash_init;
2313 		args.fini = trash_fini;
2314 	}
2315 #endif
2316 	args.align = align;
2317 	args.flags = flags;
2318 	args.keg = NULL;
2319 
2320 	if (booted < BOOT_BUCKETS) {
2321 		locked = false;
2322 	} else {
2323 		sx_slock(&uma_reclaim_lock);
2324 		locked = true;
2325 	}
2326 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2327 	if (locked)
2328 		sx_sunlock(&uma_reclaim_lock);
2329 	return (res);
2330 }
2331 
2332 /* See uma.h */
2333 uma_zone_t
2334 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2335 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2336 {
2337 	struct uma_zctor_args args;
2338 	uma_keg_t keg;
2339 	uma_zone_t res;
2340 	bool locked;
2341 
2342 	keg = master->uz_keg;
2343 	memset(&args, 0, sizeof(args));
2344 	args.name = name;
2345 	args.size = keg->uk_size;
2346 	args.ctor = ctor;
2347 	args.dtor = dtor;
2348 	args.uminit = zinit;
2349 	args.fini = zfini;
2350 	args.align = keg->uk_align;
2351 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2352 	args.keg = keg;
2353 
2354 	if (booted < BOOT_BUCKETS) {
2355 		locked = false;
2356 	} else {
2357 		sx_slock(&uma_reclaim_lock);
2358 		locked = true;
2359 	}
2360 	/* XXX Attaches only one keg of potentially many. */
2361 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2362 	if (locked)
2363 		sx_sunlock(&uma_reclaim_lock);
2364 	return (res);
2365 }
2366 
2367 /* See uma.h */
2368 uma_zone_t
2369 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2370 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2371 		    uma_release zrelease, void *arg, int flags)
2372 {
2373 	struct uma_zctor_args args;
2374 
2375 	memset(&args, 0, sizeof(args));
2376 	args.name = name;
2377 	args.size = size;
2378 	args.ctor = ctor;
2379 	args.dtor = dtor;
2380 	args.uminit = zinit;
2381 	args.fini = zfini;
2382 	args.import = zimport;
2383 	args.release = zrelease;
2384 	args.arg = arg;
2385 	args.align = 0;
2386 	args.flags = flags | UMA_ZFLAG_CACHE;
2387 
2388 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2389 }
2390 
2391 /* See uma.h */
2392 void
2393 uma_zdestroy(uma_zone_t zone)
2394 {
2395 
2396 	sx_slock(&uma_reclaim_lock);
2397 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2398 	sx_sunlock(&uma_reclaim_lock);
2399 }
2400 
2401 void
2402 uma_zwait(uma_zone_t zone)
2403 {
2404 	void *item;
2405 
2406 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2407 	uma_zfree(zone, item);
2408 }
2409 
2410 void *
2411 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2412 {
2413 	void *item;
2414 #ifdef SMP
2415 	int i;
2416 
2417 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2418 #endif
2419 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2420 	if (item != NULL && (flags & M_ZERO)) {
2421 #ifdef SMP
2422 		for (i = 0; i <= mp_maxid; i++)
2423 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2424 #else
2425 		bzero(item, zone->uz_size);
2426 #endif
2427 	}
2428 	return (item);
2429 }
2430 
2431 /*
2432  * A stub while both regular and pcpu cases are identical.
2433  */
2434 void
2435 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2436 {
2437 
2438 #ifdef SMP
2439 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2440 #endif
2441 	uma_zfree_arg(zone, item, udata);
2442 }
2443 
2444 /* See uma.h */
2445 void *
2446 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2447 {
2448 	uma_zone_domain_t zdom;
2449 	uma_bucket_t bucket;
2450 	uma_cache_t cache;
2451 	void *item;
2452 	int cpu, domain, lockfail, maxbucket;
2453 #ifdef INVARIANTS
2454 	bool skipdbg;
2455 #endif
2456 
2457 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2458 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2459 
2460 	/* This is the fast path allocation */
2461 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2462 	    curthread, zone->uz_name, zone, flags);
2463 
2464 	if (flags & M_WAITOK) {
2465 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2466 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2467 	}
2468 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2469 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2470 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2471 	if (zone->uz_flags & UMA_ZONE_PCPU)
2472 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2473 		    "with M_ZERO passed"));
2474 
2475 #ifdef DEBUG_MEMGUARD
2476 	if (memguard_cmp_zone(zone)) {
2477 		item = memguard_alloc(zone->uz_size, flags);
2478 		if (item != NULL) {
2479 			if (zone->uz_init != NULL &&
2480 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2481 				return (NULL);
2482 			if (zone->uz_ctor != NULL &&
2483 			    zone->uz_ctor(item, zone->uz_size, udata,
2484 			    flags) != 0) {
2485 			    	zone->uz_fini(item, zone->uz_size);
2486 				return (NULL);
2487 			}
2488 			return (item);
2489 		}
2490 		/* This is unfortunate but should not be fatal. */
2491 	}
2492 #endif
2493 	/*
2494 	 * If possible, allocate from the per-CPU cache.  There are two
2495 	 * requirements for safe access to the per-CPU cache: (1) the thread
2496 	 * accessing the cache must not be preempted or yield during access,
2497 	 * and (2) the thread must not migrate CPUs without switching which
2498 	 * cache it accesses.  We rely on a critical section to prevent
2499 	 * preemption and migration.  We release the critical section in
2500 	 * order to acquire the zone mutex if we are unable to allocate from
2501 	 * the current cache; when we re-acquire the critical section, we
2502 	 * must detect and handle migration if it has occurred.
2503 	 */
2504 zalloc_restart:
2505 	critical_enter();
2506 	cpu = curcpu;
2507 	cache = &zone->uz_cpu[cpu];
2508 
2509 zalloc_start:
2510 	bucket = cache->uc_allocbucket;
2511 	if (bucket != NULL && bucket->ub_cnt > 0) {
2512 		bucket->ub_cnt--;
2513 		item = bucket->ub_bucket[bucket->ub_cnt];
2514 #ifdef INVARIANTS
2515 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2516 #endif
2517 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2518 		cache->uc_allocs++;
2519 		critical_exit();
2520 #ifdef INVARIANTS
2521 		skipdbg = uma_dbg_zskip(zone, item);
2522 #endif
2523 		if (zone->uz_ctor != NULL &&
2524 #ifdef INVARIANTS
2525 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2526 		    zone->uz_dtor != trash_dtor) &&
2527 #endif
2528 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2529 			counter_u64_add(zone->uz_fails, 1);
2530 			zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2531 			return (NULL);
2532 		}
2533 #ifdef INVARIANTS
2534 		if (!skipdbg)
2535 			uma_dbg_alloc(zone, NULL, item);
2536 #endif
2537 		if (flags & M_ZERO)
2538 			uma_zero_item(item, zone);
2539 		return (item);
2540 	}
2541 
2542 	/*
2543 	 * We have run out of items in our alloc bucket.
2544 	 * See if we can switch with our free bucket.
2545 	 */
2546 	bucket = cache->uc_freebucket;
2547 	if (bucket != NULL && bucket->ub_cnt > 0) {
2548 		CTR2(KTR_UMA,
2549 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2550 		    zone->uz_name, zone);
2551 		cache->uc_freebucket = cache->uc_allocbucket;
2552 		cache->uc_allocbucket = bucket;
2553 		goto zalloc_start;
2554 	}
2555 
2556 	/*
2557 	 * Discard any empty allocation bucket while we hold no locks.
2558 	 */
2559 	bucket = cache->uc_allocbucket;
2560 	cache->uc_allocbucket = NULL;
2561 	critical_exit();
2562 	if (bucket != NULL)
2563 		bucket_free(zone, bucket, udata);
2564 
2565 	/* Short-circuit for zones without buckets and low memory. */
2566 	if (zone->uz_count == 0 || bucketdisable) {
2567 		ZONE_LOCK(zone);
2568 		if (zone->uz_flags & UMA_ZONE_NUMA)
2569 			domain = PCPU_GET(domain);
2570 		else
2571 			domain = UMA_ANYDOMAIN;
2572 		goto zalloc_item;
2573 	}
2574 
2575 	/*
2576 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
2577 	 * we must go back to the zone.  This requires the zone lock, so we
2578 	 * must drop the critical section, then re-acquire it when we go back
2579 	 * to the cache.  Since the critical section is released, we may be
2580 	 * preempted or migrate.  As such, make sure not to maintain any
2581 	 * thread-local state specific to the cache from prior to releasing
2582 	 * the critical section.
2583 	 */
2584 	lockfail = 0;
2585 	if (ZONE_TRYLOCK(zone) == 0) {
2586 		/* Record contention to size the buckets. */
2587 		ZONE_LOCK(zone);
2588 		lockfail = 1;
2589 	}
2590 	critical_enter();
2591 	cpu = curcpu;
2592 	cache = &zone->uz_cpu[cpu];
2593 
2594 	/* See if we lost the race to fill the cache. */
2595 	if (cache->uc_allocbucket != NULL) {
2596 		ZONE_UNLOCK(zone);
2597 		goto zalloc_start;
2598 	}
2599 
2600 	/*
2601 	 * Check the zone's cache of buckets.
2602 	 */
2603 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2604 		domain = PCPU_GET(domain);
2605 		zdom = &zone->uz_domain[domain];
2606 	} else {
2607 		domain = UMA_ANYDOMAIN;
2608 		zdom = &zone->uz_domain[0];
2609 	}
2610 
2611 	if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
2612 		KASSERT(bucket->ub_cnt != 0,
2613 		    ("uma_zalloc_arg: Returning an empty bucket."));
2614 		cache->uc_allocbucket = bucket;
2615 		ZONE_UNLOCK(zone);
2616 		goto zalloc_start;
2617 	}
2618 	/* We are no longer associated with this CPU. */
2619 	critical_exit();
2620 
2621 	/*
2622 	 * We bump the uz count when the cache size is insufficient to
2623 	 * handle the working set.
2624 	 */
2625 	if (lockfail && zone->uz_count < zone->uz_count_max)
2626 		zone->uz_count++;
2627 
2628 	if (zone->uz_max_items > 0) {
2629 		if (zone->uz_items >= zone->uz_max_items)
2630 			goto zalloc_item;
2631 		maxbucket = MIN(zone->uz_count,
2632 		    zone->uz_max_items - zone->uz_items);
2633 		zone->uz_items += maxbucket;
2634 	} else
2635 		maxbucket = zone->uz_count;
2636 	ZONE_UNLOCK(zone);
2637 
2638 	/*
2639 	 * Now lets just fill a bucket and put it on the free list.  If that
2640 	 * works we'll restart the allocation from the beginning and it
2641 	 * will use the just filled bucket.
2642 	 */
2643 	bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket);
2644 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2645 	    zone->uz_name, zone, bucket);
2646 	ZONE_LOCK(zone);
2647 	if (bucket != NULL) {
2648 		if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) {
2649 			MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt);
2650 			zone->uz_items -= maxbucket - bucket->ub_cnt;
2651 			if (zone->uz_sleepers > 0 &&
2652 			    zone->uz_items < zone->uz_max_items)
2653 				wakeup_one(zone);
2654 		}
2655 		critical_enter();
2656 		cpu = curcpu;
2657 		cache = &zone->uz_cpu[cpu];
2658 
2659 		/*
2660 		 * See if we lost the race or were migrated.  Cache the
2661 		 * initialized bucket to make this less likely or claim
2662 		 * the memory directly.
2663 		 */
2664 		if (cache->uc_allocbucket == NULL &&
2665 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2666 		    domain == PCPU_GET(domain))) {
2667 			cache->uc_allocbucket = bucket;
2668 			zdom->uzd_imax += bucket->ub_cnt;
2669 		} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
2670 			critical_exit();
2671 			ZONE_UNLOCK(zone);
2672 			bucket_drain(zone, bucket);
2673 			bucket_free(zone, bucket, udata);
2674 			goto zalloc_restart;
2675 		} else
2676 			zone_put_bucket(zone, zdom, bucket, false);
2677 		ZONE_UNLOCK(zone);
2678 		goto zalloc_start;
2679 	} else if (zone->uz_max_items > 0) {
2680 		zone->uz_items -= maxbucket;
2681 		if (zone->uz_sleepers > 0 &&
2682 		    zone->uz_items + 1 < zone->uz_max_items)
2683 			wakeup_one(zone);
2684 	}
2685 
2686 	/*
2687 	 * We may not be able to get a bucket so return an actual item.
2688 	 */
2689 zalloc_item:
2690 	item = zone_alloc_item_locked(zone, udata, domain, flags);
2691 
2692 	return (item);
2693 }
2694 
2695 void *
2696 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2697 {
2698 
2699 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2700 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2701 
2702 	/* This is the fast path allocation */
2703 	CTR5(KTR_UMA,
2704 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2705 	    curthread, zone->uz_name, zone, domain, flags);
2706 
2707 	if (flags & M_WAITOK) {
2708 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2709 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2710 	}
2711 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2712 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2713 
2714 	return (zone_alloc_item(zone, udata, domain, flags));
2715 }
2716 
2717 /*
2718  * Find a slab with some space.  Prefer slabs that are partially used over those
2719  * that are totally full.  This helps to reduce fragmentation.
2720  *
2721  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2722  * only 'domain'.
2723  */
2724 static uma_slab_t
2725 keg_first_slab(uma_keg_t keg, int domain, bool rr)
2726 {
2727 	uma_domain_t dom;
2728 	uma_slab_t slab;
2729 	int start;
2730 
2731 	KASSERT(domain >= 0 && domain < vm_ndomains,
2732 	    ("keg_first_slab: domain %d out of range", domain));
2733 	KEG_LOCK_ASSERT(keg);
2734 
2735 	slab = NULL;
2736 	start = domain;
2737 	do {
2738 		dom = &keg->uk_domain[domain];
2739 		if (!LIST_EMPTY(&dom->ud_part_slab))
2740 			return (LIST_FIRST(&dom->ud_part_slab));
2741 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2742 			slab = LIST_FIRST(&dom->ud_free_slab);
2743 			LIST_REMOVE(slab, us_link);
2744 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2745 			return (slab);
2746 		}
2747 		if (rr)
2748 			domain = (domain + 1) % vm_ndomains;
2749 	} while (domain != start);
2750 
2751 	return (NULL);
2752 }
2753 
2754 static uma_slab_t
2755 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2756 {
2757 	uint32_t reserve;
2758 
2759 	KEG_LOCK_ASSERT(keg);
2760 
2761 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2762 	if (keg->uk_free <= reserve)
2763 		return (NULL);
2764 	return (keg_first_slab(keg, domain, rr));
2765 }
2766 
2767 static uma_slab_t
2768 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2769 {
2770 	struct vm_domainset_iter di;
2771 	uma_domain_t dom;
2772 	uma_slab_t slab;
2773 	int aflags, domain;
2774 	bool rr;
2775 
2776 restart:
2777 	KEG_LOCK_ASSERT(keg);
2778 
2779 	/*
2780 	 * Use the keg's policy if upper layers haven't already specified a
2781 	 * domain (as happens with first-touch zones).
2782 	 *
2783 	 * To avoid races we run the iterator with the keg lock held, but that
2784 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2785 	 * clear M_WAITOK and handle low memory conditions locally.
2786 	 */
2787 	rr = rdomain == UMA_ANYDOMAIN;
2788 	if (rr) {
2789 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2790 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2791 		    &aflags);
2792 	} else {
2793 		aflags = flags;
2794 		domain = rdomain;
2795 	}
2796 
2797 	for (;;) {
2798 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
2799 		if (slab != NULL) {
2800 			MPASS(slab->us_keg == keg);
2801 			return (slab);
2802 		}
2803 
2804 		/*
2805 		 * M_NOVM means don't ask at all!
2806 		 */
2807 		if (flags & M_NOVM)
2808 			break;
2809 
2810 		KASSERT(zone->uz_max_items == 0 ||
2811 		    zone->uz_items <= zone->uz_max_items,
2812 		    ("%s: zone %p overflow", __func__, zone));
2813 
2814 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
2815 		/*
2816 		 * If we got a slab here it's safe to mark it partially used
2817 		 * and return.  We assume that the caller is going to remove
2818 		 * at least one item.
2819 		 */
2820 		if (slab) {
2821 			MPASS(slab->us_keg == keg);
2822 			dom = &keg->uk_domain[slab->us_domain];
2823 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2824 			return (slab);
2825 		}
2826 		KEG_LOCK(keg);
2827 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2828 			if ((flags & M_WAITOK) != 0) {
2829 				KEG_UNLOCK(keg);
2830 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
2831 				KEG_LOCK(keg);
2832 				goto restart;
2833 			}
2834 			break;
2835 		}
2836 	}
2837 
2838 	/*
2839 	 * We might not have been able to get a slab but another cpu
2840 	 * could have while we were unlocked.  Check again before we
2841 	 * fail.
2842 	 */
2843 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2844 		MPASS(slab->us_keg == keg);
2845 		return (slab);
2846 	}
2847 	return (NULL);
2848 }
2849 
2850 static uma_slab_t
2851 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2852 {
2853 	uma_slab_t slab;
2854 
2855 	if (keg == NULL) {
2856 		keg = zone->uz_keg;
2857 		KEG_LOCK(keg);
2858 	}
2859 
2860 	for (;;) {
2861 		slab = keg_fetch_slab(keg, zone, domain, flags);
2862 		if (slab)
2863 			return (slab);
2864 		if (flags & (M_NOWAIT | M_NOVM))
2865 			break;
2866 	}
2867 	KEG_UNLOCK(keg);
2868 	return (NULL);
2869 }
2870 
2871 static void *
2872 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2873 {
2874 	uma_domain_t dom;
2875 	void *item;
2876 	uint8_t freei;
2877 
2878 	MPASS(keg == slab->us_keg);
2879 	KEG_LOCK_ASSERT(keg);
2880 
2881 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2882 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2883 	item = slab->us_data + (keg->uk_rsize * freei);
2884 	slab->us_freecount--;
2885 	keg->uk_free--;
2886 
2887 	/* Move this slab to the full list */
2888 	if (slab->us_freecount == 0) {
2889 		LIST_REMOVE(slab, us_link);
2890 		dom = &keg->uk_domain[slab->us_domain];
2891 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2892 	}
2893 
2894 	return (item);
2895 }
2896 
2897 static int
2898 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2899 {
2900 	uma_slab_t slab;
2901 	uma_keg_t keg;
2902 #ifdef NUMA
2903 	int stripe;
2904 #endif
2905 	int i;
2906 
2907 	slab = NULL;
2908 	keg = NULL;
2909 	/* Try to keep the buckets totally full */
2910 	for (i = 0; i < max; ) {
2911 		if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL)
2912 			break;
2913 		keg = slab->us_keg;
2914 #ifdef NUMA
2915 		stripe = howmany(max, vm_ndomains);
2916 #endif
2917 		while (slab->us_freecount && i < max) {
2918 			bucket[i++] = slab_alloc_item(keg, slab);
2919 			if (keg->uk_free <= keg->uk_reserve)
2920 				break;
2921 #ifdef NUMA
2922 			/*
2923 			 * If the zone is striped we pick a new slab for every
2924 			 * N allocations.  Eliminating this conditional will
2925 			 * instead pick a new domain for each bucket rather
2926 			 * than stripe within each bucket.  The current option
2927 			 * produces more fragmentation and requires more cpu
2928 			 * time but yields better distribution.
2929 			 */
2930 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2931 			    vm_ndomains > 1 && --stripe == 0)
2932 				break;
2933 #endif
2934 		}
2935 		/* Don't block if we allocated any successfully. */
2936 		flags &= ~M_WAITOK;
2937 		flags |= M_NOWAIT;
2938 	}
2939 	if (slab != NULL)
2940 		KEG_UNLOCK(keg);
2941 
2942 	return i;
2943 }
2944 
2945 static uma_bucket_t
2946 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
2947 {
2948 	uma_bucket_t bucket;
2949 
2950 	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
2951 
2952 	/* Avoid allocs targeting empty domains. */
2953 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
2954 		domain = UMA_ANYDOMAIN;
2955 
2956 	/* Don't wait for buckets, preserve caller's NOVM setting. */
2957 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2958 	if (bucket == NULL)
2959 		return (NULL);
2960 
2961 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2962 	    MIN(max, bucket->ub_entries), domain, flags);
2963 
2964 	/*
2965 	 * Initialize the memory if necessary.
2966 	 */
2967 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2968 		int i;
2969 
2970 		for (i = 0; i < bucket->ub_cnt; i++)
2971 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2972 			    flags) != 0)
2973 				break;
2974 		/*
2975 		 * If we couldn't initialize the whole bucket, put the
2976 		 * rest back onto the freelist.
2977 		 */
2978 		if (i != bucket->ub_cnt) {
2979 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2980 			    bucket->ub_cnt - i);
2981 #ifdef INVARIANTS
2982 			bzero(&bucket->ub_bucket[i],
2983 			    sizeof(void *) * (bucket->ub_cnt - i));
2984 #endif
2985 			bucket->ub_cnt = i;
2986 		}
2987 	}
2988 
2989 	if (bucket->ub_cnt == 0) {
2990 		bucket_free(zone, bucket, udata);
2991 		counter_u64_add(zone->uz_fails, 1);
2992 		return (NULL);
2993 	}
2994 
2995 	return (bucket);
2996 }
2997 
2998 /*
2999  * Allocates a single item from a zone.
3000  *
3001  * Arguments
3002  *	zone   The zone to alloc for.
3003  *	udata  The data to be passed to the constructor.
3004  *	domain The domain to allocate from or UMA_ANYDOMAIN.
3005  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
3006  *
3007  * Returns
3008  *	NULL if there is no memory and M_NOWAIT is set
3009  *	An item if successful
3010  */
3011 
3012 static void *
3013 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3014 {
3015 
3016 	ZONE_LOCK(zone);
3017 	return (zone_alloc_item_locked(zone, udata, domain, flags));
3018 }
3019 
3020 /*
3021  * Returns with zone unlocked.
3022  */
3023 static void *
3024 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
3025 {
3026 	void *item;
3027 #ifdef INVARIANTS
3028 	bool skipdbg;
3029 #endif
3030 
3031 	ZONE_LOCK_ASSERT(zone);
3032 
3033 	if (zone->uz_max_items > 0) {
3034 		if (zone->uz_items >= zone->uz_max_items) {
3035 			zone_log_warning(zone);
3036 			zone_maxaction(zone);
3037 			if (flags & M_NOWAIT) {
3038 				ZONE_UNLOCK(zone);
3039 				return (NULL);
3040 			}
3041 			zone->uz_sleeps++;
3042 			zone->uz_sleepers++;
3043 			while (zone->uz_items >= zone->uz_max_items)
3044 				mtx_sleep(zone, zone->uz_lockptr, PVM,
3045 				    "zonelimit", 0);
3046 			zone->uz_sleepers--;
3047 			if (zone->uz_sleepers > 0 &&
3048 			    zone->uz_items + 1 < zone->uz_max_items)
3049 				wakeup_one(zone);
3050 		}
3051 		zone->uz_items++;
3052 	}
3053 	ZONE_UNLOCK(zone);
3054 
3055 	/* Avoid allocs targeting empty domains. */
3056 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3057 		domain = UMA_ANYDOMAIN;
3058 
3059 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3060 		goto fail;
3061 
3062 #ifdef INVARIANTS
3063 	skipdbg = uma_dbg_zskip(zone, item);
3064 #endif
3065 	/*
3066 	 * We have to call both the zone's init (not the keg's init)
3067 	 * and the zone's ctor.  This is because the item is going from
3068 	 * a keg slab directly to the user, and the user is expecting it
3069 	 * to be both zone-init'd as well as zone-ctor'd.
3070 	 */
3071 	if (zone->uz_init != NULL) {
3072 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3073 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3074 			goto fail;
3075 		}
3076 	}
3077 	if (zone->uz_ctor != NULL &&
3078 #ifdef INVARIANTS
3079 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
3080 	    zone->uz_dtor != trash_dtor) &&
3081 #endif
3082 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3083 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
3084 		goto fail;
3085 	}
3086 #ifdef INVARIANTS
3087 	if (!skipdbg)
3088 		uma_dbg_alloc(zone, NULL, item);
3089 #endif
3090 	if (flags & M_ZERO)
3091 		uma_zero_item(item, zone);
3092 
3093 	counter_u64_add(zone->uz_allocs, 1);
3094 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3095 	    zone->uz_name, zone);
3096 
3097 	return (item);
3098 
3099 fail:
3100 	if (zone->uz_max_items > 0) {
3101 		ZONE_LOCK(zone);
3102 		zone->uz_items--;
3103 		ZONE_UNLOCK(zone);
3104 	}
3105 	counter_u64_add(zone->uz_fails, 1);
3106 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3107 	    zone->uz_name, zone);
3108 	return (NULL);
3109 }
3110 
3111 /* See uma.h */
3112 void
3113 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3114 {
3115 	uma_cache_t cache;
3116 	uma_bucket_t bucket;
3117 	uma_zone_domain_t zdom;
3118 	int cpu, domain;
3119 #ifdef UMA_XDOMAIN
3120 	int itemdomain;
3121 #endif
3122 	bool lockfail;
3123 #ifdef INVARIANTS
3124 	bool skipdbg;
3125 #endif
3126 
3127 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3128 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3129 
3130 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3131 	    zone->uz_name);
3132 
3133 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3134 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3135 
3136         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3137         if (item == NULL)
3138                 return;
3139 #ifdef DEBUG_MEMGUARD
3140 	if (is_memguard_addr(item)) {
3141 		if (zone->uz_dtor != NULL)
3142 			zone->uz_dtor(item, zone->uz_size, udata);
3143 		if (zone->uz_fini != NULL)
3144 			zone->uz_fini(item, zone->uz_size);
3145 		memguard_free(item);
3146 		return;
3147 	}
3148 #endif
3149 #ifdef INVARIANTS
3150 	skipdbg = uma_dbg_zskip(zone, item);
3151 	if (skipdbg == false) {
3152 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3153 			uma_dbg_free(zone, udata, item);
3154 		else
3155 			uma_dbg_free(zone, NULL, item);
3156 	}
3157 	if (zone->uz_dtor != NULL && (!skipdbg ||
3158 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3159 #else
3160 	if (zone->uz_dtor != NULL)
3161 #endif
3162 		zone->uz_dtor(item, zone->uz_size, udata);
3163 
3164 	/*
3165 	 * The race here is acceptable.  If we miss it we'll just have to wait
3166 	 * a little longer for the limits to be reset.
3167 	 */
3168 	if (zone->uz_sleepers > 0)
3169 		goto zfree_item;
3170 
3171 #ifdef UMA_XDOMAIN
3172 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3173 		itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
3174 #endif
3175 
3176 	/*
3177 	 * If possible, free to the per-CPU cache.  There are two
3178 	 * requirements for safe access to the per-CPU cache: (1) the thread
3179 	 * accessing the cache must not be preempted or yield during access,
3180 	 * and (2) the thread must not migrate CPUs without switching which
3181 	 * cache it accesses.  We rely on a critical section to prevent
3182 	 * preemption and migration.  We release the critical section in
3183 	 * order to acquire the zone mutex if we are unable to free to the
3184 	 * current cache; when we re-acquire the critical section, we must
3185 	 * detect and handle migration if it has occurred.
3186 	 */
3187 zfree_restart:
3188 	critical_enter();
3189 	cpu = curcpu;
3190 	cache = &zone->uz_cpu[cpu];
3191 
3192 zfree_start:
3193 	domain = PCPU_GET(domain);
3194 #ifdef UMA_XDOMAIN
3195 	if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
3196 		itemdomain = domain;
3197 #endif
3198 	/*
3199 	 * Try to free into the allocbucket first to give LIFO ordering
3200 	 * for cache-hot datastructures.  Spill over into the freebucket
3201 	 * if necessary.  Alloc will swap them if one runs dry.
3202 	 */
3203 #ifdef UMA_XDOMAIN
3204 	if (domain != itemdomain) {
3205 		bucket = cache->uc_crossbucket;
3206 	} else
3207 #endif
3208 	{
3209 		bucket = cache->uc_allocbucket;
3210 		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3211 			bucket = cache->uc_freebucket;
3212 	}
3213 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3214 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3215 		    ("uma_zfree: Freeing to non free bucket index."));
3216 		bucket->ub_bucket[bucket->ub_cnt] = item;
3217 		bucket->ub_cnt++;
3218 		cache->uc_frees++;
3219 		critical_exit();
3220 		return;
3221 	}
3222 
3223 	/*
3224 	 * We must go back the zone, which requires acquiring the zone lock,
3225 	 * which in turn means we must release and re-acquire the critical
3226 	 * section.  Since the critical section is released, we may be
3227 	 * preempted or migrate.  As such, make sure not to maintain any
3228 	 * thread-local state specific to the cache from prior to releasing
3229 	 * the critical section.
3230 	 */
3231 	critical_exit();
3232 	if (zone->uz_count == 0 || bucketdisable)
3233 		goto zfree_item;
3234 
3235 	lockfail = false;
3236 	if (ZONE_TRYLOCK(zone) == 0) {
3237 		/* Record contention to size the buckets. */
3238 		ZONE_LOCK(zone);
3239 		lockfail = true;
3240 	}
3241 	critical_enter();
3242 	cpu = curcpu;
3243 	domain = PCPU_GET(domain);
3244 	cache = &zone->uz_cpu[cpu];
3245 
3246 #ifdef UMA_XDOMAIN
3247 	if (domain != itemdomain)
3248 		bucket = cache->uc_crossbucket;
3249 	else
3250 #endif
3251 		bucket = cache->uc_freebucket;
3252 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3253 		ZONE_UNLOCK(zone);
3254 		goto zfree_start;
3255 	}
3256 #ifdef UMA_XDOMAIN
3257 	if (domain != itemdomain)
3258 		cache->uc_crossbucket = NULL;
3259 	else
3260 #endif
3261 		cache->uc_freebucket = NULL;
3262 	/* We are no longer associated with this CPU. */
3263 	critical_exit();
3264 
3265 #ifdef UMA_XDOMAIN
3266 	if (domain != itemdomain) {
3267 		if (bucket != NULL) {
3268 			zone->uz_xdomain += bucket->ub_cnt;
3269 			if (vm_ndomains > 2 ||
3270 			    zone->uz_bkt_count >= zone->uz_bkt_max) {
3271 				ZONE_UNLOCK(zone);
3272 				bucket_drain(zone, bucket);
3273 				bucket_free(zone, bucket, udata);
3274 			} else {
3275 				zdom = &zone->uz_domain[itemdomain];
3276 				zone_put_bucket(zone, zdom, bucket, true);
3277 				ZONE_UNLOCK(zone);
3278 			}
3279 		} else
3280 			ZONE_UNLOCK(zone);
3281 		bucket = bucket_alloc(zone, udata, M_NOWAIT);
3282 		if (bucket == NULL)
3283 			goto zfree_item;
3284 		critical_enter();
3285 		cpu = curcpu;
3286 		cache = &zone->uz_cpu[cpu];
3287 		if (cache->uc_crossbucket == NULL) {
3288 			cache->uc_crossbucket = bucket;
3289 			goto zfree_start;
3290 		}
3291 		critical_exit();
3292 		bucket_free(zone, bucket, udata);
3293 		goto zfree_restart;
3294 	}
3295 #endif
3296 
3297 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3298 		zdom = &zone->uz_domain[domain];
3299 	} else {
3300 		domain = 0;
3301 		zdom = &zone->uz_domain[0];
3302 	}
3303 
3304 	/* Can we throw this on the zone full list? */
3305 	if (bucket != NULL) {
3306 		CTR3(KTR_UMA,
3307 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3308 		    zone->uz_name, zone, bucket);
3309 		/* ub_cnt is pointing to the last free item */
3310 		KASSERT(bucket->ub_cnt == bucket->ub_entries,
3311 		    ("uma_zfree: Attempting to insert not full bucket onto the full list.\n"));
3312 		if (zone->uz_bkt_count >= zone->uz_bkt_max) {
3313 			ZONE_UNLOCK(zone);
3314 			bucket_drain(zone, bucket);
3315 			bucket_free(zone, bucket, udata);
3316 			goto zfree_restart;
3317 		} else
3318 			zone_put_bucket(zone, zdom, bucket, true);
3319 	}
3320 
3321 	/*
3322 	 * We bump the uz count when the cache size is insufficient to
3323 	 * handle the working set.
3324 	 */
3325 	if (lockfail && zone->uz_count < zone->uz_count_max)
3326 		zone->uz_count++;
3327 	ZONE_UNLOCK(zone);
3328 
3329 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3330 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3331 	    zone->uz_name, zone, bucket);
3332 	if (bucket) {
3333 		critical_enter();
3334 		cpu = curcpu;
3335 		cache = &zone->uz_cpu[cpu];
3336 		if (cache->uc_freebucket == NULL &&
3337 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3338 		    domain == PCPU_GET(domain))) {
3339 			cache->uc_freebucket = bucket;
3340 			goto zfree_start;
3341 		}
3342 		/*
3343 		 * We lost the race, start over.  We have to drop our
3344 		 * critical section to free the bucket.
3345 		 */
3346 		critical_exit();
3347 		bucket_free(zone, bucket, udata);
3348 		goto zfree_restart;
3349 	}
3350 
3351 	/*
3352 	 * If nothing else caught this, we'll just do an internal free.
3353 	 */
3354 zfree_item:
3355 	zone_free_item(zone, item, udata, SKIP_DTOR);
3356 }
3357 
3358 void
3359 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3360 {
3361 
3362 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3363 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3364 
3365 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3366 	    zone->uz_name);
3367 
3368 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3369 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3370 
3371         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3372         if (item == NULL)
3373                 return;
3374 	zone_free_item(zone, item, udata, SKIP_NONE);
3375 }
3376 
3377 static void
3378 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
3379 {
3380 	uma_keg_t keg;
3381 	uma_domain_t dom;
3382 	uint8_t freei;
3383 
3384 	keg = zone->uz_keg;
3385 	MPASS(zone->uz_lockptr == &keg->uk_lock);
3386 	KEG_LOCK_ASSERT(keg);
3387 	MPASS(keg == slab->us_keg);
3388 
3389 	dom = &keg->uk_domain[slab->us_domain];
3390 
3391 	/* Do we need to remove from any lists? */
3392 	if (slab->us_freecount+1 == keg->uk_ipers) {
3393 		LIST_REMOVE(slab, us_link);
3394 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3395 	} else if (slab->us_freecount == 0) {
3396 		LIST_REMOVE(slab, us_link);
3397 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3398 	}
3399 
3400 	/* Slab management. */
3401 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3402 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3403 	slab->us_freecount++;
3404 
3405 	/* Keg statistics. */
3406 	keg->uk_free++;
3407 }
3408 
3409 static void
3410 zone_release(uma_zone_t zone, void **bucket, int cnt)
3411 {
3412 	void *item;
3413 	uma_slab_t slab;
3414 	uma_keg_t keg;
3415 	uint8_t *mem;
3416 	int i;
3417 
3418 	keg = zone->uz_keg;
3419 	KEG_LOCK(keg);
3420 	for (i = 0; i < cnt; i++) {
3421 		item = bucket[i];
3422 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3423 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3424 			if (zone->uz_flags & UMA_ZONE_HASH) {
3425 				slab = hash_sfind(&keg->uk_hash, mem);
3426 			} else {
3427 				mem += keg->uk_pgoff;
3428 				slab = (uma_slab_t)mem;
3429 			}
3430 		} else {
3431 			slab = vtoslab((vm_offset_t)item);
3432 			MPASS(slab->us_keg == keg);
3433 		}
3434 		slab_free_item(zone, slab, item);
3435 	}
3436 	KEG_UNLOCK(keg);
3437 }
3438 
3439 /*
3440  * Frees a single item to any zone.
3441  *
3442  * Arguments:
3443  *	zone   The zone to free to
3444  *	item   The item we're freeing
3445  *	udata  User supplied data for the dtor
3446  *	skip   Skip dtors and finis
3447  */
3448 static void
3449 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3450 {
3451 #ifdef INVARIANTS
3452 	bool skipdbg;
3453 
3454 	skipdbg = uma_dbg_zskip(zone, item);
3455 	if (skip == SKIP_NONE && !skipdbg) {
3456 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3457 			uma_dbg_free(zone, udata, item);
3458 		else
3459 			uma_dbg_free(zone, NULL, item);
3460 	}
3461 
3462 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3463 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3464 	    zone->uz_ctor != trash_ctor))
3465 #else
3466 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3467 #endif
3468 		zone->uz_dtor(item, zone->uz_size, udata);
3469 
3470 	if (skip < SKIP_FINI && zone->uz_fini)
3471 		zone->uz_fini(item, zone->uz_size);
3472 
3473 	zone->uz_release(zone->uz_arg, &item, 1);
3474 
3475 	if (skip & SKIP_CNT)
3476 		return;
3477 
3478 	counter_u64_add(zone->uz_frees, 1);
3479 
3480 	if (zone->uz_max_items > 0) {
3481 		ZONE_LOCK(zone);
3482 		zone->uz_items--;
3483 		if (zone->uz_sleepers > 0 &&
3484 		    zone->uz_items < zone->uz_max_items)
3485 			wakeup_one(zone);
3486 		ZONE_UNLOCK(zone);
3487 	}
3488 }
3489 
3490 /* See uma.h */
3491 int
3492 uma_zone_set_max(uma_zone_t zone, int nitems)
3493 {
3494 	struct uma_bucket_zone *ubz;
3495 	int count;
3496 
3497 	ZONE_LOCK(zone);
3498 	ubz = bucket_zone_max(zone, nitems);
3499 	count = ubz != NULL ? ubz->ubz_entries : 0;
3500 	zone->uz_count_max = zone->uz_count = count;
3501 	if (zone->uz_count_min > zone->uz_count_max)
3502 		zone->uz_count_min = zone->uz_count_max;
3503 	zone->uz_max_items = nitems;
3504 	ZONE_UNLOCK(zone);
3505 
3506 	return (nitems);
3507 }
3508 
3509 /* See uma.h */
3510 void
3511 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
3512 {
3513 	struct uma_bucket_zone *ubz;
3514 	int bpcpu;
3515 
3516 	ZONE_LOCK(zone);
3517 	ubz = bucket_zone_max(zone, nitems);
3518 	if (ubz != NULL) {
3519 		bpcpu = 2;
3520 #ifdef UMA_XDOMAIN
3521 		if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3522 			/* Count the cross-domain bucket. */
3523 			bpcpu++;
3524 #endif
3525 		nitems -= ubz->ubz_entries * bpcpu * mp_ncpus;
3526 		zone->uz_count_max = ubz->ubz_entries;
3527 	} else {
3528 		zone->uz_count_max = zone->uz_count = 0;
3529 	}
3530 	if (zone->uz_count_min > zone->uz_count_max)
3531 		zone->uz_count_min = zone->uz_count_max;
3532 	zone->uz_bkt_max = nitems;
3533 	ZONE_UNLOCK(zone);
3534 }
3535 
3536 /* See uma.h */
3537 int
3538 uma_zone_get_max(uma_zone_t zone)
3539 {
3540 	int nitems;
3541 
3542 	ZONE_LOCK(zone);
3543 	nitems = zone->uz_max_items;
3544 	ZONE_UNLOCK(zone);
3545 
3546 	return (nitems);
3547 }
3548 
3549 /* See uma.h */
3550 void
3551 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3552 {
3553 
3554 	ZONE_LOCK(zone);
3555 	zone->uz_warning = warning;
3556 	ZONE_UNLOCK(zone);
3557 }
3558 
3559 /* See uma.h */
3560 void
3561 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3562 {
3563 
3564 	ZONE_LOCK(zone);
3565 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3566 	ZONE_UNLOCK(zone);
3567 }
3568 
3569 /* See uma.h */
3570 int
3571 uma_zone_get_cur(uma_zone_t zone)
3572 {
3573 	int64_t nitems;
3574 	u_int i;
3575 
3576 	ZONE_LOCK(zone);
3577 	nitems = counter_u64_fetch(zone->uz_allocs) -
3578 	    counter_u64_fetch(zone->uz_frees);
3579 	CPU_FOREACH(i) {
3580 		/*
3581 		 * See the comment in uma_vm_zone_stats() regarding the
3582 		 * safety of accessing the per-cpu caches. With the zone lock
3583 		 * held, it is safe, but can potentially result in stale data.
3584 		 */
3585 		nitems += zone->uz_cpu[i].uc_allocs -
3586 		    zone->uz_cpu[i].uc_frees;
3587 	}
3588 	ZONE_UNLOCK(zone);
3589 
3590 	return (nitems < 0 ? 0 : nitems);
3591 }
3592 
3593 /* See uma.h */
3594 void
3595 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3596 {
3597 	uma_keg_t keg;
3598 
3599 	KEG_GET(zone, keg);
3600 	KEG_LOCK(keg);
3601 	KASSERT(keg->uk_pages == 0,
3602 	    ("uma_zone_set_init on non-empty keg"));
3603 	keg->uk_init = uminit;
3604 	KEG_UNLOCK(keg);
3605 }
3606 
3607 /* See uma.h */
3608 void
3609 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3610 {
3611 	uma_keg_t keg;
3612 
3613 	KEG_GET(zone, keg);
3614 	KEG_LOCK(keg);
3615 	KASSERT(keg->uk_pages == 0,
3616 	    ("uma_zone_set_fini on non-empty keg"));
3617 	keg->uk_fini = fini;
3618 	KEG_UNLOCK(keg);
3619 }
3620 
3621 /* See uma.h */
3622 void
3623 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3624 {
3625 
3626 	ZONE_LOCK(zone);
3627 	KASSERT(zone->uz_keg->uk_pages == 0,
3628 	    ("uma_zone_set_zinit on non-empty keg"));
3629 	zone->uz_init = zinit;
3630 	ZONE_UNLOCK(zone);
3631 }
3632 
3633 /* See uma.h */
3634 void
3635 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3636 {
3637 
3638 	ZONE_LOCK(zone);
3639 	KASSERT(zone->uz_keg->uk_pages == 0,
3640 	    ("uma_zone_set_zfini on non-empty keg"));
3641 	zone->uz_fini = zfini;
3642 	ZONE_UNLOCK(zone);
3643 }
3644 
3645 /* See uma.h */
3646 /* XXX uk_freef is not actually used with the zone locked */
3647 void
3648 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3649 {
3650 	uma_keg_t keg;
3651 
3652 	KEG_GET(zone, keg);
3653 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3654 	KEG_LOCK(keg);
3655 	keg->uk_freef = freef;
3656 	KEG_UNLOCK(keg);
3657 }
3658 
3659 /* See uma.h */
3660 /* XXX uk_allocf is not actually used with the zone locked */
3661 void
3662 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3663 {
3664 	uma_keg_t keg;
3665 
3666 	KEG_GET(zone, keg);
3667 	KEG_LOCK(keg);
3668 	keg->uk_allocf = allocf;
3669 	KEG_UNLOCK(keg);
3670 }
3671 
3672 /* See uma.h */
3673 void
3674 uma_zone_reserve(uma_zone_t zone, int items)
3675 {
3676 	uma_keg_t keg;
3677 
3678 	KEG_GET(zone, keg);
3679 	KEG_LOCK(keg);
3680 	keg->uk_reserve = items;
3681 	KEG_UNLOCK(keg);
3682 }
3683 
3684 /* See uma.h */
3685 int
3686 uma_zone_reserve_kva(uma_zone_t zone, int count)
3687 {
3688 	uma_keg_t keg;
3689 	vm_offset_t kva;
3690 	u_int pages;
3691 
3692 	KEG_GET(zone, keg);
3693 
3694 	pages = count / keg->uk_ipers;
3695 	if (pages * keg->uk_ipers < count)
3696 		pages++;
3697 	pages *= keg->uk_ppera;
3698 
3699 #ifdef UMA_MD_SMALL_ALLOC
3700 	if (keg->uk_ppera > 1) {
3701 #else
3702 	if (1) {
3703 #endif
3704 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3705 		if (kva == 0)
3706 			return (0);
3707 	} else
3708 		kva = 0;
3709 
3710 	ZONE_LOCK(zone);
3711 	MPASS(keg->uk_kva == 0);
3712 	keg->uk_kva = kva;
3713 	keg->uk_offset = 0;
3714 	zone->uz_max_items = pages * keg->uk_ipers;
3715 #ifdef UMA_MD_SMALL_ALLOC
3716 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3717 #else
3718 	keg->uk_allocf = noobj_alloc;
3719 #endif
3720 	keg->uk_flags |= UMA_ZONE_NOFREE;
3721 	ZONE_UNLOCK(zone);
3722 
3723 	return (1);
3724 }
3725 
3726 /* See uma.h */
3727 void
3728 uma_prealloc(uma_zone_t zone, int items)
3729 {
3730 	struct vm_domainset_iter di;
3731 	uma_domain_t dom;
3732 	uma_slab_t slab;
3733 	uma_keg_t keg;
3734 	int aflags, domain, slabs;
3735 
3736 	KEG_GET(zone, keg);
3737 	KEG_LOCK(keg);
3738 	slabs = items / keg->uk_ipers;
3739 	if (slabs * keg->uk_ipers < items)
3740 		slabs++;
3741 	while (slabs-- > 0) {
3742 		aflags = M_NOWAIT;
3743 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3744 		    &aflags);
3745 		for (;;) {
3746 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
3747 			    aflags);
3748 			if (slab != NULL) {
3749 				MPASS(slab->us_keg == keg);
3750 				dom = &keg->uk_domain[slab->us_domain];
3751 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
3752 				    us_link);
3753 				break;
3754 			}
3755 			KEG_LOCK(keg);
3756 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
3757 				KEG_UNLOCK(keg);
3758 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3759 				KEG_LOCK(keg);
3760 			}
3761 		}
3762 	}
3763 	KEG_UNLOCK(keg);
3764 }
3765 
3766 /* See uma.h */
3767 void
3768 uma_reclaim(int req)
3769 {
3770 
3771 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3772 	sx_xlock(&uma_reclaim_lock);
3773 	bucket_enable();
3774 
3775 	switch (req) {
3776 	case UMA_RECLAIM_TRIM:
3777 		zone_foreach(zone_trim);
3778 		break;
3779 	case UMA_RECLAIM_DRAIN:
3780 	case UMA_RECLAIM_DRAIN_CPU:
3781 		zone_foreach(zone_drain);
3782 		if (req == UMA_RECLAIM_DRAIN_CPU) {
3783 			pcpu_cache_drain_safe(NULL);
3784 			zone_foreach(zone_drain);
3785 		}
3786 		break;
3787 	default:
3788 		panic("unhandled reclamation request %d", req);
3789 	}
3790 
3791 	/*
3792 	 * Some slabs may have been freed but this zone will be visited early
3793 	 * we visit again so that we can free pages that are empty once other
3794 	 * zones are drained.  We have to do the same for buckets.
3795 	 */
3796 	zone_drain(slabzone);
3797 	bucket_zone_drain();
3798 	sx_xunlock(&uma_reclaim_lock);
3799 }
3800 
3801 static volatile int uma_reclaim_needed;
3802 
3803 void
3804 uma_reclaim_wakeup(void)
3805 {
3806 
3807 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3808 		wakeup(uma_reclaim);
3809 }
3810 
3811 void
3812 uma_reclaim_worker(void *arg __unused)
3813 {
3814 
3815 	for (;;) {
3816 		sx_xlock(&uma_reclaim_lock);
3817 		while (atomic_load_int(&uma_reclaim_needed) == 0)
3818 			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
3819 			    hz);
3820 		sx_xunlock(&uma_reclaim_lock);
3821 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3822 		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
3823 		atomic_store_int(&uma_reclaim_needed, 0);
3824 		/* Don't fire more than once per-second. */
3825 		pause("umarclslp", hz);
3826 	}
3827 }
3828 
3829 /* See uma.h */
3830 void
3831 uma_zone_reclaim(uma_zone_t zone, int req)
3832 {
3833 
3834 	switch (req) {
3835 	case UMA_RECLAIM_TRIM:
3836 		zone_trim(zone);
3837 		break;
3838 	case UMA_RECLAIM_DRAIN:
3839 		zone_drain(zone);
3840 		break;
3841 	case UMA_RECLAIM_DRAIN_CPU:
3842 		pcpu_cache_drain_safe(zone);
3843 		zone_drain(zone);
3844 		break;
3845 	default:
3846 		panic("unhandled reclamation request %d", req);
3847 	}
3848 }
3849 
3850 /* See uma.h */
3851 int
3852 uma_zone_exhausted(uma_zone_t zone)
3853 {
3854 	int full;
3855 
3856 	ZONE_LOCK(zone);
3857 	full = zone->uz_sleepers > 0;
3858 	ZONE_UNLOCK(zone);
3859 	return (full);
3860 }
3861 
3862 int
3863 uma_zone_exhausted_nolock(uma_zone_t zone)
3864 {
3865 	return (zone->uz_sleepers > 0);
3866 }
3867 
3868 void *
3869 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3870 {
3871 	struct domainset *policy;
3872 	vm_offset_t addr;
3873 	uma_slab_t slab;
3874 
3875 	if (domain != UMA_ANYDOMAIN) {
3876 		/* avoid allocs targeting empty domains */
3877 		if (VM_DOMAIN_EMPTY(domain))
3878 			domain = UMA_ANYDOMAIN;
3879 	}
3880 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3881 	if (slab == NULL)
3882 		return (NULL);
3883 	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3884 	    DOMAINSET_FIXED(domain);
3885 	addr = kmem_malloc_domainset(policy, size, wait);
3886 	if (addr != 0) {
3887 		vsetslab(addr, slab);
3888 		slab->us_data = (void *)addr;
3889 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3890 		slab->us_size = size;
3891 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3892 		    pmap_kextract(addr)));
3893 		uma_total_inc(size);
3894 	} else {
3895 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3896 	}
3897 
3898 	return ((void *)addr);
3899 }
3900 
3901 void *
3902 uma_large_malloc(vm_size_t size, int wait)
3903 {
3904 
3905 	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3906 }
3907 
3908 void
3909 uma_large_free(uma_slab_t slab)
3910 {
3911 
3912 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3913 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3914 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3915 	uma_total_dec(slab->us_size);
3916 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3917 }
3918 
3919 static void
3920 uma_zero_item(void *item, uma_zone_t zone)
3921 {
3922 
3923 	bzero(item, zone->uz_size);
3924 }
3925 
3926 unsigned long
3927 uma_limit(void)
3928 {
3929 
3930 	return (uma_kmem_limit);
3931 }
3932 
3933 void
3934 uma_set_limit(unsigned long limit)
3935 {
3936 
3937 	uma_kmem_limit = limit;
3938 }
3939 
3940 unsigned long
3941 uma_size(void)
3942 {
3943 
3944 	return (atomic_load_long(&uma_kmem_total));
3945 }
3946 
3947 long
3948 uma_avail(void)
3949 {
3950 
3951 	return (uma_kmem_limit - uma_size());
3952 }
3953 
3954 void
3955 uma_print_stats(void)
3956 {
3957 	zone_foreach(uma_print_zone);
3958 }
3959 
3960 static void
3961 slab_print(uma_slab_t slab)
3962 {
3963 	printf("slab: keg %p, data %p, freecount %d\n",
3964 		slab->us_keg, slab->us_data, slab->us_freecount);
3965 }
3966 
3967 static void
3968 cache_print(uma_cache_t cache)
3969 {
3970 	printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n",
3971 		cache->uc_allocbucket,
3972 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3973 		cache->uc_freebucket,
3974 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0,
3975 		cache->uc_crossbucket,
3976 		cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0);
3977 }
3978 
3979 static void
3980 uma_print_keg(uma_keg_t keg)
3981 {
3982 	uma_domain_t dom;
3983 	uma_slab_t slab;
3984 	int i;
3985 
3986 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3987 	    "out %d free %d\n",
3988 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3989 	    keg->uk_ipers, keg->uk_ppera,
3990 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3991 	    keg->uk_free);
3992 	for (i = 0; i < vm_ndomains; i++) {
3993 		dom = &keg->uk_domain[i];
3994 		printf("Part slabs:\n");
3995 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3996 			slab_print(slab);
3997 		printf("Free slabs:\n");
3998 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3999 			slab_print(slab);
4000 		printf("Full slabs:\n");
4001 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
4002 			slab_print(slab);
4003 	}
4004 }
4005 
4006 void
4007 uma_print_zone(uma_zone_t zone)
4008 {
4009 	uma_cache_t cache;
4010 	int i;
4011 
4012 	printf("zone: %s(%p) size %d maxitems %ju flags %#x\n",
4013 	    zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items,
4014 	    zone->uz_flags);
4015 	if (zone->uz_lockptr != &zone->uz_lock)
4016 		uma_print_keg(zone->uz_keg);
4017 	CPU_FOREACH(i) {
4018 		cache = &zone->uz_cpu[i];
4019 		printf("CPU %d Cache:\n", i);
4020 		cache_print(cache);
4021 	}
4022 }
4023 
4024 #ifdef DDB
4025 /*
4026  * Generate statistics across both the zone and its per-cpu cache's.  Return
4027  * desired statistics if the pointer is non-NULL for that statistic.
4028  *
4029  * Note: does not update the zone statistics, as it can't safely clear the
4030  * per-CPU cache statistic.
4031  *
4032  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
4033  * safe from off-CPU; we should modify the caches to track this information
4034  * directly so that we don't have to.
4035  */
4036 static void
4037 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
4038     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
4039 {
4040 	uma_cache_t cache;
4041 	uint64_t allocs, frees, sleeps, xdomain;
4042 	int cachefree, cpu;
4043 
4044 	allocs = frees = sleeps = xdomain = 0;
4045 	cachefree = 0;
4046 	CPU_FOREACH(cpu) {
4047 		cache = &z->uz_cpu[cpu];
4048 		if (cache->uc_allocbucket != NULL)
4049 			cachefree += cache->uc_allocbucket->ub_cnt;
4050 		if (cache->uc_freebucket != NULL)
4051 			cachefree += cache->uc_freebucket->ub_cnt;
4052 		if (cache->uc_crossbucket != NULL) {
4053 			xdomain += cache->uc_crossbucket->ub_cnt;
4054 			cachefree += cache->uc_crossbucket->ub_cnt;
4055 		}
4056 		allocs += cache->uc_allocs;
4057 		frees += cache->uc_frees;
4058 	}
4059 	allocs += counter_u64_fetch(z->uz_allocs);
4060 	frees += counter_u64_fetch(z->uz_frees);
4061 	sleeps += z->uz_sleeps;
4062 	xdomain += z->uz_xdomain;
4063 	if (cachefreep != NULL)
4064 		*cachefreep = cachefree;
4065 	if (allocsp != NULL)
4066 		*allocsp = allocs;
4067 	if (freesp != NULL)
4068 		*freesp = frees;
4069 	if (sleepsp != NULL)
4070 		*sleepsp = sleeps;
4071 	if (xdomainp != NULL)
4072 		*xdomainp = xdomain;
4073 }
4074 #endif /* DDB */
4075 
4076 static int
4077 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4078 {
4079 	uma_keg_t kz;
4080 	uma_zone_t z;
4081 	int count;
4082 
4083 	count = 0;
4084 	rw_rlock(&uma_rwlock);
4085 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4086 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4087 			count++;
4088 	}
4089 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4090 		count++;
4091 
4092 	rw_runlock(&uma_rwlock);
4093 	return (sysctl_handle_int(oidp, &count, 0, req));
4094 }
4095 
4096 static void
4097 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4098     struct uma_percpu_stat *ups, bool internal)
4099 {
4100 	uma_zone_domain_t zdom;
4101 	uma_bucket_t bucket;
4102 	uma_cache_t cache;
4103 	int i;
4104 
4105 
4106 	for (i = 0; i < vm_ndomains; i++) {
4107 		zdom = &z->uz_domain[i];
4108 		uth->uth_zone_free += zdom->uzd_nitems;
4109 	}
4110 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4111 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
4112 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
4113 	uth->uth_sleeps = z->uz_sleeps;
4114 	uth->uth_xdomain = z->uz_xdomain;
4115 
4116 	/*
4117 	 * While it is not normally safe to access the cache bucket pointers
4118 	 * while not on the CPU that owns the cache, we only allow the pointers
4119 	 * to be exchanged without the zone lock held, not invalidated, so
4120 	 * accept the possible race associated with bucket exchange during
4121 	 * monitoring.  Use atomic_load_ptr() to ensure that the bucket pointers
4122 	 * are loaded only once.
4123 	 */
4124 	for (i = 0; i < mp_maxid + 1; i++) {
4125 		bzero(&ups[i], sizeof(*ups));
4126 		if (internal || CPU_ABSENT(i))
4127 			continue;
4128 		cache = &z->uz_cpu[i];
4129 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket);
4130 		if (bucket != NULL)
4131 			ups[i].ups_cache_free += bucket->ub_cnt;
4132 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket);
4133 		if (bucket != NULL)
4134 			ups[i].ups_cache_free += bucket->ub_cnt;
4135 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket);
4136 		if (bucket != NULL)
4137 			ups[i].ups_cache_free += bucket->ub_cnt;
4138 		ups[i].ups_allocs = cache->uc_allocs;
4139 		ups[i].ups_frees = cache->uc_frees;
4140 	}
4141 }
4142 
4143 static int
4144 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4145 {
4146 	struct uma_stream_header ush;
4147 	struct uma_type_header uth;
4148 	struct uma_percpu_stat *ups;
4149 	struct sbuf sbuf;
4150 	uma_keg_t kz;
4151 	uma_zone_t z;
4152 	int count, error, i;
4153 
4154 	error = sysctl_wire_old_buffer(req, 0);
4155 	if (error != 0)
4156 		return (error);
4157 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4158 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4159 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4160 
4161 	count = 0;
4162 	rw_rlock(&uma_rwlock);
4163 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4164 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4165 			count++;
4166 	}
4167 
4168 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4169 		count++;
4170 
4171 	/*
4172 	 * Insert stream header.
4173 	 */
4174 	bzero(&ush, sizeof(ush));
4175 	ush.ush_version = UMA_STREAM_VERSION;
4176 	ush.ush_maxcpus = (mp_maxid + 1);
4177 	ush.ush_count = count;
4178 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4179 
4180 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4181 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4182 			bzero(&uth, sizeof(uth));
4183 			ZONE_LOCK(z);
4184 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4185 			uth.uth_align = kz->uk_align;
4186 			uth.uth_size = kz->uk_size;
4187 			uth.uth_rsize = kz->uk_rsize;
4188 			if (z->uz_max_items > 0)
4189 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
4190 					kz->uk_ppera;
4191 			else
4192 				uth.uth_pages = kz->uk_pages;
4193 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
4194 			    kz->uk_ppera;
4195 			uth.uth_limit = z->uz_max_items;
4196 			uth.uth_keg_free = z->uz_keg->uk_free;
4197 
4198 			/*
4199 			 * A zone is secondary is it is not the first entry
4200 			 * on the keg's zone list.
4201 			 */
4202 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4203 			    (LIST_FIRST(&kz->uk_zones) != z))
4204 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4205 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
4206 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
4207 			ZONE_UNLOCK(z);
4208 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4209 			for (i = 0; i < mp_maxid + 1; i++)
4210 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4211 		}
4212 	}
4213 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4214 		bzero(&uth, sizeof(uth));
4215 		ZONE_LOCK(z);
4216 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4217 		uth.uth_size = z->uz_size;
4218 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
4219 		ZONE_UNLOCK(z);
4220 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4221 		for (i = 0; i < mp_maxid + 1; i++)
4222 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4223 	}
4224 
4225 	rw_runlock(&uma_rwlock);
4226 	error = sbuf_finish(&sbuf);
4227 	sbuf_delete(&sbuf);
4228 	free(ups, M_TEMP);
4229 	return (error);
4230 }
4231 
4232 int
4233 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4234 {
4235 	uma_zone_t zone = *(uma_zone_t *)arg1;
4236 	int error, max;
4237 
4238 	max = uma_zone_get_max(zone);
4239 	error = sysctl_handle_int(oidp, &max, 0, req);
4240 	if (error || !req->newptr)
4241 		return (error);
4242 
4243 	uma_zone_set_max(zone, max);
4244 
4245 	return (0);
4246 }
4247 
4248 int
4249 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4250 {
4251 	uma_zone_t zone = *(uma_zone_t *)arg1;
4252 	int cur;
4253 
4254 	cur = uma_zone_get_cur(zone);
4255 	return (sysctl_handle_int(oidp, &cur, 0, req));
4256 }
4257 
4258 #ifdef INVARIANTS
4259 static uma_slab_t
4260 uma_dbg_getslab(uma_zone_t zone, void *item)
4261 {
4262 	uma_slab_t slab;
4263 	uma_keg_t keg;
4264 	uint8_t *mem;
4265 
4266 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4267 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4268 		slab = vtoslab((vm_offset_t)mem);
4269 	} else {
4270 		/*
4271 		 * It is safe to return the slab here even though the
4272 		 * zone is unlocked because the item's allocation state
4273 		 * essentially holds a reference.
4274 		 */
4275 		if (zone->uz_lockptr == &zone->uz_lock)
4276 			return (NULL);
4277 		ZONE_LOCK(zone);
4278 		keg = zone->uz_keg;
4279 		if (keg->uk_flags & UMA_ZONE_HASH)
4280 			slab = hash_sfind(&keg->uk_hash, mem);
4281 		else
4282 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4283 		ZONE_UNLOCK(zone);
4284 	}
4285 
4286 	return (slab);
4287 }
4288 
4289 static bool
4290 uma_dbg_zskip(uma_zone_t zone, void *mem)
4291 {
4292 
4293 	if (zone->uz_lockptr == &zone->uz_lock)
4294 		return (true);
4295 
4296 	return (uma_dbg_kskip(zone->uz_keg, mem));
4297 }
4298 
4299 static bool
4300 uma_dbg_kskip(uma_keg_t keg, void *mem)
4301 {
4302 	uintptr_t idx;
4303 
4304 	if (dbg_divisor == 0)
4305 		return (true);
4306 
4307 	if (dbg_divisor == 1)
4308 		return (false);
4309 
4310 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4311 	if (keg->uk_ipers > 1) {
4312 		idx *= keg->uk_ipers;
4313 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4314 	}
4315 
4316 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4317 		counter_u64_add(uma_skip_cnt, 1);
4318 		return (true);
4319 	}
4320 	counter_u64_add(uma_dbg_cnt, 1);
4321 
4322 	return (false);
4323 }
4324 
4325 /*
4326  * Set up the slab's freei data such that uma_dbg_free can function.
4327  *
4328  */
4329 static void
4330 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4331 {
4332 	uma_keg_t keg;
4333 	int freei;
4334 
4335 	if (slab == NULL) {
4336 		slab = uma_dbg_getslab(zone, item);
4337 		if (slab == NULL)
4338 			panic("uma: item %p did not belong to zone %s\n",
4339 			    item, zone->uz_name);
4340 	}
4341 	keg = slab->us_keg;
4342 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4343 
4344 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4345 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4346 		    item, zone, zone->uz_name, slab, freei);
4347 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4348 
4349 	return;
4350 }
4351 
4352 /*
4353  * Verifies freed addresses.  Checks for alignment, valid slab membership
4354  * and duplicate frees.
4355  *
4356  */
4357 static void
4358 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4359 {
4360 	uma_keg_t keg;
4361 	int freei;
4362 
4363 	if (slab == NULL) {
4364 		slab = uma_dbg_getslab(zone, item);
4365 		if (slab == NULL)
4366 			panic("uma: Freed item %p did not belong to zone %s\n",
4367 			    item, zone->uz_name);
4368 	}
4369 	keg = slab->us_keg;
4370 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4371 
4372 	if (freei >= keg->uk_ipers)
4373 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4374 		    item, zone, zone->uz_name, slab, freei);
4375 
4376 	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4377 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4378 		    item, zone, zone->uz_name, slab, freei);
4379 
4380 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4381 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4382 		    item, zone, zone->uz_name, slab, freei);
4383 
4384 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4385 }
4386 #endif /* INVARIANTS */
4387 
4388 #ifdef DDB
4389 static int64_t
4390 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
4391     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
4392 {
4393 	uint64_t frees;
4394 	int i;
4395 
4396 	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4397 		*allocs = counter_u64_fetch(z->uz_allocs);
4398 		frees = counter_u64_fetch(z->uz_frees);
4399 		*sleeps = z->uz_sleeps;
4400 		*cachefree = 0;
4401 		*xdomain = 0;
4402 	} else
4403 		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
4404 		    xdomain);
4405 	if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4406 	    (LIST_FIRST(&kz->uk_zones) != z)))
4407 		*cachefree += kz->uk_free;
4408 	for (i = 0; i < vm_ndomains; i++)
4409 		*cachefree += z->uz_domain[i].uzd_nitems;
4410 	*used = *allocs - frees;
4411 	return (((int64_t)*used + *cachefree) * kz->uk_size);
4412 }
4413 
4414 DB_SHOW_COMMAND(uma, db_show_uma)
4415 {
4416 	const char *fmt_hdr, *fmt_entry;
4417 	uma_keg_t kz;
4418 	uma_zone_t z;
4419 	uint64_t allocs, used, sleeps, xdomain;
4420 	long cachefree;
4421 	/* variables for sorting */
4422 	uma_keg_t cur_keg;
4423 	uma_zone_t cur_zone, last_zone;
4424 	int64_t cur_size, last_size, size;
4425 	int ties;
4426 
4427 	/* /i option produces machine-parseable CSV output */
4428 	if (modif[0] == 'i') {
4429 		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
4430 		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
4431 	} else {
4432 		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
4433 		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
4434 	}
4435 
4436 	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
4437 	    "Sleeps", "Bucket", "Total Mem", "XFree");
4438 
4439 	/* Sort the zones with largest size first. */
4440 	last_zone = NULL;
4441 	last_size = INT64_MAX;
4442 	for (;;) {
4443 		cur_zone = NULL;
4444 		cur_size = -1;
4445 		ties = 0;
4446 		LIST_FOREACH(kz, &uma_kegs, uk_link) {
4447 			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4448 				/*
4449 				 * In the case of size ties, print out zones
4450 				 * in the order they are encountered.  That is,
4451 				 * when we encounter the most recently output
4452 				 * zone, we have already printed all preceding
4453 				 * ties, and we must print all following ties.
4454 				 */
4455 				if (z == last_zone) {
4456 					ties = 1;
4457 					continue;
4458 				}
4459 				size = get_uma_stats(kz, z, &allocs, &used,
4460 				    &sleeps, &cachefree, &xdomain);
4461 				if (size > cur_size && size < last_size + ties)
4462 				{
4463 					cur_size = size;
4464 					cur_zone = z;
4465 					cur_keg = kz;
4466 				}
4467 			}
4468 		}
4469 		if (cur_zone == NULL)
4470 			break;
4471 
4472 		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
4473 		    &sleeps, &cachefree, &xdomain);
4474 		db_printf(fmt_entry, cur_zone->uz_name,
4475 		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
4476 		    (uintmax_t)allocs, (uintmax_t)sleeps,
4477 		    (unsigned)cur_zone->uz_count, (intmax_t)size, xdomain);
4478 
4479 		if (db_pager_quit)
4480 			return;
4481 		last_zone = cur_zone;
4482 		last_size = cur_size;
4483 	}
4484 }
4485 
4486 DB_SHOW_COMMAND(umacache, db_show_umacache)
4487 {
4488 	uma_zone_t z;
4489 	uint64_t allocs, frees;
4490 	long cachefree;
4491 	int i;
4492 
4493 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4494 	    "Requests", "Bucket");
4495 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4496 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
4497 		for (i = 0; i < vm_ndomains; i++)
4498 			cachefree += z->uz_domain[i].uzd_nitems;
4499 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4500 		    z->uz_name, (uintmax_t)z->uz_size,
4501 		    (intmax_t)(allocs - frees), cachefree,
4502 		    (uintmax_t)allocs, z->uz_count);
4503 		if (db_pager_quit)
4504 			return;
4505 	}
4506 }
4507 #endif	/* DDB */
4508