xref: /freebsd/sys/vm/uma_core.c (revision 4133f23624058951a3b66e3ad735de980a485f36)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory allocator
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/domainset.h>
63 #include <sys/eventhandler.h>
64 #include <sys/kernel.h>
65 #include <sys/types.h>
66 #include <sys/limits.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/random.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vmmeter.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_domainset.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_phys.h>
89 #include <vm/vm_pagequeue.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94 #include <vm/uma_int.h>
95 #include <vm/uma_dbg.h>
96 
97 #include <ddb/ddb.h>
98 
99 #ifdef DEBUG_MEMGUARD
100 #include <vm/memguard.h>
101 #endif
102 
103 /*
104  * This is the zone and keg from which all zones are spawned.
105  */
106 static uma_zone_t kegs;
107 static uma_zone_t zones;
108 
109 /* This is the zone from which all offpage uma_slab_ts are allocated. */
110 static uma_zone_t slabzone;
111 
112 /*
113  * The initial hash tables come out of this zone so they can be allocated
114  * prior to malloc coming up.
115  */
116 static uma_zone_t hashzone;
117 
118 /* The boot-time adjusted value for cache line alignment. */
119 int uma_align_cache = 64 - 1;
120 
121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122 
123 /*
124  * Are we allowed to allocate buckets?
125  */
126 static int bucketdisable = 1;
127 
128 /* Linked list of all kegs in the system */
129 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
130 
131 /* Linked list of all cache-only zones in the system */
132 static LIST_HEAD(,uma_zone) uma_cachezones =
133     LIST_HEAD_INITIALIZER(uma_cachezones);
134 
135 /* This RW lock protects the keg list */
136 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
137 
138 /*
139  * Pointer to, and count of, the pool of pages that is preallocated at
140  * startup to bootstrap UMA.
141  */
142 static char *bootmem;
143 static int boot_pages;
144 
145 static struct sx uma_reclaim_lock;
146 
147 /*
148  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
149  * allocations don't trigger a wakeup of the reclaim thread.
150  */
151 static unsigned long uma_kmem_limit = LONG_MAX;
152 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
153     "UMA kernel memory soft limit");
154 static unsigned long uma_kmem_total;
155 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
156     "UMA kernel memory usage");
157 
158 /* Is the VM done starting up? */
159 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
160     BOOT_RUNNING } booted = BOOT_COLD;
161 
162 /*
163  * This is the handle used to schedule events that need to happen
164  * outside of the allocation fast path.
165  */
166 static struct callout uma_callout;
167 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
168 
169 /*
170  * This structure is passed as the zone ctor arg so that I don't have to create
171  * a special allocation function just for zones.
172  */
173 struct uma_zctor_args {
174 	const char *name;
175 	size_t size;
176 	uma_ctor ctor;
177 	uma_dtor dtor;
178 	uma_init uminit;
179 	uma_fini fini;
180 	uma_import import;
181 	uma_release release;
182 	void *arg;
183 	uma_keg_t keg;
184 	int align;
185 	uint32_t flags;
186 };
187 
188 struct uma_kctor_args {
189 	uma_zone_t zone;
190 	size_t size;
191 	uma_init uminit;
192 	uma_fini fini;
193 	int align;
194 	uint32_t flags;
195 };
196 
197 struct uma_bucket_zone {
198 	uma_zone_t	ubz_zone;
199 	char		*ubz_name;
200 	int		ubz_entries;	/* Number of items it can hold. */
201 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
202 };
203 
204 /*
205  * Compute the actual number of bucket entries to pack them in power
206  * of two sizes for more efficient space utilization.
207  */
208 #define	BUCKET_SIZE(n)						\
209     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
210 
211 #define	BUCKET_MAX	BUCKET_SIZE(256)
212 #define	BUCKET_MIN	BUCKET_SIZE(4)
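
/*
 * Illustrative sizing example (the 24-byte header is an assumption, not
 * taken from this file): on LP64 with 8-byte pointers and a 24-byte
 * struct uma_bucket header, BUCKET_SIZE(16) = (16 * 8 - 24) / 8 = 13, so
 * the "16 Bucket" zone below holds 13 item pointers and the whole bucket,
 * header included, occupies exactly 16 pointer-sized words (128 bytes).
 */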
213 
214 struct uma_bucket_zone bucket_zones[] = {
215 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
216 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
217 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
218 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
219 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
220 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
221 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
222 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
223 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
224 	{ NULL, NULL, 0}
225 };
226 
227 /*
228  * Flags and enumerations to be passed to internal functions.
229  */
230 enum zfreeskip {
231 	SKIP_NONE =	0,
232 	SKIP_CNT =	0x00000001,
233 	SKIP_DTOR =	0x00010000,
234 	SKIP_FINI =	0x00020000,
235 };
236 
237 /* Prototypes. */
238 
239 int	uma_startup_count(int);
240 void	uma_startup(void *, int);
241 void	uma_startup1(void);
242 void	uma_startup2(void);
243 
244 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
245 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
246 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248 static void page_free(void *, vm_size_t, uint8_t);
249 static void pcpu_page_free(void *, vm_size_t, uint8_t);
250 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
251 static void cache_drain(uma_zone_t);
252 static void bucket_drain(uma_zone_t, uma_bucket_t);
253 static void bucket_cache_reclaim(uma_zone_t zone, bool);
254 static int keg_ctor(void *, int, void *, int);
255 static void keg_dtor(void *, int, void *);
256 static int zone_ctor(void *, int, void *, int);
257 static void zone_dtor(void *, int, void *);
258 static int zero_init(void *, int, int);
259 static void keg_small_init(uma_keg_t keg);
260 static void keg_large_init(uma_keg_t keg);
261 static void zone_foreach(void (*zfunc)(uma_zone_t));
262 static void zone_timeout(uma_zone_t zone);
263 static int hash_alloc(struct uma_hash *, u_int);
264 static int hash_expand(struct uma_hash *, struct uma_hash *);
265 static void hash_free(struct uma_hash *hash);
266 static void uma_timeout(void *);
267 static void uma_startup3(void);
268 static void *zone_alloc_item(uma_zone_t, void *, int, int);
269 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
270 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
271 static void bucket_enable(void);
272 static void bucket_init(void);
273 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
274 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
275 static void bucket_zone_drain(void);
276 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
277 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
278 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
279 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
280 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
281     uma_fini fini, int align, uint32_t flags);
282 static int zone_import(uma_zone_t, void **, int, int, int);
283 static void zone_release(uma_zone_t, void **, int);
284 static void uma_zero_item(void *, uma_zone_t);
285 
286 void uma_print_zone(uma_zone_t);
287 void uma_print_stats(void);
288 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
289 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
290 
291 #ifdef INVARIANTS
292 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
293 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
294 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
295 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
296 
297 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
298     "Memory allocation debugging");
299 
300 static u_int dbg_divisor = 1;
301 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
302     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
303     "Debug & thrash every Nth allocated item, where N is this divisor");
304 
305 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
306 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
307 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
308     &uma_dbg_cnt, "memory items debugged");
309 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
310     &uma_skip_cnt, "memory items skipped, not debugged");
311 #endif
312 
313 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
314 
315 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
316     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
317 
318 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
319     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
320 
321 static int zone_warnings = 1;
322 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
323     "Warn when a UMA zone becomes full");
324 
325 /* Adjust bytes under management by UMA. */
326 static inline void
327 uma_total_dec(unsigned long size)
328 {
329 
330 	atomic_subtract_long(&uma_kmem_total, size);
331 }
332 
333 static inline void
334 uma_total_inc(unsigned long size)
335 {
336 
337 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
338 		uma_reclaim_wakeup();
339 }
340 
341 /*
342  * This routine checks to see whether or not it's safe to enable buckets.
343  */
344 static void
345 bucket_enable(void)
346 {
347 	bucketdisable = vm_page_count_min();
348 }
349 
350 /*
351  * Initialize bucket_zones, the array of zones of buckets of various sizes.
352  *
353  * For each zone, calculate the memory required for each bucket, consisting
354  * of the header and an array of pointers.
355  */
356 static void
357 bucket_init(void)
358 {
359 	struct uma_bucket_zone *ubz;
360 	int size;
361 
362 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
363 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
364 		size += sizeof(void *) * ubz->ubz_entries;
365 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
366 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
367 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
368 	}
369 }
370 
371 /*
372  * Given a desired number of entries for a bucket, return the zone from which
373  * to allocate the bucket.
374  */
375 static struct uma_bucket_zone *
376 bucket_zone_lookup(int entries)
377 {
378 	struct uma_bucket_zone *ubz;
379 
380 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
381 		if (ubz->ubz_entries >= entries)
382 			return (ubz);
383 	ubz--;
384 	return (ubz);
385 }
386 
387 static int
388 bucket_select(int size)
389 {
390 	struct uma_bucket_zone *ubz;
391 
392 	ubz = &bucket_zones[0];
393 	if (size > ubz->ubz_maxsize)
394 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
395 
396 	for (; ubz->ubz_entries != 0; ubz++)
397 		if (ubz->ubz_maxsize < size)
398 			break;
399 	ubz--;
400 	return (ubz->ubz_entries);
401 }
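
/*
 * Illustrative walk-through of bucket_select(): for a zone with 512-byte
 * items, the loop stops at the "64 Bucket" entry (ubz_maxsize 256 < 512),
 * steps back one entry, and returns the entry count of the "32 Bucket"
 * zone, whose ubz_maxsize is exactly 512.  Sizes above 4096 instead get a
 * small count derived from the first entry, never less than 1.
 */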
402 
403 static uma_bucket_t
404 bucket_alloc(uma_zone_t zone, void *udata, int flags)
405 {
406 	struct uma_bucket_zone *ubz;
407 	uma_bucket_t bucket;
408 
409 	/*
410 	 * This is to stop us from allocating per-CPU buckets while we're
411 	 * still allocating out of vm.boot_pages, since that would exhaust
412 	 * the boot pages.  It also prevents us from allocating buckets in
413 	 * low memory situations.
414 	 */
415 	if (bucketdisable)
416 		return (NULL);
417 	/*
418 	 * To limit bucket recursion we store the original zone flags
419 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
420 	 * NOVM flag to persist even through deep recursions.  We also
421 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
422 	 * a bucket for a bucket zone so we do not allow infinite bucket
423 	 * recursion.  This cookie will even persist to frees of unused
424 	 * buckets via the allocation path or bucket allocations in the
425 	 * free path.
426 	 */
427 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
428 		udata = (void *)(uintptr_t)zone->uz_flags;
429 	else {
430 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
431 			return (NULL);
432 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
433 	}
434 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
435 		flags |= M_NOVM;
436 	ubz = bucket_zone_lookup(zone->uz_count);
437 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
438 		ubz++;
439 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
440 	if (bucket) {
441 #ifdef INVARIANTS
442 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
443 #endif
444 		bucket->ub_cnt = 0;
445 		bucket->ub_entries = ubz->ubz_entries;
446 	}
447 
448 	return (bucket);
449 }
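
/*
 * Example of the recursion cookie above: when a bucket zone itself needs a
 * bucket, the first level ORs UMA_ZFLAG_BUCKET into udata; should that
 * nested allocation come back here again, the flag is already set and we
 * return NULL, so bucket-for-bucket allocation bottoms out after one level
 * instead of recursing indefinitely.
 */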
450 
451 static void
452 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
453 {
454 	struct uma_bucket_zone *ubz;
455 
456 	KASSERT(bucket->ub_cnt == 0,
457 	    ("bucket_free: Freeing a non free bucket."));
458 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
459 		udata = (void *)(uintptr_t)zone->uz_flags;
460 	ubz = bucket_zone_lookup(bucket->ub_entries);
461 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
462 }
463 
464 static void
465 bucket_zone_drain(void)
466 {
467 	struct uma_bucket_zone *ubz;
468 
469 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
470 		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
471 }
472 
473 /*
474  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
475  * zone's caches.
476  */
477 static uma_bucket_t
478 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
479 {
480 	uma_bucket_t bucket;
481 
482 	ZONE_LOCK_ASSERT(zone);
483 
484 	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
485 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
486 		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
487 		zdom->uzd_nitems -= bucket->ub_cnt;
488 		if (zdom->uzd_imin > zdom->uzd_nitems)
489 			zdom->uzd_imin = zdom->uzd_nitems;
490 		zone->uz_bkt_count -= bucket->ub_cnt;
491 	}
492 	return (bucket);
493 }
494 
495 /*
496  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
497  * whether the bucket's contents should be counted as part of the zone's working
498  * set.
499  */
500 static void
501 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
502     const bool ws)
503 {
504 
505 	ZONE_LOCK_ASSERT(zone);
506 	KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow",
507 	    __func__, zone));
508 
509 	if (ws)
510 		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
511 	else
512 		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
513 	zdom->uzd_nitems += bucket->ub_cnt;
514 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
515 		zdom->uzd_imax = zdom->uzd_nitems;
516 	zone->uz_bkt_count += bucket->ub_cnt;
517 }
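
/*
 * Note on list ordering: buckets counted toward the working set ("ws" true)
 * go to the head of the per-domain queue, where zone_fetch_bucket() looks
 * first, while excluded buckets go to the tail, which is where
 * bucket_cache_reclaim() trims from.  Hot buckets are therefore reused
 * before cold ones are freed back to the bucket zones.
 */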
518 
519 static void
520 zone_log_warning(uma_zone_t zone)
521 {
522 	static const struct timeval warninterval = { 300, 0 };
523 
524 	if (!zone_warnings || zone->uz_warning == NULL)
525 		return;
526 
527 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
528 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
529 }
530 
531 static inline void
532 zone_maxaction(uma_zone_t zone)
533 {
534 
535 	if (zone->uz_maxaction.ta_func != NULL)
536 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
537 }
538 
539 /*
540  * Routine called by timeout which is used to fire off some time interval
541  * based calculations.  (stats, hash size, etc.)
542  *
543  * Arguments:
544  *	arg   Unused
545  *
546  * Returns:
547  *	Nothing
548  */
549 static void
550 uma_timeout(void *unused)
551 {
552 	bucket_enable();
553 	zone_foreach(zone_timeout);
554 
555 	/* Reschedule this event */
556 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
557 }
558 
559 /*
560  * Update the working set size estimate for the zone's bucket cache.
561  * The constants chosen here are somewhat arbitrary.  With an update period of
562  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
563  * last 100s.
564  */
565 static void
566 zone_domain_update_wss(uma_zone_domain_t zdom)
567 {
568 	long wss;
569 
570 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
571 	wss = zdom->uzd_imax - zdom->uzd_imin;
572 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
573 	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
574 }
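
/*
 * Worked example: if the previous estimate uzd_wss was 100 items and the
 * domain's bucket cache swung between imin = 50 and imax = 250 over the
 * last interval (wss = 200), the new estimate is (4 * 200 + 100) / 5 = 180.
 * Four fifths of the weight is on the latest interval, so the estimate
 * decays within a few UMA_TIMEOUT periods after a burst of activity ends.
 */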
575 
576 /*
577  * Routine to perform timeout driven calculations.  This expands the
578  * hashes and does per cpu statistics aggregation.
579  *
580  *  Returns nothing.
581  */
582 static void
583 zone_timeout(uma_zone_t zone)
584 {
585 	uma_keg_t keg = zone->uz_keg;
586 	u_int slabs;
587 
588 	KEG_LOCK(keg);
589 	/*
590 	 * Expand the keg hash table.
591 	 *
592 	 * This is done if the number of slabs is larger than the hash size.
593 	 * What I'm trying to do here is eliminate collisions entirely.  This
594 	 * may be a little aggressive.  Should I allow for two collisions max?
595 	 */
596 	if (keg->uk_flags & UMA_ZONE_HASH &&
597 	    (slabs = keg->uk_pages / keg->uk_ppera) >
598 	     keg->uk_hash.uh_hashsize) {
599 		struct uma_hash newhash;
600 		struct uma_hash oldhash;
601 		int ret;
602 
603 		/*
604 		 * This is so involved because allocating and freeing
605 		 * while the keg lock is held will lead to deadlock.
606 		 * I have to do everything in stages and check for
607 		 * races.
608 		 */
609 		KEG_UNLOCK(keg);
610 		ret = hash_alloc(&newhash, 1 << fls(slabs));
611 		KEG_LOCK(keg);
612 		if (ret) {
613 			if (hash_expand(&keg->uk_hash, &newhash)) {
614 				oldhash = keg->uk_hash;
615 				keg->uk_hash = newhash;
616 			} else
617 				oldhash = newhash;
618 
619 			KEG_UNLOCK(keg);
620 			hash_free(&oldhash);
621 			return;
622 		}
623 	}
624 	KEG_UNLOCK(keg);
625 
626 	ZONE_LOCK(zone);
627 	for (int i = 0; i < vm_ndomains; i++)
628 		zone_domain_update_wss(&zone->uz_domain[i]);
629 	ZONE_UNLOCK(zone);
630 }
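
/*
 * Example of the expansion trigger above: a keg with uk_ppera = 1 and
 * uk_pages = 300 has 300 slabs; once that exceeds, say, a 256-bucket hash,
 * the replacement table is sized to 1 << fls(300) = 512 buckets, keeping
 * the average chain length at one slab per bucket or less.
 */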
631 
632 /*
633  * Allocate and zero fill the next sized hash table from the appropriate
634  * backing store.
635  *
636  * Arguments:
637  *	hash  The hash structure to initialize
 *	size  The requested number of hash buckets; must be a power of 2
638  *
639  * Returns:
640  *	1 on success and 0 on failure.
641  */
642 static int
643 hash_alloc(struct uma_hash *hash, u_int size)
644 {
645 	size_t alloc;
646 
647 	KASSERT(powerof2(size), ("hash size must be power of 2"));
648 	if (size > UMA_HASH_SIZE_INIT)  {
649 		hash->uh_hashsize = size;
650 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
651 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
652 		    M_UMAHASH, M_NOWAIT);
653 	} else {
654 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
655 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
656 		    UMA_ANYDOMAIN, M_WAITOK);
657 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
658 	}
659 	if (hash->uh_slab_hash) {
660 		bzero(hash->uh_slab_hash, alloc);
661 		hash->uh_hashmask = hash->uh_hashsize - 1;
662 		return (1);
663 	}
664 
665 	return (0);
666 }
667 
668 /*
669  * Expands the hash table for HASH zones.  This is done from zone_timeout
670  * to reduce collisions.  This must not be done in the regular allocation
671  * path, otherwise, we can recurse on the vm while allocating pages.
672  *
673  * Arguments:
674  *	oldhash  The hash you want to expand
675  *	newhash  The hash structure for the new table
676  *
677  * Returns:
678  *	Nothing
679  *
680  * Discussion:
681  */
682 static int
683 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
684 {
685 	uma_slab_t slab;
686 	u_int hval;
687 	u_int idx;
688 
689 	if (!newhash->uh_slab_hash)
690 		return (0);
691 
692 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
693 		return (0);
694 
695 	/*
696 	 * I need to investigate hash algorithms for resizing without a
697 	 * full rehash.
698 	 */
699 
700 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
701 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
702 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
703 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
704 			hval = UMA_HASH(newhash, slab->us_data);
705 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
706 			    slab, us_hlink);
707 		}
708 
709 	return (1);
710 }
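
/*
 * Example: growing from 128 to 256 buckets rehashes every slab by removing
 * it from its old chain and reinserting it at UMA_HASH(newhash,
 * slab->us_data).  The new bucket heads only need to start out zeroed,
 * which hash_alloc() guarantees with its bzero().
 */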
711 
712 /*
713  * Free the hash bucket to the appropriate backing store.
714  *
715  * Arguments:
716  *	hash  The hash structure whose bucket array is being freed; its
717  *	      size determines which backing store it is returned to
718  *
719  * Returns:
720  *	Nothing
721  */
722 static void
723 hash_free(struct uma_hash *hash)
724 {
725 	if (hash->uh_slab_hash == NULL)
726 		return;
727 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
728 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
729 	else
730 		free(hash->uh_slab_hash, M_UMAHASH);
731 }
732 
733 /*
734  * Frees all outstanding items in a bucket
735  *
736  * Arguments:
737  *	zone   The zone to free to, must be unlocked.
738  *	bucket The free/alloc bucket with items, cpu queue must be locked.
739  *
740  * Returns:
741  *	Nothing
742  */
743 
744 static void
745 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
746 {
747 	int i;
748 
749 	if (bucket == NULL)
750 		return;
751 
752 	if (zone->uz_fini)
753 		for (i = 0; i < bucket->ub_cnt; i++)
754 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
755 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
756 	if (zone->uz_max_items > 0) {
757 		ZONE_LOCK(zone);
758 		zone->uz_items -= bucket->ub_cnt;
759 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
760 			wakeup_one(zone);
761 		ZONE_UNLOCK(zone);
762 	}
763 	bucket->ub_cnt = 0;
764 }
765 
766 /*
767  * Drains the per cpu caches for a zone.
768  *
769  * NOTE: This may only be called while the zone is being torn down, and not
770  * during normal operation.  This is necessary so that we do not have
771  * to migrate CPUs to drain the per-CPU caches.
772  *
773  * Arguments:
774  *	zone     The zone to drain, must be unlocked.
775  *
776  * Returns:
777  *	Nothing
778  */
779 static void
780 cache_drain(uma_zone_t zone)
781 {
782 	uma_cache_t cache;
783 	int cpu;
784 
785 	/*
786 	 * XXX: It is safe to not lock the per-CPU caches, because we're
787 	 * tearing down the zone anyway.  I.e., there will be no further use
788 	 * of the caches at this point.
789 	 *
790 	 * XXX: It would be good to be able to assert that the zone is being
791 	 * torn down to prevent improper use of cache_drain().
792 	 *
793 	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
794 	 * it is used elsewhere.  Should the tear-down path be made special
795 	 * there in some form?
796 	 */
797 	CPU_FOREACH(cpu) {
798 		cache = &zone->uz_cpu[cpu];
799 		bucket_drain(zone, cache->uc_allocbucket);
800 		if (cache->uc_allocbucket != NULL)
801 			bucket_free(zone, cache->uc_allocbucket, NULL);
802 		cache->uc_allocbucket = NULL;
803 		bucket_drain(zone, cache->uc_freebucket);
804 		if (cache->uc_freebucket != NULL)
805 			bucket_free(zone, cache->uc_freebucket, NULL);
806 		cache->uc_freebucket = NULL;
807 		bucket_drain(zone, cache->uc_crossbucket);
808 		if (cache->uc_crossbucket != NULL)
809 			bucket_free(zone, cache->uc_crossbucket, NULL);
810 		cache->uc_crossbucket = NULL;
811 	}
812 	ZONE_LOCK(zone);
813 	bucket_cache_reclaim(zone, true);
814 	ZONE_UNLOCK(zone);
815 }
816 
817 static void
818 cache_shrink(uma_zone_t zone)
819 {
820 
821 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
822 		return;
823 
824 	ZONE_LOCK(zone);
825 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
826 	ZONE_UNLOCK(zone);
827 }
828 
829 static void
830 cache_drain_safe_cpu(uma_zone_t zone)
831 {
832 	uma_cache_t cache;
833 	uma_bucket_t b1, b2, b3;
834 	int domain;
835 
836 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
837 		return;
838 
839 	b1 = b2 = b3 = NULL;
840 	ZONE_LOCK(zone);
841 	critical_enter();
842 	if (zone->uz_flags & UMA_ZONE_NUMA)
843 		domain = PCPU_GET(domain);
844 	else
845 		domain = 0;
846 	cache = &zone->uz_cpu[curcpu];
847 	if (cache->uc_allocbucket) {
848 		if (cache->uc_allocbucket->ub_cnt != 0)
849 			zone_put_bucket(zone, &zone->uz_domain[domain],
850 			    cache->uc_allocbucket, false);
851 		else
852 			b1 = cache->uc_allocbucket;
853 		cache->uc_allocbucket = NULL;
854 	}
855 	if (cache->uc_freebucket) {
856 		if (cache->uc_freebucket->ub_cnt != 0)
857 			zone_put_bucket(zone, &zone->uz_domain[domain],
858 			    cache->uc_freebucket, false);
859 		else
860 			b2 = cache->uc_freebucket;
861 		cache->uc_freebucket = NULL;
862 	}
863 	b3 = cache->uc_crossbucket;
864 	cache->uc_crossbucket = NULL;
865 	critical_exit();
866 	ZONE_UNLOCK(zone);
867 	if (b1)
868 		bucket_free(zone, b1, NULL);
869 	if (b2)
870 		bucket_free(zone, b2, NULL);
871 	if (b3) {
872 		bucket_drain(zone, b3);
873 		bucket_free(zone, b3, NULL);
874 	}
875 }
876 
877 /*
878  * Safely drain the per-CPU caches of a zone, or of all zones, into the bucket cache.
879  * This is an expensive call because it needs to bind to all CPUs
880  * one by one and enter a critical section on each of them in order
881  * to safely access their cache buckets.
882  * The zone lock must not be held when calling this function.
883  */
884 static void
885 pcpu_cache_drain_safe(uma_zone_t zone)
886 {
887 	int cpu;
888 
889 	/*
890 	 * Polite bucket size shrinking was not enough; shrink aggressively.
891 	 */
892 	if (zone)
893 		cache_shrink(zone);
894 	else
895 		zone_foreach(cache_shrink);
896 
897 	CPU_FOREACH(cpu) {
898 		thread_lock(curthread);
899 		sched_bind(curthread, cpu);
900 		thread_unlock(curthread);
901 
902 		if (zone)
903 			cache_drain_safe_cpu(zone);
904 		else
905 			zone_foreach(cache_drain_safe_cpu);
906 	}
907 	thread_lock(curthread);
908 	sched_unbind(curthread);
909 	thread_unlock(curthread);
910 }
911 
912 /*
913  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
914  * requested a drain, otherwise each per-domain cache is trimmed to its
915  * estimated working set size.
916  */
917 static void
918 bucket_cache_reclaim(uma_zone_t zone, bool drain)
919 {
920 	uma_zone_domain_t zdom;
921 	uma_bucket_t bucket;
922 	long target, tofree;
923 	int i;
924 
925 	for (i = 0; i < vm_ndomains; i++) {
926 		zdom = &zone->uz_domain[i];
927 
928 		/*
929 		 * If we were asked to drain the zone, we are done only once
930 		 * this bucket cache is empty.  Otherwise, we reclaim items in
931 		 * excess of the zone's estimated working set size.  If the
932 		 * difference nitems - imin is larger than the WSS estimate,
933 		 * then the estimate will grow at the end of this interval and
934 		 * we ignore the historical average.
935 		 */
936 		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
937 		    zdom->uzd_imin);
938 		while (zdom->uzd_nitems > target) {
939 			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
940 			if (bucket == NULL)
941 				break;
942 			tofree = bucket->ub_cnt;
943 			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
944 			zdom->uzd_nitems -= tofree;
945 
946 			/*
947 			 * Shift the bounds of the current WSS interval to avoid
948 			 * perturbing the estimate.
949 			 */
950 			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
951 			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
952 
953 			ZONE_UNLOCK(zone);
954 			bucket_drain(zone, bucket);
955 			bucket_free(zone, bucket, NULL);
956 			ZONE_LOCK(zone);
957 		}
958 	}
959 
960 	/*
961 	 * Shrink the zone bucket size to ensure that the per-CPU caches
962 	 * don't grow too large.
963 	 */
964 	if (zone->uz_count > zone->uz_count_min)
965 		zone->uz_count--;
966 }
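
/*
 * Worked example of the trim target: with uzd_nitems = 500, uzd_imin = 200
 * and uzd_wss = 250, a trim (drain == false) computes
 * target = max(250, 500 - 200) = 300 and frees whole buckets until the
 * domain's cached item count is at or below 300 (or the list is empty).
 * A drain uses target = 0 and empties the bucket cache completely.
 */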
967 
968 static void
969 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
970 {
971 	uint8_t *mem;
972 	int i;
973 	uint8_t flags;
974 
975 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
976 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
977 
978 	mem = slab->us_data;
979 	flags = slab->us_flags;
980 	i = start;
981 	if (keg->uk_fini != NULL) {
982 		for (i--; i > -1; i--)
983 #ifdef INVARIANTS
984 		/*
985 		 * trash_fini implies that dtor was trash_dtor. trash_fini
986 		 * would check that memory hasn't been modified since free,
987 		 * which executed trash_dtor.
988 		 * That's why we need to run uma_dbg_kskip() check here,
989 		 * although we don't make this skip check for other init/fini
990 		 * invocations.
991 		 */
992 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
993 		    keg->uk_fini != trash_fini)
994 #endif
995 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
996 			    keg->uk_size);
997 	}
998 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
999 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1000 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1001 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1002 }
1003 
1004 /*
1005  * Frees pages from a keg back to the system.  This is done on demand from
1006  * the pageout daemon.
1007  *
1008  * Returns nothing.
1009  */
1010 static void
1011 keg_drain(uma_keg_t keg)
1012 {
1013 	struct slabhead freeslabs = { 0 };
1014 	uma_domain_t dom;
1015 	uma_slab_t slab, tmp;
1016 	int i;
1017 
1018 	/*
1019 	 * We don't want to take pages from statically allocated kegs at this
1020 	 * time
1021 	 */
1022 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1023 		return;
1024 
1025 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
1026 	    keg->uk_name, keg, keg->uk_free);
1027 	KEG_LOCK(keg);
1028 	if (keg->uk_free == 0)
1029 		goto finished;
1030 
1031 	for (i = 0; i < vm_ndomains; i++) {
1032 		dom = &keg->uk_domain[i];
1033 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
1034 			/* We have nowhere to free these to. */
1035 			if (slab->us_flags & UMA_SLAB_BOOT)
1036 				continue;
1037 
1038 			LIST_REMOVE(slab, us_link);
1039 			keg->uk_pages -= keg->uk_ppera;
1040 			keg->uk_free -= keg->uk_ipers;
1041 
1042 			if (keg->uk_flags & UMA_ZONE_HASH)
1043 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
1044 				    slab->us_data);
1045 
1046 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1047 		}
1048 	}
1049 
1050 finished:
1051 	KEG_UNLOCK(keg);
1052 
1053 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1054 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1055 		keg_free_slab(keg, slab, keg->uk_ipers);
1056 	}
1057 }
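
/*
 * Note that keg_drain() is deliberately two-phase: free slabs are unlinked
 * from the per-domain lists onto the local freeslabs list while the keg
 * lock is held, and only after the lock is dropped are their pages handed
 * back through keg_free_slab(), so the page allocator is never entered
 * with the keg lock held.
 */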
1058 
1059 static void
1060 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1061 {
1062 
1063 	/*
1064 	 * Set RECLAIMING to interlock with zone_dtor() so we can release our
1065 	 * locks as we go.  Only dtor() should do a WAITOK call since it
1066 	 * is the only call that knows the structure will still be available
1067 	 * when it wakes up.
1068 	 */
1069 	ZONE_LOCK(zone);
1070 	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1071 		if (waitok == M_NOWAIT)
1072 			goto out;
1073 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1074 	}
1075 	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1076 	bucket_cache_reclaim(zone, drain);
1077 	ZONE_UNLOCK(zone);
1078 
1079 	/*
1080 	 * The RECLAIMING flag protects us from being freed while
1081 	 * we're running.  Normally the uma_rwlock would protect us but we
1082 	 * must be able to release and acquire the right lock for each keg.
1083 	 */
1084 	keg_drain(zone->uz_keg);
1085 	ZONE_LOCK(zone);
1086 	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1087 	wakeup(zone);
1088 out:
1089 	ZONE_UNLOCK(zone);
1090 }
1091 
1092 static void
1093 zone_drain(uma_zone_t zone)
1094 {
1095 
1096 	zone_reclaim(zone, M_NOWAIT, true);
1097 }
1098 
1099 static void
1100 zone_trim(uma_zone_t zone)
1101 {
1102 
1103 	zone_reclaim(zone, M_NOWAIT, false);
1104 }
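
/*
 * zone_drain() and zone_trim() differ only in the "drain" flag they pass to
 * bucket_cache_reclaim(): a drain empties the per-domain bucket caches
 * entirely, while a trim releases only what exceeds the working set
 * estimate.  Both use M_NOWAIT, so they back off rather than sleep if a
 * reclaim is already in progress.
 */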
1105 
1106 /*
1107  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1108  * If the allocation was successful, the keg lock will be held upon return,
1109  * otherwise the keg will be left unlocked.
1110  *
1111  * Arguments:
1112  *	flags   Wait flags for the item initialization routine
1113  *	aflags  Wait flags for the slab allocation
1114  *
1115  * Returns:
1116  *	The slab that was allocated or NULL if there is no memory and the
1117  *	caller specified M_NOWAIT.
1118  */
1119 static uma_slab_t
1120 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1121     int aflags)
1122 {
1123 	uma_alloc allocf;
1124 	uma_slab_t slab;
1125 	unsigned long size;
1126 	uint8_t *mem;
1127 	uint8_t sflags;
1128 	int i;
1129 
1130 	KASSERT(domain >= 0 && domain < vm_ndomains,
1131 	    ("keg_alloc_slab: domain %d out of range", domain));
1132 	KEG_LOCK_ASSERT(keg);
1133 	MPASS(zone->uz_lockptr == &keg->uk_lock);
1134 
1135 	allocf = keg->uk_allocf;
1136 	KEG_UNLOCK(keg);
1137 
1138 	slab = NULL;
1139 	mem = NULL;
1140 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1141 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1142 		if (slab == NULL)
1143 			goto out;
1144 	}
1145 
1146 	/*
1147 	 * This reproduces the old vm_zone behavior of zero filling pages the
1148 	 * first time they are added to a zone.
1149 	 *
1150 	 * Malloced items are zeroed in uma_zalloc.
1151 	 */
1152 
1153 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1154 		aflags |= M_ZERO;
1155 	else
1156 		aflags &= ~M_ZERO;
1157 
1158 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1159 		aflags |= M_NODUMP;
1160 
1161 	/* zone is passed for legacy reasons. */
1162 	size = keg->uk_ppera * PAGE_SIZE;
1163 	mem = allocf(zone, size, domain, &sflags, aflags);
1164 	if (mem == NULL) {
1165 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1166 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1167 		slab = NULL;
1168 		goto out;
1169 	}
1170 	uma_total_inc(size);
1171 
1172 	/* Point the slab into the allocated memory */
1173 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1174 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1175 
1176 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1177 		for (i = 0; i < keg->uk_ppera; i++)
1178 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1179 
1180 	slab->us_keg = keg;
1181 	slab->us_data = mem;
1182 	slab->us_freecount = keg->uk_ipers;
1183 	slab->us_flags = sflags;
1184 	slab->us_domain = domain;
1185 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1186 #ifdef INVARIANTS
1187 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1188 #endif
1189 
1190 	if (keg->uk_init != NULL) {
1191 		for (i = 0; i < keg->uk_ipers; i++)
1192 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1193 			    keg->uk_size, flags) != 0)
1194 				break;
1195 		if (i != keg->uk_ipers) {
1196 			keg_free_slab(keg, slab, i);
1197 			slab = NULL;
1198 			goto out;
1199 		}
1200 	}
1201 	KEG_LOCK(keg);
1202 
1203 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1204 	    slab, keg->uk_name, keg);
1205 
1206 	if (keg->uk_flags & UMA_ZONE_HASH)
1207 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1208 
1209 	keg->uk_pages += keg->uk_ppera;
1210 	keg->uk_free += keg->uk_ipers;
1211 
1212 out:
1213 	return (slab);
1214 }
1215 
1216 /*
1217  * This function is intended to be used early on in place of page_alloc() so
1218  * that we may use the boot time page cache to satisfy allocations before
1219  * the VM is ready.
1220  */
1221 static void *
1222 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1223     int wait)
1224 {
1225 	uma_keg_t keg;
1226 	void *mem;
1227 	int pages;
1228 
1229 	keg = zone->uz_keg;
1230 	/*
1231 	 * If we are in BOOT_BUCKETS or higher, then switch to the real
1232 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1233 	 */
1234 	switch (booted) {
1235 		case BOOT_COLD:
1236 		case BOOT_STRAPPED:
1237 			break;
1238 		case BOOT_PAGEALLOC:
1239 			if (keg->uk_ppera > 1)
1240 				break;
1241 		case BOOT_BUCKETS:
1242 		case BOOT_RUNNING:
1243 #ifdef UMA_MD_SMALL_ALLOC
1244 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1245 			    page_alloc : uma_small_alloc;
1246 #else
1247 			keg->uk_allocf = page_alloc;
1248 #endif
1249 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1250 	}
1251 
1252 	/*
1253 	 * Check our small startup cache to see if it has pages remaining.
1254 	 */
1255 	pages = howmany(bytes, PAGE_SIZE);
1256 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1257 	if (pages > boot_pages)
1258 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1259 #ifdef DIAGNOSTIC
1260 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1261 	    boot_pages);
1262 #endif
1263 	mem = bootmem;
1264 	boot_pages -= pages;
1265 	bootmem += pages * PAGE_SIZE;
1266 	*pflag = UMA_SLAB_BOOT;
1267 
1268 	return (mem);
1269 }
1270 
1271 /*
1272  * Allocates a number of pages from the system
1273  *
1274  * Arguments:
1275  *	bytes  The number of bytes requested
1276  *	wait  Shall we wait?
1277  *
1278  * Returns:
1279  *	A pointer to the alloced memory or possibly
1280  *	NULL if M_NOWAIT is set.
1281  */
1282 static void *
1283 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1284     int wait)
1285 {
1286 	void *p;	/* Returned page */
1287 
1288 	*pflag = UMA_SLAB_KERNEL;
1289 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1290 
1291 	return (p);
1292 }
1293 
1294 static void *
1295 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1296     int wait)
1297 {
1298 	struct pglist alloctail;
1299 	vm_offset_t addr, zkva;
1300 	int cpu, flags;
1301 	vm_page_t p, p_next;
1302 #ifdef NUMA
1303 	struct pcpu *pc;
1304 #endif
1305 
1306 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1307 
1308 	TAILQ_INIT(&alloctail);
1309 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1310 	    malloc2vm_flags(wait);
1311 	*pflag = UMA_SLAB_KERNEL;
1312 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1313 		if (CPU_ABSENT(cpu)) {
1314 			p = vm_page_alloc(NULL, 0, flags);
1315 		} else {
1316 #ifndef NUMA
1317 			p = vm_page_alloc(NULL, 0, flags);
1318 #else
1319 			pc = pcpu_find(cpu);
1320 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1321 			if (__predict_false(p == NULL))
1322 				p = vm_page_alloc(NULL, 0, flags);
1323 #endif
1324 		}
1325 		if (__predict_false(p == NULL))
1326 			goto fail;
1327 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1328 	}
1329 	if ((addr = kva_alloc(bytes)) == 0)
1330 		goto fail;
1331 	zkva = addr;
1332 	TAILQ_FOREACH(p, &alloctail, listq) {
1333 		pmap_qenter(zkva, &p, 1);
1334 		zkva += PAGE_SIZE;
1335 	}
1336 	return ((void*)addr);
1337 fail:
1338 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1339 		vm_page_unwire_noq(p);
1340 		vm_page_free(p);
1341 	}
1342 	return (NULL);
1343 }
1344 
1345 /*
1346  * Allocates a number of pages from within an object
1347  * Allocates a number of pages that do not belong to a VM object
1348  * Arguments:
1349  *	bytes  The number of bytes requested
1350  *	wait   Shall we wait?
1351  *
1352  * Returns:
1353  *	A pointer to the alloced memory or possibly
1354  *	NULL if M_NOWAIT is set.
1355  */
1356 static void *
1357 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1358     int wait)
1359 {
1360 	TAILQ_HEAD(, vm_page) alloctail;
1361 	u_long npages;
1362 	vm_offset_t retkva, zkva;
1363 	vm_page_t p, p_next;
1364 	uma_keg_t keg;
1365 
1366 	TAILQ_INIT(&alloctail);
1367 	keg = zone->uz_keg;
1368 
1369 	npages = howmany(bytes, PAGE_SIZE);
1370 	while (npages > 0) {
1371 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1372 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1373 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1374 		    VM_ALLOC_NOWAIT));
1375 		if (p != NULL) {
1376 			/*
1377 			 * Since the page does not belong to an object, its
1378 			 * listq is unused.
1379 			 */
1380 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1381 			npages--;
1382 			continue;
1383 		}
1384 		/*
1385 		 * Page allocation failed, free intermediate pages and
1386 		 * exit.
1387 		 */
1388 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1389 			vm_page_unwire_noq(p);
1390 			vm_page_free(p);
1391 		}
1392 		return (NULL);
1393 	}
1394 	*flags = UMA_SLAB_PRIV;
1395 	zkva = keg->uk_kva +
1396 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1397 	retkva = zkva;
1398 	TAILQ_FOREACH(p, &alloctail, listq) {
1399 		pmap_qenter(zkva, &p, 1);
1400 		zkva += PAGE_SIZE;
1401 	}
1402 
1403 	return ((void *)retkva);
1404 }
1405 
1406 /*
1407  * Frees a number of pages to the system
1408  *
1409  * Arguments:
1410  *	mem   A pointer to the memory to be freed
1411  *	size  The size of the memory being freed
1412  *	flags The original p->us_flags field
1413  *
1414  * Returns:
1415  *	Nothing
1416  */
1417 static void
1418 page_free(void *mem, vm_size_t size, uint8_t flags)
1419 {
1420 
1421 	if ((flags & UMA_SLAB_KERNEL) == 0)
1422 		panic("UMA: page_free used with invalid flags %x", flags);
1423 
1424 	kmem_free((vm_offset_t)mem, size);
1425 }
1426 
1427 /*
1428  * Frees pcpu zone allocations
1429  *
1430  * Arguments:
1431  *	mem   A pointer to the memory to be freed
1432  *	size  The size of the memory being freed
1433  *	flags The original p->us_flags field
1434  *
1435  * Returns:
1436  *	Nothing
1437  */
1438 static void
1439 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1440 {
1441 	vm_offset_t sva, curva;
1442 	vm_paddr_t paddr;
1443 	vm_page_t m;
1444 
1445 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1446 	sva = (vm_offset_t)mem;
1447 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1448 		paddr = pmap_kextract(curva);
1449 		m = PHYS_TO_VM_PAGE(paddr);
1450 		vm_page_unwire_noq(m);
1451 		vm_page_free(m);
1452 	}
1453 	pmap_qremove(sva, size >> PAGE_SHIFT);
1454 	kva_free(sva, size);
1455 }
1456 
1457 
1458 /*
1459  * Zero fill initializer
1460  *
1461  * Arguments/Returns follow uma_init specifications
1462  */
1463 static int
1464 zero_init(void *mem, int size, int flags)
1465 {
1466 	bzero(mem, size);
1467 	return (0);
1468 }
1469 
1470 /*
1471  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1472  *
1473  * Arguments
1474  *	keg  The zone we should initialize
1475  *
1476  * Returns
1477  *	Nothing
1478  */
1479 static void
1480 keg_small_init(uma_keg_t keg)
1481 {
1482 	u_int rsize;
1483 	u_int memused;
1484 	u_int wastedspace;
1485 	u_int shsize;
1486 	u_int slabsize;
1487 
1488 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1489 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1490 
1491 		slabsize = UMA_PCPU_ALLOC_SIZE;
1492 		keg->uk_ppera = ncpus;
1493 	} else {
1494 		slabsize = UMA_SLAB_SIZE;
1495 		keg->uk_ppera = 1;
1496 	}
1497 
1498 	/*
1499 	 * Calculate the size of each allocation (rsize) according to
1500 	 * alignment.  If the requested size is smaller than we have
1501 	 * allocation bits for, we round it up.
1502 	 */
1503 	rsize = keg->uk_size;
1504 	if (rsize < slabsize / SLAB_SETSIZE)
1505 		rsize = slabsize / SLAB_SETSIZE;
1506 	if (rsize & keg->uk_align)
1507 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1508 	keg->uk_rsize = rsize;
1509 
1510 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1511 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1512 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1513 
1514 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1515 		shsize = 0;
1516 	else
1517 		shsize = SIZEOF_UMA_SLAB;
1518 
1519 	if (rsize <= slabsize - shsize)
1520 		keg->uk_ipers = (slabsize - shsize) / rsize;
1521 	else {
1522 		/* Handle special case when we have 1 item per slab, so
1523 		 * alignment requirement can be relaxed. */
1524 		KASSERT(keg->uk_size <= slabsize - shsize,
1525 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1526 		keg->uk_ipers = 1;
1527 	}
1528 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1529 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1530 
1531 	memused = keg->uk_ipers * rsize + shsize;
1532 	wastedspace = slabsize - memused;
1533 
1534 	/*
1535 	 * We can't do OFFPAGE if we're internal or if we've been
1536 	 * asked to not go to the VM for buckets.  If we do this we
1537 	 * may end up going to the VM for slabs, which we do not
1538 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1539 	 * of UMA_ZONE_VM, which clearly forbids it.
1540 	 */
1541 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1542 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1543 		return;
1544 
1545 	/*
1546 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1547 	 * this if it permits more items per-slab.
1548 	 *
1549 	 * XXX We could try growing slabsize to limit max waste as well.
1550 	 * Historically this was not done because the VM could not
1551 	 * efficiently handle contiguous allocations.
1552 	 */
1553 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1554 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1555 		keg->uk_ipers = slabsize / keg->uk_rsize;
1556 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1557 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1558 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1559 		    "keg: %s(%p), calculated wastedspace = %d, "
1560 		    "maximum wasted space allowed = %d, "
1561 		    "calculated ipers = %d, "
1562 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1563 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1564 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1565 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1566 	}
1567 
1568 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1569 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1570 		keg->uk_flags |= UMA_ZONE_HASH;
1571 }
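
/*
 * Illustrative sizing example (assumes 4 KB pages, a SIZEOF_UMA_SLAB of
 * roughly 90 bytes and a UMA_MAX_WASTE divisor of 10; none of these values
 * are defined in this file): a 256-byte keg with pointer alignment keeps
 * rsize = 256, giving ipers = (4096 - 90) / 256 = 15 items per slab and
 * wastedspace = 4096 - (15 * 256 + 90) = 166 bytes.  Since 166 is below
 * slabsize / UMA_MAX_WASTE = 409, the slab header stays inline and no
 * OFFPAGE or HASH flags are set.
 */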
1572 
1573 /*
1574  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1575  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1576  * more complicated.
1577  *
1578  * Arguments
1579  *	keg  The keg we should initialize
1580  *
1581  * Returns
1582  *	Nothing
1583  */
1584 static void
1585 keg_large_init(uma_keg_t keg)
1586 {
1587 
1588 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1589 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1590 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1591 
1592 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1593 	keg->uk_ipers = 1;
1594 	keg->uk_rsize = keg->uk_size;
1595 
1596 	/* Check whether we have enough space to not do OFFPAGE. */
1597 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1598 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
1599 		/*
1600 		 * We can't do OFFPAGE if we're internal, in which case
1601 		 * we need an extra page per allocation to contain the
1602 		 * slab header.
1603 		 */
1604 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1605 			keg->uk_flags |= UMA_ZONE_OFFPAGE;
1606 		else
1607 			keg->uk_ppera++;
1608 	}
1609 
1610 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1611 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1612 		keg->uk_flags |= UMA_ZONE_HASH;
1613 }
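
/*
 * Example (with 4 KB pages): a hypothetical 5000-byte keg gets
 * uk_ppera = howmany(5000, 4096) = 2 and uk_ipers = 1.  The leftover
 * 8192 - 5000 = 3192 bytes comfortably hold an inline slab header, so
 * OFFPAGE is not needed; only items that come within SIZEOF_UMA_SLAB bytes
 * of filling their pages go offpage (or, for internal kegs, get an extra
 * page instead).
 */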
1614 
1615 static void
1616 keg_cachespread_init(uma_keg_t keg)
1617 {
1618 	int alignsize;
1619 	int trailer;
1620 	int pages;
1621 	int rsize;
1622 
1623 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1624 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1625 
1626 	alignsize = keg->uk_align + 1;
1627 	rsize = keg->uk_size;
1628 	/*
1629 	 * We want one item to start on every align boundary in a page.  To
1630 	 * do this we will span pages.  We will also extend the item by the
1631 	 * size of align if it is an even multiple of align.  Otherwise, it
1632 	 * would fall on the same boundary every time.
1633 	 */
1634 	if (rsize & keg->uk_align)
1635 		rsize = (rsize & ~keg->uk_align) + alignsize;
1636 	if ((rsize & alignsize) == 0)
1637 		rsize += alignsize;
1638 	trailer = rsize - keg->uk_size;
1639 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1640 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1641 	keg->uk_rsize = rsize;
1642 	keg->uk_ppera = pages;
1643 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1644 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1645 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1646 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1647 	    keg->uk_ipers));
1648 }
1649 
1650 /*
1651  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1652  * the keg onto the global keg list.
1653  *
1654  * Arguments/Returns follow uma_ctor specifications
1655  *	udata  Actually uma_kctor_args
1656  */
1657 static int
1658 keg_ctor(void *mem, int size, void *udata, int flags)
1659 {
1660 	struct uma_kctor_args *arg = udata;
1661 	uma_keg_t keg = mem;
1662 	uma_zone_t zone;
1663 
1664 	bzero(keg, size);
1665 	keg->uk_size = arg->size;
1666 	keg->uk_init = arg->uminit;
1667 	keg->uk_fini = arg->fini;
1668 	keg->uk_align = arg->align;
1669 	keg->uk_free = 0;
1670 	keg->uk_reserve = 0;
1671 	keg->uk_pages = 0;
1672 	keg->uk_flags = arg->flags;
1673 	keg->uk_slabzone = NULL;
1674 
1675 	/*
1676 	 * We use a global round-robin policy by default.  Zones with
1677 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1678 	 * iterator is never run.
1679 	 */
1680 	keg->uk_dr.dr_policy = DOMAINSET_RR();
1681 	keg->uk_dr.dr_iter = 0;
1682 
1683 	/*
1684 	 * The master zone is passed to us at keg-creation time.
1685 	 */
1686 	zone = arg->zone;
1687 	keg->uk_name = zone->uz_name;
1688 
1689 	if (arg->flags & UMA_ZONE_VM)
1690 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1691 
1692 	if (arg->flags & UMA_ZONE_ZINIT)
1693 		keg->uk_init = zero_init;
1694 
1695 	if (arg->flags & UMA_ZONE_MALLOC)
1696 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1697 
1698 	if (arg->flags & UMA_ZONE_PCPU)
1699 #ifdef SMP
1700 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1701 #else
1702 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1703 #endif
1704 
1705 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1706 		keg_cachespread_init(keg);
1707 	} else {
1708 		if (keg->uk_size > UMA_SLAB_SPACE)
1709 			keg_large_init(keg);
1710 		else
1711 			keg_small_init(keg);
1712 	}
1713 
1714 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1715 		keg->uk_slabzone = slabzone;
1716 
1717 	/*
1718 	 * If we haven't booted yet we need allocations to go through the
1719 	 * startup cache until the vm is ready.
1720 	 */
1721 	if (booted < BOOT_PAGEALLOC)
1722 		keg->uk_allocf = startup_alloc;
1723 #ifdef UMA_MD_SMALL_ALLOC
1724 	else if (keg->uk_ppera == 1)
1725 		keg->uk_allocf = uma_small_alloc;
1726 #endif
1727 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1728 		keg->uk_allocf = pcpu_page_alloc;
1729 	else
1730 		keg->uk_allocf = page_alloc;
1731 #ifdef UMA_MD_SMALL_ALLOC
1732 	if (keg->uk_ppera == 1)
1733 		keg->uk_freef = uma_small_free;
1734 	else
1735 #endif
1736 	if (keg->uk_flags & UMA_ZONE_PCPU)
1737 		keg->uk_freef = pcpu_page_free;
1738 	else
1739 		keg->uk_freef = page_free;
1740 
1741 	/*
1742 	 * Initialize keg's lock
1743 	 */
1744 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1745 
1746 	/*
1747 	 * If we're putting the slab header in the actual page we need to
1748 	 * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
1749 	 * macro definition.
1750 	 */
1751 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1752 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
1753 		/*
1754 		 * The only way the following is possible is if, with our
1755 		 * UMA_ALIGN_PTR adjustments, we are now bigger than
1756 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1757 		 * mathematically possible for all cases, so we make
1758 		 * sure here anyway.
1759 		 */
1760 		KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
1761 		    PAGE_SIZE * keg->uk_ppera,
1762 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
1763 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1764 	}
1765 
1766 	if (keg->uk_flags & UMA_ZONE_HASH)
1767 		hash_alloc(&keg->uk_hash, 0);
1768 
1769 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1770 	    keg, zone->uz_name, zone,
1771 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1772 	    keg->uk_free);
1773 
1774 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1775 
1776 	rw_wlock(&uma_rwlock);
1777 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1778 	rw_wunlock(&uma_rwlock);
1779 	return (0);
1780 }
1781 
1782 static void
1783 zone_alloc_counters(uma_zone_t zone)
1784 {
1785 
1786 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
1787 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
1788 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
1789 }
1790 
1791 /*
1792  * Zone header ctor.  This initializes all fields, locks, etc.
1793  *
1794  * Arguments/Returns follow uma_ctor specifications
1795  *	udata  Actually uma_zctor_args
1796  */
1797 static int
1798 zone_ctor(void *mem, int size, void *udata, int flags)
1799 {
1800 	struct uma_zctor_args *arg = udata;
1801 	uma_zone_t zone = mem;
1802 	uma_zone_t z;
1803 	uma_keg_t keg;
1804 	int i;
1805 
1806 	bzero(zone, size);
1807 	zone->uz_name = arg->name;
1808 	zone->uz_ctor = arg->ctor;
1809 	zone->uz_dtor = arg->dtor;
1810 	zone->uz_init = NULL;
1811 	zone->uz_fini = NULL;
1812 	zone->uz_sleeps = 0;
1813 	zone->uz_xdomain = 0;
1814 	zone->uz_count = 0;
1815 	zone->uz_count_min = 0;
1816 	zone->uz_count_max = BUCKET_MAX;
1817 	zone->uz_flags = 0;
1818 	zone->uz_warning = NULL;
1819 	/* The domain structures follow the cpu structures. */
1820 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1821 	zone->uz_bkt_max = ULONG_MAX;
1822 	timevalclear(&zone->uz_ratecheck);
1823 
1824 	if (__predict_true(booted == BOOT_RUNNING))
1825 		zone_alloc_counters(zone);
1826 	else {
1827 		zone->uz_allocs = EARLY_COUNTER;
1828 		zone->uz_frees = EARLY_COUNTER;
1829 		zone->uz_fails = EARLY_COUNTER;
1830 	}
1831 
1832 	for (i = 0; i < vm_ndomains; i++)
1833 		TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
1834 
1835 	/*
1836 	 * This is a pure cache zone, no kegs.
1837 	 */
1838 	if (arg->import) {
1839 		if (arg->flags & UMA_ZONE_VM)
1840 			arg->flags |= UMA_ZFLAG_CACHEONLY;
1841 		zone->uz_flags = arg->flags;
1842 		zone->uz_size = arg->size;
1843 		zone->uz_import = arg->import;
1844 		zone->uz_release = arg->release;
1845 		zone->uz_arg = arg->arg;
1846 		zone->uz_lockptr = &zone->uz_lock;
1847 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1848 		rw_wlock(&uma_rwlock);
1849 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1850 		rw_wunlock(&uma_rwlock);
1851 		goto out;
1852 	}
1853 
1854 	/*
1855 	 * Use the regular zone/keg/slab allocator.
1856 	 */
1857 	zone->uz_import = (uma_import)zone_import;
1858 	zone->uz_release = (uma_release)zone_release;
1859 	zone->uz_arg = zone;
1860 	keg = arg->keg;
1861 
1862 	if (arg->flags & UMA_ZONE_SECONDARY) {
1863 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1864 		zone->uz_init = arg->uminit;
1865 		zone->uz_fini = arg->fini;
1866 		zone->uz_lockptr = &keg->uk_lock;
1867 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1868 		rw_wlock(&uma_rwlock);
1869 		ZONE_LOCK(zone);
1870 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1871 			if (LIST_NEXT(z, uz_link) == NULL) {
1872 				LIST_INSERT_AFTER(z, zone, uz_link);
1873 				break;
1874 			}
1875 		}
1876 		ZONE_UNLOCK(zone);
1877 		rw_wunlock(&uma_rwlock);
1878 	} else if (keg == NULL) {
1879 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1880 		    arg->align, arg->flags)) == NULL)
1881 			return (ENOMEM);
1882 	} else {
1883 		struct uma_kctor_args karg;
1884 		int error;
1885 
1886 		/* We should only be here from uma_startup() */
1887 		karg.size = arg->size;
1888 		karg.uminit = arg->uminit;
1889 		karg.fini = arg->fini;
1890 		karg.align = arg->align;
1891 		karg.flags = arg->flags;
1892 		karg.zone = zone;
1893 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1894 		    flags);
1895 		if (error)
1896 			return (error);
1897 	}
1898 
1899 	zone->uz_keg = keg;
1900 	zone->uz_size = keg->uk_size;
1901 	zone->uz_flags |= (keg->uk_flags &
1902 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1903 
1904 	/*
1905 	 * Some internal zones don't have room allocated for the per cpu
1906 	 * caches.  If we're internal, bail out here.
1907 	 */
1908 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1909 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1910 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1911 		return (0);
1912 	}
1913 
1914 out:
1915 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1916 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1917 	    ("Invalid zone flag combination"));
1918 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) {
1919 		zone->uz_count = BUCKET_MAX;
1920 	} else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) {
1921 		zone->uz_count = BUCKET_MIN;
1922 		zone->uz_count_max = BUCKET_MIN;
1923 	} else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1924 		zone->uz_count = 0;
1925 	else
1926 		zone->uz_count = bucket_select(zone->uz_size);
1927 	zone->uz_count_min = zone->uz_count;
1928 
1929 	return (0);
1930 }
1931 
1932 /*
1933  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1934  * table and removes the keg from the global list.
1935  *
1936  * Arguments/Returns follow uma_dtor specifications
1937  *	udata  unused
1938  */
1939 static void
1940 keg_dtor(void *arg, int size, void *udata)
1941 {
1942 	uma_keg_t keg;
1943 
1944 	keg = (uma_keg_t)arg;
1945 	KEG_LOCK(keg);
1946 	if (keg->uk_free != 0) {
1947 		printf("Freed UMA keg (%s) was not empty (%d items). "
1948			    "Lost %d pages of memory.\n",
1949 		    keg->uk_name ? keg->uk_name : "",
1950 		    keg->uk_free, keg->uk_pages);
1951 	}
1952 	KEG_UNLOCK(keg);
1953 
1954 	hash_free(&keg->uk_hash);
1955 
1956 	KEG_LOCK_FINI(keg);
1957 }
1958 
1959 /*
1960  * Zone header dtor.
1961  *
1962  * Arguments/Returns follow uma_dtor specifications
1963  *	udata  unused
1964  */
1965 static void
1966 zone_dtor(void *arg, int size, void *udata)
1967 {
1968 	uma_zone_t zone;
1969 	uma_keg_t keg;
1970 
1971 	zone = (uma_zone_t)arg;
1972 
1973 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1974 		cache_drain(zone);
1975 
1976 	rw_wlock(&uma_rwlock);
1977 	LIST_REMOVE(zone, uz_link);
1978 	rw_wunlock(&uma_rwlock);
1979 	/*
1980		 * XXX there are some races here where the zone can be
1981		 * drained but then, once the zone lock is released,
1982		 * refilled before we finish removing it... we don't
1983		 * care for now.
1984 	 */
1985 	zone_reclaim(zone, M_WAITOK, true);
1986 	/*
1987 	 * We only destroy kegs from non secondary/non cache zones.
1988 	 */
1989 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
1990 		keg = zone->uz_keg;
1991 		rw_wlock(&uma_rwlock);
1992 		LIST_REMOVE(keg, uk_link);
1993 		rw_wunlock(&uma_rwlock);
1994 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1995 	}
1996 	counter_u64_free(zone->uz_allocs);
1997 	counter_u64_free(zone->uz_frees);
1998 	counter_u64_free(zone->uz_fails);
1999 	if (zone->uz_lockptr == &zone->uz_lock)
2000 		ZONE_LOCK_FINI(zone);
2001 }
2002 
2003 /*
2004  * Traverses every zone in the system and calls a callback
2005  *
2006  * Arguments:
2007  *	zfunc  A pointer to a function which accepts a zone
2008  *		as an argument.
2009  *
2010  * Returns:
2011  *	Nothing
2012  */
2013 static void
2014 zone_foreach(void (*zfunc)(uma_zone_t))
2015 {
2016 	uma_keg_t keg;
2017 	uma_zone_t zone;
2018 
2019 	/*
2020 	 * Before BOOT_RUNNING we are guaranteed to be single
2021 	 * threaded, so locking isn't needed. Startup functions
2022 	 * are allowed to use M_WAITOK.
2023 	 */
2024 	if (__predict_true(booted == BOOT_RUNNING))
2025 		rw_rlock(&uma_rwlock);
2026 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
2027 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2028 			zfunc(zone);
2029 	}
2030 	if (__predict_true(booted == BOOT_RUNNING))
2031 		rw_runlock(&uma_rwlock);
2032 }
2033 
2034 /*
2035	 * Count how many pages we need to bootstrap.  The VM supplies the
2036	 * number of its early zones in the argument; we add our own zones,
2037	 * which consist of the UMA Slabs zone, the UMA Hash zone and 9 bucket
2038	 * zones.  The zone of zones and the zone of kegs are accounted separately.
2039  */
2040 #define	UMA_BOOT_ZONES	11
2041 /* Zone of zones and zone of kegs have arbitrary alignment. */
2042 #define	UMA_BOOT_ALIGN	32
2043 static int zsize, ksize;
2044 int
2045 uma_startup_count(int vm_zones)
2046 {
2047 	int zones, pages;
2048 
2049 	ksize = sizeof(struct uma_keg) +
2050 	    (sizeof(struct uma_domain) * vm_ndomains);
2051 	zsize = sizeof(struct uma_zone) +
2052 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2053 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2054 
2055 	/*
2056 	 * Memory for the zone of kegs and its keg,
2057		 * and for the zone of zones.
2058 	 */
2059 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2060 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2061 
2062 #ifdef	UMA_MD_SMALL_ALLOC
2063 	zones = UMA_BOOT_ZONES;
2064 #else
2065 	zones = UMA_BOOT_ZONES + vm_zones;
2066 	vm_zones = 0;
2067 #endif
2068 
2069 	/* Memory for the rest of startup zones, UMA and VM, ... */
2070 	if (zsize > UMA_SLAB_SPACE) {
2071 		/* See keg_large_init(). */
2072 		u_int ppera;
2073 
2074 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2075 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
2076 		    SIZEOF_UMA_SLAB)
2077 			ppera++;
2078 		pages += (zones + vm_zones) * ppera;
2079 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2080 		/* See keg_small_init() special case for uk_ppera = 1. */
2081 		pages += zones;
2082 	else
2083 		pages += howmany(zones,
2084 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2085 
2086 	/* ... and their kegs. Note that zone of zones allocates a keg! */
2087 	pages += howmany(zones + 1,
2088 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2089 
2090 	/*
2091		 * Most of the startup zones are not going to be offpage, which is
2092		 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all of the
2093		 * calculations.  Some large bucket zones will be offpage, and
2094		 * thus will allocate hashes.  We take the conservative approach
2095		 * and assume that all zones may allocate a hash.  This may give
2096 	 * us some positive inaccuracy, usually an extra single page.
2097 	 */
2098 	pages += howmany(zones, UMA_SLAB_SPACE /
2099 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2100 
2101 	return (pages);
2102 }
2103 
2104 void
2105 uma_startup(void *mem, int npages)
2106 {
2107 	struct uma_zctor_args args;
2108 	uma_keg_t masterkeg;
2109 	uintptr_t m;
2110 
2111 #ifdef DIAGNOSTIC
2112 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2113 #endif
2114 
2115 	rw_init(&uma_rwlock, "UMA lock");
2116 
2117 	/* Use bootpages memory for the zone of zones and zone of kegs. */
2118 	m = (uintptr_t)mem;
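	/*
	 * The front of the boot pages is carved up, cache-line aligned, and
	 * what remains is rounded up to a page boundary:
	 *
	 *	[ zones | kegs | masterkeg | pad ][ remaining boot pages ]
	 */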
2119 	zones = (uma_zone_t)m;
2120 	m += roundup(zsize, CACHE_LINE_SIZE);
2121 	kegs = (uma_zone_t)m;
2122 	m += roundup(zsize, CACHE_LINE_SIZE);
2123 	masterkeg = (uma_keg_t)m;
2124 	m += roundup(ksize, CACHE_LINE_SIZE);
2125 	m = roundup(m, PAGE_SIZE);
2126 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2127 	mem = (void *)m;
2128 
2129 	/* "manually" create the initial zone */
2130 	memset(&args, 0, sizeof(args));
2131 	args.name = "UMA Kegs";
2132 	args.size = ksize;
2133 	args.ctor = keg_ctor;
2134 	args.dtor = keg_dtor;
2135 	args.uminit = zero_init;
2136 	args.fini = NULL;
2137 	args.keg = masterkeg;
2138 	args.align = UMA_BOOT_ALIGN - 1;
2139 	args.flags = UMA_ZFLAG_INTERNAL;
2140 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2141 
2142 	bootmem = mem;
2143 	boot_pages = npages;
2144 
2145 	args.name = "UMA Zones";
2146 	args.size = zsize;
2147 	args.ctor = zone_ctor;
2148 	args.dtor = zone_dtor;
2149 	args.uminit = zero_init;
2150 	args.fini = NULL;
2151 	args.keg = NULL;
2152 	args.align = UMA_BOOT_ALIGN - 1;
2153 	args.flags = UMA_ZFLAG_INTERNAL;
2154 	zone_ctor(zones, zsize, &args, M_WAITOK);
2155 
2156 	/* Now make a zone for slab headers */
2157 	slabzone = uma_zcreate("UMA Slabs",
2158 				sizeof(struct uma_slab),
2159 				NULL, NULL, NULL, NULL,
2160 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2161 
2162 	hashzone = uma_zcreate("UMA Hash",
2163 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2164 	    NULL, NULL, NULL, NULL,
2165 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2166 
2167 	bucket_init();
2168 
2169 	booted = BOOT_STRAPPED;
2170 }
2171 
2172 void
2173 uma_startup1(void)
2174 {
2175 
2176 #ifdef DIAGNOSTIC
2177 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2178 #endif
2179 	booted = BOOT_PAGEALLOC;
2180 }
2181 
2182 void
2183 uma_startup2(void)
2184 {
2185 
2186 #ifdef DIAGNOSTIC
2187 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2188 #endif
2189 	booted = BOOT_BUCKETS;
2190 	sx_init(&uma_reclaim_lock, "umareclaim");
2191 	bucket_enable();
2192 }
2193 
2194 /*
2195	 * Finish starting up: allocate the per-zone counters and initialize
2196	 * our callout handle.
2197	 */
2198 static void
2199 uma_startup3(void)
2200 {
2201 	uma_zone_t zone;
2202 
2203 #ifdef INVARIANTS
2204 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2205 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2206 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2207 #endif
2208 	zone_foreach(zone_alloc_counters);
2209 	LIST_FOREACH(zone, &uma_cachezones, uz_link)
2210 		zone_alloc_counters(zone);
2211 	callout_init(&uma_callout, 1);
2212 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2213 	booted = BOOT_RUNNING;
2214 }
2215 
2216 static uma_keg_t
2217 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2218 		int align, uint32_t flags)
2219 {
2220 	struct uma_kctor_args args;
2221 
2222 	args.size = size;
2223 	args.uminit = uminit;
2224 	args.fini = fini;
2225 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2226 	args.flags = flags;
2227 	args.zone = zone;
2228 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2229 }
2230 
2231 /* Public functions */
2232 /* See uma.h */
2233 void
2234 uma_set_align(int align)
2235 {
2236 
2237 	if (align != UMA_ALIGN_CACHE)
2238 		uma_align_cache = align;
2239 }
2240 
2241 /* See uma.h */
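/*
 * Illustrative usage sketch (hypothetical names, not compiled here): a
 * subsystem would typically create a zone once at initialization time and
 * then allocate and free items from it, e.g.:
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree(foo_zone, fp);
 *	uma_zdestroy(foo_zone);
 *
 * "foo_zone" and "struct foo" exist only for illustration; see uma.h and
 * uma(9) for the authoritative interface.
 */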
2242 uma_zone_t
2243 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2244 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2245 
2246 {
2247 	struct uma_zctor_args args;
2248 	uma_zone_t res;
2249 	bool locked;
2250 
2251 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2252 	    align, name));
2253 
2254 	/* Sets all zones to a first-touch domain policy. */
2255 #ifdef UMA_FIRSTTOUCH
2256 	flags |= UMA_ZONE_NUMA;
2257 #endif
2258 
2259 	/* This stuff is essential for the zone ctor */
2260 	memset(&args, 0, sizeof(args));
2261 	args.name = name;
2262 	args.size = size;
2263 	args.ctor = ctor;
2264 	args.dtor = dtor;
2265 	args.uminit = uminit;
2266 	args.fini = fini;
2267 #ifdef  INVARIANTS
2268 	/*
2269 	 * If a zone is being created with an empty constructor and
2270 	 * destructor, pass UMA constructor/destructor which checks for
2271 	 * memory use after free.
2272 	 */
2273 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2274 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2275 		args.ctor = trash_ctor;
2276 		args.dtor = trash_dtor;
2277 		args.uminit = trash_init;
2278 		args.fini = trash_fini;
2279 	}
2280 #endif
2281 	args.align = align;
2282 	args.flags = flags;
2283 	args.keg = NULL;
2284 
2285 	if (booted < BOOT_BUCKETS) {
2286 		locked = false;
2287 	} else {
2288 		sx_slock(&uma_reclaim_lock);
2289 		locked = true;
2290 	}
2291 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2292 	if (locked)
2293 		sx_sunlock(&uma_reclaim_lock);
2294 	return (res);
2295 }
2296 
2297 /* See uma.h */
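/*
 * Illustrative note (hypothetical names): a secondary zone shares its
 * master zone's keg, and therefore its slabs and item size, but layers
 * its own ctor/dtor and init/fini on top of it, e.g.:
 *
 *	bar_zone = uma_zsecond_create("bar", bar_ctor, bar_dtor,
 *	    NULL, NULL, foo_zone);
 *
 * Items allocated from "bar_zone" then come from the same slabs as items
 * allocated from "foo_zone".
 */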
2298 uma_zone_t
2299 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2300 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2301 {
2302 	struct uma_zctor_args args;
2303 	uma_keg_t keg;
2304 	uma_zone_t res;
2305 	bool locked;
2306 
2307 	keg = master->uz_keg;
2308 	memset(&args, 0, sizeof(args));
2309 	args.name = name;
2310 	args.size = keg->uk_size;
2311 	args.ctor = ctor;
2312 	args.dtor = dtor;
2313 	args.uminit = zinit;
2314 	args.fini = zfini;
2315 	args.align = keg->uk_align;
2316 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2317 	args.keg = keg;
2318 
2319 	if (booted < BOOT_BUCKETS) {
2320 		locked = false;
2321 	} else {
2322 		sx_slock(&uma_reclaim_lock);
2323 		locked = true;
2324 	}
2325 	/* XXX Attaches only one keg of potentially many. */
2326 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2327 	if (locked)
2328 		sx_sunlock(&uma_reclaim_lock);
2329 	return (res);
2330 }
2331 
2332 /* See uma.h */
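/*
 * Illustrative note: a cache zone has no keg.  Items are obtained from and
 * returned to the backend through the caller-supplied import/release
 * callbacks, which (mirroring how zone_import() and zone_release() are used
 * in this file) are expected to have the shape:
 *
 *	int  my_import(void *arg, void **store, int count, int domain,
 *	    int flags);
 *	void my_release(void *arg, void **store, int count);
 *
 * "my_import" and "my_release" are hypothetical names; the zone merely
 * layers its per-CPU bucket caching on top of them.
 */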
2333 uma_zone_t
2334 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2335 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2336 		    uma_release zrelease, void *arg, int flags)
2337 {
2338 	struct uma_zctor_args args;
2339 
2340 	memset(&args, 0, sizeof(args));
2341 	args.name = name;
2342 	args.size = size;
2343 	args.ctor = ctor;
2344 	args.dtor = dtor;
2345 	args.uminit = zinit;
2346 	args.fini = zfini;
2347 	args.import = zimport;
2348 	args.release = zrelease;
2349 	args.arg = arg;
2350 	args.align = 0;
2351 	args.flags = flags | UMA_ZFLAG_CACHE;
2352 
2353 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2354 }
2355 
2356 /* See uma.h */
2357 void
2358 uma_zdestroy(uma_zone_t zone)
2359 {
2360 
2361 	sx_slock(&uma_reclaim_lock);
2362 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2363 	sx_sunlock(&uma_reclaim_lock);
2364 }
2365 
2366 void
2367 uma_zwait(uma_zone_t zone)
2368 {
2369 	void *item;
2370 
2371 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2372 	uma_zfree(zone, item);
2373 }
2374 
2375 void *
2376 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2377 {
2378 	void *item;
2379 #ifdef SMP
2380 	int i;
2381 
2382 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2383 #endif
2384 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
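	/* M_ZERO was masked off above; zero each CPU's copy of the item by hand. */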
2385 	if (item != NULL && (flags & M_ZERO)) {
2386 #ifdef SMP
2387 		for (i = 0; i <= mp_maxid; i++)
2388 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2389 #else
2390 		bzero(item, zone->uz_size);
2391 #endif
2392 	}
2393 	return (item);
2394 }
2395 
2396 /*
2397  * A stub while both regular and pcpu cases are identical.
2398  */
2399 void
2400 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2401 {
2402 
2403 #ifdef SMP
2404 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2405 #endif
2406 	uma_zfree_arg(zone, item, udata);
2407 }
2408 
2409 /* See uma.h */
2410 void *
2411 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2412 {
2413 	uma_zone_domain_t zdom;
2414 	uma_bucket_t bucket;
2415 	uma_cache_t cache;
2416 	void *item;
2417 	int cpu, domain, lockfail, maxbucket;
2418 #ifdef INVARIANTS
2419 	bool skipdbg;
2420 #endif
2421 
2422 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2423 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2424 
2425 	/* This is the fast path allocation */
2426 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2427 	    curthread, zone->uz_name, zone, flags);
2428 
2429 	if (flags & M_WAITOK) {
2430 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2431 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2432 	}
2433 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2434 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2435 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2436 	if (zone->uz_flags & UMA_ZONE_PCPU)
2437 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2438 		    "with M_ZERO passed"));
2439 
2440 #ifdef DEBUG_MEMGUARD
2441 	if (memguard_cmp_zone(zone)) {
2442 		item = memguard_alloc(zone->uz_size, flags);
2443 		if (item != NULL) {
2444 			if (zone->uz_init != NULL &&
2445 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2446 				return (NULL);
2447 			if (zone->uz_ctor != NULL &&
2448 			    zone->uz_ctor(item, zone->uz_size, udata,
2449 			    flags) != 0) {
2450 			    	zone->uz_fini(item, zone->uz_size);
2451 				return (NULL);
2452 			}
2453 			return (item);
2454 		}
2455 		/* This is unfortunate but should not be fatal. */
2456 	}
2457 #endif
2458 	/*
2459 	 * If possible, allocate from the per-CPU cache.  There are two
2460 	 * requirements for safe access to the per-CPU cache: (1) the thread
2461 	 * accessing the cache must not be preempted or yield during access,
2462 	 * and (2) the thread must not migrate CPUs without switching which
2463 	 * cache it accesses.  We rely on a critical section to prevent
2464 	 * preemption and migration.  We release the critical section in
2465 	 * order to acquire the zone mutex if we are unable to allocate from
2466 	 * the current cache; when we re-acquire the critical section, we
2467 	 * must detect and handle migration if it has occurred.
2468 	 */
2469 zalloc_restart:
2470 	critical_enter();
2471 	cpu = curcpu;
2472 	cache = &zone->uz_cpu[cpu];
2473 
2474 zalloc_start:
2475 	bucket = cache->uc_allocbucket;
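	/*
	 * Fast path: pop the last item pushed into the per-CPU alloc
	 * bucket; this LIFO order tends to return cache-hot items.
	 */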
2476 	if (bucket != NULL && bucket->ub_cnt > 0) {
2477 		bucket->ub_cnt--;
2478 		item = bucket->ub_bucket[bucket->ub_cnt];
2479 #ifdef INVARIANTS
2480 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2481 #endif
2482 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2483 		cache->uc_allocs++;
2484 		critical_exit();
2485 #ifdef INVARIANTS
2486 		skipdbg = uma_dbg_zskip(zone, item);
2487 #endif
2488 		if (zone->uz_ctor != NULL &&
2489 #ifdef INVARIANTS
2490 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2491 		    zone->uz_dtor != trash_dtor) &&
2492 #endif
2493 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2494 			counter_u64_add(zone->uz_fails, 1);
2495 			zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2496 			return (NULL);
2497 		}
2498 #ifdef INVARIANTS
2499 		if (!skipdbg)
2500 			uma_dbg_alloc(zone, NULL, item);
2501 #endif
2502 		if (flags & M_ZERO)
2503 			uma_zero_item(item, zone);
2504 		return (item);
2505 	}
2506 
2507 	/*
2508 	 * We have run out of items in our alloc bucket.
2509 	 * See if we can switch with our free bucket.
2510 	 */
2511 	bucket = cache->uc_freebucket;
2512 	if (bucket != NULL && bucket->ub_cnt > 0) {
2513 		CTR2(KTR_UMA,
2514 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2515 		    zone->uz_name, zone);
2516 		cache->uc_freebucket = cache->uc_allocbucket;
2517 		cache->uc_allocbucket = bucket;
2518 		goto zalloc_start;
2519 	}
2520 
2521 	/*
2522 	 * Discard any empty allocation bucket while we hold no locks.
2523 	 */
2524 	bucket = cache->uc_allocbucket;
2525 	cache->uc_allocbucket = NULL;
2526 	critical_exit();
2527 	if (bucket != NULL)
2528 		bucket_free(zone, bucket, udata);
2529 
2530 	/* Short-circuit for zones without buckets and low memory. */
2531 	if (zone->uz_count == 0 || bucketdisable) {
2532 		ZONE_LOCK(zone);
2533 		if (zone->uz_flags & UMA_ZONE_NUMA)
2534 			domain = PCPU_GET(domain);
2535 		else
2536 			domain = UMA_ANYDOMAIN;
2537 		goto zalloc_item;
2538 	}
2539 
2540 	/*
2541		 * The attempt to retrieve the item from the per-CPU cache has failed,
2542		 * so we must go back to the zone.  This requires the zone lock, so we
2543 	 * must drop the critical section, then re-acquire it when we go back
2544 	 * to the cache.  Since the critical section is released, we may be
2545 	 * preempted or migrate.  As such, make sure not to maintain any
2546 	 * thread-local state specific to the cache from prior to releasing
2547 	 * the critical section.
2548 	 */
2549 	lockfail = 0;
2550 	if (ZONE_TRYLOCK(zone) == 0) {
2551 		/* Record contention to size the buckets. */
2552 		ZONE_LOCK(zone);
2553 		lockfail = 1;
2554 	}
2555 	critical_enter();
2556 	cpu = curcpu;
2557 	cache = &zone->uz_cpu[cpu];
2558 
2559 	/* See if we lost the race to fill the cache. */
2560 	if (cache->uc_allocbucket != NULL) {
2561 		ZONE_UNLOCK(zone);
2562 		goto zalloc_start;
2563 	}
2564 
2565 	/*
2566 	 * Check the zone's cache of buckets.
2567 	 */
2568 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2569 		domain = PCPU_GET(domain);
2570 		zdom = &zone->uz_domain[domain];
2571 	} else {
2572 		domain = UMA_ANYDOMAIN;
2573 		zdom = &zone->uz_domain[0];
2574 	}
2575 
2576 	if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
2577 		KASSERT(bucket->ub_cnt != 0,
2578 		    ("uma_zalloc_arg: Returning an empty bucket."));
2579 		cache->uc_allocbucket = bucket;
2580 		ZONE_UNLOCK(zone);
2581 		goto zalloc_start;
2582 	}
2583 	/* We are no longer associated with this CPU. */
2584 	critical_exit();
2585 
2586 	/*
2587 	 * We bump the uz count when the cache size is insufficient to
2588 	 * handle the working set.
2589 	 */
2590 	if (lockfail && zone->uz_count < zone->uz_count_max)
2591 		zone->uz_count++;
2592 
2593 	if (zone->uz_max_items > 0) {
2594 		if (zone->uz_items >= zone->uz_max_items)
2595 			goto zalloc_item;
2596 		maxbucket = MIN(zone->uz_count,
2597 		    zone->uz_max_items - zone->uz_items);
2598 		zone->uz_items += maxbucket;
2599 	} else
2600 		maxbucket = zone->uz_count;
2601 	ZONE_UNLOCK(zone);
2602 
2603 	/*
2604		 * Now let's just fill a bucket and put it on the free list.  If that
2605		 * works we'll restart the allocation from the beginning and it
2606		 * will use the just-filled bucket.
2607 	 */
2608 	bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket);
2609 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2610 	    zone->uz_name, zone, bucket);
2611 	ZONE_LOCK(zone);
2612 	if (bucket != NULL) {
2613 		if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) {
2614 			MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt);
2615 			zone->uz_items -= maxbucket - bucket->ub_cnt;
2616 			if (zone->uz_sleepers > 0 &&
2617 			    zone->uz_items < zone->uz_max_items)
2618 				wakeup_one(zone);
2619 		}
2620 		critical_enter();
2621 		cpu = curcpu;
2622 		cache = &zone->uz_cpu[cpu];
2623 
2624 		/*
2625 		 * See if we lost the race or were migrated.  Cache the
2626 		 * initialized bucket to make this less likely or claim
2627 		 * the memory directly.
2628 		 */
2629 		if (cache->uc_allocbucket == NULL &&
2630 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2631 		    domain == PCPU_GET(domain))) {
2632 			cache->uc_allocbucket = bucket;
2633 			zdom->uzd_imax += bucket->ub_cnt;
2634 		} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
2635 			critical_exit();
2636 			ZONE_UNLOCK(zone);
2637 			bucket_drain(zone, bucket);
2638 			bucket_free(zone, bucket, udata);
2639 			goto zalloc_restart;
2640 		} else
2641 			zone_put_bucket(zone, zdom, bucket, false);
2642 		ZONE_UNLOCK(zone);
2643 		goto zalloc_start;
2644 	} else if (zone->uz_max_items > 0) {
2645 		zone->uz_items -= maxbucket;
2646 		if (zone->uz_sleepers > 0 &&
2647 		    zone->uz_items + 1 < zone->uz_max_items)
2648 			wakeup_one(zone);
2649 	}
2650 
2651 	/*
2652 	 * We may not be able to get a bucket so return an actual item.
2653 	 */
2654 zalloc_item:
2655 	item = zone_alloc_item_locked(zone, udata, domain, flags);
2656 
2657 	return (item);
2658 }
2659 
2660 void *
2661 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2662 {
2663 
2664 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2665 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2666 
2667 	/* This is the fast path allocation */
2668 	CTR5(KTR_UMA,
2669 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2670 	    curthread, zone->uz_name, zone, domain, flags);
2671 
2672 	if (flags & M_WAITOK) {
2673 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2674 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2675 	}
2676 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2677 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2678 
2679 	return (zone_alloc_item(zone, udata, domain, flags));
2680 }
2681 
2682 /*
2683	 * Find a slab with some space.  Prefer slabs that are partially used over
2684	 * those that are completely free.  This helps to reduce fragmentation.
2685  *
2686  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2687  * only 'domain'.
2688  */
2689 static uma_slab_t
2690 keg_first_slab(uma_keg_t keg, int domain, bool rr)
2691 {
2692 	uma_domain_t dom;
2693 	uma_slab_t slab;
2694 	int start;
2695 
2696 	KASSERT(domain >= 0 && domain < vm_ndomains,
2697 	    ("keg_first_slab: domain %d out of range", domain));
2698 	KEG_LOCK_ASSERT(keg);
2699 
2700 	slab = NULL;
2701 	start = domain;
2702 	do {
2703 		dom = &keg->uk_domain[domain];
2704 		if (!LIST_EMPTY(&dom->ud_part_slab))
2705 			return (LIST_FIRST(&dom->ud_part_slab));
2706 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2707 			slab = LIST_FIRST(&dom->ud_free_slab);
2708 			LIST_REMOVE(slab, us_link);
2709 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2710 			return (slab);
2711 		}
2712 		if (rr)
2713 			domain = (domain + 1) % vm_ndomains;
2714 	} while (domain != start);
2715 
2716 	return (NULL);
2717 }
2718 
2719 static uma_slab_t
2720 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2721 {
2722 	uint32_t reserve;
2723 
2724 	KEG_LOCK_ASSERT(keg);
2725 
2726 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2727 	if (keg->uk_free <= reserve)
2728 		return (NULL);
2729 	return (keg_first_slab(keg, domain, rr));
2730 }
2731 
2732 static uma_slab_t
2733 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2734 {
2735 	struct vm_domainset_iter di;
2736 	uma_domain_t dom;
2737 	uma_slab_t slab;
2738 	int aflags, domain;
2739 	bool rr;
2740 
2741 restart:
2742 	KEG_LOCK_ASSERT(keg);
2743 
2744 	/*
2745 	 * Use the keg's policy if upper layers haven't already specified a
2746 	 * domain (as happens with first-touch zones).
2747 	 *
2748 	 * To avoid races we run the iterator with the keg lock held, but that
2749 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2750 	 * clear M_WAITOK and handle low memory conditions locally.
2751 	 */
2752 	rr = rdomain == UMA_ANYDOMAIN;
2753 	if (rr) {
2754 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2755 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2756 		    &aflags);
2757 	} else {
2758 		aflags = flags;
2759 		domain = rdomain;
2760 	}
2761 
2762 	for (;;) {
2763 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
2764 		if (slab != NULL) {
2765 			MPASS(slab->us_keg == keg);
2766 			return (slab);
2767 		}
2768 
2769 		/*
2770 		 * M_NOVM means don't ask at all!
2771 		 */
2772 		if (flags & M_NOVM)
2773 			break;
2774 
2775 		KASSERT(zone->uz_max_items == 0 ||
2776 		    zone->uz_items <= zone->uz_max_items,
2777 		    ("%s: zone %p overflow", __func__, zone));
2778 
2779 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
2780 		/*
2781 		 * If we got a slab here it's safe to mark it partially used
2782 		 * and return.  We assume that the caller is going to remove
2783 		 * at least one item.
2784 		 */
2785 		if (slab) {
2786 			MPASS(slab->us_keg == keg);
2787 			dom = &keg->uk_domain[slab->us_domain];
2788 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2789 			return (slab);
2790 		}
2791 		KEG_LOCK(keg);
2792 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2793 			if ((flags & M_WAITOK) != 0) {
2794 				KEG_UNLOCK(keg);
2795 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
2796 				KEG_LOCK(keg);
2797 				goto restart;
2798 			}
2799 			break;
2800 		}
2801 	}
2802 
2803 	/*
2804 	 * We might not have been able to get a slab but another cpu
2805 	 * could have while we were unlocked.  Check again before we
2806 	 * fail.
2807 	 */
2808 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2809 		MPASS(slab->us_keg == keg);
2810 		return (slab);
2811 	}
2812 	return (NULL);
2813 }
2814 
2815 static uma_slab_t
2816 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2817 {
2818 	uma_slab_t slab;
2819 
2820 	if (keg == NULL) {
2821 		keg = zone->uz_keg;
2822 		KEG_LOCK(keg);
2823 	}
2824 
2825 	for (;;) {
2826 		slab = keg_fetch_slab(keg, zone, domain, flags);
2827 		if (slab)
2828 			return (slab);
2829 		if (flags & (M_NOWAIT | M_NOVM))
2830 			break;
2831 	}
2832 	KEG_UNLOCK(keg);
2833 	return (NULL);
2834 }
2835 
2836 static void *
2837 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2838 {
2839 	uma_domain_t dom;
2840 	void *item;
2841 	uint8_t freei;
2842 
2843 	MPASS(keg == slab->us_keg);
2844 	KEG_LOCK_ASSERT(keg);
2845 
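	/* us_free has one bit set per free item; take the lowest-indexed one. */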
2846 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2847 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2848 	item = slab->us_data + (keg->uk_rsize * freei);
2849 	slab->us_freecount--;
2850 	keg->uk_free--;
2851 
2852 	/* Move this slab to the full list */
2853 	if (slab->us_freecount == 0) {
2854 		LIST_REMOVE(slab, us_link);
2855 		dom = &keg->uk_domain[slab->us_domain];
2856 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2857 	}
2858 
2859 	return (item);
2860 }
2861 
2862 static int
2863 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2864 {
2865 	uma_slab_t slab;
2866 	uma_keg_t keg;
2867 #ifdef NUMA
2868 	int stripe;
2869 #endif
2870 	int i;
2871 
2872 	slab = NULL;
2873 	keg = NULL;
2874 	/* Try to keep the buckets totally full */
2875 	for (i = 0; i < max; ) {
2876 		if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL)
2877 			break;
2878 		keg = slab->us_keg;
2879 #ifdef NUMA
2880 		stripe = howmany(max, vm_ndomains);
2881 #endif
2882 		while (slab->us_freecount && i < max) {
2883 			bucket[i++] = slab_alloc_item(keg, slab);
2884 			if (keg->uk_free <= keg->uk_reserve)
2885 				break;
2886 #ifdef NUMA
2887 			/*
2888 			 * If the zone is striped we pick a new slab for every
2889 			 * N allocations.  Eliminating this conditional will
2890 			 * instead pick a new domain for each bucket rather
2891 			 * than stripe within each bucket.  The current option
2892 			 * produces more fragmentation and requires more cpu
2893 			 * time but yields better distribution.
2894 			 */
2895 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2896 			    vm_ndomains > 1 && --stripe == 0)
2897 				break;
2898 #endif
2899 		}
2900 		/* Don't block if we allocated any successfully. */
2901 		flags &= ~M_WAITOK;
2902 		flags |= M_NOWAIT;
2903 	}
2904 	if (slab != NULL)
2905 		KEG_UNLOCK(keg);
2906 
2907		return (i);
2908 }
2909 
2910 static uma_bucket_t
2911 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
2912 {
2913 	uma_bucket_t bucket;
2914 
2915		CTR1(KTR_UMA, "zone_alloc_bucket: domain %d", domain);
2916 
2917 	/* Avoid allocs targeting empty domains. */
2918 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
2919 		domain = UMA_ANYDOMAIN;
2920 
2921 	/* Don't wait for buckets, preserve caller's NOVM setting. */
2922 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2923 	if (bucket == NULL)
2924 		return (NULL);
2925 
2926 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2927 	    MIN(max, bucket->ub_entries), domain, flags);
2928 
2929 	/*
2930 	 * Initialize the memory if necessary.
2931 	 */
2932 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2933 		int i;
2934 
2935 		for (i = 0; i < bucket->ub_cnt; i++)
2936 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2937 			    flags) != 0)
2938 				break;
2939 		/*
2940 		 * If we couldn't initialize the whole bucket, put the
2941 		 * rest back onto the freelist.
2942 		 */
2943 		if (i != bucket->ub_cnt) {
2944 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2945 			    bucket->ub_cnt - i);
2946 #ifdef INVARIANTS
2947 			bzero(&bucket->ub_bucket[i],
2948 			    sizeof(void *) * (bucket->ub_cnt - i));
2949 #endif
2950 			bucket->ub_cnt = i;
2951 		}
2952 	}
2953 
2954 	if (bucket->ub_cnt == 0) {
2955 		bucket_free(zone, bucket, udata);
2956 		counter_u64_add(zone->uz_fails, 1);
2957 		return (NULL);
2958 	}
2959 
2960 	return (bucket);
2961 }
2962 
2963 /*
2964  * Allocates a single item from a zone.
2965  *
2966  * Arguments
2967  *	zone   The zone to alloc for.
2968  *	udata  The data to be passed to the constructor.
2969  *	domain The domain to allocate from or UMA_ANYDOMAIN.
2970  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2971  *
2972  * Returns
2973  *	NULL if there is no memory and M_NOWAIT is set
2974  *	An item if successful
2975  */
2976 
2977 static void *
2978 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
2979 {
2980 
2981 	ZONE_LOCK(zone);
2982 	return (zone_alloc_item_locked(zone, udata, domain, flags));
2983 }
2984 
2985 /*
2986  * Returns with zone unlocked.
2987  */
2988 static void *
2989 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
2990 {
2991 	void *item;
2992 #ifdef INVARIANTS
2993 	bool skipdbg;
2994 #endif
2995 
2996 	ZONE_LOCK_ASSERT(zone);
2997 
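	/*
	 * Charge the allocation against the zone limit up front; the
	 * "fail:" path below returns the reservation if the import fails.
	 */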
2998 	if (zone->uz_max_items > 0) {
2999 		if (zone->uz_items >= zone->uz_max_items) {
3000 			zone_log_warning(zone);
3001 			zone_maxaction(zone);
3002 			if (flags & M_NOWAIT) {
3003 				ZONE_UNLOCK(zone);
3004 				return (NULL);
3005 			}
3006 			zone->uz_sleeps++;
3007 			zone->uz_sleepers++;
3008 			while (zone->uz_items >= zone->uz_max_items)
3009 				mtx_sleep(zone, zone->uz_lockptr, PVM,
3010 				    "zonelimit", 0);
3011 			zone->uz_sleepers--;
3012 			if (zone->uz_sleepers > 0 &&
3013 			    zone->uz_items + 1 < zone->uz_max_items)
3014 				wakeup_one(zone);
3015 		}
3016 		zone->uz_items++;
3017 	}
3018 	ZONE_UNLOCK(zone);
3019 
3020 	/* Avoid allocs targeting empty domains. */
3021 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3022 		domain = UMA_ANYDOMAIN;
3023 
3024 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3025 		goto fail;
3026 
3027 #ifdef INVARIANTS
3028 	skipdbg = uma_dbg_zskip(zone, item);
3029 #endif
3030 	/*
3031 	 * We have to call both the zone's init (not the keg's init)
3032 	 * and the zone's ctor.  This is because the item is going from
3033 	 * a keg slab directly to the user, and the user is expecting it
3034 	 * to be both zone-init'd as well as zone-ctor'd.
3035 	 */
3036 	if (zone->uz_init != NULL) {
3037 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3038 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3039 			goto fail;
3040 		}
3041 	}
3042 	if (zone->uz_ctor != NULL &&
3043 #ifdef INVARIANTS
3044 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
3045 	    zone->uz_dtor != trash_dtor) &&
3046 #endif
3047 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3048 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
3049 		goto fail;
3050 	}
3051 #ifdef INVARIANTS
3052 	if (!skipdbg)
3053 		uma_dbg_alloc(zone, NULL, item);
3054 #endif
3055 	if (flags & M_ZERO)
3056 		uma_zero_item(item, zone);
3057 
3058 	counter_u64_add(zone->uz_allocs, 1);
3059 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3060 	    zone->uz_name, zone);
3061 
3062 	return (item);
3063 
3064 fail:
3065 	if (zone->uz_max_items > 0) {
3066 		ZONE_LOCK(zone);
3067 		zone->uz_items--;
3068 		ZONE_UNLOCK(zone);
3069 	}
3070 	counter_u64_add(zone->uz_fails, 1);
3071 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3072 	    zone->uz_name, zone);
3073 	return (NULL);
3074 }
3075 
3076 /* See uma.h */
3077 void
3078 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3079 {
3080 	uma_cache_t cache;
3081 	uma_bucket_t bucket;
3082 	uma_zone_domain_t zdom;
3083 	int cpu, domain;
3084 #ifdef UMA_XDOMAIN
3085 	int itemdomain;
3086 #endif
3087 	bool lockfail;
3088 #ifdef INVARIANTS
3089 	bool skipdbg;
3090 #endif
3091 
3092 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3093 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3094 
3095 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3096 	    zone->uz_name);
3097 
3098 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3099 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3100 
3101         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3102         if (item == NULL)
3103                 return;
3104 #ifdef DEBUG_MEMGUARD
3105 	if (is_memguard_addr(item)) {
3106 		if (zone->uz_dtor != NULL)
3107 			zone->uz_dtor(item, zone->uz_size, udata);
3108 		if (zone->uz_fini != NULL)
3109 			zone->uz_fini(item, zone->uz_size);
3110 		memguard_free(item);
3111 		return;
3112 	}
3113 #endif
3114 #ifdef INVARIANTS
3115 	skipdbg = uma_dbg_zskip(zone, item);
3116 	if (skipdbg == false) {
3117 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3118 			uma_dbg_free(zone, udata, item);
3119 		else
3120 			uma_dbg_free(zone, NULL, item);
3121 	}
3122 	if (zone->uz_dtor != NULL && (!skipdbg ||
3123 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3124 #else
3125 	if (zone->uz_dtor != NULL)
3126 #endif
3127 		zone->uz_dtor(item, zone->uz_size, udata);
3128 
3129 	/*
3130 	 * The race here is acceptable.  If we miss it we'll just have to wait
3131 	 * a little longer for the limits to be reset.
3132 	 */
3133 	if (zone->uz_sleepers > 0)
3134 		goto zfree_item;
3135 
3136 #ifdef UMA_XDOMAIN
3137 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3138 		itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
3139 #endif
3140 
3141 	/*
3142 	 * If possible, free to the per-CPU cache.  There are two
3143 	 * requirements for safe access to the per-CPU cache: (1) the thread
3144 	 * accessing the cache must not be preempted or yield during access,
3145 	 * and (2) the thread must not migrate CPUs without switching which
3146 	 * cache it accesses.  We rely on a critical section to prevent
3147 	 * preemption and migration.  We release the critical section in
3148 	 * order to acquire the zone mutex if we are unable to free to the
3149 	 * current cache; when we re-acquire the critical section, we must
3150 	 * detect and handle migration if it has occurred.
3151 	 */
3152 zfree_restart:
3153 	critical_enter();
3154 	cpu = curcpu;
3155 	cache = &zone->uz_cpu[cpu];
3156 
3157 zfree_start:
3158 	domain = PCPU_GET(domain);
3159 #ifdef UMA_XDOMAIN
3160 	if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
3161 		itemdomain = domain;
3162 #endif
3163 	/*
3164 	 * Try to free into the allocbucket first to give LIFO ordering
3165		 * for cache-hot data structures.  Spill over into the freebucket
3166 	 * if necessary.  Alloc will swap them if one runs dry.
3167 	 */
3168 #ifdef UMA_XDOMAIN
3169 	if (domain != itemdomain) {
3170 		bucket = cache->uc_crossbucket;
3171 	} else
3172 #endif
3173 	{
3174 		bucket = cache->uc_allocbucket;
3175 		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3176 			bucket = cache->uc_freebucket;
3177 	}
3178 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3179 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3180 		    ("uma_zfree: Freeing to non free bucket index."));
3181 		bucket->ub_bucket[bucket->ub_cnt] = item;
3182 		bucket->ub_cnt++;
3183 		cache->uc_frees++;
3184 		critical_exit();
3185 		return;
3186 	}
3187 
3188 	/*
3189		 * We must go back to the zone, which requires acquiring the zone lock,
3190 	 * which in turn means we must release and re-acquire the critical
3191 	 * section.  Since the critical section is released, we may be
3192 	 * preempted or migrate.  As such, make sure not to maintain any
3193 	 * thread-local state specific to the cache from prior to releasing
3194 	 * the critical section.
3195 	 */
3196 	critical_exit();
3197 	if (zone->uz_count == 0 || bucketdisable)
3198 		goto zfree_item;
3199 
3200 	lockfail = false;
3201 	if (ZONE_TRYLOCK(zone) == 0) {
3202 		/* Record contention to size the buckets. */
3203 		ZONE_LOCK(zone);
3204 		lockfail = true;
3205 	}
3206 	critical_enter();
3207 	cpu = curcpu;
3208 	domain = PCPU_GET(domain);
3209 	cache = &zone->uz_cpu[cpu];
3210 
3211 #ifdef UMA_XDOMAIN
3212 	if (domain != itemdomain)
3213 		bucket = cache->uc_crossbucket;
3214 	else
3215 #endif
3216 		bucket = cache->uc_freebucket;
3217 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3218 		ZONE_UNLOCK(zone);
3219 		goto zfree_start;
3220 	}
3221 #ifdef UMA_XDOMAIN
3222 	if (domain != itemdomain)
3223 		cache->uc_crossbucket = NULL;
3224 	else
3225 #endif
3226 		cache->uc_freebucket = NULL;
3227 	/* We are no longer associated with this CPU. */
3228 	critical_exit();
3229 
3230 #ifdef UMA_XDOMAIN
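	/*
	 * With UMA_XDOMAIN, an item freed on a CPU in a different domain
	 * than the one it belongs to is staged in the per-CPU "cross"
	 * bucket; a full cross bucket is handed back to the item's own
	 * domain (or simply drained, e.g. when the zone's bucket cache is
	 * already full).
	 */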
3231 	if (domain != itemdomain) {
3232 		if (bucket != NULL) {
3233 			zone->uz_xdomain += bucket->ub_cnt;
3234 			if (vm_ndomains > 2 ||
3235 			    zone->uz_bkt_count >= zone->uz_bkt_max) {
3236 				ZONE_UNLOCK(zone);
3237 				bucket_drain(zone, bucket);
3238 				bucket_free(zone, bucket, udata);
3239 			} else {
3240 				zdom = &zone->uz_domain[itemdomain];
3241 				zone_put_bucket(zone, zdom, bucket, true);
3242 				ZONE_UNLOCK(zone);
3243 			}
3244 		} else
3245 			ZONE_UNLOCK(zone);
3246 		bucket = bucket_alloc(zone, udata, M_NOWAIT);
3247 		if (bucket == NULL)
3248 			goto zfree_item;
3249 		critical_enter();
3250 		cpu = curcpu;
3251 		cache = &zone->uz_cpu[cpu];
3252 		if (cache->uc_crossbucket == NULL) {
3253 			cache->uc_crossbucket = bucket;
3254 			goto zfree_start;
3255 		}
3256 		critical_exit();
3257 		bucket_free(zone, bucket, udata);
3258 		goto zfree_restart;
3259 	}
3260 #endif
3261 
3262 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3263 		zdom = &zone->uz_domain[domain];
3264 	} else {
3265 		domain = 0;
3266 		zdom = &zone->uz_domain[0];
3267 	}
3268 
3269 	/* Can we throw this on the zone full list? */
3270 	if (bucket != NULL) {
3271 		CTR3(KTR_UMA,
3272 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3273 		    zone->uz_name, zone, bucket);
3274			/* The bucket must be completely full to go on the full list. */
3275 		KASSERT(bucket->ub_cnt == bucket->ub_entries,
3276			    ("uma_zfree: Attempting to insert a bucket that is not full onto the full list."));
3277 		if (zone->uz_bkt_count >= zone->uz_bkt_max) {
3278 			ZONE_UNLOCK(zone);
3279 			bucket_drain(zone, bucket);
3280 			bucket_free(zone, bucket, udata);
3281 			goto zfree_restart;
3282 		} else
3283 			zone_put_bucket(zone, zdom, bucket, true);
3284 	}
3285 
3286 	/*
3287 	 * We bump the uz count when the cache size is insufficient to
3288 	 * handle the working set.
3289 	 */
3290 	if (lockfail && zone->uz_count < zone->uz_count_max)
3291 		zone->uz_count++;
3292 	ZONE_UNLOCK(zone);
3293 
3294 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3295 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3296 	    zone->uz_name, zone, bucket);
3297 	if (bucket) {
3298 		critical_enter();
3299 		cpu = curcpu;
3300 		cache = &zone->uz_cpu[cpu];
3301 		if (cache->uc_freebucket == NULL &&
3302 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3303 		    domain == PCPU_GET(domain))) {
3304 			cache->uc_freebucket = bucket;
3305 			goto zfree_start;
3306 		}
3307 		/*
3308 		 * We lost the race, start over.  We have to drop our
3309 		 * critical section to free the bucket.
3310 		 */
3311 		critical_exit();
3312 		bucket_free(zone, bucket, udata);
3313 		goto zfree_restart;
3314 	}
3315 
3316 	/*
3317 	 * If nothing else caught this, we'll just do an internal free.
3318 	 */
3319 zfree_item:
3320 	zone_free_item(zone, item, udata, SKIP_DTOR);
3321 }
3322 
3323 void
3324 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3325 {
3326 
3327 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3328 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3329 
3330 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3331 	    zone->uz_name);
3332 
3333 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3334 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3335 
3336         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3337         if (item == NULL)
3338                 return;
3339 	zone_free_item(zone, item, udata, SKIP_NONE);
3340 }
3341 
3342 static void
3343 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
3344 {
3345 	uma_keg_t keg;
3346 	uma_domain_t dom;
3347 	uint8_t freei;
3348 
3349 	keg = zone->uz_keg;
3350 	MPASS(zone->uz_lockptr == &keg->uk_lock);
3351 	KEG_LOCK_ASSERT(keg);
3352 	MPASS(keg == slab->us_keg);
3353 
3354 	dom = &keg->uk_domain[slab->us_domain];
3355 
3356 	/* Do we need to remove from any lists? */
3357 	if (slab->us_freecount+1 == keg->uk_ipers) {
3358 		LIST_REMOVE(slab, us_link);
3359 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3360 	} else if (slab->us_freecount == 0) {
3361 		LIST_REMOVE(slab, us_link);
3362 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3363 	}
3364 
3365 	/* Slab management. */
3366 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3367 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3368 	slab->us_freecount++;
3369 
3370 	/* Keg statistics. */
3371 	keg->uk_free++;
3372 }
3373 
3374 static void
3375 zone_release(uma_zone_t zone, void **bucket, int cnt)
3376 {
3377 	void *item;
3378 	uma_slab_t slab;
3379 	uma_keg_t keg;
3380 	uint8_t *mem;
3381 	int i;
3382 
3383 	keg = zone->uz_keg;
3384 	KEG_LOCK(keg);
3385 	for (i = 0; i < cnt; i++) {
3386 		item = bucket[i];
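		/*
		 * Find the slab backing this item: either directly from the
		 * page via vtoslab(), or by masking the item down to its
		 * page run and using the keg's hash table or the header
		 * stored at uk_pgoff within the run.
		 */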
3387 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3388 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3389 			if (zone->uz_flags & UMA_ZONE_HASH) {
3390 				slab = hash_sfind(&keg->uk_hash, mem);
3391 			} else {
3392 				mem += keg->uk_pgoff;
3393 				slab = (uma_slab_t)mem;
3394 			}
3395 		} else {
3396 			slab = vtoslab((vm_offset_t)item);
3397 			MPASS(slab->us_keg == keg);
3398 		}
3399 		slab_free_item(zone, slab, item);
3400 	}
3401 	KEG_UNLOCK(keg);
3402 }
3403 
3404 /*
3405  * Frees a single item to any zone.
3406  *
3407  * Arguments:
3408  *	zone   The zone to free to
3409  *	item   The item we're freeing
3410  *	udata  User supplied data for the dtor
3411  *	skip   Skip dtors and finis
3412  */
3413 static void
3414 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3415 {
3416 #ifdef INVARIANTS
3417 	bool skipdbg;
3418 
3419 	skipdbg = uma_dbg_zskip(zone, item);
3420 	if (skip == SKIP_NONE && !skipdbg) {
3421 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3422 			uma_dbg_free(zone, udata, item);
3423 		else
3424 			uma_dbg_free(zone, NULL, item);
3425 	}
3426 
3427 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3428 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3429 	    zone->uz_ctor != trash_ctor))
3430 #else
3431 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3432 #endif
3433 		zone->uz_dtor(item, zone->uz_size, udata);
3434 
3435 	if (skip < SKIP_FINI && zone->uz_fini)
3436 		zone->uz_fini(item, zone->uz_size);
3437 
3438 	zone->uz_release(zone->uz_arg, &item, 1);
3439 
3440 	if (skip & SKIP_CNT)
3441 		return;
3442 
3443 	counter_u64_add(zone->uz_frees, 1);
3444 
3445 	if (zone->uz_max_items > 0) {
3446 		ZONE_LOCK(zone);
3447 		zone->uz_items--;
3448 		if (zone->uz_sleepers > 0 &&
3449 		    zone->uz_items < zone->uz_max_items)
3450 			wakeup_one(zone);
3451 		ZONE_UNLOCK(zone);
3452 	}
3453 }
3454 
3455 /* See uma.h */
3456 int
3457 uma_zone_set_max(uma_zone_t zone, int nitems)
3458 {
3459 	struct uma_bucket_zone *ubz;
3460 
3461 	/*
3462		 * If the limit is very low we may need to restrict how
3463		 * many items are allowed in the per-CPU caches.
3464 	 */
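	/*
	 * For example (illustrative numbers): with 4 CPUs and nitems == 256,
	 * the loop below settles on the largest bucket size whose worst case
	 * of two full buckets per CPU, 2 * entries * 4, still fits under the
	 * limit.  If even the smallest bucket zone is too large, nitems is
	 * raised to cover it instead.
	 */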
3465 	ubz = &bucket_zones[0];
3466 	for (; ubz->ubz_entries != 0; ubz++)
3467 		if (ubz->ubz_entries * 2 * mp_ncpus > nitems)
3468 			break;
3469 	if (ubz == &bucket_zones[0])
3470 		nitems = ubz->ubz_entries * 2 * mp_ncpus;
3471 	else
3472 		ubz--;
3473 
3474 	ZONE_LOCK(zone);
3475 	zone->uz_count_max = zone->uz_count = ubz->ubz_entries;
3476 	if (zone->uz_count_min > zone->uz_count_max)
3477 		zone->uz_count_min = zone->uz_count_max;
3478 	zone->uz_max_items = nitems;
3479 	ZONE_UNLOCK(zone);
3480 
3481 	return (nitems);
3482 }
3483 
3484 /* See uma.h */
3485 int
3486 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
3487 {
3488 
3489 	ZONE_LOCK(zone);
3490 	zone->uz_bkt_max = nitems;
3491 	ZONE_UNLOCK(zone);
3492 
3493 	return (nitems);
3494 }
3495 
3496 /* See uma.h */
3497 int
3498 uma_zone_get_max(uma_zone_t zone)
3499 {
3500 	int nitems;
3501 
3502 	ZONE_LOCK(zone);
3503 	nitems = zone->uz_max_items;
3504 	ZONE_UNLOCK(zone);
3505 
3506 	return (nitems);
3507 }
3508 
3509 /* See uma.h */
3510 void
3511 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3512 {
3513 
3514 	ZONE_LOCK(zone);
3515 	zone->uz_warning = warning;
3516 	ZONE_UNLOCK(zone);
3517 }
3518 
3519 /* See uma.h */
3520 void
3521 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3522 {
3523 
3524 	ZONE_LOCK(zone);
3525 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3526 	ZONE_UNLOCK(zone);
3527 }
3528 
3529 /* See uma.h */
3530 int
3531 uma_zone_get_cur(uma_zone_t zone)
3532 {
3533 	int64_t nitems;
3534 	u_int i;
3535 
3536 	ZONE_LOCK(zone);
3537 	nitems = counter_u64_fetch(zone->uz_allocs) -
3538 	    counter_u64_fetch(zone->uz_frees);
3539 	CPU_FOREACH(i) {
3540 		/*
3541 		 * See the comment in uma_vm_zone_stats() regarding the
3542 		 * safety of accessing the per-cpu caches. With the zone lock
3543 		 * held, it is safe, but can potentially result in stale data.
3544 		 */
3545 		nitems += zone->uz_cpu[i].uc_allocs -
3546 		    zone->uz_cpu[i].uc_frees;
3547 	}
3548 	ZONE_UNLOCK(zone);
3549 
3550 	return (nitems < 0 ? 0 : nitems);
3551 }
3552 
3553 /* See uma.h */
3554 void
3555 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3556 {
3557 	uma_keg_t keg;
3558 
3559 	KEG_GET(zone, keg);
3560 	KEG_LOCK(keg);
3561 	KASSERT(keg->uk_pages == 0,
3562 	    ("uma_zone_set_init on non-empty keg"));
3563 	keg->uk_init = uminit;
3564 	KEG_UNLOCK(keg);
3565 }
3566 
3567 /* See uma.h */
3568 void
3569 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3570 {
3571 	uma_keg_t keg;
3572 
3573 	KEG_GET(zone, keg);
3574 	KEG_LOCK(keg);
3575 	KASSERT(keg->uk_pages == 0,
3576 	    ("uma_zone_set_fini on non-empty keg"));
3577 	keg->uk_fini = fini;
3578 	KEG_UNLOCK(keg);
3579 }
3580 
3581 /* See uma.h */
3582 void
3583 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3584 {
3585 
3586 	ZONE_LOCK(zone);
3587 	KASSERT(zone->uz_keg->uk_pages == 0,
3588 	    ("uma_zone_set_zinit on non-empty keg"));
3589 	zone->uz_init = zinit;
3590 	ZONE_UNLOCK(zone);
3591 }
3592 
3593 /* See uma.h */
3594 void
3595 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3596 {
3597 
3598 	ZONE_LOCK(zone);
3599 	KASSERT(zone->uz_keg->uk_pages == 0,
3600 	    ("uma_zone_set_zfini on non-empty keg"));
3601 	zone->uz_fini = zfini;
3602 	ZONE_UNLOCK(zone);
3603 }
3604 
3605 /* See uma.h */
3606 /* XXX uk_freef is not actually used with the zone locked */
3607 void
3608 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3609 {
3610 	uma_keg_t keg;
3611 
3612 	KEG_GET(zone, keg);
3613 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3614 	KEG_LOCK(keg);
3615 	keg->uk_freef = freef;
3616 	KEG_UNLOCK(keg);
3617 }
3618 
3619 /* See uma.h */
3620 /* XXX uk_allocf is not actually used with the zone locked */
3621 void
3622 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3623 {
3624 	uma_keg_t keg;
3625 
3626 	KEG_GET(zone, keg);
3627 	KEG_LOCK(keg);
3628 	keg->uk_allocf = allocf;
3629 	KEG_UNLOCK(keg);
3630 }
3631 
3632 /* See uma.h */
3633 void
3634 uma_zone_reserve(uma_zone_t zone, int items)
3635 {
3636 	uma_keg_t keg;
3637 
3638 	KEG_GET(zone, keg);
3639 	KEG_LOCK(keg);
3640 	keg->uk_reserve = items;
3641 	KEG_UNLOCK(keg);
3642 }
3643 
3644 /* See uma.h */
3645 int
3646 uma_zone_reserve_kva(uma_zone_t zone, int count)
3647 {
3648 	uma_keg_t keg;
3649 	vm_offset_t kva;
3650 	u_int pages;
3651 
3652 	KEG_GET(zone, keg);
3653 
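	/* Round the item count up to whole slabs, then convert to pages. */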
3654 	pages = count / keg->uk_ipers;
3655 	if (pages * keg->uk_ipers < count)
3656 		pages++;
3657 	pages *= keg->uk_ppera;
3658 
3659 #ifdef UMA_MD_SMALL_ALLOC
3660 	if (keg->uk_ppera > 1) {
3661 #else
3662 	if (1) {
3663 #endif
3664 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3665 		if (kva == 0)
3666 			return (0);
3667 	} else
3668 		kva = 0;
3669 
3670 	ZONE_LOCK(zone);
3671 	MPASS(keg->uk_kva == 0);
3672 	keg->uk_kva = kva;
3673 	keg->uk_offset = 0;
3674 	zone->uz_max_items = pages * keg->uk_ipers;
3675 #ifdef UMA_MD_SMALL_ALLOC
3676 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3677 #else
3678 	keg->uk_allocf = noobj_alloc;
3679 #endif
3680 	keg->uk_flags |= UMA_ZONE_NOFREE;
3681 	ZONE_UNLOCK(zone);
3682 
3683 	return (1);
3684 }
3685 
3686 /* See uma.h */
3687 void
3688 uma_prealloc(uma_zone_t zone, int items)
3689 {
3690 	struct vm_domainset_iter di;
3691 	uma_domain_t dom;
3692 	uma_slab_t slab;
3693 	uma_keg_t keg;
3694 	int aflags, domain, slabs;
3695 
3696 	KEG_GET(zone, keg);
3697 	KEG_LOCK(keg);
3698 	slabs = items / keg->uk_ipers;
3699 	if (slabs * keg->uk_ipers < items)
3700 		slabs++;
3701 	while (slabs-- > 0) {
3702 		aflags = M_NOWAIT;
3703 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3704 		    &aflags);
3705 		for (;;) {
3706 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
3707 			    aflags);
3708 			if (slab != NULL) {
3709 				MPASS(slab->us_keg == keg);
3710 				dom = &keg->uk_domain[slab->us_domain];
3711 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
3712 				    us_link);
3713 				break;
3714 			}
3715 			KEG_LOCK(keg);
3716 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
3717 				KEG_UNLOCK(keg);
3718 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3719 				KEG_LOCK(keg);
3720 			}
3721 		}
3722 	}
3723 	KEG_UNLOCK(keg);
3724 }
3725 
3726 /* See uma.h */
3727 void
3728 uma_reclaim(int req)
3729 {
3730 
3731 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3732 	sx_xlock(&uma_reclaim_lock);
3733 	bucket_enable();
3734 
3735 	switch (req) {
3736 	case UMA_RECLAIM_TRIM:
3737 		zone_foreach(zone_trim);
3738 		break;
3739 	case UMA_RECLAIM_DRAIN:
3740 	case UMA_RECLAIM_DRAIN_CPU:
3741 		zone_foreach(zone_drain);
3742 		if (req == UMA_RECLAIM_DRAIN_CPU) {
3743 			pcpu_cache_drain_safe(NULL);
3744 			zone_foreach(zone_drain);
3745 		}
3746 		break;
3747 	default:
3748 		panic("unhandled reclamation request %d", req);
3749 	}
3750 
3751 	/*
3752	 * Some slabs may have been freed, but this zone was visited early in
3753	 * the pass; visit it again so that we can free pages made empty once
3754	 * the other zones were drained.  We have to do the same for buckets.
3755 	 */
3756 	zone_drain(slabzone);
3757 	bucket_zone_drain();
3758 	sx_xunlock(&uma_reclaim_lock);
3759 }
3760 
3761 static volatile int uma_reclaim_needed;
3762 
3763 void
3764 uma_reclaim_wakeup(void)
3765 {
3766 
3767 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3768 		wakeup(uma_reclaim);
3769 }
3770 
3771 void
3772 uma_reclaim_worker(void *arg __unused)
3773 {
3774 
3775 	for (;;) {
3776 		sx_xlock(&uma_reclaim_lock);
3777 		while (atomic_load_int(&uma_reclaim_needed) == 0)
3778 			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
3779 			    hz);
3780 		sx_xunlock(&uma_reclaim_lock);
3781 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3782 		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
3783 		atomic_store_int(&uma_reclaim_needed, 0);
3784 		/* Don't fire more than once per second. */
3785 		pause("umarclslp", hz);
3786 	}
3787 }
3788 
3789 /* See uma.h */
3790 void
3791 uma_zone_reclaim(uma_zone_t zone, int req)
3792 {
3793 
3794 	switch (req) {
3795 	case UMA_RECLAIM_TRIM:
3796 		zone_trim(zone);
3797 		break;
3798 	case UMA_RECLAIM_DRAIN:
3799 		zone_drain(zone);
3800 		break;
3801 	case UMA_RECLAIM_DRAIN_CPU:
3802 		pcpu_cache_drain_safe(zone);
3803 		zone_drain(zone);
3804 		break;
3805 	default:
3806 		panic("unhandled reclamation request %d", req);
3807 	}
3808 }
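
/*
 * The per-zone variant lets a subsystem shed its own caches when it knows
 * they have gone cold, for example with a hypothetical zone:
 *
 *	uma_zone_reclaim(foo_zone, UMA_RECLAIM_TRIM);
 */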
3809 
3810 /* See uma.h */
3811 int
3812 uma_zone_exhausted(uma_zone_t zone)
3813 {
3814 	int full;
3815 
3816 	ZONE_LOCK(zone);
3817 	full = zone->uz_sleepers > 0;
3818 	ZONE_UNLOCK(zone);
3819 	return (full);
3820 }
3821 
3822 int
3823 uma_zone_exhausted_nolock(uma_zone_t zone)
3824 {
3825 	return (zone->uz_sleepers > 0);
3826 }
3827 
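/*
 * malloc(9) hands requests larger than the biggest kmem bucket zone to
 * these "large" routines.  Each such allocation gets a dedicated slab
 * header from slabzone, and vsetslab() links it to the mapping so that
 * free(9) can later recover the size and domain from the header alone.
 */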
3828 void *
3829 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3830 {
3831 	struct domainset *policy;
3832 	vm_offset_t addr;
3833 	uma_slab_t slab;
3834 
3835 	if (domain != UMA_ANYDOMAIN) {
3836 		/* avoid allocs targeting empty domains */
3837 		if (VM_DOMAIN_EMPTY(domain))
3838 			domain = UMA_ANYDOMAIN;
3839 	}
3840 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3841 	if (slab == NULL)
3842 		return (NULL);
3843 	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3844 	    DOMAINSET_FIXED(domain);
3845 	addr = kmem_malloc_domainset(policy, size, wait);
3846 	if (addr != 0) {
3847 		vsetslab(addr, slab);
3848 		slab->us_data = (void *)addr;
3849 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3850 		slab->us_size = size;
3851 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3852 		    pmap_kextract(addr)));
3853 		uma_total_inc(size);
3854 	} else {
3855 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3856 	}
3857 
3858 	return ((void *)addr);
3859 }
3860 
3861 void *
3862 uma_large_malloc(vm_size_t size, int wait)
3863 {
3864 
3865 	return (uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait));
3866 }
3867 
3868 void
3869 uma_large_free(uma_slab_t slab)
3870 {
3871 
3872 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3873 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3874 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3875 	uma_total_dec(slab->us_size);
3876 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3877 }
3878 
3879 static void
3880 uma_zero_item(void *item, uma_zone_t zone)
3881 {
3882 
3883 	bzero(item, zone->uz_size);
3884 }
3885 
3886 unsigned long
3887 uma_limit(void)
3888 {
3889 
3890 	return (uma_kmem_limit);
3891 }
3892 
3893 void
3894 uma_set_limit(unsigned long limit)
3895 {
3896 
3897 	uma_kmem_limit = limit;
3898 }
3899 
3900 unsigned long
3901 uma_size(void)
3902 {
3903 
3904 	return (atomic_load_long(&uma_kmem_total));
3905 }
3906 
3907 long
3908 uma_avail(void)
3909 {
3910 
3911 	return (uma_kmem_limit - uma_size());
3912 }
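
/*
 * Together these accessors expose the kmem accounting: uma_avail() is
 * always uma_limit() - uma_size().  A purely illustrative consumer that
 * trims caches when headroom drops below ten percent:
 *
 *	if (uma_avail() < (long)(uma_limit() / 10))
 *		uma_reclaim(UMA_RECLAIM_TRIM);
 */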
3913 
3914 void
3915 uma_print_stats(void)
3916 {
3917 	zone_foreach(uma_print_zone);
3918 }
3919 
3920 static void
3921 slab_print(uma_slab_t slab)
3922 {
3923 	printf("slab: keg %p, data %p, freecount %d\n",
3924 		slab->us_keg, slab->us_data, slab->us_freecount);
3925 }
3926 
3927 static void
3928 cache_print(uma_cache_t cache)
3929 {
3930 	printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)\n",
3931 		cache->uc_allocbucket,
3932 		cache->uc_allocbucket ? cache->uc_allocbucket->ub_cnt : 0,
3933 		cache->uc_freebucket,
3934 		cache->uc_freebucket ? cache->uc_freebucket->ub_cnt : 0,
3935 		cache->uc_crossbucket,
3936 		cache->uc_crossbucket ? cache->uc_crossbucket->ub_cnt : 0);
3937 }
3938 
3939 static void
3940 uma_print_keg(uma_keg_t keg)
3941 {
3942 	uma_domain_t dom;
3943 	uma_slab_t slab;
3944 	int i;
3945 
3946 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3947 	    "out %d free %d\n",
3948 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3949 	    keg->uk_ipers, keg->uk_ppera,
3950 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3951 	    keg->uk_free);
3952 	for (i = 0; i < vm_ndomains; i++) {
3953 		dom = &keg->uk_domain[i];
3954 		printf("Part slabs:\n");
3955 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3956 			slab_print(slab);
3957 		printf("Free slabs:\n");
3958 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3959 			slab_print(slab);
3960 		printf("Full slabs:\n");
3961 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3962 			slab_print(slab);
3963 	}
3964 }
3965 
3966 void
3967 uma_print_zone(uma_zone_t zone)
3968 {
3969 	uma_cache_t cache;
3970 	int i;
3971 
3972 	printf("zone: %s(%p) size %d maxitems %ju flags %#x\n",
3973 	    zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items,
3974 	    zone->uz_flags);
3975 	if (zone->uz_lockptr != &zone->uz_lock)
3976 		uma_print_keg(zone->uz_keg);
3977 	CPU_FOREACH(i) {
3978 		cache = &zone->uz_cpu[i];
3979 		printf("CPU %d Cache:\n", i);
3980 		cache_print(cache);
3981 	}
3982 }
3983 
3984 #ifdef DDB
3985 /*
3986  * Generate statistics across both the zone and its per-CPU caches.  Return
3987  * desired statistics if the pointer is non-NULL for that statistic.
3988  *
3989  * Note: does not update the zone statistics, as it can't safely clear the
3990  * per-CPU cache statistic.
3991  *
3992  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3993  * safe from off-CPU; we should modify the caches to track this information
3994  * directly so that we don't have to.
3995  */
3996 static void
3997 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
3998     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
3999 {
4000 	uma_cache_t cache;
4001 	uint64_t allocs, frees, sleeps, xdomain;
4002 	int cachefree, cpu;
4003 
4004 	allocs = frees = sleeps = xdomain = 0;
4005 	cachefree = 0;
4006 	CPU_FOREACH(cpu) {
4007 		cache = &z->uz_cpu[cpu];
4008 		if (cache->uc_allocbucket != NULL)
4009 			cachefree += cache->uc_allocbucket->ub_cnt;
4010 		if (cache->uc_freebucket != NULL)
4011 			cachefree += cache->uc_freebucket->ub_cnt;
4012 		if (cache->uc_crossbucket != NULL) {
4013 			xdomain += cache->uc_crossbucket->ub_cnt;
4014 			cachefree += cache->uc_crossbucket->ub_cnt;
4015 		}
4016 		allocs += cache->uc_allocs;
4017 		frees += cache->uc_frees;
4018 	}
4019 	allocs += counter_u64_fetch(z->uz_allocs);
4020 	frees += counter_u64_fetch(z->uz_frees);
4021 	sleeps += z->uz_sleeps;
4022 	xdomain += z->uz_xdomain;
4023 	if (cachefreep != NULL)
4024 		*cachefreep = cachefree;
4025 	if (allocsp != NULL)
4026 		*allocsp = allocs;
4027 	if (freesp != NULL)
4028 		*freesp = frees;
4029 	if (sleepsp != NULL)
4030 		*sleepsp = sleeps;
4031 	if (xdomainp != NULL)
4032 		*xdomainp = xdomain;
4033 }
4034 #endif /* DDB */
4035 
4036 static int
4037 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4038 {
4039 	uma_keg_t kz;
4040 	uma_zone_t z;
4041 	int count;
4042 
4043 	count = 0;
4044 	rw_rlock(&uma_rwlock);
4045 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4046 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4047 			count++;
4048 	}
4049 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4050 		count++;
4051 
4052 	rw_runlock(&uma_rwlock);
4053 	return (sysctl_handle_int(oidp, &count, 0, req));
4054 }
4055 
4056 static void
4057 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4058     struct uma_percpu_stat *ups, bool internal)
4059 {
4060 	uma_zone_domain_t zdom;
4061 	uma_bucket_t bucket;
4062 	uma_cache_t cache;
4063 	int i;
4064 
4065 
4066 	for (i = 0; i < vm_ndomains; i++) {
4067 		zdom = &z->uz_domain[i];
4068 		uth->uth_zone_free += zdom->uzd_nitems;
4069 	}
4070 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4071 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
4072 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
4073 	uth->uth_sleeps = z->uz_sleeps;
4074 	uth->uth_xdomain = z->uz_xdomain;
4075 
4076 	/*
4077 	 * While it is not normally safe to access the cache bucket pointers
4078 	 * while not on the CPU that owns the cache, we only allow the pointers
4079 	 * to be exchanged without the zone lock held, not invalidated, so
4080 	 * accept the possible race associated with bucket exchange during
4081 	 * monitoring.  Use atomic_load_ptr() to ensure that the bucket pointers
4082 	 * are loaded only once.
4083 	 */
4084 	for (i = 0; i < mp_maxid + 1; i++) {
4085 		bzero(&ups[i], sizeof(*ups));
4086 		if (internal || CPU_ABSENT(i))
4087 			continue;
4088 		cache = &z->uz_cpu[i];
4089 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket);
4090 		if (bucket != NULL)
4091 			ups[i].ups_cache_free += bucket->ub_cnt;
4092 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket);
4093 		if (bucket != NULL)
4094 			ups[i].ups_cache_free += bucket->ub_cnt;
4095 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket);
4096 		if (bucket != NULL)
4097 			ups[i].ups_cache_free += bucket->ub_cnt;
4098 		ups[i].ups_allocs = cache->uc_allocs;
4099 		ups[i].ups_frees = cache->uc_frees;
4100 	}
4101 }
4102 
4103 static int
4104 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4105 {
4106 	struct uma_stream_header ush;
4107 	struct uma_type_header uth;
4108 	struct uma_percpu_stat *ups;
4109 	struct sbuf sbuf;
4110 	uma_keg_t kz;
4111 	uma_zone_t z;
4112 	int count, error, i;
4113 
4114 	error = sysctl_wire_old_buffer(req, 0);
4115 	if (error != 0)
4116 		return (error);
4117 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4118 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4119 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4120 
4121 	count = 0;
4122 	rw_rlock(&uma_rwlock);
4123 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4124 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4125 			count++;
4126 	}
4127 
4128 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4129 		count++;
4130 
4131 	/*
4132 	 * Insert stream header.
4133 	 */
4134 	bzero(&ush, sizeof(ush));
4135 	ush.ush_version = UMA_STREAM_VERSION;
4136 	ush.ush_maxcpus = (mp_maxid + 1);
4137 	ush.ush_count = count;
4138 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4139 
4140 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4141 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4142 			bzero(&uth, sizeof(uth));
4143 			ZONE_LOCK(z);
4144 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4145 			uth.uth_align = kz->uk_align;
4146 			uth.uth_size = kz->uk_size;
4147 			uth.uth_rsize = kz->uk_rsize;
4148 			if (z->uz_max_items > 0)
4149 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
4150 				    kz->uk_ppera;
4151 			else
4152 				uth.uth_pages = kz->uk_pages;
4153 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
4154 			    kz->uk_ppera;
4155 			uth.uth_limit = z->uz_max_items;
4156 			uth.uth_keg_free = z->uz_keg->uk_free;
4157 
4158 			/*
4159 			 * A zone is secondary if it is not the first entry
4160 			 * on the keg's zone list.
4161 			 */
4162 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4163 			    (LIST_FIRST(&kz->uk_zones) != z))
4164 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4165 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
4166 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
4167 			ZONE_UNLOCK(z);
4168 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4169 			for (i = 0; i < mp_maxid + 1; i++)
4170 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4171 		}
4172 	}
4173 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4174 		bzero(&uth, sizeof(uth));
4175 		ZONE_LOCK(z);
4176 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4177 		uth.uth_size = z->uz_size;
4178 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
4179 		ZONE_UNLOCK(z);
4180 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4181 		for (i = 0; i < mp_maxid + 1; i++)
4182 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4183 	}
4184 
4185 	rw_runlock(&uma_rwlock);
4186 	error = sbuf_finish(&sbuf);
4187 	sbuf_delete(&sbuf);
4188 	free(ups, M_TEMP);
4189 	return (error);
4190 }
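
/*
 * The stream built above is what libmemstat(3), and thus "vmstat -z",
 * parses: a single uma_stream_header followed, for every zone, by one
 * uma_type_header and ush_maxcpus uma_percpu_stat records.
 */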
4191 
4192 int
4193 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4194 {
4195 	uma_zone_t zone = *(uma_zone_t *)arg1;
4196 	int error, max;
4197 
4198 	max = uma_zone_get_max(zone);
4199 	error = sysctl_handle_int(oidp, &max, 0, req);
4200 	if (error || !req->newptr)
4201 		return (error);
4202 
4203 	uma_zone_set_max(zone, max);
4204 
4205 	return (0);
4206 }
4207 
4208 int
4209 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4210 {
4211 	uma_zone_t zone = *(uma_zone_t *)arg1;
4212 	int cur;
4213 
4214 	cur = uma_zone_get_cur(zone);
4215 	return (sysctl_handle_int(oidp, &cur, 0, req));
4216 }
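
/*
 * Both handlers expect arg1 to point at a uma_zone_t variable; a subsystem
 * exporting a zone limit might wire one up along these lines (hypothetical
 * oid and zone):
 *
 *	SYSCTL_PROC(_vm_foo, OID_AUTO, max_items,
 *	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum number of foo items");
 */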
4217 
4218 #ifdef INVARIANTS
4219 static uma_slab_t
4220 uma_dbg_getslab(uma_zone_t zone, void *item)
4221 {
4222 	uma_slab_t slab;
4223 	uma_keg_t keg;
4224 	uint8_t *mem;
4225 
4226 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4227 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4228 		slab = vtoslab((vm_offset_t)mem);
4229 	} else {
4230 		/*
4231 		 * It is safe to return the slab here even though the
4232 		 * zone is unlocked because the item's allocation state
4233 		 * essentially holds a reference.
4234 		 */
4235 		if (zone->uz_lockptr == &zone->uz_lock)
4236 			return (NULL);
4237 		ZONE_LOCK(zone);
4238 		keg = zone->uz_keg;
4239 		if (keg->uk_flags & UMA_ZONE_HASH)
4240 			slab = hash_sfind(&keg->uk_hash, mem);
4241 		else
4242 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4243 		ZONE_UNLOCK(zone);
4244 	}
4245 
4246 	return (slab);
4247 }
4248 
4249 static bool
4250 uma_dbg_zskip(uma_zone_t zone, void *mem)
4251 {
4252 
4253 	if (zone->uz_lockptr == &zone->uz_lock)
4254 		return (true);
4255 
4256 	return (uma_dbg_kskip(zone->uz_keg, mem));
4257 }
4258 
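/*
 * Cheap sampling of which items receive the expensive use-after-free
 * checks: the item's page and slot are folded into a global index and only
 * indices that are multiples of dbg_divisor are verified.  A divisor of 1
 * checks every item, while 0 disables the checks entirely.
 */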
4259 static bool
4260 uma_dbg_kskip(uma_keg_t keg, void *mem)
4261 {
4262 	uintptr_t idx;
4263 
4264 	if (dbg_divisor == 0)
4265 		return (true);
4266 
4267 	if (dbg_divisor == 1)
4268 		return (false);
4269 
4270 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4271 	if (keg->uk_ipers > 1) {
4272 		idx *= keg->uk_ipers;
4273 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4274 	}
4275 
4276 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4277 		counter_u64_add(uma_skip_cnt, 1);
4278 		return (true);
4279 	}
4280 	counter_u64_add(uma_dbg_cnt, 1);
4281 
4282 	return (false);
4283 }
4284 
4285 /*
4286  * Set up the slab's freei data such that uma_dbg_free can function.
4287  *
4288  */
4289 static void
4290 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4291 {
4292 	uma_keg_t keg;
4293 	int freei;
4294 
4295 	if (slab == NULL) {
4296 		slab = uma_dbg_getslab(zone, item);
4297 		if (slab == NULL)
4298 			panic("uma: item %p did not belong to zone %s\n",
4299 			    item, zone->uz_name);
4300 	}
4301 	keg = slab->us_keg;
4302 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4303 
4304 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4305 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4306 		    item, zone, zone->uz_name, slab, freei);
4307 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4308 
4309 	return;
4310 }
4311 
4312 /*
4313  * Verifies freed addresses.  Checks for alignment, valid slab membership
4314  * and duplicate frees.
4315  *
4316  */
4317 static void
4318 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4319 {
4320 	uma_keg_t keg;
4321 	int freei;
4322 
4323 	if (slab == NULL) {
4324 		slab = uma_dbg_getslab(zone, item);
4325 		if (slab == NULL)
4326 			panic("uma: Freed item %p did not belong to zone %s\n",
4327 			    item, zone->uz_name);
4328 	}
4329 	keg = slab->us_keg;
4330 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4331 
4332 	if (freei >= keg->uk_ipers)
4333 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4334 		    item, zone, zone->uz_name, slab, freei);
4335 
4336 	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4337 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4338 		    item, zone, zone->uz_name, slab, freei);
4339 
4340 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4341 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4342 		    item, zone, zone->uz_name, slab, freei);
4343 
4344 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4345 }
4346 #endif /* INVARIANTS */
4347 
4348 #ifdef DDB
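/*
 * Sum a zone's statistics for the DDB commands below and return its
 * approximate footprint, (items in use + items cached) * item size, which
 * is what the "Total Mem" column of "show uma" reports.
 */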
4349 static int64_t
4350 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
4351     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
4352 {
4353 	uint64_t frees;
4354 	int i;
4355 
4356 	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4357 		*allocs = counter_u64_fetch(z->uz_allocs);
4358 		frees = counter_u64_fetch(z->uz_frees);
4359 		*sleeps = z->uz_sleeps;
4360 		*cachefree = 0;
4361 		*xdomain = 0;
4362 	} else
4363 		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
4364 		    xdomain);
4365 	if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4366 	    (LIST_FIRST(&kz->uk_zones) != z)))
4367 		*cachefree += kz->uk_free;
4368 	for (i = 0; i < vm_ndomains; i++)
4369 		*cachefree += z->uz_domain[i].uzd_nitems;
4370 	*used = *allocs - frees;
4371 	return (((int64_t)*used + *cachefree) * kz->uk_size);
4372 }
4373 
4374 DB_SHOW_COMMAND(uma, db_show_uma)
4375 {
4376 	const char *fmt_hdr, *fmt_entry;
4377 	uma_keg_t kz;
4378 	uma_zone_t z;
4379 	uint64_t allocs, used, sleeps, xdomain;
4380 	long cachefree;
4381 	/* variables for sorting */
4382 	uma_keg_t cur_keg;
4383 	uma_zone_t cur_zone, last_zone;
4384 	int64_t cur_size, last_size, size;
4385 	int ties;
4386 
4387 	/* /i option produces machine-parseable CSV output */
4388 	if (modif[0] == 'i') {
4389 		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
4390 		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
4391 	} else {
4392 		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
4393 		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
4394 	}
4395 
4396 	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
4397 	    "Sleeps", "Bucket", "Total Mem", "XFree");
4398 
4399 	/* Sort the zones with largest size first. */
4400 	last_zone = NULL;
4401 	last_size = INT64_MAX;
4402 	for (;;) {
4403 		cur_zone = NULL;
4404 		cur_size = -1;
4405 		ties = 0;
4406 		LIST_FOREACH(kz, &uma_kegs, uk_link) {
4407 			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4408 				/*
4409 				 * In the case of size ties, print out zones
4410 				 * in the order they are encountered.  That is,
4411 				 * when we encounter the most recently output
4412 				 * zone, we have already printed all preceding
4413 				 * ties, and we must print all following ties.
4414 				 */
4415 				if (z == last_zone) {
4416 					ties = 1;
4417 					continue;
4418 				}
4419 				size = get_uma_stats(kz, z, &allocs, &used,
4420 				    &sleeps, &cachefree, &xdomain);
4421 				if (size > cur_size && size < last_size + ties)
4422 				{
4423 					cur_size = size;
4424 					cur_zone = z;
4425 					cur_keg = kz;
4426 				}
4427 			}
4428 		}
4429 		if (cur_zone == NULL)
4430 			break;
4431 
4432 		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
4433 		    &sleeps, &cachefree, &xdomain);
4434 		db_printf(fmt_entry, cur_zone->uz_name,
4435 		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
4436 		    (uintmax_t)allocs, (uintmax_t)sleeps,
4437 		    (unsigned)cur_zone->uz_count, (intmax_t)size, xdomain);
4438 
4439 		if (db_pager_quit)
4440 			return;
4441 		last_zone = cur_zone;
4442 		last_size = cur_size;
4443 	}
4444 }
4445 
4446 DB_SHOW_COMMAND(umacache, db_show_umacache)
4447 {
4448 	uma_zone_t z;
4449 	uint64_t allocs, frees;
4450 	long cachefree;
4451 	int i;
4452 
4453 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4454 	    "Requests", "Bucket");
4455 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4456 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
4457 		for (i = 0; i < vm_ndomains; i++)
4458 			cachefree += z->uz_domain[i].uzd_nitems;
4459 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4460 		    z->uz_name, (uintmax_t)z->uz_size,
4461 		    (intmax_t)(allocs - frees), cachefree,
4462 		    (uintmax_t)allocs, z->uz_count);
4463 		if (db_pager_quit)
4464 			return;
4465 	}
4466 }
4467 #endif	/* DDB */
4468