xref: /freebsd/sys/vm/uma_core.c (revision e9b148a3185f41e3a09e91ea75cae7828d908845)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory allocator
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/domainset.h>
63 #include <sys/eventhandler.h>
64 #include <sys/kernel.h>
65 #include <sys/types.h>
66 #include <sys/limits.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/random.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vmmeter.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_domainset.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_phys.h>
89 #include <vm/vm_pagequeue.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94 #include <vm/uma_int.h>
95 #include <vm/uma_dbg.h>
96 
97 #include <ddb/ddb.h>
98 
99 #ifdef DEBUG_MEMGUARD
100 #include <vm/memguard.h>
101 #endif
102 
103 /*
104  * This is the zone and keg from which all zones are spawned.
105  */
106 static uma_zone_t kegs;
107 static uma_zone_t zones;
108 
109 /* This is the zone from which all offpage uma_slab_ts are allocated. */
110 static uma_zone_t slabzone;
111 
112 /*
113  * The initial hash tables come out of this zone so they can be allocated
114  * prior to malloc coming up.
115  */
116 static uma_zone_t hashzone;
117 
118 /* The boot-time adjusted value for cache line alignment. */
119 int uma_align_cache = 64 - 1;
120 
121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122 
123 /*
124  * Are we allowed to allocate buckets?
125  */
126 static int bucketdisable = 1;
127 
128 /* Linked list of all kegs in the system */
129 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
130 
131 /* Linked list of all cache-only zones in the system */
132 static LIST_HEAD(,uma_zone) uma_cachezones =
133     LIST_HEAD_INITIALIZER(uma_cachezones);
134 
135 /* This RW lock protects the keg list */
136 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
137 
138 /*
139  * Pointer to, and count of, the pool of pages preallocated at
140  * startup to bootstrap UMA.
141  */
142 static char *bootmem;
143 static int boot_pages;
144 
145 static struct sx uma_reclaim_lock;
146 
147 /*
148  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
149  * allocations don't trigger a wakeup of the reclaim thread.
150  */
151 static unsigned long uma_kmem_limit = LONG_MAX;
152 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
153     "UMA kernel memory soft limit");
154 static unsigned long uma_kmem_total;
155 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
156     "UMA kernel memory usage");
157 
158 /* Is the VM done starting up? */
159 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
160     BOOT_RUNNING } booted = BOOT_COLD;
161 
162 /*
163  * This is the handle used to schedule events that need to happen
164  * outside of the allocation fast path.
165  */
166 static struct callout uma_callout;
167 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
168 
169 /*
170  * This structure is passed as the zone ctor arg so that I don't have to create
171  * a special allocation function just for zones.
172  */
173 struct uma_zctor_args {
174 	const char *name;
175 	size_t size;
176 	uma_ctor ctor;
177 	uma_dtor dtor;
178 	uma_init uminit;
179 	uma_fini fini;
180 	uma_import import;
181 	uma_release release;
182 	void *arg;
183 	uma_keg_t keg;
184 	int align;
185 	uint32_t flags;
186 };
187 
188 struct uma_kctor_args {
189 	uma_zone_t zone;
190 	size_t size;
191 	uma_init uminit;
192 	uma_fini fini;
193 	int align;
194 	uint32_t flags;
195 };
196 
197 struct uma_bucket_zone {
198 	uma_zone_t	ubz_zone;
199 	char		*ubz_name;
200 	int		ubz_entries;	/* Number of items it can hold. */
201 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
202 };
203 
204 /*
205  * Compute the actual number of bucket entries so that buckets pack into
206  * power-of-two allocation sizes, for more efficient space utilization.
207  */
208 #define	BUCKET_SIZE(n)						\
209     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
210 
211 #define	BUCKET_MAX	BUCKET_SIZE(256)
212 #define	BUCKET_MIN	BUCKET_SIZE(4)
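/*
 * As a rough illustration (assuming 8-byte pointers and, hypothetically, a
 * 24-byte struct uma_bucket header), BUCKET_SIZE(16) = (16 * 8 - 24) / 8 = 13,
 * so a "16 Bucket" allocation holds 13 item pointers and the whole bucket
 * occupies 24 + 13 * 8 = 128 bytes.
 */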
213 
214 struct uma_bucket_zone bucket_zones[] = {
215 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
216 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
217 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
218 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
219 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
220 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
221 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
222 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
223 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
224 	{ NULL, NULL, 0}
225 };
226 
227 /*
228  * Flags and enumerations to be passed to internal functions.
229  */
230 enum zfreeskip {
231 	SKIP_NONE =	0,
232 	SKIP_CNT =	0x00000001,
233 	SKIP_DTOR =	0x00010000,
234 	SKIP_FINI =	0x00020000,
235 };
236 
237 /* Prototypes. */
238 
239 int	uma_startup_count(int);
240 void	uma_startup(void *, int);
241 void	uma_startup1(void);
242 void	uma_startup2(void);
243 
244 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
245 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
246 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248 static void page_free(void *, vm_size_t, uint8_t);
249 static void pcpu_page_free(void *, vm_size_t, uint8_t);
250 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
251 static void cache_drain(uma_zone_t);
252 static void bucket_drain(uma_zone_t, uma_bucket_t);
253 static void bucket_cache_reclaim(uma_zone_t zone, bool);
254 static int keg_ctor(void *, int, void *, int);
255 static void keg_dtor(void *, int, void *);
256 static int zone_ctor(void *, int, void *, int);
257 static void zone_dtor(void *, int, void *);
258 static int zero_init(void *, int, int);
259 static void keg_small_init(uma_keg_t keg);
260 static void keg_large_init(uma_keg_t keg);
261 static void zone_foreach(void (*zfunc)(uma_zone_t));
262 static void zone_timeout(uma_zone_t zone);
263 static int hash_alloc(struct uma_hash *, u_int);
264 static int hash_expand(struct uma_hash *, struct uma_hash *);
265 static void hash_free(struct uma_hash *hash);
266 static void uma_timeout(void *);
267 static void uma_startup3(void);
268 static void *zone_alloc_item(uma_zone_t, void *, int, int);
269 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
270 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
271 static void bucket_enable(void);
272 static void bucket_init(void);
273 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
274 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
275 static void bucket_zone_drain(void);
276 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
277 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
278 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
279 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
280 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
281     uma_fini fini, int align, uint32_t flags);
282 static int zone_import(uma_zone_t, void **, int, int, int);
283 static void zone_release(uma_zone_t, void **, int);
284 static void uma_zero_item(void *, uma_zone_t);
285 
286 void uma_print_zone(uma_zone_t);
287 void uma_print_stats(void);
288 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
289 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
290 
291 #ifdef INVARIANTS
292 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
293 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
294 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
295 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
296 
297 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
298     "Memory allocation debugging");
299 
300 static u_int dbg_divisor = 1;
301 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
302     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
303     "Debug & thrash every this item in memory allocator");
304 
305 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
306 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
307 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
308     &uma_dbg_cnt, "memory items debugged");
309 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
310     &uma_skip_cnt, "memory items skipped, not debugged");
311 #endif
312 
313 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
314 
315 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
316     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
317 
318 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
319     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
320 
321 static int zone_warnings = 1;
322 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
323     "Warn when UMA zones becomes full");
324 
325 /* Adjust bytes under management by UMA. */
326 static inline void
327 uma_total_dec(unsigned long size)
328 {
329 
330 	atomic_subtract_long(&uma_kmem_total, size);
331 }
332 
333 static inline void
334 uma_total_inc(unsigned long size)
335 {
336 
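	/*
	 * Note: atomic_fetchadd_long() returns the pre-add total, so the
	 * reclaim thread is woken on the first allocation after the running
	 * total has already exceeded the soft limit.
	 */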
337 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
338 		uma_reclaim_wakeup();
339 }
340 
341 /*
342  * This routine checks to see whether or not it's safe to enable buckets.
343  */
344 static void
345 bucket_enable(void)
346 {
347 	bucketdisable = vm_page_count_min();
348 }
349 
350 /*
351  * Initialize bucket_zones, the array of zones of buckets of various sizes.
352  *
353  * For each zone, calculate the memory required for each bucket, consisting
354  * of the header and an array of pointers.
355  */
356 static void
357 bucket_init(void)
358 {
359 	struct uma_bucket_zone *ubz;
360 	int size;
361 
362 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
363 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
364 		size += sizeof(void *) * ubz->ubz_entries;
365 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
366 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
367 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
368 	}
369 }
370 
371 /*
372  * Given a desired number of entries for a bucket, return the zone from which
373  * to allocate the bucket.
374  */
375 static struct uma_bucket_zone *
376 bucket_zone_lookup(int entries)
377 {
378 	struct uma_bucket_zone *ubz;
379 
380 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
381 		if (ubz->ubz_entries >= entries)
382 			return (ubz);
383 	ubz--;
384 	return (ubz);
385 }
386 
387 static int
388 bucket_select(int size)
389 {
390 	struct uma_bucket_zone *ubz;
391 
392 	ubz = &bucket_zones[0];
393 	if (size > ubz->ubz_maxsize)
394 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
395 
396 	for (; ubz->ubz_entries != 0; ubz++)
397 		if (ubz->ubz_maxsize < size)
398 			break;
399 	ubz--;
400 	return (ubz->ubz_entries);
401 }
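/*
 * For example, with the table above bucket_select(1024) walks forward until it
 * reaches the "32 Bucket" entry (maxsize 512 < 1024), steps back one entry and
 * returns the "16 Bucket" capacity, BUCKET_SIZE(16) entries.  Requests larger
 * than 4096 bytes instead get a bucket size scaled down proportionally, with a
 * minimum of one entry.
 */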
402 
403 static uma_bucket_t
404 bucket_alloc(uma_zone_t zone, void *udata, int flags)
405 {
406 	struct uma_bucket_zone *ubz;
407 	uma_bucket_t bucket;
408 
409 	/*
410 	 * This is to stop us from allocating per cpu buckets while we're
411 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
412 	 * boot pages.  This also prevents us from allocating buckets in
413 	 * low memory situations.
414 	 */
415 	if (bucketdisable)
416 		return (NULL);
417 	/*
418 	 * To limit bucket recursion we store the original zone flags
419 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
420 	 * NOVM flag to persist even through deep recursions.  We also
421 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
422 	 * a bucket for a bucket zone so we do not allow infinite bucket
423 	 * recursion.  This cookie will even persist to frees of unused
424 	 * buckets via the allocation path or bucket allocations in the
425 	 * free path.
426 	 */
427 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
428 		udata = (void *)(uintptr_t)zone->uz_flags;
429 	else {
430 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
431 			return (NULL);
432 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
433 	}
434 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
435 		flags |= M_NOVM;
436 	ubz = bucket_zone_lookup(zone->uz_count);
437 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
438 		ubz++;
439 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
440 	if (bucket) {
441 #ifdef INVARIANTS
442 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
443 #endif
444 		bucket->ub_cnt = 0;
445 		bucket->ub_entries = ubz->ubz_entries;
446 	}
447 
448 	return (bucket);
449 }
450 
451 static void
452 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
453 {
454 	struct uma_bucket_zone *ubz;
455 
456 	KASSERT(bucket->ub_cnt == 0,
457 	    ("bucket_free: Freeing a non free bucket."));
458 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
459 		udata = (void *)(uintptr_t)zone->uz_flags;
460 	ubz = bucket_zone_lookup(bucket->ub_entries);
461 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
462 }
463 
464 static void
465 bucket_zone_drain(void)
466 {
467 	struct uma_bucket_zone *ubz;
468 
469 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
470 		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
471 }
472 
473 /*
474  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
475  * zone's caches.
476  */
477 static uma_bucket_t
478 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
479 {
480 	uma_bucket_t bucket;
481 
482 	ZONE_LOCK_ASSERT(zone);
483 
484 	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
485 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
486 		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
487 		zdom->uzd_nitems -= bucket->ub_cnt;
488 		if (zdom->uzd_imin > zdom->uzd_nitems)
489 			zdom->uzd_imin = zdom->uzd_nitems;
490 		zone->uz_bkt_count -= bucket->ub_cnt;
491 	}
492 	return (bucket);
493 }
494 
495 /*
496  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
497  * whether the bucket's contents should be counted as part of the zone's working
498  * set.
499  */
500 static void
501 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
502     const bool ws)
503 {
504 
505 	ZONE_LOCK_ASSERT(zone);
506 	KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow",
507 	    __func__, zone));
508 
509 	if (ws)
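	/*
	 * Working-set buckets go to the head of the list, which is where
	 * zone_fetch_bucket() takes from, so they are reused first.
	 * Non-working-set buckets go to the tail, which is where
	 * bucket_cache_reclaim() trims from, so they are reclaimed first.
	 */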
510 		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
511 	else
512 		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
513 	zdom->uzd_nitems += bucket->ub_cnt;
514 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
515 		zdom->uzd_imax = zdom->uzd_nitems;
516 	zone->uz_bkt_count += bucket->ub_cnt;
517 }
518 
519 static void
520 zone_log_warning(uma_zone_t zone)
521 {
522 	static const struct timeval warninterval = { 300, 0 };
523 
524 	if (!zone_warnings || zone->uz_warning == NULL)
525 		return;
526 
527 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
528 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
529 }
530 
531 static inline void
532 zone_maxaction(uma_zone_t zone)
533 {
534 
535 	if (zone->uz_maxaction.ta_func != NULL)
536 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
537 }
538 
539 /*
540  * Routine called by timeout which is used to fire off some time interval
541  * based calculations.  (stats, hash size, etc.)
542  *
543  * Arguments:
544  *	arg   Unused
545  *
546  * Returns:
547  *	Nothing
548  */
549 static void
550 uma_timeout(void *unused)
551 {
552 	bucket_enable();
553 	zone_foreach(zone_timeout);
554 
555 	/* Reschedule this event */
556 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
557 }
558 
559 /*
560  * Update the working set size estimate for the zone's bucket cache.
561  * The constants chosen here are somewhat arbitrary.  With an update period of
562  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
563  * last 100s.
564  */
565 static void
566 zone_domain_update_wss(uma_zone_domain_t zdom)
567 {
568 	long wss;
569 
570 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
571 	wss = zdom->uzd_imax - zdom->uzd_imin;
572 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
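	/*
	 * Exponentially weighted moving average: e.g. if this interval's
	 * imax - imin was 50 items and the previous estimate was 100, the
	 * new estimate is (4 * 50 + 100) / 5 = 60.
	 */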
573 	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
574 }
575 
576 /*
577  * Routine to perform timeout driven calculations.  This expands the
578  * hashes and does per cpu statistics aggregation.
579  *
580  *  Returns nothing.
581  */
582 static void
583 zone_timeout(uma_zone_t zone)
584 {
585 	uma_keg_t keg = zone->uz_keg;
586 	u_int slabs;
587 
588 	KEG_LOCK(keg);
589 	/*
590 	 * Expand the keg hash table.
591 	 *
592 	 * This is done if the number of slabs is larger than the hash size.
593 	 * What I'm trying to do here is eliminate collisions entirely.  This
594 	 * may be a little aggressive.  Should I allow for two collisions max?
595 	 */
596 	if (keg->uk_flags & UMA_ZONE_HASH &&
597 	    (slabs = keg->uk_pages / keg->uk_ppera) >
598 	     keg->uk_hash.uh_hashsize) {
599 		struct uma_hash newhash;
600 		struct uma_hash oldhash;
601 		int ret;
602 
603 		/*
604 		 * This is so involved because allocating and freeing
605 		 * while the keg lock is held will lead to deadlock.
606 		 * I have to do everything in stages and check for
607 		 * races.
608 		 */
609 		KEG_UNLOCK(keg);
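		/*
		 * Size the new table to the next power of two above the slab
		 * count: e.g. 1500 slabs gives 1 << fls(1500) = 2048 buckets.
		 */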
610 		ret = hash_alloc(&newhash, 1 << fls(slabs));
611 		KEG_LOCK(keg);
612 		if (ret) {
613 			if (hash_expand(&keg->uk_hash, &newhash)) {
614 				oldhash = keg->uk_hash;
615 				keg->uk_hash = newhash;
616 			} else
617 				oldhash = newhash;
618 
619 			KEG_UNLOCK(keg);
620 			hash_free(&oldhash);
621 			return;
622 		}
623 	}
624 	KEG_UNLOCK(keg);
625 
626 	ZONE_LOCK(zone);
627 	for (int i = 0; i < vm_ndomains; i++)
628 		zone_domain_update_wss(&zone->uz_domain[i]);
629 	ZONE_UNLOCK(zone);
630 }
631 
632 /*
633  * Allocate and zero fill the next sized hash table from the appropriate
634  * backing store.
635  *
636  * Arguments:
637  *	hash  A new hash structure to populate
 *	size  The desired number of hash buckets; must be a power of 2
638  *
639  * Returns:
640  *	1 on success and 0 on failure.
641  */
642 static int
643 hash_alloc(struct uma_hash *hash, u_int size)
644 {
645 	size_t alloc;
646 
647 	KASSERT(powerof2(size), ("hash size must be power of 2"));
648 	if (size > UMA_HASH_SIZE_INIT)  {
649 		hash->uh_hashsize = size;
650 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
651 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
652 		    M_UMAHASH, M_NOWAIT);
653 	} else {
654 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
655 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
656 		    UMA_ANYDOMAIN, M_WAITOK);
657 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
658 	}
659 	if (hash->uh_slab_hash) {
660 		bzero(hash->uh_slab_hash, alloc);
661 		hash->uh_hashmask = hash->uh_hashsize - 1;
662 		return (1);
663 	}
664 
665 	return (0);
666 }
667 
668 /*
669  * Expands the hash table for HASH zones.  This is done from zone_timeout
670  * to reduce collisions.  This must not be done in the regular allocation
671  * path, otherwise, we can recurse on the vm while allocating pages.
672  *
673  * Arguments:
674  *	oldhash  The hash you want to expand
675  *	newhash  The hash structure for the new table
676  *
677  * Returns:
678  *	1 if the entries were moved to the new table, 0 otherwise.
681  */
682 static int
683 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
684 {
685 	uma_slab_t slab;
686 	u_int hval;
687 	u_int idx;
688 
689 	if (!newhash->uh_slab_hash)
690 		return (0);
691 
692 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
693 		return (0);
694 
695 	/*
696 	 * I need to investigate hash algorithms for resizing without a
697 	 * full rehash.
698 	 */
699 
700 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
701 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
702 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
703 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
704 			hval = UMA_HASH(newhash, slab->us_data);
705 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
706 			    slab, us_hlink);
707 		}
708 
709 	return (1);
710 }
711 
712 /*
713  * Free the hash bucket to the appropriate backing store.
714  *
715  * Arguments:
716  *	hash  The hash structure whose bucket storage is being freed
718  *
719  * Returns:
720  *	Nothing
721  */
722 static void
723 hash_free(struct uma_hash *hash)
724 {
725 	if (hash->uh_slab_hash == NULL)
726 		return;
727 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
728 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
729 	else
730 		free(hash->uh_slab_hash, M_UMAHASH);
731 }
732 
733 /*
734  * Frees all outstanding items in a bucket
735  *
736  * Arguments:
737  *	zone   The zone to free to, must be unlocked.
738  *	bucket The free/alloc bucket with items, cpu queue must be locked.
739  *
740  * Returns:
741  *	Nothing
742  */
743 
744 static void
745 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
746 {
747 	int i;
748 
749 	if (bucket == NULL)
750 		return;
751 
752 	if (zone->uz_fini)
753 		for (i = 0; i < bucket->ub_cnt; i++)
754 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
755 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
756 	if (zone->uz_max_items > 0) {
757 		ZONE_LOCK(zone);
758 		zone->uz_items -= bucket->ub_cnt;
759 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
760 			wakeup_one(zone);
761 		ZONE_UNLOCK(zone);
762 	}
763 	bucket->ub_cnt = 0;
764 }
765 
766 /*
767  * Drains the per cpu caches for a zone.
768  *
769  * NOTE: This may only be called while the zone is being torn down, and not
770  * during normal operation.  This is necessary in order that we do not have
771  * to migrate CPUs to drain the per-CPU caches.
772  *
773  * Arguments:
774  *	zone     The zone to drain, must be unlocked.
775  *
776  * Returns:
777  *	Nothing
778  */
779 static void
780 cache_drain(uma_zone_t zone)
781 {
782 	uma_cache_t cache;
783 	int cpu;
784 
785 	/*
786 	 * XXX: It is safe to not lock the per-CPU caches, because we're
787 	 * tearing down the zone anyway.  I.e., there will be no further use
788 	 * of the caches at this point.
789 	 *
790 	 * XXX: It would be good to be able to assert that the zone is being
791 	 * torn down to prevent improper use of cache_drain().
792 	 *
793 	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
794 	 * it is used elsewhere.  Should the tear-down path be made special
795 	 * there in some form?
796 	 */
797 	CPU_FOREACH(cpu) {
798 		cache = &zone->uz_cpu[cpu];
799 		bucket_drain(zone, cache->uc_allocbucket);
800 		if (cache->uc_allocbucket != NULL)
801 			bucket_free(zone, cache->uc_allocbucket, NULL);
802 		cache->uc_allocbucket = NULL;
803 		bucket_drain(zone, cache->uc_freebucket);
804 		if (cache->uc_freebucket != NULL)
805 			bucket_free(zone, cache->uc_freebucket, NULL);
806 		cache->uc_freebucket = NULL;
807 		bucket_drain(zone, cache->uc_crossbucket);
808 		if (cache->uc_crossbucket != NULL)
809 			bucket_free(zone, cache->uc_crossbucket, NULL);
810 		cache->uc_crossbucket = NULL;
811 	}
812 	ZONE_LOCK(zone);
813 	bucket_cache_reclaim(zone, true);
814 	ZONE_UNLOCK(zone);
815 }
816 
817 static void
818 cache_shrink(uma_zone_t zone)
819 {
820 
821 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
822 		return;
823 
824 	ZONE_LOCK(zone);
825 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
826 	ZONE_UNLOCK(zone);
827 }
828 
829 static void
830 cache_drain_safe_cpu(uma_zone_t zone)
831 {
832 	uma_cache_t cache;
833 	uma_bucket_t b1, b2, b3;
834 	int domain;
835 
836 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
837 		return;
838 
839 	b1 = b2 = b3 = NULL;
840 	ZONE_LOCK(zone);
841 	critical_enter();
842 	if (zone->uz_flags & UMA_ZONE_NUMA)
843 		domain = PCPU_GET(domain);
844 	else
845 		domain = 0;
846 	cache = &zone->uz_cpu[curcpu];
847 	if (cache->uc_allocbucket) {
848 		if (cache->uc_allocbucket->ub_cnt != 0)
849 			zone_put_bucket(zone, &zone->uz_domain[domain],
850 			    cache->uc_allocbucket, false);
851 		else
852 			b1 = cache->uc_allocbucket;
853 		cache->uc_allocbucket = NULL;
854 	}
855 	if (cache->uc_freebucket) {
856 		if (cache->uc_freebucket->ub_cnt != 0)
857 			zone_put_bucket(zone, &zone->uz_domain[domain],
858 			    cache->uc_freebucket, false);
859 		else
860 			b2 = cache->uc_freebucket;
861 		cache->uc_freebucket = NULL;
862 	}
863 	b3 = cache->uc_crossbucket;
864 	cache->uc_crossbucket = NULL;
865 	critical_exit();
866 	ZONE_UNLOCK(zone);
867 	if (b1)
868 		bucket_free(zone, b1, NULL);
869 	if (b2)
870 		bucket_free(zone, b2, NULL);
871 	if (b3) {
872 		bucket_drain(zone, b3);
873 		bucket_free(zone, b3, NULL);
874 	}
875 }
876 
877 /*
878  * Safely drain per-CPU caches of a zone (or all zones) into the bucket caches.
879  * This is an expensive call because it needs to bind to all CPUs
880  * one by one and enter a critical section on each of them in order
881  * to safely access their cache buckets.
882  * Zone lock must not be held when calling this function.
883  */
884 static void
885 pcpu_cache_drain_safe(uma_zone_t zone)
886 {
887 	int cpu;
888 
889 	/*
890 	 * Polite bucket size shrinking was not enough; shrink aggressively.
891 	 */
892 	if (zone)
893 		cache_shrink(zone);
894 	else
895 		zone_foreach(cache_shrink);
896 
897 	CPU_FOREACH(cpu) {
898 		thread_lock(curthread);
899 		sched_bind(curthread, cpu);
900 		thread_unlock(curthread);
901 
902 		if (zone)
903 			cache_drain_safe_cpu(zone);
904 		else
905 			zone_foreach(cache_drain_safe_cpu);
906 	}
907 	thread_lock(curthread);
908 	sched_unbind(curthread);
909 	thread_unlock(curthread);
910 }
911 
912 /*
913  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
914  * requested a drain, otherwise the per-domain caches are trimmed to their
915  * estimated working set size.
916  */
917 static void
918 bucket_cache_reclaim(uma_zone_t zone, bool drain)
919 {
920 	uma_zone_domain_t zdom;
921 	uma_bucket_t bucket;
922 	long target, tofree;
923 	int i;
924 
925 	for (i = 0; i < vm_ndomains; i++) {
926 		zdom = &zone->uz_domain[i];
927 
928 		/*
929 		 * If we were asked to drain the zone, we are done only once
930 		 * this bucket cache is empty.  Otherwise, we reclaim items in
931 		 * excess of the zone's estimated working set size.  If the
932 		 * difference nitems - imin is larger than the WSS estimate,
933 		 * then the estimate will grow at the end of this interval and
934 		 * we ignore the historical average.
935 		 */
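		/*
		 * For example, with nitems = 500, imin = 200 and wss = 250,
		 * the target is lmax(250, 300) = 300, so roughly 200 items
		 * worth of buckets are freed below.
		 */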
936 		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
937 		    zdom->uzd_imin);
938 		while (zdom->uzd_nitems > target) {
939 			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
940 			if (bucket == NULL)
941 				break;
942 			tofree = bucket->ub_cnt;
943 			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
944 			zdom->uzd_nitems -= tofree;
945 
946 			/*
947 			 * Shift the bounds of the current WSS interval to avoid
948 			 * perturbing the estimate.
949 			 */
950 			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
951 			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
952 
953 			ZONE_UNLOCK(zone);
954 			bucket_drain(zone, bucket);
955 			bucket_free(zone, bucket, NULL);
956 			ZONE_LOCK(zone);
957 		}
958 	}
959 
960 	/*
961 	 * Shrink the zone bucket size to ensure that the per-CPU caches
962 	 * don't grow too large.
963 	 */
964 	if (zone->uz_count > zone->uz_count_min)
965 		zone->uz_count--;
966 }
967 
968 static void
969 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
970 {
971 	uint8_t *mem;
972 	int i;
973 	uint8_t flags;
974 
975 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
976 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
977 
978 	mem = slab->us_data;
979 	flags = slab->us_flags;
980 	i = start;
981 	if (keg->uk_fini != NULL) {
982 		for (i--; i > -1; i--)
983 #ifdef INVARIANTS
984 		/*
985 		 * trash_fini implies that dtor was trash_dtor. trash_fini
986 		 * would check that memory hasn't been modified since free,
987 		 * which executed trash_dtor.
988 		 * That's why we need to run the uma_dbg_kskip() check here,
989 		 * even though we don't apply the skip check to other init/fini
990 		 * invocations.
991 		 */
992 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
993 		    keg->uk_fini != trash_fini)
994 #endif
995 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
996 			    keg->uk_size);
997 	}
998 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
999 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1000 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1001 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1002 }
1003 
1004 /*
1005  * Frees pages from a keg back to the system.  This is done on demand from
1006  * the pageout daemon.
1007  *
1008  * Returns nothing.
1009  */
1010 static void
1011 keg_drain(uma_keg_t keg)
1012 {
1013 	struct slabhead freeslabs = { 0 };
1014 	uma_domain_t dom;
1015 	uma_slab_t slab, tmp;
1016 	int i;
1017 
1018 	/*
1019 	 * We don't want to take pages from statically allocated kegs at this
1020 	 * time.
1021 	 */
1022 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1023 		return;
1024 
1025 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
1026 	    keg->uk_name, keg, keg->uk_free);
1027 	KEG_LOCK(keg);
1028 	if (keg->uk_free == 0)
1029 		goto finished;
1030 
1031 	for (i = 0; i < vm_ndomains; i++) {
1032 		dom = &keg->uk_domain[i];
1033 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
1034 			/* We have nowhere to free these to. */
1035 			if (slab->us_flags & UMA_SLAB_BOOT)
1036 				continue;
1037 
1038 			LIST_REMOVE(slab, us_link);
1039 			keg->uk_pages -= keg->uk_ppera;
1040 			keg->uk_free -= keg->uk_ipers;
1041 
1042 			if (keg->uk_flags & UMA_ZONE_HASH)
1043 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
1044 				    slab->us_data);
1045 
1046 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1047 		}
1048 	}
1049 
1050 finished:
1051 	KEG_UNLOCK(keg);
1052 
1053 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1054 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1055 		keg_free_slab(keg, slab, keg->uk_ipers);
1056 	}
1057 }
1058 
1059 static void
1060 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1061 {
1062 
1063 	/*
1064 	 * Set draining to interlock with zone_dtor() so we can release our
1065 	 * locks as we go.  Only dtor() should do a WAITOK call since it
1066 	 * is the only call that knows the structure will still be available
1067 	 * when it wakes up.
1068 	 */
1069 	ZONE_LOCK(zone);
1070 	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1071 		if (waitok == M_NOWAIT)
1072 			goto out;
1073 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1074 	}
1075 	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1076 	bucket_cache_reclaim(zone, drain);
1077 	ZONE_UNLOCK(zone);
1078 
1079 	/*
1080 	 * The DRAINING flag protects us from being freed while
1081 	 * we're running.  Normally the uma_rwlock would protect us but we
1082 	 * must be able to release and acquire the right lock for each keg.
1083 	 */
1084 	keg_drain(zone->uz_keg);
1085 	ZONE_LOCK(zone);
1086 	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1087 	wakeup(zone);
1088 out:
1089 	ZONE_UNLOCK(zone);
1090 }
1091 
1092 static void
1093 zone_drain(uma_zone_t zone)
1094 {
1095 
1096 	zone_reclaim(zone, M_NOWAIT, true);
1097 }
1098 
1099 static void
1100 zone_trim(uma_zone_t zone)
1101 {
1102 
1103 	zone_reclaim(zone, M_NOWAIT, false);
1104 }
1105 
1106 /*
1107  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1108  * If the allocation was successful, the keg lock will be held upon return,
1109  * otherwise the keg will be left unlocked.
1110  *
1111  * Arguments:
1112  *	flags   Wait flags for the item initialization routine
1113  *	aflags  Wait flags for the slab allocation
1114  *
1115  * Returns:
1116  *	The slab that was allocated or NULL if there is no memory and the
1117  *	caller specified M_NOWAIT.
1118  */
1119 static uma_slab_t
1120 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1121     int aflags)
1122 {
1123 	uma_alloc allocf;
1124 	uma_slab_t slab;
1125 	unsigned long size;
1126 	uint8_t *mem;
1127 	uint8_t sflags;
1128 	int i;
1129 
1130 	KASSERT(domain >= 0 && domain < vm_ndomains,
1131 	    ("keg_alloc_slab: domain %d out of range", domain));
1132 	KEG_LOCK_ASSERT(keg);
1133 	MPASS(zone->uz_lockptr == &keg->uk_lock);
1134 
1135 	allocf = keg->uk_allocf;
1136 	KEG_UNLOCK(keg);
1137 
1138 	slab = NULL;
1139 	mem = NULL;
1140 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1141 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1142 		if (slab == NULL)
1143 			goto out;
1144 	}
1145 
1146 	/*
1147 	 * This reproduces the old vm_zone behavior of zero filling pages the
1148 	 * first time they are added to a zone.
1149 	 *
1150 	 * Malloced items are zeroed in uma_zalloc.
1151 	 */
1152 
1153 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1154 		aflags |= M_ZERO;
1155 	else
1156 		aflags &= ~M_ZERO;
1157 
1158 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1159 		aflags |= M_NODUMP;
1160 
1161 	/* zone is passed for legacy reasons. */
1162 	size = keg->uk_ppera * PAGE_SIZE;
1163 	mem = allocf(zone, size, domain, &sflags, aflags);
1164 	if (mem == NULL) {
1165 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1166 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1167 		slab = NULL;
1168 		goto out;
1169 	}
1170 	uma_total_inc(size);
1171 
1172 	/* Point the slab into the allocated memory */
1173 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1174 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1175 
1176 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1177 		for (i = 0; i < keg->uk_ppera; i++)
1178 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1179 
1180 	slab->us_keg = keg;
1181 	slab->us_data = mem;
1182 	slab->us_freecount = keg->uk_ipers;
1183 	slab->us_flags = sflags;
1184 	slab->us_domain = domain;
1185 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1186 #ifdef INVARIANTS
1187 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1188 #endif
1189 
1190 	if (keg->uk_init != NULL) {
1191 		for (i = 0; i < keg->uk_ipers; i++)
1192 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1193 			    keg->uk_size, flags) != 0)
1194 				break;
1195 		if (i != keg->uk_ipers) {
1196 			keg_free_slab(keg, slab, i);
1197 			slab = NULL;
1198 			goto out;
1199 		}
1200 	}
1201 	KEG_LOCK(keg);
1202 
1203 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1204 	    slab, keg->uk_name, keg);
1205 
1206 	if (keg->uk_flags & UMA_ZONE_HASH)
1207 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1208 
1209 	keg->uk_pages += keg->uk_ppera;
1210 	keg->uk_free += keg->uk_ipers;
1211 
1212 out:
1213 	return (slab);
1214 }
1215 
1216 /*
1217  * This function is intended to be used early on in place of page_alloc() so
1218  * that we may use the boot time page cache to satisfy allocations before
1219  * the VM is ready.
1220  */
1221 static void *
1222 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1223     int wait)
1224 {
1225 	uma_keg_t keg;
1226 	void *mem;
1227 	int pages;
1228 
1229 	keg = zone->uz_keg;
1230 	/*
1231 	 * If we are in BOOT_BUCKETS or higher, then switch to the real
1232 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1233 	 */
1234 	switch (booted) {
1235 		case BOOT_COLD:
1236 		case BOOT_STRAPPED:
1237 			break;
1238 		case BOOT_PAGEALLOC:
1239 			if (keg->uk_ppera > 1)
1240 				break;
1241 		case BOOT_BUCKETS:
1242 		case BOOT_RUNNING:
1243 #ifdef UMA_MD_SMALL_ALLOC
1244 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1245 			    page_alloc : uma_small_alloc;
1246 #else
1247 			keg->uk_allocf = page_alloc;
1248 #endif
1249 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1250 	}
1251 
1252 	/*
1253 	 * Check our small startup cache to see if it has pages remaining.
1254 	 */
1255 	pages = howmany(bytes, PAGE_SIZE);
1256 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1257 	if (pages > boot_pages)
1258 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1259 #ifdef DIAGNOSTIC
1260 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1261 	    boot_pages);
1262 #endif
1263 	mem = bootmem;
1264 	boot_pages -= pages;
1265 	bootmem += pages * PAGE_SIZE;
1266 	*pflag = UMA_SLAB_BOOT;
1267 
1268 	return (mem);
1269 }
1270 
1271 /*
1272  * Allocates a number of pages from the system
1273  *
1274  * Arguments:
1275  *	bytes  The number of bytes requested
1276  *	wait  Shall we wait?
1277  *
1278  * Returns:
1279  *	A pointer to the allocated memory or possibly
1280  *	NULL if M_NOWAIT is set.
1281  */
1282 static void *
1283 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1284     int wait)
1285 {
1286 	void *p;	/* Returned page */
1287 
1288 	*pflag = UMA_SLAB_KERNEL;
1289 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1290 
1291 	return (p);
1292 }
1293 
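/*
 * Allocate the backing pages for a pcpu zone slab: one wired page per CPU
 * slot (mp_maxid + 1 in total), preferring each CPU's own NUMA domain when
 * NUMA is configured, then map them into a contiguous KVA range so that the
 * copy for CPU n lives at offset n * PAGE_SIZE.
 */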
1294 static void *
1295 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1296     int wait)
1297 {
1298 	struct pglist alloctail;
1299 	vm_offset_t addr, zkva;
1300 	int cpu, flags;
1301 	vm_page_t p, p_next;
1302 #ifdef NUMA
1303 	struct pcpu *pc;
1304 #endif
1305 
1306 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1307 
1308 	TAILQ_INIT(&alloctail);
1309 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1310 	    malloc2vm_flags(wait);
1311 	*pflag = UMA_SLAB_KERNEL;
1312 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1313 		if (CPU_ABSENT(cpu)) {
1314 			p = vm_page_alloc(NULL, 0, flags);
1315 		} else {
1316 #ifndef NUMA
1317 			p = vm_page_alloc(NULL, 0, flags);
1318 #else
1319 			pc = pcpu_find(cpu);
1320 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1321 			if (__predict_false(p == NULL))
1322 				p = vm_page_alloc(NULL, 0, flags);
1323 #endif
1324 		}
1325 		if (__predict_false(p == NULL))
1326 			goto fail;
1327 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1328 	}
1329 	if ((addr = kva_alloc(bytes)) == 0)
1330 		goto fail;
1331 	zkva = addr;
1332 	TAILQ_FOREACH(p, &alloctail, listq) {
1333 		pmap_qenter(zkva, &p, 1);
1334 		zkva += PAGE_SIZE;
1335 	}
1336 	return ((void*)addr);
1337 fail:
1338 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1339 		vm_page_unwire_noq(p);
1340 		vm_page_free(p);
1341 	}
1342 	return (NULL);
1343 }
1344 
1345 /*
1346  * Allocates a number of pages from within an object
1347  *
1348  * Arguments:
1349  *	bytes  The number of bytes requested
1350  *	wait   Shall we wait?
1351  *
1352  * Returns:
1353  *	A pointer to the allocated memory or possibly
1354  *	NULL if M_NOWAIT is set.
1355  */
1356 static void *
1357 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1358     int wait)
1359 {
1360 	TAILQ_HEAD(, vm_page) alloctail;
1361 	u_long npages;
1362 	vm_offset_t retkva, zkva;
1363 	vm_page_t p, p_next;
1364 	uma_keg_t keg;
1365 
1366 	TAILQ_INIT(&alloctail);
1367 	keg = zone->uz_keg;
1368 
1369 	npages = howmany(bytes, PAGE_SIZE);
1370 	while (npages > 0) {
1371 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1372 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1373 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1374 		    VM_ALLOC_NOWAIT));
1375 		if (p != NULL) {
1376 			/*
1377 			 * Since the page does not belong to an object, its
1378 			 * listq is unused.
1379 			 */
1380 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1381 			npages--;
1382 			continue;
1383 		}
1384 		/*
1385 		 * Page allocation failed, free intermediate pages and
1386 		 * exit.
1387 		 */
1388 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1389 			vm_page_unwire_noq(p);
1390 			vm_page_free(p);
1391 		}
1392 		return (NULL);
1393 	}
1394 	*flags = UMA_SLAB_PRIV;
1395 	zkva = keg->uk_kva +
1396 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1397 	retkva = zkva;
1398 	TAILQ_FOREACH(p, &alloctail, listq) {
1399 		pmap_qenter(zkva, &p, 1);
1400 		zkva += PAGE_SIZE;
1401 	}
1402 
1403 	return ((void *)retkva);
1404 }
1405 
1406 /*
1407  * Frees a number of pages to the system
1408  *
1409  * Arguments:
1410  *	mem   A pointer to the memory to be freed
1411  *	size  The size of the memory being freed
1412  *	flags The original p->us_flags field
1413  *
1414  * Returns:
1415  *	Nothing
1416  */
1417 static void
1418 page_free(void *mem, vm_size_t size, uint8_t flags)
1419 {
1420 
1421 	if ((flags & UMA_SLAB_KERNEL) == 0)
1422 		panic("UMA: page_free used with invalid flags %x", flags);
1423 
1424 	kmem_free((vm_offset_t)mem, size);
1425 }
1426 
1427 /*
1428  * Frees pcpu zone allocations
1429  *
1430  * Arguments:
1431  *	mem   A pointer to the memory to be freed
1432  *	size  The size of the memory being freed
1433  *	flags The original p->us_flags field
1434  *
1435  * Returns:
1436  *	Nothing
1437  */
1438 static void
1439 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1440 {
1441 	vm_offset_t sva, curva;
1442 	vm_paddr_t paddr;
1443 	vm_page_t m;
1444 
1445 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1446 	sva = (vm_offset_t)mem;
1447 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1448 		paddr = pmap_kextract(curva);
1449 		m = PHYS_TO_VM_PAGE(paddr);
1450 		vm_page_unwire_noq(m);
1451 		vm_page_free(m);
1452 	}
1453 	pmap_qremove(sva, size >> PAGE_SHIFT);
1454 	kva_free(sva, size);
1455 }
1456 
1457 
1458 /*
1459  * Zero fill initializer
1460  *
1461  * Arguments/Returns follow uma_init specifications
1462  */
1463 static int
1464 zero_init(void *mem, int size, int flags)
1465 {
1466 	bzero(mem, size);
1467 	return (0);
1468 }
1469 
1470 /*
1471  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1472  *
1473  * Arguments
1474  *	keg  The zone we should initialize
1475  *
1476  * Returns
1477  *	Nothing
1478  */
1479 static void
1480 keg_small_init(uma_keg_t keg)
1481 {
1482 	u_int rsize;
1483 	u_int memused;
1484 	u_int wastedspace;
1485 	u_int shsize;
1486 	u_int slabsize;
1487 
1488 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1489 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1490 
1491 		slabsize = UMA_PCPU_ALLOC_SIZE;
1492 		keg->uk_ppera = ncpus;
1493 	} else {
1494 		slabsize = UMA_SLAB_SIZE;
1495 		keg->uk_ppera = 1;
1496 	}
1497 
1498 	/*
1499 	 * Calculate the size of each allocation (rsize) according to
1500 	 * alignment.  If the requested size is smaller than we have
1501 	 * allocation bits for we round it up.
1502 	 */
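	/*
	 * For example, a 52-byte item with a 15-byte alignment mask
	 * (16-byte alignment) is rounded up to (52 & ~15) + 16 = 64 bytes.
	 */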
1503 	rsize = keg->uk_size;
1504 	if (rsize < slabsize / SLAB_SETSIZE)
1505 		rsize = slabsize / SLAB_SETSIZE;
1506 	if (rsize & keg->uk_align)
1507 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1508 	keg->uk_rsize = rsize;
1509 
1510 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1511 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1512 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1513 
1514 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1515 		shsize = 0;
1516 	else
1517 		shsize = SIZEOF_UMA_SLAB;
1518 
1519 	if (rsize <= slabsize - shsize)
1520 		keg->uk_ipers = (slabsize - shsize) / rsize;
1521 	else {
1522 		/* Handle special case when we have 1 item per slab, so
1523 		 * the alignment requirement can be relaxed. */
1524 		KASSERT(keg->uk_size <= slabsize - shsize,
1525 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1526 		keg->uk_ipers = 1;
1527 	}
1528 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1529 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1530 
1531 	memused = keg->uk_ipers * rsize + shsize;
1532 	wastedspace = slabsize - memused;
1533 
1534 	/*
1535 	 * We can't do OFFPAGE if we're internal or if we've been
1536 	 * asked not to go to the VM for buckets.  If we do this we
1537 	 * may end up going to the VM for slabs, which we do not
1538 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1539 	 * of UMA_ZONE_VM, which clearly forbids it.
1540 	 */
1541 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1542 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1543 		return;
1544 
1545 	/*
1546 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1547 	 * this if it permits more items per-slab.
1548 	 *
1549 	 * XXX We could try growing slabsize to limit max waste as well.
1550 	 * Historically this was not done because the VM could not
1551 	 * efficiently handle contiguous allocations.
1552 	 */
1553 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1554 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1555 		keg->uk_ipers = slabsize / keg->uk_rsize;
1556 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1557 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1558 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1559 		    "keg: %s(%p), calculated wastedspace = %d, "
1560 		    "maximum wasted space allowed = %d, "
1561 		    "calculated ipers = %d, "
1562 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1563 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1564 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1565 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1566 	}
1567 
1568 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1569 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1570 		keg->uk_flags |= UMA_ZONE_HASH;
1571 }
1572 
1573 /*
1574  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1575  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1576  * more complicated.
1577  *
1578  * Arguments
1579  *	keg  The keg we should initialize
1580  *
1581  * Returns
1582  *	Nothing
1583  */
1584 static void
1585 keg_large_init(uma_keg_t keg)
1586 {
1587 
1588 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1589 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1590 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1591 
1592 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1593 	keg->uk_ipers = 1;
1594 	keg->uk_rsize = keg->uk_size;
1595 
1596 	/* Check whether we have enough space to not do OFFPAGE. */
1597 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1598 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
1599 		/*
1600 		 * We can't do OFFPAGE if we're internal, in which case
1601 		 * we need an extra page per allocation to contain the
1602 		 * slab header.
1603 		 */
1604 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1605 			keg->uk_flags |= UMA_ZONE_OFFPAGE;
1606 		else
1607 			keg->uk_ppera++;
1608 	}
1609 
1610 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1611 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1612 		keg->uk_flags |= UMA_ZONE_HASH;
1613 }
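/*
 * For instance, with keg_large_init() and assuming 4 KB pages, a 9 KB item
 * gets uk_ppera = howmany(9216, 4096) = 3 pages per slab, with a single item
 * per slab.
 */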
1614 
1615 static void
1616 keg_cachespread_init(uma_keg_t keg)
1617 {
1618 	int alignsize;
1619 	int trailer;
1620 	int pages;
1621 	int rsize;
1622 
1623 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1624 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1625 
1626 	alignsize = keg->uk_align + 1;
1627 	rsize = keg->uk_size;
1628 	/*
1629 	 * We want one item to start on every align boundary in a page.  To
1630 	 * do this we will span pages.  We will also extend the item by the
1631 	 * size of align if it is an even multiple of align.  Otherwise, it
1632 	 * would fall on the same boundary every time.
1633 	 */
1634 	if (rsize & keg->uk_align)
1635 		rsize = (rsize & ~keg->uk_align) + alignsize;
1636 	if ((rsize & alignsize) == 0)
1637 		rsize += alignsize;
1638 	trailer = rsize - keg->uk_size;
1639 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1640 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1641 	keg->uk_rsize = rsize;
1642 	keg->uk_ppera = pages;
1643 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1644 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1645 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1646 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1647 	    keg->uk_ipers));
1648 }
1649 
1650 /*
1651  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1652  * the keg onto the global keg list.
1653  *
1654  * Arguments/Returns follow uma_ctor specifications
1655  *	udata  Actually uma_kctor_args
1656  */
1657 static int
1658 keg_ctor(void *mem, int size, void *udata, int flags)
1659 {
1660 	struct uma_kctor_args *arg = udata;
1661 	uma_keg_t keg = mem;
1662 	uma_zone_t zone;
1663 
1664 	bzero(keg, size);
1665 	keg->uk_size = arg->size;
1666 	keg->uk_init = arg->uminit;
1667 	keg->uk_fini = arg->fini;
1668 	keg->uk_align = arg->align;
1669 	keg->uk_free = 0;
1670 	keg->uk_reserve = 0;
1671 	keg->uk_pages = 0;
1672 	keg->uk_flags = arg->flags;
1673 	keg->uk_slabzone = NULL;
1674 
1675 	/*
1676 	 * We use a global round-robin policy by default.  Zones with
1677 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1678 	 * iterator is never run.
1679 	 */
1680 	keg->uk_dr.dr_policy = DOMAINSET_RR();
1681 	keg->uk_dr.dr_iter = 0;
1682 
1683 	/*
1684 	 * The master zone is passed to us at keg-creation time.
1685 	 */
1686 	zone = arg->zone;
1687 	keg->uk_name = zone->uz_name;
1688 
1689 	if (arg->flags & UMA_ZONE_VM)
1690 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1691 
1692 	if (arg->flags & UMA_ZONE_ZINIT)
1693 		keg->uk_init = zero_init;
1694 
1695 	if (arg->flags & UMA_ZONE_MALLOC)
1696 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1697 
1698 	if (arg->flags & UMA_ZONE_PCPU)
1699 #ifdef SMP
1700 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1701 #else
1702 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1703 #endif
1704 
1705 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1706 		keg_cachespread_init(keg);
1707 	} else {
1708 		if (keg->uk_size > UMA_SLAB_SPACE)
1709 			keg_large_init(keg);
1710 		else
1711 			keg_small_init(keg);
1712 	}
1713 
1714 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1715 		keg->uk_slabzone = slabzone;
1716 
1717 	/*
1718 	 * If we haven't booted yet we need allocations to go through the
1719 	 * startup cache until the vm is ready.
1720 	 */
1721 	if (booted < BOOT_PAGEALLOC)
1722 		keg->uk_allocf = startup_alloc;
1723 #ifdef UMA_MD_SMALL_ALLOC
1724 	else if (keg->uk_ppera == 1)
1725 		keg->uk_allocf = uma_small_alloc;
1726 #endif
1727 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1728 		keg->uk_allocf = pcpu_page_alloc;
1729 	else
1730 		keg->uk_allocf = page_alloc;
1731 #ifdef UMA_MD_SMALL_ALLOC
1732 	if (keg->uk_ppera == 1)
1733 		keg->uk_freef = uma_small_free;
1734 	else
1735 #endif
1736 	if (keg->uk_flags & UMA_ZONE_PCPU)
1737 		keg->uk_freef = pcpu_page_free;
1738 	else
1739 		keg->uk_freef = page_free;
1740 
1741 	/*
1742 	 * Initialize keg's lock
1743 	 */
1744 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1745 
1746 	/*
1747 	 * If we're putting the slab header in the actual page we need to
1748 	 * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
1749 	 * macro definition.
1750 	 */
1751 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1752 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
1753 		/*
1754 		 * The only way the following is possible is if, with our
1755 		 * UMA_ALIGN_PTR adjustments, we are now bigger than
1756 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1757 		 * mathematically possible for all cases, so we make
1758 		 * sure here anyway.
1759 		 */
1760 		KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
1761 		    PAGE_SIZE * keg->uk_ppera,
1762 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
1763 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1764 	}
1765 
1766 	if (keg->uk_flags & UMA_ZONE_HASH)
1767 		hash_alloc(&keg->uk_hash, 0);
1768 
1769 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1770 	    keg, zone->uz_name, zone,
1771 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1772 	    keg->uk_free);
1773 
1774 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1775 
1776 	rw_wlock(&uma_rwlock);
1777 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1778 	rw_wunlock(&uma_rwlock);
1779 	return (0);
1780 }
1781 
1782 static void
1783 zone_alloc_counters(uma_zone_t zone)
1784 {
1785 
1786 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
1787 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
1788 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
1789 }
1790 
1791 /*
1792  * Zone header ctor.  This initializes all fields, locks, etc.
1793  *
1794  * Arguments/Returns follow uma_ctor specifications
1795  *	udata  Actually uma_zctor_args
1796  */
1797 static int
1798 zone_ctor(void *mem, int size, void *udata, int flags)
1799 {
1800 	struct uma_zctor_args *arg = udata;
1801 	uma_zone_t zone = mem;
1802 	uma_zone_t z;
1803 	uma_keg_t keg;
1804 	int i;
1805 
1806 	bzero(zone, size);
1807 	zone->uz_name = arg->name;
1808 	zone->uz_ctor = arg->ctor;
1809 	zone->uz_dtor = arg->dtor;
1810 	zone->uz_init = NULL;
1811 	zone->uz_fini = NULL;
1812 	zone->uz_sleeps = 0;
1813 	zone->uz_xdomain = 0;
1814 	zone->uz_count = 0;
1815 	zone->uz_count_min = 0;
1816 	zone->uz_count_max = BUCKET_MAX;
1817 	zone->uz_flags = 0;
1818 	zone->uz_warning = NULL;
1819 	/* The domain structures follow the cpu structures. */
1820 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1821 	zone->uz_bkt_max = ULONG_MAX;
1822 	timevalclear(&zone->uz_ratecheck);
1823 
1824 	if (__predict_true(booted == BOOT_RUNNING))
1825 		zone_alloc_counters(zone);
1826 	else {
1827 		zone->uz_allocs = EARLY_COUNTER;
1828 		zone->uz_frees = EARLY_COUNTER;
1829 		zone->uz_fails = EARLY_COUNTER;
1830 	}
1831 
1832 	for (i = 0; i < vm_ndomains; i++)
1833 		TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
1834 
1835 	/*
1836 	 * This is a pure cache zone, no kegs.
1837 	 */
1838 	if (arg->import) {
1839 		if (arg->flags & UMA_ZONE_VM)
1840 			arg->flags |= UMA_ZFLAG_CACHEONLY;
1841 		zone->uz_flags = arg->flags;
1842 		zone->uz_size = arg->size;
1843 		zone->uz_import = arg->import;
1844 		zone->uz_release = arg->release;
1845 		zone->uz_arg = arg->arg;
1846 		zone->uz_lockptr = &zone->uz_lock;
1847 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1848 		rw_wlock(&uma_rwlock);
1849 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1850 		rw_wunlock(&uma_rwlock);
1851 		goto out;
1852 	}
1853 
1854 	/*
1855 	 * Use the regular zone/keg/slab allocator.
1856 	 */
1857 	zone->uz_import = (uma_import)zone_import;
1858 	zone->uz_release = (uma_release)zone_release;
1859 	zone->uz_arg = zone;
1860 	keg = arg->keg;
1861 
1862 	if (arg->flags & UMA_ZONE_SECONDARY) {
1863 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1864 		zone->uz_init = arg->uminit;
1865 		zone->uz_fini = arg->fini;
1866 		zone->uz_lockptr = &keg->uk_lock;
1867 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1868 		rw_wlock(&uma_rwlock);
1869 		ZONE_LOCK(zone);
1870 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1871 			if (LIST_NEXT(z, uz_link) == NULL) {
1872 				LIST_INSERT_AFTER(z, zone, uz_link);
1873 				break;
1874 			}
1875 		}
1876 		ZONE_UNLOCK(zone);
1877 		rw_wunlock(&uma_rwlock);
1878 	} else if (keg == NULL) {
1879 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1880 		    arg->align, arg->flags)) == NULL)
1881 			return (ENOMEM);
1882 	} else {
1883 		struct uma_kctor_args karg;
1884 		int error;
1885 
1886 		/* We should only be here from uma_startup() */
1887 		karg.size = arg->size;
1888 		karg.uminit = arg->uminit;
1889 		karg.fini = arg->fini;
1890 		karg.align = arg->align;
1891 		karg.flags = arg->flags;
1892 		karg.zone = zone;
1893 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1894 		    flags);
1895 		if (error)
1896 			return (error);
1897 	}
1898 
1899 	zone->uz_keg = keg;
1900 	zone->uz_size = keg->uk_size;
1901 	zone->uz_flags |= (keg->uk_flags &
1902 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1903 
1904 	/*
1905 	 * Some internal zones don't have room allocated for the per cpu
1906 	 * caches.  If we're internal, bail out here.
1907 	 */
1908 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1909 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1910 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1911 		return (0);
1912 	}
1913 
1914 out:
1915 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1916 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1917 	    ("Invalid zone flag combination"));
1918 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) {
1919 		zone->uz_count = BUCKET_MAX;
1920 	} else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) {
1921 		zone->uz_count = BUCKET_MIN;
1922 		zone->uz_count_max = BUCKET_MIN;
1923 	} else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1924 		zone->uz_count = 0;
1925 	else
1926 		zone->uz_count = bucket_select(zone->uz_size);
1927 	zone->uz_count_min = zone->uz_count;
1928 
1929 	return (0);
1930 }
1931 
1932 /*
1933  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1934  * table and removes the keg from the global list.
1935  *
1936  * Arguments/Returns follow uma_dtor specifications
1937  *	udata  unused
1938  */
1939 static void
1940 keg_dtor(void *arg, int size, void *udata)
1941 {
1942 	uma_keg_t keg;
1943 
1944 	keg = (uma_keg_t)arg;
1945 	KEG_LOCK(keg);
1946 	if (keg->uk_free != 0) {
1947 		printf("Freed UMA keg (%s) was not empty (%d items). "
1948 		    "Lost %d pages of memory.\n",
1949 		    keg->uk_name ? keg->uk_name : "",
1950 		    keg->uk_free, keg->uk_pages);
1951 	}
1952 	KEG_UNLOCK(keg);
1953 
1954 	hash_free(&keg->uk_hash);
1955 
1956 	KEG_LOCK_FINI(keg);
1957 }
1958 
1959 /*
1960  * Zone header dtor.
1961  *
1962  * Arguments/Returns follow uma_dtor specifications
1963  *	udata  unused
1964  */
1965 static void
1966 zone_dtor(void *arg, int size, void *udata)
1967 {
1968 	uma_zone_t zone;
1969 	uma_keg_t keg;
1970 
1971 	zone = (uma_zone_t)arg;
1972 
1973 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1974 		cache_drain(zone);
1975 
1976 	rw_wlock(&uma_rwlock);
1977 	LIST_REMOVE(zone, uz_link);
1978 	rw_wunlock(&uma_rwlock);
1979 	/*
1980 	 * XXX there are some races here where the zone can be
1981 	 * drained, the zone lock released, and the zone then
1982 	 * refilled before we remove it... we don't care about
1983 	 * them for now.
1984 	 */
1985 	zone_reclaim(zone, M_WAITOK, true);
1986 	/*
1987 	 * We only destroy kegs from non secondary/non cache zones.
1988 	 */
1989 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
1990 		keg = zone->uz_keg;
1991 		rw_wlock(&uma_rwlock);
1992 		LIST_REMOVE(keg, uk_link);
1993 		rw_wunlock(&uma_rwlock);
1994 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1995 	}
1996 	counter_u64_free(zone->uz_allocs);
1997 	counter_u64_free(zone->uz_frees);
1998 	counter_u64_free(zone->uz_fails);
1999 	if (zone->uz_lockptr == &zone->uz_lock)
2000 		ZONE_LOCK_FINI(zone);
2001 }
2002 
2003 /*
2004  * Traverses every zone in the system and calls a callback
2005  *
2006  * Arguments:
2007  *	zfunc  A pointer to a function which accepts a zone
2008  *		as an argument.
2009  *
2010  * Returns:
2011  *	Nothing
2012  */
2013 static void
2014 zone_foreach(void (*zfunc)(uma_zone_t))
2015 {
2016 	uma_keg_t keg;
2017 	uma_zone_t zone;
2018 
2019 	/*
2020 	 * Before BOOT_RUNNING we are guaranteed to be single
2021 	 * threaded, so locking isn't needed. Startup functions
2022 	 * are allowed to use M_WAITOK.
2023 	 */
2024 	if (__predict_true(booted == BOOT_RUNNING))
2025 		rw_rlock(&uma_rwlock);
2026 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
2027 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2028 			zfunc(zone);
2029 	}
2030 	if (__predict_true(booted == BOOT_RUNNING))
2031 		rw_runlock(&uma_rwlock);
2032 }
2033 
2034 /*
2035  * Count how many pages we need to bootstrap.  The VM supplies its
2036  * need for early zones in the argument; we add up our own zones,
2037  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones.  The
2038  * zone of zones and zone of kegs are accounted for separately.
2039  */
2040 #define	UMA_BOOT_ZONES	11
2041 /* Zone of zones and zone of kegs have arbitrary alignment. */
2042 #define	UMA_BOOT_ALIGN	32
2043 static int zsize, ksize;
2044 int
2045 uma_startup_count(int vm_zones)
2046 {
2047 	int zones, pages;
2048 
2049 	ksize = sizeof(struct uma_keg) +
2050 	    (sizeof(struct uma_domain) * vm_ndomains);
2051 	zsize = sizeof(struct uma_zone) +
2052 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2053 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2054 
2055 	/*
2056 	 * Memory for the zone of kegs and its keg,
2057 	 * and for zone of zones.
2058 	 */
2059 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2060 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2061 
2062 #ifdef	UMA_MD_SMALL_ALLOC
2063 	zones = UMA_BOOT_ZONES;
2064 #else
2065 	zones = UMA_BOOT_ZONES + vm_zones;
2066 	vm_zones = 0;
2067 #endif
2068 
2069 	/* Memory for the rest of startup zones, UMA and VM, ... */
2070 	if (zsize > UMA_SLAB_SPACE) {
2071 		/* See keg_large_init(). */
2072 		u_int ppera;
2073 
2074 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2075 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
2076 		    SIZEOF_UMA_SLAB)
2077 			ppera++;
2078 		pages += (zones + vm_zones) * ppera;
2079 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2080 		/* See keg_small_init() special case for uk_ppera = 1. */
2081 		pages += zones;
2082 	else
2083 		pages += howmany(zones,
2084 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2085 
2086 	/* ... and their kegs. Note that zone of zones allocates a keg! */
2087 	pages += howmany(zones + 1,
2088 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2089 
2090 	/*
2091 	 * Most of the startup zones are not going to be offpage; that's
2092 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
2093 	 * calculations.  Some large bucket zones will be offpage, and
2094 	 * thus will allocate hashes.  We take a conservative approach
2095 	 * and assume that all zones may allocate a hash.  This may give
2096 	 * us some positive inaccuracy, usually an extra single page.
2097 	 */
2098 	pages += howmany(zones, UMA_SLAB_SPACE /
2099 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2100 
2101 	return (pages);
2102 }
2103 
2104 void
2105 uma_startup(void *mem, int npages)
2106 {
2107 	struct uma_zctor_args args;
2108 	uma_keg_t masterkeg;
2109 	uintptr_t m;
2110 
2111 #ifdef DIAGNOSTIC
2112 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2113 #endif
2114 
2115 	rw_init(&uma_rwlock, "UMA lock");
2116 
2117 	/* Use bootpages memory for the zone of zones and zone of kegs. */
2118 	m = (uintptr_t)mem;
2119 	zones = (uma_zone_t)m;
2120 	m += roundup(zsize, CACHE_LINE_SIZE);
2121 	kegs = (uma_zone_t)m;
2122 	m += roundup(zsize, CACHE_LINE_SIZE);
2123 	masterkeg = (uma_keg_t)m;
2124 	m += roundup(ksize, CACHE_LINE_SIZE);
2125 	m = roundup(m, PAGE_SIZE);
2126 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2127 	mem = (void *)m;
2128 
2129 	/* "manually" create the initial zone */
2130 	memset(&args, 0, sizeof(args));
2131 	args.name = "UMA Kegs";
2132 	args.size = ksize;
2133 	args.ctor = keg_ctor;
2134 	args.dtor = keg_dtor;
2135 	args.uminit = zero_init;
2136 	args.fini = NULL;
2137 	args.keg = masterkeg;
2138 	args.align = UMA_BOOT_ALIGN - 1;
2139 	args.flags = UMA_ZFLAG_INTERNAL;
2140 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2141 
2142 	bootmem = mem;
2143 	boot_pages = npages;
2144 
2145 	args.name = "UMA Zones";
2146 	args.size = zsize;
2147 	args.ctor = zone_ctor;
2148 	args.dtor = zone_dtor;
2149 	args.uminit = zero_init;
2150 	args.fini = NULL;
2151 	args.keg = NULL;
2152 	args.align = UMA_BOOT_ALIGN - 1;
2153 	args.flags = UMA_ZFLAG_INTERNAL;
2154 	zone_ctor(zones, zsize, &args, M_WAITOK);
2155 
2156 	/* Now make a zone for slab headers */
2157 	slabzone = uma_zcreate("UMA Slabs",
2158 				sizeof(struct uma_slab),
2159 				NULL, NULL, NULL, NULL,
2160 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2161 
2162 	hashzone = uma_zcreate("UMA Hash",
2163 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2164 	    NULL, NULL, NULL, NULL,
2165 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2166 
2167 	bucket_init();
2168 
2169 	booted = BOOT_STRAPPED;
2170 }
2171 
2172 void
2173 uma_startup1(void)
2174 {
2175 
2176 #ifdef DIAGNOSTIC
2177 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2178 #endif
2179 	booted = BOOT_PAGEALLOC;
2180 }
2181 
2182 void
2183 uma_startup2(void)
2184 {
2185 
2186 #ifdef DIAGNOSTIC
2187 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2188 #endif
2189 	booted = BOOT_BUCKETS;
2190 	sx_init(&uma_reclaim_lock, "umareclaim");
2191 	bucket_enable();
2192 }
2193 
2194 /*
2195  * Initialize our callout handle.
2196  */
2198 static void
2199 uma_startup3(void)
2200 {
2201 
2202 #ifdef INVARIANTS
2203 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2204 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2205 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2206 #endif
2207 	zone_foreach(zone_alloc_counters);
2208 	callout_init(&uma_callout, 1);
2209 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2210 	booted = BOOT_RUNNING;
2211 }
2212 
2213 static uma_keg_t
2214 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2215 		int align, uint32_t flags)
2216 {
2217 	struct uma_kctor_args args;
2218 
2219 	args.size = size;
2220 	args.uminit = uminit;
2221 	args.fini = fini;
2222 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2223 	args.flags = flags;
2224 	args.zone = zone;
2225 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2226 }
2227 
2228 /* Public functions */
2229 /* See uma.h */
2230 void
2231 uma_set_align(int align)
2232 {
2233 
2234 	if (align != UMA_ALIGN_CACHE)
2235 		uma_align_cache = align;
2236 }
2237 
2238 /* See uma.h */
2239 uma_zone_t
2240 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2241 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2242 
2243 {
2244 	struct uma_zctor_args args;
2245 	uma_zone_t res;
2246 	bool locked;
2247 
2248 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2249 	    align, name));
2250 
2251 	/* Sets all zones to a first-touch domain policy. */
2252 #ifdef UMA_FIRSTTOUCH
2253 	flags |= UMA_ZONE_NUMA;
2254 #endif
2255 
2256 	/* This stuff is essential for the zone ctor */
2257 	memset(&args, 0, sizeof(args));
2258 	args.name = name;
2259 	args.size = size;
2260 	args.ctor = ctor;
2261 	args.dtor = dtor;
2262 	args.uminit = uminit;
2263 	args.fini = fini;
2264 #ifdef  INVARIANTS
2265 	/*
2266 	 * If a zone is being created with an empty constructor and
2267 	 * destructor, pass UMA constructor/destructor which checks for
2268 	 * memory use after free.
2269 	 */
2270 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2271 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2272 		args.ctor = trash_ctor;
2273 		args.dtor = trash_dtor;
2274 		args.uminit = trash_init;
2275 		args.fini = trash_fini;
2276 	}
2277 #endif
2278 	args.align = align;
2279 	args.flags = flags;
2280 	args.keg = NULL;
2281 
2282 	if (booted < BOOT_BUCKETS) {
2283 		locked = false;
2284 	} else {
2285 		sx_slock(&uma_reclaim_lock);
2286 		locked = true;
2287 	}
2288 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2289 	if (locked)
2290 		sx_sunlock(&uma_reclaim_lock);
2291 	return (res);
2292 }
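
/*
 * Usage sketch (illustrative only; "foo_zone" and "struct foo" are
 * hypothetical names, not part of this file).  A typical consumer creates
 * a zone once, allocates and frees items from it, and destroys it when
 * the subsystem is torn down:
 *
 *	static uma_zone_t foo_zone;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree(foo_zone, fp);
 *	uma_zdestroy(foo_zone);
 *
 * uma_zalloc() and uma_zfree() are the udata-less wrappers around
 * uma_zalloc_arg() and uma_zfree_arg() defined in uma.h.
 */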
2293 
2294 /* See uma.h */
2295 uma_zone_t
2296 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2297 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2298 {
2299 	struct uma_zctor_args args;
2300 	uma_keg_t keg;
2301 	uma_zone_t res;
2302 	bool locked;
2303 
2304 	keg = master->uz_keg;
2305 	memset(&args, 0, sizeof(args));
2306 	args.name = name;
2307 	args.size = keg->uk_size;
2308 	args.ctor = ctor;
2309 	args.dtor = dtor;
2310 	args.uminit = zinit;
2311 	args.fini = zfini;
2312 	args.align = keg->uk_align;
2313 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2314 	args.keg = keg;
2315 
2316 	if (booted < BOOT_BUCKETS) {
2317 		locked = false;
2318 	} else {
2319 		sx_slock(&uma_reclaim_lock);
2320 		locked = true;
2321 	}
2322 	/* XXX Attaches only one keg of potentially many. */
2323 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2324 	if (locked)
2325 		sx_sunlock(&uma_reclaim_lock);
2326 	return (res);
2327 }
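
/*
 * Usage sketch (illustrative; "foo_big_zone", "foo_big_ctor" and
 * "foo_big_dtor" are hypothetical).  A secondary zone shares the master
 * zone's keg, so both zones draw items of the same size and alignment
 * from the same slabs while layering their own ctor/dtor on top:
 *
 *	foo_big_zone = uma_zsecond_create("foo_big", foo_big_ctor,
 *	    foo_big_dtor, NULL, NULL, foo_zone);
 */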
2328 
2329 /* See uma.h */
2330 uma_zone_t
2331 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2332 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2333 		    uma_release zrelease, void *arg, int flags)
2334 {
2335 	struct uma_zctor_args args;
2336 
2337 	memset(&args, 0, sizeof(args));
2338 	args.name = name;
2339 	args.size = size;
2340 	args.ctor = ctor;
2341 	args.dtor = dtor;
2342 	args.uminit = zinit;
2343 	args.fini = zfini;
2344 	args.import = zimport;
2345 	args.release = zrelease;
2346 	args.arg = arg;
2347 	args.align = 0;
2348 	args.flags = flags | UMA_ZFLAG_CACHE;
2349 
2350 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2351 }
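
/*
 * Usage sketch (illustrative; "foo_import", "foo_release" and "pool" are
 * hypothetical).  A cache-only zone has no keg; the caller supplies the
 * import and release methods, whose shapes can be read off the uz_import
 * and uz_release call sites in this file:
 *
 *	static int
 *	foo_import(void *arg, void **store, int count, int domain, int flags)
 *	{
 *		...fill store[0..count-1] from the backing pool...
 *		return (filled);
 *	}
 *
 *	static void
 *	foo_release(void *arg, void **store, int count)
 *	{
 *		...return store[0..count-1] to the backing pool...
 *	}
 *
 *	zone = uma_zcache_create("foo cache", size, NULL, NULL, NULL, NULL,
 *	    foo_import, foo_release, pool, 0);
 */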
2352 
2353 /* See uma.h */
2354 void
2355 uma_zdestroy(uma_zone_t zone)
2356 {
2357 
2358 	sx_slock(&uma_reclaim_lock);
2359 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2360 	sx_sunlock(&uma_reclaim_lock);
2361 }
2362 
2363 void
2364 uma_zwait(uma_zone_t zone)
2365 {
2366 	void *item;
2367 
2368 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2369 	uma_zfree(zone, item);
2370 }
2371 
2372 void *
2373 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2374 {
2375 	void *item;
2376 #ifdef SMP
2377 	int i;
2378 
2379 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2380 #endif
2381 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2382 	if (item != NULL && (flags & M_ZERO)) {
2383 #ifdef SMP
2384 		for (i = 0; i <= mp_maxid; i++)
2385 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2386 #else
2387 		bzero(item, zone->uz_size);
2388 #endif
2389 	}
2390 	return (item);
2391 }
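
/*
 * Usage sketch (illustrative, assuming the uma_zalloc_pcpu() and
 * uma_zfree_pcpu() wrappers from uma.h).  For a UMA_ZONE_PCPU zone the
 * returned address is a per-CPU base pointer and each CPU's copy is
 * reached through the zpcpu accessors, as the M_ZERO loop above does:
 *
 *	counters = uma_zalloc_pcpu(pcpu_zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree_pcpu(pcpu_zone, counters);
 */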
2392 
2393 /*
2394  * A stub while both regular and pcpu cases are identical.
2395  */
2396 void
2397 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2398 {
2399 
2400 #ifdef SMP
2401 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2402 #endif
2403 	uma_zfree_arg(zone, item, udata);
2404 }
2405 
2406 /* See uma.h */
2407 void *
2408 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2409 {
2410 	uma_zone_domain_t zdom;
2411 	uma_bucket_t bucket;
2412 	uma_cache_t cache;
2413 	void *item;
2414 	int cpu, domain, lockfail, maxbucket;
2415 #ifdef INVARIANTS
2416 	bool skipdbg;
2417 #endif
2418 
2419 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2420 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2421 
2422 	/* This is the fast path allocation */
2423 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2424 	    curthread, zone->uz_name, zone, flags);
2425 
2426 	if (flags & M_WAITOK) {
2427 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2428 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2429 	}
2430 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2431 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2432 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2433 	if (zone->uz_flags & UMA_ZONE_PCPU)
2434 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2435 		    "with M_ZERO passed"));
2436 
2437 #ifdef DEBUG_MEMGUARD
2438 	if (memguard_cmp_zone(zone)) {
2439 		item = memguard_alloc(zone->uz_size, flags);
2440 		if (item != NULL) {
2441 			if (zone->uz_init != NULL &&
2442 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2443 				return (NULL);
2444 			if (zone->uz_ctor != NULL &&
2445 			    zone->uz_ctor(item, zone->uz_size, udata,
2446 			    flags) != 0) {
2447 			    	zone->uz_fini(item, zone->uz_size);
2448 				return (NULL);
2449 			}
2450 			return (item);
2451 		}
2452 		/* This is unfortunate but should not be fatal. */
2453 	}
2454 #endif
2455 	/*
2456 	 * If possible, allocate from the per-CPU cache.  There are two
2457 	 * requirements for safe access to the per-CPU cache: (1) the thread
2458 	 * accessing the cache must not be preempted or yield during access,
2459 	 * and (2) the thread must not migrate CPUs without switching which
2460 	 * cache it accesses.  We rely on a critical section to prevent
2461 	 * preemption and migration.  We release the critical section in
2462 	 * order to acquire the zone mutex if we are unable to allocate from
2463 	 * the current cache; when we re-acquire the critical section, we
2464 	 * must detect and handle migration if it has occurred.
2465 	 */
2466 zalloc_restart:
2467 	critical_enter();
2468 	cpu = curcpu;
2469 	cache = &zone->uz_cpu[cpu];
2470 
2471 zalloc_start:
2472 	bucket = cache->uc_allocbucket;
2473 	if (bucket != NULL && bucket->ub_cnt > 0) {
2474 		bucket->ub_cnt--;
2475 		item = bucket->ub_bucket[bucket->ub_cnt];
2476 #ifdef INVARIANTS
2477 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2478 #endif
2479 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2480 		cache->uc_allocs++;
2481 		critical_exit();
2482 #ifdef INVARIANTS
2483 		skipdbg = uma_dbg_zskip(zone, item);
2484 #endif
2485 		if (zone->uz_ctor != NULL &&
2486 #ifdef INVARIANTS
2487 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2488 		    zone->uz_dtor != trash_dtor) &&
2489 #endif
2490 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2491 			counter_u64_add(zone->uz_fails, 1);
2492 			zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2493 			return (NULL);
2494 		}
2495 #ifdef INVARIANTS
2496 		if (!skipdbg)
2497 			uma_dbg_alloc(zone, NULL, item);
2498 #endif
2499 		if (flags & M_ZERO)
2500 			uma_zero_item(item, zone);
2501 		return (item);
2502 	}
2503 
2504 	/*
2505 	 * We have run out of items in our alloc bucket.
2506 	 * See if we can switch with our free bucket.
2507 	 */
2508 	bucket = cache->uc_freebucket;
2509 	if (bucket != NULL && bucket->ub_cnt > 0) {
2510 		CTR2(KTR_UMA,
2511 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2512 		    zone->uz_name, zone);
2513 		cache->uc_freebucket = cache->uc_allocbucket;
2514 		cache->uc_allocbucket = bucket;
2515 		goto zalloc_start;
2516 	}
2517 
2518 	/*
2519 	 * Discard any empty allocation bucket while we hold no locks.
2520 	 */
2521 	bucket = cache->uc_allocbucket;
2522 	cache->uc_allocbucket = NULL;
2523 	critical_exit();
2524 	if (bucket != NULL)
2525 		bucket_free(zone, bucket, udata);
2526 
2527 	/* Short-circuit for zones without buckets or with buckets disabled. */
2528 	if (zone->uz_count == 0 || bucketdisable) {
2529 		ZONE_LOCK(zone);
2530 		if (zone->uz_flags & UMA_ZONE_NUMA)
2531 			domain = PCPU_GET(domain);
2532 		else
2533 			domain = UMA_ANYDOMAIN;
2534 		goto zalloc_item;
2535 	}
2536 
2537 	 * The attempt to retrieve the item from the per-CPU cache failed, so
2538 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
2539 	 * we must go back to the zone.  This requires the zone lock, so we
2540 	 * must drop the critical section, then re-acquire it when we go back
2541 	 * to the cache.  Since the critical section is released, we may be
2542 	 * preempted or migrate.  As such, make sure not to maintain any
2543 	 * thread-local state specific to the cache from prior to releasing
2544 	 * the critical section.
2545 	 */
2546 	lockfail = 0;
2547 	if (ZONE_TRYLOCK(zone) == 0) {
2548 		/* Record contention to size the buckets. */
2549 		ZONE_LOCK(zone);
2550 		lockfail = 1;
2551 	}
2552 	critical_enter();
2553 	cpu = curcpu;
2554 	cache = &zone->uz_cpu[cpu];
2555 
2556 	/* See if we lost the race to fill the cache. */
2557 	if (cache->uc_allocbucket != NULL) {
2558 		ZONE_UNLOCK(zone);
2559 		goto zalloc_start;
2560 	}
2561 
2562 	/*
2563 	 * Check the zone's cache of buckets.
2564 	 */
2565 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2566 		domain = PCPU_GET(domain);
2567 		zdom = &zone->uz_domain[domain];
2568 	} else {
2569 		domain = UMA_ANYDOMAIN;
2570 		zdom = &zone->uz_domain[0];
2571 	}
2572 
2573 	if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
2574 		KASSERT(bucket->ub_cnt != 0,
2575 		    ("uma_zalloc_arg: Returning an empty bucket."));
2576 		cache->uc_allocbucket = bucket;
2577 		ZONE_UNLOCK(zone);
2578 		goto zalloc_start;
2579 	}
2580 	/* We are no longer associated with this CPU. */
2581 	critical_exit();
2582 
2583 	/*
2584 	 * We bump the uz count when the cache size is insufficient to
2585 	 * handle the working set.
2586 	 */
2587 	if (lockfail && zone->uz_count < zone->uz_count_max)
2588 		zone->uz_count++;
2589 
2590 	if (zone->uz_max_items > 0) {
2591 		if (zone->uz_items >= zone->uz_max_items)
2592 			goto zalloc_item;
2593 		maxbucket = MIN(zone->uz_count,
2594 		    zone->uz_max_items - zone->uz_items);
2595 		zone->uz_items += maxbucket;
2596 	} else
2597 		maxbucket = zone->uz_count;
2598 	ZONE_UNLOCK(zone);
2599 
2600 	/*
2601 	 * Now let's just fill a bucket and put it on the free list.  If that
2602 	 * works we'll restart the allocation from the beginning and it
2603 	 * will use the just-filled bucket.
2604 	 */
2605 	bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket);
2606 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2607 	    zone->uz_name, zone, bucket);
2608 	ZONE_LOCK(zone);
2609 	if (bucket != NULL) {
2610 		if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) {
2611 			MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt);
2612 			zone->uz_items -= maxbucket - bucket->ub_cnt;
2613 			if (zone->uz_sleepers > 0 &&
2614 			    zone->uz_items < zone->uz_max_items)
2615 				wakeup_one(zone);
2616 		}
2617 		critical_enter();
2618 		cpu = curcpu;
2619 		cache = &zone->uz_cpu[cpu];
2620 
2621 		/*
2622 		 * See if we lost the race or were migrated.  Cache the
2623 		 * initialized bucket to make this less likely or claim
2624 		 * the memory directly.
2625 		 */
2626 		if (cache->uc_allocbucket == NULL &&
2627 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2628 		    domain == PCPU_GET(domain))) {
2629 			cache->uc_allocbucket = bucket;
2630 			zdom->uzd_imax += bucket->ub_cnt;
2631 		} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
2632 			critical_exit();
2633 			ZONE_UNLOCK(zone);
2634 			bucket_drain(zone, bucket);
2635 			bucket_free(zone, bucket, udata);
2636 			goto zalloc_restart;
2637 		} else
2638 			zone_put_bucket(zone, zdom, bucket, false);
2639 		ZONE_UNLOCK(zone);
2640 		goto zalloc_start;
2641 	} else if (zone->uz_max_items > 0) {
2642 		zone->uz_items -= maxbucket;
2643 		if (zone->uz_sleepers > 0 &&
2644 		    zone->uz_items + 1 < zone->uz_max_items)
2645 			wakeup_one(zone);
2646 	}
2647 
2648 	/*
2649 	 * We may not be able to get a bucket so return an actual item.
2650 	 */
2651 zalloc_item:
2652 	item = zone_alloc_item_locked(zone, udata, domain, flags);
2653 
2654 	return (item);
2655 }
2656 
2657 void *
2658 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2659 {
2660 
2661 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2662 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2663 
2664 	/* This is the fast path allocation */
2665 	CTR5(KTR_UMA,
2666 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2667 	    curthread, zone->uz_name, zone, domain, flags);
2668 
2669 	if (flags & M_WAITOK) {
2670 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2671 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2672 	}
2673 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2674 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2675 
2676 	return (zone_alloc_item(zone, udata, domain, flags));
2677 }
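
/*
 * Usage sketch (illustrative).  Callers that know which NUMA domain an
 * object will be used from can bypass the per-CPU caches and request
 * memory from that domain directly:
 *
 *	item = uma_zalloc_domain(foo_zone, NULL, domain, M_WAITOK);
 *
 * Such items are typically released with uma_zfree_domain() below, which
 * hands them straight back to the keg rather than to a per-CPU bucket.
 */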
2678 
2679 /*
2680  * Find a slab with some space.  Prefer slabs that are partially used over
2681  * those that are completely free.  This helps to reduce fragmentation.
2682  *
2683  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2684  * only 'domain'.
2685  */
2686 static uma_slab_t
2687 keg_first_slab(uma_keg_t keg, int domain, bool rr)
2688 {
2689 	uma_domain_t dom;
2690 	uma_slab_t slab;
2691 	int start;
2692 
2693 	KASSERT(domain >= 0 && domain < vm_ndomains,
2694 	    ("keg_first_slab: domain %d out of range", domain));
2695 	KEG_LOCK_ASSERT(keg);
2696 
2697 	slab = NULL;
2698 	start = domain;
2699 	do {
2700 		dom = &keg->uk_domain[domain];
2701 		if (!LIST_EMPTY(&dom->ud_part_slab))
2702 			return (LIST_FIRST(&dom->ud_part_slab));
2703 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2704 			slab = LIST_FIRST(&dom->ud_free_slab);
2705 			LIST_REMOVE(slab, us_link);
2706 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2707 			return (slab);
2708 		}
2709 		if (rr)
2710 			domain = (domain + 1) % vm_ndomains;
2711 	} while (domain != start);
2712 
2713 	return (NULL);
2714 }
2715 
2716 static uma_slab_t
2717 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2718 {
2719 	uint32_t reserve;
2720 
2721 	KEG_LOCK_ASSERT(keg);
2722 
2723 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2724 	if (keg->uk_free <= reserve)
2725 		return (NULL);
2726 	return (keg_first_slab(keg, domain, rr));
2727 }
2728 
2729 static uma_slab_t
2730 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2731 {
2732 	struct vm_domainset_iter di;
2733 	uma_domain_t dom;
2734 	uma_slab_t slab;
2735 	int aflags, domain;
2736 	bool rr;
2737 
2738 restart:
2739 	KEG_LOCK_ASSERT(keg);
2740 
2741 	/*
2742 	 * Use the keg's policy if upper layers haven't already specified a
2743 	 * domain (as happens with first-touch zones).
2744 	 *
2745 	 * To avoid races we run the iterator with the keg lock held, but that
2746 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2747 	 * clear M_WAITOK and handle low memory conditions locally.
2748 	 */
2749 	rr = rdomain == UMA_ANYDOMAIN;
2750 	if (rr) {
2751 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2752 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2753 		    &aflags);
2754 	} else {
2755 		aflags = flags;
2756 		domain = rdomain;
2757 	}
2758 
2759 	for (;;) {
2760 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
2761 		if (slab != NULL) {
2762 			MPASS(slab->us_keg == keg);
2763 			return (slab);
2764 		}
2765 
2766 		/*
2767 		 * M_NOVM means don't ask at all!
2768 		 */
2769 		if (flags & M_NOVM)
2770 			break;
2771 
2772 		KASSERT(zone->uz_max_items == 0 ||
2773 		    zone->uz_items <= zone->uz_max_items,
2774 		    ("%s: zone %p overflow", __func__, zone));
2775 
2776 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
2777 		/*
2778 		 * If we got a slab here it's safe to mark it partially used
2779 		 * and return.  We assume that the caller is going to remove
2780 		 * at least one item.
2781 		 */
2782 		if (slab) {
2783 			MPASS(slab->us_keg == keg);
2784 			dom = &keg->uk_domain[slab->us_domain];
2785 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2786 			return (slab);
2787 		}
2788 		KEG_LOCK(keg);
2789 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2790 			if ((flags & M_WAITOK) != 0) {
2791 				KEG_UNLOCK(keg);
2792 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
2793 				KEG_LOCK(keg);
2794 				goto restart;
2795 			}
2796 			break;
2797 		}
2798 	}
2799 
2800 	/*
2801 	 * We might not have been able to get a slab, but another CPU
2802 	 * could have made one available while we were unlocked.  Check
2803 	 * again before we fail.
2804 	 */
2805 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2806 		MPASS(slab->us_keg == keg);
2807 		return (slab);
2808 	}
2809 	return (NULL);
2810 }
2811 
2812 static uma_slab_t
2813 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2814 {
2815 	uma_slab_t slab;
2816 
2817 	if (keg == NULL) {
2818 		keg = zone->uz_keg;
2819 		KEG_LOCK(keg);
2820 	}
2821 
2822 	for (;;) {
2823 		slab = keg_fetch_slab(keg, zone, domain, flags);
2824 		if (slab)
2825 			return (slab);
2826 		if (flags & (M_NOWAIT | M_NOVM))
2827 			break;
2828 	}
2829 	KEG_UNLOCK(keg);
2830 	return (NULL);
2831 }
2832 
2833 static void *
2834 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2835 {
2836 	uma_domain_t dom;
2837 	void *item;
2838 	uint8_t freei;
2839 
2840 	MPASS(keg == slab->us_keg);
2841 	KEG_LOCK_ASSERT(keg);
2842 
2843 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2844 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2845 	item = slab->us_data + (keg->uk_rsize * freei);
2846 	slab->us_freecount--;
2847 	keg->uk_free--;
2848 
2849 	/* Move this slab to the full list */
2850 	if (slab->us_freecount == 0) {
2851 		LIST_REMOVE(slab, us_link);
2852 		dom = &keg->uk_domain[slab->us_domain];
2853 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2854 	}
2855 
2856 	return (item);
2857 }
2858 
2859 static int
2860 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2861 {
2862 	uma_slab_t slab;
2863 	uma_keg_t keg;
2864 #ifdef NUMA
2865 	int stripe;
2866 #endif
2867 	int i;
2868 
2869 	slab = NULL;
2870 	keg = NULL;
2871 	/* Try to keep the buckets totally full */
2872 	for (i = 0; i < max; ) {
2873 		if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL)
2874 			break;
2875 		keg = slab->us_keg;
2876 #ifdef NUMA
2877 		stripe = howmany(max, vm_ndomains);
2878 #endif
2879 		while (slab->us_freecount && i < max) {
2880 			bucket[i++] = slab_alloc_item(keg, slab);
2881 			if (keg->uk_free <= keg->uk_reserve)
2882 				break;
2883 #ifdef NUMA
2884 			/*
2885 			 * If the zone is striped we pick a new slab for every
2886 			 * N allocations.  Eliminating this conditional will
2887 			 * instead pick a new domain for each bucket rather
2888 			 * than stripe within each bucket.  The current option
2889 			 * produces more fragmentation and requires more cpu
2890 			 * time but yields better distribution.
2891 			 */
2892 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2893 			    vm_ndomains > 1 && --stripe == 0)
2894 				break;
2895 #endif
2896 		}
2897 		/* Don't block if we allocated any successfully. */
2898 		flags &= ~M_WAITOK;
2899 		flags |= M_NOWAIT;
2900 	}
2901 	if (slab != NULL)
2902 		KEG_UNLOCK(keg);
2903 
2904 	return (i);
2905 }
2906 
2907 static uma_bucket_t
2908 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
2909 {
2910 	uma_bucket_t bucket;
2911 
2912 	CTR1(KTR_UMA, "zone_alloc_bucket: domain %d", domain);
2913 
2914 	/* Avoid allocs targeting empty domains. */
2915 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
2916 		domain = UMA_ANYDOMAIN;
2917 
2918 	/* Don't wait for buckets, preserve caller's NOVM setting. */
2919 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2920 	if (bucket == NULL)
2921 		return (NULL);
2922 
2923 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2924 	    MIN(max, bucket->ub_entries), domain, flags);
2925 
2926 	/*
2927 	 * Initialize the memory if necessary.
2928 	 */
2929 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2930 		int i;
2931 
2932 		for (i = 0; i < bucket->ub_cnt; i++)
2933 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2934 			    flags) != 0)
2935 				break;
2936 		/*
2937 		 * If we couldn't initialize the whole bucket, put the
2938 		 * rest back onto the freelist.
2939 		 */
2940 		if (i != bucket->ub_cnt) {
2941 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2942 			    bucket->ub_cnt - i);
2943 #ifdef INVARIANTS
2944 			bzero(&bucket->ub_bucket[i],
2945 			    sizeof(void *) * (bucket->ub_cnt - i));
2946 #endif
2947 			bucket->ub_cnt = i;
2948 		}
2949 	}
2950 
2951 	if (bucket->ub_cnt == 0) {
2952 		bucket_free(zone, bucket, udata);
2953 		counter_u64_add(zone->uz_fails, 1);
2954 		return (NULL);
2955 	}
2956 
2957 	return (bucket);
2958 }
2959 
2960 /*
2961  * Allocates a single item from a zone.
2962  *
2963  * Arguments
2964  *	zone   The zone to alloc for.
2965  *	udata  The data to be passed to the constructor.
2966  *	domain The domain to allocate from or UMA_ANYDOMAIN.
2967  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2968  *
2969  * Returns
2970  *	NULL if there is no memory and M_NOWAIT is set
2971  *	An item if successful
2972  */
2973 
2974 static void *
2975 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
2976 {
2977 
2978 	ZONE_LOCK(zone);
2979 	return (zone_alloc_item_locked(zone, udata, domain, flags));
2980 }
2981 
2982 /*
2983  * Returns with zone unlocked.
2984  */
2985 static void *
2986 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
2987 {
2988 	void *item;
2989 #ifdef INVARIANTS
2990 	bool skipdbg;
2991 #endif
2992 
2993 	ZONE_LOCK_ASSERT(zone);
2994 
2995 	if (zone->uz_max_items > 0) {
2996 		if (zone->uz_items >= zone->uz_max_items) {
2997 			zone_log_warning(zone);
2998 			zone_maxaction(zone);
2999 			if (flags & M_NOWAIT) {
3000 				ZONE_UNLOCK(zone);
3001 				return (NULL);
3002 			}
3003 			zone->uz_sleeps++;
3004 			zone->uz_sleepers++;
3005 			while (zone->uz_items >= zone->uz_max_items)
3006 				mtx_sleep(zone, zone->uz_lockptr, PVM,
3007 				    "zonelimit", 0);
3008 			zone->uz_sleepers--;
3009 			if (zone->uz_sleepers > 0 &&
3010 			    zone->uz_items + 1 < zone->uz_max_items)
3011 				wakeup_one(zone);
3012 		}
3013 		zone->uz_items++;
3014 	}
3015 	ZONE_UNLOCK(zone);
3016 
3017 	/* Avoid allocs targeting empty domains. */
3018 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3019 		domain = UMA_ANYDOMAIN;
3020 
3021 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3022 		goto fail;
3023 
3024 #ifdef INVARIANTS
3025 	skipdbg = uma_dbg_zskip(zone, item);
3026 #endif
3027 	/*
3028 	 * We have to call both the zone's init (not the keg's init)
3029 	 * and the zone's ctor.  This is because the item is going from
3030 	 * a keg slab directly to the user, and the user is expecting it
3031 	 * to be both zone-init'd as well as zone-ctor'd.
3032 	 */
3033 	if (zone->uz_init != NULL) {
3034 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3035 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3036 			goto fail;
3037 		}
3038 	}
3039 	if (zone->uz_ctor != NULL &&
3040 #ifdef INVARIANTS
3041 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
3042 	    zone->uz_dtor != trash_dtor) &&
3043 #endif
3044 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3045 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
3046 		goto fail;
3047 	}
3048 #ifdef INVARIANTS
3049 	if (!skipdbg)
3050 		uma_dbg_alloc(zone, NULL, item);
3051 #endif
3052 	if (flags & M_ZERO)
3053 		uma_zero_item(item, zone);
3054 
3055 	counter_u64_add(zone->uz_allocs, 1);
3056 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3057 	    zone->uz_name, zone);
3058 
3059 	return (item);
3060 
3061 fail:
3062 	if (zone->uz_max_items > 0) {
3063 		ZONE_LOCK(zone);
3064 		zone->uz_items--;
3065 		ZONE_UNLOCK(zone);
3066 	}
3067 	counter_u64_add(zone->uz_fails, 1);
3068 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3069 	    zone->uz_name, zone);
3070 	return (NULL);
3071 }
3072 
3073 /* See uma.h */
3074 void
3075 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3076 {
3077 	uma_cache_t cache;
3078 	uma_bucket_t bucket;
3079 	uma_zone_domain_t zdom;
3080 	int cpu, domain;
3081 #ifdef UMA_XDOMAIN
3082 	int itemdomain;
3083 #endif
3084 	bool lockfail;
3085 #ifdef INVARIANTS
3086 	bool skipdbg;
3087 #endif
3088 
3089 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3090 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3091 
3092 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3093 	    zone->uz_name);
3094 
3095 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3096 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3097 
3098         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3099         if (item == NULL)
3100                 return;
3101 #ifdef DEBUG_MEMGUARD
3102 	if (is_memguard_addr(item)) {
3103 		if (zone->uz_dtor != NULL)
3104 			zone->uz_dtor(item, zone->uz_size, udata);
3105 		if (zone->uz_fini != NULL)
3106 			zone->uz_fini(item, zone->uz_size);
3107 		memguard_free(item);
3108 		return;
3109 	}
3110 #endif
3111 #ifdef INVARIANTS
3112 	skipdbg = uma_dbg_zskip(zone, item);
3113 	if (skipdbg == false) {
3114 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3115 			uma_dbg_free(zone, udata, item);
3116 		else
3117 			uma_dbg_free(zone, NULL, item);
3118 	}
3119 	if (zone->uz_dtor != NULL && (!skipdbg ||
3120 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3121 #else
3122 	if (zone->uz_dtor != NULL)
3123 #endif
3124 		zone->uz_dtor(item, zone->uz_size, udata);
3125 
3126 	/*
3127 	 * The race here is acceptable.  If we miss it we'll just have to wait
3128 	 * a little longer for the limits to be reset.
3129 	 */
3130 	if (zone->uz_sleepers > 0)
3131 		goto zfree_item;
3132 
3133 #ifdef UMA_XDOMAIN
3134 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3135 		itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
3136 #endif
3137 
3138 	/*
3139 	 * If possible, free to the per-CPU cache.  There are two
3140 	 * requirements for safe access to the per-CPU cache: (1) the thread
3141 	 * accessing the cache must not be preempted or yield during access,
3142 	 * and (2) the thread must not migrate CPUs without switching which
3143 	 * cache it accesses.  We rely on a critical section to prevent
3144 	 * preemption and migration.  We release the critical section in
3145 	 * order to acquire the zone mutex if we are unable to free to the
3146 	 * current cache; when we re-acquire the critical section, we must
3147 	 * detect and handle migration if it has occurred.
3148 	 */
3149 zfree_restart:
3150 	critical_enter();
3151 	cpu = curcpu;
3152 	cache = &zone->uz_cpu[cpu];
3153 
3154 zfree_start:
3155 	domain = PCPU_GET(domain);
3156 #ifdef UMA_XDOMAIN
3157 	if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
3158 		itemdomain = domain;
3159 #endif
3160 	/*
3161 	 * Try to free into the allocbucket first to give LIFO ordering
3162 	 * for cache-hot data structures.  Spill over into the freebucket
3163 	 * if necessary.  Alloc will swap them if one runs dry.
3164 	 */
3165 #ifdef UMA_XDOMAIN
3166 	if (domain != itemdomain) {
3167 		bucket = cache->uc_crossbucket;
3168 	} else
3169 #endif
3170 	{
3171 		bucket = cache->uc_allocbucket;
3172 		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3173 			bucket = cache->uc_freebucket;
3174 	}
3175 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3176 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3177 		    ("uma_zfree: Freeing to non free bucket index."));
3178 		bucket->ub_bucket[bucket->ub_cnt] = item;
3179 		bucket->ub_cnt++;
3180 		cache->uc_frees++;
3181 		critical_exit();
3182 		return;
3183 	}
3184 
3185 	/*
3186 	 * We must go back to the zone, which requires acquiring the zone lock,
3187 	 * which in turn means we must release and re-acquire the critical
3188 	 * section.  Since the critical section is released, we may be
3189 	 * preempted or migrate.  As such, make sure not to maintain any
3190 	 * thread-local state specific to the cache from prior to releasing
3191 	 * the critical section.
3192 	 */
3193 	critical_exit();
3194 	if (zone->uz_count == 0 || bucketdisable)
3195 		goto zfree_item;
3196 
3197 	lockfail = false;
3198 	if (ZONE_TRYLOCK(zone) == 0) {
3199 		/* Record contention to size the buckets. */
3200 		ZONE_LOCK(zone);
3201 		lockfail = true;
3202 	}
3203 	critical_enter();
3204 	cpu = curcpu;
3205 	domain = PCPU_GET(domain);
3206 	cache = &zone->uz_cpu[cpu];
3207 
3208 #ifdef UMA_XDOMAIN
3209 	if (domain != itemdomain)
3210 		bucket = cache->uc_crossbucket;
3211 	else
3212 #endif
3213 		bucket = cache->uc_freebucket;
3214 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3215 		ZONE_UNLOCK(zone);
3216 		goto zfree_start;
3217 	}
3218 #ifdef UMA_XDOMAIN
3219 	if (domain != itemdomain)
3220 		cache->uc_crossbucket = NULL;
3221 	else
3222 #endif
3223 		cache->uc_freebucket = NULL;
3224 	/* We are no longer associated with this CPU. */
3225 	critical_exit();
3226 
3227 #ifdef UMA_XDOMAIN
3228 	if (domain != itemdomain) {
3229 		if (bucket != NULL) {
3230 			zone->uz_xdomain += bucket->ub_cnt;
3231 			if (vm_ndomains > 2 ||
3232 			    zone->uz_bkt_count >= zone->uz_bkt_max) {
3233 				ZONE_UNLOCK(zone);
3234 				bucket_drain(zone, bucket);
3235 				bucket_free(zone, bucket, udata);
3236 			} else {
3237 				zdom = &zone->uz_domain[itemdomain];
3238 				zone_put_bucket(zone, zdom, bucket, true);
3239 				ZONE_UNLOCK(zone);
3240 			}
3241 		} else
3242 			ZONE_UNLOCK(zone);
3243 		bucket = bucket_alloc(zone, udata, M_NOWAIT);
3244 		if (bucket == NULL)
3245 			goto zfree_item;
3246 		critical_enter();
3247 		cpu = curcpu;
3248 		cache = &zone->uz_cpu[cpu];
3249 		if (cache->uc_crossbucket == NULL) {
3250 			cache->uc_crossbucket = bucket;
3251 			goto zfree_start;
3252 		}
3253 		critical_exit();
3254 		bucket_free(zone, bucket, udata);
3255 		goto zfree_restart;
3256 	}
3257 #endif
3258 
3259 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3260 		zdom = &zone->uz_domain[domain];
3261 	} else {
3262 		domain = 0;
3263 		zdom = &zone->uz_domain[0];
3264 	}
3265 
3266 	/* Can we throw this on the zone full list? */
3267 	if (bucket != NULL) {
3268 		CTR3(KTR_UMA,
3269 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3270 		    zone->uz_name, zone, bucket);
3271 		/* ub_cnt is pointing to the last free item */
3272 		KASSERT(bucket->ub_cnt == bucket->ub_entries,
3273 		    ("uma_zfree: Attempting to insert a partially full bucket onto the full list."));
3274 		if (zone->uz_bkt_count >= zone->uz_bkt_max) {
3275 			ZONE_UNLOCK(zone);
3276 			bucket_drain(zone, bucket);
3277 			bucket_free(zone, bucket, udata);
3278 			goto zfree_restart;
3279 		} else
3280 			zone_put_bucket(zone, zdom, bucket, true);
3281 	}
3282 
3283 	/*
3284 	 * We bump the uz count when the cache size is insufficient to
3285 	 * handle the working set.
3286 	 */
3287 	if (lockfail && zone->uz_count < zone->uz_count_max)
3288 		zone->uz_count++;
3289 	ZONE_UNLOCK(zone);
3290 
3291 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3292 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3293 	    zone->uz_name, zone, bucket);
3294 	if (bucket) {
3295 		critical_enter();
3296 		cpu = curcpu;
3297 		cache = &zone->uz_cpu[cpu];
3298 		if (cache->uc_freebucket == NULL &&
3299 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3300 		    domain == PCPU_GET(domain))) {
3301 			cache->uc_freebucket = bucket;
3302 			goto zfree_start;
3303 		}
3304 		/*
3305 		 * We lost the race, start over.  We have to drop our
3306 		 * critical section to free the bucket.
3307 		 */
3308 		critical_exit();
3309 		bucket_free(zone, bucket, udata);
3310 		goto zfree_restart;
3311 	}
3312 
3313 	/*
3314 	 * If nothing else caught this, we'll just do an internal free.
3315 	 */
3316 zfree_item:
3317 	zone_free_item(zone, item, udata, SKIP_DTOR);
3318 }
3319 
3320 void
3321 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3322 {
3323 
3324 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3325 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3326 
3327 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3328 	    zone->uz_name);
3329 
3330 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3331 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3332 
3333         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3334         if (item == NULL)
3335                 return;
3336 	zone_free_item(zone, item, udata, SKIP_NONE);
3337 }
3338 
3339 static void
3340 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
3341 {
3342 	uma_keg_t keg;
3343 	uma_domain_t dom;
3344 	uint8_t freei;
3345 
3346 	keg = zone->uz_keg;
3347 	MPASS(zone->uz_lockptr == &keg->uk_lock);
3348 	KEG_LOCK_ASSERT(keg);
3349 	MPASS(keg == slab->us_keg);
3350 
3351 	dom = &keg->uk_domain[slab->us_domain];
3352 
3353 	/* Do we need to remove from any lists? */
3354 	if (slab->us_freecount+1 == keg->uk_ipers) {
3355 		LIST_REMOVE(slab, us_link);
3356 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3357 	} else if (slab->us_freecount == 0) {
3358 		LIST_REMOVE(slab, us_link);
3359 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3360 	}
3361 
3362 	/* Slab management. */
3363 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3364 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3365 	slab->us_freecount++;
3366 
3367 	/* Keg statistics. */
3368 	keg->uk_free++;
3369 }
3370 
3371 static void
3372 zone_release(uma_zone_t zone, void **bucket, int cnt)
3373 {
3374 	void *item;
3375 	uma_slab_t slab;
3376 	uma_keg_t keg;
3377 	uint8_t *mem;
3378 	int i;
3379 
3380 	keg = zone->uz_keg;
3381 	KEG_LOCK(keg);
3382 	for (i = 0; i < cnt; i++) {
3383 		item = bucket[i];
3384 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3385 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3386 			if (zone->uz_flags & UMA_ZONE_HASH) {
3387 				slab = hash_sfind(&keg->uk_hash, mem);
3388 			} else {
3389 				mem += keg->uk_pgoff;
3390 				slab = (uma_slab_t)mem;
3391 			}
3392 		} else {
3393 			slab = vtoslab((vm_offset_t)item);
3394 			MPASS(slab->us_keg == keg);
3395 		}
3396 		slab_free_item(zone, slab, item);
3397 	}
3398 	KEG_UNLOCK(keg);
3399 }
3400 
3401 /*
3402  * Frees a single item to any zone.
3403  *
3404  * Arguments:
3405  *	zone   The zone to free to
3406  *	item   The item we're freeing
3407  *	udata  User supplied data for the dtor
3408  *	skip   Skip dtors and finis
3409  */
3410 static void
3411 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3412 {
3413 #ifdef INVARIANTS
3414 	bool skipdbg;
3415 
3416 	skipdbg = uma_dbg_zskip(zone, item);
3417 	if (skip == SKIP_NONE && !skipdbg) {
3418 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3419 			uma_dbg_free(zone, udata, item);
3420 		else
3421 			uma_dbg_free(zone, NULL, item);
3422 	}
3423 
3424 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3425 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3426 	    zone->uz_ctor != trash_ctor))
3427 #else
3428 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3429 #endif
3430 		zone->uz_dtor(item, zone->uz_size, udata);
3431 
3432 	if (skip < SKIP_FINI && zone->uz_fini)
3433 		zone->uz_fini(item, zone->uz_size);
3434 
3435 	zone->uz_release(zone->uz_arg, &item, 1);
3436 
3437 	if (skip & SKIP_CNT)
3438 		return;
3439 
3440 	counter_u64_add(zone->uz_frees, 1);
3441 
3442 	if (zone->uz_max_items > 0) {
3443 		ZONE_LOCK(zone);
3444 		zone->uz_items--;
3445 		if (zone->uz_sleepers > 0 &&
3446 		    zone->uz_items < zone->uz_max_items)
3447 			wakeup_one(zone);
3448 		ZONE_UNLOCK(zone);
3449 	}
3450 }
3451 
3452 /* See uma.h */
3453 int
3454 uma_zone_set_max(uma_zone_t zone, int nitems)
3455 {
3456 	struct uma_bucket_zone *ubz;
3457 
3458 	/*
3459 	 * If the limit is very low we may need to limit how
3460 	 * many items are allowed in CPU caches.
3461 	 */
3462 	ubz = &bucket_zones[0];
3463 	for (; ubz->ubz_entries != 0; ubz++)
3464 		if (ubz->ubz_entries * 2 * mp_ncpus > nitems)
3465 			break;
3466 	if (ubz == &bucket_zones[0])
3467 		nitems = ubz->ubz_entries * 2 * mp_ncpus;
3468 	else
3469 		ubz--;
3470 
3471 	ZONE_LOCK(zone);
3472 	zone->uz_count_max = zone->uz_count = ubz->ubz_entries;
3473 	if (zone->uz_count_min > zone->uz_count_max)
3474 		zone->uz_count_min = zone->uz_count_max;
3475 	zone->uz_max_items = nitems;
3476 	ZONE_UNLOCK(zone);
3477 
3478 	return (nitems);
3479 }
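
/*
 * Usage sketch (illustrative).  Very small caps are rounded up so that
 * the per-CPU buckets cannot hide the entire limit, so callers should
 * treat the return value as the effective limit:
 *
 *	effective = uma_zone_set_max(foo_zone, 1000);
 */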
3480 
3481 /* See uma.h */
3482 int
3483 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
3484 {
3485 
3486 	ZONE_LOCK(zone);
3487 	zone->uz_bkt_max = nitems;
3488 	ZONE_UNLOCK(zone);
3489 
3490 	return (nitems);
3491 }
3492 
3493 /* See uma.h */
3494 int
3495 uma_zone_get_max(uma_zone_t zone)
3496 {
3497 	int nitems;
3498 
3499 	ZONE_LOCK(zone);
3500 	nitems = zone->uz_max_items;
3501 	ZONE_UNLOCK(zone);
3502 
3503 	return (nitems);
3504 }
3505 
3506 /* See uma.h */
3507 void
3508 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3509 {
3510 
3511 	ZONE_LOCK(zone);
3512 	zone->uz_warning = warning;
3513 	ZONE_UNLOCK(zone);
3514 }
3515 
3516 /* See uma.h */
3517 void
3518 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3519 {
3520 
3521 	ZONE_LOCK(zone);
3522 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3523 	ZONE_UNLOCK(zone);
3524 }
3525 
3526 /* See uma.h */
3527 int
3528 uma_zone_get_cur(uma_zone_t zone)
3529 {
3530 	int64_t nitems;
3531 	u_int i;
3532 
3533 	ZONE_LOCK(zone);
3534 	nitems = counter_u64_fetch(zone->uz_allocs) -
3535 	    counter_u64_fetch(zone->uz_frees);
3536 	CPU_FOREACH(i) {
3537 		/*
3538 		 * See the comment in uma_vm_zone_stats() regarding the
3539 		 * safety of accessing the per-cpu caches. With the zone lock
3540 		 * held, it is safe, but can potentially result in stale data.
3541 		 */
3542 		nitems += zone->uz_cpu[i].uc_allocs -
3543 		    zone->uz_cpu[i].uc_frees;
3544 	}
3545 	ZONE_UNLOCK(zone);
3546 
3547 	return (nitems < 0 ? 0 : nitems);
3548 }
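
/*
 * Usage sketch (illustrative).  The returned value folds in the
 * unsynchronized per-CPU counters, so it is an approximation best
 * suited to reporting:
 *
 *	printf("foo zone: %d items in use\n", uma_zone_get_cur(foo_zone));
 */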
3549 
3550 /* See uma.h */
3551 void
3552 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3553 {
3554 	uma_keg_t keg;
3555 
3556 	KEG_GET(zone, keg);
3557 	KEG_LOCK(keg);
3558 	KASSERT(keg->uk_pages == 0,
3559 	    ("uma_zone_set_init on non-empty keg"));
3560 	keg->uk_init = uminit;
3561 	KEG_UNLOCK(keg);
3562 }
3563 
3564 /* See uma.h */
3565 void
3566 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3567 {
3568 	uma_keg_t keg;
3569 
3570 	KEG_GET(zone, keg);
3571 	KEG_LOCK(keg);
3572 	KASSERT(keg->uk_pages == 0,
3573 	    ("uma_zone_set_fini on non-empty keg"));
3574 	keg->uk_fini = fini;
3575 	KEG_UNLOCK(keg);
3576 }
3577 
3578 /* See uma.h */
3579 void
3580 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3581 {
3582 
3583 	ZONE_LOCK(zone);
3584 	KASSERT(zone->uz_keg->uk_pages == 0,
3585 	    ("uma_zone_set_zinit on non-empty keg"));
3586 	zone->uz_init = zinit;
3587 	ZONE_UNLOCK(zone);
3588 }
3589 
3590 /* See uma.h */
3591 void
3592 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3593 {
3594 
3595 	ZONE_LOCK(zone);
3596 	KASSERT(zone->uz_keg->uk_pages == 0,
3597 	    ("uma_zone_set_zfini on non-empty keg"));
3598 	zone->uz_fini = zfini;
3599 	ZONE_UNLOCK(zone);
3600 }
3601 
3602 /* See uma.h */
3603 /* XXX uk_freef is not actually used with the zone locked */
3604 void
3605 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3606 {
3607 	uma_keg_t keg;
3608 
3609 	KEG_GET(zone, keg);
3610 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3611 	KEG_LOCK(keg);
3612 	keg->uk_freef = freef;
3613 	KEG_UNLOCK(keg);
3614 }
3615 
3616 /* See uma.h */
3617 /* XXX uk_allocf is not actually used with the zone locked */
3618 void
3619 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3620 {
3621 	uma_keg_t keg;
3622 
3623 	KEG_GET(zone, keg);
3624 	KEG_LOCK(keg);
3625 	keg->uk_allocf = allocf;
3626 	KEG_UNLOCK(keg);
3627 }
3628 
3629 /* See uma.h */
3630 void
3631 uma_zone_reserve(uma_zone_t zone, int items)
3632 {
3633 	uma_keg_t keg;
3634 
3635 	KEG_GET(zone, keg);
3636 	KEG_LOCK(keg);
3637 	keg->uk_reserve = items;
3638 	KEG_UNLOCK(keg);
3639 }
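
/*
 * Usage sketch (illustrative).  A reservation keeps the last "items" free
 * items in the keg for callers that pass M_USE_RESERVE; ordinary
 * allocations are refused by keg_fetch_free_slab() above once only the
 * reserve remains.  It is usually paired with uma_prealloc():
 *
 *	uma_zone_reserve(foo_zone, 32);
 *	...
 *	fp = uma_zalloc(foo_zone, M_NOWAIT | M_USE_RESERVE);
 */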
3640 
3641 /* See uma.h */
3642 int
3643 uma_zone_reserve_kva(uma_zone_t zone, int count)
3644 {
3645 	uma_keg_t keg;
3646 	vm_offset_t kva;
3647 	u_int pages;
3648 
3649 	KEG_GET(zone, keg);
3650 
3651 	pages = count / keg->uk_ipers;
3652 	if (pages * keg->uk_ipers < count)
3653 		pages++;
3654 	pages *= keg->uk_ppera;
3655 
3656 #ifdef UMA_MD_SMALL_ALLOC
3657 	if (keg->uk_ppera > 1) {
3658 #else
3659 	if (1) {
3660 #endif
3661 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3662 		if (kva == 0)
3663 			return (0);
3664 	} else
3665 		kva = 0;
3666 
3667 	ZONE_LOCK(zone);
3668 	MPASS(keg->uk_kva == 0);
3669 	keg->uk_kva = kva;
3670 	keg->uk_offset = 0;
3671 	zone->uz_max_items = pages * keg->uk_ipers;
3672 #ifdef UMA_MD_SMALL_ALLOC
3673 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3674 #else
3675 	keg->uk_allocf = noobj_alloc;
3676 #endif
3677 	keg->uk_flags |= UMA_ZONE_NOFREE;
3678 	ZONE_UNLOCK(zone);
3679 
3680 	return (1);
3681 }
3682 
3683 /* See uma.h */
3684 void
3685 uma_prealloc(uma_zone_t zone, int items)
3686 {
3687 	struct vm_domainset_iter di;
3688 	uma_domain_t dom;
3689 	uma_slab_t slab;
3690 	uma_keg_t keg;
3691 	int aflags, domain, slabs;
3692 
3693 	KEG_GET(zone, keg);
3694 	KEG_LOCK(keg);
3695 	slabs = items / keg->uk_ipers;
3696 	if (slabs * keg->uk_ipers < items)
3697 		slabs++;
3698 	while (slabs-- > 0) {
3699 		aflags = M_NOWAIT;
3700 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3701 		    &aflags);
3702 		for (;;) {
3703 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
3704 			    aflags);
3705 			if (slab != NULL) {
3706 				MPASS(slab->us_keg == keg);
3707 				dom = &keg->uk_domain[slab->us_domain];
3708 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
3709 				    us_link);
3710 				break;
3711 			}
3712 			KEG_LOCK(keg);
3713 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
3714 				KEG_UNLOCK(keg);
3715 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3716 				KEG_LOCK(keg);
3717 			}
3718 		}
3719 	}
3720 	KEG_UNLOCK(keg);
3721 }
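
/*
 * Usage sketch (illustrative).  Pre-allocating slabs up front lets a
 * subsystem avoid allocation failures later in contexts that cannot
 * sleep, and is commonly combined with a reservation:
 *
 *	uma_zone_reserve(foo_zone, 32);
 *	uma_prealloc(foo_zone, 32);
 */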
3722 
3723 /* See uma.h */
3724 void
3725 uma_reclaim(int req)
3726 {
3727 
3728 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3729 	sx_xlock(&uma_reclaim_lock);
3730 	bucket_enable();
3731 
3732 	switch (req) {
3733 	case UMA_RECLAIM_TRIM:
3734 		zone_foreach(zone_trim);
3735 		break;
3736 	case UMA_RECLAIM_DRAIN:
3737 	case UMA_RECLAIM_DRAIN_CPU:
3738 		zone_foreach(zone_drain);
3739 		if (req == UMA_RECLAIM_DRAIN_CPU) {
3740 			pcpu_cache_drain_safe(NULL);
3741 			zone_foreach(zone_drain);
3742 		}
3743 		break;
3744 	default:
3745 		panic("unhandled reclamation request %d", req);
3746 	}
3747 
3748 	/*
3749 	 * Some slabs may have been freed after this zone was visited
3750 	 * early in the loop above; visit it again to free pages that become
3751 	 * empty once the other zones are drained.  Do the same for buckets.
3752 	 */
3753 	zone_drain(slabzone);
3754 	bucket_zone_drain();
3755 	sx_xunlock(&uma_reclaim_lock);
3756 }
3757 
3758 static volatile int uma_reclaim_needed;
3759 
3760 void
3761 uma_reclaim_wakeup(void)
3762 {
3763 
3764 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3765 		wakeup(uma_reclaim);
3766 }
3767 
3768 void
3769 uma_reclaim_worker(void *arg __unused)
3770 {
3771 
3772 	for (;;) {
3773 		sx_xlock(&uma_reclaim_lock);
3774 		while (atomic_load_int(&uma_reclaim_needed) == 0)
3775 			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
3776 			    hz);
3777 		sx_xunlock(&uma_reclaim_lock);
3778 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3779 		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
3780 		atomic_store_int(&uma_reclaim_needed, 0);
3781 		/* Don't fire more than once per-second. */
3782 		/* Don't fire more than once per second. */
3783 	}
3784 }
3785 
3786 /* See uma.h */
3787 void
3788 uma_zone_reclaim(uma_zone_t zone, int req)
3789 {
3790 
3791 	switch (req) {
3792 	case UMA_RECLAIM_TRIM:
3793 		zone_trim(zone);
3794 		break;
3795 	case UMA_RECLAIM_DRAIN:
3796 		zone_drain(zone);
3797 		break;
3798 	case UMA_RECLAIM_DRAIN_CPU:
3799 		pcpu_cache_drain_safe(zone);
3800 		zone_drain(zone);
3801 		break;
3802 	default:
3803 		panic("unhandled reclamation request %d", req);
3804 	}
3805 }
3806 
3807 /* See uma.h */
3808 int
3809 uma_zone_exhausted(uma_zone_t zone)
3810 {
3811 	int full;
3812 
3813 	ZONE_LOCK(zone);
3814 	full = zone->uz_sleepers > 0;
3815 	ZONE_UNLOCK(zone);
3816 	return (full);
3817 }
3818 
3819 int
3820 uma_zone_exhausted_nolock(uma_zone_t zone)
3821 {
3822 	return (zone->uz_sleepers > 0);
3823 }
3824 
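/*
 * Large (multi-page) allocations, as issued by malloc(9) for requests too
 * big for a fixed-size zone, are not backed by a keg.  A slab header is
 * instead allocated from slabzone to record the mapping's address, size
 * and domain so that uma_large_free() can undo it later.
 */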
3825 void *
3826 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3827 {
3828 	struct domainset *policy;
3829 	vm_offset_t addr;
3830 	uma_slab_t slab;
3831 
3832 	if (domain != UMA_ANYDOMAIN) {
3833 		/* avoid allocs targeting empty domains */
3834 		if (VM_DOMAIN_EMPTY(domain))
3835 			domain = UMA_ANYDOMAIN;
3836 	}
3837 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3838 	if (slab == NULL)
3839 		return (NULL);
3840 	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3841 	    DOMAINSET_FIXED(domain);
3842 	addr = kmem_malloc_domainset(policy, size, wait);
3843 	if (addr != 0) {
3844 		vsetslab(addr, slab);
3845 		slab->us_data = (void *)addr;
3846 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3847 		slab->us_size = size;
3848 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3849 		    pmap_kextract(addr)));
3850 		uma_total_inc(size);
3851 	} else {
3852 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3853 	}
3854 
3855 	return ((void *)addr);
3856 }
3857 
3858 void *
3859 uma_large_malloc(vm_size_t size, int wait)
3860 {
3861 
3862 	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3863 }
3864 
3865 void
3866 uma_large_free(uma_slab_t slab)
3867 {
3868 
3869 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3870 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3871 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3872 	uma_total_dec(slab->us_size);
3873 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3874 }
3875 
3876 static void
3877 uma_zero_item(void *item, uma_zone_t zone)
3878 {
3879 
3880 	bzero(item, zone->uz_size);
3881 }
3882 
3883 unsigned long
3884 uma_limit(void)
3885 {
3886 
3887 	return (uma_kmem_limit);
3888 }
3889 
3890 void
3891 uma_set_limit(unsigned long limit)
3892 {
3893 
3894 	uma_kmem_limit = limit;
3895 }
3896 
3897 unsigned long
3898 uma_size(void)
3899 {
3900 
3901 	return (atomic_load_long(&uma_kmem_total));
3902 }
3903 
3904 long
3905 uma_avail(void)
3906 {
3907 
3908 	return (uma_kmem_limit - uma_size());
3909 }
3910 
3911 void
3912 uma_print_stats(void)
3913 {
3914 	zone_foreach(uma_print_zone);
3915 }
3916 
3917 static void
3918 slab_print(uma_slab_t slab)
3919 {
3920 	printf("slab: keg %p, data %p, freecount %d\n",
3921 		slab->us_keg, slab->us_data, slab->us_freecount);
3922 }
3923 
3924 static void
3925 cache_print(uma_cache_t cache)
3926 {
3927 	printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)\n",
3928 		cache->uc_allocbucket,
3929 		cache->uc_allocbucket ? cache->uc_allocbucket->ub_cnt : 0,
3930 		cache->uc_freebucket,
3931 		cache->uc_freebucket ? cache->uc_freebucket->ub_cnt : 0,
3932 		cache->uc_crossbucket,
3933 		cache->uc_crossbucket ? cache->uc_crossbucket->ub_cnt : 0);
3934 }
3935 
3936 static void
3937 uma_print_keg(uma_keg_t keg)
3938 {
3939 	uma_domain_t dom;
3940 	uma_slab_t slab;
3941 	int i;
3942 
3943 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3944 	    "out %d free %d\n",
3945 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3946 	    keg->uk_ipers, keg->uk_ppera,
3947 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3948 	    keg->uk_free);
3949 	for (i = 0; i < vm_ndomains; i++) {
3950 		dom = &keg->uk_domain[i];
3951 		printf("Part slabs:\n");
3952 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3953 			slab_print(slab);
3954 		printf("Free slabs:\n");
3955 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3956 			slab_print(slab);
3957 		printf("Full slabs:\n");
3958 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3959 			slab_print(slab);
3960 	}
3961 }
3962 
3963 void
3964 uma_print_zone(uma_zone_t zone)
3965 {
3966 	uma_cache_t cache;
3967 	int i;
3968 
3969 	printf("zone: %s(%p) size %d maxitems %ju flags %#x\n",
3970 	    zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items,
3971 	    zone->uz_flags);
3972 	if (zone->uz_lockptr != &zone->uz_lock)
3973 		uma_print_keg(zone->uz_keg);
3974 	CPU_FOREACH(i) {
3975 		cache = &zone->uz_cpu[i];
3976 		printf("CPU %d Cache:\n", i);
3977 		cache_print(cache);
3978 	}
3979 }
3980 
3981 #ifdef DDB
3982 /*
3983  * Generate statistics across both the zone and its per-cpu caches.  Each
3984  * statistic is returned through its pointer argument if it is non-NULL.
3985  *
3986  * Note: does not update the zone statistics, as it can't safely clear the
3987  * per-CPU cache statistic.
3988  *
3989  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3990  * safe from off-CPU; we should modify the caches to track this information
3991  * directly so that we don't have to.
3992  */
3993 static void
3994 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
3995     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
3996 {
3997 	uma_cache_t cache;
3998 	uint64_t allocs, frees, sleeps, xdomain;
3999 	int cachefree, cpu;
4000 
4001 	allocs = frees = sleeps = xdomain = 0;
4002 	cachefree = 0;
4003 	CPU_FOREACH(cpu) {
4004 		cache = &z->uz_cpu[cpu];
4005 		if (cache->uc_allocbucket != NULL)
4006 			cachefree += cache->uc_allocbucket->ub_cnt;
4007 		if (cache->uc_freebucket != NULL)
4008 			cachefree += cache->uc_freebucket->ub_cnt;
4009 		if (cache->uc_crossbucket != NULL) {
4010 			xdomain += cache->uc_crossbucket->ub_cnt;
4011 			cachefree += cache->uc_crossbucket->ub_cnt;
4012 		}
4013 		allocs += cache->uc_allocs;
4014 		frees += cache->uc_frees;
4015 	}
4016 	allocs += counter_u64_fetch(z->uz_allocs);
4017 	frees += counter_u64_fetch(z->uz_frees);
4018 	sleeps += z->uz_sleeps;
4019 	xdomain += z->uz_xdomain;
4020 	if (cachefreep != NULL)
4021 		*cachefreep = cachefree;
4022 	if (allocsp != NULL)
4023 		*allocsp = allocs;
4024 	if (freesp != NULL)
4025 		*freesp = frees;
4026 	if (sleepsp != NULL)
4027 		*sleepsp = sleeps;
4028 	if (xdomainp != NULL)
4029 		*xdomainp = xdomain;
4030 }
4031 #endif /* DDB */
4032 
4033 static int
4034 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4035 {
4036 	uma_keg_t kz;
4037 	uma_zone_t z;
4038 	int count;
4039 
4040 	count = 0;
4041 	rw_rlock(&uma_rwlock);
4042 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4043 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4044 			count++;
4045 	}
4046 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4047 		count++;
4048 
4049 	rw_runlock(&uma_rwlock);
4050 	return (sysctl_handle_int(oidp, &count, 0, req));
4051 }
4052 
4053 static void
4054 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4055     struct uma_percpu_stat *ups, bool internal)
4056 {
4057 	uma_zone_domain_t zdom;
4058 	uma_bucket_t bucket;
4059 	uma_cache_t cache;
4060 	int i;
4061 
4063 	for (i = 0; i < vm_ndomains; i++) {
4064 		zdom = &z->uz_domain[i];
4065 		uth->uth_zone_free += zdom->uzd_nitems;
4066 	}
4067 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4068 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
4069 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
4070 	uth->uth_sleeps = z->uz_sleeps;
4071 	uth->uth_xdomain = z->uz_xdomain;
4072 
4073 	/*
4074 	 * While it is not normally safe to access the cache bucket pointers
4075 	 * while not on the CPU that owns the cache, we only allow the pointers
4076 	 * to be exchanged without the zone lock held, not invalidated, so
4077 	 * accept the possible race associated with bucket exchange during
4078 	 * monitoring.  Use atomic_load_ptr() to ensure that the bucket pointers
4079 	 * are loaded only once.
4080 	 */
4081 	for (i = 0; i < mp_maxid + 1; i++) {
4082 		bzero(&ups[i], sizeof(*ups));
4083 		if (internal || CPU_ABSENT(i))
4084 			continue;
4085 		cache = &z->uz_cpu[i];
4086 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket);
4087 		if (bucket != NULL)
4088 			ups[i].ups_cache_free += bucket->ub_cnt;
4089 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket);
4090 		if (bucket != NULL)
4091 			ups[i].ups_cache_free += bucket->ub_cnt;
4092 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket);
4093 		if (bucket != NULL)
4094 			ups[i].ups_cache_free += bucket->ub_cnt;
4095 		ups[i].ups_allocs = cache->uc_allocs;
4096 		ups[i].ups_frees = cache->uc_frees;
4097 	}
4098 }
4099 
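/*
 * Export zone statistics as a binary stream: one uma_stream_header, then
 * for every zone a uma_type_header followed by (mp_maxid + 1)
 * uma_percpu_stat records.  Userland consumers such as libmemstat(3)
 * decode this layout.
 */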
4100 static int
4101 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4102 {
4103 	struct uma_stream_header ush;
4104 	struct uma_type_header uth;
4105 	struct uma_percpu_stat *ups;
4106 	struct sbuf sbuf;
4107 	uma_keg_t kz;
4108 	uma_zone_t z;
4109 	int count, error, i;
4110 
4111 	error = sysctl_wire_old_buffer(req, 0);
4112 	if (error != 0)
4113 		return (error);
4114 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4115 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4116 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4117 
4118 	count = 0;
4119 	rw_rlock(&uma_rwlock);
4120 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4121 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4122 			count++;
4123 	}
4124 
4125 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4126 		count++;
4127 
4128 	/*
4129 	 * Insert stream header.
4130 	 */
4131 	bzero(&ush, sizeof(ush));
4132 	ush.ush_version = UMA_STREAM_VERSION;
4133 	ush.ush_maxcpus = (mp_maxid + 1);
4134 	ush.ush_count = count;
4135 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4136 
4137 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4138 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4139 			bzero(&uth, sizeof(uth));
4140 			ZONE_LOCK(z);
4141 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4142 			uth.uth_align = kz->uk_align;
4143 			uth.uth_size = kz->uk_size;
4144 			uth.uth_rsize = kz->uk_rsize;
4145 			if (z->uz_max_items > 0)
4146 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
4147 					kz->uk_ppera;
4148 			else
4149 				uth.uth_pages = kz->uk_pages;
4150 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
4151 			    kz->uk_ppera;
4152 			uth.uth_limit = z->uz_max_items;
4153 			uth.uth_keg_free = z->uz_keg->uk_free;
4154 
4155 			/*
4156 			 * A zone is secondary if it is not the first entry
4157 			 * on the keg's zone list.
4158 			 */
4159 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4160 			    (LIST_FIRST(&kz->uk_zones) != z))
4161 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4162 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
4163 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
4164 			ZONE_UNLOCK(z);
4165 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4166 			for (i = 0; i < mp_maxid + 1; i++)
4167 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4168 		}
4169 	}
4170 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4171 		bzero(&uth, sizeof(uth));
4172 		ZONE_LOCK(z);
4173 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4174 		uth.uth_size = z->uz_size;
4175 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
4176 		ZONE_UNLOCK(z);
4177 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4178 		for (i = 0; i < mp_maxid + 1; i++)
4179 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4180 	}
4181 
4182 	rw_runlock(&uma_rwlock);
4183 	error = sbuf_finish(&sbuf);
4184 	sbuf_delete(&sbuf);
4185 	free(ups, M_TEMP);
4186 	return (error);
4187 }
4188 
4189 int
4190 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4191 {
4192 	uma_zone_t zone = *(uma_zone_t *)arg1;
4193 	int error, max;
4194 
4195 	max = uma_zone_get_max(zone);
4196 	error = sysctl_handle_int(oidp, &max, 0, req);
4197 	if (error || !req->newptr)
4198 		return (error);
4199 
4200 	uma_zone_set_max(zone, max);
4201 
4202 	return (0);
4203 }
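/*
 * Illustrative hookup (hypothetical names): a subsystem can expose its
 * zone limit as a read/write sysctl with
 *
 *	SYSCTL_PROC(_kern_foo, OID_AUTO, zone_max,
 *	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum foo items");
 *
 * where &foo_zone is the address of the uma_zone_t variable, matching the
 * arg1 dereference above.
 */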
4204 
4205 int
4206 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4207 {
4208 	uma_zone_t zone = *(uma_zone_t *)arg1;
4209 	int cur;
4210 
4211 	cur = uma_zone_get_cur(zone);
4212 	return (sysctl_handle_int(oidp, &cur, 0, req));
4213 }
4214 
4215 #ifdef INVARIANTS
4216 static uma_slab_t
4217 uma_dbg_getslab(uma_zone_t zone, void *item)
4218 {
4219 	uma_slab_t slab;
4220 	uma_keg_t keg;
4221 	uint8_t *mem;
4222 
4223 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4224 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4225 		slab = vtoslab((vm_offset_t)mem);
4226 	} else {
4227 		/*
4228 		 * It is safe to return the slab here even though the
4229 		 * zone is unlocked because the item's allocation state
4230 		 * essentially holds a reference.
4231 		 */
4232 		if (zone->uz_lockptr == &zone->uz_lock)
4233 			return (NULL);
4234 		ZONE_LOCK(zone);
4235 		keg = zone->uz_keg;
4236 		if (keg->uk_flags & UMA_ZONE_HASH)
4237 			slab = hash_sfind(&keg->uk_hash, mem);
4238 		else
4239 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4240 		ZONE_UNLOCK(zone);
4241 	}
4242 
4243 	return (slab);
4244 }
4245 
4246 static bool
4247 uma_dbg_zskip(uma_zone_t zone, void *mem)
4248 {
4249 
4250 	if (zone->uz_lockptr == &zone->uz_lock)
4251 		return (true);
4252 
4253 	return (uma_dbg_kskip(zone->uz_keg, mem));
4254 }
4255 
4256 static bool
4257 uma_dbg_kskip(uma_keg_t keg, void *mem)
4258 {
4259 	uintptr_t idx;
4260 
4261 	if (dbg_divisor == 0)
4262 		return (true);
4263 
4264 	if (dbg_divisor == 1)
4265 		return (false);
4266 
4267 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4268 	if (keg->uk_ipers > 1) {
4269 		idx *= keg->uk_ipers;
4270 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4271 	}
4272 
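	/*
	 * Verify only items whose index is a multiple of dbg_divisor; e.g.
	 * with dbg_divisor == 3, roughly every third item is checked.
	 */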
4273 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4274 		counter_u64_add(uma_skip_cnt, 1);
4275 		return (true);
4276 	}
4277 	counter_u64_add(uma_dbg_cnt, 1);
4278 
4279 	return (false);
4280 }
4281 
4282 /*
4283  * Set up the slab's freei data such that uma_dbg_free can function.
4284  */
4286 static void
4287 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4288 {
4289 	uma_keg_t keg;
4290 	int freei;
4291 
4292 	if (slab == NULL) {
4293 		slab = uma_dbg_getslab(zone, item);
4294 		if (slab == NULL)
4295 			panic("uma: item %p did not belong to zone %s\n",
4296 			    item, zone->uz_name);
4297 	}
4298 	keg = slab->us_keg;
4299 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4300 
4301 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4302 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4303 		    item, zone, zone->uz_name, slab, freei);
4304 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4305 
4306 	return;
4307 }
4308 
4309 /*
4310  * Verifies freed addresses.  Checks for alignment, valid slab membership
4311  * and duplicate frees.
4312  */
4314 static void
4315 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4316 {
4317 	uma_keg_t keg;
4318 	int freei;
4319 
4320 	if (slab == NULL) {
4321 		slab = uma_dbg_getslab(zone, item);
4322 		if (slab == NULL)
4323 			panic("uma: Freed item %p did not belong to zone %s\n",
4324 			    item, zone->uz_name);
4325 	}
4326 	keg = slab->us_keg;
4327 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4328 
4329 	if (freei >= keg->uk_ipers)
4330 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4331 		    item, zone, zone->uz_name, slab, freei);
4332 
4333 	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4334 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4335 		    item, zone, zone->uz_name, slab, freei);
4336 
4337 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4338 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4339 		    item, zone, zone->uz_name, slab, freei);
4340 
4341 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4342 }
4343 #endif /* INVARIANTS */
4344 
4345 #ifdef DDB
4346 static int64_t
4347 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
4348     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
4349 {
4350 	uint64_t frees;
4351 	int i;
4352 
4353 	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4354 		*allocs = counter_u64_fetch(z->uz_allocs);
4355 		frees = counter_u64_fetch(z->uz_frees);
4356 		*sleeps = z->uz_sleeps;
4357 		*cachefree = 0;
4358 		*xdomain = 0;
4359 	} else
4360 		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
4361 		    xdomain);
4362 	if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4363 	    (LIST_FIRST(&kz->uk_zones) != z)))
4364 		*cachefree += kz->uk_free;
4365 	for (i = 0; i < vm_ndomains; i++)
4366 		*cachefree += z->uz_domain[i].uzd_nitems;
4367 	*used = *allocs - frees;
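	/* Memory footprint: outstanding items plus cached items, by size. */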
4368 	return (((int64_t)*used + *cachefree) * kz->uk_size);
4369 }
4370 
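/*
 * "show uma" prints one line per zone, sorted by total memory footprint,
 * largest first; "show uma/i" emits the same data as CSV.
 */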
4371 DB_SHOW_COMMAND(uma, db_show_uma)
4372 {
4373 	const char *fmt_hdr, *fmt_entry;
4374 	uma_keg_t kz;
4375 	uma_zone_t z;
4376 	uint64_t allocs, used, sleeps, xdomain;
4377 	long cachefree;
4378 	/* variables for sorting */
4379 	uma_keg_t cur_keg;
4380 	uma_zone_t cur_zone, last_zone;
4381 	int64_t cur_size, last_size, size;
4382 	int ties;
4383 
4384 	/* /i option produces machine-parseable CSV output */
4385 	if (modif[0] == 'i') {
4386 		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
4387 		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
4388 	} else {
4389 		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
4390 		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
4391 	}
4392 
4393 	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
4394 	    "Sleeps", "Bucket", "Total Mem", "XFree");
4395 
4396 	/* Sort the zones with largest size first. */
4397 	last_zone = NULL;
4398 	last_size = INT64_MAX;
4399 	for (;;) {
4400 		cur_zone = NULL;
4401 		cur_size = -1;
4402 		ties = 0;
4403 		LIST_FOREACH(kz, &uma_kegs, uk_link) {
4404 			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4405 				/*
4406 				 * In the case of size ties, print out zones
4407 				 * in the order they are encountered.  That is,
4408 				 * when we encounter the most recently output
4409 				 * zone, we have already printed all preceding
4410 				 * ties, and we must print all following ties.
4411 				 */
4412 				if (z == last_zone) {
4413 					ties = 1;
4414 					continue;
4415 				}
4416 				size = get_uma_stats(kz, z, &allocs, &used,
4417 				    &sleeps, &cachefree, &xdomain);
4418 				if (size > cur_size && size < last_size + ties)
4419 				{
4420 					cur_size = size;
4421 					cur_zone = z;
4422 					cur_keg = kz;
4423 				}
4424 			}
4425 		}
4426 		if (cur_zone == NULL)
4427 			break;
4428 
4429 		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
4430 		    &sleeps, &cachefree, &xdomain);
4431 		db_printf(fmt_entry, cur_zone->uz_name,
4432 		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
4433 		    (uintmax_t)allocs, (uintmax_t)sleeps,
4434 		    (unsigned)cur_zone->uz_count, (intmax_t)size, xdomain);
4435 
4436 		if (db_pager_quit)
4437 			return;
4438 		last_zone = cur_zone;
4439 		last_size = cur_size;
4440 	}
4441 }
4442 
4443 DB_SHOW_COMMAND(umacache, db_show_umacache)
4444 {
4445 	uma_zone_t z;
4446 	uint64_t allocs, frees;
4447 	long cachefree;
4448 	int i;
4449 
4450 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4451 	    "Requests", "Bucket");
4452 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4453 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
4454 		for (i = 0; i < vm_ndomains; i++)
4455 			cachefree += z->uz_domain[i].uzd_nitems;
4456 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4457 		    z->uz_name, (uintmax_t)z->uz_size,
4458 		    (intmax_t)(allocs - frees), cachefree,
4459 		    (uintmax_t)allocs, z->uz_count);
4460 		if (db_pager_quit)
4461 			return;
4462 	}
4463 }
4464 #endif	/* DDB */
4465