xref: /freebsd/sys/vm/uma_core.c (revision a15cb219c6f2b8ed16179c2fce882a2ff327b753)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory allocator
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/domainset.h>
63 #include <sys/eventhandler.h>
64 #include <sys/kernel.h>
65 #include <sys/types.h>
66 #include <sys/limits.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/random.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vmmeter.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_domainset.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_phys.h>
89 #include <vm/vm_pagequeue.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94 #include <vm/uma_int.h>
95 #include <vm/uma_dbg.h>
96 
97 #include <ddb/ddb.h>
98 
99 #ifdef DEBUG_MEMGUARD
100 #include <vm/memguard.h>
101 #endif
102 
103 /*
104  * This is the zone and keg from which all zones are spawned.
105  */
106 static uma_zone_t kegs;
107 static uma_zone_t zones;
108 
109 /* This is the zone from which all offpage uma_slab_ts are allocated. */
110 static uma_zone_t slabzone;
111 
112 /*
113  * The initial hash tables come out of this zone so they can be allocated
114  * prior to malloc coming up.
115  */
116 static uma_zone_t hashzone;
117 
118 /* The boot-time adjusted value for cache line alignment. */
119 int uma_align_cache = 64 - 1;
120 
121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122 
123 /*
124  * Are we allowed to allocate buckets?
125  */
126 static int bucketdisable = 1;
127 
128 /* Linked list of all kegs in the system */
129 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
130 
131 /* Linked list of all cache-only zones in the system */
132 static LIST_HEAD(,uma_zone) uma_cachezones =
133     LIST_HEAD_INITIALIZER(uma_cachezones);
134 
135 /* This RW lock protects the keg list */
136 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
137 
138 /*
139  * Pointer and counter to pool of pages, that is preallocated at
140  * startup to bootstrap UMA.
141  */
142 static char *bootmem;
143 static int boot_pages;
144 
145 static struct sx uma_drain_lock;
146 
147 /*
148  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
149  * allocations don't trigger a wakeup of the reclaim thread.
150  */
151 static unsigned long uma_kmem_limit = LONG_MAX;
152 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
153     "UMA kernel memory soft limit");
154 static unsigned long uma_kmem_total;
155 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
156     "UMA kernel memory usage");
157 
158 /* Is the VM done starting up? */
159 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
160     BOOT_RUNNING } booted = BOOT_COLD;
161 
162 /*
163  * This is the handle used to schedule events that need to happen
164  * outside of the allocation fast path.
165  */
166 static struct callout uma_callout;
167 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
168 
169 /*
170  * This structure is passed as the zone ctor arg so that I don't have to create
171  * a special allocation function just for zones.
172  */
/*
 * Argument bundle for zone construction.  zone_ctor() is prototyped in this
 * file with an opaque "void *" argument; its body is not visible from here,
 * so the per-field notes are inferred from field types and names only.
 * NOTE(review): confirm against zone_ctor().
 */
173 struct uma_zctor_args {
174 	const char *name;	/* presumably the zone name */
175 	size_t size;		/* presumably the per-item size */
176 	uma_ctor ctor;		/* item callbacks, named after their types */
177 	uma_dtor dtor;
178 	uma_init uminit;
179 	uma_fini fini;
180 	uma_import import;	/* import/release hooks -- TODO confirm use */
181 	uma_release release;
182 	void *arg;		/* presumably the cookie handed to the hooks */
183 	uma_keg_t keg;
184 	int align;
185 	uint32_t flags;
186 };
187 
/*
 * Argument bundle for keg construction.  The fields correspond one for one,
 * in order, to the parameters of uma_kcreate() as prototyped in this file
 * (zone, size, uminit, fini, align, flags).  keg_ctor() is presumably the
 * consumer -- its body is not visible here, TODO confirm.
 */
188 struct uma_kctor_args {
189 	uma_zone_t zone;
190 	size_t size;
191 	uma_init uminit;
192 	uma_fini fini;
193 	int align;
194 	uint32_t flags;
195 };
196 
/*
 * One row of the bucket_zones[] table: describes a zone that supplies
 * buckets of a fixed capacity.  ubz_zone starts out NULL in the table and is
 * filled in by bucket_init(); bucket_zone_lookup() searches by ubz_entries
 * and bucket_select() searches by ubz_maxsize.
 */
197 struct uma_bucket_zone {
198 	uma_zone_t	ubz_zone;
199 	char		*ubz_name;
200 	int		ubz_entries;	/* Number of items it can hold. */
201 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
202 };
203 
204 /*
205  * Compute the actual number of bucket entries to pack them in power
206  * of two sizes for more efficient space utilization.
207  */
/*
 * BUCKET_SIZE(n): number of item pointers that fit in a bucket occupying
 * n pointer-sized words in total, i.e. n words minus the space taken by the
 * struct uma_bucket header, expressed in pointers (integer division).
 * BUCKET_MAX / BUCKET_MIN are the capacities of the largest (256-word) and
 * smallest (4-word) buckets listed in bucket_zones[].
 */
208 #define	BUCKET_SIZE(n)						\
209     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
210 
211 #define	BUCKET_MAX	BUCKET_SIZE(256)
212 #define	BUCKET_MIN	BUCKET_SIZE(4)
213 
/*
 * Table of bucket zones.  Rows are ordered by increasing ubz_entries and
 * decreasing ubz_maxsize; bucket_zone_lookup() (first row with enough
 * entries) and bucket_select() (last row whose ubz_maxsize still covers the
 * item size) both depend on that ordering.  The table is terminated by a row
 * with ubz_entries == 0, which every walker in this file uses as its stop
 * condition.  The ubz_zone column is populated by bucket_init().
 */
214 struct uma_bucket_zone bucket_zones[] = {
215 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
216 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
217 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
218 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
219 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
220 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
221 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
222 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
223 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
224 	{ NULL, NULL, 0}	/* terminator: ubz_entries == 0 */
225 };
226 
227 /*
228  * Flags and enumerations to be passed to internal functions.
229  */
/*
 * Last argument of zone_free_item() (see its prototype in this file).  Only
 * SKIP_NONE is used in the portion of the file visible here.
 * NOTE(review): the names suggest which free-path steps (counter update,
 * dtor, fini) are to be skipped -- confirm against zone_free_item().
 */
230 enum zfreeskip {
231 	SKIP_NONE =	0,
232 	SKIP_CNT =	0x00000001,
233 	SKIP_DTOR =	0x00010000,
234 	SKIP_FINI =	0x00020000,
235 };
236 
237 #define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
238 
239 /* Prototypes.. */
240 
241 int	uma_startup_count(int);
242 void	uma_startup(void *, int);
243 void	uma_startup1(void);
244 void	uma_startup2(void);
245 
246 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
249 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
250 static void page_free(void *, vm_size_t, uint8_t);
251 static void pcpu_page_free(void *, vm_size_t, uint8_t);
252 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
253 static void cache_drain(uma_zone_t);
254 static void bucket_drain(uma_zone_t, uma_bucket_t);
255 static void bucket_cache_drain(uma_zone_t zone);
256 static int keg_ctor(void *, int, void *, int);
257 static void keg_dtor(void *, int, void *);
258 static int zone_ctor(void *, int, void *, int);
259 static void zone_dtor(void *, int, void *);
260 static int zero_init(void *, int, int);
261 static void keg_small_init(uma_keg_t keg);
262 static void keg_large_init(uma_keg_t keg);
263 static void zone_foreach(void (*zfunc)(uma_zone_t));
264 static void zone_timeout(uma_zone_t zone);
265 static int hash_alloc(struct uma_hash *, u_int);
266 static int hash_expand(struct uma_hash *, struct uma_hash *);
267 static void hash_free(struct uma_hash *hash);
268 static void uma_timeout(void *);
269 static void uma_startup3(void);
270 static void *zone_alloc_item(uma_zone_t, void *, int, int);
271 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
272 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
273 static void bucket_enable(void);
274 static void bucket_init(void);
275 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
276 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
277 static void bucket_zone_drain(void);
278 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
279 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
280 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
281 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
282 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
283     uma_fini fini, int align, uint32_t flags);
284 static int zone_import(uma_zone_t, void **, int, int, int);
285 static void zone_release(uma_zone_t, void **, int);
286 static void uma_zero_item(void *, uma_zone_t);
287 
288 void uma_print_zone(uma_zone_t);
289 void uma_print_stats(void);
290 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
291 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
292 
293 #ifdef INVARIANTS
294 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
295 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
296 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
297 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
298 
299 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
300     "Memory allocation debugging");
301 
302 static u_int dbg_divisor = 1;
303 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
304     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
305     "Debug & thrash every this item in memory allocator");
306 
307 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
308 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
309 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
310     &uma_dbg_cnt, "memory items debugged");
311 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
312     &uma_skip_cnt, "memory items skipped, not debugged");
313 #endif
314 
315 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
316 
317 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
318     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
319 
320 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
321     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
322 
323 static int zone_warnings = 1;
324 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
325     "Warn when UMA zones becomes full");
326 
327 /* Adjust bytes under management by UMA. */
328 static inline void
329 uma_total_dec(unsigned long size)
330 {
331 
332 	atomic_subtract_long(&uma_kmem_total, size);
333 }
334 
335 static inline void
336 uma_total_inc(unsigned long size)
337 {
338 
339 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
340 		uma_reclaim_wakeup();
341 }
342 
343 /*
344  * This routine checks to see whether or not it's safe to enable buckets.
345  */
346 static void
347 bucket_enable(void)
348 {
349 	bucketdisable = vm_page_count_min();
350 }
351 
352 /*
353  * Initialize bucket_zones, the array of zones of buckets of various sizes.
354  *
355  * For each zone, calculate the memory required for each bucket, consisting
356  * of the header and an array of pointers.
357  */
358 static void
359 bucket_init(void)
360 {
361 	struct uma_bucket_zone *ubz;
362 	int size;
363 
364 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
365 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
366 		size += sizeof(void *) * ubz->ubz_entries;
367 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
368 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
369 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
370 	}
371 }
372 
373 /*
374  * Given a desired number of entries for a bucket, return the zone from which
375  * to allocate the bucket.
376  */
377 static struct uma_bucket_zone *
378 bucket_zone_lookup(int entries)
379 {
380 	struct uma_bucket_zone *ubz;
381 
382 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
383 		if (ubz->ubz_entries >= entries)
384 			return (ubz);
385 	ubz--;
386 	return (ubz);
387 }
388 
389 static int
390 bucket_select(int size)
391 {
392 	struct uma_bucket_zone *ubz;
393 
394 	ubz = &bucket_zones[0];
395 	if (size > ubz->ubz_maxsize)
396 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
397 
398 	for (; ubz->ubz_entries != 0; ubz++)
399 		if (ubz->ubz_maxsize < size)
400 			break;
401 	ubz--;
402 	return (ubz->ubz_entries);
403 }
404 
405 static uma_bucket_t
406 bucket_alloc(uma_zone_t zone, void *udata, int flags)
407 {
408 	struct uma_bucket_zone *ubz;
409 	uma_bucket_t bucket;
410 
411 	/*
412 	 * This is to stop us from allocating per cpu buckets while we're
413 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
414 	 * boot pages.  This also prevents us from allocating buckets in
415 	 * low memory situations.
416 	 */
417 	if (bucketdisable)
418 		return (NULL);
419 	/*
420 	 * To limit bucket recursion we store the original zone flags
421 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
422 	 * NOVM flag to persist even through deep recursions.  We also
423 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
424 	 * a bucket for a bucket zone so we do not allow infinite bucket
425 	 * recursion.  This cookie will even persist to frees of unused
426 	 * buckets via the allocation path or bucket allocations in the
427 	 * free path.
428 	 */
429 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
430 		udata = (void *)(uintptr_t)zone->uz_flags;
431 	else {
432 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
433 			return (NULL);
434 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
435 	}
436 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
437 		flags |= M_NOVM;
438 	ubz = bucket_zone_lookup(zone->uz_count);
439 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
440 		ubz++;
441 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
442 	if (bucket) {
443 #ifdef INVARIANTS
444 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
445 #endif
446 		bucket->ub_cnt = 0;
447 		bucket->ub_entries = ubz->ubz_entries;
448 	}
449 
450 	return (bucket);
451 }
452 
453 static void
454 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
455 {
456 	struct uma_bucket_zone *ubz;
457 
458 	KASSERT(bucket->ub_cnt == 0,
459 	    ("bucket_free: Freeing a non free bucket."));
460 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
461 		udata = (void *)(uintptr_t)zone->uz_flags;
462 	ubz = bucket_zone_lookup(bucket->ub_entries);
463 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
464 }
465 
466 static void
467 bucket_zone_drain(void)
468 {
469 	struct uma_bucket_zone *ubz;
470 
471 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
472 		zone_drain(ubz->ubz_zone);
473 }
474 
475 static uma_bucket_t
476 zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
477 {
478 	uma_bucket_t bucket;
479 
480 	ZONE_LOCK_ASSERT(zone);
481 
482 	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
483 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
484 		LIST_REMOVE(bucket, ub_link);
485 		zdom->uzd_nitems -= bucket->ub_cnt;
486 		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
487 			zdom->uzd_imin = zdom->uzd_nitems;
488 		zone->uz_bkt_count -= bucket->ub_cnt;
489 	}
490 	return (bucket);
491 }
492 
493 static void
494 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
495     const bool ws)
496 {
497 
498 	ZONE_LOCK_ASSERT(zone);
499 	KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow",
500 	    __func__, zone));
501 
502 	LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
503 	zdom->uzd_nitems += bucket->ub_cnt;
504 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
505 		zdom->uzd_imax = zdom->uzd_nitems;
506 	zone->uz_bkt_count += bucket->ub_cnt;
507 }
508 
509 static void
510 zone_log_warning(uma_zone_t zone)
511 {
512 	static const struct timeval warninterval = { 300, 0 };
513 
514 	if (!zone_warnings || zone->uz_warning == NULL)
515 		return;
516 
517 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
518 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
519 }
520 
521 static inline void
522 zone_maxaction(uma_zone_t zone)
523 {
524 
525 	if (zone->uz_maxaction.ta_func != NULL)
526 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
527 }
528 
529 /*
530  * Routine called by timeout which is used to fire off some time interval
531  * based calculations.  (stats, hash size, etc.)
532  *
533  * Arguments:
534  *	arg   Unused
535  *
536  * Returns:
537  *	Nothing
538  */
539 static void
540 uma_timeout(void *unused)
541 {
542 	bucket_enable();
543 	zone_foreach(zone_timeout);
544 
545 	/* Reschedule this event */
546 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
547 }
548 
549 /*
550  * Update the working set size estimate for the zone's bucket cache.
551  * The constants chosen here are somewhat arbitrary.  With an update period of
552  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
553  * last 100s.
554  */
555 static void
556 zone_domain_update_wss(uma_zone_domain_t zdom)
557 {
558 	long wss;
559 
560 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
561 	wss = zdom->uzd_imax - zdom->uzd_imin;
562 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
563 	zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
564 }
565 
566 /*
567  * Routine to perform timeout driven calculations.  This expands the
568  * hashes and does per cpu statistics aggregation.
569  *
570  *  Returns nothing.
571  */
572 static void
573 zone_timeout(uma_zone_t zone)
574 {
575 	uma_keg_t keg = zone->uz_keg;
576 	u_int slabs;
577 
578 	KEG_LOCK(keg);
579 	/*
580 	 * Expand the keg hash table.
581 	 *
582 	 * This is done if the number of slabs is larger than the hash size.
583 	 * What I'm trying to do here is completely reduce collisions.  This
584 	 * may be a little aggressive.  Should I allow for two collisions max?
585 	 */
586 	if (keg->uk_flags & UMA_ZONE_HASH &&
587 	    (slabs = keg->uk_pages / keg->uk_ppera) >
588 	     keg->uk_hash.uh_hashsize) {
589 		struct uma_hash newhash;
590 		struct uma_hash oldhash;
591 		int ret;
592 
593 		/*
594 		 * This is so involved because allocating and freeing
595 		 * while the keg lock is held will lead to deadlock.
596 		 * I have to do everything in stages and check for
597 		 * races.
598 		 */
599 		KEG_UNLOCK(keg);
600 		ret = hash_alloc(&newhash, 1 << fls(slabs));
601 		KEG_LOCK(keg);
602 		if (ret) {
603 			if (hash_expand(&keg->uk_hash, &newhash)) {
604 				oldhash = keg->uk_hash;
605 				keg->uk_hash = newhash;
606 			} else
607 				oldhash = newhash;
608 
609 			KEG_UNLOCK(keg);
610 			hash_free(&oldhash);
611 			return;
612 		}
613 	}
614 
615 	for (int i = 0; i < vm_ndomains; i++)
616 		zone_domain_update_wss(&zone->uz_domain[i]);
617 
618 	KEG_UNLOCK(keg);
619 }
620 
621 /*
622  * Allocate and zero fill the next sized hash table from the appropriate
623  * backing store.
624  *
625  * Arguments:
626  *	hash  A new hash structure with the old hash size in uh_hashsize
627  *
628  * Returns:
629  *	1 on success and 0 on failure.
630  */
631 static int
632 hash_alloc(struct uma_hash *hash, u_int size)
633 {
634 	size_t alloc;
635 
636 	KASSERT(powerof2(size), ("hash size must be power of 2"));
637 	if (size > UMA_HASH_SIZE_INIT)  {
638 		hash->uh_hashsize = size;
639 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
640 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
641 		    M_UMAHASH, M_NOWAIT);
642 	} else {
643 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
644 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
645 		    UMA_ANYDOMAIN, M_WAITOK);
646 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
647 	}
648 	if (hash->uh_slab_hash) {
649 		bzero(hash->uh_slab_hash, alloc);
650 		hash->uh_hashmask = hash->uh_hashsize - 1;
651 		return (1);
652 	}
653 
654 	return (0);
655 }
656 
657 /*
658  * Expands the hash table for HASH zones.  This is done from zone_timeout
659  * to reduce collisions.  This must not be done in the regular allocation
660  * path, otherwise, we can recurse on the vm while allocating pages.
661  *
662  * Arguments:
663  *	oldhash  The hash you want to expand
664  *	newhash  The hash structure for the new table
665  *
666  * Returns:
667  *	Nothing
668  *
669  * Discussion:
670  */
671 static int
672 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
673 {
674 	uma_slab_t slab;
675 	u_int hval;
676 	u_int idx;
677 
678 	if (!newhash->uh_slab_hash)
679 		return (0);
680 
681 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
682 		return (0);
683 
684 	/*
685 	 * I need to investigate hash algorithms for resizing without a
686 	 * full rehash.
687 	 */
688 
689 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
690 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
691 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
692 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
693 			hval = UMA_HASH(newhash, slab->us_data);
694 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
695 			    slab, us_hlink);
696 		}
697 
698 	return (1);
699 }
700 
701 /*
702  * Free the hash bucket to the appropriate backing store.
703  *
704  * Arguments:
705  *	slab_hash  The hash bucket we're freeing
706  *	hashsize   The number of entries in that hash bucket
707  *
708  * Returns:
709  *	Nothing
710  */
711 static void
712 hash_free(struct uma_hash *hash)
713 {
714 	if (hash->uh_slab_hash == NULL)
715 		return;
716 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
717 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
718 	else
719 		free(hash->uh_slab_hash, M_UMAHASH);
720 }
721 
722 /*
723  * Frees all outstanding items in a bucket
724  *
725  * Arguments:
726  *	zone   The zone to free to, must be unlocked.
727  *	bucket The free/alloc bucket with items, cpu queue must be locked.
728  *
729  * Returns:
730  *	Nothing
731  */
732 
733 static void
734 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
735 {
736 	int i;
737 
738 	if (bucket == NULL)
739 		return;
740 
741 	if (zone->uz_fini)
742 		for (i = 0; i < bucket->ub_cnt; i++)
743 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
744 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
745 	if (zone->uz_max_items > 0) {
746 		ZONE_LOCK(zone);
747 		zone->uz_items -= bucket->ub_cnt;
748 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
749 			wakeup_one(zone);
750 		ZONE_UNLOCK(zone);
751 	}
752 	bucket->ub_cnt = 0;
753 }
754 
755 /*
756  * Drains the per cpu caches for a zone.
757  *
758  * NOTE: This may only be called while the zone is being turn down, and not
759  * during normal operation.  This is necessary in order that we do not have
760  * to migrate CPUs to drain the per-CPU caches.
761  *
762  * Arguments:
763  *	zone     The zone to drain, must be unlocked.
764  *
765  * Returns:
766  *	Nothing
767  */
768 static void
769 cache_drain(uma_zone_t zone)
770 {
771 	uma_cache_t cache;
772 	int cpu;
773 
774 	/*
775 	 * XXX: It is safe to not lock the per-CPU caches, because we're
776 	 * tearing down the zone anyway.  I.e., there will be no further use
777 	 * of the caches at this point.
778 	 *
779 	 * XXX: It would good to be able to assert that the zone is being
780 	 * torn down to prevent improper use of cache_drain().
781 	 *
782 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
783 	 * it is used elsewhere.  Should the tear-down path be made special
784 	 * there in some form?
785 	 */
786 	CPU_FOREACH(cpu) {
787 		cache = &zone->uz_cpu[cpu];
788 		bucket_drain(zone, cache->uc_allocbucket);
789 		if (cache->uc_allocbucket != NULL)
790 			bucket_free(zone, cache->uc_allocbucket, NULL);
791 		cache->uc_allocbucket = NULL;
792 		bucket_drain(zone, cache->uc_freebucket);
793 		if (cache->uc_freebucket != NULL)
794 			bucket_free(zone, cache->uc_freebucket, NULL);
795 		cache->uc_freebucket = NULL;
796 		bucket_drain(zone, cache->uc_crossbucket);
797 		if (cache->uc_crossbucket != NULL)
798 			bucket_free(zone, cache->uc_crossbucket, NULL);
799 		cache->uc_crossbucket = NULL;
800 	}
801 	ZONE_LOCK(zone);
802 	bucket_cache_drain(zone);
803 	ZONE_UNLOCK(zone);
804 }
805 
806 static void
807 cache_shrink(uma_zone_t zone)
808 {
809 
810 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
811 		return;
812 
813 	ZONE_LOCK(zone);
814 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
815 	ZONE_UNLOCK(zone);
816 }
817 
818 static void
819 cache_drain_safe_cpu(uma_zone_t zone)
820 {
821 	uma_cache_t cache;
822 	uma_bucket_t b1, b2, b3;
823 	int domain;
824 
825 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
826 		return;
827 
828 	b1 = b2 = b3 = NULL;
829 	ZONE_LOCK(zone);
830 	critical_enter();
831 	if (zone->uz_flags & UMA_ZONE_NUMA)
832 		domain = PCPU_GET(domain);
833 	else
834 		domain = 0;
835 	cache = &zone->uz_cpu[curcpu];
836 	if (cache->uc_allocbucket) {
837 		if (cache->uc_allocbucket->ub_cnt != 0)
838 			zone_put_bucket(zone, &zone->uz_domain[domain],
839 			    cache->uc_allocbucket, false);
840 		else
841 			b1 = cache->uc_allocbucket;
842 		cache->uc_allocbucket = NULL;
843 	}
844 	if (cache->uc_freebucket) {
845 		if (cache->uc_freebucket->ub_cnt != 0)
846 			zone_put_bucket(zone, &zone->uz_domain[domain],
847 			    cache->uc_freebucket, false);
848 		else
849 			b2 = cache->uc_freebucket;
850 		cache->uc_freebucket = NULL;
851 	}
852 	b3 = cache->uc_crossbucket;
853 	cache->uc_crossbucket = NULL;
854 	critical_exit();
855 	ZONE_UNLOCK(zone);
856 	if (b1)
857 		bucket_free(zone, b1, NULL);
858 	if (b2)
859 		bucket_free(zone, b2, NULL);
860 	if (b3) {
861 		bucket_drain(zone, b3);
862 		bucket_free(zone, b3, NULL);
863 	}
864 }
865 
866 /*
867  * Safely drain per-CPU caches of a zone(s) to alloc bucket.
868  * This is an expensive call because it needs to bind to all CPUs
869  * one by one and enter a critical section on each of them in order
870  * to safely access their cache buckets.
871  * Zone lock must not be held on call this function.
872  */
873 static void
874 cache_drain_safe(uma_zone_t zone)
875 {
876 	int cpu;
877 
878 	/*
879 	 * Polite bucket sizes shrinking was not enouth, shrink aggressively.
880 	 */
881 	if (zone)
882 		cache_shrink(zone);
883 	else
884 		zone_foreach(cache_shrink);
885 
886 	CPU_FOREACH(cpu) {
887 		thread_lock(curthread);
888 		sched_bind(curthread, cpu);
889 		thread_unlock(curthread);
890 
891 		if (zone)
892 			cache_drain_safe_cpu(zone);
893 		else
894 			zone_foreach(cache_drain_safe_cpu);
895 	}
896 	thread_lock(curthread);
897 	sched_unbind(curthread);
898 	thread_unlock(curthread);
899 }
900 
901 /*
902  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
903  */
904 static void
905 bucket_cache_drain(uma_zone_t zone)
906 {
907 	uma_zone_domain_t zdom;
908 	uma_bucket_t bucket;
909 	int i;
910 
911 	/*
912 	 * Drain the bucket queues and free the buckets.
913 	 */
914 	for (i = 0; i < vm_ndomains; i++) {
915 		zdom = &zone->uz_domain[i];
916 		while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
917 		    NULL) {
918 			ZONE_UNLOCK(zone);
919 			bucket_drain(zone, bucket);
920 			bucket_free(zone, bucket, NULL);
921 			ZONE_LOCK(zone);
922 		}
923 	}
924 
925 	/*
926 	 * Shrink further bucket sizes.  Price of single zone lock collision
927 	 * is probably lower then price of global cache drain.
928 	 */
929 	if (zone->uz_count > zone->uz_count_min)
930 		zone->uz_count--;
931 }
932 
933 static void
934 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
935 {
936 	uint8_t *mem;
937 	int i;
938 	uint8_t flags;
939 
940 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
941 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
942 
943 	mem = slab->us_data;
944 	flags = slab->us_flags;
945 	i = start;
946 	if (keg->uk_fini != NULL) {
947 		for (i--; i > -1; i--)
948 #ifdef INVARIANTS
949 		/*
950 		 * trash_fini implies that dtor was trash_dtor. trash_fini
951 		 * would check that memory hasn't been modified since free,
952 		 * which executed trash_dtor.
953 		 * That's why we need to run uma_dbg_kskip() check here,
954 		 * albeit we don't make skip check for other init/fini
955 		 * invocations.
956 		 */
957 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
958 		    keg->uk_fini != trash_fini)
959 #endif
960 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
961 			    keg->uk_size);
962 	}
963 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
964 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
965 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
966 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
967 }
968 
969 /*
970  * Frees pages from a keg back to the system.  This is done on demand from
971  * the pageout daemon.
972  *
973  * Returns nothing.
974  */
975 static void
976 keg_drain(uma_keg_t keg)
977 {
978 	struct slabhead freeslabs = { 0 };
979 	uma_domain_t dom;
980 	uma_slab_t slab, tmp;
981 	int i;
982 
983 	/*
984 	 * We don't want to take pages from statically allocated kegs at this
985 	 * time
986 	 */
987 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
988 		return;
989 
990 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
991 	    keg->uk_name, keg, keg->uk_free);
992 	KEG_LOCK(keg);
993 	if (keg->uk_free == 0)
994 		goto finished;
995 
996 	for (i = 0; i < vm_ndomains; i++) {
997 		dom = &keg->uk_domain[i];
998 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
999 			/* We have nowhere to free these to. */
1000 			if (slab->us_flags & UMA_SLAB_BOOT)
1001 				continue;
1002 
1003 			LIST_REMOVE(slab, us_link);
1004 			keg->uk_pages -= keg->uk_ppera;
1005 			keg->uk_free -= keg->uk_ipers;
1006 
1007 			if (keg->uk_flags & UMA_ZONE_HASH)
1008 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
1009 				    slab->us_data);
1010 
1011 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1012 		}
1013 	}
1014 
1015 finished:
1016 	KEG_UNLOCK(keg);
1017 
1018 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1019 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1020 		keg_free_slab(keg, slab, keg->uk_ipers);
1021 	}
1022 }
1023 
1024 static void
1025 zone_drain_wait(uma_zone_t zone, int waitok)
1026 {
1027 
1028 	/*
1029 	 * Set draining to interlock with zone_dtor() so we can release our
1030 	 * locks as we go.  Only dtor() should do a WAITOK call since it
1031 	 * is the only call that knows the structure will still be available
1032 	 * when it wakes up.
1033 	 */
1034 	ZONE_LOCK(zone);
1035 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
1036 		if (waitok == M_NOWAIT)
1037 			goto out;
1038 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1039 	}
1040 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
1041 	bucket_cache_drain(zone);
1042 	ZONE_UNLOCK(zone);
1043 	/*
1044 	 * The DRAINING flag protects us from being freed while
1045 	 * we're running.  Normally the uma_rwlock would protect us but we
1046 	 * must be able to release and acquire the right lock for each keg.
1047 	 */
1048 	keg_drain(zone->uz_keg);
1049 	ZONE_LOCK(zone);
1050 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
1051 	wakeup(zone);
1052 out:
1053 	ZONE_UNLOCK(zone);
1054 }
1055 
1056 void
1057 zone_drain(uma_zone_t zone)
1058 {
1059 
1060 	zone_drain_wait(zone, M_NOWAIT);
1061 }
1062 
1063 /*
1064  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1065  * If the allocation was successful, the keg lock will be held upon return,
1066  * otherwise the keg will be left unlocked.
1067  *
1068  * Arguments:
1069  *	flags   Wait flags for the item initialization routine
1070  *	aflags  Wait flags for the slab allocation
1071  *
1072  * Returns:
1073  *	The slab that was allocated or NULL if there is no memory and the
1074  *	caller specified M_NOWAIT.
1075  */
1076 static uma_slab_t
1077 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1078     int aflags)
1079 {
1080 	uma_alloc allocf;
1081 	uma_slab_t slab;
1082 	unsigned long size;
1083 	uint8_t *mem;
1084 	uint8_t sflags;
1085 	int i;
1086 
1087 	KASSERT(domain >= 0 && domain < vm_ndomains,
1088 	    ("keg_alloc_slab: domain %d out of range", domain));
1089 	KEG_LOCK_ASSERT(keg);
1090 	MPASS(zone->uz_lockptr == &keg->uk_lock);
1091 
1092 	allocf = keg->uk_allocf;
1093 	KEG_UNLOCK(keg);
1094 
1095 	slab = NULL;
1096 	mem = NULL;
1097 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1098 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1099 		if (slab == NULL)
1100 			goto out;
1101 	}
1102 
1103 	/*
1104 	 * This reproduces the old vm_zone behavior of zero filling pages the
1105 	 * first time they are added to a zone.
1106 	 *
1107 	 * Malloced items are zeroed in uma_zalloc.
1108 	 */
1109 
1110 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1111 		aflags |= M_ZERO;
1112 	else
1113 		aflags &= ~M_ZERO;
1114 
1115 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1116 		aflags |= M_NODUMP;
1117 
1118 	/* zone is passed for legacy reasons. */
1119 	size = keg->uk_ppera * PAGE_SIZE;
1120 	mem = allocf(zone, size, domain, &sflags, aflags);
1121 	if (mem == NULL) {
1122 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1123 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1124 		slab = NULL;
1125 		goto out;
1126 	}
1127 	uma_total_inc(size);
1128 
1129 	/* Point the slab into the allocated memory */
1130 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1131 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1132 
1133 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1134 		for (i = 0; i < keg->uk_ppera; i++)
1135 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1136 
1137 	slab->us_keg = keg;
1138 	slab->us_data = mem;
1139 	slab->us_freecount = keg->uk_ipers;
1140 	slab->us_flags = sflags;
1141 	slab->us_domain = domain;
1142 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1143 #ifdef INVARIANTS
1144 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1145 #endif
1146 
1147 	if (keg->uk_init != NULL) {
1148 		for (i = 0; i < keg->uk_ipers; i++)
1149 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1150 			    keg->uk_size, flags) != 0)
1151 				break;
1152 		if (i != keg->uk_ipers) {
1153 			keg_free_slab(keg, slab, i);
1154 			slab = NULL;
1155 			goto out;
1156 		}
1157 	}
1158 	KEG_LOCK(keg);
1159 
1160 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1161 	    slab, keg->uk_name, keg);
1162 
1163 	if (keg->uk_flags & UMA_ZONE_HASH)
1164 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1165 
1166 	keg->uk_pages += keg->uk_ppera;
1167 	keg->uk_free += keg->uk_ipers;
1168 
1169 out:
1170 	return (slab);
1171 }
1172 
1173 /*
1174  * This function is intended to be used early on in place of page_alloc() so
1175  * that we may use the boot time page cache to satisfy allocations before
1176  * the VM is ready.
1177  */
1178 static void *
1179 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1180     int wait)
1181 {
1182 	uma_keg_t keg;
1183 	void *mem;
1184 	int pages;
1185 
1186 	keg = zone->uz_keg;
1187 	/*
1188 	 * If we are in BOOT_BUCKETS or higher, than switch to real
1189 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1190 	 */
1191 	switch (booted) {
1192 		case BOOT_COLD:
1193 		case BOOT_STRAPPED:
1194 			break;
1195 		case BOOT_PAGEALLOC:
1196 			if (keg->uk_ppera > 1)
1197 				break;
1198 		case BOOT_BUCKETS:
1199 		case BOOT_RUNNING:
1200 #ifdef UMA_MD_SMALL_ALLOC
1201 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1202 			    page_alloc : uma_small_alloc;
1203 #else
1204 			keg->uk_allocf = page_alloc;
1205 #endif
1206 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1207 	}
1208 
1209 	/*
1210 	 * Check our small startup cache to see if it has pages remaining.
1211 	 */
1212 	pages = howmany(bytes, PAGE_SIZE);
1213 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1214 	if (pages > boot_pages)
1215 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1216 #ifdef DIAGNOSTIC
1217 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1218 	    boot_pages);
1219 #endif
1220 	mem = bootmem;
1221 	boot_pages -= pages;
1222 	bootmem += pages * PAGE_SIZE;
1223 	*pflag = UMA_SLAB_BOOT;
1224 
1225 	return (mem);
1226 }
1227 
1228 /*
1229  * Allocates a number of pages from the system
1230  *
1231  * Arguments:
1232  *	bytes  The number of bytes requested
1233  *	wait  Shall we wait?
1234  *
1235  * Returns:
1236  *	A pointer to the alloced memory or possibly
1237  *	NULL if M_NOWAIT is set.
1238  */
1239 static void *
1240 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1241     int wait)
1242 {
1243 	void *p;	/* Returned page */
1244 
1245 	*pflag = UMA_SLAB_KERNEL;
1246 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1247 
1248 	return (p);
1249 }
1250 
1251 static void *
1252 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1253     int wait)
1254 {
1255 	struct pglist alloctail;
1256 	vm_offset_t addr, zkva;
1257 	int cpu, flags;
1258 	vm_page_t p, p_next;
1259 #ifdef NUMA
1260 	struct pcpu *pc;
1261 #endif
1262 
1263 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1264 
1265 	TAILQ_INIT(&alloctail);
1266 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1267 	    malloc2vm_flags(wait);
1268 	*pflag = UMA_SLAB_KERNEL;
1269 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1270 		if (CPU_ABSENT(cpu)) {
1271 			p = vm_page_alloc(NULL, 0, flags);
1272 		} else {
1273 #ifndef NUMA
1274 			p = vm_page_alloc(NULL, 0, flags);
1275 #else
1276 			pc = pcpu_find(cpu);
1277 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1278 			if (__predict_false(p == NULL))
1279 				p = vm_page_alloc(NULL, 0, flags);
1280 #endif
1281 		}
1282 		if (__predict_false(p == NULL))
1283 			goto fail;
1284 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1285 	}
1286 	if ((addr = kva_alloc(bytes)) == 0)
1287 		goto fail;
1288 	zkva = addr;
1289 	TAILQ_FOREACH(p, &alloctail, listq) {
1290 		pmap_qenter(zkva, &p, 1);
1291 		zkva += PAGE_SIZE;
1292 	}
1293 	return ((void*)addr);
1294 fail:
1295 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1296 		vm_page_unwire_noq(p);
1297 		vm_page_free(p);
1298 	}
1299 	return (NULL);
1300 }
1301 
1302 /*
1303  * Allocates a number of pages from within an object
1304  *
1305  * Arguments:
1306  *	bytes  The number of bytes requested
1307  *	wait   Shall we wait?
1308  *
1309  * Returns:
1310  *	A pointer to the alloced memory or possibly
1311  *	NULL if M_NOWAIT is set.
1312  */
1313 static void *
1314 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1315     int wait)
1316 {
1317 	TAILQ_HEAD(, vm_page) alloctail;
1318 	u_long npages;
1319 	vm_offset_t retkva, zkva;
1320 	vm_page_t p, p_next;
1321 	uma_keg_t keg;
1322 
1323 	TAILQ_INIT(&alloctail);
1324 	keg = zone->uz_keg;
1325 
1326 	npages = howmany(bytes, PAGE_SIZE);
1327 	while (npages > 0) {
1328 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1329 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1330 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1331 		    VM_ALLOC_NOWAIT));
1332 		if (p != NULL) {
1333 			/*
1334 			 * Since the page does not belong to an object, its
1335 			 * listq is unused.
1336 			 */
1337 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1338 			npages--;
1339 			continue;
1340 		}
1341 		/*
1342 		 * Page allocation failed, free intermediate pages and
1343 		 * exit.
1344 		 */
1345 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1346 			vm_page_unwire_noq(p);
1347 			vm_page_free(p);
1348 		}
1349 		return (NULL);
1350 	}
1351 	*flags = UMA_SLAB_PRIV;
1352 	zkva = keg->uk_kva +
1353 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1354 	retkva = zkva;
1355 	TAILQ_FOREACH(p, &alloctail, listq) {
1356 		pmap_qenter(zkva, &p, 1);
1357 		zkva += PAGE_SIZE;
1358 	}
1359 
1360 	return ((void *)retkva);
1361 }
1362 
1363 /*
1364  * Frees a number of pages to the system
1365  *
1366  * Arguments:
1367  *	mem   A pointer to the memory to be freed
1368  *	size  The size of the memory being freed
1369  *	flags The original p->us_flags field
1370  *
1371  * Returns:
1372  *	Nothing
1373  */
1374 static void
1375 page_free(void *mem, vm_size_t size, uint8_t flags)
1376 {
1377 
1378 	if ((flags & UMA_SLAB_KERNEL) == 0)
1379 		panic("UMA: page_free used with invalid flags %x", flags);
1380 
1381 	kmem_free((vm_offset_t)mem, size);
1382 }
1383 
1384 /*
1385  * Frees pcpu zone allocations
1386  *
1387  * Arguments:
1388  *	mem   A pointer to the memory to be freed
1389  *	size  The size of the memory being freed
1390  *	flags The original p->us_flags field
1391  *
1392  * Returns:
1393  *	Nothing
1394  */
1395 static void
1396 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1397 {
1398 	vm_offset_t sva, curva;
1399 	vm_paddr_t paddr;
1400 	vm_page_t m;
1401 
1402 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1403 	sva = (vm_offset_t)mem;
1404 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1405 		paddr = pmap_kextract(curva);
1406 		m = PHYS_TO_VM_PAGE(paddr);
1407 		vm_page_unwire_noq(m);
1408 		vm_page_free(m);
1409 	}
1410 	pmap_qremove(sva, size >> PAGE_SHIFT);
1411 	kva_free(sva, size);
1412 }
1413 
1414 
/*
 * Zero fill initializer: clears the first 'size' bytes of the item and
 * always reports success.
 *
 * Arguments/Returns follow uma_init specifications
 */
static int
zero_init(void *mem, int size, int flags)
{

	memset(mem, 0, (size_t)size);
	return (0);
}
1426 
1427 /*
1428  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1429  *
1430  * Arguments
1431  *	keg  The zone we should initialize
1432  *
1433  * Returns
1434  *	Nothing
1435  */
1436 static void
1437 keg_small_init(uma_keg_t keg)
1438 {
1439 	u_int rsize;
1440 	u_int memused;
1441 	u_int wastedspace;
1442 	u_int shsize;
1443 	u_int slabsize;
1444 
1445 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1446 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1447 
1448 		slabsize = UMA_PCPU_ALLOC_SIZE;
1449 		keg->uk_ppera = ncpus;
1450 	} else {
1451 		slabsize = UMA_SLAB_SIZE;
1452 		keg->uk_ppera = 1;
1453 	}
1454 
1455 	/*
1456 	 * Calculate the size of each allocation (rsize) according to
1457 	 * alignment.  If the requested size is smaller than we have
1458 	 * allocation bits for we round it up.
1459 	 */
1460 	rsize = keg->uk_size;
1461 	if (rsize < slabsize / SLAB_SETSIZE)
1462 		rsize = slabsize / SLAB_SETSIZE;
1463 	if (rsize & keg->uk_align)
1464 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1465 	keg->uk_rsize = rsize;
1466 
1467 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1468 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1469 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1470 
1471 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1472 		shsize = 0;
1473 	else
1474 		shsize = SIZEOF_UMA_SLAB;
1475 
1476 	if (rsize <= slabsize - shsize)
1477 		keg->uk_ipers = (slabsize - shsize) / rsize;
1478 	else {
1479 		/* Handle special case when we have 1 item per slab, so
1480 		 * alignment requirement can be relaxed. */
1481 		KASSERT(keg->uk_size <= slabsize - shsize,
1482 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1483 		keg->uk_ipers = 1;
1484 	}
1485 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1486 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1487 
1488 	memused = keg->uk_ipers * rsize + shsize;
1489 	wastedspace = slabsize - memused;
1490 
1491 	/*
1492 	 * We can't do OFFPAGE if we're internal or if we've been
1493 	 * asked to not go to the VM for buckets.  If we do this we
1494 	 * may end up going to the VM  for slabs which we do not
1495 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1496 	 * of UMA_ZONE_VM, which clearly forbids it.
1497 	 */
1498 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1499 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1500 		return;
1501 
1502 	/*
1503 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1504 	 * this if it permits more items per-slab.
1505 	 *
1506 	 * XXX We could try growing slabsize to limit max waste as well.
1507 	 * Historically this was not done because the VM could not
1508 	 * efficiently handle contiguous allocations.
1509 	 */
1510 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1511 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1512 		keg->uk_ipers = slabsize / keg->uk_rsize;
1513 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1514 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1515 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1516 		    "keg: %s(%p), calculated wastedspace = %d, "
1517 		    "maximum wasted space allowed = %d, "
1518 		    "calculated ipers = %d, "
1519 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1520 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1521 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1522 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1523 	}
1524 
1525 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1526 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1527 		keg->uk_flags |= UMA_ZONE_HASH;
1528 }
1529 
1530 /*
1531  * Finish creating a large (> UMA_SLAB_SIZE) uma kegs.  Just give in and do
1532  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1533  * more complicated.
1534  *
1535  * Arguments
1536  *	keg  The keg we should initialize
1537  *
1538  * Returns
1539  *	Nothing
1540  */
1541 static void
1542 keg_large_init(uma_keg_t keg)
1543 {
1544 
1545 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1546 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1547 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1548 
1549 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1550 	keg->uk_ipers = 1;
1551 	keg->uk_rsize = keg->uk_size;
1552 
1553 	/* Check whether we have enough space to not do OFFPAGE. */
1554 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1555 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
1556 		/*
1557 		 * We can't do OFFPAGE if we're internal, in which case
1558 		 * we need an extra page per allocation to contain the
1559 		 * slab header.
1560 		 */
1561 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1562 			keg->uk_flags |= UMA_ZONE_OFFPAGE;
1563 		else
1564 			keg->uk_ppera++;
1565 	}
1566 
1567 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1568 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1569 		keg->uk_flags |= UMA_ZONE_HASH;
1570 }
1571 
1572 static void
1573 keg_cachespread_init(uma_keg_t keg)
1574 {
1575 	int alignsize;
1576 	int trailer;
1577 	int pages;
1578 	int rsize;
1579 
1580 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1581 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1582 
1583 	alignsize = keg->uk_align + 1;
1584 	rsize = keg->uk_size;
1585 	/*
1586 	 * We want one item to start on every align boundary in a page.  To
1587 	 * do this we will span pages.  We will also extend the item by the
1588 	 * size of align if it is an even multiple of align.  Otherwise, it
1589 	 * would fall on the same boundary every time.
1590 	 */
1591 	if (rsize & keg->uk_align)
1592 		rsize = (rsize & ~keg->uk_align) + alignsize;
1593 	if ((rsize & alignsize) == 0)
1594 		rsize += alignsize;
1595 	trailer = rsize - keg->uk_size;
1596 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1597 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1598 	keg->uk_rsize = rsize;
1599 	keg->uk_ppera = pages;
1600 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1601 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1602 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1603 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1604 	    keg->uk_ipers));
1605 }
1606 
1607 /*
1608  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1609  * the keg onto the global keg list.
1610  *
1611  * Arguments/Returns follow uma_ctor specifications
1612  *	udata  Actually uma_kctor_args
1613  */
1614 static int
1615 keg_ctor(void *mem, int size, void *udata, int flags)
1616 {
1617 	struct uma_kctor_args *arg = udata;
1618 	uma_keg_t keg = mem;
1619 	uma_zone_t zone;
1620 
1621 	bzero(keg, size);
1622 	keg->uk_size = arg->size;
1623 	keg->uk_init = arg->uminit;
1624 	keg->uk_fini = arg->fini;
1625 	keg->uk_align = arg->align;
1626 	keg->uk_free = 0;
1627 	keg->uk_reserve = 0;
1628 	keg->uk_pages = 0;
1629 	keg->uk_flags = arg->flags;
1630 	keg->uk_slabzone = NULL;
1631 
1632 	/*
1633 	 * We use a global round-robin policy by default.  Zones with
1634 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1635 	 * iterator is never run.
1636 	 */
1637 	keg->uk_dr.dr_policy = DOMAINSET_RR();
1638 	keg->uk_dr.dr_iter = 0;
1639 
1640 	/*
1641 	 * The master zone is passed to us at keg-creation time.
1642 	 */
1643 	zone = arg->zone;
1644 	keg->uk_name = zone->uz_name;
1645 
1646 	if (arg->flags & UMA_ZONE_VM)
1647 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1648 
1649 	if (arg->flags & UMA_ZONE_ZINIT)
1650 		keg->uk_init = zero_init;
1651 
1652 	if (arg->flags & UMA_ZONE_MALLOC)
1653 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1654 
1655 	if (arg->flags & UMA_ZONE_PCPU)
1656 #ifdef SMP
1657 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1658 #else
1659 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1660 #endif
1661 
1662 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1663 		keg_cachespread_init(keg);
1664 	} else {
1665 		if (keg->uk_size > UMA_SLAB_SPACE)
1666 			keg_large_init(keg);
1667 		else
1668 			keg_small_init(keg);
1669 	}
1670 
1671 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1672 		keg->uk_slabzone = slabzone;
1673 
1674 	/*
1675 	 * If we haven't booted yet we need allocations to go through the
1676 	 * startup cache until the vm is ready.
1677 	 */
1678 	if (booted < BOOT_PAGEALLOC)
1679 		keg->uk_allocf = startup_alloc;
1680 #ifdef UMA_MD_SMALL_ALLOC
1681 	else if (keg->uk_ppera == 1)
1682 		keg->uk_allocf = uma_small_alloc;
1683 #endif
1684 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1685 		keg->uk_allocf = pcpu_page_alloc;
1686 	else
1687 		keg->uk_allocf = page_alloc;
1688 #ifdef UMA_MD_SMALL_ALLOC
1689 	if (keg->uk_ppera == 1)
1690 		keg->uk_freef = uma_small_free;
1691 	else
1692 #endif
1693 	if (keg->uk_flags & UMA_ZONE_PCPU)
1694 		keg->uk_freef = pcpu_page_free;
1695 	else
1696 		keg->uk_freef = page_free;
1697 
1698 	/*
1699 	 * Initialize keg's lock
1700 	 */
1701 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1702 
1703 	/*
1704 	 * If we're putting the slab header in the actual page we need to
1705 	 * figure out where in each page it goes.  See SIZEOF_UMA_SLAB
1706 	 * macro definition.
1707 	 */
1708 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1709 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB;
1710 		/*
1711 		 * The only way the following is possible is if with our
1712 		 * UMA_ALIGN_PTR adjustments we are now bigger than
1713 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1714 		 * mathematically possible for all cases, so we make
1715 		 * sure here anyway.
1716 		 */
1717 		KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <=
1718 		    PAGE_SIZE * keg->uk_ppera,
1719 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
1720 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1721 	}
1722 
1723 	if (keg->uk_flags & UMA_ZONE_HASH)
1724 		hash_alloc(&keg->uk_hash, 0);
1725 
1726 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1727 	    keg, zone->uz_name, zone,
1728 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1729 	    keg->uk_free);
1730 
1731 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1732 
1733 	rw_wlock(&uma_rwlock);
1734 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1735 	rw_wunlock(&uma_rwlock);
1736 	return (0);
1737 }
1738 
1739 static void
1740 zone_alloc_counters(uma_zone_t zone)
1741 {
1742 
1743 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
1744 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
1745 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
1746 }
1747 
1748 /*
1749  * Zone header ctor.  This initializes all fields, locks, etc.
1750  *
1751  * Arguments/Returns follow uma_ctor specifications
1752  *	udata  Actually uma_zctor_args
1753  */
1754 static int
1755 zone_ctor(void *mem, int size, void *udata, int flags)
1756 {
1757 	struct uma_zctor_args *arg = udata;
1758 	uma_zone_t zone = mem;
1759 	uma_zone_t z;
1760 	uma_keg_t keg;
1761 
1762 	bzero(zone, size);
1763 	zone->uz_name = arg->name;
1764 	zone->uz_ctor = arg->ctor;
1765 	zone->uz_dtor = arg->dtor;
1766 	zone->uz_init = NULL;
1767 	zone->uz_fini = NULL;
1768 	zone->uz_sleeps = 0;
1769 	zone->uz_xdomain = 0;
1770 	zone->uz_count = 0;
1771 	zone->uz_count_min = 0;
1772 	zone->uz_count_max = BUCKET_MAX;
1773 	zone->uz_flags = 0;
1774 	zone->uz_warning = NULL;
1775 	/* The domain structures follow the cpu structures. */
1776 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1777 	zone->uz_bkt_max = ULONG_MAX;
1778 	timevalclear(&zone->uz_ratecheck);
1779 
1780 	if (__predict_true(booted == BOOT_RUNNING))
1781 		zone_alloc_counters(zone);
1782 	else {
1783 		zone->uz_allocs = EARLY_COUNTER;
1784 		zone->uz_frees = EARLY_COUNTER;
1785 		zone->uz_fails = EARLY_COUNTER;
1786 	}
1787 
1788 	/*
1789 	 * This is a pure cache zone, no kegs.
1790 	 */
1791 	if (arg->import) {
1792 		if (arg->flags & UMA_ZONE_VM)
1793 			arg->flags |= UMA_ZFLAG_CACHEONLY;
1794 		zone->uz_flags = arg->flags;
1795 		zone->uz_size = arg->size;
1796 		zone->uz_import = arg->import;
1797 		zone->uz_release = arg->release;
1798 		zone->uz_arg = arg->arg;
1799 		zone->uz_lockptr = &zone->uz_lock;
1800 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1801 		rw_wlock(&uma_rwlock);
1802 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1803 		rw_wunlock(&uma_rwlock);
1804 		goto out;
1805 	}
1806 
1807 	/*
1808 	 * Use the regular zone/keg/slab allocator.
1809 	 */
1810 	zone->uz_import = (uma_import)zone_import;
1811 	zone->uz_release = (uma_release)zone_release;
1812 	zone->uz_arg = zone;
1813 	keg = arg->keg;
1814 
1815 	if (arg->flags & UMA_ZONE_SECONDARY) {
1816 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1817 		zone->uz_init = arg->uminit;
1818 		zone->uz_fini = arg->fini;
1819 		zone->uz_lockptr = &keg->uk_lock;
1820 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1821 		rw_wlock(&uma_rwlock);
1822 		ZONE_LOCK(zone);
1823 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1824 			if (LIST_NEXT(z, uz_link) == NULL) {
1825 				LIST_INSERT_AFTER(z, zone, uz_link);
1826 				break;
1827 			}
1828 		}
1829 		ZONE_UNLOCK(zone);
1830 		rw_wunlock(&uma_rwlock);
1831 	} else if (keg == NULL) {
1832 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1833 		    arg->align, arg->flags)) == NULL)
1834 			return (ENOMEM);
1835 	} else {
1836 		struct uma_kctor_args karg;
1837 		int error;
1838 
1839 		/* We should only be here from uma_startup() */
1840 		karg.size = arg->size;
1841 		karg.uminit = arg->uminit;
1842 		karg.fini = arg->fini;
1843 		karg.align = arg->align;
1844 		karg.flags = arg->flags;
1845 		karg.zone = zone;
1846 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1847 		    flags);
1848 		if (error)
1849 			return (error);
1850 	}
1851 
1852 	zone->uz_keg = keg;
1853 	zone->uz_size = keg->uk_size;
1854 	zone->uz_flags |= (keg->uk_flags &
1855 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1856 
1857 	/*
1858 	 * Some internal zones don't have room allocated for the per cpu
1859 	 * caches.  If we're internal, bail out here.
1860 	 */
1861 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1862 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1863 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1864 		return (0);
1865 	}
1866 
1867 out:
1868 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1869 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1870 	    ("Invalid zone flag combination"));
1871 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) {
1872 		zone->uz_count = BUCKET_MAX;
1873 	} else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) {
1874 		zone->uz_count = BUCKET_MIN;
1875 		zone->uz_count_max = BUCKET_MIN;
1876 	} else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1877 		zone->uz_count = 0;
1878 	else
1879 		zone->uz_count = bucket_select(zone->uz_size);
1880 	zone->uz_count_min = zone->uz_count;
1881 
1882 	return (0);
1883 }
1884 
1885 /*
1886  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1887  * table and removes the keg from the global list.
1888  *
1889  * Arguments/Returns follow uma_dtor specifications
1890  *	udata  unused
1891  */
1892 static void
1893 keg_dtor(void *arg, int size, void *udata)
1894 {
1895 	uma_keg_t keg;
1896 
1897 	keg = (uma_keg_t)arg;
1898 	KEG_LOCK(keg);
1899 	if (keg->uk_free != 0) {
1900 		printf("Freed UMA keg (%s) was not empty (%d items). "
1901 		    " Lost %d pages of memory.\n",
1902 		    keg->uk_name ? keg->uk_name : "",
1903 		    keg->uk_free, keg->uk_pages);
1904 	}
1905 	KEG_UNLOCK(keg);
1906 
1907 	hash_free(&keg->uk_hash);
1908 
1909 	KEG_LOCK_FINI(keg);
1910 }
1911 
1912 /*
1913  * Zone header dtor.
1914  *
1915  * Arguments/Returns follow uma_dtor specifications
1916  *	udata  unused
1917  */
1918 static void
1919 zone_dtor(void *arg, int size, void *udata)
1920 {
1921 	uma_zone_t zone;
1922 	uma_keg_t keg;
1923 
1924 	zone = (uma_zone_t)arg;
1925 
1926 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1927 		cache_drain(zone);
1928 
1929 	rw_wlock(&uma_rwlock);
1930 	LIST_REMOVE(zone, uz_link);
1931 	rw_wunlock(&uma_rwlock);
1932 	/*
1933 	 * XXX there are some races here where
1934 	 * the zone can be drained but zone lock
1935 	 * released and then refilled before we
1936 	 * remove it... we dont care for now
1937 	 */
1938 	zone_drain_wait(zone, M_WAITOK);
1939 	/*
1940 	 * We only destroy kegs from non secondary/non cache zones.
1941 	 */
1942 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
1943 		keg = zone->uz_keg;
1944 		rw_wlock(&uma_rwlock);
1945 		LIST_REMOVE(keg, uk_link);
1946 		rw_wunlock(&uma_rwlock);
1947 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1948 	}
1949 	counter_u64_free(zone->uz_allocs);
1950 	counter_u64_free(zone->uz_frees);
1951 	counter_u64_free(zone->uz_fails);
1952 	if (zone->uz_lockptr == &zone->uz_lock)
1953 		ZONE_LOCK_FINI(zone);
1954 }
1955 
1956 /*
1957  * Traverses every zone in the system and calls a callback
1958  *
1959  * Arguments:
1960  *	zfunc  A pointer to a function which accepts a zone
1961  *		as an argument.
1962  *
1963  * Returns:
1964  *	Nothing
1965  */
1966 static void
1967 zone_foreach(void (*zfunc)(uma_zone_t))
1968 {
1969 	uma_keg_t keg;
1970 	uma_zone_t zone;
1971 
1972 	/*
1973 	 * Before BOOT_RUNNING we are guaranteed to be single
1974 	 * threaded, so locking isn't needed. Startup functions
1975 	 * are allowed to use M_WAITOK.
1976 	 */
1977 	if (__predict_true(booted == BOOT_RUNNING))
1978 		rw_rlock(&uma_rwlock);
1979 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1980 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1981 			zfunc(zone);
1982 	}
1983 	if (__predict_true(booted == BOOT_RUNNING))
1984 		rw_runlock(&uma_rwlock);
1985 }
1986 
1987 /*
1988  * Count how many pages do we need to bootstrap.  VM supplies
1989  * its need in early zones in the argument, we add up our zones,
1990  * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The
1991  * zone of zones and zone of kegs are accounted separately.
1992  */
1993 #define	UMA_BOOT_ZONES	11
1994 /* Zone of zones and zone of kegs have arbitrary alignment. */
1995 #define	UMA_BOOT_ALIGN	32
1996 static int zsize, ksize;
1997 int
1998 uma_startup_count(int vm_zones)
1999 {
2000 	int zones, pages;
2001 
2002 	ksize = sizeof(struct uma_keg) +
2003 	    (sizeof(struct uma_domain) * vm_ndomains);
2004 	zsize = sizeof(struct uma_zone) +
2005 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2006 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2007 
2008 	/*
2009 	 * Memory for the zone of kegs and its keg,
2010 	 * and for zone of zones.
2011 	 */
2012 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2013 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2014 
2015 #ifdef	UMA_MD_SMALL_ALLOC
2016 	zones = UMA_BOOT_ZONES;
2017 #else
2018 	zones = UMA_BOOT_ZONES + vm_zones;
2019 	vm_zones = 0;
2020 #endif
2021 
2022 	/* Memory for the rest of startup zones, UMA and VM, ... */
2023 	if (zsize > UMA_SLAB_SPACE) {
2024 		/* See keg_large_init(). */
2025 		u_int ppera;
2026 
2027 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2028 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) <
2029 		    SIZEOF_UMA_SLAB)
2030 			ppera++;
2031 		pages += (zones + vm_zones) * ppera;
2032 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2033 		/* See keg_small_init() special case for uk_ppera = 1. */
2034 		pages += zones;
2035 	else
2036 		pages += howmany(zones,
2037 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2038 
2039 	/* ... and their kegs. Note that zone of zones allocates a keg! */
2040 	pages += howmany(zones + 1,
2041 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2042 
2043 	/*
2044 	 * Most of startup zones are not going to be offpages, that's
2045 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all
2046 	 * calculations.  Some large bucket zones will be offpage, and
2047 	 * thus will allocate hashes.  We take conservative approach
2048 	 * and assume that all zones may allocate hash.  This may give
2049 	 * us some positive inaccuracy, usually an extra single page.
2050 	 */
2051 	pages += howmany(zones, UMA_SLAB_SPACE /
2052 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2053 
2054 	return (pages);
2055 }
2056 
2057 void
2058 uma_startup(void *mem, int npages)
2059 {
2060 	struct uma_zctor_args args;
2061 	uma_keg_t masterkeg;
2062 	uintptr_t m;
2063 
2064 #ifdef DIAGNOSTIC
2065 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2066 #endif
2067 
2068 	rw_init(&uma_rwlock, "UMA lock");
2069 
2070 	/* Use bootpages memory for the zone of zones and zone of kegs. */
2071 	m = (uintptr_t)mem;
2072 	zones = (uma_zone_t)m;
2073 	m += roundup(zsize, CACHE_LINE_SIZE);
2074 	kegs = (uma_zone_t)m;
2075 	m += roundup(zsize, CACHE_LINE_SIZE);
2076 	masterkeg = (uma_keg_t)m;
2077 	m += roundup(ksize, CACHE_LINE_SIZE);
2078 	m = roundup(m, PAGE_SIZE);
2079 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2080 	mem = (void *)m;
2081 
2082 	/* "manually" create the initial zone */
2083 	memset(&args, 0, sizeof(args));
2084 	args.name = "UMA Kegs";
2085 	args.size = ksize;
2086 	args.ctor = keg_ctor;
2087 	args.dtor = keg_dtor;
2088 	args.uminit = zero_init;
2089 	args.fini = NULL;
2090 	args.keg = masterkeg;
2091 	args.align = UMA_BOOT_ALIGN - 1;
2092 	args.flags = UMA_ZFLAG_INTERNAL;
2093 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2094 
2095 	bootmem = mem;
2096 	boot_pages = npages;
2097 
2098 	args.name = "UMA Zones";
2099 	args.size = zsize;
2100 	args.ctor = zone_ctor;
2101 	args.dtor = zone_dtor;
2102 	args.uminit = zero_init;
2103 	args.fini = NULL;
2104 	args.keg = NULL;
2105 	args.align = UMA_BOOT_ALIGN - 1;
2106 	args.flags = UMA_ZFLAG_INTERNAL;
2107 	zone_ctor(zones, zsize, &args, M_WAITOK);
2108 
2109 	/* Now make a zone for slab headers */
2110 	slabzone = uma_zcreate("UMA Slabs",
2111 				sizeof(struct uma_slab),
2112 				NULL, NULL, NULL, NULL,
2113 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2114 
2115 	hashzone = uma_zcreate("UMA Hash",
2116 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2117 	    NULL, NULL, NULL, NULL,
2118 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2119 
2120 	bucket_init();
2121 
2122 	booted = BOOT_STRAPPED;
2123 }
2124 
2125 void
2126 uma_startup1(void)
2127 {
2128 
2129 #ifdef DIAGNOSTIC
2130 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2131 #endif
2132 	booted = BOOT_PAGEALLOC;
2133 }
2134 
2135 void
2136 uma_startup2(void)
2137 {
2138 
2139 #ifdef DIAGNOSTIC
2140 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2141 #endif
2142 	booted = BOOT_BUCKETS;
2143 	sx_init(&uma_drain_lock, "umadrain");
2144 	bucket_enable();
2145 }
2146 
2147 /*
2148  * Initialize our callout handle
2149  *
2150  */
2151 static void
2152 uma_startup3(void)
2153 {
2154 
2155 #ifdef INVARIANTS
2156 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2157 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2158 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2159 #endif
2160 	zone_foreach(zone_alloc_counters);
2161 	callout_init(&uma_callout, 1);
2162 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2163 	booted = BOOT_RUNNING;
2164 }
2165 
2166 static uma_keg_t
2167 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2168 		int align, uint32_t flags)
2169 {
2170 	struct uma_kctor_args args;
2171 
2172 	args.size = size;
2173 	args.uminit = uminit;
2174 	args.fini = fini;
2175 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2176 	args.flags = flags;
2177 	args.zone = zone;
2178 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2179 }
2180 
2181 /* Public functions */
2182 /* See uma.h */
2183 void
2184 uma_set_align(int align)
2185 {
2186 
2187 	if (align != UMA_ALIGN_CACHE)
2188 		uma_align_cache = align;
2189 }
2190 
2191 /* See uma.h */
2192 uma_zone_t
2193 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2194 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2195 
2196 {
2197 	struct uma_zctor_args args;
2198 	uma_zone_t res;
2199 	bool locked;
2200 
2201 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2202 	    align, name));
2203 
2204 	/* Sets all zones to a first-touch domain policy. */
2205 #ifdef UMA_FIRSTTOUCH
2206 	flags |= UMA_ZONE_NUMA;
2207 #endif
2208 
2209 	/* This stuff is essential for the zone ctor */
2210 	memset(&args, 0, sizeof(args));
2211 	args.name = name;
2212 	args.size = size;
2213 	args.ctor = ctor;
2214 	args.dtor = dtor;
2215 	args.uminit = uminit;
2216 	args.fini = fini;
2217 #ifdef  INVARIANTS
2218 	/*
2219 	 * If a zone is being created with an empty constructor and
2220 	 * destructor, pass UMA constructor/destructor which checks for
2221 	 * memory use after free.
2222 	 */
2223 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2224 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2225 		args.ctor = trash_ctor;
2226 		args.dtor = trash_dtor;
2227 		args.uminit = trash_init;
2228 		args.fini = trash_fini;
2229 	}
2230 #endif
2231 	args.align = align;
2232 	args.flags = flags;
2233 	args.keg = NULL;
2234 
2235 	if (booted < BOOT_BUCKETS) {
2236 		locked = false;
2237 	} else {
2238 		sx_slock(&uma_drain_lock);
2239 		locked = true;
2240 	}
2241 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2242 	if (locked)
2243 		sx_sunlock(&uma_drain_lock);
2244 	return (res);
2245 }
2246 
2247 /* See uma.h */
2248 uma_zone_t
2249 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2250 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2251 {
2252 	struct uma_zctor_args args;
2253 	uma_keg_t keg;
2254 	uma_zone_t res;
2255 	bool locked;
2256 
2257 	keg = master->uz_keg;
2258 	memset(&args, 0, sizeof(args));
2259 	args.name = name;
2260 	args.size = keg->uk_size;
2261 	args.ctor = ctor;
2262 	args.dtor = dtor;
2263 	args.uminit = zinit;
2264 	args.fini = zfini;
2265 	args.align = keg->uk_align;
2266 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2267 	args.keg = keg;
2268 
2269 	if (booted < BOOT_BUCKETS) {
2270 		locked = false;
2271 	} else {
2272 		sx_slock(&uma_drain_lock);
2273 		locked = true;
2274 	}
2275 	/* XXX Attaches only one keg of potentially many. */
2276 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2277 	if (locked)
2278 		sx_sunlock(&uma_drain_lock);
2279 	return (res);
2280 }
2281 
2282 /* See uma.h */
2283 uma_zone_t
2284 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2285 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2286 		    uma_release zrelease, void *arg, int flags)
2287 {
2288 	struct uma_zctor_args args;
2289 
2290 	memset(&args, 0, sizeof(args));
2291 	args.name = name;
2292 	args.size = size;
2293 	args.ctor = ctor;
2294 	args.dtor = dtor;
2295 	args.uminit = zinit;
2296 	args.fini = zfini;
2297 	args.import = zimport;
2298 	args.release = zrelease;
2299 	args.arg = arg;
2300 	args.align = 0;
2301 	args.flags = flags | UMA_ZFLAG_CACHE;
2302 
2303 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2304 }
2305 
2306 /* See uma.h */
2307 void
2308 uma_zdestroy(uma_zone_t zone)
2309 {
2310 
2311 	sx_slock(&uma_drain_lock);
2312 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2313 	sx_sunlock(&uma_drain_lock);
2314 }
2315 
2316 void
2317 uma_zwait(uma_zone_t zone)
2318 {
2319 	void *item;
2320 
2321 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2322 	uma_zfree(zone, item);
2323 }
2324 
2325 void *
2326 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2327 {
2328 	void *item;
2329 #ifdef SMP
2330 	int i;
2331 
2332 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2333 #endif
2334 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2335 	if (item != NULL && (flags & M_ZERO)) {
2336 #ifdef SMP
2337 		for (i = 0; i <= mp_maxid; i++)
2338 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2339 #else
2340 		bzero(item, zone->uz_size);
2341 #endif
2342 	}
2343 	return (item);
2344 }
2345 
2346 /*
2347  * A stub while both regular and pcpu cases are identical.
2348  */
2349 void
2350 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2351 {
2352 
2353 #ifdef SMP
2354 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2355 #endif
2356 	uma_zfree_arg(zone, item, udata);
2357 }
2358 
2359 /* See uma.h */
2360 void *
2361 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2362 {
2363 	uma_zone_domain_t zdom;
2364 	uma_bucket_t bucket;
2365 	uma_cache_t cache;
2366 	void *item;
2367 	int cpu, domain, lockfail, maxbucket;
2368 #ifdef INVARIANTS
2369 	bool skipdbg;
2370 #endif
2371 
2372 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2373 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2374 
2375 	/* This is the fast path allocation */
2376 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2377 	    curthread, zone->uz_name, zone, flags);
2378 
2379 	if (flags & M_WAITOK) {
2380 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2381 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2382 	}
2383 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2384 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2385 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2386 	if (zone->uz_flags & UMA_ZONE_PCPU)
2387 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2388 		    "with M_ZERO passed"));
2389 
2390 #ifdef DEBUG_MEMGUARD
2391 	if (memguard_cmp_zone(zone)) {
2392 		item = memguard_alloc(zone->uz_size, flags);
2393 		if (item != NULL) {
2394 			if (zone->uz_init != NULL &&
2395 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2396 				return (NULL);
2397 			if (zone->uz_ctor != NULL &&
2398 			    zone->uz_ctor(item, zone->uz_size, udata,
2399 			    flags) != 0) {
2400 			    	zone->uz_fini(item, zone->uz_size);
2401 				return (NULL);
2402 			}
2403 			return (item);
2404 		}
2405 		/* This is unfortunate but should not be fatal. */
2406 	}
2407 #endif
2408 	/*
2409 	 * If possible, allocate from the per-CPU cache.  There are two
2410 	 * requirements for safe access to the per-CPU cache: (1) the thread
2411 	 * accessing the cache must not be preempted or yield during access,
2412 	 * and (2) the thread must not migrate CPUs without switching which
2413 	 * cache it accesses.  We rely on a critical section to prevent
2414 	 * preemption and migration.  We release the critical section in
2415 	 * order to acquire the zone mutex if we are unable to allocate from
2416 	 * the current cache; when we re-acquire the critical section, we
2417 	 * must detect and handle migration if it has occurred.
2418 	 */
2419 zalloc_restart:
2420 	critical_enter();
2421 	cpu = curcpu;
2422 	cache = &zone->uz_cpu[cpu];
2423 
2424 zalloc_start:
2425 	bucket = cache->uc_allocbucket;
2426 	if (bucket != NULL && bucket->ub_cnt > 0) {
2427 		bucket->ub_cnt--;
2428 		item = bucket->ub_bucket[bucket->ub_cnt];
2429 #ifdef INVARIANTS
2430 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2431 #endif
2432 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2433 		cache->uc_allocs++;
2434 		critical_exit();
2435 #ifdef INVARIANTS
2436 		skipdbg = uma_dbg_zskip(zone, item);
2437 #endif
2438 		if (zone->uz_ctor != NULL &&
2439 #ifdef INVARIANTS
2440 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2441 		    zone->uz_dtor != trash_dtor) &&
2442 #endif
2443 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2444 			counter_u64_add(zone->uz_fails, 1);
2445 			zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2446 			return (NULL);
2447 		}
2448 #ifdef INVARIANTS
2449 		if (!skipdbg)
2450 			uma_dbg_alloc(zone, NULL, item);
2451 #endif
2452 		if (flags & M_ZERO)
2453 			uma_zero_item(item, zone);
2454 		return (item);
2455 	}
2456 
2457 	/*
2458 	 * We have run out of items in our alloc bucket.
2459 	 * See if we can switch with our free bucket.
2460 	 */
2461 	bucket = cache->uc_freebucket;
2462 	if (bucket != NULL && bucket->ub_cnt > 0) {
2463 		CTR2(KTR_UMA,
2464 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2465 		    zone->uz_name, zone);
2466 		cache->uc_freebucket = cache->uc_allocbucket;
2467 		cache->uc_allocbucket = bucket;
2468 		goto zalloc_start;
2469 	}
2470 
2471 	/*
2472 	 * Discard any empty allocation bucket while we hold no locks.
2473 	 */
2474 	bucket = cache->uc_allocbucket;
2475 	cache->uc_allocbucket = NULL;
2476 	critical_exit();
2477 	if (bucket != NULL)
2478 		bucket_free(zone, bucket, udata);
2479 
2480 	/* Short-circuit for zones without buckets and low memory. */
2481 	if (zone->uz_count == 0 || bucketdisable) {
2482 		ZONE_LOCK(zone);
2483 		if (zone->uz_flags & UMA_ZONE_NUMA)
2484 			domain = PCPU_GET(domain);
2485 		else
2486 			domain = UMA_ANYDOMAIN;
2487 		goto zalloc_item;
2488 	}
2489 
2490 	/*
2491 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
2492 	 * we must go back to the zone.  This requires the zone lock, so we
2493 	 * must drop the critical section, then re-acquire it when we go back
2494 	 * to the cache.  Since the critical section is released, we may be
2495 	 * preempted or migrate.  As such, make sure not to maintain any
2496 	 * thread-local state specific to the cache from prior to releasing
2497 	 * the critical section.
2498 	 */
2499 	lockfail = 0;
2500 	if (ZONE_TRYLOCK(zone) == 0) {
2501 		/* Record contention to size the buckets. */
2502 		ZONE_LOCK(zone);
2503 		lockfail = 1;
2504 	}
2505 	critical_enter();
2506 	cpu = curcpu;
2507 	cache = &zone->uz_cpu[cpu];
2508 
2509 	/* See if we lost the race to fill the cache. */
2510 	if (cache->uc_allocbucket != NULL) {
2511 		ZONE_UNLOCK(zone);
2512 		goto zalloc_start;
2513 	}
2514 
2515 	/*
2516 	 * Check the zone's cache of buckets.
2517 	 */
2518 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2519 		domain = PCPU_GET(domain);
2520 		zdom = &zone->uz_domain[domain];
2521 	} else {
2522 		domain = UMA_ANYDOMAIN;
2523 		zdom = &zone->uz_domain[0];
2524 	}
2525 
2526 	if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
2527 		KASSERT(bucket->ub_cnt != 0,
2528 		    ("uma_zalloc_arg: Returning an empty bucket."));
2529 		cache->uc_allocbucket = bucket;
2530 		ZONE_UNLOCK(zone);
2531 		goto zalloc_start;
2532 	}
2533 	/* We are no longer associated with this CPU. */
2534 	critical_exit();
2535 
2536 	/*
2537 	 * We bump the uz count when the cache size is insufficient to
2538 	 * handle the working set.
2539 	 */
2540 	if (lockfail && zone->uz_count < zone->uz_count_max)
2541 		zone->uz_count++;
2542 
2543 	if (zone->uz_max_items > 0) {
2544 		if (zone->uz_items >= zone->uz_max_items)
2545 			goto zalloc_item;
2546 		maxbucket = MIN(zone->uz_count,
2547 		    zone->uz_max_items - zone->uz_items);
2548 		zone->uz_items += maxbucket;
2549 	} else
2550 		maxbucket = zone->uz_count;
2551 	ZONE_UNLOCK(zone);
2552 
2553 	/*
2554 	 * Now lets just fill a bucket and put it on the free list.  If that
2555 	 * works we'll restart the allocation from the beginning and it
2556 	 * will use the just filled bucket.
2557 	 */
2558 	bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket);
2559 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2560 	    zone->uz_name, zone, bucket);
2561 	ZONE_LOCK(zone);
2562 	if (bucket != NULL) {
2563 		if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) {
2564 			MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt);
2565 			zone->uz_items -= maxbucket - bucket->ub_cnt;
2566 			if (zone->uz_sleepers > 0 &&
2567 			    zone->uz_items < zone->uz_max_items)
2568 				wakeup_one(zone);
2569 		}
2570 		critical_enter();
2571 		cpu = curcpu;
2572 		cache = &zone->uz_cpu[cpu];
2573 
2574 		/*
2575 		 * See if we lost the race or were migrated.  Cache the
2576 		 * initialized bucket to make this less likely or claim
2577 		 * the memory directly.
2578 		 */
2579 		if (cache->uc_allocbucket == NULL &&
2580 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2581 		    domain == PCPU_GET(domain))) {
2582 			cache->uc_allocbucket = bucket;
2583 			zdom->uzd_imax += bucket->ub_cnt;
2584 		} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
2585 			critical_exit();
2586 			ZONE_UNLOCK(zone);
2587 			bucket_drain(zone, bucket);
2588 			bucket_free(zone, bucket, udata);
2589 			goto zalloc_restart;
2590 		} else
2591 			zone_put_bucket(zone, zdom, bucket, false);
2592 		ZONE_UNLOCK(zone);
2593 		goto zalloc_start;
2594 	} else if (zone->uz_max_items > 0) {
2595 		zone->uz_items -= maxbucket;
2596 		if (zone->uz_sleepers > 0 &&
2597 		    zone->uz_items + 1 < zone->uz_max_items)
2598 			wakeup_one(zone);
2599 	}
2600 
2601 	/*
2602 	 * We may not be able to get a bucket so return an actual item.
2603 	 */
2604 zalloc_item:
2605 	item = zone_alloc_item_locked(zone, udata, domain, flags);
2606 
2607 	return (item);
2608 }
2609 
2610 void *
2611 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2612 {
2613 
2614 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2615 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2616 
2617 	/* This is the fast path allocation */
2618 	CTR5(KTR_UMA,
2619 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2620 	    curthread, zone->uz_name, zone, domain, flags);
2621 
2622 	if (flags & M_WAITOK) {
2623 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2624 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2625 	}
2626 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2627 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2628 
2629 	return (zone_alloc_item(zone, udata, domain, flags));
2630 }
2631 
2632 /*
2633  * Find a slab with some space.  Prefer slabs that are partially used over those
2634  * that are totally full.  This helps to reduce fragmentation.
2635  *
2636  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2637  * only 'domain'.
2638  */
2639 static uma_slab_t
2640 keg_first_slab(uma_keg_t keg, int domain, bool rr)
2641 {
2642 	uma_domain_t dom;
2643 	uma_slab_t slab;
2644 	int start;
2645 
2646 	KASSERT(domain >= 0 && domain < vm_ndomains,
2647 	    ("keg_first_slab: domain %d out of range", domain));
2648 	KEG_LOCK_ASSERT(keg);
2649 
2650 	slab = NULL;
2651 	start = domain;
2652 	do {
2653 		dom = &keg->uk_domain[domain];
2654 		if (!LIST_EMPTY(&dom->ud_part_slab))
2655 			return (LIST_FIRST(&dom->ud_part_slab));
2656 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2657 			slab = LIST_FIRST(&dom->ud_free_slab);
2658 			LIST_REMOVE(slab, us_link);
2659 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2660 			return (slab);
2661 		}
2662 		if (rr)
2663 			domain = (domain + 1) % vm_ndomains;
2664 	} while (domain != start);
2665 
2666 	return (NULL);
2667 }
2668 
2669 static uma_slab_t
2670 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2671 {
2672 	uint32_t reserve;
2673 
2674 	KEG_LOCK_ASSERT(keg);
2675 
2676 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2677 	if (keg->uk_free <= reserve)
2678 		return (NULL);
2679 	return (keg_first_slab(keg, domain, rr));
2680 }
2681 
2682 static uma_slab_t
2683 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2684 {
2685 	struct vm_domainset_iter di;
2686 	uma_domain_t dom;
2687 	uma_slab_t slab;
2688 	int aflags, domain;
2689 	bool rr;
2690 
2691 restart:
2692 	KEG_LOCK_ASSERT(keg);
2693 
2694 	/*
2695 	 * Use the keg's policy if upper layers haven't already specified a
2696 	 * domain (as happens with first-touch zones).
2697 	 *
2698 	 * To avoid races we run the iterator with the keg lock held, but that
2699 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2700 	 * clear M_WAITOK and handle low memory conditions locally.
2701 	 */
2702 	rr = rdomain == UMA_ANYDOMAIN;
2703 	if (rr) {
2704 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2705 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2706 		    &aflags);
2707 	} else {
2708 		aflags = flags;
2709 		domain = rdomain;
2710 	}
2711 
2712 	for (;;) {
2713 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
2714 		if (slab != NULL) {
2715 			MPASS(slab->us_keg == keg);
2716 			return (slab);
2717 		}
2718 
2719 		/*
2720 		 * M_NOVM means don't ask at all!
2721 		 */
2722 		if (flags & M_NOVM)
2723 			break;
2724 
2725 		KASSERT(zone->uz_max_items == 0 ||
2726 		    zone->uz_items <= zone->uz_max_items,
2727 		    ("%s: zone %p overflow", __func__, zone));
2728 
2729 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
2730 		/*
2731 		 * If we got a slab here it's safe to mark it partially used
2732 		 * and return.  We assume that the caller is going to remove
2733 		 * at least one item.
2734 		 */
2735 		if (slab) {
2736 			MPASS(slab->us_keg == keg);
2737 			dom = &keg->uk_domain[slab->us_domain];
2738 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2739 			return (slab);
2740 		}
2741 		KEG_LOCK(keg);
2742 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2743 			if ((flags & M_WAITOK) != 0) {
2744 				KEG_UNLOCK(keg);
2745 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
2746 				KEG_LOCK(keg);
2747 				goto restart;
2748 			}
2749 			break;
2750 		}
2751 	}
2752 
2753 	/*
2754 	 * We might not have been able to get a slab but another cpu
2755 	 * could have while we were unlocked.  Check again before we
2756 	 * fail.
2757 	 */
2758 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2759 		MPASS(slab->us_keg == keg);
2760 		return (slab);
2761 	}
2762 	return (NULL);
2763 }
2764 
2765 static uma_slab_t
2766 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2767 {
2768 	uma_slab_t slab;
2769 
2770 	if (keg == NULL) {
2771 		keg = zone->uz_keg;
2772 		KEG_LOCK(keg);
2773 	}
2774 
2775 	for (;;) {
2776 		slab = keg_fetch_slab(keg, zone, domain, flags);
2777 		if (slab)
2778 			return (slab);
2779 		if (flags & (M_NOWAIT | M_NOVM))
2780 			break;
2781 	}
2782 	KEG_UNLOCK(keg);
2783 	return (NULL);
2784 }
2785 
2786 static void *
2787 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2788 {
2789 	uma_domain_t dom;
2790 	void *item;
2791 	uint8_t freei;
2792 
2793 	MPASS(keg == slab->us_keg);
2794 	KEG_LOCK_ASSERT(keg);
2795 
2796 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2797 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2798 	item = slab->us_data + (keg->uk_rsize * freei);
2799 	slab->us_freecount--;
2800 	keg->uk_free--;
2801 
2802 	/* Move this slab to the full list */
2803 	if (slab->us_freecount == 0) {
2804 		LIST_REMOVE(slab, us_link);
2805 		dom = &keg->uk_domain[slab->us_domain];
2806 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2807 	}
2808 
2809 	return (item);
2810 }
2811 
2812 static int
2813 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2814 {
2815 	uma_slab_t slab;
2816 	uma_keg_t keg;
2817 #ifdef NUMA
2818 	int stripe;
2819 #endif
2820 	int i;
2821 
2822 	slab = NULL;
2823 	keg = NULL;
2824 	/* Try to keep the buckets totally full */
2825 	for (i = 0; i < max; ) {
2826 		if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL)
2827 			break;
2828 		keg = slab->us_keg;
2829 #ifdef NUMA
2830 		stripe = howmany(max, vm_ndomains);
2831 #endif
2832 		while (slab->us_freecount && i < max) {
2833 			bucket[i++] = slab_alloc_item(keg, slab);
2834 			if (keg->uk_free <= keg->uk_reserve)
2835 				break;
2836 #ifdef NUMA
2837 			/*
2838 			 * If the zone is striped we pick a new slab for every
2839 			 * N allocations.  Eliminating this conditional will
2840 			 * instead pick a new domain for each bucket rather
2841 			 * than stripe within each bucket.  The current option
2842 			 * produces more fragmentation and requires more cpu
2843 			 * time but yields better distribution.
2844 			 */
2845 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2846 			    vm_ndomains > 1 && --stripe == 0)
2847 				break;
2848 #endif
2849 		}
2850 		/* Don't block if we allocated any successfully. */
2851 		flags &= ~M_WAITOK;
2852 		flags |= M_NOWAIT;
2853 	}
2854 	if (slab != NULL)
2855 		KEG_UNLOCK(keg);
2856 
2857 	return i;
2858 }
2859 
2860 static uma_bucket_t
2861 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
2862 {
2863 	uma_bucket_t bucket;
2864 
2865 	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
2866 
2867 	/* Avoid allocs targeting empty domains. */
2868 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
2869 		domain = UMA_ANYDOMAIN;
2870 
2871 	/* Don't wait for buckets, preserve caller's NOVM setting. */
2872 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2873 	if (bucket == NULL)
2874 		return (NULL);
2875 
2876 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2877 	    MIN(max, bucket->ub_entries), domain, flags);
2878 
2879 	/*
2880 	 * Initialize the memory if necessary.
2881 	 */
2882 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2883 		int i;
2884 
2885 		for (i = 0; i < bucket->ub_cnt; i++)
2886 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2887 			    flags) != 0)
2888 				break;
2889 		/*
2890 		 * If we couldn't initialize the whole bucket, put the
2891 		 * rest back onto the freelist.
2892 		 */
2893 		if (i != bucket->ub_cnt) {
2894 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2895 			    bucket->ub_cnt - i);
2896 #ifdef INVARIANTS
2897 			bzero(&bucket->ub_bucket[i],
2898 			    sizeof(void *) * (bucket->ub_cnt - i));
2899 #endif
2900 			bucket->ub_cnt = i;
2901 		}
2902 	}
2903 
2904 	if (bucket->ub_cnt == 0) {
2905 		bucket_free(zone, bucket, udata);
2906 		counter_u64_add(zone->uz_fails, 1);
2907 		return (NULL);
2908 	}
2909 
2910 	return (bucket);
2911 }
2912 
2913 /*
2914  * Allocates a single item from a zone.
2915  *
2916  * Arguments
2917  *	zone   The zone to alloc for.
2918  *	udata  The data to be passed to the constructor.
2919  *	domain The domain to allocate from or UMA_ANYDOMAIN.
2920  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2921  *
2922  * Returns
2923  *	NULL if there is no memory and M_NOWAIT is set
2924  *	An item if successful
2925  */
2926 
2927 static void *
2928 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
2929 {
2930 
2931 	ZONE_LOCK(zone);
2932 	return (zone_alloc_item_locked(zone, udata, domain, flags));
2933 }
2934 
2935 /*
2936  * Returns with zone unlocked.
2937  */
2938 static void *
2939 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
2940 {
2941 	void *item;
2942 #ifdef INVARIANTS
2943 	bool skipdbg;
2944 #endif
2945 
2946 	ZONE_LOCK_ASSERT(zone);
2947 
2948 	if (zone->uz_max_items > 0) {
2949 		if (zone->uz_items >= zone->uz_max_items) {
2950 			zone_log_warning(zone);
2951 			zone_maxaction(zone);
2952 			if (flags & M_NOWAIT) {
2953 				ZONE_UNLOCK(zone);
2954 				return (NULL);
2955 			}
2956 			zone->uz_sleeps++;
2957 			zone->uz_sleepers++;
2958 			while (zone->uz_items >= zone->uz_max_items)
2959 				mtx_sleep(zone, zone->uz_lockptr, PVM,
2960 				    "zonelimit", 0);
2961 			zone->uz_sleepers--;
2962 			if (zone->uz_sleepers > 0 &&
2963 			    zone->uz_items + 1 < zone->uz_max_items)
2964 				wakeup_one(zone);
2965 		}
2966 		zone->uz_items++;
2967 	}
2968 	ZONE_UNLOCK(zone);
2969 
2970 	/* Avoid allocs targeting empty domains. */
2971 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
2972 		domain = UMA_ANYDOMAIN;
2973 
2974 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
2975 		goto fail;
2976 
2977 #ifdef INVARIANTS
2978 	skipdbg = uma_dbg_zskip(zone, item);
2979 #endif
2980 	/*
2981 	 * We have to call both the zone's init (not the keg's init)
2982 	 * and the zone's ctor.  This is because the item is going from
2983 	 * a keg slab directly to the user, and the user is expecting it
2984 	 * to be both zone-init'd as well as zone-ctor'd.
2985 	 */
2986 	if (zone->uz_init != NULL) {
2987 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2988 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
2989 			goto fail;
2990 		}
2991 	}
2992 	if (zone->uz_ctor != NULL &&
2993 #ifdef INVARIANTS
2994 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
2995 	    zone->uz_dtor != trash_dtor) &&
2996 #endif
2997 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2998 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2999 		goto fail;
3000 	}
3001 #ifdef INVARIANTS
3002 	if (!skipdbg)
3003 		uma_dbg_alloc(zone, NULL, item);
3004 #endif
3005 	if (flags & M_ZERO)
3006 		uma_zero_item(item, zone);
3007 
3008 	counter_u64_add(zone->uz_allocs, 1);
3009 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3010 	    zone->uz_name, zone);
3011 
3012 	return (item);
3013 
3014 fail:
3015 	if (zone->uz_max_items > 0) {
3016 		ZONE_LOCK(zone);
3017 		zone->uz_items--;
3018 		ZONE_UNLOCK(zone);
3019 	}
3020 	counter_u64_add(zone->uz_fails, 1);
3021 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3022 	    zone->uz_name, zone);
3023 	return (NULL);
3024 }
3025 
3026 /* See uma.h */
3027 void
3028 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3029 {
3030 	uma_cache_t cache;
3031 	uma_bucket_t bucket;
3032 	uma_zone_domain_t zdom;
3033 	int cpu, domain;
3034 #ifdef UMA_XDOMAIN
3035 	int itemdomain;
3036 #endif
3037 	bool lockfail;
3038 #ifdef INVARIANTS
3039 	bool skipdbg;
3040 #endif
3041 
3042 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3043 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3044 
3045 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3046 	    zone->uz_name);
3047 
3048 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3049 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3050 
3051         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3052         if (item == NULL)
3053                 return;
3054 #ifdef DEBUG_MEMGUARD
3055 	if (is_memguard_addr(item)) {
3056 		if (zone->uz_dtor != NULL)
3057 			zone->uz_dtor(item, zone->uz_size, udata);
3058 		if (zone->uz_fini != NULL)
3059 			zone->uz_fini(item, zone->uz_size);
3060 		memguard_free(item);
3061 		return;
3062 	}
3063 #endif
3064 #ifdef INVARIANTS
3065 	skipdbg = uma_dbg_zskip(zone, item);
3066 	if (skipdbg == false) {
3067 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3068 			uma_dbg_free(zone, udata, item);
3069 		else
3070 			uma_dbg_free(zone, NULL, item);
3071 	}
3072 	if (zone->uz_dtor != NULL && (!skipdbg ||
3073 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3074 #else
3075 	if (zone->uz_dtor != NULL)
3076 #endif
3077 		zone->uz_dtor(item, zone->uz_size, udata);
3078 
3079 	/*
3080 	 * The race here is acceptable.  If we miss it we'll just have to wait
3081 	 * a little longer for the limits to be reset.
3082 	 */
3083 	if (zone->uz_sleepers > 0)
3084 		goto zfree_item;
3085 
3086 #ifdef UMA_XDOMAIN
3087 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3088 		itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
3089 #endif
3090 
3091 	/*
3092 	 * If possible, free to the per-CPU cache.  There are two
3093 	 * requirements for safe access to the per-CPU cache: (1) the thread
3094 	 * accessing the cache must not be preempted or yield during access,
3095 	 * and (2) the thread must not migrate CPUs without switching which
3096 	 * cache it accesses.  We rely on a critical section to prevent
3097 	 * preemption and migration.  We release the critical section in
3098 	 * order to acquire the zone mutex if we are unable to free to the
3099 	 * current cache; when we re-acquire the critical section, we must
3100 	 * detect and handle migration if it has occurred.
3101 	 */
3102 zfree_restart:
3103 	critical_enter();
3104 	cpu = curcpu;
3105 	cache = &zone->uz_cpu[cpu];
3106 
3107 zfree_start:
3108 	domain = PCPU_GET(domain);
3109 #ifdef UMA_XDOMAIN
3110 	if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
3111 		itemdomain = domain;
3112 #endif
3113 	/*
3114 	 * Try to free into the allocbucket first to give LIFO ordering
3115 	 * for cache-hot datastructures.  Spill over into the freebucket
3116 	 * if necessary.  Alloc will swap them if one runs dry.
3117 	 */
3118 #ifdef UMA_XDOMAIN
3119 	if (domain != itemdomain) {
3120 		bucket = cache->uc_crossbucket;
3121 	} else
3122 #endif
3123 	{
3124 		bucket = cache->uc_allocbucket;
3125 		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3126 			bucket = cache->uc_freebucket;
3127 	}
3128 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3129 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3130 		    ("uma_zfree: Freeing to non free bucket index."));
3131 		bucket->ub_bucket[bucket->ub_cnt] = item;
3132 		bucket->ub_cnt++;
3133 		cache->uc_frees++;
3134 		critical_exit();
3135 		return;
3136 	}
3137 
3138 	/*
3139 	 * We must go back the zone, which requires acquiring the zone lock,
3140 	 * which in turn means we must release and re-acquire the critical
3141 	 * section.  Since the critical section is released, we may be
3142 	 * preempted or migrate.  As such, make sure not to maintain any
3143 	 * thread-local state specific to the cache from prior to releasing
3144 	 * the critical section.
3145 	 */
3146 	critical_exit();
3147 	if (zone->uz_count == 0 || bucketdisable)
3148 		goto zfree_item;
3149 
3150 	lockfail = false;
3151 	if (ZONE_TRYLOCK(zone) == 0) {
3152 		/* Record contention to size the buckets. */
3153 		ZONE_LOCK(zone);
3154 		lockfail = true;
3155 	}
3156 	critical_enter();
3157 	cpu = curcpu;
3158 	domain = PCPU_GET(domain);
3159 	cache = &zone->uz_cpu[cpu];
3160 
3161 #ifdef UMA_XDOMAIN
3162 	if (domain != itemdomain)
3163 		bucket = cache->uc_crossbucket;
3164 	else
3165 #endif
3166 		bucket = cache->uc_freebucket;
3167 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3168 		ZONE_UNLOCK(zone);
3169 		goto zfree_start;
3170 	}
3171 #ifdef UMA_XDOMAIN
3172 	if (domain != itemdomain)
3173 		cache->uc_crossbucket = NULL;
3174 	else
3175 #endif
3176 		cache->uc_freebucket = NULL;
3177 	/* We are no longer associated with this CPU. */
3178 	critical_exit();
3179 
3180 #ifdef UMA_XDOMAIN
3181 	if (domain != itemdomain) {
3182 		if (bucket != NULL) {
3183 			zone->uz_xdomain += bucket->ub_cnt;
3184 			if (vm_ndomains > 2 ||
3185 			    zone->uz_bkt_count >= zone->uz_bkt_max) {
3186 				ZONE_UNLOCK(zone);
3187 				bucket_drain(zone, bucket);
3188 				bucket_free(zone, bucket, udata);
3189 			} else {
3190 				zdom = &zone->uz_domain[itemdomain];
3191 				zone_put_bucket(zone, zdom, bucket, true);
3192 				ZONE_UNLOCK(zone);
3193 			}
3194 		} else
3195 			ZONE_UNLOCK(zone);
3196 		bucket = bucket_alloc(zone, udata, M_NOWAIT);
3197 		if (bucket == NULL)
3198 			goto zfree_item;
3199 		critical_enter();
3200 		cpu = curcpu;
3201 		cache = &zone->uz_cpu[cpu];
3202 		if (cache->uc_crossbucket == NULL) {
3203 			cache->uc_crossbucket = bucket;
3204 			goto zfree_start;
3205 		}
3206 		critical_exit();
3207 		bucket_free(zone, bucket, udata);
3208 		goto zfree_restart;
3209 	}
3210 #endif
3211 
3212 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3213 		zdom = &zone->uz_domain[domain];
3214 	} else {
3215 		domain = 0;
3216 		zdom = &zone->uz_domain[0];
3217 	}
3218 
3219 	/* Can we throw this on the zone full list? */
3220 	if (bucket != NULL) {
3221 		CTR3(KTR_UMA,
3222 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3223 		    zone->uz_name, zone, bucket);
3224 		/* ub_cnt is pointing to the last free item */
3225 		KASSERT(bucket->ub_cnt == bucket->ub_entries,
3226 		    ("uma_zfree: Attempting to insert not full bucket onto the full list.\n"));
3227 		if (zone->uz_bkt_count >= zone->uz_bkt_max) {
3228 			ZONE_UNLOCK(zone);
3229 			bucket_drain(zone, bucket);
3230 			bucket_free(zone, bucket, udata);
3231 			goto zfree_restart;
3232 		} else
3233 			zone_put_bucket(zone, zdom, bucket, true);
3234 	}
3235 
3236 	/*
3237 	 * We bump the uz count when the cache size is insufficient to
3238 	 * handle the working set.
3239 	 */
3240 	if (lockfail && zone->uz_count < zone->uz_count_max)
3241 		zone->uz_count++;
3242 	ZONE_UNLOCK(zone);
3243 
3244 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3245 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3246 	    zone->uz_name, zone, bucket);
3247 	if (bucket) {
3248 		critical_enter();
3249 		cpu = curcpu;
3250 		cache = &zone->uz_cpu[cpu];
3251 		if (cache->uc_freebucket == NULL &&
3252 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3253 		    domain == PCPU_GET(domain))) {
3254 			cache->uc_freebucket = bucket;
3255 			goto zfree_start;
3256 		}
3257 		/*
3258 		 * We lost the race, start over.  We have to drop our
3259 		 * critical section to free the bucket.
3260 		 */
3261 		critical_exit();
3262 		bucket_free(zone, bucket, udata);
3263 		goto zfree_restart;
3264 	}
3265 
3266 	/*
3267 	 * If nothing else caught this, we'll just do an internal free.
3268 	 */
3269 zfree_item:
3270 	zone_free_item(zone, item, udata, SKIP_DTOR);
3271 }
3272 
3273 void
3274 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3275 {
3276 
3277 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3278 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3279 
3280 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3281 	    zone->uz_name);
3282 
3283 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3284 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3285 
3286         /* uma_zfree(..., NULL) does nothing, to match free(9). */
3287         if (item == NULL)
3288                 return;
3289 	zone_free_item(zone, item, udata, SKIP_NONE);
3290 }
3291 
3292 static void
3293 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
3294 {
3295 	uma_keg_t keg;
3296 	uma_domain_t dom;
3297 	uint8_t freei;
3298 
3299 	keg = zone->uz_keg;
3300 	MPASS(zone->uz_lockptr == &keg->uk_lock);
3301 	KEG_LOCK_ASSERT(keg);
3302 	MPASS(keg == slab->us_keg);
3303 
3304 	dom = &keg->uk_domain[slab->us_domain];
3305 
3306 	/* Do we need to remove from any lists? */
3307 	if (slab->us_freecount+1 == keg->uk_ipers) {
3308 		LIST_REMOVE(slab, us_link);
3309 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3310 	} else if (slab->us_freecount == 0) {
3311 		LIST_REMOVE(slab, us_link);
3312 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3313 	}
3314 
3315 	/* Slab management. */
3316 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3317 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3318 	slab->us_freecount++;
3319 
3320 	/* Keg statistics. */
3321 	keg->uk_free++;
3322 }
3323 
3324 static void
3325 zone_release(uma_zone_t zone, void **bucket, int cnt)
3326 {
3327 	void *item;
3328 	uma_slab_t slab;
3329 	uma_keg_t keg;
3330 	uint8_t *mem;
3331 	int i;
3332 
3333 	keg = zone->uz_keg;
3334 	KEG_LOCK(keg);
3335 	for (i = 0; i < cnt; i++) {
3336 		item = bucket[i];
3337 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3338 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3339 			if (zone->uz_flags & UMA_ZONE_HASH) {
3340 				slab = hash_sfind(&keg->uk_hash, mem);
3341 			} else {
3342 				mem += keg->uk_pgoff;
3343 				slab = (uma_slab_t)mem;
3344 			}
3345 		} else {
3346 			slab = vtoslab((vm_offset_t)item);
3347 			MPASS(slab->us_keg == keg);
3348 		}
3349 		slab_free_item(zone, slab, item);
3350 	}
3351 	KEG_UNLOCK(keg);
3352 }
3353 
3354 /*
3355  * Frees a single item to any zone.
3356  *
3357  * Arguments:
3358  *	zone   The zone to free to
3359  *	item   The item we're freeing
3360  *	udata  User supplied data for the dtor
3361  *	skip   Skip dtors and finis
3362  */
3363 static void
3364 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3365 {
3366 #ifdef INVARIANTS
3367 	bool skipdbg;
3368 
3369 	skipdbg = uma_dbg_zskip(zone, item);
3370 	if (skip == SKIP_NONE && !skipdbg) {
3371 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3372 			uma_dbg_free(zone, udata, item);
3373 		else
3374 			uma_dbg_free(zone, NULL, item);
3375 	}
3376 
3377 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3378 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3379 	    zone->uz_ctor != trash_ctor))
3380 #else
3381 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3382 #endif
3383 		zone->uz_dtor(item, zone->uz_size, udata);
3384 
3385 	if (skip < SKIP_FINI && zone->uz_fini)
3386 		zone->uz_fini(item, zone->uz_size);
3387 
3388 	zone->uz_release(zone->uz_arg, &item, 1);
3389 
3390 	if (skip & SKIP_CNT)
3391 		return;
3392 
3393 	counter_u64_add(zone->uz_frees, 1);
3394 
3395 	if (zone->uz_max_items > 0) {
3396 		ZONE_LOCK(zone);
3397 		zone->uz_items--;
3398 		if (zone->uz_sleepers > 0 &&
3399 		    zone->uz_items < zone->uz_max_items)
3400 			wakeup_one(zone);
3401 		ZONE_UNLOCK(zone);
3402 	}
3403 }
3404 
3405 /* See uma.h */
3406 int
3407 uma_zone_set_max(uma_zone_t zone, int nitems)
3408 {
3409 	struct uma_bucket_zone *ubz;
3410 
3411 	/*
3412 	 * If limit is very low we may need to limit how
3413 	 * much items are allowed in CPU caches.
3414 	 */
3415 	ubz = &bucket_zones[0];
3416 	for (; ubz->ubz_entries != 0; ubz++)
3417 		if (ubz->ubz_entries * 2 * mp_ncpus > nitems)
3418 			break;
3419 	if (ubz == &bucket_zones[0])
3420 		nitems = ubz->ubz_entries * 2 * mp_ncpus;
3421 	else
3422 		ubz--;
3423 
3424 	ZONE_LOCK(zone);
3425 	zone->uz_count_max = zone->uz_count = ubz->ubz_entries;
3426 	if (zone->uz_count_min > zone->uz_count_max)
3427 		zone->uz_count_min = zone->uz_count_max;
3428 	zone->uz_max_items = nitems;
3429 	ZONE_UNLOCK(zone);
3430 
3431 	return (nitems);
3432 }
3433 
3434 /* See uma.h */
3435 int
3436 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
3437 {
3438 
3439 	ZONE_LOCK(zone);
3440 	zone->uz_bkt_max = nitems;
3441 	ZONE_UNLOCK(zone);
3442 
3443 	return (nitems);
3444 }
3445 
3446 /* See uma.h */
3447 int
3448 uma_zone_get_max(uma_zone_t zone)
3449 {
3450 	int nitems;
3451 
3452 	ZONE_LOCK(zone);
3453 	nitems = zone->uz_max_items;
3454 	ZONE_UNLOCK(zone);
3455 
3456 	return (nitems);
3457 }
3458 
3459 /* See uma.h */
3460 void
3461 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3462 {
3463 
3464 	ZONE_LOCK(zone);
3465 	zone->uz_warning = warning;
3466 	ZONE_UNLOCK(zone);
3467 }
3468 
3469 /* See uma.h */
3470 void
3471 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3472 {
3473 
3474 	ZONE_LOCK(zone);
3475 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3476 	ZONE_UNLOCK(zone);
3477 }
3478 
3479 /* See uma.h */
3480 int
3481 uma_zone_get_cur(uma_zone_t zone)
3482 {
3483 	int64_t nitems;
3484 	u_int i;
3485 
3486 	ZONE_LOCK(zone);
3487 	nitems = counter_u64_fetch(zone->uz_allocs) -
3488 	    counter_u64_fetch(zone->uz_frees);
3489 	CPU_FOREACH(i) {
3490 		/*
3491 		 * See the comment in uma_vm_zone_stats() regarding the
3492 		 * safety of accessing the per-cpu caches. With the zone lock
3493 		 * held, it is safe, but can potentially result in stale data.
3494 		 */
3495 		nitems += zone->uz_cpu[i].uc_allocs -
3496 		    zone->uz_cpu[i].uc_frees;
3497 	}
3498 	ZONE_UNLOCK(zone);
3499 
3500 	return (nitems < 0 ? 0 : nitems);
3501 }
3502 
3503 /* See uma.h */
3504 void
3505 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3506 {
3507 	uma_keg_t keg;
3508 
3509 	KEG_GET(zone, keg);
3510 	KEG_LOCK(keg);
3511 	KASSERT(keg->uk_pages == 0,
3512 	    ("uma_zone_set_init on non-empty keg"));
3513 	keg->uk_init = uminit;
3514 	KEG_UNLOCK(keg);
3515 }
3516 
3517 /* See uma.h */
3518 void
3519 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3520 {
3521 	uma_keg_t keg;
3522 
3523 	KEG_GET(zone, keg);
3524 	KEG_LOCK(keg);
3525 	KASSERT(keg->uk_pages == 0,
3526 	    ("uma_zone_set_fini on non-empty keg"));
3527 	keg->uk_fini = fini;
3528 	KEG_UNLOCK(keg);
3529 }
3530 
3531 /* See uma.h */
3532 void
3533 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3534 {
3535 
3536 	ZONE_LOCK(zone);
3537 	KASSERT(zone->uz_keg->uk_pages == 0,
3538 	    ("uma_zone_set_zinit on non-empty keg"));
3539 	zone->uz_init = zinit;
3540 	ZONE_UNLOCK(zone);
3541 }
3542 
3543 /* See uma.h */
3544 void
3545 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3546 {
3547 
3548 	ZONE_LOCK(zone);
3549 	KASSERT(zone->uz_keg->uk_pages == 0,
3550 	    ("uma_zone_set_zfini on non-empty keg"));
3551 	zone->uz_fini = zfini;
3552 	ZONE_UNLOCK(zone);
3553 }
3554 
3555 /* See uma.h */
3556 /* XXX uk_freef is not actually used with the zone locked */
3557 void
3558 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3559 {
3560 	uma_keg_t keg;
3561 
3562 	KEG_GET(zone, keg);
3563 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3564 	KEG_LOCK(keg);
3565 	keg->uk_freef = freef;
3566 	KEG_UNLOCK(keg);
3567 }
3568 
3569 /* See uma.h */
3570 /* XXX uk_allocf is not actually used with the zone locked */
3571 void
3572 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3573 {
3574 	uma_keg_t keg;
3575 
3576 	KEG_GET(zone, keg);
3577 	KEG_LOCK(keg);
3578 	keg->uk_allocf = allocf;
3579 	KEG_UNLOCK(keg);
3580 }
3581 
3582 /* See uma.h */
3583 void
3584 uma_zone_reserve(uma_zone_t zone, int items)
3585 {
3586 	uma_keg_t keg;
3587 
3588 	KEG_GET(zone, keg);
3589 	KEG_LOCK(keg);
3590 	keg->uk_reserve = items;
3591 	KEG_UNLOCK(keg);
3592 }
3593 
3594 /* See uma.h */
3595 int
3596 uma_zone_reserve_kva(uma_zone_t zone, int count)
3597 {
3598 	uma_keg_t keg;
3599 	vm_offset_t kva;
3600 	u_int pages;
3601 
3602 	KEG_GET(zone, keg);
3603 
3604 	pages = count / keg->uk_ipers;
3605 	if (pages * keg->uk_ipers < count)
3606 		pages++;
3607 	pages *= keg->uk_ppera;
3608 
3609 #ifdef UMA_MD_SMALL_ALLOC
3610 	if (keg->uk_ppera > 1) {
3611 #else
3612 	if (1) {
3613 #endif
3614 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3615 		if (kva == 0)
3616 			return (0);
3617 	} else
3618 		kva = 0;
3619 
3620 	ZONE_LOCK(zone);
3621 	MPASS(keg->uk_kva == 0);
3622 	keg->uk_kva = kva;
3623 	keg->uk_offset = 0;
3624 	zone->uz_max_items = pages * keg->uk_ipers;
3625 #ifdef UMA_MD_SMALL_ALLOC
3626 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3627 #else
3628 	keg->uk_allocf = noobj_alloc;
3629 #endif
3630 	keg->uk_flags |= UMA_ZONE_NOFREE;
3631 	ZONE_UNLOCK(zone);
3632 
3633 	return (1);
3634 }
3635 
3636 /* See uma.h */
3637 void
3638 uma_prealloc(uma_zone_t zone, int items)
3639 {
3640 	struct vm_domainset_iter di;
3641 	uma_domain_t dom;
3642 	uma_slab_t slab;
3643 	uma_keg_t keg;
3644 	int aflags, domain, slabs;
3645 
3646 	KEG_GET(zone, keg);
3647 	KEG_LOCK(keg);
3648 	slabs = items / keg->uk_ipers;
3649 	if (slabs * keg->uk_ipers < items)
3650 		slabs++;
3651 	while (slabs-- > 0) {
3652 		aflags = M_NOWAIT;
3653 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3654 		    &aflags);
3655 		for (;;) {
3656 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
3657 			    aflags);
3658 			if (slab != NULL) {
3659 				MPASS(slab->us_keg == keg);
3660 				dom = &keg->uk_domain[slab->us_domain];
3661 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
3662 				    us_link);
3663 				break;
3664 			}
3665 			KEG_LOCK(keg);
3666 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
3667 				KEG_UNLOCK(keg);
3668 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3669 				KEG_LOCK(keg);
3670 			}
3671 		}
3672 	}
3673 	KEG_UNLOCK(keg);
3674 }
3675 
3676 /* See uma.h */
3677 static void
3678 uma_reclaim_locked(bool kmem_danger)
3679 {
3680 
3681 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3682 	sx_assert(&uma_drain_lock, SA_XLOCKED);
3683 	bucket_enable();
3684 	zone_foreach(zone_drain);
3685 	if (vm_page_count_min() || kmem_danger) {
3686 		cache_drain_safe(NULL);
3687 		zone_foreach(zone_drain);
3688 	}
3689 
3690 	/*
3691 	 * Some slabs may have been freed but this zone will be visited early
3692 	 * we visit again so that we can free pages that are empty once other
3693 	 * zones are drained.  We have to do the same for buckets.
3694 	 */
3695 	zone_drain(slabzone);
3696 	bucket_zone_drain();
3697 }
3698 
3699 void
3700 uma_reclaim(void)
3701 {
3702 
3703 	sx_xlock(&uma_drain_lock);
3704 	uma_reclaim_locked(false);
3705 	sx_xunlock(&uma_drain_lock);
3706 }
3707 
3708 static volatile int uma_reclaim_needed;
3709 
3710 void
3711 uma_reclaim_wakeup(void)
3712 {
3713 
3714 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3715 		wakeup(uma_reclaim);
3716 }
3717 
3718 void
3719 uma_reclaim_worker(void *arg __unused)
3720 {
3721 
3722 	for (;;) {
3723 		sx_xlock(&uma_drain_lock);
3724 		while (atomic_load_int(&uma_reclaim_needed) == 0)
3725 			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
3726 			    hz);
3727 		sx_xunlock(&uma_drain_lock);
3728 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3729 		sx_xlock(&uma_drain_lock);
3730 		uma_reclaim_locked(true);
3731 		atomic_store_int(&uma_reclaim_needed, 0);
3732 		sx_xunlock(&uma_drain_lock);
3733 		/* Don't fire more than once per-second. */
3734 		pause("umarclslp", hz);
3735 	}
3736 }
3737 
3738 /* See uma.h */
3739 int
3740 uma_zone_exhausted(uma_zone_t zone)
3741 {
3742 	int full;
3743 
3744 	ZONE_LOCK(zone);
3745 	full = zone->uz_sleepers > 0;
3746 	ZONE_UNLOCK(zone);
3747 	return (full);
3748 }
3749 
3750 int
3751 uma_zone_exhausted_nolock(uma_zone_t zone)
3752 {
3753 	return (zone->uz_sleepers > 0);
3754 }
3755 
3756 void *
3757 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3758 {
3759 	struct domainset *policy;
3760 	vm_offset_t addr;
3761 	uma_slab_t slab;
3762 
3763 	if (domain != UMA_ANYDOMAIN) {
3764 		/* avoid allocs targeting empty domains */
3765 		if (VM_DOMAIN_EMPTY(domain))
3766 			domain = UMA_ANYDOMAIN;
3767 	}
3768 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3769 	if (slab == NULL)
3770 		return (NULL);
3771 	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3772 	    DOMAINSET_FIXED(domain);
3773 	addr = kmem_malloc_domainset(policy, size, wait);
3774 	if (addr != 0) {
3775 		vsetslab(addr, slab);
3776 		slab->us_data = (void *)addr;
3777 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3778 		slab->us_size = size;
3779 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3780 		    pmap_kextract(addr)));
3781 		uma_total_inc(size);
3782 	} else {
3783 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3784 	}
3785 
3786 	return ((void *)addr);
3787 }
3788 
3789 void *
3790 uma_large_malloc(vm_size_t size, int wait)
3791 {
3792 
3793 	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3794 }
3795 
3796 void
3797 uma_large_free(uma_slab_t slab)
3798 {
3799 
3800 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3801 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3802 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3803 	uma_total_dec(slab->us_size);
3804 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3805 }
3806 
3807 static void
3808 uma_zero_item(void *item, uma_zone_t zone)
3809 {
3810 
3811 	bzero(item, zone->uz_size);
3812 }
3813 
3814 unsigned long
3815 uma_limit(void)
3816 {
3817 
3818 	return (uma_kmem_limit);
3819 }
3820 
3821 void
3822 uma_set_limit(unsigned long limit)
3823 {
3824 
3825 	uma_kmem_limit = limit;
3826 }
3827 
3828 unsigned long
3829 uma_size(void)
3830 {
3831 
3832 	return (atomic_load_long(&uma_kmem_total));
3833 }
3834 
3835 long
3836 uma_avail(void)
3837 {
3838 
3839 	return (uma_kmem_limit - uma_size());
3840 }
3841 
3842 void
3843 uma_print_stats(void)
3844 {
3845 	zone_foreach(uma_print_zone);
3846 }
3847 
3848 static void
3849 slab_print(uma_slab_t slab)
3850 {
3851 	printf("slab: keg %p, data %p, freecount %d\n",
3852 		slab->us_keg, slab->us_data, slab->us_freecount);
3853 }
3854 
3855 static void
3856 cache_print(uma_cache_t cache)
3857 {
3858 	printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n",
3859 		cache->uc_allocbucket,
3860 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3861 		cache->uc_freebucket,
3862 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0,
3863 		cache->uc_crossbucket,
3864 		cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0);
3865 }
3866 
3867 static void
3868 uma_print_keg(uma_keg_t keg)
3869 {
3870 	uma_domain_t dom;
3871 	uma_slab_t slab;
3872 	int i;
3873 
3874 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3875 	    "out %d free %d\n",
3876 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3877 	    keg->uk_ipers, keg->uk_ppera,
3878 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3879 	    keg->uk_free);
3880 	for (i = 0; i < vm_ndomains; i++) {
3881 		dom = &keg->uk_domain[i];
3882 		printf("Part slabs:\n");
3883 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3884 			slab_print(slab);
3885 		printf("Free slabs:\n");
3886 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3887 			slab_print(slab);
3888 		printf("Full slabs:\n");
3889 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3890 			slab_print(slab);
3891 	}
3892 }
3893 
3894 void
3895 uma_print_zone(uma_zone_t zone)
3896 {
3897 	uma_cache_t cache;
3898 	int i;
3899 
3900 	printf("zone: %s(%p) size %d maxitems %ju flags %#x\n",
3901 	    zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items,
3902 	    zone->uz_flags);
3903 	if (zone->uz_lockptr != &zone->uz_lock)
3904 		uma_print_keg(zone->uz_keg);
3905 	CPU_FOREACH(i) {
3906 		cache = &zone->uz_cpu[i];
3907 		printf("CPU %d Cache:\n", i);
3908 		cache_print(cache);
3909 	}
3910 }
3911 
3912 #ifdef DDB
3913 /*
3914  * Generate statistics across both the zone and its per-cpu cache's.  Return
3915  * desired statistics if the pointer is non-NULL for that statistic.
3916  *
3917  * Note: does not update the zone statistics, as it can't safely clear the
3918  * per-CPU cache statistic.
3919  *
3920  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3921  * safe from off-CPU; we should modify the caches to track this information
3922  * directly so that we don't have to.
3923  */
3924 static void
3925 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
3926     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
3927 {
3928 	uma_cache_t cache;
3929 	uint64_t allocs, frees, sleeps, xdomain;
3930 	int cachefree, cpu;
3931 
3932 	allocs = frees = sleeps = xdomain = 0;
3933 	cachefree = 0;
3934 	CPU_FOREACH(cpu) {
3935 		cache = &z->uz_cpu[cpu];
3936 		if (cache->uc_allocbucket != NULL)
3937 			cachefree += cache->uc_allocbucket->ub_cnt;
3938 		if (cache->uc_freebucket != NULL)
3939 			cachefree += cache->uc_freebucket->ub_cnt;
3940 		if (cache->uc_crossbucket != NULL) {
3941 			xdomain += cache->uc_crossbucket->ub_cnt;
3942 			cachefree += cache->uc_crossbucket->ub_cnt;
3943 		}
3944 		allocs += cache->uc_allocs;
3945 		frees += cache->uc_frees;
3946 	}
3947 	allocs += counter_u64_fetch(z->uz_allocs);
3948 	frees += counter_u64_fetch(z->uz_frees);
3949 	sleeps += z->uz_sleeps;
3950 	xdomain += z->uz_xdomain;
3951 	if (cachefreep != NULL)
3952 		*cachefreep = cachefree;
3953 	if (allocsp != NULL)
3954 		*allocsp = allocs;
3955 	if (freesp != NULL)
3956 		*freesp = frees;
3957 	if (sleepsp != NULL)
3958 		*sleepsp = sleeps;
3959 	if (xdomainp != NULL)
3960 		*xdomainp = xdomain;
3961 }
3962 #endif /* DDB */
3963 
3964 static int
3965 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3966 {
3967 	uma_keg_t kz;
3968 	uma_zone_t z;
3969 	int count;
3970 
3971 	count = 0;
3972 	rw_rlock(&uma_rwlock);
3973 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3974 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3975 			count++;
3976 	}
3977 	LIST_FOREACH(z, &uma_cachezones, uz_link)
3978 		count++;
3979 
3980 	rw_runlock(&uma_rwlock);
3981 	return (sysctl_handle_int(oidp, &count, 0, req));
3982 }
3983 
3984 static void
3985 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
3986     struct uma_percpu_stat *ups, bool internal)
3987 {
3988 	uma_zone_domain_t zdom;
3989 	uma_cache_t cache;
3990 	int i;
3991 
3992 
3993 	for (i = 0; i < vm_ndomains; i++) {
3994 		zdom = &z->uz_domain[i];
3995 		uth->uth_zone_free += zdom->uzd_nitems;
3996 	}
3997 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
3998 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
3999 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
4000 	uth->uth_sleeps = z->uz_sleeps;
4001 	uth->uth_xdomain = z->uz_xdomain;
4002 	/*
4003 	 * While it is not normally safe to access the cache
4004 	 * bucket pointers while not on the CPU that owns the
4005 	 * cache, we only allow the pointers to be exchanged
4006 	 * without the zone lock held, not invalidated, so
4007 	 * accept the possible race associated with bucket
4008 	 * exchange during monitoring.
4009 	 */
4010 	for (i = 0; i < mp_maxid + 1; i++) {
4011 		bzero(&ups[i], sizeof(*ups));
4012 		if (internal || CPU_ABSENT(i))
4013 			continue;
4014 		cache = &z->uz_cpu[i];
4015 		if (cache->uc_allocbucket != NULL)
4016 			ups[i].ups_cache_free +=
4017 			    cache->uc_allocbucket->ub_cnt;
4018 		if (cache->uc_freebucket != NULL)
4019 			ups[i].ups_cache_free +=
4020 			    cache->uc_freebucket->ub_cnt;
4021 		if (cache->uc_crossbucket != NULL)
4022 			ups[i].ups_cache_free +=
4023 			    cache->uc_crossbucket->ub_cnt;
4024 		ups[i].ups_allocs = cache->uc_allocs;
4025 		ups[i].ups_frees = cache->uc_frees;
4026 	}
4027 }
4028 
4029 static int
4030 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4031 {
4032 	struct uma_stream_header ush;
4033 	struct uma_type_header uth;
4034 	struct uma_percpu_stat *ups;
4035 	struct sbuf sbuf;
4036 	uma_keg_t kz;
4037 	uma_zone_t z;
4038 	int count, error, i;
4039 
4040 	error = sysctl_wire_old_buffer(req, 0);
4041 	if (error != 0)
4042 		return (error);
4043 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4044 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4045 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4046 
4047 	count = 0;
4048 	rw_rlock(&uma_rwlock);
4049 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4050 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4051 			count++;
4052 	}
4053 
4054 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4055 		count++;
4056 
4057 	/*
4058 	 * Insert stream header.
4059 	 */
4060 	bzero(&ush, sizeof(ush));
4061 	ush.ush_version = UMA_STREAM_VERSION;
4062 	ush.ush_maxcpus = (mp_maxid + 1);
4063 	ush.ush_count = count;
4064 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4065 
4066 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4067 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4068 			bzero(&uth, sizeof(uth));
4069 			ZONE_LOCK(z);
4070 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4071 			uth.uth_align = kz->uk_align;
4072 			uth.uth_size = kz->uk_size;
4073 			uth.uth_rsize = kz->uk_rsize;
4074 			if (z->uz_max_items > 0)
4075 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
4076 					kz->uk_ppera;
4077 			else
4078 				uth.uth_pages = kz->uk_pages;
4079 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
4080 			    kz->uk_ppera;
4081 			uth.uth_limit = z->uz_max_items;
4082 			uth.uth_keg_free = z->uz_keg->uk_free;
4083 
4084 			/*
4085 			 * A zone is secondary is it is not the first entry
4086 			 * on the keg's zone list.
4087 			 */
4088 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4089 			    (LIST_FIRST(&kz->uk_zones) != z))
4090 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4091 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
4092 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
4093 			ZONE_UNLOCK(z);
4094 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4095 			for (i = 0; i < mp_maxid + 1; i++)
4096 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4097 		}
4098 	}
4099 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4100 		bzero(&uth, sizeof(uth));
4101 		ZONE_LOCK(z);
4102 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4103 		uth.uth_size = z->uz_size;
4104 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
4105 		ZONE_UNLOCK(z);
4106 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4107 		for (i = 0; i < mp_maxid + 1; i++)
4108 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4109 	}
4110 
4111 	rw_runlock(&uma_rwlock);
4112 	error = sbuf_finish(&sbuf);
4113 	sbuf_delete(&sbuf);
4114 	free(ups, M_TEMP);
4115 	return (error);
4116 }
4117 
4118 int
4119 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4120 {
4121 	uma_zone_t zone = *(uma_zone_t *)arg1;
4122 	int error, max;
4123 
4124 	max = uma_zone_get_max(zone);
4125 	error = sysctl_handle_int(oidp, &max, 0, req);
4126 	if (error || !req->newptr)
4127 		return (error);
4128 
4129 	uma_zone_set_max(zone, max);
4130 
4131 	return (0);
4132 }
4133 
4134 int
4135 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4136 {
4137 	uma_zone_t zone = *(uma_zone_t *)arg1;
4138 	int cur;
4139 
4140 	cur = uma_zone_get_cur(zone);
4141 	return (sysctl_handle_int(oidp, &cur, 0, req));
4142 }
4143 
4144 #ifdef INVARIANTS
4145 static uma_slab_t
4146 uma_dbg_getslab(uma_zone_t zone, void *item)
4147 {
4148 	uma_slab_t slab;
4149 	uma_keg_t keg;
4150 	uint8_t *mem;
4151 
4152 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4153 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4154 		slab = vtoslab((vm_offset_t)mem);
4155 	} else {
4156 		/*
4157 		 * It is safe to return the slab here even though the
4158 		 * zone is unlocked because the item's allocation state
4159 		 * essentially holds a reference.
4160 		 */
4161 		if (zone->uz_lockptr == &zone->uz_lock)
4162 			return (NULL);
4163 		ZONE_LOCK(zone);
4164 		keg = zone->uz_keg;
4165 		if (keg->uk_flags & UMA_ZONE_HASH)
4166 			slab = hash_sfind(&keg->uk_hash, mem);
4167 		else
4168 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4169 		ZONE_UNLOCK(zone);
4170 	}
4171 
4172 	return (slab);
4173 }
4174 
4175 static bool
4176 uma_dbg_zskip(uma_zone_t zone, void *mem)
4177 {
4178 
4179 	if (zone->uz_lockptr == &zone->uz_lock)
4180 		return (true);
4181 
4182 	return (uma_dbg_kskip(zone->uz_keg, mem));
4183 }
4184 
4185 static bool
4186 uma_dbg_kskip(uma_keg_t keg, void *mem)
4187 {
4188 	uintptr_t idx;
4189 
4190 	if (dbg_divisor == 0)
4191 		return (true);
4192 
4193 	if (dbg_divisor == 1)
4194 		return (false);
4195 
4196 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4197 	if (keg->uk_ipers > 1) {
4198 		idx *= keg->uk_ipers;
4199 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4200 	}
4201 
4202 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4203 		counter_u64_add(uma_skip_cnt, 1);
4204 		return (true);
4205 	}
4206 	counter_u64_add(uma_dbg_cnt, 1);
4207 
4208 	return (false);
4209 }
4210 
4211 /*
4212  * Set up the slab's freei data such that uma_dbg_free can function.
4213  *
4214  */
4215 static void
4216 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4217 {
4218 	uma_keg_t keg;
4219 	int freei;
4220 
4221 	if (slab == NULL) {
4222 		slab = uma_dbg_getslab(zone, item);
4223 		if (slab == NULL)
4224 			panic("uma: item %p did not belong to zone %s\n",
4225 			    item, zone->uz_name);
4226 	}
4227 	keg = slab->us_keg;
4228 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4229 
4230 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4231 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4232 		    item, zone, zone->uz_name, slab, freei);
4233 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4234 
4235 	return;
4236 }
4237 
4238 /*
4239  * Verifies freed addresses.  Checks for alignment, valid slab membership
4240  * and duplicate frees.
4241  *
4242  */
4243 static void
4244 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4245 {
4246 	uma_keg_t keg;
4247 	int freei;
4248 
4249 	if (slab == NULL) {
4250 		slab = uma_dbg_getslab(zone, item);
4251 		if (slab == NULL)
4252 			panic("uma: Freed item %p did not belong to zone %s\n",
4253 			    item, zone->uz_name);
4254 	}
4255 	keg = slab->us_keg;
4256 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4257 
4258 	if (freei >= keg->uk_ipers)
4259 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4260 		    item, zone, zone->uz_name, slab, freei);
4261 
4262 	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4263 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4264 		    item, zone, zone->uz_name, slab, freei);
4265 
4266 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4267 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4268 		    item, zone, zone->uz_name, slab, freei);
4269 
4270 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4271 }
4272 #endif /* INVARIANTS */
4273 
4274 #ifdef DDB
4275 DB_SHOW_COMMAND(uma, db_show_uma)
4276 {
4277 	uma_keg_t kz;
4278 	uma_zone_t z;
4279 	uint64_t allocs, frees, sleeps, xdomain;
4280 	long cachefree;
4281 	int i;
4282 
4283 	db_printf("%18s %8s %8s %8s %12s %8s %8s %8s\n", "Zone", "Size", "Used",
4284 	    "Free", "Requests", "Sleeps", "Bucket", "XFree");
4285 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4286 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4287 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4288 				allocs = counter_u64_fetch(z->uz_allocs);
4289 				frees = counter_u64_fetch(z->uz_frees);
4290 				sleeps = z->uz_sleeps;
4291 				cachefree = 0;
4292 			} else
4293 				uma_zone_sumstat(z, &cachefree, &allocs,
4294 				    &frees, &sleeps, &xdomain);
4295 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4296 			    (LIST_FIRST(&kz->uk_zones) != z)))
4297 				cachefree += kz->uk_free;
4298 			for (i = 0; i < vm_ndomains; i++)
4299 				cachefree += z->uz_domain[i].uzd_nitems;
4300 
4301 			db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u %8ju\n",
4302 			    z->uz_name, (uintmax_t)kz->uk_size,
4303 			    (intmax_t)(allocs - frees), cachefree,
4304 			    (uintmax_t)allocs, sleeps, z->uz_count, xdomain);
4305 			if (db_pager_quit)
4306 				return;
4307 		}
4308 	}
4309 }
4310 
4311 DB_SHOW_COMMAND(umacache, db_show_umacache)
4312 {
4313 	uma_zone_t z;
4314 	uint64_t allocs, frees;
4315 	long cachefree;
4316 	int i;
4317 
4318 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4319 	    "Requests", "Bucket");
4320 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4321 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
4322 		for (i = 0; i < vm_ndomains; i++)
4323 			cachefree += z->uz_domain[i].uzd_nitems;
4324 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4325 		    z->uz_name, (uintmax_t)z->uz_size,
4326 		    (intmax_t)(allocs - frees), cachefree,
4327 		    (uintmax_t)allocs, z->uz_count);
4328 		if (db_pager_quit)
4329 			return;
4330 	}
4331 }
4332 #endif	/* DDB */
4333