xref: /freebsd/sys/vm/uma_core.c (revision 0b57cec536236d46e3dba9bd041533462f33dbb7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory allocator
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/domainset.h>
63 #include <sys/eventhandler.h>
64 #include <sys/kernel.h>
65 #include <sys/types.h>
66 #include <sys/limits.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/random.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vmmeter.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_domainset.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_phys.h>
89 #include <vm/vm_pagequeue.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94 #include <vm/uma_int.h>
95 #include <vm/uma_dbg.h>
96 
97 #include <ddb/ddb.h>
98 
99 #ifdef DEBUG_MEMGUARD
100 #include <vm/memguard.h>
101 #endif
102 
103 /*
104  * This is the zone and keg from which all zones are spawned.
105  */
106 static uma_zone_t kegs;
107 static uma_zone_t zones;
108 
109 /* This is the zone from which all offpage uma_slab_ts are allocated. */
110 static uma_zone_t slabzone;
111 
112 /*
113  * The initial hash tables come out of this zone so they can be allocated
114  * prior to malloc coming up.
115  */
116 static uma_zone_t hashzone;
117 
118 /* The boot-time adjusted value for cache line alignment. */
119 int uma_align_cache = 64 - 1;
120 
121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");
123 
124 /*
125  * Are we allowed to allocate buckets?
126  */
127 static int bucketdisable = 1;
128 
129 /* Linked list of all kegs in the system */
130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
131 
132 /* Linked list of all cache-only zones in the system */
133 static LIST_HEAD(,uma_zone) uma_cachezones =
134     LIST_HEAD_INITIALIZER(uma_cachezones);
135 
136 /* This RW lock protects the keg list */
137 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
138 
139 /*
140  * Pointer to, and count of, the pool of pages that is preallocated at
141  * startup to bootstrap UMA.
142  */
143 static char *bootmem;
144 static int boot_pages;
145 
146 static struct sx uma_reclaim_lock;
147 
148 /*
149  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
150  * allocations don't trigger a wakeup of the reclaim thread.
151  */
152 unsigned long uma_kmem_limit = LONG_MAX;
153 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
154     "UMA kernel memory soft limit");
155 unsigned long uma_kmem_total;
156 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
157     "UMA kernel memory usage");
158 
159 /* Is the VM done starting up? */
160 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
161     BOOT_RUNNING } booted = BOOT_COLD;
162 
163 /*
164  * This is the handle used to schedule events that need to happen
165  * outside of the allocation fast path.
166  */
167 static struct callout uma_callout;
168 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
169 
170 /*
171  * This structure is passed as the zone ctor arg so that I don't have to create
172  * a special allocation function just for zones.
173  */
174 struct uma_zctor_args {
175 	const char *name;
176 	size_t size;
177 	uma_ctor ctor;
178 	uma_dtor dtor;
179 	uma_init uminit;
180 	uma_fini fini;
181 	uma_import import;
182 	uma_release release;
183 	void *arg;
184 	uma_keg_t keg;
185 	int align;
186 	uint32_t flags;
187 };
188 
189 struct uma_kctor_args {
190 	uma_zone_t zone;
191 	size_t size;
192 	uma_init uminit;
193 	uma_fini fini;
194 	int align;
195 	uint32_t flags;
196 };
197 
198 struct uma_bucket_zone {
199 	uma_zone_t	ubz_zone;
200 	char		*ubz_name;
201 	int		ubz_entries;	/* Number of items it can hold. */
202 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
203 };
204 
205 /*
206  * Compute the actual number of bucket entries that fit when buckets are
207  * packed into power of two sizes for more efficient space utilization.
208  */
209 #define	BUCKET_SIZE(n)						\
210     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
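/*
 * For example, BUCKET_SIZE(n) yields the number of item pointers that fit
 * after a bucket header is carved out of an allocation the size of n
 * pointers, so the exact count depends on sizeof(struct uma_bucket).
 */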
211 
212 #define	BUCKET_MAX	BUCKET_SIZE(256)
213 #define	BUCKET_MIN	BUCKET_SIZE(4)
214 
215 struct uma_bucket_zone bucket_zones[] = {
216 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
217 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
218 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
219 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
220 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
221 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
222 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
223 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
224 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
225 	{ NULL, NULL, 0}
226 };
227 
228 /*
229  * Flags and enumerations to be passed to internal functions.
230  */
231 enum zfreeskip {
232 	SKIP_NONE =	0,
233 	SKIP_CNT =	0x00000001,
234 	SKIP_DTOR =	0x00010000,
235 	SKIP_FINI =	0x00020000,
236 };
237 
238 /* Prototypes. */
239 
240 int	uma_startup_count(int);
241 void	uma_startup(void *, int);
242 void	uma_startup1(void);
243 void	uma_startup2(void);
244 
245 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
246 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
249 static void page_free(void *, vm_size_t, uint8_t);
250 static void pcpu_page_free(void *, vm_size_t, uint8_t);
251 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
252 static void cache_drain(uma_zone_t);
253 static void bucket_drain(uma_zone_t, uma_bucket_t);
254 static void bucket_cache_reclaim(uma_zone_t zone, bool);
255 static int keg_ctor(void *, int, void *, int);
256 static void keg_dtor(void *, int, void *);
257 static int zone_ctor(void *, int, void *, int);
258 static void zone_dtor(void *, int, void *);
259 static int zero_init(void *, int, int);
260 static void keg_small_init(uma_keg_t keg);
261 static void keg_large_init(uma_keg_t keg);
262 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
263 static void zone_timeout(uma_zone_t zone, void *);
264 static int hash_alloc(struct uma_hash *, u_int);
265 static int hash_expand(struct uma_hash *, struct uma_hash *);
266 static void hash_free(struct uma_hash *hash);
267 static void uma_timeout(void *);
268 static void uma_startup3(void);
269 static void *zone_alloc_item(uma_zone_t, void *, int, int);
270 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
271 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
272 static void bucket_enable(void);
273 static void bucket_init(void);
274 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
275 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
276 static void bucket_zone_drain(void);
277 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
278 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
279 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
280 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
281     uma_fini fini, int align, uint32_t flags);
282 static int zone_import(void *, void **, int, int, int);
283 static void zone_release(void *, void **, int);
284 static void uma_zero_item(void *, uma_zone_t);
285 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
286 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);
287 
288 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
289 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
290 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
291 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
292 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
293 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
294 
295 #ifdef INVARIANTS
296 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
297 
298 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
299 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
300 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
301 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
302 
303 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
304     "Memory allocation debugging");
305 
306 static u_int dbg_divisor = 1;
307 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
308     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
309     "Debug & thrash every nth item in the memory allocator");
310 
311 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
312 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
313 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
314     &uma_dbg_cnt, "memory items debugged");
315 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
316     &uma_skip_cnt, "memory items skipped, not debugged");
317 #endif
318 
319 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
320 
321 SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW, 0, "Universal Memory Allocator");
322 
323 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
324     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
325 
326 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
327     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
328 
329 static int zone_warnings = 1;
330 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
331     "Warn when a UMA zone becomes full");
332 
333 /*
334  * This routine checks to see whether or not it's safe to enable buckets.
335  */
336 static void
337 bucket_enable(void)
338 {
339 
340 	KASSERT(booted >= BOOT_BUCKETS, ("Bucket enable before init"));
341 	bucketdisable = vm_page_count_min();
342 }
343 
344 /*
345  * Initialize bucket_zones, the array of zones of buckets of various sizes.
346  *
347  * For each zone, calculate the memory required for each bucket, consisting
348  * of the header and an array of pointers.
349  */
350 static void
351 bucket_init(void)
352 {
353 	struct uma_bucket_zone *ubz;
354 	int size;
355 
356 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
357 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
358 		size += sizeof(void *) * ubz->ubz_entries;
359 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
360 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
361 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
362 	}
363 }
364 
365 /*
366  * Given a desired number of entries for a bucket, return the zone from which
367  * to allocate the bucket.
368  */
369 static struct uma_bucket_zone *
370 bucket_zone_lookup(int entries)
371 {
372 	struct uma_bucket_zone *ubz;
373 
374 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
375 		if (ubz->ubz_entries >= entries)
376 			return (ubz);
377 	ubz--;
378 	return (ubz);
379 }
380 
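/*
 * Return the largest bucket zone whose use would keep the zone's potential
 * per-CPU cache footprint (roughly bpcpu buckets per CPU) below nitems, or
 * NULL if even the smallest bucket size is too large.
 */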
381 static struct uma_bucket_zone *
382 bucket_zone_max(uma_zone_t zone, int nitems)
383 {
384 	struct uma_bucket_zone *ubz;
385 	int bpcpu;
386 
387 	bpcpu = 2;
388 #ifdef UMA_XDOMAIN
389 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
390 		/* Count the cross-domain bucket. */
391 		bpcpu++;
392 #endif
393 
394 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
395 		if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
396 			break;
397 	if (ubz == &bucket_zones[0])
398 		ubz = NULL;
399 	else
400 		ubz--;
401 	return (ubz);
402 }
403 
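/*
 * Select the default bucket entry count for items of the given size: the
 * largest bucket whose per-item size limit still covers the item.  Very
 * large items get a proportionally reduced count instead.
 */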
404 static int
405 bucket_select(int size)
406 {
407 	struct uma_bucket_zone *ubz;
408 
409 	ubz = &bucket_zones[0];
410 	if (size > ubz->ubz_maxsize)
411 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
412 
413 	for (; ubz->ubz_entries != 0; ubz++)
414 		if (ubz->ubz_maxsize < size)
415 			break;
416 	ubz--;
417 	return (ubz->ubz_entries);
418 }
419 
420 static uma_bucket_t
421 bucket_alloc(uma_zone_t zone, void *udata, int flags)
422 {
423 	struct uma_bucket_zone *ubz;
424 	uma_bucket_t bucket;
425 
426 	/*
427 	 * This is to stop us from allocating per cpu buckets while we're
428 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
429 	 * boot pages.  This also prevents us from allocating buckets in
430 	 * low memory situations.
431 	 */
432 	if (bucketdisable)
433 		return (NULL);
434 	/*
435 	 * To limit bucket recursion we store the original zone flags
436 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
437 	 * NOVM flag to persist even through deep recursions.  We also
438 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
439 	 * a bucket for a bucket zone so we do not allow infinite bucket
440 	 * recursion.  This cookie will even persist to frees of unused
441 	 * buckets via the allocation path or bucket allocations in the
442 	 * free path.
443 	 */
444 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
445 		udata = (void *)(uintptr_t)zone->uz_flags;
446 	else {
447 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
448 			return (NULL);
449 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
450 	}
451 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
452 		flags |= M_NOVM;
453 	ubz = bucket_zone_lookup(zone->uz_bucket_size);
454 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
455 		ubz++;
456 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
457 	if (bucket) {
458 #ifdef INVARIANTS
459 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
460 #endif
461 		bucket->ub_cnt = 0;
462 		bucket->ub_entries = ubz->ubz_entries;
463 	}
464 
465 	return (bucket);
466 }
467 
468 static void
469 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
470 {
471 	struct uma_bucket_zone *ubz;
472 
473 	KASSERT(bucket->ub_cnt == 0,
474 	    ("bucket_free: Freeing a non free bucket."));
475 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
476 		udata = (void *)(uintptr_t)zone->uz_flags;
477 	ubz = bucket_zone_lookup(bucket->ub_entries);
478 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
479 }
480 
481 static void
482 bucket_zone_drain(void)
483 {
484 	struct uma_bucket_zone *ubz;
485 
486 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
487 		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
488 }
489 
490 /*
491  * Attempt to satisfy an allocation by retrieving a full bucket from one of the
492  * zone's caches.
493  */
494 static uma_bucket_t
495 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
496 {
497 	uma_bucket_t bucket;
498 
499 	ZONE_LOCK_ASSERT(zone);
500 
501 	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
502 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
503 		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
504 		zdom->uzd_nitems -= bucket->ub_cnt;
505 		if (zdom->uzd_imin > zdom->uzd_nitems)
506 			zdom->uzd_imin = zdom->uzd_nitems;
507 		zone->uz_bkt_count -= bucket->ub_cnt;
508 	}
509 	return (bucket);
510 }
511 
512 /*
513  * Insert a full bucket into the specified cache.  The "ws" parameter indicates
514  * whether the bucket's contents should be counted as part of the zone's working
515  * set.
516  */
517 static void
518 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
519     const bool ws)
520 {
521 
522 	ZONE_LOCK_ASSERT(zone);
523 	KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max,
524 	    ("%s: zone %p overflow", __func__, zone));
525 
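	/*
	 * Working set buckets go to the head, where zone_fetch_bucket() will
	 * reuse them first; others go to the tail, which is where
	 * bucket_cache_reclaim() trims from.
	 */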
526 	if (ws)
527 		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
528 	else
529 		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
530 	zdom->uzd_nitems += bucket->ub_cnt;
531 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
532 		zdom->uzd_imax = zdom->uzd_nitems;
533 	zone->uz_bkt_count += bucket->ub_cnt;
534 }
535 
536 static void
537 zone_log_warning(uma_zone_t zone)
538 {
539 	static const struct timeval warninterval = { 300, 0 };
540 
541 	if (!zone_warnings || zone->uz_warning == NULL)
542 		return;
543 
544 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
545 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
546 }
547 
548 static inline void
549 zone_maxaction(uma_zone_t zone)
550 {
551 
552 	if (zone->uz_maxaction.ta_func != NULL)
553 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
554 }
555 
556 /*
557  * Routine called by the periodic callout to fire off time interval based
558  * calculations (stats, hash size, etc.).
559  *
560  * Arguments:
561  *	arg   Unused
562  *
563  * Returns:
564  *	Nothing
565  */
566 static void
567 uma_timeout(void *unused)
568 {
569 	bucket_enable();
570 	zone_foreach(zone_timeout, NULL);
571 
572 	/* Reschedule this event */
573 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
574 }
575 
576 /*
577  * Update the working set size estimate for the zone's bucket cache.
578  * The constants chosen here are somewhat arbitrary.  With an update period of
579  * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
580  * last 100s.
581  */
582 static void
583 zone_domain_update_wss(uma_zone_domain_t zdom)
584 {
585 	long wss;
586 
587 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
588 	wss = zdom->uzd_imax - zdom->uzd_imin;
589 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
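	/*
	 * Exponential moving average: the new interval's estimate enters
	 * with weight 4/5, and each older interval decays by a further
	 * factor of 5 every UMA_TIMEOUT period.
	 */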
590 	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
591 }
592 
593 /*
594  * Routine to perform timeout driven calculations.  This expands the
595  * hashes and updates the per-domain working set size estimates.
596  *
597  *  Returns nothing.
598  */
599 static void
600 zone_timeout(uma_zone_t zone, void *unused)
601 {
602 	uma_keg_t keg;
603 	u_int slabs;
604 
605 	if ((zone->uz_flags & UMA_ZONE_HASH) == 0)
606 		goto update_wss;
607 
608 	keg = zone->uz_keg;
609 	KEG_LOCK(keg);
610 	/*
611 	 * Expand the keg hash table.
612 	 *
613 	 * This is done if the number of slabs is larger than the hash size.
614 	 * What I'm trying to do here is completely reduce collisions.  This
615 	 * may be a little aggressive.  Should I allow for two collisions max?
616 	 */
617 	if (keg->uk_flags & UMA_ZONE_HASH &&
618 	    (slabs = keg->uk_pages / keg->uk_ppera) >
619 	     keg->uk_hash.uh_hashsize) {
620 		struct uma_hash newhash;
621 		struct uma_hash oldhash;
622 		int ret;
623 
624 		/*
625 		 * This is so involved because allocating and freeing
626 		 * while the keg lock is held will lead to deadlock.
627 		 * I have to do everything in stages and check for
628 		 * races.
629 		 */
630 		KEG_UNLOCK(keg);
631 		ret = hash_alloc(&newhash, 1 << fls(slabs));
632 		KEG_LOCK(keg);
633 		if (ret) {
634 			if (hash_expand(&keg->uk_hash, &newhash)) {
635 				oldhash = keg->uk_hash;
636 				keg->uk_hash = newhash;
637 			} else
638 				oldhash = newhash;
639 
640 			KEG_UNLOCK(keg);
641 			hash_free(&oldhash);
642 			return;
643 		}
644 	}
645 	KEG_UNLOCK(keg);
646 
647 update_wss:
648 	ZONE_LOCK(zone);
649 	for (int i = 0; i < vm_ndomains; i++)
650 		zone_domain_update_wss(&zone->uz_domain[i]);
651 	ZONE_UNLOCK(zone);
652 }
653 
654 /*
655  * Allocate and zero fill the next sized hash table from the appropriate
656  * backing store.
657  *
658  * Arguments:
659  *	hash  A new hash structure to fill in for the requested size
660  *
661  * Returns:
662  *	1 on success and 0 on failure.
663  */
664 static int
665 hash_alloc(struct uma_hash *hash, u_int size)
666 {
667 	size_t alloc;
668 
669 	KASSERT(powerof2(size), ("hash size must be power of 2"));
670 	if (size > UMA_HASH_SIZE_INIT)  {
671 		hash->uh_hashsize = size;
672 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
673 		hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
674 	} else {
675 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
676 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
677 		    UMA_ANYDOMAIN, M_WAITOK);
678 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
679 	}
680 	if (hash->uh_slab_hash) {
681 		bzero(hash->uh_slab_hash, alloc);
682 		hash->uh_hashmask = hash->uh_hashsize - 1;
683 		return (1);
684 	}
685 
686 	return (0);
687 }
688 
689 /*
690  * Expands the hash table for HASH zones.  This is done from zone_timeout
691  * to reduce collisions.  This must not be done in the regular allocation
692  * path, otherwise, we can recurse on the vm while allocating pages.
693  *
694  * Arguments:
695  *	oldhash  The hash you want to expand
696  *	newhash  The hash structure for the new table
697  *
698  * Returns:
699  *	Nothing
700  *
701  * Discussion:
702  */
703 static int
704 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
705 {
706 	uma_hash_slab_t slab;
707 	u_int hval;
708 	u_int idx;
709 
710 	if (!newhash->uh_slab_hash)
711 		return (0);
712 
713 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
714 		return (0);
715 
716 	/*
717 	 * I need to investigate hash algorithms for resizing without a
718 	 * full rehash.
719 	 */
720 
721 	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
722 		while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
723 			slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
724 			LIST_REMOVE(slab, uhs_hlink);
725 			hval = UMA_HASH(newhash, slab->uhs_data);
726 			LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
727 			    slab, uhs_hlink);
728 		}
729 
730 	return (1);
731 }
732 
733 /*
734  * Free the hash bucket to the appropriate backing store.
735  *
736  * Arguments:
737  *	hash  The hash structure whose uh_slab_hash storage we're freeing
738  *	      to the appropriate backing store
739  *
740  * Returns:
741  *	Nothing
742  */
743 static void
744 hash_free(struct uma_hash *hash)
745 {
746 	if (hash->uh_slab_hash == NULL)
747 		return;
748 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
749 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
750 	else
751 		free(hash->uh_slab_hash, M_UMAHASH);
752 }
753 
754 /*
755  * Frees all outstanding items in a bucket
756  *
757  * Arguments:
758  *	zone   The zone to free to, must be unlocked.
759  *	bucket The free/alloc bucket with items, cpu queue must be locked.
760  *
761  * Returns:
762  *	Nothing
763  */
764 
765 static void
766 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
767 {
768 	int i;
769 
770 	if (bucket == NULL)
771 		return;
772 
773 	if (zone->uz_fini)
774 		for (i = 0; i < bucket->ub_cnt; i++)
775 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
776 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
777 	if (zone->uz_max_items > 0) {
778 		ZONE_LOCK(zone);
779 		zone->uz_items -= bucket->ub_cnt;
780 		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
781 			wakeup_one(zone);
782 		ZONE_UNLOCK(zone);
783 	}
784 	bucket->ub_cnt = 0;
785 }
786 
787 /*
788  * Drains the per cpu caches for a zone.
789  *
790  * NOTE: This may only be called while the zone is being torn down, and not
791  * during normal operation.  This is necessary in order that we do not have
792  * to migrate CPUs to drain the per-CPU caches.
793  *
794  * Arguments:
795  *	zone     The zone to drain, must be unlocked.
796  *
797  * Returns:
798  *	Nothing
799  */
800 static void
801 cache_drain(uma_zone_t zone)
802 {
803 	uma_cache_t cache;
804 	int cpu;
805 
806 	/*
807 	 * XXX: It is safe to not lock the per-CPU caches, because we're
808 	 * tearing down the zone anyway.  I.e., there will be no further use
809 	 * of the caches at this point.
810 	 *
811 	 * XXX: It would be good to be able to assert that the zone is being
812 	 * torn down to prevent improper use of cache_drain().
813 	 *
814 	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
815 	 * it is used elsewhere.  Should the tear-down path be made special
816 	 * there in some form?
817 	 */
818 	CPU_FOREACH(cpu) {
819 		cache = &zone->uz_cpu[cpu];
820 		bucket_drain(zone, cache->uc_allocbucket);
821 		if (cache->uc_allocbucket != NULL)
822 			bucket_free(zone, cache->uc_allocbucket, NULL);
823 		cache->uc_allocbucket = NULL;
824 		bucket_drain(zone, cache->uc_freebucket);
825 		if (cache->uc_freebucket != NULL)
826 			bucket_free(zone, cache->uc_freebucket, NULL);
827 		cache->uc_freebucket = NULL;
828 		bucket_drain(zone, cache->uc_crossbucket);
829 		if (cache->uc_crossbucket != NULL)
830 			bucket_free(zone, cache->uc_crossbucket, NULL);
831 		cache->uc_crossbucket = NULL;
832 	}
833 	ZONE_LOCK(zone);
834 	bucket_cache_reclaim(zone, true);
835 	ZONE_UNLOCK(zone);
836 }
837 
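/*
 * Move a zone's target per-CPU bucket size halfway toward its minimum.
 * Used as a zone_foreach() callback when aggressively reclaiming memory.
 */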
838 static void
839 cache_shrink(uma_zone_t zone, void *unused)
840 {
841 
842 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
843 		return;
844 
845 	ZONE_LOCK(zone);
846 	zone->uz_bucket_size =
847 	    (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
848 	ZONE_UNLOCK(zone);
849 }
850 
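/*
 * Flush the current CPU's cached buckets back into the zone's per-domain
 * bucket cache, freeing empty buckets and draining the cross-domain
 * bucket.  The caller is expected to have bound the thread to this CPU.
 */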
851 static void
852 cache_drain_safe_cpu(uma_zone_t zone, void *unused)
853 {
854 	uma_cache_t cache;
855 	uma_bucket_t b1, b2, b3;
856 	int domain;
857 
858 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
859 		return;
860 
861 	b1 = b2 = b3 = NULL;
862 	ZONE_LOCK(zone);
863 	critical_enter();
864 	if (zone->uz_flags & UMA_ZONE_NUMA)
865 		domain = PCPU_GET(domain);
866 	else
867 		domain = 0;
868 	cache = &zone->uz_cpu[curcpu];
869 	if (cache->uc_allocbucket) {
870 		if (cache->uc_allocbucket->ub_cnt != 0)
871 			zone_put_bucket(zone, &zone->uz_domain[domain],
872 			    cache->uc_allocbucket, false);
873 		else
874 			b1 = cache->uc_allocbucket;
875 		cache->uc_allocbucket = NULL;
876 	}
877 	if (cache->uc_freebucket) {
878 		if (cache->uc_freebucket->ub_cnt != 0)
879 			zone_put_bucket(zone, &zone->uz_domain[domain],
880 			    cache->uc_freebucket, false);
881 		else
882 			b2 = cache->uc_freebucket;
883 		cache->uc_freebucket = NULL;
884 	}
885 	b3 = cache->uc_crossbucket;
886 	cache->uc_crossbucket = NULL;
887 	critical_exit();
888 	ZONE_UNLOCK(zone);
889 	if (b1)
890 		bucket_free(zone, b1, NULL);
891 	if (b2)
892 		bucket_free(zone, b2, NULL);
893 	if (b3) {
894 		bucket_drain(zone, b3);
895 		bucket_free(zone, b3, NULL);
896 	}
897 }
898 
899 /*
900  * Safely drain the per-CPU caches of a zone (or of all zones) into the
901  * per-domain bucket caches.  This is an expensive call because it needs
902  * to bind to each CPU in turn and enter a critical section on it in
903  * order to safely access its cache buckets.
904  * The zone lock must not be held when calling this function.
905  */
906 static void
907 pcpu_cache_drain_safe(uma_zone_t zone)
908 {
909 	int cpu;
910 
911 	/*
912 	 * Polite bucket size shrinking was not enough; shrink aggressively.
913 	 */
914 	if (zone)
915 		cache_shrink(zone, NULL);
916 	else
917 		zone_foreach(cache_shrink, NULL);
918 
919 	CPU_FOREACH(cpu) {
920 		thread_lock(curthread);
921 		sched_bind(curthread, cpu);
922 		thread_unlock(curthread);
923 
924 		if (zone)
925 			cache_drain_safe_cpu(zone, NULL);
926 		else
927 			zone_foreach(cache_drain_safe_cpu, NULL);
928 	}
929 	thread_lock(curthread);
930 	sched_unbind(curthread);
931 	thread_unlock(curthread);
932 }
933 
934 /*
935  * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
936  * requested a drain, otherwise the per-domain caches are trimmed to either
937  * estimated working set size.
938  */
939 static void
940 bucket_cache_reclaim(uma_zone_t zone, bool drain)
941 {
942 	uma_zone_domain_t zdom;
943 	uma_bucket_t bucket;
944 	long target, tofree;
945 	int i;
946 
947 	for (i = 0; i < vm_ndomains; i++) {
948 		zdom = &zone->uz_domain[i];
949 
950 		/*
951 		 * If we were asked to drain the zone, we are done only once
952 		 * this bucket cache is empty.  Otherwise, we reclaim items in
953 		 * excess of the zone's estimated working set size.  If the
954 		 * difference nitems - imin is larger than the WSS estimate,
955 		 * then the estimate will grow at the end of this interval and
956 		 * we ignore the historical average.
957 		 */
958 		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
959 		    zdom->uzd_imin);
960 		while (zdom->uzd_nitems > target) {
961 			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
962 			if (bucket == NULL)
963 				break;
964 			tofree = bucket->ub_cnt;
965 			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
966 			zdom->uzd_nitems -= tofree;
967 
968 			/*
969 			 * Shift the bounds of the current WSS interval to avoid
970 			 * perturbing the estimate.
971 			 */
972 			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
973 			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
974 
975 			ZONE_UNLOCK(zone);
976 			bucket_drain(zone, bucket);
977 			bucket_free(zone, bucket, NULL);
978 			ZONE_LOCK(zone);
979 		}
980 	}
981 
982 	/*
983 	 * Shrink the zone bucket size to ensure that the per-CPU caches
984 	 * don't grow too large.
985 	 */
986 	if (zone->uz_bucket_size > zone->uz_bucket_size_min)
987 		zone->uz_bucket_size--;
988 }
989 
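/*
 * Release a slab back to the system: run the keg's fini on the first
 * 'start' items, free any offpage slab header, and return the pages to
 * the keg's page-free routine.
 */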
990 static void
991 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
992 {
993 	uint8_t *mem;
994 	int i;
995 	uint8_t flags;
996 
997 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
998 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
999 
1000 	mem = slab_data(slab, keg);
1001 	flags = slab->us_flags;
1002 	i = start;
1003 	if (keg->uk_fini != NULL) {
1004 		for (i--; i > -1; i--)
1005 #ifdef INVARIANTS
1006 		/*
1007 		 * trash_fini implies that dtor was trash_dtor. trash_fini
1008 		 * would check that memory hasn't been modified since free,
1009 		 * which executed trash_dtor.
1010 		 * That's why we need to run uma_dbg_kskip() check here,
1011 		 * although we don't make this skip check for other init/fini
1012 		 * invocations.
1013 		 */
1014 		if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) ||
1015 		    keg->uk_fini != trash_fini)
1016 #endif
1017 			keg->uk_fini(slab_item(slab, keg, i), keg->uk_size);
1018 	}
1019 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1020 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1021 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
1022 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
1023 }
1024 
1025 /*
1026  * Frees pages from a keg back to the system.  This is done on demand from
1027  * the pageout daemon.
1028  *
1029  * Returns nothing.
1030  */
1031 static void
1032 keg_drain(uma_keg_t keg)
1033 {
1034 	struct slabhead freeslabs = { 0 };
1035 	uma_domain_t dom;
1036 	uma_slab_t slab, tmp;
1037 	int i;
1038 
1039 	/*
1040 	 * We don't want to take pages from statically allocated kegs at this
1041 	 * time
1042 	 */
1043 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
1044 		return;
1045 
1046 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
1047 	    keg->uk_name, keg, keg->uk_free);
1048 	KEG_LOCK(keg);
1049 	if (keg->uk_free == 0)
1050 		goto finished;
1051 
1052 	for (i = 0; i < vm_ndomains; i++) {
1053 		dom = &keg->uk_domain[i];
1054 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
1055 			/* We have nowhere to free these to. */
1056 			if (slab->us_flags & UMA_SLAB_BOOT)
1057 				continue;
1058 
1059 			LIST_REMOVE(slab, us_link);
1060 			keg->uk_pages -= keg->uk_ppera;
1061 			keg->uk_free -= keg->uk_ipers;
1062 
1063 			if (keg->uk_flags & UMA_ZONE_HASH)
1064 				UMA_HASH_REMOVE(&keg->uk_hash, slab);
1065 
1066 			LIST_INSERT_HEAD(&freeslabs, slab, us_link);
1067 		}
1068 	}
1069 
1070 finished:
1071 	KEG_UNLOCK(keg);
1072 
1073 	while ((slab = LIST_FIRST(&freeslabs)) != NULL) {
1074 		LIST_REMOVE(slab, us_link);
1075 		keg_free_slab(keg, slab, keg->uk_ipers);
1076 	}
1077 }
1078 
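/*
 * Reclaim cached items from a zone: empty the per-domain bucket caches
 * (entirely if 'drain' is set, otherwise down to the working set estimate)
 * and, for keg-backed zones, return free slabs to the VM.
 */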
1079 static void
1080 zone_reclaim(uma_zone_t zone, int waitok, bool drain)
1081 {
1082 
1083 	/*
1084 	 * Set draining to interlock with zone_dtor() so we can release our
1085 	 * locks as we go.  Only dtor() should do a WAITOK call since it
1086 	 * is the only call that knows the structure will still be available
1087 	 * when it wakes up.
1088 	 */
1089 	ZONE_LOCK(zone);
1090 	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
1091 		if (waitok == M_NOWAIT)
1092 			goto out;
1093 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1094 	}
1095 	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
1096 	bucket_cache_reclaim(zone, drain);
1097 	ZONE_UNLOCK(zone);
1098 
1099 	/*
1100 	 * The RECLAIMING flag protects us from being freed while
1101 	 * we're running.  Normally the uma_rwlock would protect us but we
1102 	 * must be able to release and acquire the right lock for each keg.
1103 	 */
1104 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
1105 		keg_drain(zone->uz_keg);
1106 	ZONE_LOCK(zone);
1107 	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
1108 	wakeup(zone);
1109 out:
1110 	ZONE_UNLOCK(zone);
1111 }
1112 
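/* Non-sleeping helpers: fully drain a zone, or trim it to its working set. */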
1113 static void
1114 zone_drain(uma_zone_t zone, void *unused)
1115 {
1116 
1117 	zone_reclaim(zone, M_NOWAIT, true);
1118 }
1119 
1120 static void
1121 zone_trim(uma_zone_t zone, void *unused)
1122 {
1123 
1124 	zone_reclaim(zone, M_NOWAIT, false);
1125 }
1126 
1127 /*
1128  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1129  * If the allocation was successful, the keg lock will be held upon return,
1130  * otherwise the keg will be left unlocked.
1131  *
1132  * Arguments:
1133  *	flags   Wait flags for the item initialization routine
1134  *	aflags  Wait flags for the slab allocation
1135  *
1136  * Returns:
1137  *	The slab that was allocated or NULL if there is no memory and the
1138  *	caller specified M_NOWAIT.
1139  */
1140 static uma_slab_t
1141 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1142     int aflags)
1143 {
1144 	uma_alloc allocf;
1145 	uma_slab_t slab;
1146 	unsigned long size;
1147 	uint8_t *mem;
1148 	uint8_t sflags;
1149 	int i;
1150 
1151 	KASSERT(domain >= 0 && domain < vm_ndomains,
1152 	    ("keg_alloc_slab: domain %d out of range", domain));
1153 	KEG_LOCK_ASSERT(keg);
1154 	MPASS(zone->uz_lockptr == &keg->uk_lock);
1155 
1156 	allocf = keg->uk_allocf;
1157 	KEG_UNLOCK(keg);
1158 
1159 	slab = NULL;
1160 	mem = NULL;
1161 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1162 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1163 		if (slab == NULL)
1164 			goto out;
1165 	}
1166 
1167 	/*
1168 	 * This reproduces the old vm_zone behavior of zero filling pages the
1169 	 * first time they are added to a zone.
1170 	 *
1171 	 * Malloced items are zeroed in uma_zalloc.
1172 	 */
1173 
1174 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1175 		aflags |= M_ZERO;
1176 	else
1177 		aflags &= ~M_ZERO;
1178 
1179 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1180 		aflags |= M_NODUMP;
1181 
1182 	/* zone is passed for legacy reasons. */
1183 	size = keg->uk_ppera * PAGE_SIZE;
1184 	mem = allocf(zone, size, domain, &sflags, aflags);
1185 	if (mem == NULL) {
1186 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1187 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1188 		slab = NULL;
1189 		goto out;
1190 	}
1191 	uma_total_inc(size);
1192 
1193 	/* Point the slab into the allocated memory */
1194 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1195 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1196 	else
1197 		((uma_hash_slab_t)slab)->uhs_data = mem;
1198 
1199 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1200 		for (i = 0; i < keg->uk_ppera; i++)
1201 			vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE),
1202 			    zone, slab);
1203 
1204 	slab->us_freecount = keg->uk_ipers;
1205 	slab->us_flags = sflags;
1206 	slab->us_domain = domain;
1207 	BIT_FILL(keg->uk_ipers, &slab->us_free);
1208 #ifdef INVARIANTS
1209 	BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg));
1210 #endif
1211 
1212 	if (keg->uk_init != NULL) {
1213 		for (i = 0; i < keg->uk_ipers; i++)
1214 			if (keg->uk_init(slab_item(slab, keg, i),
1215 			    keg->uk_size, flags) != 0)
1216 				break;
1217 		if (i != keg->uk_ipers) {
1218 			keg_free_slab(keg, slab, i);
1219 			slab = NULL;
1220 			goto out;
1221 		}
1222 	}
1223 	KEG_LOCK(keg);
1224 
1225 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1226 	    slab, keg->uk_name, keg);
1227 
1228 	if (keg->uk_flags & UMA_ZONE_HASH)
1229 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1230 
1231 	keg->uk_pages += keg->uk_ppera;
1232 	keg->uk_free += keg->uk_ipers;
1233 
1234 out:
1235 	return (slab);
1236 }
1237 
1238 /*
1239  * This function is intended to be used early on in place of page_alloc() so
1240  * that we may use the boot time page cache to satisfy allocations before
1241  * the VM is ready.
1242  */
1243 static void *
1244 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1245     int wait)
1246 {
1247 	uma_keg_t keg;
1248 	void *mem;
1249 	int pages;
1250 
1251 	keg = zone->uz_keg;
1252 	/*
1253 	 * If we are in BOOT_BUCKETS or higher, then switch to the real
1254 	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1255 	 */
1256 	switch (booted) {
1257 		case BOOT_COLD:
1258 		case BOOT_STRAPPED:
1259 			break;
1260 		case BOOT_PAGEALLOC:
1261 			if (keg->uk_ppera > 1)
1262 				break;
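			/* FALLTHROUGH */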
1263 		case BOOT_BUCKETS:
1264 		case BOOT_RUNNING:
1265 #ifdef UMA_MD_SMALL_ALLOC
1266 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1267 			    page_alloc : uma_small_alloc;
1268 #else
1269 			keg->uk_allocf = page_alloc;
1270 #endif
1271 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1272 	}
1273 
1274 	/*
1275 	 * Check our small startup cache to see if it has pages remaining.
1276 	 */
1277 	pages = howmany(bytes, PAGE_SIZE);
1278 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1279 	if (pages > boot_pages)
1280 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1281 #ifdef DIAGNOSTIC
1282 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1283 	    boot_pages);
1284 #endif
1285 	mem = bootmem;
1286 	boot_pages -= pages;
1287 	bootmem += pages * PAGE_SIZE;
1288 	*pflag = UMA_SLAB_BOOT;
1289 
1290 	return (mem);
1291 }
1292 
1293 /*
1294  * Allocates a number of pages from the system
1295  *
1296  * Arguments:
1297  *	bytes  The number of bytes requested
1298  *	wait  Shall we wait?
1299  *
1300  * Returns:
1301  *	A pointer to the allocated memory or possibly
1302  *	NULL if M_NOWAIT is set.
1303  */
1304 static void *
1305 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1306     int wait)
1307 {
1308 	void *p;	/* Returned page */
1309 
1310 	*pflag = UMA_SLAB_KERNEL;
1311 	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1312 
1313 	return (p);
1314 }
1315 
1316 static void *
1317 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1318     int wait)
1319 {
1320 	struct pglist alloctail;
1321 	vm_offset_t addr, zkva;
1322 	int cpu, flags;
1323 	vm_page_t p, p_next;
1324 #ifdef NUMA
1325 	struct pcpu *pc;
1326 #endif
1327 
1328 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1329 
1330 	TAILQ_INIT(&alloctail);
1331 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1332 	    malloc2vm_flags(wait);
1333 	*pflag = UMA_SLAB_KERNEL;
1334 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1335 		if (CPU_ABSENT(cpu)) {
1336 			p = vm_page_alloc(NULL, 0, flags);
1337 		} else {
1338 #ifndef NUMA
1339 			p = vm_page_alloc(NULL, 0, flags);
1340 #else
1341 			pc = pcpu_find(cpu);
1342 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1343 			if (__predict_false(p == NULL))
1344 				p = vm_page_alloc(NULL, 0, flags);
1345 #endif
1346 		}
1347 		if (__predict_false(p == NULL))
1348 			goto fail;
1349 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1350 	}
1351 	if ((addr = kva_alloc(bytes)) == 0)
1352 		goto fail;
1353 	zkva = addr;
1354 	TAILQ_FOREACH(p, &alloctail, listq) {
1355 		pmap_qenter(zkva, &p, 1);
1356 		zkva += PAGE_SIZE;
1357 	}
1358 	return ((void*)addr);
1359 fail:
1360 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1361 		vm_page_unwire_noq(p);
1362 		vm_page_free(p);
1363 	}
1364 	return (NULL);
1365 }
1366 
1367 /*
1368  * Allocates a number of pages not belonging to a VM object
1369  *
1370  * Arguments:
1371  *	bytes  The number of bytes requested
1372  *	wait   Shall we wait?
1373  *
1374  * Returns:
1375  *	A pointer to the allocated memory or possibly
1376  *	NULL if M_NOWAIT is set.
1377  */
1378 static void *
1379 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1380     int wait)
1381 {
1382 	TAILQ_HEAD(, vm_page) alloctail;
1383 	u_long npages;
1384 	vm_offset_t retkva, zkva;
1385 	vm_page_t p, p_next;
1386 	uma_keg_t keg;
1387 
1388 	TAILQ_INIT(&alloctail);
1389 	keg = zone->uz_keg;
1390 
1391 	npages = howmany(bytes, PAGE_SIZE);
1392 	while (npages > 0) {
1393 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1394 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1395 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1396 		    VM_ALLOC_NOWAIT));
1397 		if (p != NULL) {
1398 			/*
1399 			 * Since the page does not belong to an object, its
1400 			 * listq is unused.
1401 			 */
1402 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1403 			npages--;
1404 			continue;
1405 		}
1406 		/*
1407 		 * Page allocation failed, free intermediate pages and
1408 		 * exit.
1409 		 */
1410 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1411 			vm_page_unwire_noq(p);
1412 			vm_page_free(p);
1413 		}
1414 		return (NULL);
1415 	}
1416 	*flags = UMA_SLAB_PRIV;
1417 	zkva = keg->uk_kva +
1418 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1419 	retkva = zkva;
1420 	TAILQ_FOREACH(p, &alloctail, listq) {
1421 		pmap_qenter(zkva, &p, 1);
1422 		zkva += PAGE_SIZE;
1423 	}
1424 
1425 	return ((void *)retkva);
1426 }
1427 
1428 /*
1429  * Frees a number of pages to the system
1430  *
1431  * Arguments:
1432  *	mem   A pointer to the memory to be freed
1433  *	size  The size of the memory being freed
1434  *	flags The original p->us_flags field
1435  *
1436  * Returns:
1437  *	Nothing
1438  */
1439 static void
1440 page_free(void *mem, vm_size_t size, uint8_t flags)
1441 {
1442 
1443 	if ((flags & UMA_SLAB_KERNEL) == 0)
1444 		panic("UMA: page_free used with invalid flags %x", flags);
1445 
1446 	kmem_free((vm_offset_t)mem, size);
1447 }
1448 
1449 /*
1450  * Frees pcpu zone allocations
1451  *
1452  * Arguments:
1453  *	mem   A pointer to the memory to be freed
1454  *	size  The size of the memory being freed
1455  *	flags The original p->us_flags field
1456  *
1457  * Returns:
1458  *	Nothing
1459  */
1460 static void
1461 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1462 {
1463 	vm_offset_t sva, curva;
1464 	vm_paddr_t paddr;
1465 	vm_page_t m;
1466 
1467 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1468 	sva = (vm_offset_t)mem;
1469 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1470 		paddr = pmap_kextract(curva);
1471 		m = PHYS_TO_VM_PAGE(paddr);
1472 		vm_page_unwire_noq(m);
1473 		vm_page_free(m);
1474 	}
1475 	pmap_qremove(sva, size >> PAGE_SHIFT);
1476 	kva_free(sva, size);
1477 }
1478 
1479 
1480 /*
1481  * Zero fill initializer
1482  *
1483  * Arguments/Returns follow uma_init specifications
1484  */
1485 static int
1486 zero_init(void *mem, int size, int flags)
1487 {
1488 	bzero(mem, size);
1489 	return (0);
1490 }
1491 
1492 #ifdef INVARIANTS
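/*
 * Return the INVARIANTS debugging bitset, which is laid out immediately
 * after the allocation bitset in the embedded slab header.
 */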
1493 struct noslabbits *
1494 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg)
1495 {
1496 
1497 	return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers)));
1498 }
1499 #endif
1500 
1501 /*
1502  * Actual size of embedded struct slab (!OFFPAGE).
1503  */
1504 size_t
1505 slab_sizeof(int nitems)
1506 {
1507 	size_t s;
1508 
1509 	s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS;
1510 	return (roundup(s, UMA_ALIGN_PTR + 1));
1511 }
1512 
1513 /*
1514  * Size of memory for embedded slabs (!OFFPAGE).
1515  */
1516 size_t
1517 slab_space(int nitems)
1518 {
1519 	return (UMA_SLAB_SIZE - slab_sizeof(nitems));
1520 }
1521 
1522 /*
1523  * Compute the number of items that will fit in an embedded (!OFFPAGE) slab
1524  * with a given size and alignment.
1525  */
1526 int
1527 slab_ipers(size_t size, int align)
1528 {
1529 	int rsize;
1530 	int nitems;
1531 
1532 	/*
1533 	 * Compute the ideal number of items that will fit in a page and
1534 	 * then compute the actual number based on a bitset nitems wide.
1535 	 */
1536 	rsize = roundup(size, align + 1);
1537 	nitems = UMA_SLAB_SIZE / rsize;
1538 	return (slab_space(nitems) / rsize);
1539 }
1540 
1541 /*
1542  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1543  *
1544  * Arguments
1545  *	keg  The zone we should initialize
1546  *
1547  * Returns
1548  *	Nothing
1549  */
1550 static void
1551 keg_small_init(uma_keg_t keg)
1552 {
1553 	u_int rsize;
1554 	u_int memused;
1555 	u_int wastedspace;
1556 	u_int shsize;
1557 	u_int slabsize;
1558 
1559 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1560 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1561 
1562 		slabsize = UMA_PCPU_ALLOC_SIZE;
1563 		keg->uk_ppera = ncpus;
1564 	} else {
1565 		slabsize = UMA_SLAB_SIZE;
1566 		keg->uk_ppera = 1;
1567 	}
1568 
1569 	/*
1570 	 * Calculate the size of each allocation (rsize) according to
1571 	 * alignment.  If the requested size is smaller than we have
1572 	 * allocation bits for we round it up.
1573 	 */
1574 	rsize = keg->uk_size;
1575 	if (rsize < slabsize / SLAB_MAX_SETSIZE)
1576 		rsize = slabsize / SLAB_MAX_SETSIZE;
1577 	if (rsize & keg->uk_align)
1578 		rsize = roundup(rsize, keg->uk_align + 1);
1579 	keg->uk_rsize = rsize;
1580 
1581 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1582 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1583 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1584 
1585 	/*
1586 	 * Use a pessimistic bit count for shsize.  It may be possible to
1587 	 * squeeze one more item in for very particular sizes if we were
1588 	 * to loop and reduce the bitsize if there is waste.
1589 	 */
1590 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1591 		shsize = 0;
1592 	else
1593 		shsize = slab_sizeof(slabsize / rsize);
1594 
1595 	if (rsize <= slabsize - shsize)
1596 		keg->uk_ipers = (slabsize - shsize) / rsize;
1597 	else {
1598 		/* Handle the special case when we have 1 item per slab, so
1599 		 * the alignment requirement can be relaxed. */
1600 		KASSERT(keg->uk_size <= slabsize - shsize,
1601 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1602 		keg->uk_ipers = 1;
1603 	}
1604 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
1605 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1606 
1607 	memused = keg->uk_ipers * rsize + shsize;
1608 	wastedspace = slabsize - memused;
1609 
1610 	/*
1611 	 * We can't do OFFPAGE if we're internal or if we've been
1612 	 * asked to not go to the VM for buckets.  If we do this we
1613 	 * may end up going to the VM for slabs, which we do not
1614 	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1615 	 * of UMA_ZONE_VM, which clearly forbids it.
1616 	 */
1617 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1618 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1619 		return;
1620 
1621 	/*
1622 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1623 	 * this if it permits more items per-slab.
1624 	 *
1625 	 * XXX We could try growing slabsize to limit max waste as well.
1626 	 * Historically this was not done because the VM could not
1627 	 * efficiently handle contiguous allocations.
1628 	 */
1629 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1630 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1631 		keg->uk_ipers = slabsize / keg->uk_rsize;
1632 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE,
1633 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1634 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1635 		    "keg: %s(%p), calculated wastedspace = %d, "
1636 		    "maximum wasted space allowed = %d, "
1637 		    "calculated ipers = %d, "
1638 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1639 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1640 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1641 		/*
1642 		 * If we had access to memory to embed a slab header we
1643 		 * also have a page structure to use vtoslab() instead of
1644 		 * hash to find slabs.  If the zone was explicitly created
1645 		 * OFFPAGE we can't necessarily touch the memory.
1646 		 */
1647 		if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0)
1648 			keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1649 	}
1650 
1651 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1652 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1653 		keg->uk_flags |= UMA_ZONE_HASH;
1654 }
1655 
1656 /*
1657  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1658  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1659  * more complicated.
1660  *
1661  * Arguments
1662  *	keg  The keg we should initialize
1663  *
1664  * Returns
1665  *	Nothing
1666  */
1667 static void
1668 keg_large_init(uma_keg_t keg)
1669 {
1670 
1671 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1672 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1673 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1674 
1675 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1676 	keg->uk_ipers = 1;
1677 	keg->uk_rsize = keg->uk_size;
1678 
1679 	/* Check whether we have enough space to not do OFFPAGE. */
1680 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
1681 	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize <
1682 	    slab_sizeof(SLAB_MIN_SETSIZE)) {
1683 		/*
1684 		 * We can't do OFFPAGE if we're internal, in which case
1685 		 * we need an extra page per allocation to contain the
1686 		 * slab header.
1687 		 */
1688 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1689 			keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1690 		else
1691 			keg->uk_ppera++;
1692 	}
1693 
1694 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1695 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1696 		keg->uk_flags |= UMA_ZONE_HASH;
1697 }
1698 
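/*
 * Finish creating a CACHESPREAD keg: pad items so that successive items
 * start on distinct alignment-sized offsets, spreading them across a
 * multi-page, offpage slab.
 */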
1699 static void
1700 keg_cachespread_init(uma_keg_t keg)
1701 {
1702 	int alignsize;
1703 	int trailer;
1704 	int pages;
1705 	int rsize;
1706 
1707 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1708 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1709 
1710 	alignsize = keg->uk_align + 1;
1711 	rsize = keg->uk_size;
1712 	/*
1713 	 * We want one item to start on every align boundary in a page.  To
1714 	 * do this we will span pages.  We will also extend the item by the
1715 	 * size of align if it is an even multiple of align.  Otherwise, it
1716 	 * would fall on the same boundary every time.
1717 	 */
1718 	if (rsize & keg->uk_align)
1719 		rsize = (rsize & ~keg->uk_align) + alignsize;
1720 	if ((rsize & alignsize) == 0)
1721 		rsize += alignsize;
1722 	trailer = rsize - keg->uk_size;
1723 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1724 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1725 	keg->uk_rsize = rsize;
1726 	keg->uk_ppera = pages;
1727 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1728 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1729 	KASSERT(keg->uk_ipers <= SLAB_MAX_SETSIZE,
1730 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1731 	    keg->uk_ipers));
1732 }
1733 
1734 /*
1735  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1736  * the keg onto the global keg list.
1737  *
1738  * Arguments/Returns follow uma_ctor specifications
1739  *	udata  Actually uma_kctor_args
1740  */
1741 static int
1742 keg_ctor(void *mem, int size, void *udata, int flags)
1743 {
1744 	struct uma_kctor_args *arg = udata;
1745 	uma_keg_t keg = mem;
1746 	uma_zone_t zone;
1747 
1748 	bzero(keg, size);
1749 	keg->uk_size = arg->size;
1750 	keg->uk_init = arg->uminit;
1751 	keg->uk_fini = arg->fini;
1752 	keg->uk_align = arg->align;
1753 	keg->uk_free = 0;
1754 	keg->uk_reserve = 0;
1755 	keg->uk_pages = 0;
1756 	keg->uk_flags = arg->flags;
1757 	keg->uk_slabzone = NULL;
1758 
1759 	/*
1760 	 * We use a global round-robin policy by default.  Zones with
1761 	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1762 	 * iterator is never run.
1763 	 */
1764 	keg->uk_dr.dr_policy = DOMAINSET_RR();
1765 	keg->uk_dr.dr_iter = 0;
1766 
1767 	/*
1768 	 * The master zone is passed to us at keg-creation time.
1769 	 */
1770 	zone = arg->zone;
1771 	keg->uk_name = zone->uz_name;
1772 
1773 	if (arg->flags & UMA_ZONE_VM)
1774 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1775 
1776 	if (arg->flags & UMA_ZONE_ZINIT)
1777 		keg->uk_init = zero_init;
1778 
1779 	if (arg->flags & UMA_ZONE_MALLOC)
1780 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1781 
1782 	if (arg->flags & UMA_ZONE_PCPU)
1783 #ifdef SMP
1784 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1785 #else
1786 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1787 #endif
1788 
1789 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1790 		keg_cachespread_init(keg);
1791 	} else {
1792 		if (keg->uk_size > slab_space(SLAB_MIN_SETSIZE))
1793 			keg_large_init(keg);
1794 		else
1795 			keg_small_init(keg);
1796 	}
1797 
1798 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1799 		keg->uk_slabzone = slabzone;
1800 
1801 	/*
1802 	 * If we haven't booted yet we need allocations to go through the
1803 	 * startup cache until the vm is ready.
1804 	 */
1805 	if (booted < BOOT_PAGEALLOC)
1806 		keg->uk_allocf = startup_alloc;
1807 #ifdef UMA_MD_SMALL_ALLOC
1808 	else if (keg->uk_ppera == 1)
1809 		keg->uk_allocf = uma_small_alloc;
1810 #endif
1811 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1812 		keg->uk_allocf = pcpu_page_alloc;
1813 	else
1814 		keg->uk_allocf = page_alloc;
1815 #ifdef UMA_MD_SMALL_ALLOC
1816 	if (keg->uk_ppera == 1)
1817 		keg->uk_freef = uma_small_free;
1818 	else
1819 #endif
1820 	if (keg->uk_flags & UMA_ZONE_PCPU)
1821 		keg->uk_freef = pcpu_page_free;
1822 	else
1823 		keg->uk_freef = page_free;
1824 
1825 	/*
1826 	 * Initialize keg's lock
1827 	 */
1828 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1829 
1830 	/*
1831 	 * If we're putting the slab header in the actual page we need to
1832 	 * figure out where in each page it goes.  See slab_sizeof
1833 	 * definition.
1834 	 */
1835 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1836 		size_t shsize;
1837 
1838 		shsize = slab_sizeof(keg->uk_ipers);
1839 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize;
1840 		/*
1841 		 * The only way the following can happen is if our
1842 		 * UMA_ALIGN_PTR adjustments have made us bigger than
1843 		 * UMA_SLAB_SIZE.  It has not been verified that this is
1844 		 * impossible in all cases, so we make sure here
1845 		 * anyway.
1846 		 */
1847 		KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera,
1848 		    ("zone %s ipers %d rsize %d size %d slab won't fit",
1849 		    zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size));
1850 	}
1851 
1852 	if (keg->uk_flags & UMA_ZONE_HASH)
1853 		hash_alloc(&keg->uk_hash, 0);
1854 
1855 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1856 	    keg, zone->uz_name, zone,
1857 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1858 	    keg->uk_free);
1859 
1860 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1861 
1862 	rw_wlock(&uma_rwlock);
1863 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1864 	rw_wunlock(&uma_rwlock);
1865 	return (0);
1866 }
1867 
1868 static void
1869 zone_alloc_counters(uma_zone_t zone, void *unused)
1870 {
1871 
1872 	zone->uz_allocs = counter_u64_alloc(M_WAITOK);
1873 	zone->uz_frees = counter_u64_alloc(M_WAITOK);
1874 	zone->uz_fails = counter_u64_alloc(M_WAITOK);
1875 }
1876 
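/*
 * Populate the per-zone sysctl tree.  The resulting layout (summarized
 * from the code below) is:
 *
 *	vm.uma.<zone>.{size,flags,bucket_size,bucket_size_max}
 *	vm.uma.<zone>.keg.{name,rsize,ppera,ipers,align,pages,free,efficiency}
 *	vm.uma.<zone>.limit.{items,max_items,sleepers,sleeps}
 *	vm.uma.<zone>.domain.<domain>.{nitems,imax,imin,wss}
 *	vm.uma.<zone>.stats.{current,allocs,frees,fails,xdomain}
 */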
1877 static void
1878 zone_alloc_sysctl(uma_zone_t zone, void *unused)
1879 {
1880 	uma_zone_domain_t zdom;
1881 	uma_keg_t keg;
1882 	struct sysctl_oid *oid, *domainoid;
1883 	int domains, i, cnt;
1884 	static const char *nokeg = "cache zone";
1885 	char *c;
1886 
1887 	/*
1888 	 * Make a sysctl-safe copy of the zone name by removing
1889 	 * any special characters and handling dups by appending
1890 	 * an index.
1891 	 */
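	/*
	 * For example, a hypothetical second zone named "foo bar" (so
	 * uz_namecnt == 1) ends up published as "foo_bar_1".
	 */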
1892 	if (zone->uz_namecnt != 0) {
1893 		/* Count the number of decimal digits and '_' separator. */
1894 		for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++)
1895 			cnt /= 10;
1896 		zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1,
1897 		    M_UMA, M_WAITOK);
1898 		sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name,
1899 		    zone->uz_namecnt);
1900 	} else
1901 		zone->uz_ctlname = strdup(zone->uz_name, M_UMA);
1902 	for (c = zone->uz_ctlname; *c != '\0'; c++)
1903 		if (strchr("./\\ -", *c) != NULL)
1904 			*c = '_';
1905 
1906 	/*
1907 	 * Basic parameters at the root.
1908 	 */
1909 	zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma),
1910 	    OID_AUTO, zone->uz_ctlname, CTLFLAG_RD, NULL, "");
1911 	oid = zone->uz_oid;
1912 	SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1913 	    "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size");
1914 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1915 	    "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE,
1916 	    zone, 0, sysctl_handle_uma_zone_flags, "A",
1917 	    "Allocator configuration flags");
1918 	SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1919 	    "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0,
1920 	    "Desired per-cpu cache size");
1921 	SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1922 	    "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0,
1923 	    "Maximum allowed per-cpu cache size");
1924 
1925 	/*
1926 	 * keg if present.
1927 	 */
1928 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
1929 	    "keg", CTLFLAG_RD, NULL, "");
1930 	keg = zone->uz_keg;
1931 	if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) {
1932 		SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1933 		    "name", CTLFLAG_RD, keg->uk_name, "Keg name");
1934 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1935 		    "rsize", CTLFLAG_RD, &keg->uk_rsize, 0,
1936 		    "Real object size with alignment");
1937 		SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1938 		    "ppera", CTLFLAG_RD, &keg->uk_ppera, 0,
1939 		    "pages per-slab allocation");
1940 		SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1941 		    "ipers", CTLFLAG_RD, &keg->uk_ipers, 0,
1942 		    "items available per-slab");
1943 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1944 		    "align", CTLFLAG_RD, &keg->uk_align, 0,
1945 		    "item alignment mask");
1946 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1947 		    "pages", CTLFLAG_RD, &keg->uk_pages, 0,
1948 		    "Total pages currently allocated from VM");
1949 		SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1950 		    "free", CTLFLAG_RD, &keg->uk_free, 0,
1951 		    "items free in the slab layer");
1952 		SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1953 		    "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
1954 		    keg, 0, sysctl_handle_uma_slab_efficiency, "I",
1955 		    "Slab utilization (100 - internal fragmentation %)");
1956 	} else
1957 		SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1958 		    "name", CTLFLAG_RD, nokeg, "Keg name");
1959 
1960 	/*
1961 	 * Information about zone limits.
1962 	 */
1963 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
1964 	    "limit", CTLFLAG_RD, NULL, "");
1965 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1966 	    "items", CTLFLAG_RD, &zone->uz_items, 0,
1967 	    "current number of cached items");
1968 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1969 	    "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
1970 	    "Maximum number of cached items");
1971 	SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1972 	    "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0,
1973 	    "Number of threads sleeping at limit");
1974 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1975 	    "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
1976 	    "Total zone limit sleeps");
1977 
1978 	/*
1979 	 * Per-domain information.
1980 	 */
1981 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
1982 		domains = vm_ndomains;
1983 	else
1984 		domains = 1;
1985 	domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid),
1986 	    OID_AUTO, "domain", CTLFLAG_RD, NULL, "");
1987 	for (i = 0; i < domains; i++) {
1988 		zdom = &zone->uz_domain[i];
1989 		oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid),
1990 		    OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, NULL, "");
1991 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1992 		    "nitems", CTLFLAG_RD, &zdom->uzd_nitems,
1993 		    "number of items in this domain");
1994 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1995 		    "imax", CTLFLAG_RD, &zdom->uzd_imax,
1996 		    "maximum item count in this period");
1997 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
1998 		    "imin", CTLFLAG_RD, &zdom->uzd_imin,
1999 		    "minimum item count in this period");
2000 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2001 		    "wss", CTLFLAG_RD, &zdom->uzd_wss,
2002 		    "Working set size");
2003 	}
2004 
2005 	/*
2006 	 * General statistics.
2007 	 */
2008 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
2009 	    "stats", CTLFLAG_RD, NULL, "");
2010 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2011 	    "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE,
2012 	    zone, 1, sysctl_handle_uma_zone_cur, "I",
2013 	    "Current number of allocated items");
2014 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2015 	    "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2016 	    zone, 0, sysctl_handle_uma_zone_allocs, "QU",
2017 	    "Total allocation calls");
2018 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2019 	    "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
2020 	    zone, 0, sysctl_handle_uma_zone_frees, "QU",
2021 	    "Total free calls");
2022 	SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2023 	    "fails", CTLFLAG_RD, &zone->uz_fails,
2024 	    "Number of allocation failures");
2025 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
2026 	    "xdomain", CTLFLAG_RD, &zone->uz_xdomain, 0,
2027 	    "Free calls from the wrong domain");
2028 }
2029 
2030 struct uma_zone_count {
2031 	const char	*name;
2032 	int		count;
2033 };
2034 
2035 static void
2036 zone_count(uma_zone_t zone, void *arg)
2037 {
2038 	struct uma_zone_count *cnt;
2039 
2040 	cnt = arg;
2041 	/*
2042 	 * Some zones are rapidly created with identical names and
2043 	 * destroyed out of order.  This can lead to gaps in the count.
2044 	 * Use one greater than the maximum observed for this name.
2045 	 */
2046 	if (strcmp(zone->uz_name, cnt->name) == 0)
2047 		cnt->count = MAX(cnt->count,
2048 		    zone->uz_namecnt + 1);
2049 }
2050 
2051 /*
2052  * Zone header ctor.  This initializes all fields, locks, etc.
2053  *
2054  * Arguments/Returns follow uma_ctor specifications
2055  *	udata  Actually uma_zctor_args
2056  */
2057 static int
2058 zone_ctor(void *mem, int size, void *udata, int flags)
2059 {
2060 	struct uma_zone_count cnt;
2061 	struct uma_zctor_args *arg = udata;
2062 	uma_zone_t zone = mem;
2063 	uma_zone_t z;
2064 	uma_keg_t keg;
2065 	int i;
2066 
2067 	bzero(zone, size);
2068 	zone->uz_name = arg->name;
2069 	zone->uz_ctor = arg->ctor;
2070 	zone->uz_dtor = arg->dtor;
2071 	zone->uz_init = NULL;
2072 	zone->uz_fini = NULL;
2073 	zone->uz_sleeps = 0;
2074 	zone->uz_xdomain = 0;
2075 	zone->uz_bucket_size = 0;
2076 	zone->uz_bucket_size_min = 0;
2077 	zone->uz_bucket_size_max = BUCKET_MAX;
2078 	zone->uz_flags = 0;
2079 	zone->uz_warning = NULL;
2080 	/* The domain structures follow the cpu structures. */
2081 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
2082 	zone->uz_bkt_max = ULONG_MAX;
2083 	timevalclear(&zone->uz_ratecheck);
2084 
2085 	/* Count the number of duplicate names. */
2086 	cnt.name = arg->name;
2087 	cnt.count = 0;
2088 	zone_foreach(zone_count, &cnt);
2089 	zone->uz_namecnt = cnt.count;
2090 
2091 	for (i = 0; i < vm_ndomains; i++)
2092 		TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
2093 
2094 #ifdef INVARIANTS
2095 	if (arg->uminit == trash_init && arg->fini == trash_fini)
2096 		zone->uz_flags |= UMA_ZFLAG_TRASH;
2097 #endif
2098 
2099 	/*
2100 	 * This is a pure cache zone, no kegs.
2101 	 */
2102 	if (arg->import) {
2103 		if (arg->flags & UMA_ZONE_VM)
2104 			arg->flags |= UMA_ZFLAG_CACHEONLY;
2105 		zone->uz_flags = arg->flags;
2106 		zone->uz_size = arg->size;
2107 		zone->uz_import = arg->import;
2108 		zone->uz_release = arg->release;
2109 		zone->uz_arg = arg->arg;
2110 		zone->uz_lockptr = &zone->uz_lock;
2111 		ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
2112 		rw_wlock(&uma_rwlock);
2113 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
2114 		rw_wunlock(&uma_rwlock);
2115 		goto out;
2116 	}
2117 
2118 	/*
2119 	 * Use the regular zone/keg/slab allocator.
2120 	 */
2121 	zone->uz_import = zone_import;
2122 	zone->uz_release = zone_release;
2123 	zone->uz_arg = zone;
2124 	keg = arg->keg;
2125 
2126 	if (arg->flags & UMA_ZONE_SECONDARY) {
2127 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
2128 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
2129 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
2130 		zone->uz_init = arg->uminit;
2131 		zone->uz_fini = arg->fini;
2132 		zone->uz_lockptr = &keg->uk_lock;
2133 		zone->uz_flags |= UMA_ZONE_SECONDARY;
2134 		rw_wlock(&uma_rwlock);
2135 		ZONE_LOCK(zone);
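		/*
		 * Append the secondary zone at the tail of the keg's zone
		 * list; LIST(3) has no tail-insert primitive, so walk to the
		 * last element and insert after it.
		 */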
2136 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
2137 			if (LIST_NEXT(z, uz_link) == NULL) {
2138 				LIST_INSERT_AFTER(z, zone, uz_link);
2139 				break;
2140 			}
2141 		}
2142 		ZONE_UNLOCK(zone);
2143 		rw_wunlock(&uma_rwlock);
2144 	} else if (keg == NULL) {
2145 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
2146 		    arg->align, arg->flags)) == NULL)
2147 			return (ENOMEM);
2148 	} else {
2149 		struct uma_kctor_args karg;
2150 		int error;
2151 
2152 		/* We should only be here from uma_startup() */
2153 		karg.size = arg->size;
2154 		karg.uminit = arg->uminit;
2155 		karg.fini = arg->fini;
2156 		karg.align = arg->align;
2157 		karg.flags = arg->flags;
2158 		karg.zone = zone;
2159 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
2160 		    flags);
2161 		if (error)
2162 			return (error);
2163 	}
2164 
2165 	/* Inherit properties from the keg. */
2166 	zone->uz_keg = keg;
2167 	zone->uz_size = keg->uk_size;
2168 	zone->uz_flags |= (keg->uk_flags &
2169 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
2170 
2171 out:
2172 	if (__predict_true(booted == BOOT_RUNNING)) {
2173 		zone_alloc_counters(zone, NULL);
2174 		zone_alloc_sysctl(zone, NULL);
2175 	} else {
2176 		zone->uz_allocs = EARLY_COUNTER;
2177 		zone->uz_frees = EARLY_COUNTER;
2178 		zone->uz_fails = EARLY_COUNTER;
2179 	}
2180 
2181 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
2182 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
2183 	    ("Invalid zone flag combination"));
2184 	if (arg->flags & UMA_ZFLAG_INTERNAL)
2185 		zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
2186 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
2187 		zone->uz_bucket_size = BUCKET_MAX;
2188 	else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0)
2189 		zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN;
2190 	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
2191 		zone->uz_bucket_size = 0;
2192 	else
2193 		zone->uz_bucket_size = bucket_select(zone->uz_size);
2194 	zone->uz_bucket_size_min = zone->uz_bucket_size;
2195 
2196 	return (0);
2197 }
2198 
2199 /*
2200  * Keg header dtor.  This frees all data, destroys locks, frees the hash
2201  * table and removes the keg from the global list.
2202  *
2203  * Arguments/Returns follow uma_dtor specifications
2204  *	udata  unused
2205  */
2206 static void
2207 keg_dtor(void *arg, int size, void *udata)
2208 {
2209 	uma_keg_t keg;
2210 
2211 	keg = (uma_keg_t)arg;
2212 	KEG_LOCK(keg);
2213 	if (keg->uk_free != 0) {
2214 		printf("Freed UMA keg (%s) was not empty (%d items). "
2215 		    "Lost %d pages of memory.\n",
2216 		    keg->uk_name ? keg->uk_name : "",
2217 		    keg->uk_free, keg->uk_pages);
2218 	}
2219 	KEG_UNLOCK(keg);
2220 
2221 	hash_free(&keg->uk_hash);
2222 
2223 	KEG_LOCK_FINI(keg);
2224 }
2225 
2226 /*
2227  * Zone header dtor.
2228  *
2229  * Arguments/Returns follow uma_dtor specifications
2230  *	udata  unused
2231  */
2232 static void
2233 zone_dtor(void *arg, int size, void *udata)
2234 {
2235 	uma_zone_t zone;
2236 	uma_keg_t keg;
2237 
2238 	zone = (uma_zone_t)arg;
2239 
2240 	sysctl_remove_oid(zone->uz_oid, 1, 1);
2241 
2242 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
2243 		cache_drain(zone);
2244 
2245 	rw_wlock(&uma_rwlock);
2246 	LIST_REMOVE(zone, uz_link);
2247 	rw_wunlock(&uma_rwlock);
2248 	/*
2249 	 * XXX there are some races here where the zone
2250 	 * can be drained, the zone lock released, and the
2251 	 * zone refilled before we remove it from the
2252 	 * list... we don't care for now.
2253 	 */
2254 	zone_reclaim(zone, M_WAITOK, true);
2255 	/*
2256 	 * We only destroy kegs from non secondary/non cache zones.
2257 	 */
2258 	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
2259 		keg = zone->uz_keg;
2260 		rw_wlock(&uma_rwlock);
2261 		LIST_REMOVE(keg, uk_link);
2262 		rw_wunlock(&uma_rwlock);
2263 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
2264 	}
2265 	counter_u64_free(zone->uz_allocs);
2266 	counter_u64_free(zone->uz_frees);
2267 	counter_u64_free(zone->uz_fails);
2268 	free(zone->uz_ctlname, M_UMA);
2269 	if (zone->uz_lockptr == &zone->uz_lock)
2270 		ZONE_LOCK_FINI(zone);
2271 }
2272 
2273 /*
2274  * Traverses every zone in the system and calls a callback
2275  *
2276  * Arguments:
2277  *	zfunc  A pointer to a function which accepts a zone
2278  *		and an opaque pass-through argument.
2279  *
2280  * Returns:
2281  *	Nothing
2282  */
2283 static void
2284 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg)
2285 {
2286 	uma_keg_t keg;
2287 	uma_zone_t zone;
2288 
2289 	/*
2290 	 * Before BOOT_RUNNING we are guaranteed to be single
2291 	 * threaded, so locking isn't needed. Startup functions
2292 	 * are allowed to use M_WAITOK.
2293 	 */
2294 	if (__predict_true(booted == BOOT_RUNNING))
2295 		rw_rlock(&uma_rwlock);
2296 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
2297 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
2298 			zfunc(zone, arg);
2299 	}
2300 	LIST_FOREACH(zone, &uma_cachezones, uz_link)
2301 		zfunc(zone, arg);
2302 	if (__predict_true(booted == BOOT_RUNNING))
2303 		rw_runlock(&uma_rwlock);
2304 }
2305 
2306 /*
2307  * Count how many pages we need to bootstrap.  The VM supplies
2308  * its need for early zones in the argument; we add our own zones,
2309  * which consist of the UMA Slabs, UMA Hash and 9 Bucket zones.  The
2310  * zone of zones and zone of kegs are accounted separately.
2311  */
2312 #define	UMA_BOOT_ZONES	11
2313 /* Zone of zones and zone of kegs have arbitrary alignment. */
2314 #define	UMA_BOOT_ALIGN	32
2315 static int zsize, ksize;
2316 int
2317 uma_startup_count(int vm_zones)
2318 {
2319 	int zones, pages;
2320 	size_t space, size;
2321 
2322 	ksize = sizeof(struct uma_keg) +
2323 	    (sizeof(struct uma_domain) * vm_ndomains);
2324 	zsize = sizeof(struct uma_zone) +
2325 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2326 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2327 
2328 	/*
2329 	 * Memory for the zone of kegs and its keg,
2330 	 * and for zone of zones.
2331 	 */
2332 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2333 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2334 
2335 #ifdef	UMA_MD_SMALL_ALLOC
2336 	zones = UMA_BOOT_ZONES;
2337 #else
2338 	zones = UMA_BOOT_ZONES + vm_zones;
2339 	vm_zones = 0;
2340 #endif
2341 	size = slab_sizeof(SLAB_MAX_SETSIZE);
2342 	space = slab_space(SLAB_MAX_SETSIZE);
2343 
2344 	/* Memory for the rest of startup zones, UMA and VM, ... */
2345 	if (zsize > space) {
2346 		/* See keg_large_init(). */
2347 		u_int ppera;
2348 
2349 		ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE);
2350 		if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) < size)
2351 			ppera++;
2352 		pages += (zones + vm_zones) * ppera;
2353 	} else if (roundup2(zsize, UMA_BOOT_ALIGN) > space)
2354 		/* See keg_small_init() special case for uk_ppera = 1. */
2355 		pages += zones;
2356 	else
2357 		pages += howmany(zones,
2358 		    space / roundup2(zsize, UMA_BOOT_ALIGN));
2359 
2360 	/* ... and their kegs. Note that zone of zones allocates a keg! */
2361 	pages += howmany(zones + 1,
2362 	    space / roundup2(ksize, UMA_BOOT_ALIGN));
2363 
2364 	return (pages);
2365 }
2366 
2367 void
2368 uma_startup(void *mem, int npages)
2369 {
2370 	struct uma_zctor_args args;
2371 	uma_keg_t masterkeg;
2372 	uintptr_t m;
2373 
2374 #ifdef DIAGNOSTIC
2375 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2376 #endif
2377 
2378 	rw_init(&uma_rwlock, "UMA lock");
2379 
2380 	/* Use bootpages memory for the zone of zones and zone of kegs. */
2381 	m = (uintptr_t)mem;
2382 	zones = (uma_zone_t)m;
2383 	m += roundup(zsize, CACHE_LINE_SIZE);
2384 	kegs = (uma_zone_t)m;
2385 	m += roundup(zsize, CACHE_LINE_SIZE);
2386 	masterkeg = (uma_keg_t)m;
2387 	m += roundup(ksize, CACHE_LINE_SIZE);
2388 	m = roundup(m, PAGE_SIZE);
2389 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2390 	mem = (void *)m;
2391 
2392 	/* "manually" create the initial zone */
2393 	memset(&args, 0, sizeof(args));
2394 	args.name = "UMA Kegs";
2395 	args.size = ksize;
2396 	args.ctor = keg_ctor;
2397 	args.dtor = keg_dtor;
2398 	args.uminit = zero_init;
2399 	args.fini = NULL;
2400 	args.keg = masterkeg;
2401 	args.align = UMA_BOOT_ALIGN - 1;
2402 	args.flags = UMA_ZFLAG_INTERNAL;
2403 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2404 
2405 	bootmem = mem;
2406 	boot_pages = npages;
2407 
2408 	args.name = "UMA Zones";
2409 	args.size = zsize;
2410 	args.ctor = zone_ctor;
2411 	args.dtor = zone_dtor;
2412 	args.uminit = zero_init;
2413 	args.fini = NULL;
2414 	args.keg = NULL;
2415 	args.align = UMA_BOOT_ALIGN - 1;
2416 	args.flags = UMA_ZFLAG_INTERNAL;
2417 	zone_ctor(zones, zsize, &args, M_WAITOK);
2418 
2419 	/* Now make a zone for slab headers */
2420 	slabzone = uma_zcreate("UMA Slabs", sizeof(struct uma_hash_slab),
2421 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2422 
2423 	hashzone = uma_zcreate("UMA Hash",
2424 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2425 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2426 
2427 	booted = BOOT_STRAPPED;
2428 }
2429 
2430 void
2431 uma_startup1(void)
2432 {
2433 
2434 #ifdef DIAGNOSTIC
2435 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2436 #endif
2437 	booted = BOOT_PAGEALLOC;
2438 }
2439 
2440 void
2441 uma_startup2(void)
2442 {
2443 
2444 #ifdef DIAGNOSTIC
2445 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2446 #endif
2447 	sx_init(&uma_reclaim_lock, "umareclaim");
2448 	bucket_init();
2449 	booted = BOOT_BUCKETS;
2450 	bucket_enable();
2451 }
2452 
2453 /*
2454  * Finish UMA bootstrapping: allocate counters and sysctl nodes for the
2455  * zones created so far, and arm the uma_timeout callout.
2456  */
2457 static void
2458 uma_startup3(void)
2459 {
2460 
2461 #ifdef INVARIANTS
2462 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2463 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2464 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2465 #endif
2466 	zone_foreach(zone_alloc_counters, NULL);
2467 	zone_foreach(zone_alloc_sysctl, NULL);
2468 	callout_init(&uma_callout, 1);
2469 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2470 	booted = BOOT_RUNNING;
2471 }
2472 
2473 static uma_keg_t
2474 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2475 		int align, uint32_t flags)
2476 {
2477 	struct uma_kctor_args args;
2478 
2479 	args.size = size;
2480 	args.uminit = uminit;
2481 	args.fini = fini;
2482 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2483 	args.flags = flags;
2484 	args.zone = zone;
2485 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2486 }
2487 
2488 /* Public functions */
2489 /* See uma.h */
2490 void
2491 uma_set_align(int align)
2492 {
2493 
2494 	if (align != UMA_ALIGN_CACHE)
2495 		uma_align_cache = align;
2496 }
2497 
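/*
 * Illustrative use of the public API (hypothetical caller, not part of
 * this file):
 *
 *	zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL, NULL, NULL,
 *	    UMA_ALIGN_PTR, 0);
 *	p = uma_zalloc(zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree(zone, p);
 *	uma_zdestroy(zone);
 */
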
2498 /* See uma.h */
2499 uma_zone_t
2500 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2501 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2502 
2503 {
2504 	struct uma_zctor_args args;
2505 	uma_zone_t res;
2506 	bool locked;
2507 
2508 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2509 	    align, name));
2510 
2511 	/* Sets all zones to a first-touch domain policy. */
2512 #ifdef UMA_FIRSTTOUCH
2513 	flags |= UMA_ZONE_NUMA;
2514 #endif
2515 
2516 	/* This stuff is essential for the zone ctor */
2517 	memset(&args, 0, sizeof(args));
2518 	args.name = name;
2519 	args.size = size;
2520 	args.ctor = ctor;
2521 	args.dtor = dtor;
2522 	args.uminit = uminit;
2523 	args.fini = fini;
2524 #ifdef  INVARIANTS
2525 	/*
2526 	 * Inject procedures which check for memory use after free if we are
2527 	 * allowed to scramble the memory while it is not allocated.  This
2528 	 * requires that: UMA is actually able to access the memory, no init
2529 	 * or fini procedures, no dependency on the initial value of the
2530 	 * memory, and no (legitimate) use of the memory after free.  Note,
2531 	 * the ctor and dtor do not need to be empty.
2532 	 *
2533 	 * XXX UMA_ZONE_OFFPAGE.
2534 	 */
2535 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2536 	    uminit == NULL && fini == NULL) {
2537 		args.uminit = trash_init;
2538 		args.fini = trash_fini;
2539 	}
2540 #endif
2541 	args.align = align;
2542 	args.flags = flags;
2543 	args.keg = NULL;
2544 
2545 	if (booted < BOOT_BUCKETS) {
2546 		locked = false;
2547 	} else {
2548 		sx_slock(&uma_reclaim_lock);
2549 		locked = true;
2550 	}
2551 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2552 	if (locked)
2553 		sx_sunlock(&uma_reclaim_lock);
2554 	return (res);
2555 }
2556 
2557 /* See uma.h */
2558 uma_zone_t
2559 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2560 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2561 {
2562 	struct uma_zctor_args args;
2563 	uma_keg_t keg;
2564 	uma_zone_t res;
2565 	bool locked;
2566 
2567 	keg = master->uz_keg;
2568 	memset(&args, 0, sizeof(args));
2569 	args.name = name;
2570 	args.size = keg->uk_size;
2571 	args.ctor = ctor;
2572 	args.dtor = dtor;
2573 	args.uminit = zinit;
2574 	args.fini = zfini;
2575 	args.align = keg->uk_align;
2576 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2577 	args.keg = keg;
2578 
2579 	if (booted < BOOT_BUCKETS) {
2580 		locked = false;
2581 	} else {
2582 		sx_slock(&uma_reclaim_lock);
2583 		locked = true;
2584 	}
2585 	/* XXX Attaches only one keg of potentially many. */
2586 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2587 	if (locked)
2588 		sx_sunlock(&uma_reclaim_lock);
2589 	return (res);
2590 }
2591 
2592 /* See uma.h */
2593 uma_zone_t
2594 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2595 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2596 		    uma_release zrelease, void *arg, int flags)
2597 {
2598 	struct uma_zctor_args args;
2599 
2600 	memset(&args, 0, sizeof(args));
2601 	args.name = name;
2602 	args.size = size;
2603 	args.ctor = ctor;
2604 	args.dtor = dtor;
2605 	args.uminit = zinit;
2606 	args.fini = zfini;
2607 	args.import = zimport;
2608 	args.release = zrelease;
2609 	args.arg = arg;
2610 	args.align = 0;
2611 	args.flags = flags | UMA_ZFLAG_CACHE;
2612 
2613 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2614 }
2615 
2616 /* See uma.h */
2617 void
2618 uma_zdestroy(uma_zone_t zone)
2619 {
2620 
2621 	sx_slock(&uma_reclaim_lock);
2622 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2623 	sx_sunlock(&uma_reclaim_lock);
2624 }
2625 
2626 void
2627 uma_zwait(uma_zone_t zone)
2628 {
2629 	void *item;
2630 
2631 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2632 	uma_zfree(zone, item);
2633 }
2634 
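/*
 * Allocate from a per-CPU zone.  M_ZERO is not passed through to the
 * underlying allocation (uma_zalloc_arg asserts against it for pcpu
 * zones); instead each CPU's copy is zeroed individually here.
 */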
2635 void *
2636 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2637 {
2638 	void *item;
2639 #ifdef SMP
2640 	int i;
2641 
2642 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2643 #endif
2644 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2645 	if (item != NULL && (flags & M_ZERO)) {
2646 #ifdef SMP
2647 		for (i = 0; i <= mp_maxid; i++)
2648 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2649 #else
2650 		bzero(item, zone->uz_size);
2651 #endif
2652 	}
2653 	return (item);
2654 }
2655 
2656 /*
2657  * A stub while both regular and pcpu cases are identical.
2658  */
2659 void
2660 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2661 {
2662 
2663 #ifdef SMP
2664 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2665 #endif
2666 	uma_zfree_arg(zone, item, udata);
2667 }
2668 
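/*
 * Pop an item off the top of a per-CPU bucket (LIFO, so the most
 * recently freed and likely cache-hot item is returned first) and
 * charge the allocation to the cache statistics.  Called from within
 * a critical section.
 */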
2669 static inline void *
2670 bucket_pop(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket)
2671 {
2672 	void *item;
2673 
2674 	bucket->ub_cnt--;
2675 	item = bucket->ub_bucket[bucket->ub_cnt];
2676 #ifdef INVARIANTS
2677 	bucket->ub_bucket[bucket->ub_cnt] = NULL;
2678 	KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2679 #endif
2680 	cache->uc_allocs++;
2681 
2682 	return (item);
2683 }
2684 
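/*
 * Push a freed item onto a per-CPU bucket and charge the free to the
 * cache statistics.  The caller has already verified that the bucket
 * has room.  Called from within a critical section.
 */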
2685 static inline void
2686 bucket_push(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket,
2687     void *item)
2688 {
2689 	KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2690 	    ("uma_zfree: Freeing to non-free bucket index."));
2691 	bucket->ub_bucket[bucket->ub_cnt] = item;
2692 	bucket->ub_cnt++;
2693 	cache->uc_frees++;
2694 }
2695 
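/*
 * Apply the zone constructor (and, under INVARIANTS, the use-after-free
 * trash checks) to an item leaving the cache or slab layer, and honor
 * M_ZERO.  On constructor failure the item is returned to the zone and
 * NULL is returned to the caller.
 */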
2696 static void *
2697 item_ctor(uma_zone_t zone, void *udata, int flags, void *item)
2698 {
2699 #ifdef INVARIANTS
2700 	bool skipdbg;
2701 
2702 	skipdbg = uma_dbg_zskip(zone, item);
2703 	if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
2704 	    zone->uz_ctor != trash_ctor)
2705 		trash_ctor(item, zone->uz_size, udata, flags);
2706 #endif
2707 	if (__predict_false(zone->uz_ctor != NULL) &&
2708 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2709 		counter_u64_add(zone->uz_fails, 1);
2710 		zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT);
2711 		return (NULL);
2712 	}
2713 #ifdef INVARIANTS
2714 	if (!skipdbg)
2715 		uma_dbg_alloc(zone, NULL, item);
2716 #endif
2717 	if (flags & M_ZERO)
2718 		uma_zero_item(item, zone);
2719 
2720 	return (item);
2721 }
2722 
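/*
 * Run the INVARIANTS debug hooks and the zone destructor on an item
 * being freed, honoring the requested skip level.
 */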
2723 static inline void
2724 item_dtor(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
2725 {
2726 #ifdef INVARIANTS
2727 	bool skipdbg;
2728 
2729 	skipdbg = uma_dbg_zskip(zone, item);
2730 	if (skip == SKIP_NONE && !skipdbg) {
2731 		if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0)
2732 			uma_dbg_free(zone, udata, item);
2733 		else
2734 			uma_dbg_free(zone, NULL, item);
2735 	}
2736 #endif
2737 	if (skip < SKIP_DTOR) {
2738 		if (zone->uz_dtor != NULL)
2739 			zone->uz_dtor(item, zone->uz_size, udata);
2740 #ifdef INVARIANTS
2741 		if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 &&
2742 		    zone->uz_dtor != trash_dtor)
2743 			trash_dtor(item, zone->uz_size, udata);
2744 #endif
2745 	}
2746 }
2747 
2748 /* See uma.h */
2749 void *
2750 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2751 {
2752 	uma_bucket_t bucket;
2753 	uma_cache_t cache;
2754 	void *item;
2755 	int cpu, domain;
2756 
2757 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2758 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2759 
2760 	/* This is the fast path allocation */
2761 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2762 	    curthread, zone->uz_name, zone, flags);
2763 
2764 	if (flags & M_WAITOK) {
2765 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2766 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2767 	}
2768 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2769 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2770 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2771 	if (zone->uz_flags & UMA_ZONE_PCPU)
2772 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2773 		    "with M_ZERO passed"));
2774 
2775 #ifdef DEBUG_MEMGUARD
2776 	if (memguard_cmp_zone(zone)) {
2777 		item = memguard_alloc(zone->uz_size, flags);
2778 		if (item != NULL) {
2779 			if (zone->uz_init != NULL &&
2780 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2781 				return (NULL);
2782 			if (zone->uz_ctor != NULL &&
2783 			    zone->uz_ctor(item, zone->uz_size, udata,
2784 			    flags) != 0) {
2785 				counter_u64_add(zone->uz_fails, 1);
2786 			    	zone->uz_fini(item, zone->uz_size);
2787 				return (NULL);
2788 			}
2789 			return (item);
2790 		}
2791 		/* This is unfortunate but should not be fatal. */
2792 	}
2793 #endif
2794 	/*
2795 	 * If possible, allocate from the per-CPU cache.  There are two
2796 	 * requirements for safe access to the per-CPU cache: (1) the thread
2797 	 * accessing the cache must not be preempted or yield during access,
2798 	 * and (2) the thread must not migrate CPUs without switching which
2799 	 * cache it accesses.  We rely on a critical section to prevent
2800 	 * preemption and migration.  We release the critical section in
2801 	 * order to acquire the zone mutex if we are unable to allocate from
2802 	 * the current cache; when we re-acquire the critical section, we
2803 	 * must detect and handle migration if it has occurred.
2804 	 */
2805 	critical_enter();
2806 	do {
2807 		cpu = curcpu;
2808 		cache = &zone->uz_cpu[cpu];
2809 		bucket = cache->uc_allocbucket;
2810 		if (__predict_true(bucket != NULL && bucket->ub_cnt != 0)) {
2811 			item = bucket_pop(zone, cache, bucket);
2812 			critical_exit();
2813 			return (item_ctor(zone, udata, flags, item));
2814 		}
2815 	} while (cache_alloc(zone, cache, udata, flags));
2816 	critical_exit();
2817 
2818 	/*
2819 	 * We could not get a bucket, so allocate and return a single item.
2820 	 */
2821 	if (zone->uz_flags & UMA_ZONE_NUMA)
2822 		domain = PCPU_GET(domain);
2823 	else
2824 		domain = UMA_ANYDOMAIN;
2825 	return (zone_alloc_item_locked(zone, udata, domain, flags));
2826 }
2827 
2828 /*
2829  * Replenish an alloc bucket and possibly restore an old one.  Called in
2830  * a critical section.  Returns in a critical section.
2831  *
2832  * A false return value indicates failure, in which case we return with
2833  * the zone lock held.  On success we return true and the caller should retry.
2834  */
2835 static __noinline bool
2836 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
2837 {
2838 	uma_zone_domain_t zdom;
2839 	uma_bucket_t bucket;
2840 	int cpu, domain;
2841 	bool lockfail;
2842 
2843 	CRITICAL_ASSERT(curthread);
2844 
2845 	/*
2846 	 * If we have run out of items in our alloc bucket, see
2847 	 * if we can switch with the free bucket.
2848 	 */
2849 	bucket = cache->uc_freebucket;
2850 	if (bucket != NULL && bucket->ub_cnt != 0) {
2851 		cache->uc_freebucket = cache->uc_allocbucket;
2852 		cache->uc_allocbucket = bucket;
2853 		return (true);
2854 	}
2855 
2856 	/*
2857 	 * Discard any empty allocation bucket while we hold no locks.
2858 	 */
2859 	bucket = cache->uc_allocbucket;
2860 	cache->uc_allocbucket = NULL;
2861 	critical_exit();
2862 	if (bucket != NULL)
2863 		bucket_free(zone, bucket, udata);
2864 
2865 	/*
2866 	 * The attempt to retrieve an item from the per-CPU cache has failed, so
2867 	 * we must go back to the zone.  This requires the zone lock, so we
2868 	 * must drop the critical section, then re-acquire it when we go back
2869 	 * to the cache.  Since the critical section is released, we may be
2870 	 * preempted or migrate.  As such, make sure not to maintain any
2871 	 * thread-local state specific to the cache from prior to releasing
2872 	 * the critical section.
2873 	 */
2874 	lockfail = false;
2875 	if (ZONE_TRYLOCK(zone) == 0) {
2876 		/* Record contention to size the buckets. */
2877 		ZONE_LOCK(zone);
2878 		lockfail = true;
2879 	}
2880 
2881 	critical_enter();
2882 	/* Short-circuit for bucketless zones or when buckets are disabled. */
2883 	if (zone->uz_bucket_size == 0 || bucketdisable)
2884 		return (false);
2885 
2886 	cpu = curcpu;
2887 	cache = &zone->uz_cpu[cpu];
2888 
2889 	/* See if we lost the race to fill the cache. */
2890 	if (cache->uc_allocbucket != NULL) {
2891 		ZONE_UNLOCK(zone);
2892 		return (true);
2893 	}
2894 
2895 	/*
2896 	 * Check the zone's cache of buckets.
2897 	 */
2898 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2899 		domain = PCPU_GET(domain);
2900 		zdom = &zone->uz_domain[domain];
2901 	} else {
2902 		domain = UMA_ANYDOMAIN;
2903 		zdom = &zone->uz_domain[0];
2904 	}
2905 
2906 	if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
2907 		ZONE_UNLOCK(zone);
2908 		KASSERT(bucket->ub_cnt != 0,
2909 		    ("uma_zalloc_arg: Returning an empty bucket."));
2910 		cache->uc_allocbucket = bucket;
2911 		return (true);
2912 	}
2913 	/* We are no longer associated with this CPU. */
2914 	critical_exit();
2915 
2916 	/*
2917 	 * We bump the desired bucket size (uz_bucket_size) when lock contention
2918 	 * suggests the per-CPU caches are too small for the working set.
2919 	 */
2920 	if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
2921 		zone->uz_bucket_size++;
2922 
2923 	/*
2924 	 * Fill a bucket and attempt to use it as the alloc bucket.
2925 	 */
2926 	bucket = zone_alloc_bucket(zone, udata, domain, flags);
2927 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2928 	    zone->uz_name, zone, bucket);
2929 	critical_enter();
2930 	if (bucket == NULL)
2931 		return (false);
2932 
2933 	/*
2934 	 * See if we lost the race or were migrated.  Cache the
2935 	 * initialized bucket to make this less likely or claim
2936 	 * the memory directly.
2937 	 */
2938 	cpu = curcpu;
2939 	cache = &zone->uz_cpu[cpu];
2940 	if (cache->uc_allocbucket == NULL &&
2941 	    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2942 	    domain == PCPU_GET(domain))) {
2943 		cache->uc_allocbucket = bucket;
2944 		zdom->uzd_imax += bucket->ub_cnt;
2945 	} else if (zone->uz_bkt_count >= zone->uz_bkt_max) {
2946 		critical_exit();
2947 		ZONE_UNLOCK(zone);
2948 		bucket_drain(zone, bucket);
2949 		bucket_free(zone, bucket, udata);
2950 		critical_enter();
2951 		return (true);
2952 	} else
2953 		zone_put_bucket(zone, zdom, bucket, false);
2954 	ZONE_UNLOCK(zone);
2955 	return (true);
2956 }
2957 
2958 void *
2959 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2960 {
2961 
2962 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2963 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2964 
2965 	/* This is the fast path allocation */
2966 	CTR5(KTR_UMA,
2967 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2968 	    curthread, zone->uz_name, zone, domain, flags);
2969 
2970 	if (flags & M_WAITOK) {
2971 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2972 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2973 	}
2974 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2975 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2976 
2977 	return (zone_alloc_item(zone, udata, domain, flags));
2978 }
2979 
2980 /*
2981  * Find a slab with some space.  Prefer slabs that are partially used over those
2982  * that are totally free.  This helps to reduce fragmentation.
2983  *
2984  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2985  * only 'domain'.
2986  */
2987 static uma_slab_t
2988 keg_first_slab(uma_keg_t keg, int domain, bool rr)
2989 {
2990 	uma_domain_t dom;
2991 	uma_slab_t slab;
2992 	int start;
2993 
2994 	KASSERT(domain >= 0 && domain < vm_ndomains,
2995 	    ("keg_first_slab: domain %d out of range", domain));
2996 	KEG_LOCK_ASSERT(keg);
2997 
2998 	slab = NULL;
2999 	start = domain;
3000 	do {
3001 		dom = &keg->uk_domain[domain];
3002 		if (!LIST_EMPTY(&dom->ud_part_slab))
3003 			return (LIST_FIRST(&dom->ud_part_slab));
3004 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
3005 			slab = LIST_FIRST(&dom->ud_free_slab);
3006 			LIST_REMOVE(slab, us_link);
3007 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3008 			return (slab);
3009 		}
3010 		if (rr)
3011 			domain = (domain + 1) % vm_ndomains;
3012 	} while (domain != start);
3013 
3014 	return (NULL);
3015 }
3016 
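/*
 * Like keg_first_slab(), but refuse to dip below the keg's reserve of
 * free items unless M_USE_RESERVE was specified.
 */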
3017 static uma_slab_t
3018 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
3019 {
3020 	uint32_t reserve;
3021 
3022 	KEG_LOCK_ASSERT(keg);
3023 
3024 	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
3025 	if (keg->uk_free <= reserve)
3026 		return (NULL);
3027 	return (keg_first_slab(keg, domain, rr));
3028 }
3029 
3030 static uma_slab_t
3031 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
3032 {
3033 	struct vm_domainset_iter di;
3034 	uma_domain_t dom;
3035 	uma_slab_t slab;
3036 	int aflags, domain;
3037 	bool rr;
3038 
3039 restart:
3040 	KEG_LOCK_ASSERT(keg);
3041 
3042 	/*
3043 	 * Use the keg's policy if upper layers haven't already specified a
3044 	 * domain (as happens with first-touch zones).
3045 	 *
3046 	 * To avoid races we run the iterator with the keg lock held, but that
3047 	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
3048 	 * clear M_WAITOK and handle low memory conditions locally.
3049 	 */
3050 	rr = rdomain == UMA_ANYDOMAIN;
3051 	if (rr) {
3052 		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
3053 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3054 		    &aflags);
3055 	} else {
3056 		aflags = flags;
3057 		domain = rdomain;
3058 	}
3059 
3060 	for (;;) {
3061 		slab = keg_fetch_free_slab(keg, domain, rr, flags);
3062 		if (slab != NULL)
3063 			return (slab);
3064 
3065 		/*
3066 		 * M_NOVM means don't ask at all!
3067 		 */
3068 		if (flags & M_NOVM)
3069 			break;
3070 
3071 		KASSERT(zone->uz_max_items == 0 ||
3072 		    zone->uz_items <= zone->uz_max_items,
3073 		    ("%s: zone %p overflow", __func__, zone));
3074 
3075 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
3076 		/*
3077 		 * If we got a slab here it's safe to mark it partially used
3078 		 * and return.  We assume that the caller is going to remove
3079 		 * at least one item.
3080 		 */
3081 		if (slab) {
3082 			dom = &keg->uk_domain[slab->us_domain];
3083 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3084 			return (slab);
3085 		}
3086 		KEG_LOCK(keg);
3087 		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
3088 			if ((flags & M_WAITOK) != 0) {
3089 				KEG_UNLOCK(keg);
3090 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
3091 				KEG_LOCK(keg);
3092 				goto restart;
3093 			}
3094 			break;
3095 		}
3096 	}
3097 
3098 	/*
3099 	 * We might not have been able to get a slab, but another CPU
3100 	 * could have while we were unlocked.  Check again before we
3101 	 * fail.
3102 	 */
3103 	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
3104 		return (slab);
3105 	}
3106 	return (NULL);
3107 }
3108 
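/*
 * Allocate one item from a slab: claim the first free bit in the
 * slab's bitmap, update the free counts, and move the slab to the
 * full list if it is now exhausted.
 */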
3109 static void *
3110 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
3111 {
3112 	uma_domain_t dom;
3113 	void *item;
3114 	uint8_t freei;
3115 
3116 	KEG_LOCK_ASSERT(keg);
3117 
3118 	freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1;
3119 	BIT_CLR(keg->uk_ipers, freei, &slab->us_free);
3120 	item = slab_item(slab, keg, freei);
3121 	slab->us_freecount--;
3122 	keg->uk_free--;
3123 
3124 	/* Move this slab to the full list */
3125 	if (slab->us_freecount == 0) {
3126 		LIST_REMOVE(slab, us_link);
3127 		dom = &keg->uk_domain[slab->us_domain];
3128 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
3129 	}
3130 
3131 	return (item);
3132 }
3133 
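/*
 * The uz_import backend for keg-backed zones: fill 'bucket' with up to
 * 'max' items pulled from the keg's slabs, honoring the keg reserve,
 * and return the number of items actually obtained.
 */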
3134 static int
3135 zone_import(void *arg, void **bucket, int max, int domain, int flags)
3136 {
3137 	uma_zone_t zone;
3138 	uma_slab_t slab;
3139 	uma_keg_t keg;
3140 #ifdef NUMA
3141 	int stripe;
3142 #endif
3143 	int i;
3144 
3145 	zone = arg;
3146 	slab = NULL;
3147 	keg = zone->uz_keg;
3148 	KEG_LOCK(keg);
3149 	/* Try to keep the buckets totally full */
3150 	for (i = 0; i < max; ) {
3151 		if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL)
3152 			break;
3153 #ifdef NUMA
3154 		stripe = howmany(max, vm_ndomains);
3155 #endif
3156 		while (slab->us_freecount && i < max) {
3157 			bucket[i++] = slab_alloc_item(keg, slab);
3158 			if (keg->uk_free <= keg->uk_reserve)
3159 				break;
3160 #ifdef NUMA
3161 			/*
3162 			 * If the zone is striped we pick a new slab for every
3163 			 * N allocations.  Eliminating this conditional will
3164 			 * instead pick a new domain for each bucket rather
3165 			 * than stripe within each bucket.  The current option
3166 			 * produces more fragmentation and requires more cpu
3167 			 * time but yields better distribution.
3168 			 */
3169 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
3170 			    vm_ndomains > 1 && --stripe == 0)
3171 				break;
3172 #endif
3173 		}
3174 		/* Don't block if we allocated any successfully. */
3175 		flags &= ~M_WAITOK;
3176 		flags |= M_NOWAIT;
3177 	}
3178 	KEG_UNLOCK(keg);
3179 
3180 	return (i);
3181 }
3182 
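/*
 * Allocate a bucket and fill it via the zone's import function.  Called
 * with the zone lock held; the lock is dropped while the bucket is
 * filled and reacquired before returning.  Returns NULL if no items
 * could be obtained.
 */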
3183 static uma_bucket_t
3184 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3185 {
3186 	uma_bucket_t bucket;
3187 	int maxbucket, cnt;
3188 
3189 	CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
3190 
3191 	/* Avoid allocs targeting empty domains. */
3192 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3193 		domain = UMA_ANYDOMAIN;
3194 
3195 	if (zone->uz_max_items > 0) {
3196 		if (zone->uz_items >= zone->uz_max_items)
3197 			return (NULL);
3198 		maxbucket = MIN(zone->uz_bucket_size,
3199 		    zone->uz_max_items - zone->uz_items);
3200 		zone->uz_items += maxbucket;
3201 	} else
3202 		maxbucket = zone->uz_bucket_size;
3203 	ZONE_UNLOCK(zone);
3204 
3205 	/* Don't wait for buckets, preserve caller's NOVM setting. */
3206 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3207 	if (bucket == NULL) {
3208 		cnt = 0;
3209 		goto out;
3210 	}
3211 
3212 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3213 	    MIN(maxbucket, bucket->ub_entries), domain, flags);
3214 
3215 	/*
3216 	 * Initialize the memory if necessary.
3217 	 */
3218 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3219 		int i;
3220 
3221 		for (i = 0; i < bucket->ub_cnt; i++)
3222 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3223 			    flags) != 0)
3224 				break;
3225 		/*
3226 		 * If we couldn't initialize the whole bucket, put the
3227 		 * rest back onto the freelist.
3228 		 */
3229 		if (i != bucket->ub_cnt) {
3230 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3231 			    bucket->ub_cnt - i);
3232 #ifdef INVARIANTS
3233 			bzero(&bucket->ub_bucket[i],
3234 			    sizeof(void *) * (bucket->ub_cnt - i));
3235 #endif
3236 			bucket->ub_cnt = i;
3237 		}
3238 	}
3239 
3240 	cnt = bucket->ub_cnt;
3241 	if (bucket->ub_cnt == 0) {
3242 		bucket_free(zone, bucket, udata);
3243 		counter_u64_add(zone->uz_fails, 1);
3244 		bucket = NULL;
3245 	}
3246 out:
3247 	ZONE_LOCK(zone);
3248 	if (zone->uz_max_items > 0 && cnt < maxbucket) {
3249 		MPASS(zone->uz_items >= maxbucket - cnt);
3250 		zone->uz_items -= maxbucket - cnt;
3251 		if (zone->uz_sleepers > 0 &&
3252 		    (cnt == 0 ? zone->uz_items + 1 : zone->uz_items) <
3253 		    zone->uz_max_items)
3254 			wakeup_one(zone);
3255 	}
3256 
3257 	return (bucket);
3258 }
3259 
3260 /*
3261  * Allocates a single item from a zone.
3262  *
3263  * Arguments
3264  *	zone   The zone to alloc for.
3265  *	udata  The data to be passed to the constructor.
3266  *	domain The domain to allocate from or UMA_ANYDOMAIN.
3267  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
3268  *
3269  * Returns
3270  *	NULL if there is no memory and M_NOWAIT is set
3271  *	An item if successful
3272  */
3273 
3274 static void *
3275 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3276 {
3277 
3278 	ZONE_LOCK(zone);
3279 	return (zone_alloc_item_locked(zone, udata, domain, flags));
3280 }
3281 
3282 /*
3283  * Returns with zone unlocked.
3284  */
3285 static void *
3286 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
3287 {
3288 	void *item;
3289 
3290 	ZONE_LOCK_ASSERT(zone);
3291 
3292 	if (zone->uz_max_items > 0) {
3293 		if (zone->uz_items >= zone->uz_max_items) {
3294 			zone_log_warning(zone);
3295 			zone_maxaction(zone);
3296 			if (flags & M_NOWAIT) {
3297 				ZONE_UNLOCK(zone);
3298 				return (NULL);
3299 			}
3300 			zone->uz_sleeps++;
3301 			zone->uz_sleepers++;
3302 			while (zone->uz_items >= zone->uz_max_items)
3303 				mtx_sleep(zone, zone->uz_lockptr, PVM,
3304 				    "zonelimit", 0);
3305 			zone->uz_sleepers--;
3306 			if (zone->uz_sleepers > 0 &&
3307 			    zone->uz_items + 1 < zone->uz_max_items)
3308 				wakeup_one(zone);
3309 		}
3310 		zone->uz_items++;
3311 	}
3312 	ZONE_UNLOCK(zone);
3313 
3314 	/* Avoid allocs targeting empty domains. */
3315 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3316 		domain = UMA_ANYDOMAIN;
3317 
3318 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3319 		goto fail_cnt;
3320 
3321 	/*
3322 	 * We have to call both the zone's init (not the keg's init)
3323 	 * and the zone's ctor.  This is because the item is going from
3324 	 * a keg slab directly to the user, and the user is expecting it
3325 	 * to be both zone-init'd as well as zone-ctor'd.
3326 	 */
3327 	if (zone->uz_init != NULL) {
3328 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3329 			zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT);
3330 			goto fail_cnt;
3331 		}
3332 	}
3333 	item = item_ctor(zone, udata, flags, item);
3334 	if (item == NULL)
3335 		goto fail;
3336 
3337 	counter_u64_add(zone->uz_allocs, 1);
3338 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3339 	    zone->uz_name, zone);
3340 
3341 	return (item);
3342 
3343 fail_cnt:
3344 	counter_u64_add(zone->uz_fails, 1);
3345 fail:
3346 	if (zone->uz_max_items > 0) {
3347 		ZONE_LOCK(zone);
3348 		/* XXX Decrement without wakeup */
3349 		zone->uz_items--;
3350 		ZONE_UNLOCK(zone);
3351 	}
3352 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3353 	    zone->uz_name, zone);
3354 	return (NULL);
3355 }
3356 
3357 /* See uma.h */
3358 void
3359 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3360 {
3361 	uma_cache_t cache;
3362 	uma_bucket_t bucket;
3363 	int cpu, domain, itemdomain;
3364 
3365 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3366 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3367 
3368 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3369 	    zone->uz_name);
3370 
3371 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3372 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3373 
3374 	/* uma_zfree(..., NULL) does nothing, to match free(9). */
3375 	if (item == NULL)
3376 		return;
3377 #ifdef DEBUG_MEMGUARD
3378 	if (is_memguard_addr(item)) {
3379 		if (zone->uz_dtor != NULL)
3380 			zone->uz_dtor(item, zone->uz_size, udata);
3381 		if (zone->uz_fini != NULL)
3382 			zone->uz_fini(item, zone->uz_size);
3383 		memguard_free(item);
3384 		return;
3385 	}
3386 #endif
3387 	item_dtor(zone, item, udata, SKIP_NONE);
3388 
3389 	/*
3390 	 * The race here is acceptable.  If we miss it we'll just have to wait
3391 	 * a little longer for the limits to be reset.
3392 	 */
3393 	if (zone->uz_sleepers > 0)
3394 		goto zfree_item;
3395 
3396 	/*
3397 	 * If possible, free to the per-CPU cache.  There are two
3398 	 * requirements for safe access to the per-CPU cache: (1) the thread
3399 	 * accessing the cache must not be preempted or yield during access,
3400 	 * and (2) the thread must not migrate CPUs without switching which
3401 	 * cache it accesses.  We rely on a critical section to prevent
3402 	 * preemption and migration.  We release the critical section in
3403 	 * order to acquire the zone mutex if we are unable to free to the
3404 	 * current cache; when we re-acquire the critical section, we must
3405 	 * detect and handle migration if it has occurred.
3406 	 */
3407 	domain = itemdomain = 0;
3408 	critical_enter();
3409 	do {
3410 		cpu = curcpu;
3411 		cache = &zone->uz_cpu[cpu];
3412 		bucket = cache->uc_allocbucket;
3413 #ifdef UMA_XDOMAIN
3414 		if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3415 			itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
3416 			domain = PCPU_GET(domain);
3417 		}
3418 		if ((zone->uz_flags & UMA_ZONE_NUMA) != 0 &&
3419 		    domain != itemdomain) {
3420 			bucket = cache->uc_crossbucket;
3421 		} else
3422 #endif
3423 
3424 		/*
3425 		 * Try to free into the allocbucket first to give LIFO ordering
3426 		 * for cache-hot data structures.  Spill over into the freebucket
3427 		 * if necessary.  Alloc will swap them if one runs dry.
3428 		 */
3429 		if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3430 			bucket = cache->uc_freebucket;
3431 		if (__predict_true(bucket != NULL &&
3432 		    bucket->ub_cnt < bucket->ub_entries)) {
3433 			bucket_push(zone, cache, bucket, item);
3434 			critical_exit();
3435 			return;
3436 		}
3437 	} while (cache_free(zone, cache, udata, item, itemdomain));
3438 	critical_exit();
3439 
3440 	/*
3441 	 * If nothing else caught this, we'll just do an internal free.
3442 	 */
3443 zfree_item:
3444 	zone_free_item(zone, item, udata, SKIP_DTOR);
3445 }
3446 
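/*
 * Hand a full bucket back to the zone.  Cross-domain buckets on systems
 * with more than two domains are drained to the slab layer; otherwise
 * the bucket is cached in the per-domain bucket list unless the zone's
 * bucket cache is already at its limit (uz_bkt_max), in which case it
 * is drained and freed.
 */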
3447 static void
3448 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata,
3449     int domain, int itemdomain)
3450 {
3451 	uma_zone_domain_t zdom;
3452 
3453 #ifdef UMA_XDOMAIN
3454 	/*
3455 	 * On a two-domain system, a bucket coming from the wrong domain is
3456 	 * entirely for the only other domain, so we can simply cache it.
3457 	 * With more than two domains we need to sort the items back to their
3458 	 * correct domains by freeing the contents to the slab layer.
3459 	 */
3460 	if (domain != itemdomain && vm_ndomains > 2) {
3461 		CTR3(KTR_UMA,
3462 		    "uma_zfree: zone %s(%p) draining cross bucket %p",
3463 		    zone->uz_name, zone, bucket);
3464 		bucket_drain(zone, bucket);
3465 		bucket_free(zone, bucket, udata);
3466 		return;
3467 	}
3468 #endif
3469 	/*
3470 	 * Attempt to save the bucket in the zone's domain bucket cache.
3471 	 *
3472 	 * We bump the desired bucket size (uz_bucket_size) when lock contention
3473 	 * suggests the per-CPU caches are too small for the working set.
3474 	 */
3475 	if (ZONE_TRYLOCK(zone) == 0) {
3476 		/* Record contention to size the buckets. */
3477 		ZONE_LOCK(zone);
3478 		if (zone->uz_bucket_size < zone->uz_bucket_size_max)
3479 			zone->uz_bucket_size++;
3480 	}
3481 
3482 	CTR3(KTR_UMA,
3483 	    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3484 	    zone->uz_name, zone, bucket);
3485 	/* The bucket must be full at this point. */
3486 	KASSERT(bucket->ub_cnt == bucket->ub_entries,
3487 	    ("uma_zfree: Attempting to insert a partial bucket onto the full list.\n"));
3488 	if (zone->uz_bkt_count >= zone->uz_bkt_max) {
3489 		ZONE_UNLOCK(zone);
3490 		bucket_drain(zone, bucket);
3491 		bucket_free(zone, bucket, udata);
3492 	} else {
3493 		zdom = &zone->uz_domain[itemdomain];
3494 		zone_put_bucket(zone, zdom, bucket, true);
3495 		ZONE_UNLOCK(zone);
3496 	}
3497 }
3498 
3499 /*
3500  * Populate a free or cross bucket for the current CPU cache.  Free any
3501  * existing full bucket either to the zone cache or back to the slab layer.
3502  *
3503  * Enters and returns in a critical section.  A false return indicates that
3504  * we cannot satisfy this free in the cache layer.  A true return indicates
3505  * that the caller should retry.
3506  */
3507 static __noinline bool
3508 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item,
3509     int itemdomain)
3510 {
3511 	uma_bucket_t bucket;
3512 	int cpu, domain;
3513 
3514 	CRITICAL_ASSERT(curthread);
3515 
3516 	if (zone->uz_bucket_size == 0 || bucketdisable)
3517 		return (false);
3518 
3519 	cpu = curcpu;
3520 	cache = &zone->uz_cpu[cpu];
3521 
3522 	/*
3523 	 * NUMA domains need to free to the correct zdom.  When XDOMAIN
3524 	 * is enabled this is the zdom of the item and the bucket may be
3525 	 * the cross bucket if they do not match.
3526 	 */
3527 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3528 #ifdef UMA_XDOMAIN
3529 		domain = PCPU_GET(domain);
3530 #else
3531 		itemdomain = domain = PCPU_GET(domain);
3532 #endif
3533 	else
3534 		itemdomain = domain = 0;
3535 #ifdef UMA_XDOMAIN
3536 	if (domain != itemdomain) {
3537 		bucket = cache->uc_crossbucket;
3538 		cache->uc_crossbucket = NULL;
3539 		if (bucket != NULL)
3540 			atomic_add_64(&zone->uz_xdomain, bucket->ub_cnt);
3541 	} else
3542 #endif
3543 	{
3544 		bucket = cache->uc_freebucket;
3545 		cache->uc_freebucket = NULL;
3546 	}
3547 
3549 	/* We are no longer associated with this CPU. */
3550 	critical_exit();
3551 
3552 	if (bucket != NULL)
3553 		zone_free_bucket(zone, bucket, udata, domain, itemdomain);
3554 
3555 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3556 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3557 	    zone->uz_name, zone, bucket);
3558 	critical_enter();
3559 	if (bucket == NULL)
3560 		return (false);
3561 	cpu = curcpu;
3562 	cache = &zone->uz_cpu[cpu];
3563 #ifdef UMA_XDOMAIN
3564 	/*
3565 	 * Check to see if we should be populating the cross bucket.  If it
3566 	 * is already populated we will fall through and attempt to populate
3567 	 * the free bucket.
3568 	 */
3569 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3570 		domain = PCPU_GET(domain);
3571 		if (domain != itemdomain && cache->uc_crossbucket == NULL) {
3572 			cache->uc_crossbucket = bucket;
3573 			return (true);
3574 		}
3575 	}
3576 #endif
3577 	/*
3578 	 * We may have lost the race to fill the bucket or switched CPUs.
3579 	 */
3580 	if (cache->uc_freebucket != NULL) {
3581 		critical_exit();
3582 		bucket_free(zone, bucket, udata);
3583 		critical_enter();
3584 	} else
3585 		cache->uc_freebucket = bucket;
3586 
3587 	return (true);
3588 }
3589 
3590 void
3591 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3592 {
3593 
3594 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3595 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3596 
3597 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3598 	    zone->uz_name);
3599 
3600 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3601 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3602 
3603 	/* uma_zfree(..., NULL) does nothing, to match free(9). */
3604 	if (item == NULL)
3605 		return;
3606 	zone_free_item(zone, item, udata, SKIP_NONE);
3607 }
3608 
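/*
 * Return one item to its slab: set the item's bit in the free bitmap,
 * update the free counts, and move the slab between the full, partial
 * and free lists as needed.
 */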
3609 static void
3610 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item)
3611 {
3612 	uma_keg_t keg;
3613 	uma_domain_t dom;
3614 	uint8_t freei;
3615 
3616 	keg = zone->uz_keg;
3617 	MPASS(zone->uz_lockptr == &keg->uk_lock);
3618 	KEG_LOCK_ASSERT(keg);
3619 
3620 	dom = &keg->uk_domain[slab->us_domain];
3621 
3622 	/* Do we need to remove from any lists? */
3623 	if (slab->us_freecount+1 == keg->uk_ipers) {
3624 		LIST_REMOVE(slab, us_link);
3625 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3626 	} else if (slab->us_freecount == 0) {
3627 		LIST_REMOVE(slab, us_link);
3628 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3629 	}
3630 
3631 	/* Slab management. */
3632 	freei = slab_item_index(slab, keg, item);
3633 	BIT_SET(keg->uk_ipers, freei, &slab->us_free);
3634 	slab->us_freecount++;
3635 
3636 	/* Keg statistics. */
3637 	keg->uk_free++;
3638 }
3639 
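/*
 * Bucket release handler (uz_release) for keg-backed zones.  For each item
 * the backing slab is located via vtoslab(), the keg hash, or the in-slab
 * header at uk_pgoff, depending on the zone flags, and the item is returned
 * with slab_free_item() under the keg lock.
 */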
3640 static void
3641 zone_release(void *arg, void **bucket, int cnt)
3642 {
3643 	uma_zone_t zone;
3644 	void *item;
3645 	uma_slab_t slab;
3646 	uma_keg_t keg;
3647 	uint8_t *mem;
3648 	int i;
3649 
3650 	zone = arg;
3651 	keg = zone->uz_keg;
3652 	KEG_LOCK(keg);
3653 	for (i = 0; i < cnt; i++) {
3654 		item = bucket[i];
3655 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3656 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3657 			if (zone->uz_flags & UMA_ZONE_HASH) {
3658 				slab = hash_sfind(&keg->uk_hash, mem);
3659 			} else {
3660 				mem += keg->uk_pgoff;
3661 				slab = (uma_slab_t)mem;
3662 			}
3663 		} else
3664 			slab = vtoslab((vm_offset_t)item);
3665 		slab_free_item(zone, slab, item);
3666 	}
3667 	KEG_UNLOCK(keg);
3668 }
3669 
3670 /*
3671  * Frees a single item to any zone.
3672  *
3673  * Arguments:
3674  *	zone   The zone to free to
3675  *	item   The item we're freeing
3676  *	udata  User supplied data for the dtor
3677  *	skip   Skip dtors and finis
3678  */
3679 static void
3680 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3681 {
3682 
3683 	item_dtor(zone, item, udata, skip);
3684 
3685 	if (skip < SKIP_FINI && zone->uz_fini)
3686 		zone->uz_fini(item, zone->uz_size);
3687 
3688 	zone->uz_release(zone->uz_arg, &item, 1);
3689 
3690 	if (skip & SKIP_CNT)
3691 		return;
3692 
3693 	counter_u64_add(zone->uz_frees, 1);
3694 
3695 	if (zone->uz_max_items > 0) {
3696 		ZONE_LOCK(zone);
3697 		zone->uz_items--;
3698 		if (zone->uz_sleepers > 0 &&
3699 		    zone->uz_items < zone->uz_max_items)
3700 			wakeup_one(zone);
3701 		ZONE_UNLOCK(zone);
3702 	}
3703 }
3704 
3705 /* See uma.h */
3706 int
3707 uma_zone_set_max(uma_zone_t zone, int nitems)
3708 {
3709 	struct uma_bucket_zone *ubz;
3710 	int count;
3711 
3712 	ZONE_LOCK(zone);
3713 	ubz = bucket_zone_max(zone, nitems);
3714 	count = ubz != NULL ? ubz->ubz_entries : 0;
3715 	zone->uz_bucket_size_max = zone->uz_bucket_size = count;
3716 	if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
3717 		zone->uz_bucket_size_min = zone->uz_bucket_size_max;
3718 	zone->uz_max_items = nitems;
3719 	ZONE_UNLOCK(zone);
3720 
3721 	return (nitems);
3722 }
3723 
3724 /* See uma.h */
3725 void
3726 uma_zone_set_maxcache(uma_zone_t zone, int nitems)
3727 {
3728 	struct uma_bucket_zone *ubz;
3729 	int bpcpu;
3730 
3731 	ZONE_LOCK(zone);
3732 	ubz = bucket_zone_max(zone, nitems);
3733 	if (ubz != NULL) {
3734 		bpcpu = 2;
3735 #ifdef UMA_XDOMAIN
3736 		if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
3737 			/* Count the cross-domain bucket. */
3738 			bpcpu++;
3739 #endif
3740 		nitems -= ubz->ubz_entries * bpcpu * mp_ncpus;
3741 		zone->uz_bucket_size_max = ubz->ubz_entries;
3742 	} else {
3743 		zone->uz_bucket_size_max = zone->uz_bucket_size = 0;
3744 	}
3745 	if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
3746 		zone->uz_bucket_size_min = zone->uz_bucket_size_max;
3747 	zone->uz_bkt_max = nitems;
3748 	ZONE_UNLOCK(zone);
3749 }
3750 
3751 /* See uma.h */
3752 int
3753 uma_zone_get_max(uma_zone_t zone)
3754 {
3755 	int nitems;
3756 
3757 	ZONE_LOCK(zone);
3758 	nitems = zone->uz_max_items;
3759 	ZONE_UNLOCK(zone);
3760 
3761 	return (nitems);
3762 }
3763 
3764 /* See uma.h */
3765 void
3766 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3767 {
3768 
3769 	ZONE_LOCK(zone);
3770 	zone->uz_warning = warning;
3771 	ZONE_UNLOCK(zone);
3772 }
3773 
3774 /* See uma.h */
3775 void
3776 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3777 {
3778 
3779 	ZONE_LOCK(zone);
3780 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3781 	ZONE_UNLOCK(zone);
3782 }
3783 
3784 /* See uma.h */
3785 int
3786 uma_zone_get_cur(uma_zone_t zone)
3787 {
3788 	int64_t nitems;
3789 	u_int i;
3790 
3791 	ZONE_LOCK(zone);
3792 	nitems = counter_u64_fetch(zone->uz_allocs) -
3793 	    counter_u64_fetch(zone->uz_frees);
3794 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) {
3795 		CPU_FOREACH(i) {
3796 			/*
3797 			 * See the comment in uma_vm_zone_stats() regarding
3798 			 * the safety of accessing the per-cpu caches. With
3799 			 * the zone lock held, it is safe, but can potentially
3800 			 * result in stale data.
3801 			 */
3802 			nitems += zone->uz_cpu[i].uc_allocs -
3803 			    zone->uz_cpu[i].uc_frees;
3804 		}
3805 	}
3806 	ZONE_UNLOCK(zone);
3807 
3808 	return (nitems < 0 ? 0 : nitems);
3809 }
3810 
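/*
 * Total number of allocations for the zone: the zone-wide counter plus, for
 * non-internal zones, the unsynchronized per-cpu cache counts, which may be
 * slightly stale.
 */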
3811 static uint64_t
3812 uma_zone_get_allocs(uma_zone_t zone)
3813 {
3814 	uint64_t nitems;
3815 	u_int i;
3816 
3817 	ZONE_LOCK(zone);
3818 	nitems = counter_u64_fetch(zone->uz_allocs);
3819 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) {
3820 		CPU_FOREACH(i) {
3821 			/*
3822 			 * See the comment in uma_vm_zone_stats() regarding
3823 			 * the safety of accessing the per-cpu caches. With
3824 			 * the zone lock held, it is safe, but can potentially
3825 			 * result in stale data.
3826 			 */
3827 			nitems += zone->uz_cpu[i].uc_allocs;
3828 		}
3829 	}
3830 	ZONE_UNLOCK(zone);
3831 
3832 	return (nitems);
3833 }
3834 
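/*
 * Total number of frees for the zone, computed the same way as
 * uma_zone_get_allocs() above.
 */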
3835 static uint64_t
3836 uma_zone_get_frees(uma_zone_t zone)
3837 {
3838 	uint64_t nitems;
3839 	u_int i;
3840 
3841 	ZONE_LOCK(zone);
3842 	nitems = counter_u64_fetch(zone->uz_frees);
3843 	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) {
3844 		CPU_FOREACH(i) {
3845 			/*
3846 			 * See the comment in uma_vm_zone_stats() regarding
3847 			 * the safety of accessing the per-cpu caches. With
3848 			 * the zone lock held, it is safe, but can potentially
3849 			 * result in stale data.
3850 			 */
3851 			nitems += zone->uz_cpu[i].uc_frees;
3852 		}
3853 	}
3854 	ZONE_UNLOCK(zone);
3855 
3856 	return (nitems);
3857 }
3858 
3859 /* See uma.h */
3860 void
3861 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3862 {
3863 	uma_keg_t keg;
3864 
3865 	KEG_GET(zone, keg);
3866 	KEG_LOCK(keg);
3867 	KASSERT(keg->uk_pages == 0,
3868 	    ("uma_zone_set_init on non-empty keg"));
3869 	keg->uk_init = uminit;
3870 	KEG_UNLOCK(keg);
3871 }
3872 
3873 /* See uma.h */
3874 void
3875 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3876 {
3877 	uma_keg_t keg;
3878 
3879 	KEG_GET(zone, keg);
3880 	KEG_LOCK(keg);
3881 	KASSERT(keg->uk_pages == 0,
3882 	    ("uma_zone_set_fini on non-empty keg"));
3883 	keg->uk_fini = fini;
3884 	KEG_UNLOCK(keg);
3885 }
3886 
3887 /* See uma.h */
3888 void
3889 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3890 {
3891 
3892 	ZONE_LOCK(zone);
3893 	KASSERT(zone->uz_keg->uk_pages == 0,
3894 	    ("uma_zone_set_zinit on non-empty keg"));
3895 	zone->uz_init = zinit;
3896 	ZONE_UNLOCK(zone);
3897 }
3898 
3899 /* See uma.h */
3900 void
3901 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3902 {
3903 
3904 	ZONE_LOCK(zone);
3905 	KASSERT(zone->uz_keg->uk_pages == 0,
3906 	    ("uma_zone_set_zfini on non-empty keg"));
3907 	zone->uz_fini = zfini;
3908 	ZONE_UNLOCK(zone);
3909 }
3910 
3911 /* See uma.h */
3912 /* XXX uk_freef is not actually used with the zone locked */
3913 void
3914 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3915 {
3916 	uma_keg_t keg;
3917 
3918 	KEG_GET(zone, keg);
3919 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3920 	KEG_LOCK(keg);
3921 	keg->uk_freef = freef;
3922 	KEG_UNLOCK(keg);
3923 }
3924 
3925 /* See uma.h */
3926 /* XXX uk_allocf is not actually used with the zone locked */
3927 void
3928 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3929 {
3930 	uma_keg_t keg;
3931 
3932 	KEG_GET(zone, keg);
3933 	KEG_LOCK(keg);
3934 	keg->uk_allocf = allocf;
3935 	KEG_UNLOCK(keg);
3936 }
3937 
3938 /* See uma.h */
3939 void
3940 uma_zone_reserve(uma_zone_t zone, int items)
3941 {
3942 	uma_keg_t keg;
3943 
3944 	KEG_GET(zone, keg);
3945 	KEG_LOCK(keg);
3946 	keg->uk_reserve = items;
3947 	KEG_UNLOCK(keg);
3948 }
3949 
3950 /* See uma.h */
3951 int
3952 uma_zone_reserve_kva(uma_zone_t zone, int count)
3953 {
3954 	uma_keg_t keg;
3955 	vm_offset_t kva;
3956 	u_int pages;
3957 
3958 	KEG_GET(zone, keg);
3959 
3960 	pages = count / keg->uk_ipers;
3961 	if (pages * keg->uk_ipers < count)
3962 		pages++;
3963 	pages *= keg->uk_ppera;
3964 
3965 #ifdef UMA_MD_SMALL_ALLOC
3966 	if (keg->uk_ppera > 1) {
3967 #else
3968 	if (1) {
3969 #endif
3970 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3971 		if (kva == 0)
3972 			return (0);
3973 	} else
3974 		kva = 0;
3975 
3976 	ZONE_LOCK(zone);
3977 	MPASS(keg->uk_kva == 0);
3978 	keg->uk_kva = kva;
3979 	keg->uk_offset = 0;
3980 	zone->uz_max_items = pages * keg->uk_ipers;
3981 #ifdef UMA_MD_SMALL_ALLOC
3982 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3983 #else
3984 	keg->uk_allocf = noobj_alloc;
3985 #endif
3986 	keg->uk_flags |= UMA_ZONE_NOFREE;
3987 	ZONE_UNLOCK(zone);
3988 
3989 	return (1);
3990 }
3991 
3992 /* See uma.h */
3993 void
3994 uma_prealloc(uma_zone_t zone, int items)
3995 {
3996 	struct vm_domainset_iter di;
3997 	uma_domain_t dom;
3998 	uma_slab_t slab;
3999 	uma_keg_t keg;
4000 	int aflags, domain, slabs;
4001 
4002 	KEG_GET(zone, keg);
4003 	KEG_LOCK(keg);
4004 	slabs = items / keg->uk_ipers;
4005 	if (slabs * keg->uk_ipers < items)
4006 		slabs++;
4007 	while (slabs-- > 0) {
4008 		aflags = M_NOWAIT;
4009 		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
4010 		    &aflags);
4011 		for (;;) {
4012 			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
4013 			    aflags);
4014 			if (slab != NULL) {
4015 				dom = &keg->uk_domain[slab->us_domain];
4016 				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
4017 				    us_link);
4018 				break;
4019 			}
4020 			KEG_LOCK(keg);
4021 			if (vm_domainset_iter_policy(&di, &domain) != 0) {
4022 				KEG_UNLOCK(keg);
4023 				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
4024 				KEG_LOCK(keg);
4025 			}
4026 		}
4027 	}
4028 	KEG_UNLOCK(keg);
4029 }
4030 
4031 /* See uma.h */
4032 void
4033 uma_reclaim(int req)
4034 {
4035 
4036 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
4037 	sx_xlock(&uma_reclaim_lock);
4038 	bucket_enable();
4039 
4040 	switch (req) {
4041 	case UMA_RECLAIM_TRIM:
4042 		zone_foreach(zone_trim, NULL);
4043 		break;
4044 	case UMA_RECLAIM_DRAIN:
4045 	case UMA_RECLAIM_DRAIN_CPU:
4046 		zone_foreach(zone_drain, NULL);
4047 		if (req == UMA_RECLAIM_DRAIN_CPU) {
4048 			pcpu_cache_drain_safe(NULL);
4049 			zone_foreach(zone_drain, NULL);
4050 		}
4051 		break;
4052 	default:
4053 		panic("unhandled reclamation request %d", req);
4054 	}
4055 
4056 	/*
4057 	 * Some slabs may have been freed, but this zone is visited early, so we
4058 	 * visit it again here to free pages that become empty only once the
4059 	 * other zones have been drained.  We have to do the same for buckets.
4060 	 */
4061 	zone_drain(slabzone, NULL);
4062 	bucket_zone_drain();
4063 	sx_xunlock(&uma_reclaim_lock);
4064 }
4065 
4066 static volatile int uma_reclaim_needed;
4067 
4068 void
4069 uma_reclaim_wakeup(void)
4070 {
4071 
4072 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
4073 		wakeup(uma_reclaim);
4074 }
4075 
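/*
 * Reclaim worker loop: sleep until uma_reclaim_wakeup() is called, run the
 * vm_lowmem eventhandlers, and perform a full UMA_RECLAIM_DRAIN_CPU pass,
 * at most once per second.
 */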
4076 void
4077 uma_reclaim_worker(void *arg __unused)
4078 {
4079 
4080 	for (;;) {
4081 		sx_xlock(&uma_reclaim_lock);
4082 		while (atomic_load_int(&uma_reclaim_needed) == 0)
4083 			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
4084 			    hz);
4085 		sx_xunlock(&uma_reclaim_lock);
4086 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
4087 		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
4088 		atomic_store_int(&uma_reclaim_needed, 0);
4089 		/* Don't fire more than once per second. */
4090 		pause("umarclslp", hz);
4091 	}
4092 }
4093 
4094 /* See uma.h */
4095 void
4096 uma_zone_reclaim(uma_zone_t zone, int req)
4097 {
4098 
4099 	switch (req) {
4100 	case UMA_RECLAIM_TRIM:
4101 		zone_trim(zone, NULL);
4102 		break;
4103 	case UMA_RECLAIM_DRAIN:
4104 		zone_drain(zone, NULL);
4105 		break;
4106 	case UMA_RECLAIM_DRAIN_CPU:
4107 		pcpu_cache_drain_safe(zone);
4108 		zone_drain(zone, NULL);
4109 		break;
4110 	default:
4111 		panic("unhandled reclamation request %d", req);
4112 	}
4113 }
4114 
4115 /* See uma.h */
4116 int
4117 uma_zone_exhausted(uma_zone_t zone)
4118 {
4119 	int full;
4120 
4121 	ZONE_LOCK(zone);
4122 	full = zone->uz_sleepers > 0;
4123 	ZONE_UNLOCK(zone);
4124 	return (full);
4125 }
4126 
4127 int
4128 uma_zone_exhausted_nolock(uma_zone_t zone)
4129 {
4130 	return (zone->uz_sleepers > 0);
4131 }
4132 
4133 static void
4134 uma_zero_item(void *item, uma_zone_t zone)
4135 {
4136 
4137 	bzero(item, zone->uz_size);
4138 }
4139 
4140 unsigned long
4141 uma_limit(void)
4142 {
4143 
4144 	return (uma_kmem_limit);
4145 }
4146 
4147 void
4148 uma_set_limit(unsigned long limit)
4149 {
4150 
4151 	uma_kmem_limit = limit;
4152 }
4153 
4154 unsigned long
4155 uma_size(void)
4156 {
4157 
4158 	return (atomic_load_long(&uma_kmem_total));
4159 }
4160 
4161 long
4162 uma_avail(void)
4163 {
4164 
4165 	return (uma_kmem_limit - uma_size());
4166 }
4167 
4168 #ifdef DDB
4169 /*
4170  * Generate statistics across both the zone and its per-cpu caches.  Return
4171  * a desired statistic if the corresponding pointer is non-NULL.
4172  *
4173  * Note: does not update the zone statistics, as it can't safely clear the
4174  * per-CPU cache statistic.
4175  *
4176  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
4177  * safe from off-CPU; we should modify the caches to track this information
4178  * directly so that we don't have to.
4179  */
4180 static void
4181 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
4182     uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
4183 {
4184 	uma_cache_t cache;
4185 	uint64_t allocs, frees, sleeps, xdomain;
4186 	int cachefree, cpu;
4187 
4188 	allocs = frees = sleeps = xdomain = 0;
4189 	cachefree = 0;
4190 	CPU_FOREACH(cpu) {
4191 		cache = &z->uz_cpu[cpu];
4192 		if (cache->uc_allocbucket != NULL)
4193 			cachefree += cache->uc_allocbucket->ub_cnt;
4194 		if (cache->uc_freebucket != NULL)
4195 			cachefree += cache->uc_freebucket->ub_cnt;
4196 		if (cache->uc_crossbucket != NULL) {
4197 			xdomain += cache->uc_crossbucket->ub_cnt;
4198 			cachefree += cache->uc_crossbucket->ub_cnt;
4199 		}
4200 		allocs += cache->uc_allocs;
4201 		frees += cache->uc_frees;
4202 	}
4203 	allocs += counter_u64_fetch(z->uz_allocs);
4204 	frees += counter_u64_fetch(z->uz_frees);
4205 	sleeps += z->uz_sleeps;
4206 	xdomain += z->uz_xdomain;
4207 	if (cachefreep != NULL)
4208 		*cachefreep = cachefree;
4209 	if (allocsp != NULL)
4210 		*allocsp = allocs;
4211 	if (freesp != NULL)
4212 		*freesp = frees;
4213 	if (sleepsp != NULL)
4214 		*sleepsp = sleeps;
4215 	if (xdomainp != NULL)
4216 		*xdomainp = xdomain;
4217 }
4218 #endif /* DDB */
4219 
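/*
 * Sysctl handler reporting the total number of UMA zones, counting both
 * keg-backed zones and cache zones.
 */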
4220 static int
4221 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4222 {
4223 	uma_keg_t kz;
4224 	uma_zone_t z;
4225 	int count;
4226 
4227 	count = 0;
4228 	rw_rlock(&uma_rwlock);
4229 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4230 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4231 			count++;
4232 	}
4233 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4234 		count++;
4235 
4236 	rw_runlock(&uma_rwlock);
4237 	return (sysctl_handle_int(oidp, &count, 0, req));
4238 }
4239 
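/*
 * Fill in the type header and the per-cpu statistics records for a single
 * zone, for export by sysctl_vm_zone_stats() below.
 */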
4240 static void
4241 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
4242     struct uma_percpu_stat *ups, bool internal)
4243 {
4244 	uma_zone_domain_t zdom;
4245 	uma_bucket_t bucket;
4246 	uma_cache_t cache;
4247 	int i;
4248 
4250 	for (i = 0; i < vm_ndomains; i++) {
4251 		zdom = &z->uz_domain[i];
4252 		uth->uth_zone_free += zdom->uzd_nitems;
4253 	}
4254 	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
4255 	uth->uth_frees = counter_u64_fetch(z->uz_frees);
4256 	uth->uth_fails = counter_u64_fetch(z->uz_fails);
4257 	uth->uth_sleeps = z->uz_sleeps;
4258 	uth->uth_xdomain = z->uz_xdomain;
4259 
4260 	/*
4261 	 * While it is not normally safe to access the cache bucket pointers
4262 	 * while not on the CPU that owns the cache, we only allow the pointers
4263 	 * to be exchanged without the zone lock held, not invalidated, so
4264 	 * accept the possible race associated with bucket exchange during
4265 	 * monitoring.  Use atomic_load_ptr() to ensure that the bucket pointers
4266 	 * are loaded only once.
4267 	 */
4268 	for (i = 0; i < mp_maxid + 1; i++) {
4269 		bzero(&ups[i], sizeof(*ups));
4270 		if (internal || CPU_ABSENT(i))
4271 			continue;
4272 		cache = &z->uz_cpu[i];
4273 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket);
4274 		if (bucket != NULL)
4275 			ups[i].ups_cache_free += bucket->ub_cnt;
4276 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket);
4277 		if (bucket != NULL)
4278 			ups[i].ups_cache_free += bucket->ub_cnt;
4279 		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket);
4280 		if (bucket != NULL)
4281 			ups[i].ups_cache_free += bucket->ub_cnt;
4282 		ups[i].ups_allocs = cache->uc_allocs;
4283 		ups[i].ups_frees = cache->uc_frees;
4284 	}
4285 }
4286 
4287 static int
4288 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4289 {
4290 	struct uma_stream_header ush;
4291 	struct uma_type_header uth;
4292 	struct uma_percpu_stat *ups;
4293 	struct sbuf sbuf;
4294 	uma_keg_t kz;
4295 	uma_zone_t z;
4296 	int count, error, i;
4297 
4298 	error = sysctl_wire_old_buffer(req, 0);
4299 	if (error != 0)
4300 		return (error);
4301 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4302 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4303 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4304 
4305 	count = 0;
4306 	rw_rlock(&uma_rwlock);
4307 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4308 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4309 			count++;
4310 	}
4311 
4312 	LIST_FOREACH(z, &uma_cachezones, uz_link)
4313 		count++;
4314 
4315 	/*
4316 	 * Insert stream header.
4317 	 */
4318 	bzero(&ush, sizeof(ush));
4319 	ush.ush_version = UMA_STREAM_VERSION;
4320 	ush.ush_maxcpus = (mp_maxid + 1);
4321 	ush.ush_count = count;
4322 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4323 
4324 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4325 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4326 			bzero(&uth, sizeof(uth));
4327 			ZONE_LOCK(z);
4328 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4329 			uth.uth_align = kz->uk_align;
4330 			uth.uth_size = kz->uk_size;
4331 			uth.uth_rsize = kz->uk_rsize;
4332 			if (z->uz_max_items > 0)
4333 				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
4334 					kz->uk_ppera;
4335 			else
4336 				uth.uth_pages = kz->uk_pages;
4337 			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
4338 			    kz->uk_ppera;
4339 			uth.uth_limit = z->uz_max_items;
4340 			uth.uth_keg_free = z->uz_keg->uk_free;
4341 
4342 			/*
4343 			 * A zone is secondary if it is not the first entry
4344 			 * on the keg's zone list.
4345 			 */
4346 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4347 			    (LIST_FIRST(&kz->uk_zones) != z))
4348 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4349 			uma_vm_zone_stats(&uth, z, &sbuf, ups,
4350 			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
4351 			ZONE_UNLOCK(z);
4352 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4353 			for (i = 0; i < mp_maxid + 1; i++)
4354 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4355 		}
4356 	}
4357 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4358 		bzero(&uth, sizeof(uth));
4359 		ZONE_LOCK(z);
4360 		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4361 		uth.uth_size = z->uz_size;
4362 		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
4363 		ZONE_UNLOCK(z);
4364 		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4365 		for (i = 0; i < mp_maxid + 1; i++)
4366 			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4367 	}
4368 
4369 	rw_runlock(&uma_rwlock);
4370 	error = sbuf_finish(&sbuf);
4371 	sbuf_delete(&sbuf);
4372 	free(ups, M_TEMP);
4373 	return (error);
4374 }
4375 
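/*
 * Read/write sysctl handler for a zone's item limit; arg1 must point to a
 * uma_zone_t.  An illustrative (hypothetical) hookup might look like:
 *
 *	SYSCTL_PROC(_vm, OID_AUTO, foo_zone_max,
 *	    CTLTYPE_INT | CTLFLAG_RW, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum foo zone items");
 *
 * where "foo_zone" is a zone created by the caller.
 */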
4376 int
4377 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4378 {
4379 	uma_zone_t zone = *(uma_zone_t *)arg1;
4380 	int error, max;
4381 
4382 	max = uma_zone_get_max(zone);
4383 	error = sysctl_handle_int(oidp, &max, 0, req);
4384 	if (error || !req->newptr)
4385 		return (error);
4386 
4387 	uma_zone_set_max(zone, max);
4388 
4389 	return (0);
4390 }
4391 
4392 int
4393 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4394 {
4395 	uma_zone_t zone;
4396 	int cur;
4397 
4398 	/*
4399 	 * Some callers want to add sysctls for global zones that
4400 	 * may not yet exist, so they pass a pointer to a pointer.
4401 	 */
4402 	if (arg2 == 0)
4403 		zone = *(uma_zone_t *)arg1;
4404 	else
4405 		zone = arg1;
4406 	cur = uma_zone_get_cur(zone);
4407 	return (sysctl_handle_int(oidp, &cur, 0, req));
4408 }
4409 
4410 static int
4411 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
4412 {
4413 	uma_zone_t zone = arg1;
4414 	uint64_t cur;
4415 
4416 	cur = uma_zone_get_allocs(zone);
4417 	return (sysctl_handle_64(oidp, &cur, 0, req));
4418 }
4419 
4420 static int
4421 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
4422 {
4423 	uma_zone_t zone = arg1;
4424 	uint64_t cur;
4425 
4426 	cur = uma_zone_get_frees(zone);
4427 	return (sysctl_handle_64(oidp, &cur, 0, req));
4428 }
4429 
4430 static int
4431 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS)
4432 {
4433 	struct sbuf sbuf;
4434 	uma_zone_t zone = arg1;
4435 	int error;
4436 
4437 	sbuf_new_for_sysctl(&sbuf, NULL, 0, req);
4438 	if (zone->uz_flags != 0)
4439 		sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS);
4440 	else
4441 		sbuf_printf(&sbuf, "0");
4442 	error = sbuf_finish(&sbuf);
4443 	sbuf_delete(&sbuf);
4444 
4445 	return (error);
4446 }
4447 
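/*
 * Report slab space efficiency: the percentage of each allocated slab (and
 * any off-page slab header) that is usable for client items at the client's
 * requested size and alignment.
 */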
4448 static int
4449 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS)
4450 {
4451 	uma_keg_t keg = arg1;
4452 	int avail, effpct, total;
4453 
4454 	total = keg->uk_ppera * PAGE_SIZE;
4455 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) != 0)
4456 		total += slab_sizeof(SLAB_MAX_SETSIZE);
4457 	/*
4458 	 * We consider the client's requested size and alignment here, not the
4459 	 * real size determination, uk_rsize, because we also adjust the real
4460 	 * size for internal implementation reasons (max bitset size).
4461 	 */
4462 	avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1);
4463 	if ((keg->uk_flags & UMA_ZONE_PCPU) != 0)
4464 		avail *= mp_maxid + 1;
4465 	effpct = 100 * avail / total;
4466 	return (sysctl_handle_int(oidp, &effpct, 0, req));
4467 }
4468 
4469 #ifdef INVARIANTS
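/*
 * Look up the slab backing an arbitrary item for the debugging checks
 * below.  Returns NULL for cache zones, which have no keg to search.
 */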
4470 static uma_slab_t
4471 uma_dbg_getslab(uma_zone_t zone, void *item)
4472 {
4473 	uma_slab_t slab;
4474 	uma_keg_t keg;
4475 	uint8_t *mem;
4476 
4477 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4478 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4479 		slab = vtoslab((vm_offset_t)mem);
4480 	} else {
4481 		/*
4482 		 * It is safe to return the slab here even though the
4483 		 * zone is unlocked because the item's allocation state
4484 		 * essentially holds a reference.
4485 		 */
4486 		if (zone->uz_lockptr == &zone->uz_lock)
4487 			return (NULL);
4488 		ZONE_LOCK(zone);
4489 		keg = zone->uz_keg;
4490 		if (keg->uk_flags & UMA_ZONE_HASH)
4491 			slab = hash_sfind(&keg->uk_hash, mem);
4492 		else
4493 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4494 		ZONE_UNLOCK(zone);
4495 	}
4496 
4497 	return (slab);
4498 }
4499 
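/*
 * Decide whether to skip the expensive per-item debugging checks.  With
 * dbg_divisor greater than one, only every dbg_divisor'th item (by page and
 * slab index) is checked; the uma_dbg_cnt and uma_skip_cnt counters track
 * how many items were checked or skipped.
 */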
4500 static bool
4501 uma_dbg_zskip(uma_zone_t zone, void *mem)
4502 {
4503 
4504 	if (zone->uz_lockptr == &zone->uz_lock)
4505 		return (true);
4506 
4507 	return (uma_dbg_kskip(zone->uz_keg, mem));
4508 }
4509 
4510 static bool
4511 uma_dbg_kskip(uma_keg_t keg, void *mem)
4512 {
4513 	uintptr_t idx;
4514 
4515 	if (dbg_divisor == 0)
4516 		return (true);
4517 
4518 	if (dbg_divisor == 1)
4519 		return (false);
4520 
4521 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4522 	if (keg->uk_ipers > 1) {
4523 		idx *= keg->uk_ipers;
4524 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4525 	}
4526 
4527 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4528 		counter_u64_add(uma_skip_cnt, 1);
4529 		return (true);
4530 	}
4531 	counter_u64_add(uma_dbg_cnt, 1);
4532 
4533 	return (false);
4534 }
4535 
4536 /*
4537  * Set up the slab's freei data such that uma_dbg_free can function.
4538  *
4539  */
4540 static void
4541 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4542 {
4543 	uma_keg_t keg;
4544 	int freei;
4545 
4546 	if (slab == NULL) {
4547 		slab = uma_dbg_getslab(zone, item);
4548 		if (slab == NULL)
4549 			panic("uma: item %p did not belong to zone %s\n",
4550 			    item, zone->uz_name);
4551 	}
4552 	keg = zone->uz_keg;
4553 	freei = slab_item_index(slab, keg, item);
4554 
4555 	if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
4556 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4557 		    item, zone, zone->uz_name, slab, freei);
4558 	BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
4559 }
4560 
4561 /*
4562  * Verifies freed addresses.  Checks for alignment, valid slab membership
4563  * and duplicate frees.
4564  *
4565  */
4566 static void
4567 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4568 {
4569 	uma_keg_t keg;
4570 	int freei;
4571 
4572 	if (slab == NULL) {
4573 		slab = uma_dbg_getslab(zone, item);
4574 		if (slab == NULL)
4575 			panic("uma: Freed item %p did not belong to zone %s\n",
4576 			    item, zone->uz_name);
4577 	}
4578 	keg = zone->uz_keg;
4579 	freei = slab_item_index(slab, keg, item);
4580 
4581 	if (freei >= keg->uk_ipers)
4582 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4583 		    item, zone, zone->uz_name, slab, freei);
4584 
4585 	if (slab_item(slab, keg, freei) != item)
4586 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4587 		    item, zone, zone->uz_name, slab, freei);
4588 
4589 	if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)))
4590 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4591 		    item, zone, zone->uz_name, slab, freei);
4592 
4593 	BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg));
4594 }
4595 #endif /* INVARIANTS */
4596 
4597 #ifdef DDB
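/*
 * Collect summary statistics for a single zone for the DDB "show uma"
 * command and return the total memory footprint of the zone in bytes.
 */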
4598 static int64_t
4599 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
4600     uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
4601 {
4602 	uint64_t frees;
4603 	int i;
4604 
4605 	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4606 		*allocs = counter_u64_fetch(z->uz_allocs);
4607 		frees = counter_u64_fetch(z->uz_frees);
4608 		*sleeps = z->uz_sleeps;
4609 		*cachefree = 0;
4610 		*xdomain = 0;
4611 	} else
4612 		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
4613 		    xdomain);
4614 	if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4615 	    (LIST_FIRST(&kz->uk_zones) != z)))
4616 		*cachefree += kz->uk_free;
4617 	for (i = 0; i < vm_ndomains; i++)
4618 		*cachefree += z->uz_domain[i].uzd_nitems;
4619 	*used = *allocs - frees;
4620 	return (((int64_t)*used + *cachefree) * kz->uk_size);
4621 }
4622 
4623 DB_SHOW_COMMAND(uma, db_show_uma)
4624 {
4625 	const char *fmt_hdr, *fmt_entry;
4626 	uma_keg_t kz;
4627 	uma_zone_t z;
4628 	uint64_t allocs, used, sleeps, xdomain;
4629 	long cachefree;
4630 	/* variables for sorting */
4631 	uma_keg_t cur_keg;
4632 	uma_zone_t cur_zone, last_zone;
4633 	int64_t cur_size, last_size, size;
4634 	int ties;
4635 
4636 	/* /i option produces machine-parseable CSV output */
4637 	if (modif[0] == 'i') {
4638 		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
4639 		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
4640 	} else {
4641 		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
4642 		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
4643 	}
4644 
4645 	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
4646 	    "Sleeps", "Bucket", "Total Mem", "XFree");
4647 
4648 	/* Sort the zones with largest size first. */
4649 	last_zone = NULL;
4650 	last_size = INT64_MAX;
4651 	for (;;) {
4652 		cur_zone = NULL;
4653 		cur_size = -1;
4654 		ties = 0;
4655 		LIST_FOREACH(kz, &uma_kegs, uk_link) {
4656 			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4657 				/*
4658 				 * In the case of size ties, print out zones
4659 				 * in the order they are encountered.  That is,
4660 				 * when we encounter the most recently output
4661 				 * zone, we have already printed all preceding
4662 				 * ties, and we must print all following ties.
4663 				 */
4664 				if (z == last_zone) {
4665 					ties = 1;
4666 					continue;
4667 				}
4668 				size = get_uma_stats(kz, z, &allocs, &used,
4669 				    &sleeps, &cachefree, &xdomain);
4670 				if (size > cur_size && size < last_size + ties)
4671 				{
4672 					cur_size = size;
4673 					cur_zone = z;
4674 					cur_keg = kz;
4675 				}
4676 			}
4677 		}
4678 		if (cur_zone == NULL)
4679 			break;
4680 
4681 		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
4682 		    &sleeps, &cachefree, &xdomain);
4683 		db_printf(fmt_entry, cur_zone->uz_name,
4684 		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
4685 		    (uintmax_t)allocs, (uintmax_t)sleeps,
4686 		    (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
4687 		    xdomain);
4688 
4689 		if (db_pager_quit)
4690 			return;
4691 		last_zone = cur_zone;
4692 		last_size = cur_size;
4693 	}
4694 }
4695 
4696 DB_SHOW_COMMAND(umacache, db_show_umacache)
4697 {
4698 	uma_zone_t z;
4699 	uint64_t allocs, frees;
4700 	long cachefree;
4701 	int i;
4702 
4703 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4704 	    "Requests", "Bucket");
4705 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4706 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
4707 		for (i = 0; i < vm_ndomains; i++)
4708 			cachefree += z->uz_domain[i].uzd_nitems;
4709 		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4710 		    z->uz_name, (uintmax_t)z->uz_size,
4711 		    (intmax_t)(allocs - frees), cachefree,
4712 		    (uintmax_t)allocs, z->uz_bucket_size);
4713 		if (db_pager_quit)
4714 			return;
4715 	}
4716 }
4717 #endif	/* DDB */
4718