xref: /freebsd/sys/vm/uma_core.c (revision 730cecb05aaf016ac52ef7cfc691ccec3a0408cd)
1 /*-
2  * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff@FreeBSD.org>
3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
4  * Copyright (c) 2004-2006 Robert N. M. Watson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * uma_core.c  Implementation of the Universal Memory allocator
31  *
32  * This allocator is intended to replace the multitude of similar object caches
33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
34  * efficient.  A primary design goal is to return unused memory to the rest of
35  * the system.  This will make the system as a whole more flexible due to the
36  * ability to move memory to subsystems which most need it instead of leaving
37  * pools of reserved memory unused.
38  *
39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
40  * are well known.
41  *
42  */
43 
44 /*
45  * TODO:
46  *	- Improve memory usage for large allocations
47  *	- Investigate cache size adjustments
48  */
49 
50 #include <sys/cdefs.h>
51 __FBSDID("$FreeBSD$");
52 
53 /* I should really use ktr.. */
54 /*
55 #define UMA_DEBUG 1
56 #define UMA_DEBUG_ALLOC 1
57 #define UMA_DEBUG_ALLOC_1 1
58 */
59 
60 #include "opt_ddb.h"
61 #include "opt_param.h"
62 #include "opt_vm.h"
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/kernel.h>
67 #include <sys/types.h>
68 #include <sys/queue.h>
69 #include <sys/malloc.h>
70 #include <sys/ktr.h>
71 #include <sys/lock.h>
72 #include <sys/sysctl.h>
73 #include <sys/mutex.h>
74 #include <sys/proc.h>
75 #include <sys/rwlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/smp.h>
78 #include <sys/vmmeter.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_object.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pageout.h>
84 #include <vm/vm_param.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/uma.h>
89 #include <vm/uma_int.h>
90 #include <vm/uma_dbg.h>
91 
92 #include <ddb/ddb.h>
93 
94 #ifdef DEBUG_MEMGUARD
95 #include <vm/memguard.h>
96 #endif
97 
98 /*
99  * This is the zone and keg from which all zones are spawned.  The idea is that
100  * even the zone & keg heads are allocated from the allocator, so we use the
101  * bss section to bootstrap us.
102  */
103 static struct uma_keg masterkeg;
104 static struct uma_zone masterzone_k;
105 static struct uma_zone masterzone_z;
106 static uma_zone_t kegs = &masterzone_k;
107 static uma_zone_t zones = &masterzone_z;
108 
109 /* This is the zone from which all of uma_slab_t's are allocated. */
110 static uma_zone_t slabzone;
111 static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
112 
113 /*
114  * The initial hash tables come out of this zone so they can be allocated
115  * prior to malloc coming up.
116  */
117 static uma_zone_t hashzone;
118 
119 /* The boot-time adjusted value for cache line alignment. */
120 int uma_align_cache = 64 - 1;
121 
122 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
123 
124 /*
125  * Are we allowed to allocate buckets?
126  */
127 static int bucketdisable = 1;
128 
129 /* Linked list of all kegs in the system */
130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
131 
132 /* This mutex protects the keg list */
133 static struct mtx uma_mtx;
134 
135 /* Linked list of boot time pages */
136 static LIST_HEAD(,uma_slab) uma_boot_pages =
137     LIST_HEAD_INITIALIZER(uma_boot_pages);
138 
139 /* This mutex protects the boot time pages list */
140 static struct mtx uma_boot_pages_mtx;
141 
142 /* Is the VM done starting up? */
143 static int booted = 0;
144 #define	UMA_STARTUP	1
145 #define	UMA_STARTUP2	2
146 
147 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
148 static u_int uma_max_ipers;
149 static u_int uma_max_ipers_ref;
150 
151 /*
152  * This is the handle used to schedule events that need to happen
153  * outside of the allocation fast path.
154  */
155 static struct callout uma_callout;
156 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
157 
158 /*
159  * This structure is passed as the zone ctor arg so that I don't have to create
160  * a special allocation function just for zones.
161  */
162 struct uma_zctor_args {
163 	const char *name;
164 	size_t size;
165 	uma_ctor ctor;
166 	uma_dtor dtor;
167 	uma_init uminit;
168 	uma_fini fini;
169 	uma_keg_t keg;
170 	int align;
171 	u_int32_t flags;
172 };
173 
174 struct uma_kctor_args {
175 	uma_zone_t zone;
176 	size_t size;
177 	uma_init uminit;
178 	uma_fini fini;
179 	int align;
180 	u_int32_t flags;
181 };
182 
183 struct uma_bucket_zone {
184 	uma_zone_t	ubz_zone;
185 	char		*ubz_name;
186 	int		ubz_entries;
187 };
188 
189 #define	BUCKET_MAX	128
190 
191 struct uma_bucket_zone bucket_zones[] = {
192 	{ NULL, "16 Bucket", 16 },
193 	{ NULL, "32 Bucket", 32 },
194 	{ NULL, "64 Bucket", 64 },
195 	{ NULL, "128 Bucket", 128 },
196 	{ NULL, NULL, 0}
197 };
198 
199 #define	BUCKET_SHIFT	4
200 #define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
201 
202 /*
203  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
204  * of approximately the right size.
205  */
206 static uint8_t bucket_size[BUCKET_ZONES];
207 
208 /*
209  * Flags and enumerations to be passed to internal functions.
210  */
211 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
212 
213 #define	ZFREE_STATFAIL	0x00000001	/* Update zone failure statistic. */
214 #define	ZFREE_STATFREE	0x00000002	/* Update zone free statistic. */
215 
216 /* Prototypes.. */
217 
218 static void *noobj_alloc(uma_zone_t, int, u_int8_t *, int);
219 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
220 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
221 static void page_free(void *, int, u_int8_t);
222 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
223 static void cache_drain(uma_zone_t);
224 static void bucket_drain(uma_zone_t, uma_bucket_t);
225 static void bucket_cache_drain(uma_zone_t zone);
226 static int keg_ctor(void *, int, void *, int);
227 static void keg_dtor(void *, int, void *);
228 static int zone_ctor(void *, int, void *, int);
229 static void zone_dtor(void *, int, void *);
230 static int zero_init(void *, int, int);
231 static void keg_small_init(uma_keg_t keg);
232 static void keg_large_init(uma_keg_t keg);
233 static void zone_foreach(void (*zfunc)(uma_zone_t));
234 static void zone_timeout(uma_zone_t zone);
235 static int hash_alloc(struct uma_hash *);
236 static int hash_expand(struct uma_hash *, struct uma_hash *);
237 static void hash_free(struct uma_hash *hash);
238 static void uma_timeout(void *);
239 static void uma_startup3(void);
240 static void *zone_alloc_item(uma_zone_t, void *, int);
241 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
242     int);
243 static void bucket_enable(void);
244 static void bucket_init(void);
245 static uma_bucket_t bucket_alloc(int, int);
246 static void bucket_free(uma_bucket_t);
247 static void bucket_zone_drain(void);
248 static int zone_alloc_bucket(uma_zone_t zone, int flags);
249 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
250 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
251 static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
252 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
253     uma_fini fini, int align, u_int32_t flags);
254 static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
255 static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
256 
257 void uma_print_zone(uma_zone_t);
258 void uma_print_stats(void);
259 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
260 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
261 
262 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
263 
264 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
265     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
266 
267 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
268     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
269 
270 static int zone_warnings = 1;
271 TUNABLE_INT("vm.zone_warnings", &zone_warnings);
272 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
273     "Warn when UMA zones becomes full");
274 
275 /*
276  * This routine checks to see whether or not it's safe to enable buckets.
277  */
278 
279 static void
280 bucket_enable(void)
281 {
282 	bucketdisable = vm_page_count_min();
283 }
284 
285 /*
286  * Initialize bucket_zones, the array of zones of buckets of various sizes.
287  *
288  * For each zone, calculate the memory required for each bucket, consisting
289  * of the header and an array of pointers.  Initialize bucket_size[] so that
290  * each request in the zone's range of bucket sizes maps to that zone.
291  */
292 static void
293 bucket_init(void)
294 {
295 	struct uma_bucket_zone *ubz;
296 	int i;
297 	int j;
298 
299 	for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
300 		int size;
301 
302 		ubz = &bucket_zones[j];
303 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
304 		size += sizeof(void *) * ubz->ubz_entries;
305 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
306 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
307 		    UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
308 		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
309 			bucket_size[i >> BUCKET_SHIFT] = j;
310 	}
311 }
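
/*
 * Worked example (illustrative only, assuming 8-byte pointers and the stock
 * BUCKET_SHIFT of 4): for the "32 Bucket" zone above, bucket_init() sizes the
 * item as
 *
 *	size  = roundup(sizeof(struct uma_bucket), sizeof(void *))
 *	      + sizeof(void *) * 32
 *
 * i.e. the bucket header plus room for 32 item pointers (256 bytes of pointer
 * storage).  It then stamps bucket_size[i >> 4] = j for each 16-entry step i
 * between the previous zone's limit and 32, so requests for 17..32 entries
 * later resolve to this zone.
 */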
312 
313 /*
314  * Given a desired number of entries for a bucket, return the zone from which
315  * to allocate the bucket.
316  */
317 static struct uma_bucket_zone *
318 bucket_zone_lookup(int entries)
319 {
320 	int idx;
321 
322 	idx = howmany(entries, 1 << BUCKET_SHIFT);
323 	return (&bucket_zones[bucket_size[idx]]);
324 }
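
/*
 * Illustrative sketch of the lookup arithmetic (stock BUCKET_SHIFT of 4
 * assumed): a caller asking for a 20-entry bucket gets
 *
 *	idx = howmany(20, 1 << BUCKET_SHIFT);    -> howmany(20, 16) == 2
 *	return (&bucket_zones[bucket_size[2]]);  -> the "32 Bucket" zone
 *
 * since bucket_init() recorded index 1 ("32 Bucket") in bucket_size[2].
 */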
325 
326 static uma_bucket_t
327 bucket_alloc(int entries, int bflags)
328 {
329 	struct uma_bucket_zone *ubz;
330 	uma_bucket_t bucket;
331 
332 	/*
333 	 * This is to stop us from allocating per cpu buckets while we're
334 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
335 	 * boot pages.  This also prevents us from allocating buckets in
336 	 * low memory situations.
337 	 */
338 	if (bucketdisable)
339 		return (NULL);
340 
341 	ubz = bucket_zone_lookup(entries);
342 	bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
343 	if (bucket) {
344 #ifdef INVARIANTS
345 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
346 #endif
347 		bucket->ub_cnt = 0;
348 		bucket->ub_entries = ubz->ubz_entries;
349 	}
350 
351 	return (bucket);
352 }
353 
354 static void
355 bucket_free(uma_bucket_t bucket)
356 {
357 	struct uma_bucket_zone *ubz;
358 
359 	ubz = bucket_zone_lookup(bucket->ub_entries);
360 	zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
361 	    ZFREE_STATFREE);
362 }
363 
364 static void
365 bucket_zone_drain(void)
366 {
367 	struct uma_bucket_zone *ubz;
368 
369 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
370 		zone_drain(ubz->ubz_zone);
371 }
372 
373 static void
374 zone_log_warning(uma_zone_t zone)
375 {
376 	static const struct timeval warninterval = { 300, 0 };
377 
378 	if (!zone_warnings || zone->uz_warning == NULL)
379 		return;
380 
381 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
382 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
383 }
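
/*
 * A consumer arranges for these messages by attaching a warning string to its
 * zone; a minimal sketch (assuming the uma_zone_set_warning() setter declared
 * in uma.h, and a hypothetical cluster zone):
 *
 *	uma_zone_set_warning(zone_clust,
 *	    "kern.ipc.nmbclusters limit reached");
 *
 * The ratecheck() above then throttles the printf to at most one message per
 * zone every 300 seconds.
 */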
384 
385 static inline uma_keg_t
386 zone_first_keg(uma_zone_t zone)
387 {
388 
389 	return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
390 }
391 
392 static void
393 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
394 {
395 	uma_klink_t klink;
396 
397 	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
398 		kegfn(klink->kl_keg);
399 }
400 
401 /*
402  * Routine called by the callout to fire off some time interval based
403  * calculations.  (stats, hash size, etc.)
404  *
405  * Arguments:
406  *	arg   Unused
407  *
408  * Returns:
409  *	Nothing
410  */
411 static void
412 uma_timeout(void *unused)
413 {
414 	bucket_enable();
415 	zone_foreach(zone_timeout);
416 
417 	/* Reschedule this event */
418 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
419 }
420 
421 /*
422  * Routine to perform timeout driven calculations.  This expands the
423  * keg hash tables when they grow too small for the number of slabs.
424  *
425  *  Returns nothing.
426  */
427 static void
428 keg_timeout(uma_keg_t keg)
429 {
430 
431 	KEG_LOCK(keg);
432 	/*
433 	 * Expand the keg hash table.
434 	 *
435 	 * This is done if the number of slabs is larger than the hash size.
436 	 * What I'm trying to do here is completely reduce collisions.  This
437 	 * may be a little aggressive.  Should I allow for two collisions max?
438 	 */
439 	if (keg->uk_flags & UMA_ZONE_HASH &&
440 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
441 		struct uma_hash newhash;
442 		struct uma_hash oldhash;
443 		int ret;
444 
445 		/*
446 		 * This is so involved because allocating and freeing
447 		 * while the keg lock is held will lead to deadlock.
448 		 * I have to do everything in stages and check for
449 		 * races.
450 		 */
451 		newhash = keg->uk_hash;
452 		KEG_UNLOCK(keg);
453 		ret = hash_alloc(&newhash);
454 		KEG_LOCK(keg);
455 		if (ret) {
456 			if (hash_expand(&keg->uk_hash, &newhash)) {
457 				oldhash = keg->uk_hash;
458 				keg->uk_hash = newhash;
459 			} else
460 				oldhash = newhash;
461 
462 			KEG_UNLOCK(keg);
463 			hash_free(&oldhash);
464 			KEG_LOCK(keg);
465 		}
466 	}
467 	KEG_UNLOCK(keg);
468 }
469 
470 static void
471 zone_timeout(uma_zone_t zone)
472 {
473 
474 	zone_foreach_keg(zone, &keg_timeout);
475 }
476 
477 /*
478  * Allocate and zero fill the next sized hash table from the appropriate
479  * backing store.
480  *
481  * Arguments:
482  *	hash  A new hash structure with the old hash size in uh_hashsize
483  *
484  * Returns:
485  *	1 on success and 0 on failure.
486  */
487 static int
488 hash_alloc(struct uma_hash *hash)
489 {
490 	int oldsize;
491 	int alloc;
492 
493 	oldsize = hash->uh_hashsize;
494 
495 	/* We're just going to go to a power of two greater */
496 	if (oldsize)  {
497 		hash->uh_hashsize = oldsize * 2;
498 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
499 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
500 		    M_UMAHASH, M_NOWAIT);
501 	} else {
502 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
503 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
504 		    M_WAITOK);
505 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
506 	}
507 	if (hash->uh_slab_hash) {
508 		bzero(hash->uh_slab_hash, alloc);
509 		hash->uh_hashmask = hash->uh_hashsize - 1;
510 		return (1);
511 	}
512 
513 	return (0);
514 }
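
/*
 * Growth pattern, for illustration: the first table comes from hashzone with
 * UMA_HASH_SIZE_INIT entries; every later call doubles the previous size and
 * switches to malloc(9), so the allocation is
 *
 *	alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize
 *
 * e.g. UMA_HASH_SIZE_INIT -> 2x -> 4x -> ..., with uh_hashmask kept at
 * uh_hashsize - 1 so UMA_HASH() can mask rather than divide.
 */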
515 
516 /*
517  * Expands the hash table for HASH zones.  This is done from zone_timeout
518  * to reduce collisions.  This must not be done in the regular allocation
519  * path, otherwise, we can recurse on the vm while allocating pages.
520  *
521  * Arguments:
522  *	oldhash  The hash you want to expand
523  *	newhash  The hash structure for the new table
524  *
525  * Returns:
526  *	1 if the table was expanded, 0 otherwise.
529  */
530 static int
531 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
532 {
533 	uma_slab_t slab;
534 	int hval;
535 	int i;
536 
537 	if (!newhash->uh_slab_hash)
538 		return (0);
539 
540 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
541 		return (0);
542 
543 	/*
544 	 * I need to investigate hash algorithms for resizing without a
545 	 * full rehash.
546 	 */
547 
548 	for (i = 0; i < oldhash->uh_hashsize; i++)
549 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
550 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
551 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
552 			hval = UMA_HASH(newhash, slab->us_data);
553 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
554 			    slab, us_hlink);
555 		}
556 
557 	return (1);
558 }
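
/*
 * Sketch of a single rehash step above (illustrative): each slab is keyed by
 * its data address, so after doubling the table a slab simply lands on
 *
 *	hval = UMA_HASH(newhash, slab->us_data);
 *
 * which masks the page-shifted slab address with the new, larger uh_hashmask
 * (see uma_int.h).  No per-item state changes, only the chain it hangs from.
 */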
559 
560 /*
561  * Free the hash bucket to the appropriate backing store.
562  *
563  * Arguments:
564  *	hash  The hash structure whose table we're freeing; uh_hashsize
565  *	      selects which backing store it originally came from
566  *
567  * Returns:
568  *	Nothing
569  */
570 static void
571 hash_free(struct uma_hash *hash)
572 {
573 	if (hash->uh_slab_hash == NULL)
574 		return;
575 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
576 		zone_free_item(hashzone,
577 		    hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
578 	else
579 		free(hash->uh_slab_hash, M_UMAHASH);
580 }
581 
582 /*
583  * Frees all outstanding items in a bucket
584  *
585  * Arguments:
586  *	zone   The zone to free to, must be unlocked.
587  *	bucket The free/alloc bucket with items, cpu queue must be locked.
588  *
589  * Returns:
590  *	Nothing
591  */
592 
593 static void
594 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
595 {
596 	void *item;
597 
598 	if (bucket == NULL)
599 		return;
600 
601 	while (bucket->ub_cnt > 0)  {
602 		bucket->ub_cnt--;
603 		item = bucket->ub_bucket[bucket->ub_cnt];
604 #ifdef INVARIANTS
605 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
606 		KASSERT(item != NULL,
607 		    ("bucket_drain: botched ptr, item is NULL"));
608 #endif
609 		zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
610 	}
611 }
612 
613 /*
614  * Drains the per cpu caches for a zone.
615  *
616  * NOTE: This may only be called while the zone is being torn down, and not
617  * during normal operation.  This is necessary in order that we do not have
618  * to migrate CPUs to drain the per-CPU caches.
619  *
620  * Arguments:
621  *	zone     The zone to drain, must be unlocked.
622  *
623  * Returns:
624  *	Nothing
625  */
626 static void
627 cache_drain(uma_zone_t zone)
628 {
629 	uma_cache_t cache;
630 	int cpu;
631 
632 	/*
633 	 * XXX: It is safe to not lock the per-CPU caches, because we're
634 	 * tearing down the zone anyway.  I.e., there will be no further use
635 	 * of the caches at this point.
636 	 *
637  * XXX: It would be good to be able to assert that the zone is being
638 	 * torn down to prevent improper use of cache_drain().
639 	 *
640 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
641 	 * it is used elsewhere.  Should the tear-down path be made special
642 	 * there in some form?
643 	 */
644 	CPU_FOREACH(cpu) {
645 		cache = &zone->uz_cpu[cpu];
646 		bucket_drain(zone, cache->uc_allocbucket);
647 		bucket_drain(zone, cache->uc_freebucket);
648 		if (cache->uc_allocbucket != NULL)
649 			bucket_free(cache->uc_allocbucket);
650 		if (cache->uc_freebucket != NULL)
651 			bucket_free(cache->uc_freebucket);
652 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
653 	}
654 	ZONE_LOCK(zone);
655 	bucket_cache_drain(zone);
656 	ZONE_UNLOCK(zone);
657 }
658 
659 /*
660  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
661  */
662 static void
663 bucket_cache_drain(uma_zone_t zone)
664 {
665 	uma_bucket_t bucket;
666 
667 	/*
668 	 * Drain the bucket queues and free the buckets, we just keep two per
669 	 * cpu (alloc/free).
670 	 */
671 	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
672 		LIST_REMOVE(bucket, ub_link);
673 		ZONE_UNLOCK(zone);
674 		bucket_drain(zone, bucket);
675 		bucket_free(bucket);
676 		ZONE_LOCK(zone);
677 	}
678 
679 	/* Now we do the free queue.. */
680 	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
681 		LIST_REMOVE(bucket, ub_link);
682 		bucket_free(bucket);
683 	}
684 }
685 
686 /*
687  * Frees pages from a keg back to the system.  This is done on demand from
688  * the pageout daemon.
689  *
690  * Returns nothing.
691  */
692 static void
693 keg_drain(uma_keg_t keg)
694 {
695 	struct slabhead freeslabs = { 0 };
696 	uma_slab_t slab;
697 	uma_slab_t n;
698 	u_int8_t flags;
699 	u_int8_t *mem;
700 	int i;
701 
702 	/*
703 	 * We don't want to take pages from statically allocated kegs at this
704 	 * time
705 	 */
706 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
707 		return;
708 
709 #ifdef UMA_DEBUG
710 	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
711 #endif
712 	KEG_LOCK(keg);
713 	if (keg->uk_free == 0)
714 		goto finished;
715 
716 	slab = LIST_FIRST(&keg->uk_free_slab);
717 	while (slab) {
718 		n = LIST_NEXT(slab, us_link);
719 
720 		/* We have nowhere to free these to */
721 		if (slab->us_flags & UMA_SLAB_BOOT) {
722 			slab = n;
723 			continue;
724 		}
725 
726 		LIST_REMOVE(slab, us_link);
727 		keg->uk_pages -= keg->uk_ppera;
728 		keg->uk_free -= keg->uk_ipers;
729 
730 		if (keg->uk_flags & UMA_ZONE_HASH)
731 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
732 
733 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
734 
735 		slab = n;
736 	}
737 finished:
738 	KEG_UNLOCK(keg);
739 
740 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
741 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
742 		if (keg->uk_fini)
743 			for (i = 0; i < keg->uk_ipers; i++)
744 				keg->uk_fini(
745 				    slab->us_data + (keg->uk_rsize * i),
746 				    keg->uk_size);
747 		flags = slab->us_flags;
748 		mem = slab->us_data;
749 
750 		if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
751 			vm_object_t obj;
752 
753 			if (flags & UMA_SLAB_KMEM)
754 				obj = kmem_object;
755 			else if (flags & UMA_SLAB_KERNEL)
756 				obj = kernel_object;
757 			else
758 				obj = NULL;
759 			for (i = 0; i < keg->uk_ppera; i++)
760 				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
761 				    obj);
762 		}
763 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
764 			zone_free_item(keg->uk_slabzone, slab, NULL,
765 			    SKIP_NONE, ZFREE_STATFREE);
766 #ifdef UMA_DEBUG
767 		printf("%s: Returning %d bytes.\n",
768 		    keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
769 #endif
770 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
771 	}
772 }
773 
774 static void
775 zone_drain_wait(uma_zone_t zone, int waitok)
776 {
777 
778 	/*
779 	 * Set draining to interlock with zone_dtor() so we can release our
780 	 * locks as we go.  Only dtor() should do a WAITOK call since it
781 	 * is the only call that knows the structure will still be available
782 	 * when it wakes up.
783 	 */
784 	ZONE_LOCK(zone);
785 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
786 		if (waitok == M_NOWAIT)
787 			goto out;
788 		mtx_unlock(&uma_mtx);
789 		msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
790 		mtx_lock(&uma_mtx);
791 	}
792 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
793 	bucket_cache_drain(zone);
794 	ZONE_UNLOCK(zone);
795 	/*
796 	 * The DRAINING flag protects us from being freed while
797 	 * we're running.  Normally the uma_mtx would protect us but we
798 	 * must be able to release and acquire the right lock for each keg.
799 	 */
800 	zone_foreach_keg(zone, &keg_drain);
801 	ZONE_LOCK(zone);
802 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
803 	wakeup(zone);
804 out:
805 	ZONE_UNLOCK(zone);
806 }
807 
808 void
809 zone_drain(uma_zone_t zone)
810 {
811 
812 	zone_drain_wait(zone, M_NOWAIT);
813 }
814 
815 /*
816  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
817  *
818  * Arguments:
819  *	wait  Shall we wait?
820  *
821  * Returns:
822  *	The slab that was allocated or NULL if there is no memory and the
823  *	caller specified M_NOWAIT.
824  */
825 static uma_slab_t
826 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
827 {
828 	uma_slabrefcnt_t slabref;
829 	uma_alloc allocf;
830 	uma_slab_t slab;
831 	u_int8_t *mem;
832 	u_int8_t flags;
833 	int i;
834 
835 	mtx_assert(&keg->uk_lock, MA_OWNED);
836 	slab = NULL;
837 
838 #ifdef UMA_DEBUG
839 	printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
840 #endif
841 	allocf = keg->uk_allocf;
842 	KEG_UNLOCK(keg);
843 
844 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
845 		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
846 		if (slab == NULL) {
847 			KEG_LOCK(keg);
848 			return (NULL);
849 		}
850 	}
851 
852 	/*
853 	 * This reproduces the old vm_zone behavior of zero filling pages the
854 	 * first time they are added to a zone.
855 	 *
856 	 * Malloced items are zeroed in uma_zalloc.
857 	 */
858 
859 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
860 		wait |= M_ZERO;
861 	else
862 		wait &= ~M_ZERO;
863 
864 	if (keg->uk_flags & UMA_ZONE_NODUMP)
865 		wait |= M_NODUMP;
866 
867 	/* zone is passed for legacy reasons. */
868 	mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
869 	if (mem == NULL) {
870 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
871 			zone_free_item(keg->uk_slabzone, slab, NULL,
872 			    SKIP_NONE, ZFREE_STATFREE);
873 		KEG_LOCK(keg);
874 		return (NULL);
875 	}
876 
877 	/* Point the slab into the allocated memory */
878 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
879 		slab = (uma_slab_t)(mem + keg->uk_pgoff);
880 
881 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
882 		for (i = 0; i < keg->uk_ppera; i++)
883 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
884 
885 	slab->us_keg = keg;
886 	slab->us_data = mem;
887 	slab->us_freecount = keg->uk_ipers;
888 	slab->us_firstfree = 0;
889 	slab->us_flags = flags;
890 
891 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
892 		slabref = (uma_slabrefcnt_t)slab;
893 		for (i = 0; i < keg->uk_ipers; i++) {
894 			slabref->us_freelist[i].us_refcnt = 0;
895 			slabref->us_freelist[i].us_item = i+1;
896 		}
897 	} else {
898 		for (i = 0; i < keg->uk_ipers; i++)
899 			slab->us_freelist[i].us_item = i+1;
900 	}
901 
902 	if (keg->uk_init != NULL) {
903 		for (i = 0; i < keg->uk_ipers; i++)
904 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
905 			    keg->uk_size, wait) != 0)
906 				break;
907 		if (i != keg->uk_ipers) {
908 			if (keg->uk_fini != NULL) {
909 				for (i--; i > -1; i--)
910 					keg->uk_fini(slab->us_data +
911 					    (keg->uk_rsize * i),
912 					    keg->uk_size);
913 			}
914 			if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
915 				vm_object_t obj;
916 
917 				if (flags & UMA_SLAB_KMEM)
918 					obj = kmem_object;
919 				else if (flags & UMA_SLAB_KERNEL)
920 					obj = kernel_object;
921 				else
922 					obj = NULL;
923 				for (i = 0; i < keg->uk_ppera; i++)
924 					vsetobj((vm_offset_t)mem +
925 					    (i * PAGE_SIZE), obj);
926 			}
927 			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
928 				zone_free_item(keg->uk_slabzone, slab,
929 				    NULL, SKIP_NONE, ZFREE_STATFREE);
930 			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
931 			    flags);
932 			KEG_LOCK(keg);
933 			return (NULL);
934 		}
935 	}
936 	KEG_LOCK(keg);
937 
938 	if (keg->uk_flags & UMA_ZONE_HASH)
939 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
940 
941 	keg->uk_pages += keg->uk_ppera;
942 	keg->uk_free += keg->uk_ipers;
943 
944 	return (slab);
945 }
946 
947 /*
948  * This function is intended to be used early on in place of page_alloc() so
949  * that we may use the boot time page cache to satisfy allocations before
950  * the VM is ready.
951  */
952 static void *
953 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
954 {
955 	uma_keg_t keg;
956 	uma_slab_t tmps;
957 	int pages, check_pages;
958 
959 	keg = zone_first_keg(zone);
960 	pages = howmany(bytes, PAGE_SIZE);
961 	check_pages = pages - 1;
962 	KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
963 
964 	/*
965 	 * Check our small startup cache to see if it has pages remaining.
966 	 */
967 	mtx_lock(&uma_boot_pages_mtx);
968 
969 	/* First check if we have enough room. */
970 	tmps = LIST_FIRST(&uma_boot_pages);
971 	while (tmps != NULL && check_pages-- > 0)
972 		tmps = LIST_NEXT(tmps, us_link);
973 	if (tmps != NULL) {
974 		/*
975 		 * It's ok to lose tmps references.  The last one will
976 		 * have tmps->us_data pointing to the start address of
977 		 * "pages" contiguous pages of memory.
978 		 */
979 		while (pages-- > 0) {
980 			tmps = LIST_FIRST(&uma_boot_pages);
981 			LIST_REMOVE(tmps, us_link);
982 		}
983 		mtx_unlock(&uma_boot_pages_mtx);
984 		*pflag = tmps->us_flags;
985 		return (tmps->us_data);
986 	}
987 	mtx_unlock(&uma_boot_pages_mtx);
988 	if (booted < UMA_STARTUP2)
989 		panic("UMA: Increase vm.boot_pages");
990 	/*
991 	 * Now that we've booted, reset these users to their real allocator.
992 	 */
993 #ifdef UMA_MD_SMALL_ALLOC
994 	keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
995 #else
996 	keg->uk_allocf = page_alloc;
997 #endif
998 	return (keg->uk_allocf(zone, bytes, pflag, wait));
999 }
1000 
1001 /*
1002  * Allocates a number of pages from the system
1003  *
1004  * Arguments:
1005  *	bytes  The number of bytes requested
1006  *	wait  Shall we wait?
1007  *
1008  * Returns:
1009  *	A pointer to the alloced memory or possibly
1010  *	NULL if M_NOWAIT is set.
1011  */
1012 static void *
1013 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
1014 {
1015 	void *p;	/* Returned page */
1016 
1017 	*pflag = UMA_SLAB_KMEM;
1018 	p = (void *) kmem_malloc(kmem_map, bytes, wait);
1019 
1020 	return (p);
1021 }
1022 
1023 /*
1024  * Allocates a number of pages from within an object
1025  *
1026  * Arguments:
1027  *	bytes  The number of bytes requested
1028  *	wait   Shall we wait?
1029  *
1030  * Returns:
1031  *	A pointer to the alloced memory or possibly
1032  *	NULL if M_NOWAIT is set.
1033  */
1034 static void *
1035 noobj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
1036 {
1037 	TAILQ_HEAD(, vm_page) alloctail;
1038 	u_long npages;
1039 	vm_offset_t retkva, zkva;
1040 	vm_page_t p, p_next;
1041 	uma_keg_t keg;
1042 
1043 	TAILQ_INIT(&alloctail);
1044 	keg = zone_first_keg(zone);
1045 
1046 	npages = howmany(bytes, PAGE_SIZE);
1047 	while (npages > 0) {
1048 		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1049 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1050 		if (p != NULL) {
1051 			/*
1052 			 * Since the page does not belong to an object, its
1053 			 * listq is unused.
1054 			 */
1055 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1056 			npages--;
1057 			continue;
1058 		}
1059 		if (wait & M_WAITOK) {
1060 			VM_WAIT;
1061 			continue;
1062 		}
1063 
1064 		/*
1065 		 * Page allocation failed, free intermediate pages and
1066 		 * exit.
1067 		 */
1068 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1069 			vm_page_unwire(p, 0);
1070 			vm_page_free(p);
1071 		}
1072 		return (NULL);
1073 	}
1074 	*flags = UMA_SLAB_PRIV;
1075 	zkva = keg->uk_kva +
1076 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1077 	retkva = zkva;
1078 	TAILQ_FOREACH(p, &alloctail, listq) {
1079 		pmap_qenter(zkva, &p, 1);
1080 		zkva += PAGE_SIZE;
1081 	}
1082 
1083 	return ((void *)retkva);
1084 }
1085 
1086 /*
1087  * Frees a number of pages to the system
1088  *
1089  * Arguments:
1090  *	mem   A pointer to the memory to be freed
1091  *	size  The size of the memory being freed
1092  *	flags The original p->us_flags field
1093  *
1094  * Returns:
1095  *	Nothing
1096  */
1097 static void
1098 page_free(void *mem, int size, u_int8_t flags)
1099 {
1100 	vm_map_t map;
1101 
1102 	if (flags & UMA_SLAB_KMEM)
1103 		map = kmem_map;
1104 	else if (flags & UMA_SLAB_KERNEL)
1105 		map = kernel_map;
1106 	else
1107 		panic("UMA: page_free used with invalid flags %d", flags);
1108 
1109 	kmem_free(map, (vm_offset_t)mem, size);
1110 }
1111 
1112 /*
1113  * Zero fill initializer
1114  *
1115  * Arguments/Returns follow uma_init specifications
1116  */
1117 static int
1118 zero_init(void *mem, int size, int flags)
1119 {
1120 	bzero(mem, size);
1121 	return (0);
1122 }
1123 
1124 /*
1125  * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1126  *
1127  * Arguments
1128  *	keg  The keg we should initialize
1129  *
1130  * Returns
1131  *	Nothing
1132  */
1133 static void
1134 keg_small_init(uma_keg_t keg)
1135 {
1136 	u_int rsize;
1137 	u_int memused;
1138 	u_int wastedspace;
1139 	u_int shsize;
1140 
1141 	KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
1142 	rsize = keg->uk_size;
1143 
1144 	if (rsize < UMA_SMALLEST_UNIT)
1145 		rsize = UMA_SMALLEST_UNIT;
1146 	if (rsize & keg->uk_align)
1147 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1148 
1149 	keg->uk_rsize = rsize;
1150 	keg->uk_ppera = 1;
1151 
1152 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1153 		shsize = 0;
1154 	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1155 		rsize += UMA_FRITMREF_SZ;	/* linkage & refcnt */
1156 		shsize = sizeof(struct uma_slab_refcnt);
1157 	} else {
1158 		rsize += UMA_FRITM_SZ;	/* Account for linkage */
1159 		shsize = sizeof(struct uma_slab);
1160 	}
1161 
1162 	keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
1163 	KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
1164 	memused = keg->uk_ipers * rsize + shsize;
1165 	wastedspace = UMA_SLAB_SIZE - memused;
1166 
1167 	/*
1168 	 * We can't do OFFPAGE if we're internal or if we've been asked
1169 	 * not to go to the VM for buckets.  Doing so could send us to the
1170 	 * VM (kmem_map) for slabs, which we must not do when we are
1171 	 * UMA_ZFLAG_CACHEONLY as a result of UMA_ZONE_VM, which clearly
1172 	 * forbids it.
1173 	 */
1174 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1175 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1176 		return;
1177 
1178 	if ((wastedspace >= UMA_MAX_WASTE) &&
1179 	    (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
1180 		keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
1181 		KASSERT(keg->uk_ipers <= 255,
1182 		    ("keg_small_init: keg->uk_ipers too high!"));
1183 #ifdef UMA_DEBUG
1184 		printf("UMA decided we need offpage slab headers for "
1185 		    "keg: %s, calculated wastedspace = %d, "
1186 		    "maximum wasted space allowed = %d, "
1187 		    "calculated ipers = %d, "
1188 		    "new wasted space = %d\n", keg->uk_name, wastedspace,
1189 		    UMA_MAX_WASTE, keg->uk_ipers,
1190 		    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
1191 #endif
1192 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1193 		if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1194 			keg->uk_flags |= UMA_ZONE_HASH;
1195 	}
1196 }
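
/*
 * Worked example with assumed (not authoritative) numbers: 4096-byte slabs,
 * a 56-byte in-page slab header, 8-byte freelist linkage (UMA_FRITM_SZ), and
 * a 240-byte item aligned to 16 bytes:
 *
 *	rsize = 240 (already aligned) + 8 = 248
 *	ipers = (4096 - 56) / 248 = 16
 *	wastedspace = 4096 - (16 * 248 + 56) = 72
 *
 * Only if wastedspace reaches UMA_MAX_WASTE, and more items would fit without
 * the in-page header, does the keg switch to an OFFPAGE header and recompute
 * ipers as UMA_SLAB_SIZE / uk_rsize.
 */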
1197 
1198 /*
1199  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1200  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1201  * more complicated.
1202  *
1203  * Arguments
1204  *	keg  The keg we should initialize
1205  *
1206  * Returns
1207  *	Nothing
1208  */
1209 static void
1210 keg_large_init(uma_keg_t keg)
1211 {
1212 	int pages;
1213 
1214 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1215 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1216 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1217 
1218 	pages = keg->uk_size / UMA_SLAB_SIZE;
1219 
1220 	/* Account for remainder */
1221 	if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
1222 		pages++;
1223 
1224 	keg->uk_ppera = pages;
1225 	keg->uk_ipers = 1;
1226 	keg->uk_rsize = keg->uk_size;
1227 
1228 	/* We can't do OFFPAGE if we're internal, bail out here. */
1229 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1230 		return;
1231 
1232 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
1233 	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1234 		keg->uk_flags |= UMA_ZONE_HASH;
1235 }
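
/*
 * Example of the large-keg path (illustrative, 4096-byte UMA_SLAB_SIZE
 * assumed): a 9000-byte item yields
 *
 *	pages = 9000 / 4096 = 2, remainder left over -> pages = 3
 *	uk_ppera = 3, uk_ipers = 1, uk_rsize = 9000
 *
 * and, for non-internal kegs, the slab header goes OFFPAGE (hashed unless the
 * keg already uses vtoslab()).
 */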
1236 
1237 static void
1238 keg_cachespread_init(uma_keg_t keg)
1239 {
1240 	int alignsize;
1241 	int trailer;
1242 	int pages;
1243 	int rsize;
1244 
1245 	alignsize = keg->uk_align + 1;
1246 	rsize = keg->uk_size;
1247 	/*
1248 	 * We want one item to start on every align boundary in a page.  To
1249 	 * do this we will span pages.  We will also extend the item by the
1250 	 * size of align if it is an even multiple of align.  Otherwise, it
1251 	 * would fall on the same boundary every time.
1252 	 */
1253 	if (rsize & keg->uk_align)
1254 		rsize = (rsize & ~keg->uk_align) + alignsize;
1255 	if ((rsize & alignsize) == 0)
1256 		rsize += alignsize;
1257 	trailer = rsize - keg->uk_size;
1258 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1259 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1260 	keg->uk_rsize = rsize;
1261 	keg->uk_ppera = pages;
1262 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1263 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1264 	KASSERT(keg->uk_ipers <= uma_max_ipers,
1265 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1266 	    keg->uk_ipers));
1267 }
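
/*
 * Worked example (illustrative, assuming 4096-byte pages and a 64-byte cache
 * line, i.e. uk_align == 63): a 128-byte item is first padded to 192 bytes,
 * since 128 is an even multiple of the alignment,
 *
 *	rsize = 128 + 64 = 192, trailer = 64
 *	pages = (192 * (4096 / 64)) / 4096 = 3
 *	ipers = ((3 * 4096) + 64) / 192 = 64
 *
 * so the 64 items cover every cache-line-aligned start offset within a page
 * rather than aliasing to the same set.
 */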
1268 
1269 /*
1270  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1271  * the keg onto the global keg list.
1272  *
1273  * Arguments/Returns follow uma_ctor specifications
1274  *	udata  Actually uma_kctor_args
1275  */
1276 static int
1277 keg_ctor(void *mem, int size, void *udata, int flags)
1278 {
1279 	struct uma_kctor_args *arg = udata;
1280 	uma_keg_t keg = mem;
1281 	uma_zone_t zone;
1282 
1283 	bzero(keg, size);
1284 	keg->uk_size = arg->size;
1285 	keg->uk_init = arg->uminit;
1286 	keg->uk_fini = arg->fini;
1287 	keg->uk_align = arg->align;
1288 	keg->uk_free = 0;
1289 	keg->uk_pages = 0;
1290 	keg->uk_flags = arg->flags;
1291 	keg->uk_allocf = page_alloc;
1292 	keg->uk_freef = page_free;
1293 	keg->uk_recurse = 0;
1294 	keg->uk_slabzone = NULL;
1295 
1296 	/*
1297 	 * The master zone is passed to us at keg-creation time.
1298 	 */
1299 	zone = arg->zone;
1300 	keg->uk_name = zone->uz_name;
1301 
1302 	if (arg->flags & UMA_ZONE_VM)
1303 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1304 
1305 	if (arg->flags & UMA_ZONE_ZINIT)
1306 		keg->uk_init = zero_init;
1307 
1308 	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1309 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1310 
1311 	/*
1312 	 * The +UMA_FRITM_SZ added to uk_size is to account for the
1313 	 * linkage that is added to the size in keg_small_init().  If
1314 	 * we don't account for this here then we may end up in
1315 	 * keg_small_init() with a calculated 'ipers' of 0.
1316 	 */
1317 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
1318 		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1319 			keg_cachespread_init(keg);
1320 		else if ((keg->uk_size+UMA_FRITMREF_SZ) >
1321 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
1322 			keg_large_init(keg);
1323 		else
1324 			keg_small_init(keg);
1325 	} else {
1326 		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
1327 			keg_cachespread_init(keg);
1328 		else if ((keg->uk_size+UMA_FRITM_SZ) >
1329 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1330 			keg_large_init(keg);
1331 		else
1332 			keg_small_init(keg);
1333 	}
1334 
1335 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1336 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1337 			keg->uk_slabzone = slabrefzone;
1338 		else
1339 			keg->uk_slabzone = slabzone;
1340 	}
1341 
1342 	/*
1343 	 * If we haven't booted yet we need allocations to go through the
1344 	 * startup cache until the vm is ready.
1345 	 */
1346 	if (keg->uk_ppera == 1) {
1347 #ifdef UMA_MD_SMALL_ALLOC
1348 		keg->uk_allocf = uma_small_alloc;
1349 		keg->uk_freef = uma_small_free;
1350 
1351 		if (booted < UMA_STARTUP)
1352 			keg->uk_allocf = startup_alloc;
1353 #else
1354 		if (booted < UMA_STARTUP2)
1355 			keg->uk_allocf = startup_alloc;
1356 #endif
1357 	} else if (booted < UMA_STARTUP2 &&
1358 	    (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1359 		keg->uk_allocf = startup_alloc;
1360 
1361 	/*
1362 	 * Initialize keg's lock (shared among zones).
1363 	 */
1364 	if (arg->flags & UMA_ZONE_MTXCLASS)
1365 		KEG_LOCK_INIT(keg, 1);
1366 	else
1367 		KEG_LOCK_INIT(keg, 0);
1368 
1369 	/*
1370 	 * If we're putting the slab header in the actual page we need to
1371 	 * figure out where in each page it goes.  This calculates a right
1372 	 * justified offset into the memory on an ALIGN_PTR boundary.
1373 	 */
1374 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1375 		u_int totsize;
1376 
1377 		/* Size of the slab struct and free list */
1378 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1379 			totsize = sizeof(struct uma_slab_refcnt) +
1380 			    keg->uk_ipers * UMA_FRITMREF_SZ;
1381 		else
1382 			totsize = sizeof(struct uma_slab) +
1383 			    keg->uk_ipers * UMA_FRITM_SZ;
1384 
1385 		if (totsize & UMA_ALIGN_PTR)
1386 			totsize = (totsize & ~UMA_ALIGN_PTR) +
1387 			    (UMA_ALIGN_PTR + 1);
1388 		keg->uk_pgoff = (UMA_SLAB_SIZE * keg->uk_ppera) - totsize;
1389 
1390 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1391 			totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
1392 			    + keg->uk_ipers * UMA_FRITMREF_SZ;
1393 		else
1394 			totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1395 			    + keg->uk_ipers * UMA_FRITM_SZ;
1396 
1397 		/*
1398 		 * The only way the following is possible is if, with our
1399 		 * UMA_ALIGN_PTR adjustments, we are now bigger than
1400 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1401 		 * mathematically possible for all cases, so we make
1402 		 * sure here anyway.
1403 		 */
1404 		if (totsize > UMA_SLAB_SIZE * keg->uk_ppera) {
1405 			printf("zone %s ipers %d rsize %d size %d\n",
1406 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1407 			    keg->uk_size);
1408 			panic("UMA slab won't fit.");
1409 		}
1410 	}
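
	/*
	 * Illustrative arithmetic for the in-page header placement above
	 * (assumed sizes, not authoritative): with 4096-byte slabs, a 16-item
	 * keg, and, say, sizeof(struct uma_slab) == 48 and UMA_FRITM_SZ == 4,
	 *
	 *	totsize = 48 + 16 * 4 = 112, already pointer-aligned
	 *	uk_pgoff = 4096 - 112 = 3984
	 *
	 * so the header is right-justified at mem + 3984 and the items occupy
	 * the front of the slab.
	 */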
1411 
1412 	if (keg->uk_flags & UMA_ZONE_HASH)
1413 		hash_alloc(&keg->uk_hash);
1414 
1415 #ifdef UMA_DEBUG
1416 	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1417 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1418 	    keg->uk_ipers, keg->uk_ppera,
1419 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1420 #endif
1421 
1422 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1423 
1424 	mtx_lock(&uma_mtx);
1425 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1426 	mtx_unlock(&uma_mtx);
1427 	return (0);
1428 }
1429 
1430 /*
1431  * Zone header ctor.  This initializes all fields, locks, etc.
1432  *
1433  * Arguments/Returns follow uma_ctor specifications
1434  *	udata  Actually uma_zctor_args
1435  */
1436 static int
1437 zone_ctor(void *mem, int size, void *udata, int flags)
1438 {
1439 	struct uma_zctor_args *arg = udata;
1440 	uma_zone_t zone = mem;
1441 	uma_zone_t z;
1442 	uma_keg_t keg;
1443 
1444 	bzero(zone, size);
1445 	zone->uz_name = arg->name;
1446 	zone->uz_ctor = arg->ctor;
1447 	zone->uz_dtor = arg->dtor;
1448 	zone->uz_slab = zone_fetch_slab;
1449 	zone->uz_init = NULL;
1450 	zone->uz_fini = NULL;
1451 	zone->uz_allocs = 0;
1452 	zone->uz_frees = 0;
1453 	zone->uz_fails = 0;
1454 	zone->uz_sleeps = 0;
1455 	zone->uz_fills = zone->uz_count = 0;
1456 	zone->uz_flags = 0;
1457 	zone->uz_warning = NULL;
1458 	timevalclear(&zone->uz_ratecheck);
1459 	keg = arg->keg;
1460 
1461 	if (arg->flags & UMA_ZONE_SECONDARY) {
1462 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1463 		zone->uz_init = arg->uminit;
1464 		zone->uz_fini = arg->fini;
1465 		zone->uz_lock = &keg->uk_lock;
1466 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1467 		mtx_lock(&uma_mtx);
1468 		ZONE_LOCK(zone);
1469 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1470 			if (LIST_NEXT(z, uz_link) == NULL) {
1471 				LIST_INSERT_AFTER(z, zone, uz_link);
1472 				break;
1473 			}
1474 		}
1475 		ZONE_UNLOCK(zone);
1476 		mtx_unlock(&uma_mtx);
1477 	} else if (keg == NULL) {
1478 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1479 		    arg->align, arg->flags)) == NULL)
1480 			return (ENOMEM);
1481 	} else {
1482 		struct uma_kctor_args karg;
1483 		int error;
1484 
1485 		/* We should only be here from uma_startup() */
1486 		karg.size = arg->size;
1487 		karg.uminit = arg->uminit;
1488 		karg.fini = arg->fini;
1489 		karg.align = arg->align;
1490 		karg.flags = arg->flags;
1491 		karg.zone = zone;
1492 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1493 		    flags);
1494 		if (error)
1495 			return (error);
1496 	}
1497 	/*
1498 	 * Link in the first keg.
1499 	 */
1500 	zone->uz_klink.kl_keg = keg;
1501 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1502 	zone->uz_lock = &keg->uk_lock;
1503 	zone->uz_size = keg->uk_size;
1504 	zone->uz_flags |= (keg->uk_flags &
1505 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1506 
1507 	/*
1508 	 * Some internal zones don't have room allocated for the per cpu
1509 	 * caches.  If we're internal, bail out here.
1510 	 */
1511 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1512 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1513 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1514 		return (0);
1515 	}
1516 
1517 	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1518 		zone->uz_count = BUCKET_MAX;
1519 	else if (keg->uk_ipers <= BUCKET_MAX)
1520 		zone->uz_count = keg->uk_ipers;
1521 	else
1522 		zone->uz_count = BUCKET_MAX;
1523 	return (0);
1524 }
1525 
1526 /*
1527  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1528  * table and removes the keg from the global list.
1529  *
1530  * Arguments/Returns follow uma_dtor specifications
1531  *	udata  unused
1532  */
1533 static void
1534 keg_dtor(void *arg, int size, void *udata)
1535 {
1536 	uma_keg_t keg;
1537 
1538 	keg = (uma_keg_t)arg;
1539 	KEG_LOCK(keg);
1540 	if (keg->uk_free != 0) {
1541 		printf("Freed UMA keg was not empty (%d items). "
1542 		    " Lost %d pages of memory.\n",
1543 		    keg->uk_free, keg->uk_pages);
1544 	}
1545 	KEG_UNLOCK(keg);
1546 
1547 	hash_free(&keg->uk_hash);
1548 
1549 	KEG_LOCK_FINI(keg);
1550 }
1551 
1552 /*
1553  * Zone header dtor.
1554  *
1555  * Arguments/Returns follow uma_dtor specifications
1556  *	udata  unused
1557  */
1558 static void
1559 zone_dtor(void *arg, int size, void *udata)
1560 {
1561 	uma_klink_t klink;
1562 	uma_zone_t zone;
1563 	uma_keg_t keg;
1564 
1565 	zone = (uma_zone_t)arg;
1566 	keg = zone_first_keg(zone);
1567 
1568 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1569 		cache_drain(zone);
1570 
1571 	mtx_lock(&uma_mtx);
1572 	LIST_REMOVE(zone, uz_link);
1573 	mtx_unlock(&uma_mtx);
1574 	/*
1575 	 * XXX there are some races here where
1576 	 * the zone can be drained but zone lock
1577 	 * released and then refilled before we
1578 	 * remove it... we dont care for now
1579 	 */
1580 	zone_drain_wait(zone, M_WAITOK);
1581 	/*
1582 	 * Unlink all of our kegs.
1583 	 */
1584 	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1585 		klink->kl_keg = NULL;
1586 		LIST_REMOVE(klink, kl_link);
1587 		if (klink == &zone->uz_klink)
1588 			continue;
1589 		free(klink, M_TEMP);
1590 	}
1591 	/*
1592 	 * We only destroy kegs from non secondary zones.
1593 	 */
1594 	if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1595 		mtx_lock(&uma_mtx);
1596 		LIST_REMOVE(keg, uk_link);
1597 		mtx_unlock(&uma_mtx);
1598 		zone_free_item(kegs, keg, NULL, SKIP_NONE,
1599 		    ZFREE_STATFREE);
1600 	}
1601 }
1602 
1603 /*
1604  * Traverses every zone in the system and calls a callback
1605  *
1606  * Arguments:
1607  *	zfunc  A pointer to a function which accepts a zone
1608  *		as an argument.
1609  *
1610  * Returns:
1611  *	Nothing
1612  */
1613 static void
1614 zone_foreach(void (*zfunc)(uma_zone_t))
1615 {
1616 	uma_keg_t keg;
1617 	uma_zone_t zone;
1618 
1619 	mtx_lock(&uma_mtx);
1620 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1621 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1622 			zfunc(zone);
1623 	}
1624 	mtx_unlock(&uma_mtx);
1625 }
1626 
1627 /* Public functions */
1628 /* See uma.h */
1629 void
1630 uma_startup(void *bootmem, int boot_pages)
1631 {
1632 	struct uma_zctor_args args;
1633 	uma_slab_t slab;
1634 	u_int slabsize;
1635 	u_int objsize, totsize, wsize;
1636 	int i;
1637 
1638 #ifdef UMA_DEBUG
1639 	printf("Creating uma keg headers zone and keg.\n");
1640 #endif
1641 	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1642 
1643 	/*
1644 	 * Figure out the maximum number of items-per-slab we'll have if
1645 	 * we're using the OFFPAGE slab header to track free items, given
1646 	 * all possible object sizes and the maximum desired wastage
1647 	 * (UMA_MAX_WASTE).
1648 	 *
1649 	 * We iterate until we find an object size for
1650 	 * which the calculated wastage in keg_small_init() will be
1651 	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
1652 	 * is an overall increasing see-saw function, we find the smallest
1653 	 * objsize such that the wastage is always acceptable for objects
1654 	 * with that objsize or smaller.  Since a smaller objsize always
1655 	 * generates a larger possible uma_max_ipers, we use this computed
1656 	 * objsize to calculate the largest ipers possible.  Since the
1657 	 * ipers calculated for OFFPAGE slab headers is always larger than
1658 	 * the ipers initially calculated in keg_small_init(), we use
1659 	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
1660 	 * obtain the maximum ipers possible for offpage slab headers.
1661 	 *
1662 	 * It should be noted that ipers versus objsize is an inversely
1663 	 * proportional function which drops off rather quickly so as
1664 	 * long as our UMA_MAX_WASTE is such that the objsize we calculate
1665 	 * falls into the portion of the inverse relation AFTER the steep
1666 	 * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
1667 	 *
1668 	 * Note that we have 8-bits (1 byte) to use as a freelist index
1669 	 * inside the actual slab header itself and this is enough to
1670 	 * accommodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
1671 	 * object with offpage slab header would have ipers =
1672 	 * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
1673 	 * 1 greater than what our byte-integer freelist index can
1674 	 * accommodate, but we know that this situation never occurs as
1675 	 * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
1676 	 * that we need to go to offpage slab headers.  Or, if we do,
1677 	 * then we trap that condition below and panic in the INVARIANTS case.
1678 	 */
1679 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
1680 	totsize = wsize;
1681 	objsize = UMA_SMALLEST_UNIT;
1682 	while (totsize >= wsize) {
1683 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
1684 		    (objsize + UMA_FRITM_SZ);
1685 		totsize *= (UMA_FRITM_SZ + objsize);
1686 		objsize++;
1687 	}
1688 	if (objsize > UMA_SMALLEST_UNIT)
1689 		objsize--;
1690 	uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
1691 
1692 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
1693 	totsize = wsize;
1694 	objsize = UMA_SMALLEST_UNIT;
1695 	while (totsize >= wsize) {
1696 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
1697 		    (objsize + UMA_FRITMREF_SZ);
1698 		totsize *= (UMA_FRITMREF_SZ + objsize);
1699 		objsize++;
1700 	}
1701 	if (objsize > UMA_SMALLEST_UNIT)
1702 		objsize--;
1703 	uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
1704 
1705 	KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
1706 	    ("uma_startup: calculated uma_max_ipers values too large!"));
1707 
1708 #ifdef UMA_DEBUG
1709 	printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
1710 	printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
1711 	    uma_max_ipers_ref);
1712 #endif
1713 
1714 	/* "manually" create the initial zone */
1715 	args.name = "UMA Kegs";
1716 	args.size = sizeof(struct uma_keg);
1717 	args.ctor = keg_ctor;
1718 	args.dtor = keg_dtor;
1719 	args.uminit = zero_init;
1720 	args.fini = NULL;
1721 	args.keg = &masterkeg;
1722 	args.align = 32 - 1;
1723 	args.flags = UMA_ZFLAG_INTERNAL;
1724 	/* The initial zone has no per-CPU queues so it's smaller */
1725 	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1726 
1727 #ifdef UMA_DEBUG
1728 	printf("Filling boot free list.\n");
1729 #endif
1730 	for (i = 0; i < boot_pages; i++) {
1731 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1732 		slab->us_data = (u_int8_t *)slab;
1733 		slab->us_flags = UMA_SLAB_BOOT;
1734 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1735 	}
1736 	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1737 
1738 #ifdef UMA_DEBUG
1739 	printf("Creating uma zone headers zone and keg.\n");
1740 #endif
1741 	args.name = "UMA Zones";
1742 	args.size = sizeof(struct uma_zone) +
1743 	    (sizeof(struct uma_cache) * (mp_maxid + 1));
1744 	args.ctor = zone_ctor;
1745 	args.dtor = zone_dtor;
1746 	args.uminit = zero_init;
1747 	args.fini = NULL;
1748 	args.keg = NULL;
1749 	args.align = 32 - 1;
1750 	args.flags = UMA_ZFLAG_INTERNAL;
1751 	/* The initial zone has no per-CPU queues so it's smaller */
1752 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1753 
1754 #ifdef UMA_DEBUG
1755 	printf("Initializing pcpu cache locks.\n");
1756 #endif
1757 #ifdef UMA_DEBUG
1758 	printf("Creating slab and hash zones.\n");
1759 #endif
1760 
1761 	/*
1762 	 * This is the max number of free list items we'll have with
1763 	 * offpage slabs.
1764 	 */
1765 	slabsize = uma_max_ipers * UMA_FRITM_SZ;
1766 	slabsize += sizeof(struct uma_slab);
1767 
1768 	/* Now make a zone for slab headers */
1769 	slabzone = uma_zcreate("UMA Slabs",
1770 				slabsize,
1771 				NULL, NULL, NULL, NULL,
1772 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1773 
1774 	/*
1775 	 * We also create a zone for the bigger slabs with reference
1776 	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
1777 	 */
1778 	slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
1779 	slabsize += sizeof(struct uma_slab_refcnt);
1780 	slabrefzone = uma_zcreate("UMA RCntSlabs",
1781 				  slabsize,
1782 				  NULL, NULL, NULL, NULL,
1783 				  UMA_ALIGN_PTR,
1784 				  UMA_ZFLAG_INTERNAL);
1785 
1786 	hashzone = uma_zcreate("UMA Hash",
1787 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1788 	    NULL, NULL, NULL, NULL,
1789 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1790 
1791 	bucket_init();
1792 
1793 	booted = UMA_STARTUP;
1794 
1795 #ifdef UMA_DEBUG
1796 	printf("UMA startup complete.\n");
1797 #endif
1798 }
1799 
1800 /* see uma.h */
1801 void
1802 uma_startup2(void)
1803 {
1804 	booted = UMA_STARTUP2;
1805 	bucket_enable();
1806 #ifdef UMA_DEBUG
1807 	printf("UMA startup2 complete.\n");
1808 #endif
1809 }
1810 
1811 /*
1812  * Initialize our callout handle
1813  *
1814  */
1815 
1816 static void
1817 uma_startup3(void)
1818 {
1819 #ifdef UMA_DEBUG
1820 	printf("Starting callout.\n");
1821 #endif
1822 	callout_init(&uma_callout, CALLOUT_MPSAFE);
1823 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1824 #ifdef UMA_DEBUG
1825 	printf("UMA startup3 complete.\n");
1826 #endif
1827 }
1828 
1829 static uma_keg_t
1830 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1831 		int align, u_int32_t flags)
1832 {
1833 	struct uma_kctor_args args;
1834 
1835 	args.size = size;
1836 	args.uminit = uminit;
1837 	args.fini = fini;
1838 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1839 	args.flags = flags;
1840 	args.zone = zone;
1841 	return (zone_alloc_item(kegs, &args, M_WAITOK));
1842 }
1843 
1844 /* See uma.h */
1845 void
1846 uma_set_align(int align)
1847 {
1848 
1849 	if (align != UMA_ALIGN_CACHE)
1850 		uma_align_cache = align;
1851 }
1852 
1853 /* See uma.h */
1854 uma_zone_t
1855 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1856 		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
1857 
1858 {
1859 	struct uma_zctor_args args;
1860 
1861 	/* This stuff is essential for the zone ctor */
1862 	args.name = name;
1863 	args.size = size;
1864 	args.ctor = ctor;
1865 	args.dtor = dtor;
1866 	args.uminit = uminit;
1867 	args.fini = fini;
1868 	args.align = align;
1869 	args.flags = flags;
1870 	args.keg = NULL;
1871 
1872 	return (zone_alloc_item(zones, &args, M_WAITOK));
1873 }
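
/*
 * Typical consumer usage, shown here only as an illustrative sketch: the
 * "foo" names are hypothetical and the block is under #if 0, so it is never
 * compiled.
 */
#if 0
static uma_zone_t foo_zone;

static void
foo_init_zone(void)
{
	/* One zone per object type; let UMA pick pointer alignment. */
	foo_zone = uma_zcreate("foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
}

static struct foo *
foo_alloc(void)
{
	/* M_WAITOK may sleep; use M_NOWAIT in contexts that cannot. */
	return (uma_zalloc(foo_zone, M_WAITOK | M_ZERO));
}

static void
foo_free(struct foo *fp)
{
	uma_zfree(foo_zone, fp);
}
#endif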
1874 
1875 /* See uma.h */
1876 uma_zone_t
1877 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1878 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
1879 {
1880 	struct uma_zctor_args args;
1881 	uma_keg_t keg;
1882 
1883 	keg = zone_first_keg(master);
1884 	args.name = name;
1885 	args.size = keg->uk_size;
1886 	args.ctor = ctor;
1887 	args.dtor = dtor;
1888 	args.uminit = zinit;
1889 	args.fini = zfini;
1890 	args.align = keg->uk_align;
1891 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1892 	args.keg = keg;
1893 
1894 	/* XXX Attaches only one keg of potentially many. */
1895 	return (zone_alloc_item(zones, &args, M_WAITOK));
1896 }
1897 
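/*
 * Lock a pair of zones in a consistent order (lowest address first) so that
 * concurrent callers locking the same pair cannot deadlock.  MTX_DUPOK is
 * required because both locks belong to the same lock class.
 */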
1898 static void
1899 zone_lock_pair(uma_zone_t a, uma_zone_t b)
1900 {
1901 	if (a < b) {
1902 		ZONE_LOCK(a);
1903 		mtx_lock_flags(b->uz_lock, MTX_DUPOK);
1904 	} else {
1905 		ZONE_LOCK(b);
1906 		mtx_lock_flags(a->uz_lock, MTX_DUPOK);
1907 	}
1908 }
1909 
1910 static void
1911 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
1912 {
1913 
1914 	ZONE_UNLOCK(a);
1915 	ZONE_UNLOCK(b);
1916 }
1917 
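/*
 * Attach the master zone's keg to an existing secondary zone so that the
 * zone may allocate from multiple kegs.  Both zones must use vtoslab(),
 * agree on UMA_ZONE_REFCNT and have the same item size.  Returns 0 on
 * success or an errno value on failure.
 */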
1918 int
1919 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
1920 {
1921 	uma_klink_t klink;
1922 	uma_klink_t kl;
1923 	int error;
1924 
1925 	error = 0;
1926 	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
1927 
1928 	zone_lock_pair(zone, master);
1929 	/*
1930 	 * zone must use vtoslab() to resolve objects and must already be
1931 	 * a secondary.
1932 	 */
1933 	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
1934 	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
1935 		error = EINVAL;
1936 		goto out;
1937 	}
1938 	/*
1939 	 * The new master must also use vtoslab().
1940 	 */
1941 	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
1942 		error = EINVAL;
1943 		goto out;
1944 	}
1945 	/*
1946 	 * Both must either be refcnt, or not be refcnt.
1947 	 */
1948 	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
1949 	    (master->uz_flags & UMA_ZONE_REFCNT)) {
1950 		error = EINVAL;
1951 		goto out;
1952 	}
1953 	/*
1954 	 * The underlying object must be the same size.  rsize
1955 	 * may be different.
1956 	 */
1957 	if (master->uz_size != zone->uz_size) {
1958 		error = E2BIG;
1959 		goto out;
1960 	}
1961 	/*
1962 	 * Put it at the end of the list.
1963 	 */
1964 	klink->kl_keg = zone_first_keg(master);
1965 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
1966 		if (LIST_NEXT(kl, kl_link) == NULL) {
1967 			LIST_INSERT_AFTER(kl, klink, kl_link);
1968 			break;
1969 		}
1970 	}
1971 	klink = NULL;
1972 	zone->uz_flags |= UMA_ZFLAG_MULTI;
1973 	zone->uz_slab = zone_fetch_slab_multi;
1974 
1975 out:
1976 	zone_unlock_pair(zone, master);
1977 	if (klink != NULL)
1978 		free(klink, M_TEMP);
1979 
1980 	return (error);
1981 }
1982 
1983 
1984 /* See uma.h */
1985 void
1986 uma_zdestroy(uma_zone_t zone)
1987 {
1988 
1989 	zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
1990 }
1991 
1992 /* See uma.h */
1993 void *
1994 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
1995 {
1996 	void *item;
1997 	uma_cache_t cache;
1998 	uma_bucket_t bucket;
1999 	int cpu;
2000 
2001 	/* This is the fast path allocation */
2002 #ifdef UMA_DEBUG_ALLOC_1
2003 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2004 #endif
2005 	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2006 	    zone->uz_name, flags);
2007 
2008 	if (flags & M_WAITOK) {
2009 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2010 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2011 	}
2012 #ifdef DEBUG_MEMGUARD
2013 	if (memguard_cmp_zone(zone)) {
2014 		item = memguard_alloc(zone->uz_size, flags);
2015 		if (item != NULL) {
2016 			/*
2017 			 * Avoid conflict with the use-after-free
2018 			 * protecting infrastructure from INVARIANTS.
2019 			 */
2020 			if (zone->uz_init != NULL &&
2021 			    zone->uz_init != mtrash_init &&
2022 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2023 				return (NULL);
2024 			if (zone->uz_ctor != NULL &&
2025 			    zone->uz_ctor != mtrash_ctor &&
2026 			    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2027 			    	zone->uz_fini(item, zone->uz_size);
2028 				return (NULL);
2029 			}
2030 			return (item);
2031 		}
2032 		/* This is unfortunate but should not be fatal. */
2033 	}
2034 #endif
2035 	/*
2036 	 * If possible, allocate from the per-CPU cache.  There are two
2037 	 * requirements for safe access to the per-CPU cache: (1) the thread
2038 	 * accessing the cache must not be preempted or yield during access,
2039 	 * and (2) the thread must not migrate CPUs without switching which
2040 	 * cache it accesses.  We rely on a critical section to prevent
2041 	 * preemption and migration.  We release the critical section in
2042 	 * order to acquire the zone mutex if we are unable to allocate from
2043 	 * the current cache; when we re-acquire the critical section, we
2044 	 * must detect and handle migration if it has occurred.
2045 	 */
2046 zalloc_restart:
2047 	critical_enter();
2048 	cpu = curcpu;
2049 	cache = &zone->uz_cpu[cpu];
2050 
2051 zalloc_start:
2052 	bucket = cache->uc_allocbucket;
2053 
2054 	if (bucket) {
2055 		if (bucket->ub_cnt > 0) {
2056 			bucket->ub_cnt--;
2057 			item = bucket->ub_bucket[bucket->ub_cnt];
2058 #ifdef INVARIANTS
2059 			bucket->ub_bucket[bucket->ub_cnt] = NULL;
2060 #endif
2061 			KASSERT(item != NULL,
2062 			    ("uma_zalloc: Bucket pointer mangled."));
2063 			cache->uc_allocs++;
2064 			critical_exit();
2065 #ifdef INVARIANTS
2066 			ZONE_LOCK(zone);
2067 			uma_dbg_alloc(zone, NULL, item);
2068 			ZONE_UNLOCK(zone);
2069 #endif
2070 			if (zone->uz_ctor != NULL) {
2071 				if (zone->uz_ctor(item, zone->uz_size,
2072 				    udata, flags) != 0) {
2073 					zone_free_item(zone, item, udata,
2074 					    SKIP_DTOR, ZFREE_STATFAIL |
2075 					    ZFREE_STATFREE);
2076 					return (NULL);
2077 				}
2078 			}
2079 			if (flags & M_ZERO)
2080 				bzero(item, zone->uz_size);
2081 			return (item);
2082 		} else if (cache->uc_freebucket) {
2083 			/*
2084 			 * We have run out of items in our allocbucket.
2085 			 * See if we can switch with our free bucket.
2086 			 */
2087 			if (cache->uc_freebucket->ub_cnt > 0) {
2088 #ifdef UMA_DEBUG_ALLOC
2089 				printf("uma_zalloc: Swapping empty with"
2090 				    " alloc.\n");
2091 #endif
2092 				bucket = cache->uc_freebucket;
2093 				cache->uc_freebucket = cache->uc_allocbucket;
2094 				cache->uc_allocbucket = bucket;
2095 
2096 				goto zalloc_start;
2097 			}
2098 		}
2099 	}
2100 	/*
2101 	 * The attempt to retrieve the item from the per-CPU cache has failed, so
2102 	 * we must go back to the zone.  This requires the zone lock, so we
2103 	 * must drop the critical section, then re-acquire it when we go back
2104 	 * to the cache.  Since the critical section is released, we may be
2105 	 * preempted or migrate.  As such, make sure not to maintain any
2106 	 * thread-local state specific to the cache from prior to releasing
2107 	 * the critical section.
2108 	 */
2109 	critical_exit();
2110 	ZONE_LOCK(zone);
2111 	critical_enter();
2112 	cpu = curcpu;
2113 	cache = &zone->uz_cpu[cpu];
2114 	bucket = cache->uc_allocbucket;
2115 	if (bucket != NULL) {
2116 		if (bucket->ub_cnt > 0) {
2117 			ZONE_UNLOCK(zone);
2118 			goto zalloc_start;
2119 		}
2120 		bucket = cache->uc_freebucket;
2121 		if (bucket != NULL && bucket->ub_cnt > 0) {
2122 			ZONE_UNLOCK(zone);
2123 			goto zalloc_start;
2124 		}
2125 	}
2126 
2127 	/* Since we have locked the zone we may as well send back our stats */
2128 	zone->uz_allocs += cache->uc_allocs;
2129 	cache->uc_allocs = 0;
2130 	zone->uz_frees += cache->uc_frees;
2131 	cache->uc_frees = 0;
2132 
2133 	/* Our old one is now a free bucket */
2134 	if (cache->uc_allocbucket) {
2135 		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
2136 		    ("uma_zalloc_arg: Freeing a non free bucket."));
2137 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
2138 		    cache->uc_allocbucket, ub_link);
2139 		cache->uc_allocbucket = NULL;
2140 	}
2141 
2142 	/* Check the free list for a new alloc bucket */
2143 	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
2144 		KASSERT(bucket->ub_cnt != 0,
2145 		    ("uma_zalloc_arg: Returning an empty bucket."));
2146 
2147 		LIST_REMOVE(bucket, ub_link);
2148 		cache->uc_allocbucket = bucket;
2149 		ZONE_UNLOCK(zone);
2150 		goto zalloc_start;
2151 	}
2152 	/* We are no longer associated with this CPU. */
2153 	critical_exit();
2154 
2155 	/* Bump up our uz_count so we get here less often. */
2156 	if (zone->uz_count < BUCKET_MAX)
2157 		zone->uz_count++;
2158 
2159 	/*
2160 	 * Now let's just fill a bucket and put it on the free list.  If that
2161 	 * works we'll restart the allocation from the beginning.
2162 	 */
2163 	if (zone_alloc_bucket(zone, flags)) {
2164 		ZONE_UNLOCK(zone);
2165 		goto zalloc_restart;
2166 	}
2167 	ZONE_UNLOCK(zone);
2168 	/*
2169 	 * We may not be able to get a bucket so return an actual item.
2170 	 */
2171 #ifdef UMA_DEBUG
2172 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2173 #endif
2174 
2175 	item = zone_alloc_item(zone, udata, flags);
2176 	return (item);
2177 }
2178 
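/*
 * Fetch a slab with at least one free item from the keg, allocating a new
 * slab when none is available.  May sleep on the keg when its page limit
 * has been reached unless M_NOWAIT or M_NOVM is specified.  Called with the
 * keg locked; returns with it locked.
 */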
2179 static uma_slab_t
2180 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2181 {
2182 	uma_slab_t slab;
2183 
2184 	mtx_assert(&keg->uk_lock, MA_OWNED);
2185 	slab = NULL;
2186 
2187 	for (;;) {
2188 		/*
2189 		 * Find a slab with some space.  Prefer slabs that are partially
2190 		 * used over those that are totally full.  This helps to reduce
2191 		 * fragmentation.
2192 		 */
2193 		if (keg->uk_free != 0) {
2194 			if (!LIST_EMPTY(&keg->uk_part_slab)) {
2195 				slab = LIST_FIRST(&keg->uk_part_slab);
2196 			} else {
2197 				slab = LIST_FIRST(&keg->uk_free_slab);
2198 				LIST_REMOVE(slab, us_link);
2199 				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2200 				    us_link);
2201 			}
2202 			MPASS(slab->us_keg == keg);
2203 			return (slab);
2204 		}
2205 
2206 		/*
2207 		 * M_NOVM means don't ask at all!
2208 		 */
2209 		if (flags & M_NOVM)
2210 			break;
2211 
2212 		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2213 			keg->uk_flags |= UMA_ZFLAG_FULL;
2214 			/*
2215 			 * If this is not a multi-zone, set the FULL bit.
2216 			 * Otherwise zone_fetch_slab_multi() takes care of it.
2217 			 */
2218 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2219 				zone->uz_flags |= UMA_ZFLAG_FULL;
2220 				zone_log_warning(zone);
2221 			}
2222 			if (flags & M_NOWAIT)
2223 				break;
2224 			zone->uz_sleeps++;
2225 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2226 			continue;
2227 		}
2228 		keg->uk_recurse++;
2229 		slab = keg_alloc_slab(keg, zone, flags);
2230 		keg->uk_recurse--;
2231 		/*
2232 		 * If we got a slab here it's safe to mark it partially used
2233 		 * and return.  We assume that the caller is going to remove
2234 		 * at least one item.
2235 		 */
2236 		if (slab) {
2237 			MPASS(slab->us_keg == keg);
2238 			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2239 			return (slab);
2240 		}
2241 		/*
2242 		 * We might not have been able to get a slab but another CPU
2243 		 * could have while we were unlocked.  Check again before we
2244 		 * fail.
2245 		 */
2246 		flags |= M_NOVM;
2247 	}
2248 	return (slab);
2249 }
2250 
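/*
 * zone_relock() drops the keg lock and picks up the zone lock; keg_relock()
 * does the opposite.  When the zone shares its lock with the keg, the lock
 * is simply kept held.
 */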
2251 static inline void
2252 zone_relock(uma_zone_t zone, uma_keg_t keg)
2253 {
2254 	if (zone->uz_lock != &keg->uk_lock) {
2255 		KEG_UNLOCK(keg);
2256 		ZONE_LOCK(zone);
2257 	}
2258 }
2259 
2260 static inline void
2261 keg_relock(uma_keg_t keg, uma_zone_t zone)
2262 {
2263 	if (zone->uz_lock != &keg->uk_lock) {
2264 		ZONE_UNLOCK(zone);
2265 		KEG_LOCK(keg);
2266 	}
2267 }
2268 
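/*
 * Fetch a slab from the supplied keg, or from the zone's first keg if none
 * is given, looping until one is available unless M_NOWAIT or M_NOVM is set.
 */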
2269 static uma_slab_t
2270 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2271 {
2272 	uma_slab_t slab;
2273 
2274 	if (keg == NULL)
2275 		keg = zone_first_keg(zone);
2276 	/*
2277 	 * This is to prevent us from recursively trying to allocate
2278 	 * buckets.  The problem is that if an allocation forces us to
2279 	 * grab a new bucket we will call page_alloc, which will go off
2280 	 * and cause the VM to allocate vm_map_entries.  If we need new
2281 	 * buckets there too we will recurse in kmem_alloc and bad
2282 	 * things happen.  So instead we return NULL here, and make
2283 	 * the code that allocates buckets smart enough to deal with it.
2284 	 */
2285 	if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
2286 		return (NULL);
2287 
2288 	for (;;) {
2289 		slab = keg_fetch_slab(keg, zone, flags);
2290 		if (slab)
2291 			return (slab);
2292 		if (flags & (M_NOWAIT | M_NOVM))
2293 			break;
2294 	}
2295 	return (NULL);
2296 }
2297 
2298 /*
2299  * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2300  * with the keg locked.  The caller must call zone_relock() afterwards if the
2301  * zone lock is required.  On a NULL return the zone lock is held instead.
2302  *
2303  * The last pointer is used to seed the search.  It is not required.
2304  */
2305 static uma_slab_t
2306 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2307 {
2308 	uma_klink_t klink;
2309 	uma_slab_t slab;
2310 	uma_keg_t keg;
2311 	int flags;
2312 	int empty;
2313 	int full;
2314 
2315 	/*
2316 	 * Don't wait on the first pass.  This will skip limit tests
2317 	 * as well.  We don't want to block if we can find a provider
2318 	 * without blocking.
2319 	 */
2320 	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2321 	/*
2322 	 * Use the last slab allocated as a hint for where to start
2323 	 * the search.
2324 	 */
2325 	if (last) {
2326 		slab = keg_fetch_slab(last, zone, flags);
2327 		if (slab)
2328 			return (slab);
2329 		zone_relock(zone, last);
2330 		last = NULL;
2331 	}
2332 	/*
2333 	 * Loop until we have a slab in case of transient failures
2334 	 * while M_WAITOK is specified.  I'm not sure this is 100%
2335 	 * required but we've done it for so long now.
2336 	 */
2337 	for (;;) {
2338 		empty = 0;
2339 		full = 0;
2340 		/*
2341 		 * Search the available kegs for slabs.  Be careful to hold the
2342 		 * correct lock while calling into the keg layer.
2343 		 */
2344 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2345 			keg = klink->kl_keg;
2346 			keg_relock(keg, zone);
2347 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2348 				slab = keg_fetch_slab(keg, zone, flags);
2349 				if (slab)
2350 					return (slab);
2351 			}
2352 			if (keg->uk_flags & UMA_ZFLAG_FULL)
2353 				full++;
2354 			else
2355 				empty++;
2356 			zone_relock(zone, keg);
2357 		}
2358 		if (rflags & (M_NOWAIT | M_NOVM))
2359 			break;
2360 		flags = rflags;
2361 		/*
2362 		 * All kegs are full.  XXX We can't atomically check all kegs
2363 		 * and sleep so just sleep for a short period and retry.
2364 		 */
2365 		if (full && !empty) {
2366 			zone->uz_flags |= UMA_ZFLAG_FULL;
2367 			zone->uz_sleeps++;
2368 			zone_log_warning(zone);
2369 			msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
2370 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
2371 			continue;
2372 		}
2373 	}
2374 	return (NULL);
2375 }
2376 
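/*
 * Remove the first free item from a slab and return it, updating the slab
 * and keg free counts.  The slab is moved to the keg's full list when its
 * last free item is taken.  The keg lock must be held.
 */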
2377 static void *
2378 slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
2379 {
2380 	uma_keg_t keg;
2381 	uma_slabrefcnt_t slabref;
2382 	void *item;
2383 	u_int8_t freei;
2384 
2385 	keg = slab->us_keg;
2386 	mtx_assert(&keg->uk_lock, MA_OWNED);
2387 
2388 	freei = slab->us_firstfree;
2389 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
2390 		slabref = (uma_slabrefcnt_t)slab;
2391 		slab->us_firstfree = slabref->us_freelist[freei].us_item;
2392 	} else {
2393 		slab->us_firstfree = slab->us_freelist[freei].us_item;
2394 	}
2395 	item = slab->us_data + (keg->uk_rsize * freei);
2396 
2397 	slab->us_freecount--;
2398 	keg->uk_free--;
2399 #ifdef INVARIANTS
2400 	uma_dbg_alloc(zone, slab, item);
2401 #endif
2402 	/* Move this slab to the full list */
2403 	if (slab->us_freecount == 0) {
2404 		LIST_REMOVE(slab, us_link);
2405 		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2406 	}
2407 
2408 	return (item);
2409 }
2410 
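/*
 * Fill a bucket with items drawn from the zone's kegs and place it on the
 * zone's full-bucket list, running the zone's init on each new item.
 * Returns 1 on success and 0 if no bucket could be filled.  Called with the
 * zone locked.
 */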
2411 static int
2412 zone_alloc_bucket(uma_zone_t zone, int flags)
2413 {
2414 	uma_bucket_t bucket;
2415 	uma_slab_t slab;
2416 	uma_keg_t keg;
2417 	int16_t saved;
2418 	int max, origflags = flags;
2419 
2420 	/*
2421 	 * Try this zone's free list first so we don't allocate extra buckets.
2422 	 */
2423 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2424 		KASSERT(bucket->ub_cnt == 0,
2425 		    ("zone_alloc_bucket: Bucket on free list is not empty."));
2426 		LIST_REMOVE(bucket, ub_link);
2427 	} else {
2428 		int bflags;
2429 
2430 		bflags = (flags & ~M_ZERO);
2431 		if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2432 			bflags |= M_NOVM;
2433 
2434 		ZONE_UNLOCK(zone);
2435 		bucket = bucket_alloc(zone->uz_count, bflags);
2436 		ZONE_LOCK(zone);
2437 	}
2438 
2439 	if (bucket == NULL) {
2440 		return (0);
2441 	}
2442 
2443 #ifdef SMP
2444 	/*
2445 	 * This code is here to limit the number of simultaneous bucket fills
2446 	 * for any given zone to the number of per-CPU caches in this zone.  This
2447 	 * is done so that we don't allocate more memory than we really need.
2448 	 */
2449 	if (zone->uz_fills >= mp_ncpus)
2450 		goto done;
2451 
2452 #endif
2453 	zone->uz_fills++;
2454 
2455 	max = MIN(bucket->ub_entries, zone->uz_count);
2456 	/* Try to keep the buckets totally full */
2457 	saved = bucket->ub_cnt;
2458 	slab = NULL;
2459 	keg = NULL;
2460 	while (bucket->ub_cnt < max &&
2461 	    (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
2462 		keg = slab->us_keg;
2463 		while (slab->us_freecount && bucket->ub_cnt < max) {
2464 			bucket->ub_bucket[bucket->ub_cnt++] =
2465 			    slab_alloc_item(zone, slab);
2466 		}
2467 
2468 		/* Don't block on the next fill */
2469 		flags |= M_NOWAIT;
2470 	}
2471 	if (slab)
2472 		zone_relock(zone, keg);
2473 
2474 	/*
2475 	 * We unlock here because we need to call the zone's init.
2476 	 * It should be safe to unlock because the slab dealt with
2477 	 * above is already on the appropriate list within the keg
2478 	 * and the bucket we filled is not yet on any list, so we
2479 	 * own it.
2480 	 */
2481 	if (zone->uz_init != NULL) {
2482 		int i;
2483 
2484 		ZONE_UNLOCK(zone);
2485 		for (i = saved; i < bucket->ub_cnt; i++)
2486 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2487 			    origflags) != 0)
2488 				break;
2489 		/*
2490 		 * If we couldn't initialize the whole bucket, put the
2491 		 * rest back onto the freelist.
2492 		 */
2493 		if (i != bucket->ub_cnt) {
2494 			int j;
2495 
2496 			for (j = i; j < bucket->ub_cnt; j++) {
2497 				zone_free_item(zone, bucket->ub_bucket[j],
2498 				    NULL, SKIP_FINI, 0);
2499 #ifdef INVARIANTS
2500 				bucket->ub_bucket[j] = NULL;
2501 #endif
2502 			}
2503 			bucket->ub_cnt = i;
2504 		}
2505 		ZONE_LOCK(zone);
2506 	}
2507 
2508 	zone->uz_fills--;
2509 	if (bucket->ub_cnt != 0) {
2510 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
2511 		    bucket, ub_link);
2512 		return (1);
2513 	}
2514 #ifdef SMP
2515 done:
2516 #endif
2517 	bucket_free(bucket);
2518 
2519 	return (0);
2520 }
2521 /*
2522  * Allocates an item for an internal zone
2523  *
2524  * Arguments
2525  *	zone   The zone to alloc for.
2526  *	udata  The data to be passed to the constructor.
2527  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2528  *
2529  * Returns
2530  *	NULL if there is no memory and M_NOWAIT is set
2531  *	An item if successful
2532  */
2533 
2534 static void *
2535 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2536 {
2537 	uma_slab_t slab;
2538 	void *item;
2539 
2540 	item = NULL;
2541 
2542 #ifdef UMA_DEBUG_ALLOC
2543 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2544 #endif
2545 	ZONE_LOCK(zone);
2546 
2547 	slab = zone->uz_slab(zone, NULL, flags);
2548 	if (slab == NULL) {
2549 		zone->uz_fails++;
2550 		ZONE_UNLOCK(zone);
2551 		return (NULL);
2552 	}
2553 
2554 	item = slab_alloc_item(zone, slab);
2555 
2556 	zone_relock(zone, slab->us_keg);
2557 	zone->uz_allocs++;
2558 	ZONE_UNLOCK(zone);
2559 
2560 	/*
2561 	 * We have to call both the zone's init (not the keg's init)
2562 	 * and the zone's ctor.  This is because the item is going from
2563 	 * a keg slab directly to the user, and the user is expecting it
2564 	 * to be both zone-init'd as well as zone-ctor'd.
2565 	 */
2566 	if (zone->uz_init != NULL) {
2567 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2568 			zone_free_item(zone, item, udata, SKIP_FINI,
2569 			    ZFREE_STATFAIL | ZFREE_STATFREE);
2570 			return (NULL);
2571 		}
2572 	}
2573 	if (zone->uz_ctor != NULL) {
2574 		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2575 			zone_free_item(zone, item, udata, SKIP_DTOR,
2576 			    ZFREE_STATFAIL | ZFREE_STATFREE);
2577 			return (NULL);
2578 		}
2579 	}
2580 	if (flags & M_ZERO)
2581 		bzero(item, zone->uz_size);
2582 
2583 	return (item);
2584 }
2585 
2586 /* See uma.h */
2587 void
2588 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2589 {
2590 	uma_cache_t cache;
2591 	uma_bucket_t bucket;
2592 	int bflags;
2593 	int cpu;
2594 
2595 #ifdef UMA_DEBUG_ALLOC_1
2596 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2597 #endif
2598 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2599 	    zone->uz_name);
2600 
2601 	/* uma_zfree(..., NULL) does nothing, to match free(9). */
2602 	if (item == NULL)
2603 		return;
2604 #ifdef DEBUG_MEMGUARD
2605 	if (is_memguard_addr(item)) {
2606 		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
2607 			zone->uz_dtor(item, zone->uz_size, udata);
2608 		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
2609 			zone->uz_fini(item, zone->uz_size);
2610 		memguard_free(item);
2611 		return;
2612 	}
2613 #endif
2614 	if (zone->uz_dtor)
2615 		zone->uz_dtor(item, zone->uz_size, udata);
2616 
2617 #ifdef INVARIANTS
2618 	ZONE_LOCK(zone);
2619 	if (zone->uz_flags & UMA_ZONE_MALLOC)
2620 		uma_dbg_free(zone, udata, item);
2621 	else
2622 		uma_dbg_free(zone, NULL, item);
2623 	ZONE_UNLOCK(zone);
2624 #endif
2625 	/*
2626 	 * The race here is acceptable.  If we miss it we'll just have to wait
2627 	 * a little longer for the limits to be reset.
2628 	 */
2629 	if (zone->uz_flags & UMA_ZFLAG_FULL)
2630 		goto zfree_internal;
2631 
2632 	/*
2633 	 * If possible, free to the per-CPU cache.  There are two
2634 	 * requirements for safe access to the per-CPU cache: (1) the thread
2635 	 * accessing the cache must not be preempted or yield during access,
2636 	 * and (2) the thread must not migrate CPUs without switching which
2637 	 * cache it accesses.  We rely on a critical section to prevent
2638 	 * preemption and migration.  We release the critical section in
2639 	 * order to acquire the zone mutex if we are unable to free to the
2640 	 * current cache; when we re-acquire the critical section, we must
2641 	 * detect and handle migration if it has occurred.
2642 	 */
2643 zfree_restart:
2644 	critical_enter();
2645 	cpu = curcpu;
2646 	cache = &zone->uz_cpu[cpu];
2647 
2648 zfree_start:
2649 	bucket = cache->uc_freebucket;
2650 
2651 	if (bucket) {
2652 		/*
2653 		 * Do we have room in our bucket? It is OK for this uz count
2654 		 * check to be slightly out of sync.
2655 		 */
2656 
2657 		if (bucket->ub_cnt < bucket->ub_entries) {
2658 			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2659 			    ("uma_zfree: Freeing to non free bucket index."));
2660 			bucket->ub_bucket[bucket->ub_cnt] = item;
2661 			bucket->ub_cnt++;
2662 			cache->uc_frees++;
2663 			critical_exit();
2664 			return;
2665 		} else if (cache->uc_allocbucket) {
2666 #ifdef UMA_DEBUG_ALLOC
2667 			printf("uma_zfree: Swapping buckets.\n");
2668 #endif
2669 			/*
2670 			 * We have run out of space in our freebucket.
2671 			 * See if we can switch with our alloc bucket.
2672 			 */
2673 			if (cache->uc_allocbucket->ub_cnt <
2674 			    cache->uc_freebucket->ub_cnt) {
2675 				bucket = cache->uc_freebucket;
2676 				cache->uc_freebucket = cache->uc_allocbucket;
2677 				cache->uc_allocbucket = bucket;
2678 				goto zfree_start;
2679 			}
2680 		}
2681 	}
2682 	/*
2683 	 * We can get here for two reasons:
2684 	 *
2685 	 * 1) The buckets are NULL
2686 	 * 2) The alloc and free buckets are both somewhat full.
2687 	 *
2688 	 * We must go back to the zone, which requires acquiring the zone lock,
2689 	 * which in turn means we must release and re-acquire the critical
2690 	 * section.  Since the critical section is released, we may be
2691 	 * preempted or migrate.  As such, make sure not to maintain any
2692 	 * thread-local state specific to the cache from prior to releasing
2693 	 * the critical section.
2694 	 */
2695 	critical_exit();
2696 	ZONE_LOCK(zone);
2697 	critical_enter();
2698 	cpu = curcpu;
2699 	cache = &zone->uz_cpu[cpu];
2700 	if (cache->uc_freebucket != NULL) {
2701 		if (cache->uc_freebucket->ub_cnt <
2702 		    cache->uc_freebucket->ub_entries) {
2703 			ZONE_UNLOCK(zone);
2704 			goto zfree_start;
2705 		}
2706 		if (cache->uc_allocbucket != NULL &&
2707 		    (cache->uc_allocbucket->ub_cnt <
2708 		    cache->uc_freebucket->ub_cnt)) {
2709 			ZONE_UNLOCK(zone);
2710 			goto zfree_start;
2711 		}
2712 	}
2713 
2714 	/* Since we have locked the zone we may as well send back our stats */
2715 	zone->uz_allocs += cache->uc_allocs;
2716 	cache->uc_allocs = 0;
2717 	zone->uz_frees += cache->uc_frees;
2718 	cache->uc_frees = 0;
2719 
2720 	bucket = cache->uc_freebucket;
2721 	cache->uc_freebucket = NULL;
2722 
2723 	/* Can we throw this on the zone full list? */
2724 	if (bucket != NULL) {
2725 #ifdef UMA_DEBUG_ALLOC
2726 		printf("uma_zfree: Putting old bucket on the free list.\n");
2727 #endif
2728 		/* ub_cnt is pointing to the last free item */
2729 		KASSERT(bucket->ub_cnt != 0,
2730 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2731 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
2732 		    bucket, ub_link);
2733 	}
2734 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2735 		LIST_REMOVE(bucket, ub_link);
2736 		ZONE_UNLOCK(zone);
2737 		cache->uc_freebucket = bucket;
2738 		goto zfree_start;
2739 	}
2740 	/* We are no longer associated with this CPU. */
2741 	critical_exit();
2742 
2743 	/* And the zone. */
2744 	ZONE_UNLOCK(zone);
2745 
2746 #ifdef UMA_DEBUG_ALLOC
2747 	printf("uma_zfree: Allocating new free bucket.\n");
2748 #endif
2749 	bflags = M_NOWAIT;
2750 
2751 	if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
2752 		bflags |= M_NOVM;
2753 	bucket = bucket_alloc(zone->uz_count, bflags);
2754 	if (bucket) {
2755 		ZONE_LOCK(zone);
2756 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
2757 		    bucket, ub_link);
2758 		ZONE_UNLOCK(zone);
2759 		goto zfree_restart;
2760 	}
2761 
2762 	/*
2763 	 * If nothing else caught this, we'll just do an internal free.
2764 	 */
2765 zfree_internal:
2766 	zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
2767 
2768 	return;
2769 }
2770 
2771 /*
2772  * Frees an item back to the keg's slab layer, bypassing the per-CPU caches
2773  *
2774  * Arguments:
2775  *	zone   The zone to free to
2776  *	item   The item we're freeing
2777  *	udata  User supplied data for the dtor
2778  *	skip   Skip dtors and finis
2779  */
2780 static void
2781 zone_free_item(uma_zone_t zone, void *item, void *udata,
2782     enum zfreeskip skip, int flags)
2783 {
2784 	uma_slab_t slab;
2785 	uma_slabrefcnt_t slabref;
2786 	uma_keg_t keg;
2787 	u_int8_t *mem;
2788 	u_int8_t freei;
2789 	int clearfull;
2790 
2791 	if (skip < SKIP_DTOR && zone->uz_dtor)
2792 		zone->uz_dtor(item, zone->uz_size, udata);
2793 
2794 	if (skip < SKIP_FINI && zone->uz_fini)
2795 		zone->uz_fini(item, zone->uz_size);
2796 
2797 	ZONE_LOCK(zone);
2798 
2799 	if (flags & ZFREE_STATFAIL)
2800 		zone->uz_fails++;
2801 	if (flags & ZFREE_STATFREE)
2802 		zone->uz_frees++;
2803 
2804 	if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2805 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2806 		keg = zone_first_keg(zone); /* Must only be one. */
2807 		if (zone->uz_flags & UMA_ZONE_HASH) {
2808 			slab = hash_sfind(&keg->uk_hash, mem);
2809 		} else {
2810 			mem += keg->uk_pgoff;
2811 			slab = (uma_slab_t)mem;
2812 		}
2813 	} else {
2814 		/* This prevents redundant lookups via free(). */
2815 		if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
2816 			slab = (uma_slab_t)udata;
2817 		else
2818 			slab = vtoslab((vm_offset_t)item);
2819 		keg = slab->us_keg;
2820 		keg_relock(keg, zone);
2821 	}
2822 	MPASS(keg == slab->us_keg);
2823 
2824 	/* Do we need to remove from any lists? */
2825 	if (slab->us_freecount+1 == keg->uk_ipers) {
2826 		LIST_REMOVE(slab, us_link);
2827 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2828 	} else if (slab->us_freecount == 0) {
2829 		LIST_REMOVE(slab, us_link);
2830 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2831 	}
2832 
2833 	/* Return the item to the slab's free list. */
2834 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
2835 		/ keg->uk_rsize;
2836 
2837 #ifdef INVARIANTS
2838 	if (!skip)
2839 		uma_dbg_free(zone, slab, item);
2840 #endif
2841 
2842 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
2843 		slabref = (uma_slabrefcnt_t)slab;
2844 		slabref->us_freelist[freei].us_item = slab->us_firstfree;
2845 	} else {
2846 		slab->us_freelist[freei].us_item = slab->us_firstfree;
2847 	}
2848 	slab->us_firstfree = freei;
2849 	slab->us_freecount++;
2850 
2851 	/* Keg statistics */
2852 	keg->uk_free++;
2853 
2854 	clearfull = 0;
2855 	if (keg->uk_flags & UMA_ZFLAG_FULL) {
2856 		if (keg->uk_pages < keg->uk_maxpages) {
2857 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
2858 			clearfull = 1;
2859 		}
2860 
2861 		/*
2862 		 * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
2863 		 * wake up all procs blocked on pages. This should be uncommon, so
2864 		 * keeping this simple for now (rather than adding count of blocked
2865 		 * threads etc).
2866 		 */
2867 		wakeup(keg);
2868 	}
2869 	if (clearfull) {
2870 		zone_relock(zone, keg);
2871 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
2872 		wakeup(zone);
2873 		ZONE_UNLOCK(zone);
2874 	} else
2875 		KEG_UNLOCK(keg);
2876 }
2877 
2878 /* See uma.h */
2879 int
2880 uma_zone_set_max(uma_zone_t zone, int nitems)
2881 {
2882 	uma_keg_t keg;
2883 
2884 	ZONE_LOCK(zone);
2885 	keg = zone_first_keg(zone);
2886 	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2887 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
2888 		keg->uk_maxpages += keg->uk_ppera;
2889 	nitems = keg->uk_maxpages * keg->uk_ipers;
2890 	ZONE_UNLOCK(zone);
2891 
2892 	return (nitems);
2893 }
2894 
2895 /* See uma.h */
2896 int
2897 uma_zone_get_max(uma_zone_t zone)
2898 {
2899 	int nitems;
2900 	uma_keg_t keg;
2901 
2902 	ZONE_LOCK(zone);
2903 	keg = zone_first_keg(zone);
2904 	nitems = keg->uk_maxpages * keg->uk_ipers;
2905 	ZONE_UNLOCK(zone);
2906 
2907 	return (nitems);
2908 }
2909 
2910 /* See uma.h */
2911 void
2912 uma_zone_set_warning(uma_zone_t zone, const char *warning)
2913 {
2914 
2915 	ZONE_LOCK(zone);
2916 	zone->uz_warning = warning;
2917 	ZONE_UNLOCK(zone);
2918 }
2919 
2920 /* See uma.h */
2921 int
2922 uma_zone_get_cur(uma_zone_t zone)
2923 {
2924 	int64_t nitems;
2925 	u_int i;
2926 
2927 	ZONE_LOCK(zone);
2928 	nitems = zone->uz_allocs - zone->uz_frees;
2929 	CPU_FOREACH(i) {
2930 		/*
2931 		 * See the comment in sysctl_vm_zone_stats() regarding the
2932 		 * safety of accessing the per-cpu caches. With the zone lock
2933 		 * held, it is safe, but can potentially result in stale data.
2934 		 */
2935 		nitems += zone->uz_cpu[i].uc_allocs -
2936 		    zone->uz_cpu[i].uc_frees;
2937 	}
2938 	ZONE_UNLOCK(zone);
2939 
2940 	return (nitems < 0 ? 0 : nitems);
2941 }
2942 
2943 /* See uma.h */
2944 void
2945 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2946 {
2947 	uma_keg_t keg;
2948 
2949 	ZONE_LOCK(zone);
2950 	keg = zone_first_keg(zone);
2951 	KASSERT(keg->uk_pages == 0,
2952 	    ("uma_zone_set_init on non-empty keg"));
2953 	keg->uk_init = uminit;
2954 	ZONE_UNLOCK(zone);
2955 }
2956 
2957 /* See uma.h */
2958 void
2959 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2960 {
2961 	uma_keg_t keg;
2962 
2963 	ZONE_LOCK(zone);
2964 	keg = zone_first_keg(zone);
2965 	KASSERT(keg->uk_pages == 0,
2966 	    ("uma_zone_set_fini on non-empty keg"));
2967 	keg->uk_fini = fini;
2968 	ZONE_UNLOCK(zone);
2969 }
2970 
2971 /* See uma.h */
2972 void
2973 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2974 {
2975 	ZONE_LOCK(zone);
2976 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
2977 	    ("uma_zone_set_zinit on non-empty keg"));
2978 	zone->uz_init = zinit;
2979 	ZONE_UNLOCK(zone);
2980 }
2981 
2982 /* See uma.h */
2983 void
2984 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
2985 {
2986 	ZONE_LOCK(zone);
2987 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
2988 	    ("uma_zone_set_zfini on non-empty keg"));
2989 	zone->uz_fini = zfini;
2990 	ZONE_UNLOCK(zone);
2991 }
2992 
2993 /* See uma.h */
2994 /* XXX uk_freef is not actually used with the zone locked */
2995 void
2996 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
2997 {
2998 
2999 	ZONE_LOCK(zone);
3000 	zone_first_keg(zone)->uk_freef = freef;
3001 	ZONE_UNLOCK(zone);
3002 }
3003 
3004 /* See uma.h */
3005 /* XXX uk_allocf is not actually used with the zone locked */
3006 void
3007 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3008 {
3009 	uma_keg_t keg;
3010 
3011 	ZONE_LOCK(zone);
3012 	keg = zone_first_keg(zone);
3013 	keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
3014 	keg->uk_allocf = allocf;
3015 	ZONE_UNLOCK(zone);
3016 }
3017 
3018 /* See uma.h */
3019 int
3020 uma_zone_reserve_kva(uma_zone_t zone, int count)
3021 {
3022 	uma_keg_t keg;
3023 	vm_offset_t kva;
3024 	int pages;
3025 
3026 	keg = zone_first_keg(zone);
3027 	pages = count / keg->uk_ipers;
3028 
3029 	if (pages * keg->uk_ipers < count)
3030 		pages++;
3031 
3032 #ifdef UMA_MD_SMALL_ALLOC
3033 	if (keg->uk_ppera > 1) {
3034 #else
3035 	if (1) {
3036 #endif
3037 		kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
3038 		if (kva == 0)
3039 			return (0);
3040 	} else
3041 		kva = 0;
3042 	ZONE_LOCK(zone);
3043 	keg->uk_kva = kva;
3044 	keg->uk_offset = 0;
3045 	keg->uk_maxpages = pages;
3046 #ifdef UMA_MD_SMALL_ALLOC
3047 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3048 #else
3049 	keg->uk_allocf = noobj_alloc;
3050 #endif
3051 	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
3052 	ZONE_UNLOCK(zone);
3053 	return (1);
3054 }
3055 
3056 /* See uma.h */
3057 void
3058 uma_prealloc(uma_zone_t zone, int items)
3059 {
3060 	int slabs;
3061 	uma_slab_t slab;
3062 	uma_keg_t keg;
3063 
3064 	keg = zone_first_keg(zone);
3065 	ZONE_LOCK(zone);
3066 	slabs = items / keg->uk_ipers;
3067 	if (slabs * keg->uk_ipers < items)
3068 		slabs++;
3069 	while (slabs > 0) {
3070 		slab = keg_alloc_slab(keg, zone, M_WAITOK);
3071 		if (slab == NULL)
3072 			break;
3073 		MPASS(slab->us_keg == keg);
3074 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3075 		slabs--;
3076 	}
3077 	ZONE_UNLOCK(zone);
3078 }
3079 
3080 /* See uma.h */
3081 u_int32_t *
3082 uma_find_refcnt(uma_zone_t zone, void *item)
3083 {
3084 	uma_slabrefcnt_t slabref;
3085 	uma_keg_t keg;
3086 	u_int32_t *refcnt;
3087 	int idx;
3088 
3089 	slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
3090 	    (~UMA_SLAB_MASK));
3091 	KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
3092 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
3093 	keg = slabref->us_keg;
3094 	idx = ((unsigned long)item - (unsigned long)slabref->us_data)
3095 	    / keg->uk_rsize;
3096 	refcnt = &slabref->us_freelist[idx].us_refcnt;
3097 	return (refcnt);
3098 }
3099 
3100 /* See uma.h */
3101 void
3102 uma_reclaim(void)
3103 {
3104 #ifdef UMA_DEBUG
3105 	printf("UMA: vm asked us to release pages!\n");
3106 #endif
3107 	bucket_enable();
3108 	zone_foreach(zone_drain);
3109 	/*
3110 	 * These zones were visited early in the pass above, but draining the
3111 	 * other zones may have freed more of their slabs, so drain them again
3112 	 * to release pages that are now empty.  The same goes for buckets.
3113 	 */
3114 	zone_drain(slabzone);
3115 	zone_drain(slabrefzone);
3116 	bucket_zone_drain();
3117 }
3118 
3119 /* See uma.h */
3120 int
3121 uma_zone_exhausted(uma_zone_t zone)
3122 {
3123 	int full;
3124 
3125 	ZONE_LOCK(zone);
3126 	full = (zone->uz_flags & UMA_ZFLAG_FULL);
3127 	ZONE_UNLOCK(zone);
3128 	return (full);
3129 }
3130 
3131 int
3132 uma_zone_exhausted_nolock(uma_zone_t zone)
3133 {
3134 	return (zone->uz_flags & UMA_ZFLAG_FULL);
3135 }
3136 
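/*
 * Back a large multi-page allocation:  allocate an offpage slab header from
 * slabzone and the pages themselves with page_alloc(), then record the
 * mapping with vsetslab() so the slab can later be looked up from the
 * pointer.
 */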
3137 void *
3138 uma_large_malloc(int size, int wait)
3139 {
3140 	void *mem;
3141 	uma_slab_t slab;
3142 	u_int8_t flags;
3143 
3144 	slab = zone_alloc_item(slabzone, NULL, wait);
3145 	if (slab == NULL)
3146 		return (NULL);
3147 	mem = page_alloc(NULL, size, &flags, wait);
3148 	if (mem) {
3149 		vsetslab((vm_offset_t)mem, slab);
3150 		slab->us_data = mem;
3151 		slab->us_flags = flags | UMA_SLAB_MALLOC;
3152 		slab->us_size = size;
3153 	} else {
3154 		zone_free_item(slabzone, slab, NULL, SKIP_NONE,
3155 		    ZFREE_STATFAIL | ZFREE_STATFREE);
3156 	}
3157 
3158 	return (mem);
3159 }
3160 
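/*
 * Release a region obtained with uma_large_malloc() and return its slab
 * header to slabzone.
 */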
3161 void
3162 uma_large_free(uma_slab_t slab)
3163 {
3164 	vsetobj((vm_offset_t)slab->us_data, kmem_object);
3165 	page_free(slab->us_data, slab->us_size, slab->us_flags);
3166 	zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
3167 }
3168 
3169 void
3170 uma_print_stats(void)
3171 {
3172 	zone_foreach(uma_print_zone);
3173 }
3174 
3175 static void
3176 slab_print(uma_slab_t slab)
3177 {
3178 	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
3179 		slab->us_keg, slab->us_data, slab->us_freecount,
3180 		slab->us_firstfree);
3181 }
3182 
3183 static void
3184 cache_print(uma_cache_t cache)
3185 {
3186 	printf("alloc: %p(%d), free: %p(%d)\n",
3187 		cache->uc_allocbucket,
3188 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3189 		cache->uc_freebucket,
3190 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3191 }
3192 
3193 static void
3194 uma_print_keg(uma_keg_t keg)
3195 {
3196 	uma_slab_t slab;
3197 
3198 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3199 	    "out %d free %d limit %d\n",
3200 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3201 	    keg->uk_ipers, keg->uk_ppera,
3202 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3203 	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3204 	printf("Part slabs:\n");
3205 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3206 		slab_print(slab);
3207 	printf("Free slabs:\n");
3208 	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3209 		slab_print(slab);
3210 	printf("Full slabs:\n");
3211 	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3212 		slab_print(slab);
3213 }
3214 
3215 void
3216 uma_print_zone(uma_zone_t zone)
3217 {
3218 	uma_cache_t cache;
3219 	uma_klink_t kl;
3220 	int i;
3221 
3222 	printf("zone: %s(%p) size %d flags %#x\n",
3223 	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3224 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3225 		uma_print_keg(kl->kl_keg);
3226 	CPU_FOREACH(i) {
3227 		cache = &zone->uz_cpu[i];
3228 		printf("CPU %d Cache:\n", i);
3229 		cache_print(cache);
3230 	}
3231 }
3232 
3233 #ifdef DDB
3234 /*
3235  * Generate statistics across both the zone and its per-CPU caches.  Return
3236  * desired statistics if the pointer is non-NULL for that statistic.
3237  *
3238  * Note: does not update the zone statistics, as it can't safely clear the
3239  * per-CPU cache statistic.
3240  *
3241  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3242  * safe from off-CPU; we should modify the caches to track this information
3243  * directly so that we don't have to.
3244  */
3245 static void
3246 uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
3247     u_int64_t *freesp, u_int64_t *sleepsp)
3248 {
3249 	uma_cache_t cache;
3250 	u_int64_t allocs, frees, sleeps;
3251 	int cachefree, cpu;
3252 
3253 	allocs = frees = sleeps = 0;
3254 	cachefree = 0;
3255 	CPU_FOREACH(cpu) {
3256 		cache = &z->uz_cpu[cpu];
3257 		if (cache->uc_allocbucket != NULL)
3258 			cachefree += cache->uc_allocbucket->ub_cnt;
3259 		if (cache->uc_freebucket != NULL)
3260 			cachefree += cache->uc_freebucket->ub_cnt;
3261 		allocs += cache->uc_allocs;
3262 		frees += cache->uc_frees;
3263 	}
3264 	allocs += z->uz_allocs;
3265 	frees += z->uz_frees;
3266 	sleeps += z->uz_sleeps;
3267 	if (cachefreep != NULL)
3268 		*cachefreep = cachefree;
3269 	if (allocsp != NULL)
3270 		*allocsp = allocs;
3271 	if (freesp != NULL)
3272 		*freesp = frees;
3273 	if (sleepsp != NULL)
3274 		*sleepsp = sleeps;
3275 }
3276 #endif /* DDB */
3277 
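/*
 * Sysctl handler that reports the current number of UMA zones across all
 * kegs.
 */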
3278 static int
3279 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3280 {
3281 	uma_keg_t kz;
3282 	uma_zone_t z;
3283 	int count;
3284 
3285 	count = 0;
3286 	mtx_lock(&uma_mtx);
3287 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3288 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3289 			count++;
3290 	}
3291 	mtx_unlock(&uma_mtx);
3292 	return (sysctl_handle_int(oidp, &count, 0, req));
3293 }
3294 
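/*
 * Sysctl handler that exports UMA statistics as a binary stream:  a
 * uma_stream_header followed, for each zone, by a uma_type_header and
 * mp_maxid + 1 uma_percpu_stat records.
 */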
3295 static int
3296 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3297 {
3298 	struct uma_stream_header ush;
3299 	struct uma_type_header uth;
3300 	struct uma_percpu_stat ups;
3301 	uma_bucket_t bucket;
3302 	struct sbuf sbuf;
3303 	uma_cache_t cache;
3304 	uma_klink_t kl;
3305 	uma_keg_t kz;
3306 	uma_zone_t z;
3307 	uma_keg_t k;
3308 	int count, error, i;
3309 
3310 	error = sysctl_wire_old_buffer(req, 0);
3311 	if (error != 0)
3312 		return (error);
3313 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3314 
3315 	count = 0;
3316 	mtx_lock(&uma_mtx);
3317 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3318 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3319 			count++;
3320 	}
3321 
3322 	/*
3323 	 * Insert stream header.
3324 	 */
3325 	bzero(&ush, sizeof(ush));
3326 	ush.ush_version = UMA_STREAM_VERSION;
3327 	ush.ush_maxcpus = (mp_maxid + 1);
3328 	ush.ush_count = count;
3329 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3330 
3331 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3332 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3333 			bzero(&uth, sizeof(uth));
3334 			ZONE_LOCK(z);
3335 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3336 			uth.uth_align = kz->uk_align;
3337 			uth.uth_size = kz->uk_size;
3338 			uth.uth_rsize = kz->uk_rsize;
3339 			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3340 				k = kl->kl_keg;
3341 				uth.uth_maxpages += k->uk_maxpages;
3342 				uth.uth_pages += k->uk_pages;
3343 				uth.uth_keg_free += k->uk_free;
3344 				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3345 				    * k->uk_ipers;
3346 			}
3347 
3348 			/*
3349 			 * A zone is secondary if it is not the first entry
3350 			 * on the keg's zone list.
3351 			 */
3352 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3353 			    (LIST_FIRST(&kz->uk_zones) != z))
3354 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3355 
3356 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3357 				uth.uth_zone_free += bucket->ub_cnt;
3358 			uth.uth_allocs = z->uz_allocs;
3359 			uth.uth_frees = z->uz_frees;
3360 			uth.uth_fails = z->uz_fails;
3361 			uth.uth_sleeps = z->uz_sleeps;
3362 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3363 			/*
3364 			 * While it is not normally safe to access the cache
3365 			 * bucket pointers while not on the CPU that owns the
3366 			 * cache, we only allow the pointers to be exchanged
3367 			 * without the zone lock held, not invalidated, so
3368 			 * accept the possible race associated with bucket
3369 			 * exchange during monitoring.
3370 			 */
3371 			for (i = 0; i < (mp_maxid + 1); i++) {
3372 				bzero(&ups, sizeof(ups));
3373 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3374 					goto skip;
3375 				if (CPU_ABSENT(i))
3376 					goto skip;
3377 				cache = &z->uz_cpu[i];
3378 				if (cache->uc_allocbucket != NULL)
3379 					ups.ups_cache_free +=
3380 					    cache->uc_allocbucket->ub_cnt;
3381 				if (cache->uc_freebucket != NULL)
3382 					ups.ups_cache_free +=
3383 					    cache->uc_freebucket->ub_cnt;
3384 				ups.ups_allocs = cache->uc_allocs;
3385 				ups.ups_frees = cache->uc_frees;
3386 skip:
3387 				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3388 			}
3389 			ZONE_UNLOCK(z);
3390 		}
3391 	}
3392 	mtx_unlock(&uma_mtx);
3393 	error = sbuf_finish(&sbuf);
3394 	sbuf_delete(&sbuf);
3395 	return (error);
3396 }
3397 
3398 #ifdef DDB
3399 DB_SHOW_COMMAND(uma, db_show_uma)
3400 {
3401 	u_int64_t allocs, frees, sleeps;
3402 	uma_bucket_t bucket;
3403 	uma_keg_t kz;
3404 	uma_zone_t z;
3405 	int cachefree;
3406 
3407 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
3408 	    "Requests", "Sleeps");
3409 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3410 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3411 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
3412 				allocs = z->uz_allocs;
3413 				frees = z->uz_frees;
3414 				sleeps = z->uz_sleeps;
3415 				cachefree = 0;
3416 			} else
3417 				uma_zone_sumstat(z, &cachefree, &allocs,
3418 				    &frees, &sleeps);
3419 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
3420 			    (LIST_FIRST(&kz->uk_zones) != z)))
3421 				cachefree += kz->uk_free;
3422 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3423 				cachefree += bucket->ub_cnt;
3424 			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
3425 			    (uintmax_t)kz->uk_size,
3426 			    (intmax_t)(allocs - frees), cachefree,
3427 			    (uintmax_t)allocs, sleeps);
3428 			if (db_pager_quit)
3429 				return;
3430 		}
3431 	}
3432 }
3433 #endif
3434