xref: /freebsd/sys/vm/uma_core.c (revision f856af0466c076beef4ea9b15d088e1119a945b8)
1 /*-
2  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff@FreeBSD.org>
3  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
4  * Copyright (c) 2004-2006 Robert N. M. Watson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * uma_core.c  Implementation of the Universal Memory allocator
31  *
32  * This allocator is intended to replace the multitude of similar object caches
33  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
34  * efficient.  A primary design goal is to return unused memory to the rest of
35  * the system.  This will make the system as a whole more flexible due to the
36  * ability to move memory to subsystems which most need it instead of leaving
37  * pools of reserved memory unused.
38  *
39  * The basic ideas stem from similar slab/zone based allocators whose algorithms
40  * are well known.
41  *
42  */
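
/*
 * For orientation, a typical consumer of the public API declared in
 * <vm/uma.h> creates a zone once and then allocates and frees items from
 * it.  This is only an illustrative sketch; "foo" and struct foo are
 * hypothetical and not defined anywhere in the tree:
 *
 *	static uma_zone_t foo_zone;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK);
 *	...
 *	uma_zfree(foo_zone, fp);
 *	uma_zdestroy(foo_zone);
 */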
43 
44 /*
45  * TODO:
46  *	- Improve memory usage for large allocations
47  *	- Investigate cache size adjustments
48  */
49 
50 #include <sys/cdefs.h>
51 __FBSDID("$FreeBSD$");
52 
53 /* I should really use ktr.. */
54 /*
55 #define UMA_DEBUG 1
56 #define UMA_DEBUG_ALLOC 1
57 #define UMA_DEBUG_ALLOC_1 1
58 */
59 
60 #include "opt_ddb.h"
61 #include "opt_param.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/types.h>
67 #include <sys/queue.h>
68 #include <sys/malloc.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/sysctl.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/sbuf.h>
75 #include <sys/smp.h>
76 #include <sys/vmmeter.h>
77 
78 #include <vm/vm.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_param.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_kern.h>
84 #include <vm/vm_extern.h>
85 #include <vm/uma.h>
86 #include <vm/uma_int.h>
87 #include <vm/uma_dbg.h>
88 
89 #include <machine/vmparam.h>
90 
91 #include <ddb/ddb.h>
92 
93 /*
94  * This is the zone and keg from which all zones are spawned.  The idea is that
95  * even the zone & keg heads are allocated from the allocator, so we use the
96  * bss section to bootstrap us.
97  */
98 static struct uma_keg masterkeg;
99 static struct uma_zone masterzone_k;
100 static struct uma_zone masterzone_z;
101 static uma_zone_t kegs = &masterzone_k;
102 static uma_zone_t zones = &masterzone_z;
103 
104 /* This is the zone from which all of uma_slab_t's are allocated. */
105 static uma_zone_t slabzone;
106 static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
107 
108 /*
109  * The initial hash tables come out of this zone so they can be allocated
110  * prior to malloc coming up.
111  */
112 static uma_zone_t hashzone;
113 
114 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
115 
116 /*
117  * Are we allowed to allocate buckets?
118  */
119 static int bucketdisable = 1;
120 
121 /* Linked list of all kegs in the system */
122 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs);
123 
124 /* This mutex protects the keg list */
125 static struct mtx uma_mtx;
126 
127 /* Linked list of boot time pages */
128 static LIST_HEAD(,uma_slab) uma_boot_pages =
129     LIST_HEAD_INITIALIZER(&uma_boot_pages);
130 
131 /* This mutex protects the boot time pages list */
132 static struct mtx uma_boot_pages_mtx;
133 
134 /* Is the VM done starting up? */
135 static int booted = 0;
136 
137 /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
138 static u_int uma_max_ipers;
139 static u_int uma_max_ipers_ref;
140 
141 /*
142  * This is the handle used to schedule events that need to happen
143  * outside of the allocation fast path.
144  */
145 static struct callout uma_callout;
146 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
147 
148 /*
149  * This structure is passed as the zone ctor arg so that I don't have to create
150  * a special allocation function just for zones.
151  */
152 struct uma_zctor_args {
153 	char *name;
154 	size_t size;
155 	uma_ctor ctor;
156 	uma_dtor dtor;
157 	uma_init uminit;
158 	uma_fini fini;
159 	uma_keg_t keg;
160 	int align;
161 	u_int32_t flags;
162 };
163 
164 struct uma_kctor_args {
165 	uma_zone_t zone;
166 	size_t size;
167 	uma_init uminit;
168 	uma_fini fini;
169 	int align;
170 	u_int32_t flags;
171 };
172 
173 struct uma_bucket_zone {
174 	uma_zone_t	ubz_zone;
175 	char		*ubz_name;
176 	int		ubz_entries;
177 };
178 
179 #define	BUCKET_MAX	128
180 
181 struct uma_bucket_zone bucket_zones[] = {
182 	{ NULL, "16 Bucket", 16 },
183 	{ NULL, "32 Bucket", 32 },
184 	{ NULL, "64 Bucket", 64 },
185 	{ NULL, "128 Bucket", 128 },
186 	{ NULL, NULL, 0}
187 };
188 
189 #define	BUCKET_SHIFT	4
190 #define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
191 
192 /*
193  * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
194  * of approximately the right size.
195  */
196 static uint8_t bucket_size[BUCKET_ZONES];
197 
198 /*
199  * Flags and enumerations to be passed to internal functions.
200  */
201 enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
202 
203 #define	ZFREE_STATFAIL	0x00000001	/* Update zone failure statistic. */
204 #define	ZFREE_STATFREE	0x00000002	/* Update zone free statistic. */
205 
206 /* Prototypes.. */
207 
208 static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
209 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
210 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
211 static void page_free(void *, int, u_int8_t);
212 static uma_slab_t slab_zalloc(uma_zone_t, int);
213 static void cache_drain(uma_zone_t);
214 static void bucket_drain(uma_zone_t, uma_bucket_t);
215 static void bucket_cache_drain(uma_zone_t zone);
216 static int keg_ctor(void *, int, void *, int);
217 static void keg_dtor(void *, int, void *);
218 static int zone_ctor(void *, int, void *, int);
219 static void zone_dtor(void *, int, void *);
220 static int zero_init(void *, int, int);
221 static void zone_small_init(uma_zone_t zone);
222 static void zone_large_init(uma_zone_t zone);
223 static void zone_foreach(void (*zfunc)(uma_zone_t));
224 static void zone_timeout(uma_zone_t zone);
225 static int hash_alloc(struct uma_hash *);
226 static int hash_expand(struct uma_hash *, struct uma_hash *);
227 static void hash_free(struct uma_hash *hash);
228 static void uma_timeout(void *);
229 static void uma_startup3(void);
230 static void *uma_zalloc_internal(uma_zone_t, void *, int);
231 static void uma_zfree_internal(uma_zone_t, void *, void *, enum zfreeskip,
232     int);
233 static void bucket_enable(void);
234 static void bucket_init(void);
235 static uma_bucket_t bucket_alloc(int, int);
236 static void bucket_free(uma_bucket_t);
237 static void bucket_zone_drain(void);
238 static int uma_zalloc_bucket(uma_zone_t zone, int flags);
239 static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
240 static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
241 static void zone_drain(uma_zone_t);
242 static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
243     uma_fini fini, int align, u_int32_t flags);
244 
245 void uma_print_zone(uma_zone_t);
246 void uma_print_stats(void);
247 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
248 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
249 
250 #ifdef WITNESS
251 static int nosleepwithlocks = 1;
252 #else
253 static int nosleepwithlocks = 0;
254 #endif
255 SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
256     0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
257 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
258 
259 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
260     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
261 
262 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
263     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
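
/*
 * These OIDs are what userland monitoring builds on.  For example
 * (illustrative usage only, nothing below is defined in this file):
 *
 *	# sysctl vm.zone_count
 *	# vmstat -z		(consumes vm.zone_stats)
 */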
264 
265 /*
266  * This routine checks to see whether or not it's safe to enable buckets.
267  */
268 
269 static void
270 bucket_enable(void)
271 {
272 	if (cnt.v_free_count < cnt.v_free_min)
273 		bucketdisable = 1;
274 	else
275 		bucketdisable = 0;
276 }
277 
278 /*
279  * Initialize bucket_zones, the array of zones of buckets of various sizes.
280  *
281  * For each zone, calculate the memory required for each bucket, consisting
282  * of the header and an array of pointers.  Initialize bucket_size[] to point
283  * the range of appropriate bucket sizes at the zone.
284  */
285 static void
286 bucket_init(void)
287 {
288 	struct uma_bucket_zone *ubz;
289 	int i;
290 	int j;
291 
292 	for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
293 		int size;
294 
295 		ubz = &bucket_zones[j];
296 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
297 		size += sizeof(void *) * ubz->ubz_entries;
298 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
299 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
300 		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
301 			bucket_size[i >> BUCKET_SHIFT] = j;
302 	}
303 }
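
/*
 * A worked example of the mapping built above (derived from BUCKET_SHIFT
 * and the bucket_zones[] table; the request size of 20 is hypothetical):
 * with BUCKET_SHIFT == 4 there is one bucket_size[] slot per 16 requested
 * entries, so a request for 20 entries resolves as
 *
 *	idx = howmany(20, 1 << BUCKET_SHIFT);		(== 2)
 *	bucket_zones[bucket_size[2]]			(the "32 Bucket" zone)
 *
 * because bucket_init() pointed slot 2 (requests of 17-32 entries) at
 * that zone.
 */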
304 
305 /*
306  * Given a desired number of entries for a bucket, return the zone from which
307  * to allocate the bucket.
308  */
309 static struct uma_bucket_zone *
310 bucket_zone_lookup(int entries)
311 {
312 	int idx;
313 
314 	idx = howmany(entries, 1 << BUCKET_SHIFT);
315 	return (&bucket_zones[bucket_size[idx]]);
316 }
317 
318 static uma_bucket_t
319 bucket_alloc(int entries, int bflags)
320 {
321 	struct uma_bucket_zone *ubz;
322 	uma_bucket_t bucket;
323 
324 	/*
325 	 * This is to stop us from allocating per cpu buckets while we're
326 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
327 	 * boot pages.  This also prevents us from allocating buckets in
328 	 * low memory situations.
329 	 */
330 	if (bucketdisable)
331 		return (NULL);
332 
333 	ubz = bucket_zone_lookup(entries);
334 	bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags);
335 	if (bucket) {
336 #ifdef INVARIANTS
337 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
338 #endif
339 		bucket->ub_cnt = 0;
340 		bucket->ub_entries = ubz->ubz_entries;
341 	}
342 
343 	return (bucket);
344 }
345 
346 static void
347 bucket_free(uma_bucket_t bucket)
348 {
349 	struct uma_bucket_zone *ubz;
350 
351 	ubz = bucket_zone_lookup(bucket->ub_entries);
352 	uma_zfree_internal(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
353 	    ZFREE_STATFREE);
354 }
355 
356 static void
357 bucket_zone_drain(void)
358 {
359 	struct uma_bucket_zone *ubz;
360 
361 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
362 		zone_drain(ubz->ubz_zone);
363 }
364 
365 
366 /*
367  * Routine called by the callout to fire off time-interval based
368  * calculations.  (stats, hash size, etc.)
369  *
370  * Arguments:
371  *	arg   Unused
372  *
373  * Returns:
374  *	Nothing
375  */
376 static void
377 uma_timeout(void *unused)
378 {
379 	bucket_enable();
380 	zone_foreach(zone_timeout);
381 
382 	/* Reschedule this event */
383 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
384 }
385 
386 /*
387  * Routine to perform timeout driven calculations.  At present this only
388  * expands the hashes.
389  *
390  *  Arguments:
391  *	zone  The zone to operate on
392  *
393  *  Returns:
394  *	Nothing
395  */
396 static void
397 zone_timeout(uma_zone_t zone)
398 {
399 	uma_keg_t keg;
400 	u_int64_t alloc;
401 
402 	keg = zone->uz_keg;
403 	alloc = 0;
404 
405 	/*
406 	 * Expand the zone hash table.
407 	 *
408 	 * This is done if the number of slabs is larger than the hash size.
409 	 * What I'm trying to do here is completely reduce collisions.  This
410 	 * What I'm trying to do here is completely eliminate collisions.  This
411 	 */
412 	ZONE_LOCK(zone);
413 	if (keg->uk_flags & UMA_ZONE_HASH &&
414 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
415 		struct uma_hash newhash;
416 		struct uma_hash oldhash;
417 		int ret;
418 
419 		/*
420 		 * This is so involved because allocating and freeing
421 		 * while the zone lock is held will lead to deadlock.
422 		 * I have to do everything in stages and check for
423 		 * races.
424 		 */
425 		newhash = keg->uk_hash;
426 		ZONE_UNLOCK(zone);
427 		ret = hash_alloc(&newhash);
428 		ZONE_LOCK(zone);
429 		if (ret) {
430 			if (hash_expand(&keg->uk_hash, &newhash)) {
431 				oldhash = keg->uk_hash;
432 				keg->uk_hash = newhash;
433 			} else
434 				oldhash = newhash;
435 
436 			ZONE_UNLOCK(zone);
437 			hash_free(&oldhash);
438 			ZONE_LOCK(zone);
439 		}
440 	}
441 	ZONE_UNLOCK(zone);
442 }
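
/*
 * A concrete (hypothetical) case of the expansion check above: a
 * UMA_ZONE_HASH keg with uk_ppera == 1 that has grown to 64 pages holds
 * 64 slabs; while uh_hashsize is still 32, 64 / 1 >= 32 is true, so
 * zone_timeout() doubles the table to 64 buckets.
 */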
443 
444 /*
445  * Allocate and zero fill the next sized hash table from the appropriate
446  * backing store.
447  *
448  * Arguments:
449  *	hash  A new hash structure with the old hash size in uh_hashsize
450  *
451  * Returns:
452  *	1 on success and 0 on failure.
453  */
454 static int
455 hash_alloc(struct uma_hash *hash)
456 {
457 	int oldsize;
458 	int alloc;
459 
460 	oldsize = hash->uh_hashsize;
461 
462 	/* We're just going to go to a power of two greater */
463 	if (oldsize)  {
464 		hash->uh_hashsize = oldsize * 2;
465 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
466 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
467 		    M_UMAHASH, M_NOWAIT);
468 	} else {
469 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
470 		hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL,
471 		    M_WAITOK);
472 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
473 	}
474 	if (hash->uh_slab_hash) {
475 		bzero(hash->uh_slab_hash, alloc);
476 		hash->uh_hashmask = hash->uh_hashsize - 1;
477 		return (1);
478 	}
479 
480 	return (0);
481 }
482 
483 /*
484  * Expands the hash table for HASH zones.  This is done from zone_timeout
485  * to reduce collisions.  This must not be done in the regular allocation
486  * path, otherwise, we can recurse on the vm while allocating pages.
487  *
488  * Arguments:
489  *	oldhash  The hash you want to expand
490  *	newhash  The hash structure for the new table
491  *
492  * Returns:
493  *	1 on success and 0 on failure.
496  */
497 static int
498 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
499 {
500 	uma_slab_t slab;
501 	int hval;
502 	int i;
503 
504 	if (!newhash->uh_slab_hash)
505 		return (0);
506 
507 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
508 		return (0);
509 
510 	/*
511 	 * I need to investigate hash algorithms for resizing without a
512 	 * full rehash.
513 	 */
514 
515 	for (i = 0; i < oldhash->uh_hashsize; i++)
516 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
517 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
518 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
519 			hval = UMA_HASH(newhash, slab->us_data);
520 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
521 			    slab, us_hlink);
522 		}
523 
524 	return (1);
525 }
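
/*
 * Note that because hash_alloc() only ever doubles the table,
 * uh_hashsize stays a power of two and uh_hashmask == uh_hashsize - 1,
 * so picking a bucket can be a simple mask rather than a modulo; e.g.
 * with uh_hashsize == 64 the index of a hashed slab address is just
 * (hashed value) & 0x3f.
 */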
526 
527 /*
528  * Free the hash bucket to the appropriate backing store.
529  *
530  * Arguments:
531  *	hash  The hash structure whose uh_slab_hash array is being freed
533  *
534  * Returns:
535  *	Nothing
536  */
537 static void
538 hash_free(struct uma_hash *hash)
539 {
540 	if (hash->uh_slab_hash == NULL)
541 		return;
542 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
543 		uma_zfree_internal(hashzone,
544 		    hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
545 	else
546 		free(hash->uh_slab_hash, M_UMAHASH);
547 }
548 
549 /*
550  * Frees all outstanding items in a bucket
551  *
552  * Arguments:
553  *	zone   The zone to free to, must be unlocked.
554  *	bucket The free/alloc bucket with items, cpu queue must be locked.
555  *
556  * Returns:
557  *	Nothing
558  */
559 
560 static void
561 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
562 {
563 	uma_slab_t slab;
564 	int mzone;
565 	void *item;
566 
567 	if (bucket == NULL)
568 		return;
569 
570 	slab = NULL;
571 	mzone = 0;
572 
573 	/* We have to look up the slab again for malloc.. */
574 	if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
575 		mzone = 1;
576 
577 	while (bucket->ub_cnt > 0)  {
578 		bucket->ub_cnt--;
579 		item = bucket->ub_bucket[bucket->ub_cnt];
580 #ifdef INVARIANTS
581 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
582 		KASSERT(item != NULL,
583 		    ("bucket_drain: botched ptr, item is NULL"));
584 #endif
585 		/*
586 		 * This is extremely inefficient.  The slab pointer was passed
587 		 * to uma_zfree_arg, but we lost it because the buckets don't
588 		 * hold them.  This will go away when free() gets a size passed
589 		 * to it.
590 		 */
591 		if (mzone)
592 			slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
593 		uma_zfree_internal(zone, item, slab, SKIP_DTOR, 0);
594 	}
595 }
596 
597 /*
598  * Drains the per cpu caches for a zone.
599  *
600  * NOTE: This may only be called while the zone is being torn down, and not
601  * during normal operation.  This is necessary in order that we do not have
602  * to migrate CPUs to drain the per-CPU caches.
603  *
604  * Arguments:
605  *	zone     The zone to drain, must be unlocked.
606  *
607  * Returns:
608  *	Nothing
609  */
610 static void
611 cache_drain(uma_zone_t zone)
612 {
613 	uma_cache_t cache;
614 	int cpu;
615 
616 	/*
617 	 * XXX: It is safe to not lock the per-CPU caches, because we're
618 	 * tearing down the zone anyway.  I.e., there will be no further use
619 	 * of the caches at this point.
620 	 *
621 	 * XXX: It would be good to be able to assert that the zone is being
622 	 * torn down to prevent improper use of cache_drain().
623 	 *
624 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
625 	 * it is used elsewhere.  Should the tear-down path be made special
626 	 * there in some form?
627 	 */
628 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
629 		if (CPU_ABSENT(cpu))
630 			continue;
631 		cache = &zone->uz_cpu[cpu];
632 		bucket_drain(zone, cache->uc_allocbucket);
633 		bucket_drain(zone, cache->uc_freebucket);
634 		if (cache->uc_allocbucket != NULL)
635 			bucket_free(cache->uc_allocbucket);
636 		if (cache->uc_freebucket != NULL)
637 			bucket_free(cache->uc_freebucket);
638 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
639 	}
640 	ZONE_LOCK(zone);
641 	bucket_cache_drain(zone);
642 	ZONE_UNLOCK(zone);
643 }
644 
645 /*
646  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
647  */
648 static void
649 bucket_cache_drain(uma_zone_t zone)
650 {
651 	uma_bucket_t bucket;
652 
653 	/*
654 	 * Drain the bucket queues and free the buckets; we just keep two per
655 	 * cpu (alloc/free).
656 	 */
657 	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
658 		LIST_REMOVE(bucket, ub_link);
659 		ZONE_UNLOCK(zone);
660 		bucket_drain(zone, bucket);
661 		bucket_free(bucket);
662 		ZONE_LOCK(zone);
663 	}
664 
665 	/* Now we do the free queue.. */
666 	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
667 		LIST_REMOVE(bucket, ub_link);
668 		bucket_free(bucket);
669 	}
670 }
671 
672 /*
673  * Frees pages from a zone back to the system.  This is done on demand from
674  * the pageout daemon.
675  *
676  * Arguments:
677  *	zone  The zone to free pages from
679  *
680  * Returns:
681  *	Nothing.
682  */
683 static void
684 zone_drain(uma_zone_t zone)
685 {
686 	struct slabhead freeslabs = { 0 };
687 	uma_keg_t keg;
688 	uma_slab_t slab;
689 	uma_slab_t n;
690 	u_int8_t flags;
691 	u_int8_t *mem;
692 	int i;
693 
694 	keg = zone->uz_keg;
695 
696 	/*
697 	 * We don't want to take pages from statically allocated zones at this
698 	 * time
699 	 */
700 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
701 		return;
702 
703 	ZONE_LOCK(zone);
704 
705 #ifdef UMA_DEBUG
706 	printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
707 #endif
708 	bucket_cache_drain(zone);
709 	if (keg->uk_free == 0)
710 		goto finished;
711 
712 	slab = LIST_FIRST(&keg->uk_free_slab);
713 	while (slab) {
714 		n = LIST_NEXT(slab, us_link);
715 
716 		/* We have nowhere to free these to */
717 		if (slab->us_flags & UMA_SLAB_BOOT) {
718 			slab = n;
719 			continue;
720 		}
721 
722 		LIST_REMOVE(slab, us_link);
723 		keg->uk_pages -= keg->uk_ppera;
724 		keg->uk_free -= keg->uk_ipers;
725 
726 		if (keg->uk_flags & UMA_ZONE_HASH)
727 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
728 
729 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
730 
731 		slab = n;
732 	}
733 finished:
734 	ZONE_UNLOCK(zone);
735 
736 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
737 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
738 		if (keg->uk_fini)
739 			for (i = 0; i < keg->uk_ipers; i++)
740 				keg->uk_fini(
741 				    slab->us_data + (keg->uk_rsize * i),
742 				    keg->uk_size);
743 		flags = slab->us_flags;
744 		mem = slab->us_data;
745 
746 		if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
747 		    (keg->uk_flags & UMA_ZONE_REFCNT)) {
748 			vm_object_t obj;
749 
750 			if (flags & UMA_SLAB_KMEM)
751 				obj = kmem_object;
752 			else
753 				obj = NULL;
754 			for (i = 0; i < keg->uk_ppera; i++)
755 				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
756 				    obj);
757 		}
758 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
759 			uma_zfree_internal(keg->uk_slabzone, slab, NULL,
760 			    SKIP_NONE, ZFREE_STATFREE);
761 #ifdef UMA_DEBUG
762 		printf("%s: Returning %d bytes.\n",
763 		    zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
764 #endif
765 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
766 	}
767 }
768 
769 /*
770  * Allocate a new slab for a zone.  This does not insert the slab onto a list.
771  *
772  * Arguments:
773  *	zone  The zone to allocate slabs for
774  *	wait  Shall we wait?
775  *
776  * Returns:
777  *	The slab that was allocated or NULL if there is no memory and the
778  *	caller specified M_NOWAIT.
779  */
780 static uma_slab_t
781 slab_zalloc(uma_zone_t zone, int wait)
782 {
783 	uma_slabrefcnt_t slabref;
784 	uma_slab_t slab;
785 	uma_keg_t keg;
786 	u_int8_t *mem;
787 	u_int8_t flags;
788 	int i;
789 
790 	slab = NULL;
791 	keg = zone->uz_keg;
792 
793 #ifdef UMA_DEBUG
794 	printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
795 #endif
796 	ZONE_UNLOCK(zone);
797 
798 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
799 		slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
800 		if (slab == NULL) {
801 			ZONE_LOCK(zone);
802 			return NULL;
803 		}
804 	}
805 
806 	/*
807 	 * This reproduces the old vm_zone behavior of zero filling pages the
808 	 * first time they are added to a zone.
809 	 *
810 	 * Malloced items are zeroed in uma_zalloc.
811 	 */
812 
813 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
814 		wait |= M_ZERO;
815 	else
816 		wait &= ~M_ZERO;
817 
818 	mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
819 	    &flags, wait);
820 	if (mem == NULL) {
821 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
822 			uma_zfree_internal(keg->uk_slabzone, slab, NULL,
823 			    SKIP_NONE, ZFREE_STATFREE);
824 		ZONE_LOCK(zone);
825 		return (NULL);
826 	}
827 
828 	/* Point the slab into the allocated memory */
829 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
830 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
831 
832 	if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
833 	    (keg->uk_flags & UMA_ZONE_REFCNT))
834 		for (i = 0; i < keg->uk_ppera; i++)
835 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
836 
837 	slab->us_keg = keg;
838 	slab->us_data = mem;
839 	slab->us_freecount = keg->uk_ipers;
840 	slab->us_firstfree = 0;
841 	slab->us_flags = flags;
842 
843 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
844 		slabref = (uma_slabrefcnt_t)slab;
845 		for (i = 0; i < keg->uk_ipers; i++) {
846 			slabref->us_freelist[i].us_refcnt = 0;
847 			slabref->us_freelist[i].us_item = i+1;
848 		}
849 	} else {
850 		for (i = 0; i < keg->uk_ipers; i++)
851 			slab->us_freelist[i].us_item = i+1;
852 	}
853 
854 	if (keg->uk_init != NULL) {
855 		for (i = 0; i < keg->uk_ipers; i++)
856 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
857 			    keg->uk_size, wait) != 0)
858 				break;
859 		if (i != keg->uk_ipers) {
860 			if (keg->uk_fini != NULL) {
861 				for (i--; i > -1; i--)
862 					keg->uk_fini(slab->us_data +
863 					    (keg->uk_rsize * i),
864 					    keg->uk_size);
865 			}
866 			if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
867 			    (keg->uk_flags & UMA_ZONE_REFCNT)) {
868 				vm_object_t obj;
869 
870 				if (flags & UMA_SLAB_KMEM)
871 					obj = kmem_object;
872 				else
873 					obj = NULL;
874 				for (i = 0; i < keg->uk_ppera; i++)
875 					vsetobj((vm_offset_t)mem +
876 					    (i * PAGE_SIZE), obj);
877 			}
878 			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
879 				uma_zfree_internal(keg->uk_slabzone, slab,
880 				    NULL, SKIP_NONE, ZFREE_STATFREE);
881 			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
882 			    flags);
883 			ZONE_LOCK(zone);
884 			return (NULL);
885 		}
886 	}
887 	ZONE_LOCK(zone);
888 
889 	if (keg->uk_flags & UMA_ZONE_HASH)
890 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
891 
892 	keg->uk_pages += keg->uk_ppera;
893 	keg->uk_free += keg->uk_ipers;
894 
895 	return (slab);
896 }
897 
898 /*
899  * This function is intended to be used early on in place of page_alloc() so
900  * that we may use the boot time page cache to satisfy allocations before
901  * the VM is ready.
902  */
903 static void *
904 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
905 {
906 	uma_keg_t keg;
907 	uma_slab_t tmps;
908 
909 	keg = zone->uz_keg;
910 
911 	/*
912 	 * Check our small startup cache to see if it has pages remaining.
913 	 */
914 	mtx_lock(&uma_boot_pages_mtx);
915 	if ((tmps = LIST_FIRST(&uma_boot_pages)) != NULL) {
916 		LIST_REMOVE(tmps, us_link);
917 		mtx_unlock(&uma_boot_pages_mtx);
918 		*pflag = tmps->us_flags;
919 		return (tmps->us_data);
920 	}
921 	mtx_unlock(&uma_boot_pages_mtx);
922 	if (booted == 0)
923 		panic("UMA: Increase vm.boot_pages");
924 	/*
925 	 * Now that we've booted, reset these users to their real allocator.
926 	 */
927 #ifdef UMA_MD_SMALL_ALLOC
928 	keg->uk_allocf = uma_small_alloc;
929 #else
930 	keg->uk_allocf = page_alloc;
931 #endif
932 	return keg->uk_allocf(zone, bytes, pflag, wait);
933 }
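
/*
 * If the panic above fires, the boot-time page pool was too small.  Its
 * size comes from the vm.boot_pages tunable, so the usual remedy (an
 * administrative workaround, not something defined in this file) is an
 * entry along the lines of the following in /boot/loader.conf:
 *
 *	vm.boot_pages="64"
 */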
934 
935 /*
936  * Allocates a number of pages from the system
937  *
938  * Arguments:
939  *	zone  Unused
940  *	bytes  The number of bytes requested
941  *	wait  Shall we wait?
942  *
943  * Returns:
944  *	A pointer to the alloced memory or possibly
945  *	NULL if M_NOWAIT is set.
946  */
947 static void *
948 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
949 {
950 	void *p;	/* Returned page */
951 
952 	*pflag = UMA_SLAB_KMEM;
953 	p = (void *) kmem_malloc(kmem_map, bytes, wait);
954 
955 	return (p);
956 }
957 
958 /*
959  * Allocates a number of pages from within an object
960  *
961  * Arguments:
962  *	zone   Unused
963  *	bytes  The number of bytes requested
964  *	wait   Shall we wait?
965  *
966  * Returns:
967  *	A pointer to the alloced memory or possibly
968  *	NULL if M_NOWAIT is set.
969  */
970 static void *
971 obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
972 {
973 	vm_object_t object;
974 	vm_offset_t retkva, zkva;
975 	vm_page_t p;
976 	int pages, startpages;
977 
978 	object = zone->uz_keg->uk_obj;
979 	retkva = 0;
980 
981 	/*
982 	 * This looks a little weird since we're getting one page at a time.
983 	 */
984 	VM_OBJECT_LOCK(object);
985 	p = TAILQ_LAST(&object->memq, pglist);
986 	pages = p != NULL ? p->pindex + 1 : 0;
987 	startpages = pages;
988 	zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
989 	for (; bytes > 0; bytes -= PAGE_SIZE) {
990 		p = vm_page_alloc(object, pages,
991 		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
992 		if (p == NULL) {
993 			if (pages != startpages)
994 				pmap_qremove(retkva, pages - startpages);
995 			while (pages != startpages) {
996 				pages--;
997 				p = TAILQ_LAST(&object->memq, pglist);
998 				vm_page_lock_queues();
999 				vm_page_unwire(p, 0);
1000 				vm_page_free(p);
1001 				vm_page_unlock_queues();
1002 			}
1003 			retkva = 0;
1004 			goto done;
1005 		}
1006 		pmap_qenter(zkva, &p, 1);
1007 		if (retkva == 0)
1008 			retkva = zkva;
1009 		zkva += PAGE_SIZE;
1010 		pages += 1;
1011 	}
1012 done:
1013 	VM_OBJECT_UNLOCK(object);
1014 	*flags = UMA_SLAB_PRIV;
1015 
1016 	return ((void *)retkva);
1017 }
1018 
1019 /*
1020  * Frees a number of pages to the system
1021  *
1022  * Arguments:
1023  *	mem   A pointer to the memory to be freed
1024  *	size  The size of the memory being freed
1025  *	flags The original p->us_flags field
1026  *
1027  * Returns:
1028  *	Nothing
1029  */
1030 static void
1031 page_free(void *mem, int size, u_int8_t flags)
1032 {
1033 	vm_map_t map;
1034 
1035 	if (flags & UMA_SLAB_KMEM)
1036 		map = kmem_map;
1037 	else
1038 		panic("UMA: page_free used with invalid flags %d\n", flags);
1039 
1040 	kmem_free(map, (vm_offset_t)mem, size);
1041 }
1042 
1043 /*
1044  * Zero fill initializer
1045  *
1046  * Arguments/Returns follow uma_init specifications
1047  */
1048 static int
1049 zero_init(void *mem, int size, int flags)
1050 {
1051 	bzero(mem, size);
1052 	return (0);
1053 }
1054 
1055 /*
1056  * Finish creating a small uma zone.  This calculates ipers, and the zone size.
1057  *
1058  * Arguments
1059  *	zone  The zone we should initialize
1060  *
1061  * Returns
1062  *	Nothing
1063  */
1064 static void
1065 zone_small_init(uma_zone_t zone)
1066 {
1067 	uma_keg_t keg;
1068 	u_int rsize;
1069 	u_int memused;
1070 	u_int wastedspace;
1071 	u_int shsize;
1072 
1073 	keg = zone->uz_keg;
1074 	KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
1075 	rsize = keg->uk_size;
1076 
1077 	if (rsize < UMA_SMALLEST_UNIT)
1078 		rsize = UMA_SMALLEST_UNIT;
1079 	if (rsize & keg->uk_align)
1080 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1081 
1082 	keg->uk_rsize = rsize;
1083 	keg->uk_ppera = 1;
1084 
1085 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
1086 		rsize += UMA_FRITMREF_SZ;	/* linkage & refcnt */
1087 		shsize = sizeof(struct uma_slab_refcnt);
1088 	} else {
1089 		rsize += UMA_FRITM_SZ;	/* Account for linkage */
1090 		shsize = sizeof(struct uma_slab);
1091 	}
1092 
1093 	keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
1094 	KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0"));
1095 	memused = keg->uk_ipers * rsize + shsize;
1096 	wastedspace = UMA_SLAB_SIZE - memused;
1097 
1098 	/*
1099 	 * We can't do OFFPAGE if we're internal or if we've been
1100 	 * asked to not go to the VM for buckets.  If we do this we
1101 	 * may end up going to the VM (kmem_map) for slabs which we
1102 	 * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
1103 	 * result of UMA_ZONE_VM, which clearly forbids it.
1104 	 */
1105 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1106 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1107 		return;
1108 
1109 	if ((wastedspace >= UMA_MAX_WASTE) &&
1110 	    (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
1111 		keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
1112 		KASSERT(keg->uk_ipers <= 255,
1113 		    ("zone_small_init: keg->uk_ipers too high!"));
1114 #ifdef UMA_DEBUG
1115 		printf("UMA decided we need offpage slab headers for "
1116 		    "zone: %s, calculated wastedspace = %d, "
1117 		    "maximum wasted space allowed = %d, "
1118 		    "calculated ipers = %d, "
1119 		    "new wasted space = %d\n", zone->uz_name, wastedspace,
1120 		    UMA_MAX_WASTE, keg->uk_ipers,
1121 		    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
1122 #endif
1123 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1124 		if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1125 			keg->uk_flags |= UMA_ZONE_HASH;
1126 	}
1127 }
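
/*
 * A worked example of the sizing above, using made-up constants (the
 * real UMA_SLAB_SIZE, header and free-list item sizes come from
 * uma_int.h): assume a 4096-byte slab, a 32-byte in-page header and a
 * 4-byte free-list item.  A 120-byte, already-aligned object then has
 * rsize = 124 with linkage, ipers = (4096 - 32) / 124 = 32 and
 * wastedspace = 4096 - (32 * 124 + 32) = 96 bytes.  If 96 exceeded
 * UMA_MAX_WASTE, and since 4096 / 120 = 34 > 32, the keg would switch
 * to OFFPAGE headers and pack 34 items per slab instead.
 */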
1128 
1129 /*
1130  * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
1131  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1132  * more complicated.
1133  *
1134  * Arguments
1135  *	zone  The zone we should initialize
1136  *
1137  * Returns
1138  *	Nothing
1139  */
1140 static void
1141 zone_large_init(uma_zone_t zone)
1142 {
1143 	uma_keg_t keg;
1144 	int pages;
1145 
1146 	keg = zone->uz_keg;
1147 
1148 	KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
1149 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1150 	    ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
1151 
1152 	pages = keg->uk_size / UMA_SLAB_SIZE;
1153 
1154 	/* Account for remainder */
1155 	if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
1156 		pages++;
1157 
1158 	keg->uk_ppera = pages;
1159 	keg->uk_ipers = 1;
1160 
1161 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
1162 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1163 		keg->uk_flags |= UMA_ZONE_HASH;
1164 
1165 	keg->uk_rsize = keg->uk_size;
1166 }
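
/*
 * For example (hypothetically, with a one-page UMA_SLAB_SIZE of 4096
 * bytes): a zone of 9000-byte items gets pages = 3 and ipers = 1, so
 * every allocation consumes three slab pages plus an OFFPAGE header
 * taken from slabzone (or slabrefzone for UMA_ZONE_REFCNT kegs).
 */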
1167 
1168 /*
1169  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1170  * the keg onto the global keg list.
1171  *
1172  * Arguments/Returns follow uma_ctor specifications
1173  *	udata  Actually uma_kctor_args
1174  */
1175 static int
1176 keg_ctor(void *mem, int size, void *udata, int flags)
1177 {
1178 	struct uma_kctor_args *arg = udata;
1179 	uma_keg_t keg = mem;
1180 	uma_zone_t zone;
1181 
1182 	bzero(keg, size);
1183 	keg->uk_size = arg->size;
1184 	keg->uk_init = arg->uminit;
1185 	keg->uk_fini = arg->fini;
1186 	keg->uk_align = arg->align;
1187 	keg->uk_free = 0;
1188 	keg->uk_pages = 0;
1189 	keg->uk_flags = arg->flags;
1190 	keg->uk_allocf = page_alloc;
1191 	keg->uk_freef = page_free;
1192 	keg->uk_recurse = 0;
1193 	keg->uk_slabzone = NULL;
1194 
1195 	/*
1196 	 * The master zone is passed to us at keg-creation time.
1197 	 */
1198 	zone = arg->zone;
1199 	zone->uz_keg = keg;
1200 
1201 	if (arg->flags & UMA_ZONE_VM)
1202 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1203 
1204 	if (arg->flags & UMA_ZONE_ZINIT)
1205 		keg->uk_init = zero_init;
1206 
1207 	/*
1208 	 * The +UMA_FRITM_SZ added to uk_size is to account for the
1209 	 * linkage that is added to the size in zone_small_init().  If
1210 	 * we don't account for this here then we may end up in
1211 	 * zone_small_init() with a calculated 'ipers' of 0.
1212 	 */
1213 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
1214 		if ((keg->uk_size+UMA_FRITMREF_SZ) >
1215 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
1216 			zone_large_init(zone);
1217 		else
1218 			zone_small_init(zone);
1219 	} else {
1220 		if ((keg->uk_size+UMA_FRITM_SZ) >
1221 		    (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1222 			zone_large_init(zone);
1223 		else
1224 			zone_small_init(zone);
1225 	}
1226 
1227 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1228 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1229 			keg->uk_slabzone = slabrefzone;
1230 		else
1231 			keg->uk_slabzone = slabzone;
1232 	}
1233 
1234 	/*
1235 	 * If we haven't booted yet we need allocations to go through the
1236 	 * startup cache until the vm is ready.
1237 	 */
1238 	if (keg->uk_ppera == 1) {
1239 #ifdef UMA_MD_SMALL_ALLOC
1240 		keg->uk_allocf = uma_small_alloc;
1241 		keg->uk_freef = uma_small_free;
1242 #endif
1243 		if (booted == 0)
1244 			keg->uk_allocf = startup_alloc;
1245 	}
1246 
1247 	/*
1248 	 * Initialize the keg's lock (shared among zones) through the
1249 	 * master zone.
1250 	 */
1251 	zone->uz_lock = &keg->uk_lock;
1252 	if (arg->flags & UMA_ZONE_MTXCLASS)
1253 		ZONE_LOCK_INIT(zone, 1);
1254 	else
1255 		ZONE_LOCK_INIT(zone, 0);
1256 
1257 	/*
1258 	 * If we're putting the slab header in the actual page we need to
1259 	 * figure out where in each page it goes.  This calculates a right
1260 	 * justified offset into the memory on an ALIGN_PTR boundary.
1261 	 */
1262 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1263 		u_int totsize;
1264 
1265 		/* Size of the slab struct and free list */
1266 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1267 			totsize = sizeof(struct uma_slab_refcnt) +
1268 			    keg->uk_ipers * UMA_FRITMREF_SZ;
1269 		else
1270 			totsize = sizeof(struct uma_slab) +
1271 			    keg->uk_ipers * UMA_FRITM_SZ;
1272 
1273 		if (totsize & UMA_ALIGN_PTR)
1274 			totsize = (totsize & ~UMA_ALIGN_PTR) +
1275 			    (UMA_ALIGN_PTR + 1);
1276 		keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
1277 
1278 		if (keg->uk_flags & UMA_ZONE_REFCNT)
1279 			totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
1280 			    + keg->uk_ipers * UMA_FRITMREF_SZ;
1281 		else
1282 			totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1283 			    + keg->uk_ipers * UMA_FRITM_SZ;
1284 
1285 		/*
1286 		 * The only way the following is possible is if with our
1287 		 * UMA_ALIGN_PTR adjustments we are now bigger than
1288 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1289 		 * mathematically possible for all cases, so we make
1290 		 * sure here anyway.
1291 		 */
1292 		if (totsize > UMA_SLAB_SIZE) {
1293 			printf("zone %s ipers %d rsize %d size %d\n",
1294 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1295 			    keg->uk_size);
1296 			panic("UMA slab won't fit.\n");
1297 		}
1298 	}
1299 
1300 	if (keg->uk_flags & UMA_ZONE_HASH)
1301 		hash_alloc(&keg->uk_hash);
1302 
1303 #ifdef UMA_DEBUG
1304 	printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
1305 	    zone->uz_name, zone,
1306 	    keg->uk_size, keg->uk_ipers,
1307 	    keg->uk_ppera, keg->uk_pgoff);
1308 #endif
1309 
1310 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1311 
1312 	mtx_lock(&uma_mtx);
1313 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1314 	mtx_unlock(&uma_mtx);
1315 	return (0);
1316 }
1317 
1318 /*
1319  * Zone header ctor.  This initializes all fields, locks, etc.
1320  *
1321  * Arguments/Returns follow uma_ctor specifications
1322  *	udata  Actually uma_zctor_args
1323  */
1324 
1325 static int
1326 zone_ctor(void *mem, int size, void *udata, int flags)
1327 {
1328 	struct uma_zctor_args *arg = udata;
1329 	uma_zone_t zone = mem;
1330 	uma_zone_t z;
1331 	uma_keg_t keg;
1332 
1333 	bzero(zone, size);
1334 	zone->uz_name = arg->name;
1335 	zone->uz_ctor = arg->ctor;
1336 	zone->uz_dtor = arg->dtor;
1337 	zone->uz_init = NULL;
1338 	zone->uz_fini = NULL;
1339 	zone->uz_allocs = 0;
1340 	zone->uz_frees = 0;
1341 	zone->uz_fails = 0;
1342 	zone->uz_fills = zone->uz_count = 0;
1343 
1344 	if (arg->flags & UMA_ZONE_SECONDARY) {
1345 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1346 		keg = arg->keg;
1347 		zone->uz_keg = keg;
1348 		zone->uz_init = arg->uminit;
1349 		zone->uz_fini = arg->fini;
1350 		zone->uz_lock = &keg->uk_lock;
1351 		mtx_lock(&uma_mtx);
1352 		ZONE_LOCK(zone);
1353 		keg->uk_flags |= UMA_ZONE_SECONDARY;
1354 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1355 			if (LIST_NEXT(z, uz_link) == NULL) {
1356 				LIST_INSERT_AFTER(z, zone, uz_link);
1357 				break;
1358 			}
1359 		}
1360 		ZONE_UNLOCK(zone);
1361 		mtx_unlock(&uma_mtx);
1362 	} else if (arg->keg == NULL) {
1363 		if (uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1364 		    arg->align, arg->flags) == NULL)
1365 			return (ENOMEM);
1366 	} else {
1367 		struct uma_kctor_args karg;
1368 		int error;
1369 
1370 		/* We should only be here from uma_startup() */
1371 		karg.size = arg->size;
1372 		karg.uminit = arg->uminit;
1373 		karg.fini = arg->fini;
1374 		karg.align = arg->align;
1375 		karg.flags = arg->flags;
1376 		karg.zone = zone;
1377 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1378 		    flags);
1379 		if (error)
1380 			return (error);
1381 	}
1382 	keg = zone->uz_keg;
1383 	zone->uz_lock = &keg->uk_lock;
1384 
1385 	/*
1386 	 * Some internal zones don't have room allocated for the per cpu
1387 	 * caches.  If we're internal, bail out here.
1388 	 */
1389 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1390 		KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
1391 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1392 		return (0);
1393 	}
1394 
1395 	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1396 		zone->uz_count = BUCKET_MAX;
1397 	else if (keg->uk_ipers <= BUCKET_MAX)
1398 		zone->uz_count = keg->uk_ipers;
1399 	else
1400 		zone->uz_count = BUCKET_MAX;
1401 	return (0);
1402 }
1403 
1404 /*
1405  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1406  * table and removes the keg from the global list.
1407  *
1408  * Arguments/Returns follow uma_dtor specifications
1409  *	udata  unused
1410  */
1411 static void
1412 keg_dtor(void *arg, int size, void *udata)
1413 {
1414 	uma_keg_t keg;
1415 
1416 	keg = (uma_keg_t)arg;
1417 	mtx_lock(&keg->uk_lock);
1418 	if (keg->uk_free != 0) {
1419 		printf("Freed UMA keg was not empty (%d items). "
1420 		    " Lost %d pages of memory.\n",
1421 		    keg->uk_free, keg->uk_pages);
1422 	}
1423 	mtx_unlock(&keg->uk_lock);
1424 
1425 	if (keg->uk_flags & UMA_ZONE_HASH)
1426 		hash_free(&keg->uk_hash);
1427 
1428 	mtx_destroy(&keg->uk_lock);
1429 }
1430 
1431 /*
1432  * Zone header dtor.
1433  *
1434  * Arguments/Returns follow uma_dtor specifications
1435  *	udata  unused
1436  */
1437 static void
1438 zone_dtor(void *arg, int size, void *udata)
1439 {
1440 	uma_zone_t zone;
1441 	uma_keg_t keg;
1442 
1443 	zone = (uma_zone_t)arg;
1444 	keg = zone->uz_keg;
1445 
1446 	if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
1447 		cache_drain(zone);
1448 
1449 	mtx_lock(&uma_mtx);
1450 	zone_drain(zone);
1451 	if (keg->uk_flags & UMA_ZONE_SECONDARY) {
1452 		LIST_REMOVE(zone, uz_link);
1453 		/*
1454 		 * XXX there are some races here where
1455 		 * the zone can be drained but zone lock
1456 		 * released and then refilled before we
1457 		 * remove it... we don't care for now
1458 		 */
1459 		ZONE_LOCK(zone);
1460 		if (LIST_EMPTY(&keg->uk_zones))
1461 			keg->uk_flags &= ~UMA_ZONE_SECONDARY;
1462 		ZONE_UNLOCK(zone);
1463 		mtx_unlock(&uma_mtx);
1464 	} else {
1465 		LIST_REMOVE(keg, uk_link);
1466 		LIST_REMOVE(zone, uz_link);
1467 		mtx_unlock(&uma_mtx);
1468 		uma_zfree_internal(kegs, keg, NULL, SKIP_NONE,
1469 		    ZFREE_STATFREE);
1470 	}
1471 	zone->uz_keg = NULL;
1472 }
1473 
1474 /*
1475  * Traverses every zone in the system and calls a callback
1476  *
1477  * Arguments:
1478  *	zfunc  A pointer to a function which accepts a zone
1479  *		as an argument.
1480  *
1481  * Returns:
1482  *	Nothing
1483  */
1484 static void
1485 zone_foreach(void (*zfunc)(uma_zone_t))
1486 {
1487 	uma_keg_t keg;
1488 	uma_zone_t zone;
1489 
1490 	mtx_lock(&uma_mtx);
1491 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1492 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1493 			zfunc(zone);
1494 	}
1495 	mtx_unlock(&uma_mtx);
1496 }
1497 
1498 /* Public functions */
1499 /* See uma.h */
1500 void
1501 uma_startup(void *bootmem, int boot_pages)
1502 {
1503 	struct uma_zctor_args args;
1504 	uma_slab_t slab;
1505 	u_int slabsize;
1506 	u_int objsize, totsize, wsize;
1507 	int i;
1508 
1509 #ifdef UMA_DEBUG
1510 	printf("Creating uma keg headers zone and keg.\n");
1511 #endif
1512 	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1513 
1514 	/*
1515 	 * Figure out the maximum number of items-per-slab we'll have if
1516 	 * we're using the OFFPAGE slab header to track free items, given
1517 	 * all possible object sizes and the maximum desired wastage
1518 	 * (UMA_MAX_WASTE).
1519 	 *
1520 	 * We iterate until we find an object size for
1521 	 * which the calculated wastage in zone_small_init() will be
1522 	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
1523 	 * is an overall increasing see-saw function, we find the smallest
1524 	 * objsize such that the wastage is always acceptable for objects
1525 	 * with that objsize or smaller.  Since a smaller objsize always
1526 	 * generates a larger possible uma_max_ipers, we use this computed
1527 	 * objsize to calculate the largest ipers possible.  Since the
1528 	 * ipers calculated for OFFPAGE slab headers is always larger than
1529 	 * the ipers initially calculated in zone_small_init(), we use
1530 	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
1531 	 * obtain the maximum ipers possible for offpage slab headers.
1532 	 *
1533 	 * It should be noted that ipers versus objsize is an inversely
1534 	 * proportional function which drops off rather quickly so as
1535 	 * long as our UMA_MAX_WASTE is such that the objsize we calculate
1536 	 * falls into the portion of the inverse relation AFTER the steep
1537 	 * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
1538 	 *
1539 	 * Note that we have 8-bits (1 byte) to use as a freelist index
1540 	 * inside the actual slab header itself and this is enough to
1541 	 * accommodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
1542 	 * object with offpage slab header would have ipers =
1543 	 * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
1544 	 * 1 greater than what our byte-integer freelist index can
1545 	 * accommodate, but we know that this situation never occurs as
1546 	 * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
1547 	 * that we need to go to offpage slab headers.  Or, if we do,
1548 	 * then we trap that condition below and panic in the INVARIANTS case.
1549 	 */
1550 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
1551 	totsize = wsize;
1552 	objsize = UMA_SMALLEST_UNIT;
1553 	while (totsize >= wsize) {
1554 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
1555 		    (objsize + UMA_FRITM_SZ);
1556 		totsize *= (UMA_FRITM_SZ + objsize);
1557 		objsize++;
1558 	}
1559 	if (objsize > UMA_SMALLEST_UNIT)
1560 		objsize--;
1561 	uma_max_ipers = UMA_SLAB_SIZE / objsize;
1562 
1563 	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
1564 	totsize = wsize;
1565 	objsize = UMA_SMALLEST_UNIT;
1566 	while (totsize >= wsize) {
1567 		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
1568 		    (objsize + UMA_FRITMREF_SZ);
1569 		totsize *= (UMA_FRITMREF_SZ + objsize);
1570 		objsize++;
1571 	}
1572 	if (objsize > UMA_SMALLEST_UNIT)
1573 		objsize--;
1574 	uma_max_ipers_ref = UMA_SLAB_SIZE / objsize;
1575 
1576 	KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
1577 	    ("uma_startup: calculated uma_max_ipers values too large!"));
1578 
1579 #ifdef UMA_DEBUG
1580 	printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
1581 	printf("Calculated uma_max_ipers_ref (for OFFPAGE) is %d\n",
1582 	    uma_max_ipers_ref);
1583 #endif
1584 
1585 	/* "manually" create the initial zone */
1586 	args.name = "UMA Kegs";
1587 	args.size = sizeof(struct uma_keg);
1588 	args.ctor = keg_ctor;
1589 	args.dtor = keg_dtor;
1590 	args.uminit = zero_init;
1591 	args.fini = NULL;
1592 	args.keg = &masterkeg;
1593 	args.align = 32 - 1;
1594 	args.flags = UMA_ZFLAG_INTERNAL;
1595 	/* The initial zone has no Per cpu queues so it's smaller */
1596 	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1597 
1598 #ifdef UMA_DEBUG
1599 	printf("Filling boot free list.\n");
1600 #endif
1601 	for (i = 0; i < boot_pages; i++) {
1602 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1603 		slab->us_data = (u_int8_t *)slab;
1604 		slab->us_flags = UMA_SLAB_BOOT;
1605 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1606 	}
1607 	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1608 
1609 #ifdef UMA_DEBUG
1610 	printf("Creating uma zone headers zone and keg.\n");
1611 #endif
1612 	args.name = "UMA Zones";
1613 	args.size = sizeof(struct uma_zone) +
1614 	    (sizeof(struct uma_cache) * (mp_maxid + 1));
1615 	args.ctor = zone_ctor;
1616 	args.dtor = zone_dtor;
1617 	args.uminit = zero_init;
1618 	args.fini = NULL;
1619 	args.keg = NULL;
1620 	args.align = 32 - 1;
1621 	args.flags = UMA_ZFLAG_INTERNAL;
1622 	/* The initial zone has no Per cpu queues so it's smaller */
1623 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1624 
1625 #ifdef UMA_DEBUG
1626 	printf("Initializing pcpu cache locks.\n");
1627 #endif
1628 #ifdef UMA_DEBUG
1629 	printf("Creating slab and hash zones.\n");
1630 #endif
1631 
1632 	/*
1633 	 * This is the max number of free list items we'll have with
1634 	 * offpage slabs.
1635 	 */
1636 	slabsize = uma_max_ipers * UMA_FRITM_SZ;
1637 	slabsize += sizeof(struct uma_slab);
1638 
1639 	/* Now make a zone for slab headers */
1640 	slabzone = uma_zcreate("UMA Slabs",
1641 				slabsize,
1642 				NULL, NULL, NULL, NULL,
1643 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1644 
1645 	/*
1646 	 * We also create a zone for the bigger slabs with reference
1647 	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
1648 	 */
1649 	slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
1650 	slabsize += sizeof(struct uma_slab_refcnt);
1651 	slabrefzone = uma_zcreate("UMA RCntSlabs",
1652 				  slabsize,
1653 				  NULL, NULL, NULL, NULL,
1654 				  UMA_ALIGN_PTR,
1655 				  UMA_ZFLAG_INTERNAL);
1656 
1657 	hashzone = uma_zcreate("UMA Hash",
1658 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1659 	    NULL, NULL, NULL, NULL,
1660 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1661 
1662 	bucket_init();
1663 
1664 #ifdef UMA_MD_SMALL_ALLOC
1665 	booted = 1;
1666 #endif
1667 
1668 #ifdef UMA_DEBUG
1669 	printf("UMA startup complete.\n");
1670 #endif
1671 }
1672 
1673 /* see uma.h */
1674 void
1675 uma_startup2(void)
1676 {
1677 	booted = 1;
1678 	bucket_enable();
1679 #ifdef UMA_DEBUG
1680 	printf("UMA startup2 complete.\n");
1681 #endif
1682 }
1683 
1684 /*
1685  * Initialize our callout handle
1686  *
1687  */
1688 
1689 static void
1690 uma_startup3(void)
1691 {
1692 #ifdef UMA_DEBUG
1693 	printf("Starting callout.\n");
1694 #endif
1695 	callout_init(&uma_callout, CALLOUT_MPSAFE);
1696 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1697 #ifdef UMA_DEBUG
1698 	printf("UMA startup3 complete.\n");
1699 #endif
1700 }
1701 
1702 static uma_zone_t
1703 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1704 		int align, u_int32_t flags)
1705 {
1706 	struct uma_kctor_args args;
1707 
1708 	args.size = size;
1709 	args.uminit = uminit;
1710 	args.fini = fini;
1711 	args.align = align;
1712 	args.flags = flags;
1713 	args.zone = zone;
1714 	return (uma_zalloc_internal(kegs, &args, M_WAITOK));
1715 }
1716 
1717 /* See uma.h */
1718 uma_zone_t
1719 uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1720 		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
1721 
1722 {
1723 	struct uma_zctor_args args;
1724 
1725 	/* This stuff is essential for the zone ctor */
1726 	args.name = name;
1727 	args.size = size;
1728 	args.ctor = ctor;
1729 	args.dtor = dtor;
1730 	args.uminit = uminit;
1731 	args.fini = fini;
1732 	args.align = align;
1733 	args.flags = flags;
1734 	args.keg = NULL;
1735 
1736 	return (uma_zalloc_internal(zones, &args, M_WAITOK));
1737 }
1738 
1739 /* See uma.h */
1740 uma_zone_t
1741 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1742 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
1743 {
1744 	struct uma_zctor_args args;
1745 
1746 	args.name = name;
1747 	args.size = master->uz_keg->uk_size;
1748 	args.ctor = ctor;
1749 	args.dtor = dtor;
1750 	args.uminit = zinit;
1751 	args.fini = zfini;
1752 	args.align = master->uz_keg->uk_align;
1753 	args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY;
1754 	args.keg = master->uz_keg;
1755 
1756 	return (uma_zalloc_internal(zones, &args, M_WAITOK));
1757 }
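
/*
 * Illustrative use of a secondary zone (the names are hypothetical, not
 * taken from this file): two zones can layer different ctors over a
 * single keg, sharing its slabs while handing out differently prepared
 * items:
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    foo_init, foo_fini, UMA_ALIGN_PTR, 0);
 *	foo_prepped_zone = uma_zsecond_create("foo prepped",
 *	    foo_prep_ctor, foo_prep_dtor, NULL, NULL, foo_zone);
 */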
1758 
1759 /* See uma.h */
1760 void
1761 uma_zdestroy(uma_zone_t zone)
1762 {
1763 
1764 	uma_zfree_internal(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
1765 }
1766 
1767 /* See uma.h */
1768 void *
1769 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
1770 {
1771 	void *item;
1772 	uma_cache_t cache;
1773 	uma_bucket_t bucket;
1774 	int cpu;
1775 	int badness;
1776 
1777 	/* This is the fast path allocation */
1778 #ifdef UMA_DEBUG_ALLOC_1
1779 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
1780 #endif
1781 	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
1782 	    zone->uz_name, flags);
1783 
1784 	if (!(flags & M_NOWAIT)) {
1785 		KASSERT(curthread->td_intr_nesting_level == 0,
1786 		   ("malloc(M_WAITOK) in interrupt context"));
1787 		if (nosleepwithlocks) {
1788 #ifdef WITNESS
1789 			badness = WITNESS_CHECK(WARN_GIANTOK | WARN_SLEEPOK,
1790 			    NULL,
1791 			    "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
1792 			    zone->uz_name);
1793 #else
1794 			badness = 1;
1795 #endif
1796 		} else {
1797 			badness = 0;
1798 			WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1799 			    "malloc(M_WAITOK) of \"%s\"", zone->uz_name);
1800 		}
1801 		if (badness) {
1802 			flags &= ~M_WAITOK;
1803 			flags |= M_NOWAIT;
1804 		}
1805 	}
1806 
1807 	/*
1808 	 * If possible, allocate from the per-CPU cache.  There are two
1809 	 * requirements for safe access to the per-CPU cache: (1) the thread
1810 	 * accessing the cache must not be preempted or yield during access,
1811 	 * and (2) the thread must not migrate CPUs without switching which
1812 	 * cache it accesses.  We rely on a critical section to prevent
1813 	 * preemption and migration.  We release the critical section in
1814 	 * order to acquire the zone mutex if we are unable to allocate from
1815 	 * the current cache; when we re-acquire the critical section, we
1816 	 * must detect and handle migration if it has occurred.
1817 	 */
1818 zalloc_restart:
1819 	critical_enter();
1820 	cpu = curcpu;
1821 	cache = &zone->uz_cpu[cpu];
1822 
1823 zalloc_start:
1824 	bucket = cache->uc_allocbucket;
1825 
1826 	if (bucket) {
1827 		if (bucket->ub_cnt > 0) {
1828 			bucket->ub_cnt--;
1829 			item = bucket->ub_bucket[bucket->ub_cnt];
1830 #ifdef INVARIANTS
1831 			bucket->ub_bucket[bucket->ub_cnt] = NULL;
1832 #endif
1833 			KASSERT(item != NULL,
1834 			    ("uma_zalloc: Bucket pointer mangled."));
1835 			cache->uc_allocs++;
1836 			critical_exit();
1837 #ifdef INVARIANTS
1838 			ZONE_LOCK(zone);
1839 			uma_dbg_alloc(zone, NULL, item);
1840 			ZONE_UNLOCK(zone);
1841 #endif
1842 			if (zone->uz_ctor != NULL) {
1843 				if (zone->uz_ctor(item, zone->uz_keg->uk_size,
1844 				    udata, flags) != 0) {
1845 					uma_zfree_internal(zone, item, udata,
1846 					    SKIP_DTOR, ZFREE_STATFAIL |
1847 					    ZFREE_STATFREE);
1848 					return (NULL);
1849 				}
1850 			}
1851 			if (flags & M_ZERO)
1852 				bzero(item, zone->uz_keg->uk_size);
1853 			return (item);
1854 		} else if (cache->uc_freebucket) {
1855 			/*
1856 			 * We have run out of items in our allocbucket.
1857 			 * See if we can switch with our free bucket.
1858 			 */
1859 			if (cache->uc_freebucket->ub_cnt > 0) {
1860 #ifdef UMA_DEBUG_ALLOC
1861 				printf("uma_zalloc: Swapping empty with"
1862 				    " alloc.\n");
1863 #endif
1864 				bucket = cache->uc_freebucket;
1865 				cache->uc_freebucket = cache->uc_allocbucket;
1866 				cache->uc_allocbucket = bucket;
1867 
1868 				goto zalloc_start;
1869 			}
1870 		}
1871 	}
1872 	/*
1873 	 * Our attempt to retrieve the item from the per-CPU cache has failed, so
1874 	 * we must go back to the zone.  This requires the zone lock, so we
1875 	 * must drop the critical section, then re-acquire it when we go back
1876 	 * to the cache.  Since the critical section is released, we may be
1877 	 * preempted or migrate.  As such, make sure not to maintain any
1878 	 * thread-local state specific to the cache from prior to releasing
1879 	 * the critical section.
1880 	 */
1881 	critical_exit();
1882 	ZONE_LOCK(zone);
1883 	critical_enter();
1884 	cpu = curcpu;
1885 	cache = &zone->uz_cpu[cpu];
1886 	bucket = cache->uc_allocbucket;
1887 	if (bucket != NULL) {
1888 		if (bucket->ub_cnt > 0) {
1889 			ZONE_UNLOCK(zone);
1890 			goto zalloc_start;
1891 		}
1892 		bucket = cache->uc_freebucket;
1893 		if (bucket != NULL && bucket->ub_cnt > 0) {
1894 			ZONE_UNLOCK(zone);
1895 			goto zalloc_start;
1896 		}
1897 	}
1898 
1899 	/* Since we have locked the zone we may as well send back our stats */
1900 	zone->uz_allocs += cache->uc_allocs;
1901 	cache->uc_allocs = 0;
1902 	zone->uz_frees += cache->uc_frees;
1903 	cache->uc_frees = 0;
1904 
1905 	/* Our old one is now a free bucket */
1906 	if (cache->uc_allocbucket) {
1907 		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
1908 		    ("uma_zalloc_arg: Freeing a non free bucket."));
1909 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
1910 		    cache->uc_allocbucket, ub_link);
1911 		cache->uc_allocbucket = NULL;
1912 	}
1913 
1914 	/* Check the free list for a new alloc bucket */
1915 	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
1916 		KASSERT(bucket->ub_cnt != 0,
1917 		    ("uma_zalloc_arg: Returning an empty bucket."));
1918 
1919 		LIST_REMOVE(bucket, ub_link);
1920 		cache->uc_allocbucket = bucket;
1921 		ZONE_UNLOCK(zone);
1922 		goto zalloc_start;
1923 	}
1924 	/* We are no longer associated with this CPU. */
1925 	critical_exit();
1926 
1927 	/* Bump up our uz_count so we get here less often. */
1928 	if (zone->uz_count < BUCKET_MAX)
1929 		zone->uz_count++;
1930 
1931 	/*
1932 	 * Now let's just fill a bucket and put it on the free list.  If that
1933 	 * works we'll restart the allocation from the beginning.
1934 	 */
1935 	if (uma_zalloc_bucket(zone, flags)) {
1936 		ZONE_UNLOCK(zone);
1937 		goto zalloc_restart;
1938 	}
1939 	ZONE_UNLOCK(zone);
1940 	/*
1941 	 * We may not be able to get a bucket so return an actual item.
1942 	 */
1943 #ifdef UMA_DEBUG
1944 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
1945 #endif
1946 
1947 	return (uma_zalloc_internal(zone, udata, flags));
1948 }
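
/*
 * Illustrative sketch (not part of the original file): how a consumer
 * typically exercises the allocation fast path above through the uma.h
 * wrappers uma_zalloc()/uma_zfree(), which pass a NULL udata down to
 * uma_zalloc_arg()/uma_zfree_arg().  The "foo" zone, struct foo and
 * example_foo_use() are names invented for this example, and the block
 * is kept out of the build with "#if 0" since it is illustrative only.
 */
#if 0
struct foo {
	int	f_state;
};

static uma_zone_t foo_zone;

static void
example_foo_use(void)
{
	struct foo *fp;

	/* No ctor/dtor or init/fini hooks; default pointer alignment. */
	foo_zone = uma_zcreate("foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	/* M_WAITOK may sleep; M_ZERO makes uma_zalloc_arg() bzero the item. */
	fp = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
	fp->f_state = 1;

	/* Return the item to the per-CPU free bucket (or the zone). */
	uma_zfree(foo_zone, fp);
}
#endif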
1949 
1950 static uma_slab_t
1951 uma_zone_slab(uma_zone_t zone, int flags)
1952 {
1953 	uma_slab_t slab;
1954 	uma_keg_t keg;
1955 
1956 	keg = zone->uz_keg;
1957 
1958 	/*
1959 	 * This is to prevent us from recursively trying to allocate
1960 	 * buckets.  The problem is that if an allocation forces us to
1961 	 * grab a new bucket we will call page_alloc, which will go off
1962 	 * and cause the vm to allocate vm_map_entries.  If we need new
1963 	 * buckets there too we will recurse in kmem_alloc and bad
1964 	 * things happen.  So instead we return a NULL bucket, and make
1965 	 * the code that allocates buckets smart enough to deal with it
1966 	 *
1967 	 * XXX: While we want this protection for the bucket zones so that
1968 	 * recursion from the VM is handled (and the calling code that
1969 	 * allocates buckets knows how to deal with it), we do not want
1970 	 * to prevent allocation from the slab header zones (slabzone
1971 	 * and slabrefzone) if uk_recurse is not zero for them.  The
1972 	 * reason is that it could lead to NULL being returned for
1973 	 * slab header allocations even in the M_WAITOK case, and the
1974 	 * caller can't handle that.
1975 	 */
1976 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
1977 		if ((zone != slabzone) && (zone != slabrefzone))
1978 			return (NULL);
1979 
1980 	slab = NULL;
1981 
1982 	for (;;) {
1983 		/*
1984 		 * Find a slab with some space.  Prefer slabs that are partially
1985 		 * used over those that are totally full.  This helps to reduce
1986 		 * fragmentation.
1987 		 */
1988 		if (keg->uk_free != 0) {
1989 			if (!LIST_EMPTY(&keg->uk_part_slab)) {
1990 				slab = LIST_FIRST(&keg->uk_part_slab);
1991 			} else {
1992 				slab = LIST_FIRST(&keg->uk_free_slab);
1993 				LIST_REMOVE(slab, us_link);
1994 				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
1995 				    us_link);
1996 			}
1997 			return (slab);
1998 		}
1999 
2000 		/*
2001 		 * M_NOVM means don't ask at all!
2002 		 */
2003 		if (flags & M_NOVM)
2004 			break;
2005 
2006 		if (keg->uk_maxpages &&
2007 		    keg->uk_pages >= keg->uk_maxpages) {
2008 			keg->uk_flags |= UMA_ZFLAG_FULL;
2009 
2010 			if (flags & M_NOWAIT)
2011 				break;
2012 			else
2013 				msleep(keg, &keg->uk_lock, PVM,
2014 				    "zonelimit", 0);
2015 			continue;
2016 		}
2017 		keg->uk_recurse++;
2018 		slab = slab_zalloc(zone, flags);
2019 		keg->uk_recurse--;
2020 
2021 		/*
2022 		 * If we got a slab here it's safe to mark it partially used
2023 		 * and return.  We assume that the caller is going to remove
2024 		 * at least one item.
2025 		 */
2026 		if (slab) {
2027 			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2028 			return (slab);
2029 		}
2030 		/*
2031 		 * We might not have been able to get a slab but another cpu
2032 		 * could have while we were unlocked.  Check again before we
2033 		 * fail.
2034 		 */
2035 		if (flags & M_NOWAIT)
2036 			flags |= M_NOVM;
2037 	}
2038 	return (slab);
2039 }
2040 
2041 static void *
2042 uma_slab_alloc(uma_zone_t zone, uma_slab_t slab)
2043 {
2044 	uma_keg_t keg;
2045 	uma_slabrefcnt_t slabref;
2046 	void *item;
2047 	u_int8_t freei;
2048 
2049 	keg = zone->uz_keg;
2050 
2051 	freei = slab->us_firstfree;
2052 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
2053 		slabref = (uma_slabrefcnt_t)slab;
2054 		slab->us_firstfree = slabref->us_freelist[freei].us_item;
2055 	} else {
2056 		slab->us_firstfree = slab->us_freelist[freei].us_item;
2057 	}
2058 	item = slab->us_data + (keg->uk_rsize * freei);
2059 
2060 	slab->us_freecount--;
2061 	keg->uk_free--;
2062 #ifdef INVARIANTS
2063 	uma_dbg_alloc(zone, slab, item);
2064 #endif
2065 	/* Move this slab to the full list */
2066 	if (slab->us_freecount == 0) {
2067 		LIST_REMOVE(slab, us_link);
2068 		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2069 	}
2070 
2071 	return (item);
2072 }
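
/*
 * Illustrative sketch (not part of the original file): a toy reduction of
 * the index-linked freelist that uma_slab_alloc() pops from and
 * uma_zfree_internal() pushes onto.  us_firstfree holds the index of the
 * first free item, each freelist entry stores the index of the next free
 * item, and an item's address is us_data + uk_rsize * index.  The
 * toy_slab names below are invented for the example and the block is
 * kept out of the build with "#if 0".
 */
#if 0
struct toy_slab {
	u_int8_t	*data;		/* base of the slab's item storage */
	u_int8_t	freelist[8];	/* index of the next free item */
	u_int8_t	firstfree;	/* index of the first free item */
	int		rsize;		/* rounded-up item size */
};

static void *
toy_slab_alloc(struct toy_slab *ts)
{
	u_int8_t freei;

	freei = ts->firstfree;			/* pop the head index */
	ts->firstfree = ts->freelist[freei];	/* next free item becomes head */
	return (ts->data + ts->rsize * freei);	/* index -> address */
}

static void
toy_slab_free(struct toy_slab *ts, void *item)
{
	u_int8_t freei;

	/* address -> index, the inverse of the computation above */
	freei = ((u_int8_t *)item - ts->data) / ts->rsize;
	ts->freelist[freei] = ts->firstfree;	/* push onto the head */
	ts->firstfree = freei;
}
#endif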
2073 
2074 static int
2075 uma_zalloc_bucket(uma_zone_t zone, int flags)
2076 {
2077 	uma_bucket_t bucket;
2078 	uma_slab_t slab;
2079 	int16_t saved;
2080 	int max, origflags = flags;
2081 
2082 	/*
2083 	 * Try this zone's free list first so we don't allocate extra buckets.
2084 	 */
2085 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2086 		KASSERT(bucket->ub_cnt == 0,
2087 		    ("uma_zalloc_bucket: Bucket on free list is not empty."));
2088 		LIST_REMOVE(bucket, ub_link);
2089 	} else {
2090 		int bflags;
2091 
2092 		bflags = (flags & ~M_ZERO);
2093 		if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)
2094 			bflags |= M_NOVM;
2095 
2096 		ZONE_UNLOCK(zone);
2097 		bucket = bucket_alloc(zone->uz_count, bflags);
2098 		ZONE_LOCK(zone);
2099 	}
2100 
2101 	if (bucket == NULL)
2102 		return (0);
2103 
2104 #ifdef SMP
2105 	/*
2106 	 * This code is here to limit the number of simultaneous bucket fills
2107 	 * for any given zone to the number of per cpu caches in this zone. This
2108 	 * is done so that we don't allocate more memory than we really need.
2109 	 */
2110 	if (zone->uz_fills >= mp_ncpus)
2111 		goto done;
2112 
2113 #endif
2114 	zone->uz_fills++;
2115 
2116 	max = MIN(bucket->ub_entries, zone->uz_count);
2117 	/* Try to keep the buckets totally full */
2118 	saved = bucket->ub_cnt;
2119 	while (bucket->ub_cnt < max &&
2120 	    (slab = uma_zone_slab(zone, flags)) != NULL) {
2121 		while (slab->us_freecount && bucket->ub_cnt < max) {
2122 			bucket->ub_bucket[bucket->ub_cnt++] =
2123 			    uma_slab_alloc(zone, slab);
2124 		}
2125 
2126 		/* Don't block on the next fill */
2127 		flags |= M_NOWAIT;
2128 	}
2129 
2130 	/*
2131 	 * We unlock here because we need to call the zone's init.
2132 	 * It should be safe to unlock because the slab dealt with
2133 	 * above is already on the appropriate list within the keg
2134 	 * and the bucket we filled is not yet on any list, so we
2135 	 * own it.
2136 	 */
2137 	if (zone->uz_init != NULL) {
2138 		int i;
2139 
2140 		ZONE_UNLOCK(zone);
2141 		for (i = saved; i < bucket->ub_cnt; i++)
2142 			if (zone->uz_init(bucket->ub_bucket[i],
2143 			    zone->uz_keg->uk_size, origflags) != 0)
2144 				break;
2145 		/*
2146 		 * If we couldn't initialize the whole bucket, put the
2147 		 * rest back onto the freelist.
2148 		 */
2149 		if (i != bucket->ub_cnt) {
2150 			int j;
2151 
2152 			for (j = i; j < bucket->ub_cnt; j++) {
2153 				uma_zfree_internal(zone, bucket->ub_bucket[j],
2154 				    NULL, SKIP_FINI, 0);
2155 #ifdef INVARIANTS
2156 				bucket->ub_bucket[j] = NULL;
2157 #endif
2158 			}
2159 			bucket->ub_cnt = i;
2160 		}
2161 		ZONE_LOCK(zone);
2162 	}
2163 
2164 	zone->uz_fills--;
2165 	if (bucket->ub_cnt != 0) {
2166 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
2167 		    bucket, ub_link);
2168 		return (1);
2169 	}
2170 #ifdef SMP
2171 done:
2172 #endif
2173 	bucket_free(bucket);
2174 
2175 	return (0);
2176 }
2177 /*
2178  * Allocates an item for an internal zone
2179  *
2180  * Arguments
2181  *	zone   The zone to alloc for.
2182  *	udata  The data to be passed to the constructor.
2183  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2184  *
2185  * Returns
2186  *	NULL if there is no memory and M_NOWAIT is set
2187  *	An item if successful
2188  */
2189 
2190 static void *
2191 uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
2192 {
2193 	uma_keg_t keg;
2194 	uma_slab_t slab;
2195 	void *item;
2196 
2197 	item = NULL;
2198 	keg = zone->uz_keg;
2199 
2200 #ifdef UMA_DEBUG_ALLOC
2201 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2202 #endif
2203 	ZONE_LOCK(zone);
2204 
2205 	slab = uma_zone_slab(zone, flags);
2206 	if (slab == NULL) {
2207 		zone->uz_fails++;
2208 		ZONE_UNLOCK(zone);
2209 		return (NULL);
2210 	}
2211 
2212 	item = uma_slab_alloc(zone, slab);
2213 
2214 	zone->uz_allocs++;
2215 
2216 	ZONE_UNLOCK(zone);
2217 
2218 	/*
2219 	 * We have to call both the zone's init (not the keg's init)
2220 	 * and the zone's ctor.  This is because the item is going from
2221 	 * a keg slab directly to the user, and the user is expecting it
2222 	 * to be both zone-init'd as well as zone-ctor'd.
2223 	 */
2224 	if (zone->uz_init != NULL) {
2225 		if (zone->uz_init(item, keg->uk_size, flags) != 0) {
2226 			uma_zfree_internal(zone, item, udata, SKIP_FINI,
2227 			    ZFREE_STATFAIL | ZFREE_STATFREE);
2228 			return (NULL);
2229 		}
2230 	}
2231 	if (zone->uz_ctor != NULL) {
2232 		if (zone->uz_ctor(item, keg->uk_size, udata, flags) != 0) {
2233 			uma_zfree_internal(zone, item, udata, SKIP_DTOR,
2234 			    ZFREE_STATFAIL | ZFREE_STATFREE);
2235 			return (NULL);
2236 		}
2237 	}
2238 	if (flags & M_ZERO)
2239 		bzero(item, keg->uk_size);
2240 
2241 	return (item);
2242 }
2243 
2244 /* See uma.h */
2245 void
2246 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2247 {
2248 	uma_keg_t keg;
2249 	uma_cache_t cache;
2250 	uma_bucket_t bucket;
2251 	int bflags;
2252 	int cpu;
2253 
2254 	keg = zone->uz_keg;
2255 
2256 #ifdef UMA_DEBUG_ALLOC_1
2257 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2258 #endif
2259 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2260 	    zone->uz_name);
2261 
2262 	if (zone->uz_dtor)
2263 		zone->uz_dtor(item, keg->uk_size, udata);
2264 #ifdef INVARIANTS
2265 	ZONE_LOCK(zone);
2266 	if (keg->uk_flags & UMA_ZONE_MALLOC)
2267 		uma_dbg_free(zone, udata, item);
2268 	else
2269 		uma_dbg_free(zone, NULL, item);
2270 	ZONE_UNLOCK(zone);
2271 #endif
2272 	/*
2273 	 * The race here is acceptable.  If we miss it we'll just have to wait
2274 	 * a little longer for the limits to be reset.
2275 	 */
2276 	if (keg->uk_flags & UMA_ZFLAG_FULL)
2277 		goto zfree_internal;
2278 
2279 	/*
2280 	 * If possible, free to the per-CPU cache.  There are two
2281 	 * requirements for safe access to the per-CPU cache: (1) the thread
2282 	 * accessing the cache must not be preempted or yield during access,
2283 	 * and (2) the thread must not migrate CPUs without switching which
2284 	 * cache it accesses.  We rely on a critical section to prevent
2285 	 * preemption and migration.  We release the critical section in
2286 	 * order to acquire the zone mutex if we are unable to free to the
2287 	 * current cache; when we re-acquire the critical section, we must
2288 	 * detect and handle migration if it has occurred.
2289 	 */
2290 zfree_restart:
2291 	critical_enter();
2292 	cpu = curcpu;
2293 	cache = &zone->uz_cpu[cpu];
2294 
2295 zfree_start:
2296 	bucket = cache->uc_freebucket;
2297 
2298 	if (bucket) {
2299 		/*
2300 		 * Do we have room in our bucket? It is OK for this uz count
2301 		 * check to be slightly out of sync.
2302 		 */
2303 
2304 		if (bucket->ub_cnt < bucket->ub_entries) {
2305 			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2306 			    ("uma_zfree: Freeing to non free bucket index."));
2307 			bucket->ub_bucket[bucket->ub_cnt] = item;
2308 			bucket->ub_cnt++;
2309 			cache->uc_frees++;
2310 			critical_exit();
2311 			return;
2312 		} else if (cache->uc_allocbucket) {
2313 #ifdef UMA_DEBUG_ALLOC
2314 			printf("uma_zfree: Swapping buckets.\n");
2315 #endif
2316 			/*
2317 			 * We have run out of space in our freebucket.
2318 			 * See if we can switch with our alloc bucket.
2319 			 */
2320 			if (cache->uc_allocbucket->ub_cnt <
2321 			    cache->uc_freebucket->ub_cnt) {
2322 				bucket = cache->uc_freebucket;
2323 				cache->uc_freebucket = cache->uc_allocbucket;
2324 				cache->uc_allocbucket = bucket;
2325 				goto zfree_start;
2326 			}
2327 		}
2328 	}
2329 	/*
2330 	 * We can get here for two reasons:
2331 	 *
2332 	 * 1) The buckets are NULL
2333 	 * 2) The alloc and free buckets are both somewhat full.
2334 	 *
2335 	 * We must go back to the zone, which requires acquiring the zone lock,
2336 	 * which in turn means we must release and re-acquire the critical
2337 	 * section.  Since the critical section is released, we may be
2338 	 * preempted or migrate.  As such, make sure not to maintain any
2339 	 * thread-local state specific to the cache from prior to releasing
2340 	 * the critical section.
2341 	 */
2342 	critical_exit();
2343 	ZONE_LOCK(zone);
2344 	critical_enter();
2345 	cpu = curcpu;
2346 	cache = &zone->uz_cpu[cpu];
2347 	if (cache->uc_freebucket != NULL) {
2348 		if (cache->uc_freebucket->ub_cnt <
2349 		    cache->uc_freebucket->ub_entries) {
2350 			ZONE_UNLOCK(zone);
2351 			goto zfree_start;
2352 		}
2353 		if (cache->uc_allocbucket != NULL &&
2354 		    (cache->uc_allocbucket->ub_cnt <
2355 		    cache->uc_freebucket->ub_cnt)) {
2356 			ZONE_UNLOCK(zone);
2357 			goto zfree_start;
2358 		}
2359 	}
2360 
2361 	/* Since we have locked the zone we may as well send back our stats */
2362 	zone->uz_allocs += cache->uc_allocs;
2363 	cache->uc_allocs = 0;
2364 	zone->uz_frees += cache->uc_frees;
2365 	cache->uc_frees = 0;
2366 
2367 	bucket = cache->uc_freebucket;
2368 	cache->uc_freebucket = NULL;
2369 
2370 	/* Can we throw this on the zone full list? */
2371 	if (bucket != NULL) {
2372 #ifdef UMA_DEBUG_ALLOC
2373 		printf("uma_zfree: Putting old bucket on the free list.\n");
2374 #endif
2375 		/* ub_cnt counts the free items held in the bucket */
2376 		KASSERT(bucket->ub_cnt != 0,
2377 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2378 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
2379 		    bucket, ub_link);
2380 	}
2381 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2382 		LIST_REMOVE(bucket, ub_link);
2383 		ZONE_UNLOCK(zone);
2384 		cache->uc_freebucket = bucket;
2385 		goto zfree_start;
2386 	}
2387 	/* We are no longer associated with this CPU. */
2388 	critical_exit();
2389 
2390 	/* And the zone. */
2391 	ZONE_UNLOCK(zone);
2392 
2393 #ifdef UMA_DEBUG_ALLOC
2394 	printf("uma_zfree: Allocating new free bucket.\n");
2395 #endif
2396 	bflags = M_NOWAIT;
2397 
2398 	if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)
2399 		bflags |= M_NOVM;
2400 	bucket = bucket_alloc(zone->uz_count, bflags);
2401 	if (bucket) {
2402 		ZONE_LOCK(zone);
2403 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
2404 		    bucket, ub_link);
2405 		ZONE_UNLOCK(zone);
2406 		goto zfree_restart;
2407 	}
2408 
2409 	/*
2410 	 * If nothing else caught this, we'll just do an internal free.
2411 	 */
2412 zfree_internal:
2413 	uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
2414 
2415 	return;
2416 }
2417 
2418 /*
2419  * Frees an item to an INTERNAL zone or allocates a free bucket
2420  *
2421  * Arguments:
2422  *	zone   The zone to free to
2423  *	item   The item we're freeing
2424  *	udata  User supplied data for the dtor
2425  *	skip   Skip dtors and finis
2426  */
2427 static void
2428 uma_zfree_internal(uma_zone_t zone, void *item, void *udata,
2429     enum zfreeskip skip, int flags)
2430 {
2431 	uma_slab_t slab;
2432 	uma_slabrefcnt_t slabref;
2433 	uma_keg_t keg;
2434 	u_int8_t *mem;
2435 	u_int8_t freei;
2436 
2437 	keg = zone->uz_keg;
2438 
2439 	if (skip < SKIP_DTOR && zone->uz_dtor)
2440 		zone->uz_dtor(item, keg->uk_size, udata);
2441 	if (skip < SKIP_FINI && zone->uz_fini)
2442 		zone->uz_fini(item, keg->uk_size);
2443 
2444 	ZONE_LOCK(zone);
2445 
2446 	if (flags & ZFREE_STATFAIL)
2447 		zone->uz_fails++;
2448 	if (flags & ZFREE_STATFREE)
2449 		zone->uz_frees++;
2450 
2451 	if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {
2452 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2453 		if (keg->uk_flags & UMA_ZONE_HASH)
2454 			slab = hash_sfind(&keg->uk_hash, mem);
2455 		else {
2456 			mem += keg->uk_pgoff;
2457 			slab = (uma_slab_t)mem;
2458 		}
2459 	} else {
2460 		slab = (uma_slab_t)udata;
2461 	}
2462 
2463 	/* Do we need to remove from any lists? */
2464 	if (slab->us_freecount+1 == keg->uk_ipers) {
2465 		LIST_REMOVE(slab, us_link);
2466 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2467 	} else if (slab->us_freecount == 0) {
2468 		LIST_REMOVE(slab, us_link);
2469 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2470 	}
2471 
2472 	/* Slab management stuff */
2473 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
2474 		/ keg->uk_rsize;
2475 
2476 #ifdef INVARIANTS
2477 	if (!skip)
2478 		uma_dbg_free(zone, slab, item);
2479 #endif
2480 
2481 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
2482 		slabref = (uma_slabrefcnt_t)slab;
2483 		slabref->us_freelist[freei].us_item = slab->us_firstfree;
2484 	} else {
2485 		slab->us_freelist[freei].us_item = slab->us_firstfree;
2486 	}
2487 	slab->us_firstfree = freei;
2488 	slab->us_freecount++;
2489 
2490 	/* Zone statistics */
2491 	keg->uk_free++;
2492 
2493 	if (keg->uk_flags & UMA_ZFLAG_FULL) {
2494 		if (keg->uk_pages < keg->uk_maxpages)
2495 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
2496 
2497 		/* We can handle one more allocation */
2498 		wakeup_one(keg);
2499 	}
2500 
2501 	ZONE_UNLOCK(zone);
2502 }
2503 
2504 /* See uma.h */
2505 void
2506 uma_zone_set_max(uma_zone_t zone, int nitems)
2507 {
2508 	uma_keg_t keg;
2509 
2510 	keg = zone->uz_keg;
2511 	ZONE_LOCK(zone);
2512 	if (keg->uk_ppera > 1)
2513 		keg->uk_maxpages = nitems * keg->uk_ppera;
2514 	else
2515 		keg->uk_maxpages = nitems / keg->uk_ipers;
2516 
2517 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
2518 		keg->uk_maxpages++;
2519 
2520 	ZONE_UNLOCK(zone);
2521 }
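
/*
 * Illustrative sketch (not part of the original file): the limit set above
 * is kept in whole pages, so it is rounded up to whole slabs.  With, say,
 * uk_ipers == 50 and uk_ppera == 1, asking for 120 items gives
 * uk_maxpages == 120 / 50 == 2, the 2 * 50 < 120 check bumps it to 3, and
 * the effective cap is 150 items.  "foo_zone" is a name invented for the
 * example and the block is kept out of the build with "#if 0".
 */
#if 0
static uma_zone_t foo_zone;

static void
example_foo_limit(void)
{
	/* Cap the zone at roughly 120 items, rounded up to whole slabs. */
	uma_zone_set_max(foo_zone, 120);
}
#endif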
2522 
2523 /* See uma.h */
2524 void
2525 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2526 {
2527 	ZONE_LOCK(zone);
2528 	KASSERT(zone->uz_keg->uk_pages == 0,
2529 	    ("uma_zone_set_init on non-empty keg"));
2530 	zone->uz_keg->uk_init = uminit;
2531 	ZONE_UNLOCK(zone);
2532 }
2533 
2534 /* See uma.h */
2535 void
2536 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2537 {
2538 	ZONE_LOCK(zone);
2539 	KASSERT(zone->uz_keg->uk_pages == 0,
2540 	    ("uma_zone_set_fini on non-empty keg"));
2541 	zone->uz_keg->uk_fini = fini;
2542 	ZONE_UNLOCK(zone);
2543 }
2544 
2545 /* See uma.h */
2546 void
2547 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2548 {
2549 	ZONE_LOCK(zone);
2550 	KASSERT(zone->uz_keg->uk_pages == 0,
2551 	    ("uma_zone_set_zinit on non-empty keg"));
2552 	zone->uz_init = zinit;
2553 	ZONE_UNLOCK(zone);
2554 }
2555 
2556 /* See uma.h */
2557 void
2558 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
2559 {
2560 	ZONE_LOCK(zone);
2561 	KASSERT(zone->uz_keg->uk_pages == 0,
2562 	    ("uma_zone_set_zfini on non-empty keg"));
2563 	zone->uz_fini = zfini;
2564 	ZONE_UNLOCK(zone);
2565 }
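
/*
 * Illustrative sketch (not part of the original file): registering
 * keg-level init/fini hooks with the setters above.  As the assertions
 * require, this must happen before the keg has allocated any pages.  The
 * keg-level hooks apply at the slab level, while the zone-level
 * uz_init/uz_fini hooks seen in uma_zalloc_bucket() and
 * uma_zalloc_internal() run as items move between the keg and the zone.
 * The foo_* names are invented for the example and the block is kept out
 * of the build with "#if 0".
 */
#if 0
static uma_zone_t foo_zone;

static int
foo_init(void *mem, int size, int flags)
{
	/* Give every item a known starting state; returning 0 means success. */
	bzero(mem, size);
	return (0);
}

static void
foo_fini(void *mem, int size)
{
	/* Undo whatever foo_init() set up. */
}

static void
example_foo_hooks(void)
{
	uma_zone_set_init(foo_zone, foo_init);
	uma_zone_set_fini(foo_zone, foo_fini);
}
#endif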
2566 
2567 /* See uma.h */
2568 /* XXX uk_freef is not actually used with the zone locked */
2569 void
2570 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
2571 {
2572 	ZONE_LOCK(zone);
2573 	zone->uz_keg->uk_freef = freef;
2574 	ZONE_UNLOCK(zone);
2575 }
2576 
2577 /* See uma.h */
2578 /* XXX uk_allocf is not actually used with the zone locked */
2579 void
2580 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
2581 {
2582 	ZONE_LOCK(zone);
2583 	zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
2584 	zone->uz_keg->uk_allocf = allocf;
2585 	ZONE_UNLOCK(zone);
2586 }
2587 
2588 /* See uma.h */
2589 int
2590 uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
2591 {
2592 	uma_keg_t keg;
2593 	vm_offset_t kva;
2594 	int pages;
2595 
2596 	keg = zone->uz_keg;
2597 	pages = count / keg->uk_ipers;
2598 
2599 	if (pages * keg->uk_ipers < count)
2600 		pages++;
2601 
2602 	kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
2603 
2604 	if (kva == 0)
2605 		return (0);
2606 	if (obj == NULL) {
2607 		obj = vm_object_allocate(OBJT_DEFAULT,
2608 		    pages);
2609 	} else {
2610 		VM_OBJECT_LOCK_INIT(obj, "uma object");
2611 		_vm_object_allocate(OBJT_DEFAULT,
2612 		    pages, obj);
2613 	}
2614 	ZONE_LOCK(zone);
2615 	keg->uk_kva = kva;
2616 	keg->uk_obj = obj;
2617 	keg->uk_maxpages = pages;
2618 	keg->uk_allocf = obj_alloc;
2619 	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
2620 	ZONE_UNLOCK(zone);
2621 	return (1);
2622 }
2623 
2624 /* See uma.h */
2625 void
2626 uma_prealloc(uma_zone_t zone, int items)
2627 {
2628 	int slabs;
2629 	uma_slab_t slab;
2630 	uma_keg_t keg;
2631 
2632 	keg = zone->uz_keg;
2633 	ZONE_LOCK(zone);
2634 	slabs = items / keg->uk_ipers;
2635 	if (slabs * keg->uk_ipers < items)
2636 		slabs++;
2637 	while (slabs > 0) {
2638 		slab = slab_zalloc(zone, M_WAITOK);
2639 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2640 		slabs--;
2641 	}
2642 	ZONE_UNLOCK(zone);
2643 }
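
/*
 * Illustrative sketch (not part of the original file): pre-populating a
 * zone's free slab list during boot or module load so that early M_NOWAIT
 * allocations are unlikely to fail.  "foo_zone" is a name invented for
 * the example and the block is kept out of the build with "#if 0".
 */
#if 0
static uma_zone_t foo_zone;

static void
example_foo_prealloc(void)
{
	/* Allocate enough slabs up front to cover at least 256 items. */
	uma_prealloc(foo_zone, 256);
}
#endif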
2644 
2645 /* See uma.h */
2646 u_int32_t *
2647 uma_find_refcnt(uma_zone_t zone, void *item)
2648 {
2649 	uma_slabrefcnt_t slabref;
2650 	uma_keg_t keg;
2651 	u_int32_t *refcnt;
2652 	int idx;
2653 
2654 	keg = zone->uz_keg;
2655 	slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
2656 	    (~UMA_SLAB_MASK));
2657 	KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
2658 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
2659 	idx = ((unsigned long)item - (unsigned long)slabref->us_data)
2660 	    / keg->uk_rsize;
2661 	refcnt = &slabref->us_freelist[idx].us_refcnt;
2662 	return refcnt;
2663 }
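
/*
 * Illustrative sketch (not part of the original file): zones created with
 * UMA_ZONE_REFCNT keep a per-item reference count in the slab header,
 * which uma_find_refcnt() locates from the item address.  The
 * refcnt_zone name is invented for the example and the block is kept out
 * of the build with "#if 0".
 */
#if 0
static uma_zone_t refcnt_zone;	/* created elsewhere with UMA_ZONE_REFCNT */

static void
example_refcnt_init(void *item)
{
	u_int32_t *refcnt;

	/* A freshly allocated item starts with a single reference. */
	refcnt = uma_find_refcnt(refcnt_zone, item);
	*refcnt = 1;
}
#endif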
2664 
2665 /* See uma.h */
2666 void
2667 uma_reclaim(void)
2668 {
2669 #ifdef UMA_DEBUG
2670 	printf("UMA: vm asked us to release pages!\n");
2671 #endif
2672 	bucket_enable();
2673 	zone_foreach(zone_drain);
2674 	/*
2675 	 * Some slabs may have been freed, but this zone is visited early in
2676 	 * the pass above; visit it again so that pages left empty once the
2677 	 * other zones are drained can be freed.  The same goes for buckets.
2678 	 */
2679 	zone_drain(slabzone);
2680 	zone_drain(slabrefzone);
2681 	bucket_zone_drain();
2682 }
2683 
2684 /* See uma.h */
2685 int
2686 uma_zone_exhausted(uma_zone_t zone)
2687 {
2688 	int full;
2689 
2690 	ZONE_LOCK(zone);
2691 	full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL);
2692 	ZONE_UNLOCK(zone);
2693 	return (full);
2694 }
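
/*
 * Illustrative sketch (not part of the original file): a caller using
 * M_NOWAIT against a zone limited with uma_zone_set_max() can consult
 * uma_zone_exhausted() to tell "the zone hit its configured cap"
 * (UMA_ZFLAG_FULL is set) apart from a transient page shortage.  The
 * names are invented for the example and the block is kept out of the
 * build with "#if 0".
 */
#if 0
static uma_zone_t foo_zone;

static void *
example_foo_tryalloc(void)
{
	void *item;

	item = uma_zalloc(foo_zone, M_NOWAIT);
	if (item == NULL && uma_zone_exhausted(foo_zone))
		printf("foo zone: configured limit reached\n");
	return (item);
}
#endif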
2695 
2696 void *
2697 uma_large_malloc(int size, int wait)
2698 {
2699 	void *mem;
2700 	uma_slab_t slab;
2701 	u_int8_t flags;
2702 
2703 	slab = uma_zalloc_internal(slabzone, NULL, wait);
2704 	if (slab == NULL)
2705 		return (NULL);
2706 	mem = page_alloc(NULL, size, &flags, wait);
2707 	if (mem) {
2708 		vsetslab((vm_offset_t)mem, slab);
2709 		slab->us_data = mem;
2710 		slab->us_flags = flags | UMA_SLAB_MALLOC;
2711 		slab->us_size = size;
2712 	} else {
2713 		uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE,
2714 		    ZFREE_STATFAIL | ZFREE_STATFREE);
2715 	}
2716 
2717 	return (mem);
2718 }
2719 
2720 void
2721 uma_large_free(uma_slab_t slab)
2722 {
2723 	vsetobj((vm_offset_t)slab->us_data, kmem_object);
2724 	page_free(slab->us_data, slab->us_size, slab->us_flags);
2725 	uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
2726 }
2727 
2728 void
2729 uma_print_stats(void)
2730 {
2731 	zone_foreach(uma_print_zone);
2732 }
2733 
2734 static void
2735 slab_print(uma_slab_t slab)
2736 {
2737 	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
2738 		slab->us_keg, slab->us_data, slab->us_freecount,
2739 		slab->us_firstfree);
2740 }
2741 
2742 static void
2743 cache_print(uma_cache_t cache)
2744 {
2745 	printf("alloc: %p(%d), free: %p(%d)\n",
2746 		cache->uc_allocbucket,
2747 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
2748 		cache->uc_freebucket,
2749 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
2750 }
2751 
2752 void
2753 uma_print_zone(uma_zone_t zone)
2754 {
2755 	uma_cache_t cache;
2756 	uma_keg_t keg;
2757 	uma_slab_t slab;
2758 	int i;
2759 
2760 	keg = zone->uz_keg;
2761 	printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
2762 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
2763 	    keg->uk_ipers, keg->uk_ppera,
2764 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
2765 	printf("Part slabs:\n");
2766 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
2767 		slab_print(slab);
2768 	printf("Free slabs:\n");
2769 	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
2770 		slab_print(slab);
2771 	printf("Full slabs:\n");
2772 	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
2773 		slab_print(slab);
2774 	for (i = 0; i <= mp_maxid; i++) {
2775 		if (CPU_ABSENT(i))
2776 			continue;
2777 		cache = &zone->uz_cpu[i];
2778 		printf("CPU %d Cache:\n", i);
2779 		cache_print(cache);
2780 	}
2781 }
2782 
2783 #ifdef DDB
2784 /*
2785  * Generate statistics across both the zone and its per-CPU caches.  Return
2786  * the desired statistics through any pointer that is non-NULL.
2787  *
2788  * Note: does not update the zone statistics, as it can't safely clear the
2789  * per-CPU cache statistic.
2790  *
2791  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
2792  * safe from off-CPU; we should modify the caches to track this information
2793  * directly so that we don't have to.
2794  */
2795 static void
2796 uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
2797     u_int64_t *freesp)
2798 {
2799 	uma_cache_t cache;
2800 	u_int64_t allocs, frees;
2801 	int cachefree, cpu;
2802 
2803 	allocs = frees = 0;
2804 	cachefree = 0;
2805 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
2806 		if (CPU_ABSENT(cpu))
2807 			continue;
2808 		cache = &z->uz_cpu[cpu];
2809 		if (cache->uc_allocbucket != NULL)
2810 			cachefree += cache->uc_allocbucket->ub_cnt;
2811 		if (cache->uc_freebucket != NULL)
2812 			cachefree += cache->uc_freebucket->ub_cnt;
2813 		allocs += cache->uc_allocs;
2814 		frees += cache->uc_frees;
2815 	}
2816 	allocs += z->uz_allocs;
2817 	frees += z->uz_frees;
2818 	if (cachefreep != NULL)
2819 		*cachefreep = cachefree;
2820 	if (allocsp != NULL)
2821 		*allocsp = allocs;
2822 	if (freesp != NULL)
2823 		*freesp = frees;
2824 }
2825 #endif /* DDB */
2826 
2827 static int
2828 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
2829 {
2830 	uma_keg_t kz;
2831 	uma_zone_t z;
2832 	int count;
2833 
2834 	count = 0;
2835 	mtx_lock(&uma_mtx);
2836 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
2837 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
2838 			count++;
2839 	}
2840 	mtx_unlock(&uma_mtx);
2841 	return (sysctl_handle_int(oidp, &count, 0, req));
2842 }
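
/*
 * Illustrative sketch (not part of the original file): the handler above
 * backs a read-only integer sysctl registered elsewhere in this file; the
 * "vm.zone_count" name used below is assumed from that registration.  A
 * userland monitor would read it roughly like this.  This is userland
 * code and is kept out of the kernel build with "#if 0".
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int count;
	size_t len = sizeof(count);

	if (sysctlbyname("vm.zone_count", &count, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("%d UMA zones\n", count);
	return (0);
}
#endif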
2843 
2844 static int
2845 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
2846 {
2847 	struct uma_stream_header ush;
2848 	struct uma_type_header uth;
2849 	struct uma_percpu_stat ups;
2850 	uma_bucket_t bucket;
2851 	struct sbuf sbuf;
2852 	uma_cache_t cache;
2853 	uma_keg_t kz;
2854 	uma_zone_t z;
2855 	char *buffer;
2856 	int buflen, count, error, i;
2857 
2858 	mtx_lock(&uma_mtx);
2859 restart:
2860 	mtx_assert(&uma_mtx, MA_OWNED);
2861 	count = 0;
2862 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
2863 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
2864 			count++;
2865 	}
2866 	mtx_unlock(&uma_mtx);
2867 
2868 	buflen = sizeof(ush) + count * (sizeof(uth) + sizeof(ups) *
2869 	    (mp_maxid + 1)) + 1;
2870 	buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
2871 
2872 	mtx_lock(&uma_mtx);
2873 	i = 0;
2874 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
2875 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
2876 			i++;
2877 	}
2878 	if (i > count) {
2879 		free(buffer, M_TEMP);
2880 		goto restart;
2881 	}
2882 	count =  i;
2883 
2884 	sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN);
2885 
2886 	/*
2887 	 * Insert stream header.
2888 	 */
2889 	bzero(&ush, sizeof(ush));
2890 	ush.ush_version = UMA_STREAM_VERSION;
2891 	ush.ush_maxcpus = (mp_maxid + 1);
2892 	ush.ush_count = count;
2893 	if (sbuf_bcat(&sbuf, &ush, sizeof(ush)) < 0) {
2894 		mtx_unlock(&uma_mtx);
2895 		error = ENOMEM;
2896 		goto out;
2897 	}
2898 
2899 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
2900 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
2901 			bzero(&uth, sizeof(uth));
2902 			ZONE_LOCK(z);
2903 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
2904 			uth.uth_align = kz->uk_align;
2905 			uth.uth_pages = kz->uk_pages;
2906 			uth.uth_keg_free = kz->uk_free;
2907 			uth.uth_size = kz->uk_size;
2908 			uth.uth_rsize = kz->uk_rsize;
2909 			uth.uth_maxpages = kz->uk_maxpages;
2910 			if (kz->uk_ppera > 1)
2911 				uth.uth_limit = kz->uk_maxpages /
2912 				    kz->uk_ppera;
2913 			else
2914 				uth.uth_limit = kz->uk_maxpages *
2915 				    kz->uk_ipers;
2916 
2917 			/*
2918 			 * A zone is secondary if it is not the first entry
2919 			 * on the keg's zone list.
2920 			 */
2921 			if ((kz->uk_flags & UMA_ZONE_SECONDARY) &&
2922 			    (LIST_FIRST(&kz->uk_zones) != z))
2923 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
2924 
2925 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
2926 				uth.uth_zone_free += bucket->ub_cnt;
2927 			uth.uth_allocs = z->uz_allocs;
2928 			uth.uth_frees = z->uz_frees;
2929 			uth.uth_fails = z->uz_fails;
2930 			if (sbuf_bcat(&sbuf, &uth, sizeof(uth)) < 0) {
2931 				ZONE_UNLOCK(z);
2932 				mtx_unlock(&uma_mtx);
2933 				error = ENOMEM;
2934 				goto out;
2935 			}
2936 			/*
2937 			 * While it is not normally safe to access the cache
2938 			 * bucket pointers while not on the CPU that owns the
2939 			 * cache, we only allow the pointers to be exchanged
2940 			 * without the zone lock held, not invalidated, so
2941 			 * accept the possible race associated with bucket
2942 			 * exchange during monitoring.
2943 			 */
2944 			for (i = 0; i < (mp_maxid + 1); i++) {
2945 				bzero(&ups, sizeof(ups));
2946 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
2947 					goto skip;
2948 				if (CPU_ABSENT(i))
2949 					goto skip;
2950 				cache = &z->uz_cpu[i];
2951 				if (cache->uc_allocbucket != NULL)
2952 					ups.ups_cache_free +=
2953 					    cache->uc_allocbucket->ub_cnt;
2954 				if (cache->uc_freebucket != NULL)
2955 					ups.ups_cache_free +=
2956 					    cache->uc_freebucket->ub_cnt;
2957 				ups.ups_allocs = cache->uc_allocs;
2958 				ups.ups_frees = cache->uc_frees;
2959 skip:
2960 				if (sbuf_bcat(&sbuf, &ups, sizeof(ups)) < 0) {
2961 					ZONE_UNLOCK(z);
2962 					mtx_unlock(&uma_mtx);
2963 					error = ENOMEM;
2964 					goto out;
2965 				}
2966 			}
2967 			ZONE_UNLOCK(z);
2968 		}
2969 	}
2970 	mtx_unlock(&uma_mtx);
2971 	sbuf_finish(&sbuf);
2972 	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
2973 out:
2974 	free(buffer, M_TEMP);
2975 	return (error);
2976 }
2977 
2978 #ifdef DDB
2979 DB_SHOW_COMMAND(uma, db_show_uma)
2980 {
2981 	u_int64_t allocs, frees;
2982 	uma_bucket_t bucket;
2983 	uma_keg_t kz;
2984 	uma_zone_t z;
2985 	int cachefree;
2986 
2987 	db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free",
2988 	    "Requests");
2989 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
2990 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
2991 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
2992 				allocs = z->uz_allocs;
2993 				frees = z->uz_frees;
2994 				cachefree = 0;
2995 			} else
2996 				uma_zone_sumstat(z, &cachefree, &allocs,
2997 				    &frees);
2998 			if (!((kz->uk_flags & UMA_ZONE_SECONDARY) &&
2999 			    (LIST_FIRST(&kz->uk_zones) != z)))
3000 				cachefree += kz->uk_free;
3001 			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
3002 				cachefree += bucket->ub_cnt;
3003 			db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name,
3004 			    (uintmax_t)kz->uk_size,
3005 			    (intmax_t)(allocs - frees), cachefree,
3006 			    (uintmax_t)allocs);
3007 		}
3008 	}
3009 }
3010 #endif
3011