xref: /freebsd/sys/vm/uma_core.c (revision 6fd05b64b5b65dd4ba9b86482a0634a5f0b96c29)
1 /*
2  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 /*
28  * uma_core.c  Implementation of the Universal Memory allocator
29  *
30  * This allocator is intended to replace the multitude of similar object caches
31  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
32  * efficient.  A primary design goal is to return unused memory to the rest of
33  * the system.  This makes the system as a whole more flexible due to the
34  * ability to move memory to subsystems which need it most, instead of leaving
35  * pools of reserved memory unused.
36  *
37  * The basic ideas stem from similar slab/zone based allocators whose algorithms
38  * are well known.
39  *
40  */
41 
42 /*
43  * TODO:
44  *	- Improve memory usage for large allocations
45  *	- Investigate cache size adjustments
46  */
47 
48 #include <sys/cdefs.h>
49 __FBSDID("$FreeBSD$");
50 
51 /* I should really use ktr.. */
52 /*
53 #define UMA_DEBUG 1
54 #define UMA_DEBUG_ALLOC 1
55 #define UMA_DEBUG_ALLOC_1 1
56 */
57 
58 #include "opt_param.h"
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/kernel.h>
62 #include <sys/types.h>
63 #include <sys/queue.h>
64 #include <sys/malloc.h>
65 #include <sys/lock.h>
66 #include <sys/sysctl.h>
67 #include <sys/mutex.h>
68 #include <sys/proc.h>
69 #include <sys/smp.h>
70 #include <sys/vmmeter.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_object.h>
74 #include <vm/vm_page.h>
75 #include <vm/vm_param.h>
76 #include <vm/vm_map.h>
77 #include <vm/vm_kern.h>
78 #include <vm/vm_extern.h>
79 #include <vm/uma.h>
80 #include <vm/uma_int.h>
81 #include <vm/uma_dbg.h>
82 
83 #include <machine/vmparam.h>
84 
85 /*
86  * This is the zone and keg from which all zones are spawned.  The idea is that
87  * even the zone & keg heads are allocated from the allocator, so we use the
88  * bss section to bootstrap us.
89  */
90 static struct uma_keg masterkeg;
91 static struct uma_zone masterzone_k;
92 static struct uma_zone masterzone_z;
93 static uma_zone_t kegs = &masterzone_k;
94 static uma_zone_t zones = &masterzone_z;
95 
96 /* This is the zone from which all of uma_slab_t's are allocated. */
97 static uma_zone_t slabzone;
98 static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
99 
100 /*
101  * The initial hash tables come out of this zone so they can be allocated
102  * prior to malloc coming up.
103  */
104 static uma_zone_t hashzone;
105 
106 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
107 
108 /*
109  * Are we allowed to allocate buckets?
110  */
111 static int bucketdisable = 1;
112 
113 /* Linked list of all kegs in the system */
114 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs);
115 
116 /* This mutex protects the keg list */
117 static struct mtx uma_mtx;
118 
119 /* These are the pcpu cache locks */
120 static struct mtx uma_pcpu_mtx[MAXCPU];
121 
122 /* Linked list of boot time pages */
123 static LIST_HEAD(,uma_slab) uma_boot_pages =
124     LIST_HEAD_INITIALIZER(&uma_boot_pages);
125 
126 /* Count of free boottime pages */
127 static int uma_boot_free = 0;
128 
129 /* Is the VM done starting up? */
130 static int booted = 0;
131 
132 /*
133  * This is the handle used to schedule events that need to happen
134  * outside of the allocation fast path.
135  */
136 static struct callout uma_callout;
137 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
138 
139 /*
140  * This structure is passed as the zone ctor arg so that I don't have to create
141  * a special allocation function just for zones.
142  */
143 struct uma_zctor_args {
144 	char *name;
145 	size_t size;
146 	uma_ctor ctor;
147 	uma_dtor dtor;
148 	uma_init uminit;
149 	uma_fini fini;
150 	uma_keg_t keg;
151 	int align;
152 	u_int16_t flags;
153 };
154 
155 struct uma_kctor_args {
156 	uma_zone_t zone;
157 	size_t size;
158 	uma_init uminit;
159 	uma_fini fini;
160 	int align;
161 	u_int16_t flags;
162 };
163 
164 struct uma_bucket_zone {
165 	uma_zone_t	ubz_zone;
166 	char		*ubz_name;
167 	int		ubz_entries;
168 };
169 
170 #define	BUCKET_MAX	128
171 
172 struct uma_bucket_zone bucket_zones[] = {
173 	{ NULL, "16 Bucket", 16 },
174 	{ NULL, "32 Bucket", 32 },
175 	{ NULL, "64 Bucket", 64 },
176 	{ NULL, "128 Bucket", 128 },
177 	{ NULL, NULL, 0}
178 };
179 
180 #define	BUCKET_SHIFT	4
181 #define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
182 
183 uint8_t bucket_size[BUCKET_ZONES];
184 
185 /* Prototypes */
186 
187 static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
188 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
189 static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
190 static void page_free(void *, int, u_int8_t);
191 static uma_slab_t slab_zalloc(uma_zone_t, int);
192 static void cache_drain(uma_zone_t);
193 static void bucket_drain(uma_zone_t, uma_bucket_t);
194 static void bucket_cache_drain(uma_zone_t zone);
195 static void keg_ctor(void *, int, void *);
196 static void keg_dtor(void *, int, void *);
197 static void zone_ctor(void *, int, void *);
198 static void zone_dtor(void *, int, void *);
199 static void zero_init(void *, int);
200 static void zone_small_init(uma_zone_t zone);
201 static void zone_large_init(uma_zone_t zone);
202 static void zone_foreach(void (*zfunc)(uma_zone_t));
203 static void zone_timeout(uma_zone_t zone);
204 static int hash_alloc(struct uma_hash *);
205 static int hash_expand(struct uma_hash *, struct uma_hash *);
206 static void hash_free(struct uma_hash *hash);
207 static void uma_timeout(void *);
208 static void uma_startup3(void);
209 static void *uma_zalloc_internal(uma_zone_t, void *, int);
210 static void uma_zfree_internal(uma_zone_t, void *, void *, int);
211 static void bucket_enable(void);
212 static void bucket_init(void);
213 static uma_bucket_t bucket_alloc(int, int);
214 static void bucket_free(uma_bucket_t);
215 static void bucket_zone_drain(void);
216 static int uma_zalloc_bucket(uma_zone_t zone, int flags);
217 static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags);
218 static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab);
219 static void zone_drain(uma_zone_t);
220 static void uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
221     uma_fini fini, int align, u_int16_t flags);
222 
223 void uma_print_zone(uma_zone_t);
224 void uma_print_stats(void);
225 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
226 
227 static int nosleepwithlocks = 0;
228 SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks,
229     0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths");
230 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
231     NULL, 0, sysctl_vm_zone, "A", "Zone Info");
232 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
233 
234 /*
235  * This routine checks whether it is safe to enable buckets.
236  */
237 
238 static void
239 bucket_enable(void)
240 {
241 	if (cnt.v_free_count < cnt.v_free_min)
242 		bucketdisable = 1;
243 	else
244 		bucketdisable = 0;
245 }
246 
247 static void
248 bucket_init(void)
249 {
250 	struct uma_bucket_zone *ubz;
251 	int i;
252 	int j;
253 
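	/*
	 * Create a zone for each bucket size and record, for every possible
	 * entry count (in steps of 1 << BUCKET_SHIFT), the index of the
	 * smallest bucket zone that can hold it.  For example, with the
	 * table above, requests for 1-16 entries map to the "16 Bucket"
	 * zone and requests for 17-32 entries map to the "32 Bucket" zone.
	 */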
254 	for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
255 		int size;
256 
257 		ubz = &bucket_zones[j];
258 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
259 		size += sizeof(void *) * ubz->ubz_entries;
260 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
261 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
262 		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
263 			bucket_size[i >> BUCKET_SHIFT] = j;
264 	}
265 }
266 
267 static uma_bucket_t
268 bucket_alloc(int entries, int bflags)
269 {
270 	struct uma_bucket_zone *ubz;
271 	uma_bucket_t bucket;
272 	int idx;
273 
274 	/*
275 	 * This is to stop us from allocating per cpu buckets while we're
276 	 * running out of UMA_BOOT_PAGES.  Otherwise, we would exhaust the
277 	 * boot pages.  This also prevents us from allocating buckets in
278 	 * low memory situations.
279 	 */
280 
281 	if (bucketdisable)
282 		return (NULL);
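	/* Round the request up and pick the smallest bucket zone that fits. */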
283 	idx = howmany(entries, 1 << BUCKET_SHIFT);
284 	ubz = &bucket_zones[bucket_size[idx]];
285 	bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags);
286 	if (bucket) {
287 #ifdef INVARIANTS
288 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
289 #endif
290 		bucket->ub_cnt = 0;
291 		bucket->ub_entries = ubz->ubz_entries;
292 	}
293 
294 	return (bucket);
295 }
296 
297 static void
298 bucket_free(uma_bucket_t bucket)
299 {
300 	struct uma_bucket_zone *ubz;
301 	int idx;
302 
303 	idx = howmany(bucket->ub_entries, 1 << BUCKET_SHIFT);
304 	ubz = &bucket_zones[bucket_size[idx]];
305 	uma_zfree_internal(ubz->ubz_zone, bucket, NULL, 0);
306 }
307 
308 static void
309 bucket_zone_drain(void)
310 {
311 	struct uma_bucket_zone *ubz;
312 
313 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
314 		zone_drain(ubz->ubz_zone);
315 }
316 
317 
318 /*
319  * Routine called from the callout to fire off time-interval-based
320  * calculations (stats aggregation, hash table resizing, etc.).
321  *
322  * Arguments:
323  *	arg   Unused
324  *
325  * Returns:
326  *	Nothing
327  */
328 static void
329 uma_timeout(void *unused)
330 {
331 	bucket_enable();
332 	zone_foreach(zone_timeout);
333 
334 	/* Reschedule this event */
335 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
336 }
337 
338 /*
339  * Routine to perform timeout-driven calculations.  This expands the
340  * hashes and does per-CPU statistics aggregation.
341  *
342  *  Arguments:
343  *	zone  The zone to operate on
344  *
345  *  Returns:
346  *	Nothing
347  */
348 static void
349 zone_timeout(uma_zone_t zone)
350 {
351 	uma_keg_t keg;
352 	uma_cache_t cache;
353 	u_int64_t alloc;
354 	int cpu;
355 
356 	keg = zone->uz_keg;
357 	alloc = 0;
358 
359 	/*
360 	 * Aggregate per cpu cache statistics back to the zone.
361 	 *
362 	 * XXX This should be done in the sysctl handler.
363 	 *
364 	 * I may rewrite this to set a flag in the per cpu cache instead of
365 	 * locking.  If the flag is not cleared on the next round I will have
366 	 * to lock and do it here instead so that the statistics don't get too
367 	 * far out of sync.
368 	 */
369 	if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) {
370 		for (cpu = 0; cpu <= mp_maxid; cpu++) {
371 			if (CPU_ABSENT(cpu))
372 				continue;
373 			CPU_LOCK(cpu);
374 			cache = &zone->uz_cpu[cpu];
375 			/* Add them up, and reset */
376 			alloc += cache->uc_allocs;
377 			cache->uc_allocs = 0;
378 			CPU_UNLOCK(cpu);
379 		}
380 	}
381 
382 	/* Now push these stats back into the zone.. */
383 	ZONE_LOCK(zone);
384 	zone->uz_allocs += alloc;
385 
386 	/*
387 	 * Expand the zone hash table.
388 	 *
389 	 * This is done if the number of slabs is larger than the hash size.
390  * What I'm trying to do here is eliminate collisions entirely.  This
391 	 * may be a little aggressive.  Should I allow for two collisions max?
392 	 */
393 
394 	if (keg->uk_flags & UMA_ZONE_HASH &&
395 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
396 		struct uma_hash newhash;
397 		struct uma_hash oldhash;
398 		int ret;
399 
400 		/*
401 		 * This is so involved because allocating and freeing
402 		 * while the zone lock is held will lead to deadlock.
403 		 * I have to do everything in stages and check for
404 		 * races.
405 		 */
406 		newhash = keg->uk_hash;
407 		ZONE_UNLOCK(zone);
408 		ret = hash_alloc(&newhash);
409 		ZONE_LOCK(zone);
410 		if (ret) {
411 			if (hash_expand(&keg->uk_hash, &newhash)) {
412 				oldhash = keg->uk_hash;
413 				keg->uk_hash = newhash;
414 			} else
415 				oldhash = newhash;
416 
417 			ZONE_UNLOCK(zone);
418 			hash_free(&oldhash);
419 			ZONE_LOCK(zone);
420 		}
421 	}
422 	ZONE_UNLOCK(zone);
423 }
424 
425 /*
426  * Allocate and zero-fill the next-sized hash table from the appropriate
427  * backing store.
428  *
429  * Arguments:
430  *	hash  A new hash structure with the old hash size in uh_hashsize
431  *
432  * Returns:
433  *	1 on success and 0 on failure.
434  */
435 static int
436 hash_alloc(struct uma_hash *hash)
437 {
438 	int oldsize;
439 	int alloc;
440 
441 	oldsize = hash->uh_hashsize;
442 
443 	/* We're just going to go to a power of two greater */
444 	if (oldsize)  {
445 		hash->uh_hashsize = oldsize * 2;
446 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
447 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
448 		    M_UMAHASH, M_NOWAIT);
449 	} else {
450 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
451 		hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL,
452 		    M_WAITOK);
453 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
454 	}
455 	if (hash->uh_slab_hash) {
456 		bzero(hash->uh_slab_hash, alloc);
457 		hash->uh_hashmask = hash->uh_hashsize - 1;
458 		return (1);
459 	}
460 
461 	return (0);
462 }
463 
464 /*
465  * Expands the hash table for HASH zones.  This is done from zone_timeout
466  * to reduce collisions.  This must not be done in the regular allocation
467  * path; otherwise we can recurse on the vm while allocating pages.
468  *
469  * Arguments:
470  *	oldhash  The hash you want to expand
471  *	newhash  The hash structure for the new table
472  *
473  * Returns:
474  *	1 on success and 0 on failure.
475  */
478 static int
479 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
480 {
481 	uma_slab_t slab;
482 	int hval;
483 	int i;
484 
485 	if (!newhash->uh_slab_hash)
486 		return (0);
487 
488 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
489 		return (0);
490 
491 	/*
492 	 * I need to investigate hash algorithms for resizing without a
493 	 * full rehash.
494 	 */
495 
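	/*
	 * Walk every chain in the old table and rehash each slab into the
	 * new, larger table.
	 */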
496 	for (i = 0; i < oldhash->uh_hashsize; i++)
497 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
498 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
499 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
500 			hval = UMA_HASH(newhash, slab->us_data);
501 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
502 			    slab, us_hlink);
503 		}
504 
505 	return (1);
506 }
507 
508 /*
509  * Free the hash bucket to the appropriate backing store.
510  *
511  * Arguments:
512  *	hash  The hash structure whose bucket array is being freed
514  *
515  * Returns:
516  *	Nothing
517  */
518 static void
519 hash_free(struct uma_hash *hash)
520 {
521 	if (hash->uh_slab_hash == NULL)
522 		return;
523 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
524 		uma_zfree_internal(hashzone,
525 		    hash->uh_slab_hash, NULL, 0);
526 	else
527 		free(hash->uh_slab_hash, M_UMAHASH);
528 }
529 
530 /*
531  * Frees all outstanding items in a bucket
532  *
533  * Arguments:
534  *	zone   The zone to free to, must be unlocked.
535  *	bucket The free/alloc bucket with items, cpu queue must be locked.
536  *
537  * Returns:
538  *	Nothing
539  */
540 
541 static void
542 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
543 {
544 	uma_slab_t slab;
545 	int mzone;
546 	void *item;
547 
548 	if (bucket == NULL)
549 		return;
550 
551 	slab = NULL;
552 	mzone = 0;
553 
554 	/* We have to look up the slab again for malloc zones. */
555 	if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC)
556 		mzone = 1;
557 
558 	while (bucket->ub_cnt > 0)  {
559 		bucket->ub_cnt--;
560 		item = bucket->ub_bucket[bucket->ub_cnt];
561 #ifdef INVARIANTS
562 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
563 		KASSERT(item != NULL,
564 		    ("bucket_drain: botched ptr, item is NULL"));
565 #endif
566 		/*
567 		 * This is extremely inefficient.  The slab pointer was passed
568 		 * to uma_zfree_arg, but we lost it because the buckets don't
569 		 * hold them.  This will go away when free() gets a size passed
570 		 * to it.
571 		 */
572 		if (mzone)
573 			slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
574 		uma_zfree_internal(zone, item, slab, 1);
575 	}
576 }
577 
578 /*
579  * Drains the per cpu caches for a zone.
580  *
581  * Arguments:
582  *	zone     The zone to drain, must be unlocked.
583  *
584  * Returns:
585  *	Nothing
586  */
587 static void
588 cache_drain(uma_zone_t zone)
589 {
590 	uma_cache_t cache;
591 	int cpu;
592 
593 	/*
594 	 * We have to lock each cpu cache before locking the zone
595 	 */
596 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
597 		if (CPU_ABSENT(cpu))
598 			continue;
599 		CPU_LOCK(cpu);
600 		cache = &zone->uz_cpu[cpu];
601 		bucket_drain(zone, cache->uc_allocbucket);
602 		bucket_drain(zone, cache->uc_freebucket);
603 		if (cache->uc_allocbucket != NULL)
604 			bucket_free(cache->uc_allocbucket);
605 		if (cache->uc_freebucket != NULL)
606 			bucket_free(cache->uc_freebucket);
607 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
608 	}
609 	ZONE_LOCK(zone);
610 	bucket_cache_drain(zone);
611 	ZONE_UNLOCK(zone);
612 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
613 		if (CPU_ABSENT(cpu))
614 			continue;
615 		CPU_UNLOCK(cpu);
616 	}
617 }
618 
619 /*
620  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
621  */
622 static void
623 bucket_cache_drain(uma_zone_t zone)
624 {
625 	uma_bucket_t bucket;
626 
627 	/*
628 	 * Drain the bucket queues and free the buckets, we just keep two per
629 	 * cpu (alloc/free).
630 	 */
631 	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
632 		LIST_REMOVE(bucket, ub_link);
633 		ZONE_UNLOCK(zone);
634 		bucket_drain(zone, bucket);
635 		bucket_free(bucket);
636 		ZONE_LOCK(zone);
637 	}
638 
639 	/* Now we do the free queue.. */
640 	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
641 		LIST_REMOVE(bucket, ub_link);
642 		bucket_free(bucket);
643 	}
644 }
645 
646 /*
647  * Frees pages from a zone back to the system.  This is done on demand from
648  * the pageout daemon.
649  *
650  * Arguments:
651  *	zone  The zone to free pages from
653  *
654  * Returns:
655  *	Nothing.
656  */
657 static void
658 zone_drain(uma_zone_t zone)
659 {
660 	struct slabhead freeslabs = {};
661 	uma_keg_t keg;
662 	uma_slab_t slab;
663 	uma_slab_t n;
664 	u_int8_t flags;
665 	u_int8_t *mem;
666 	int i;
667 
668 	keg = zone->uz_keg;
669 
670 	/*
671 	 * We don't want to take pages from statically allocated zones at this
672 	 * time
673 	 */
674 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
675 		return;
676 
677 	ZONE_LOCK(zone);
678 
679 #ifdef UMA_DEBUG
680 	printf("%s free items: %u\n", zone->uz_name, keg->uk_free);
681 #endif
682 	bucket_cache_drain(zone);
683 	if (keg->uk_free == 0)
684 		goto finished;
685 
686 	slab = LIST_FIRST(&keg->uk_free_slab);
687 	while (slab) {
688 		n = LIST_NEXT(slab, us_link);
689 
690 		/* We have nowhere to free these to. */
691 		if (slab->us_flags & UMA_SLAB_BOOT) {
692 			slab = n;
693 			continue;
694 		}
695 
696 		LIST_REMOVE(slab, us_link);
697 		keg->uk_pages -= keg->uk_ppera;
698 		keg->uk_free -= keg->uk_ipers;
699 
700 		if (keg->uk_flags & UMA_ZONE_HASH)
701 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
702 
703 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
704 
705 		slab = n;
706 	}
707 finished:
708 	ZONE_UNLOCK(zone);
709 
710 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
711 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
712 		if (keg->uk_fini)
713 			for (i = 0; i < keg->uk_ipers; i++)
714 				keg->uk_fini(
715 				    slab->us_data + (keg->uk_rsize * i),
716 				    keg->uk_size);
717 		flags = slab->us_flags;
718 		mem = slab->us_data;
719 
720 		if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
721 		    (keg->uk_flags & UMA_ZONE_REFCNT)) {
722 			vm_object_t obj;
723 
724 			if (flags & UMA_SLAB_KMEM)
725 				obj = kmem_object;
726 			else
727 				obj = NULL;
728 			for (i = 0; i < keg->uk_ppera; i++)
729 				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
730 				    obj);
731 		}
732 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
733 			uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0);
734 #ifdef UMA_DEBUG
735 		printf("%s: Returning %d bytes.\n",
736 		    zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);
737 #endif
738 		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
739 	}
740 }
741 
742 /*
743  * Allocate a new slab for a zone.  This does not insert the slab onto a list.
744  *
745  * Arguments:
746  *	zone  The zone to allocate slabs for
747  *	wait  Shall we wait?
748  *
749  * Returns:
750  *	The slab that was allocated or NULL if there is no memory and the
751  *	caller specified M_NOWAIT.
752  */
753 static uma_slab_t
754 slab_zalloc(uma_zone_t zone, int wait)
755 {
756 	uma_slabrefcnt_t slabref;
757 	uma_slab_t slab;
758 	uma_keg_t keg;
759 	u_int8_t *mem;
760 	u_int8_t flags;
761 	int i;
762 
763 	slab = NULL;
764 	keg = zone->uz_keg;
765 
766 #ifdef UMA_DEBUG
767 	printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
768 #endif
769 	ZONE_UNLOCK(zone);
770 
771 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
772 		slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);
773 		if (slab == NULL) {
774 			ZONE_LOCK(zone);
775 			return NULL;
776 		}
777 	}
778 
779 	/*
780 	 * This reproduces the old vm_zone behavior of zero filling pages the
781 	 * first time they are added to a zone.
782 	 *
783 	 * Malloced items are zeroed in uma_zalloc.
784 	 */
785 
786 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
787 		wait |= M_ZERO;
788 	else
789 		wait &= ~M_ZERO;
790 
791 	mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,
792 	    &flags, wait);
793 	if (mem == NULL) {
794 		ZONE_LOCK(zone);
795 		return (NULL);
796 	}
797 
798 	/* Point the slab into the allocated memory */
799 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
800 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
801 
802 	if ((keg->uk_flags & UMA_ZONE_MALLOC) ||
803 	    (keg->uk_flags & UMA_ZONE_REFCNT))
804 		for (i = 0; i < keg->uk_ppera; i++)
805 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
806 
807 	slab->us_keg = keg;
808 	slab->us_data = mem;
809 	slab->us_freecount = keg->uk_ipers;
810 	slab->us_firstfree = 0;
811 	slab->us_flags = flags;
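	/*
	 * Build the embedded free list: each entry points at the next free
	 * item index.  us_firstfree starts at 0 and the chain is never
	 * followed past the last valid item because us_freecount bounds it.
	 */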
812 	for (i = 0; i < keg->uk_ipers; i++)
813 		slab->us_freelist[i].us_item = i+1;
814 
815 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
816 		slabref = (uma_slabrefcnt_t)slab;
817 		for (i = 0; i < keg->uk_ipers; i++)
818 			slabref->us_freelist[i].us_refcnt = 0;
819 	}
820 
821 	if (keg->uk_init)
822 		for (i = 0; i < keg->uk_ipers; i++)
823 			keg->uk_init(slab->us_data + (keg->uk_rsize * i),
824 			    keg->uk_size);
825 	ZONE_LOCK(zone);
826 
827 	if (keg->uk_flags & UMA_ZONE_HASH)
828 		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
829 
830 	keg->uk_pages += keg->uk_ppera;
831 	keg->uk_free += keg->uk_ipers;
832 
833 	return (slab);
834 }
835 
836 /*
837  * This function is intended to be used early on in place of page_alloc() so
838  * that we may use the boot time page cache to satisfy allocations before
839  * the VM is ready.
840  */
841 static void *
842 startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
843 {
844 	uma_keg_t keg;
845 
846 	keg = zone->uz_keg;
847 
848 	/*
849 	 * Check our small startup cache to see if it has pages remaining.
850 	 */
851 	mtx_lock(&uma_mtx);
852 	if (uma_boot_free != 0) {
853 		uma_slab_t tmps;
854 
855 		tmps = LIST_FIRST(&uma_boot_pages);
856 		LIST_REMOVE(tmps, us_link);
857 		uma_boot_free--;
858 		mtx_unlock(&uma_mtx);
859 		*pflag = tmps->us_flags;
860 		return (tmps->us_data);
861 	}
862 	mtx_unlock(&uma_mtx);
863 	if (booted == 0)
864 		panic("UMA: Increase UMA_BOOT_PAGES");
865 	/*
866 	 * Now that we've booted, reset these users to their real allocator.
867 	 */
868 #ifdef UMA_MD_SMALL_ALLOC
869 	keg->uk_allocf = uma_small_alloc;
870 #else
871 	keg->uk_allocf = page_alloc;
872 #endif
873 	return keg->uk_allocf(zone, bytes, pflag, wait);
874 }
875 
876 /*
877  * Allocates a number of pages from the system
878  *
879  * Arguments:
880  *	zone  Unused
881  *	bytes  The number of bytes requested
882  *	wait  Shall we wait?
883  *
884  * Returns:
885  *	A pointer to the allocated memory or possibly
886  *	NULL if M_NOWAIT is set.
887  */
888 static void *
889 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
890 {
891 	void *p;	/* Returned page */
892 
893 	*pflag = UMA_SLAB_KMEM;
894 	p = (void *) kmem_malloc(kmem_map, bytes, wait);
895 
896 	return (p);
897 }
898 
899 /*
900  * Allocates a number of pages from within an object
901  *
902  * Arguments:
903  *	zone   Unused
904  *	bytes  The number of bytes requested
905  *	wait   Shall we wait?
906  *
907  * Returns:
908  *	A pointer to the allocated memory or possibly
909  *	NULL if M_NOWAIT is set.
910  */
911 static void *
912 obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
913 {
914 	vm_object_t object;
915 	vm_offset_t retkva, zkva;
916 	vm_page_t p;
917 	int pages, startpages;
918 
919 	object = zone->uz_keg->uk_obj;
920 	retkva = 0;
921 
922 	/*
923 	 * This looks a little weird since we're getting one page at a time.
924 	 */
925 	VM_OBJECT_LOCK(object);
926 	p = TAILQ_LAST(&object->memq, pglist);
927 	pages = p != NULL ? p->pindex + 1 : 0;
928 	startpages = pages;
929 	zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE;
930 	for (; bytes > 0; bytes -= PAGE_SIZE) {
931 		p = vm_page_alloc(object, pages,
932 		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
933 		if (p == NULL) {
934 			if (pages != startpages)
935 				pmap_qremove(retkva, pages - startpages);
936 			while (pages != startpages) {
937 				pages--;
938 				p = TAILQ_LAST(&object->memq, pglist);
939 				vm_page_lock_queues();
940 				vm_page_unwire(p, 0);
941 				vm_page_free(p);
942 				vm_page_unlock_queues();
943 			}
944 			retkva = 0;
945 			goto done;
946 		}
947 		pmap_qenter(zkva, &p, 1);
948 		if (retkva == 0)
949 			retkva = zkva;
950 		zkva += PAGE_SIZE;
951 		pages += 1;
952 	}
953 done:
954 	VM_OBJECT_UNLOCK(object);
955 	*flags = UMA_SLAB_PRIV;
956 
957 	return ((void *)retkva);
958 }
959 
960 /*
961  * Frees a number of pages to the system
962  *
963  * Arguments:
964  *	mem   A pointer to the memory to be freed
965  *	size  The size of the memory being freed
966  *	flags The original slab us_flags field
967  *
968  * Returns:
969  *	Nothing
970  */
971 static void
972 page_free(void *mem, int size, u_int8_t flags)
973 {
974 	vm_map_t map;
975 
976 	if (flags & UMA_SLAB_KMEM)
977 		map = kmem_map;
978 	else
979 		panic("UMA: page_free used with invalid flags %d\n", flags);
980 
981 	kmem_free(map, (vm_offset_t)mem, size);
982 }
983 
984 /*
985  * Zero fill initializer
986  *
987  * Arguments/Returns follow uma_init specifications
988  */
989 static void
990 zero_init(void *mem, int size)
991 {
992 	bzero(mem, size);
993 }
994 
995 /*
996  * Finish creating a small uma zone.  This calculates ipers and the slab layout.
997  *
998  * Arguments
999  *	zone  The zone we should initialize
1000  *
1001  * Returns
1002  *	Nothing
1003  */
1004 static void
1005 zone_small_init(uma_zone_t zone)
1006 {
1007 	uma_keg_t keg;
1008 	int rsize;
1009 	int memused;
1010 	int ipers;
1011 
1012 	keg = zone->uz_keg;
1013 	KASSERT(keg != NULL, ("Keg is null in zone_small_init"));
1014 	rsize = keg->uk_size;
1015 
1016 	if (rsize < UMA_SMALLEST_UNIT)
1017 		rsize = UMA_SMALLEST_UNIT;
1018 
1019 	if (rsize & keg->uk_align)
1020 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1021 
1022 	keg->uk_rsize = rsize;
1023 
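	/*
	 * Each item needs one extra byte for the embedded free-list link, so
	 * ipers is how many (rsize + 1)-byte items fit in a slab after the
	 * slab header.  As a rough illustrative example, assuming a 4K slab
	 * and a 248-byte aligned item, about 16 items fit per slab.
	 */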
1024 	rsize += 1;	/* Account for the byte of linkage */
1025 	keg->uk_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
1026 	keg->uk_ppera = 1;
1027 
1028 	KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0, uh-oh!"));
1029 	memused = keg->uk_ipers * keg->uk_rsize;
1030 
1031 	/* Can we do any better? */
1032 	if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
1033 	    ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE)) {
1034 		/*
1035 		 * We can't do this if we're internal or if we've been
1036 		 * asked to not go to the VM for buckets.  If we do this we
1037 		 * may end up going to the VM (kmem_map) for slabs which we
1038 		 * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
1039 		 * result of UMA_ZONE_VM, which clearly forbids it.
1040 		 */
1041 		if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1042 		    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1043 			return;
1044 		ipers = UMA_SLAB_SIZE / keg->uk_rsize;
1045 		if ((keg->uk_flags & UMA_ZONE_REFCNT) ||
1046 		    (ipers > keg->uk_ipers)) {
1047 			keg->uk_flags |= UMA_ZONE_OFFPAGE;
1048 			if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1049 				keg->uk_flags |= UMA_ZONE_HASH;
1050 			keg->uk_ipers = ipers;
1051 		}
1052 	}
1053 }
1054 
1055 /*
1056  * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
1057  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1058  * more complicated.
1059  *
1060  * Arguments
1061  *	zone  The zone we should initialize
1062  *
1063  * Returns
1064  *	Nothing
1065  */
1066 static void
1067 zone_large_init(uma_zone_t zone)
1068 {
1069 	uma_keg_t keg;
1070 	int pages;
1071 
1072 	keg = zone->uz_keg;
1073 
1074 	KASSERT(keg != NULL, ("Keg is null in zone_large_init"));
1075 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1076 	    ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone"));
1077 
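	/*
	 * One item per slab; the slab spans however many whole pages the
	 * item needs (e.g. an item just over two pages long gets ppera = 3,
	 * assuming UMA_SLAB_SIZE is one page).
	 */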
1078 	pages = keg->uk_size / UMA_SLAB_SIZE;
1079 
1080 	/* Account for remainder */
1081 	if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
1082 		pages++;
1083 
1084 	keg->uk_ppera = pages;
1085 	keg->uk_ipers = 1;
1086 
1087 	keg->uk_flags |= UMA_ZONE_OFFPAGE;
1088 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1089 		keg->uk_flags |= UMA_ZONE_HASH;
1090 
1091 	keg->uk_rsize = keg->uk_size;
1092 }
1093 
1094 /*
1095  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1096  * the keg onto the global keg list.
1097  *
1098  * Arguments/Returns follow uma_ctor specifications
1099  *	udata  Actually uma_kctor_args
1100  */
1101 static void
1102 keg_ctor(void *mem, int size, void *udata)
1103 {
1104 	struct uma_kctor_args *arg = udata;
1105 	uma_keg_t keg = mem;
1106 	uma_zone_t zone;
1107 
1108 	bzero(keg, size);
1109 	keg->uk_size = arg->size;
1110 	keg->uk_init = arg->uminit;
1111 	keg->uk_fini = arg->fini;
1112 	keg->uk_align = arg->align;
1113 	keg->uk_free = 0;
1114 	keg->uk_pages = 0;
1115 	keg->uk_flags = arg->flags;
1116 	keg->uk_allocf = page_alloc;
1117 	keg->uk_freef = page_free;
1118 	keg->uk_recurse = 0;
1119 	keg->uk_slabzone = NULL;
1120 
1121 	/*
1122 	 * The master zone is passed to us at keg-creation time.
1123 	 */
1124 	zone = arg->zone;
1125 	zone->uz_keg = keg;
1126 
1127 	if (arg->flags & UMA_ZONE_VM)
1128 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1129 
1130 	if (arg->flags & UMA_ZONE_ZINIT)
1131 		keg->uk_init = zero_init;
1132 
1133 	/*
1134 	 * The +1 byte added to uk_size is to account for the byte of
1135 	 * linkage that is added to the size in zone_small_init().  If
1136 	 * we don't account for this here then we may end up in
1137 	 * zone_small_init() with a calculated 'ipers' of 0.
1138 	 */
1139 	if ((keg->uk_size+1) > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1140 		zone_large_init(zone);
1141 	else
1142 		zone_small_init(zone);
1143 
1144 	if (keg->uk_flags & UMA_ZONE_REFCNT)
1145 		keg->uk_slabzone = slabrefzone;
1146 	else if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1147 		keg->uk_slabzone = slabzone;
1148 
1149 	/*
1150 	 * If we haven't booted yet we need allocations to go through the
1151 	 * startup cache until the vm is ready.
1152 	 */
1153 	if (keg->uk_ppera == 1) {
1154 #ifdef UMA_MD_SMALL_ALLOC
1155 		keg->uk_allocf = uma_small_alloc;
1156 		keg->uk_freef = uma_small_free;
1157 #endif
1158 		if (booted == 0)
1159 			keg->uk_allocf = startup_alloc;
1160 	}
1161 
1162 	/*
1163 	 * Initialize the keg's lock (shared among this keg's zones)
1164 	 * through the master zone.
1165 	 */
1166 	zone->uz_lock = &keg->uk_lock;
1167 	if (arg->flags & UMA_ZONE_MTXCLASS)
1168 		ZONE_LOCK_INIT(zone, 1);
1169 	else
1170 		ZONE_LOCK_INIT(zone, 0);
1171 
1172 	/*
1173 	 * If we're putting the slab header in the actual page we need to
1174 	 * figure out where in each page it goes.  This calculates a right
1175 	 * justified offset into the memory on an ALIGN_PTR boundary.
1176 	 */
1177 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1178 		int totsize;
1179 
1180 		/* Size of the slab struct and free list */
1181 		totsize = sizeof(struct uma_slab) + keg->uk_ipers;
1182 		if (totsize & UMA_ALIGN_PTR)
1183 			totsize = (totsize & ~UMA_ALIGN_PTR) +
1184 			    (UMA_ALIGN_PTR + 1);
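		/*
		 * The slab header lives at the tail of the page, so uk_pgoff
		 * is the offset of the header within each slab's memory.
		 */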
1185 		keg->uk_pgoff = UMA_SLAB_SIZE - totsize;
1186 		totsize = keg->uk_pgoff + sizeof(struct uma_slab)
1187 		    + keg->uk_ipers;
1188 		/* I don't think it's possible, but I'll make sure anyway */
1189 		if (totsize > UMA_SLAB_SIZE) {
1190 			printf("zone %s ipers %d rsize %d size %d\n",
1191 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1192 			    keg->uk_size);
1193 			panic("UMA slab won't fit.\n");
1194 		}
1195 	}
1196 
1197 	if (keg->uk_flags & UMA_ZONE_HASH)
1198 		hash_alloc(&keg->uk_hash);
1199 
1200 #ifdef UMA_DEBUG
1201 	printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
1202 	    zone->uz_name, zone,
1203 	    keg->uk_size, keg->uk_ipers,
1204 	    keg->uk_ppera, keg->uk_pgoff);
1205 #endif
1206 
1207 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1208 
1209 	mtx_lock(&uma_mtx);
1210 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1211 	mtx_unlock(&uma_mtx);
1212 }
1213 
1214 /*
1215  * Zone header ctor.  This initializes all fields, locks, etc.
1216  *
1217  * Arguments/Returns follow uma_ctor specifications
1218  *	udata  Actually uma_zctor_args
1219  */
1220 
1221 static void
1222 zone_ctor(void *mem, int size, void *udata)
1223 {
1224 	struct uma_zctor_args *arg = udata;
1225 	uma_zone_t zone = mem;
1226 	uma_zone_t z;
1227 	uma_keg_t keg;
1228 
1229 	bzero(zone, size);
1230 	zone->uz_name = arg->name;
1231 	zone->uz_ctor = arg->ctor;
1232 	zone->uz_dtor = arg->dtor;
1233 	zone->uz_init = NULL;
1234 	zone->uz_fini = NULL;
1235 	zone->uz_allocs = 0;
1236 	zone->uz_fills = zone->uz_count = 0;
1237 
1238 	if (arg->flags & UMA_ZONE_SECONDARY) {
1239 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1240 		keg = arg->keg;
1241 		zone->uz_keg = keg;
1242 		zone->uz_init = arg->uminit;
1243 		zone->uz_fini = arg->fini;
1244 		zone->uz_lock = &keg->uk_lock;
1245 		mtx_lock(&uma_mtx);
1246 		ZONE_LOCK(zone);
1247 		keg->uk_flags |= UMA_ZONE_SECONDARY;
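		/*
		 * The keg's zone list has no tail pointer, so walk to the
		 * last zone and insert the new secondary zone after it.
		 */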
1248 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1249 			if (LIST_NEXT(z, uz_link) == NULL) {
1250 				LIST_INSERT_AFTER(z, zone, uz_link);
1251 				break;
1252 			}
1253 		}
1254 		ZONE_UNLOCK(zone);
1255 		mtx_unlock(&uma_mtx);
1256 	} else if (arg->keg == NULL) {
1257 		uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1258 		    arg->align, arg->flags);
1259 	} else {
1260 		struct uma_kctor_args karg;
1261 
1262 		/* We should only be here from uma_startup() */
1263 		karg.size = arg->size;
1264 		karg.uminit = arg->uminit;
1265 		karg.fini = arg->fini;
1266 		karg.align = arg->align;
1267 		karg.flags = arg->flags;
1268 		karg.zone = zone;
1269 		keg_ctor(arg->keg, sizeof(struct uma_keg), &karg);
1270 	}
1271 	keg = zone->uz_keg;
1272 	zone->uz_lock = &keg->uk_lock;
1273 
1274 	/*
1275 	 * Some internal zones don't have room allocated for the per cpu
1276 	 * caches.  If we're internal, bail out here.
1277 	 */
1278 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1279 		KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0,
1280 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1281 		return;
1282 	}
1283 
1284 	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
1285 		zone->uz_count = BUCKET_MAX;
1286 	else if (keg->uk_ipers <= BUCKET_MAX)
1287 		zone->uz_count = keg->uk_ipers;
1288 	else
1289 		zone->uz_count = BUCKET_MAX;
1290 }
1291 
1292 /*
1293  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1294  * table and removes the keg from the global list.
1295  *
1296  * Arguments/Returns follow uma_dtor specifications
1297  *	udata  unused
1298  */
1299 static void
1300 keg_dtor(void *arg, int size, void *udata)
1301 {
1302 	uma_keg_t keg;
1303 
1304 	keg = (uma_keg_t)arg;
1305 	mtx_lock(&keg->uk_lock);
1306 	if (keg->uk_free != 0) {
1307 		printf("Freed UMA keg was not empty (%d items). "
1308 		    " Lost %d pages of memory.\n",
1309 		    keg->uk_free, keg->uk_pages);
1310 	}
1311 	mtx_unlock(&keg->uk_lock);
1312 
1313 	if (keg->uk_flags & UMA_ZONE_HASH)
1314 		hash_free(&keg->uk_hash);
1315 
1316 	mtx_destroy(&keg->uk_lock);
1317 }
1318 
1319 /*
1320  * Zone header dtor.
1321  *
1322  * Arguments/Returns follow uma_dtor specifications
1323  *	udata  unused
1324  */
1325 static void
1326 zone_dtor(void *arg, int size, void *udata)
1327 {
1328 	uma_zone_t zone;
1329 	uma_keg_t keg;
1330 
1331 	zone = (uma_zone_t)arg;
1332 	keg = zone->uz_keg;
1333 
1334 	if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL))
1335 		cache_drain(zone);
1336 
1337 	mtx_lock(&uma_mtx);
1338 	zone_drain(zone);
1339 	if (keg->uk_flags & UMA_ZONE_SECONDARY) {
1340 		LIST_REMOVE(zone, uz_link);
1341 		/*
1342 		 * XXX there are some races here where the zone can be
1343 		 * drained but the zone lock released and then refilled
1344 		 * before we remove it... we don't care for now.
1346 		 */
1347 		ZONE_LOCK(zone);
1348 		if (LIST_EMPTY(&keg->uk_zones))
1349 			keg->uk_flags &= ~UMA_ZONE_SECONDARY;
1350 		ZONE_UNLOCK(zone);
1351 		mtx_unlock(&uma_mtx);
1352 	} else {
1353 		LIST_REMOVE(keg, uk_link);
1354 		LIST_REMOVE(zone, uz_link);
1355 		mtx_unlock(&uma_mtx);
1356 		uma_zfree_internal(kegs, keg, NULL, 0);
1357 	}
1358 	zone->uz_keg = NULL;
1359 }
1360 
1361 /*
1362  * Traverses every zone in the system and calls a callback
1363  *
1364  * Arguments:
1365  *	zfunc  A pointer to a function which accepts a zone
1366  *		as an argument.
1367  *
1368  * Returns:
1369  *	Nothing
1370  */
1371 static void
1372 zone_foreach(void (*zfunc)(uma_zone_t))
1373 {
1374 	uma_keg_t keg;
1375 	uma_zone_t zone;
1376 
1377 	mtx_lock(&uma_mtx);
1378 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1379 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1380 			zfunc(zone);
1381 	}
1382 	mtx_unlock(&uma_mtx);
1383 }
1384 
1385 /* Public functions */
1386 /* See uma.h */
1387 void
1388 uma_startup(void *bootmem)
1389 {
1390 	struct uma_zctor_args args;
1391 	uma_slab_t slab;
1392 	int slabsize;
1393 	int i;
1394 
1395 #ifdef UMA_DEBUG
1396 	printf("Creating uma keg headers zone and keg.\n");
1397 #endif
1398 	/*
1399 	 * The general UMA lock is a recursion-allowed lock because
1400 	 * there is a code path where, while we're still configured
1401 	 * to use startup_alloc() for backend page allocations, we
1402 	 * may end up in uma_reclaim() which calls zone_foreach(zone_drain),
1403 	 * which grabs uma_mtx, only to later call into startup_alloc()
1404 	 * because while freeing we needed to allocate a bucket.  Since
1405 	 * startup_alloc() also takes uma_mtx, we need to be able to
1406 	 * recurse on it.
1407 	 */
1408 	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF | MTX_RECURSE);
1409 
1410 	/* "manually" create the initial zone */
1411 	args.name = "UMA Kegs";
1412 	args.size = sizeof(struct uma_keg);
1413 	args.ctor = keg_ctor;
1414 	args.dtor = keg_dtor;
1415 	args.uminit = zero_init;
1416 	args.fini = NULL;
1417 	args.keg = &masterkeg;
1418 	args.align = 32 - 1;
1419 	args.flags = UMA_ZFLAG_INTERNAL;
1420 	/* The initial zone has no per-CPU queues so it's smaller */
1421 	zone_ctor(kegs, sizeof(struct uma_zone), &args);
1422 
1423 #ifdef UMA_DEBUG
1424 	printf("Filling boot free list.\n");
1425 #endif
1426 	for (i = 0; i < UMA_BOOT_PAGES; i++) {
1427 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1428 		slab->us_data = (u_int8_t *)slab;
1429 		slab->us_flags = UMA_SLAB_BOOT;
1430 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1431 		uma_boot_free++;
1432 	}
1433 
1434 #ifdef UMA_DEBUG
1435 	printf("Creating uma zone headers zone and keg.\n");
1436 #endif
1437 	args.name = "UMA Zones";
1438 	args.size = sizeof(struct uma_zone) +
1439 	    (sizeof(struct uma_cache) * (mp_maxid + 1));
1440 	args.ctor = zone_ctor;
1441 	args.dtor = zone_dtor;
1442 	args.uminit = zero_init;
1443 	args.fini = NULL;
1444 	args.keg = NULL;
1445 	args.align = 32 - 1;
1446 	args.flags = UMA_ZFLAG_INTERNAL;
1447 	/* The initial zone has no per-CPU queues so it's smaller */
1448 	zone_ctor(zones, sizeof(struct uma_zone), &args);
1449 
1450 #ifdef UMA_DEBUG
1451 	printf("Initializing pcpu cache locks.\n");
1452 #endif
1453 	/* Initialize the pcpu cache lock set once and for all */
1454 	for (i = 0; i <= mp_maxid; i++)
1455 		CPU_LOCK_INIT(i);
1456 
1457 #ifdef UMA_DEBUG
1458 	printf("Creating slab and hash zones.\n");
1459 #endif
1460 
1461 	/*
1462 	 * This is the max number of free list items we'll have with
1463 	 * offpage slabs.
1464 	 */
1465 	slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab);
1466 	slabsize /= UMA_MAX_WASTE;
1467 	slabsize++;			/* In case the division rounded down */
1468 	slabsize += sizeof(struct uma_slab);
1469 
1470 	/* Now make a zone for slab headers */
1471 	slabzone = uma_zcreate("UMA Slabs",
1472 				slabsize,
1473 				NULL, NULL, NULL, NULL,
1474 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1475 
1476 	/*
1477 	 * We also create a zone for the bigger slabs with reference
1478  * counts in them, to accommodate UMA_ZONE_REFCNT zones.
1479 	 */
1480 	slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt);
1481 	slabsize /= UMA_MAX_WASTE;
1482 	slabsize++;
1483 	slabsize += 4 * slabsize;
1484 	slabsize += sizeof(struct uma_slab_refcnt);
1485 	slabrefzone = uma_zcreate("UMA RCntSlabs",
1486 				  slabsize,
1487 				  NULL, NULL, NULL, NULL,
1488 				  UMA_ALIGN_PTR,
1489 				  UMA_ZFLAG_INTERNAL);
1490 
1491 	hashzone = uma_zcreate("UMA Hash",
1492 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1493 	    NULL, NULL, NULL, NULL,
1494 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1495 
1496 	bucket_init();
1497 
1498 #ifdef UMA_MD_SMALL_ALLOC
1499 	booted = 1;
1500 #endif
1501 
1502 #ifdef UMA_DEBUG
1503 	printf("UMA startup complete.\n");
1504 #endif
1505 }
1506 
1507 /* see uma.h */
1508 void
1509 uma_startup2(void)
1510 {
1511 	booted = 1;
1512 	bucket_enable();
1513 #ifdef UMA_DEBUG
1514 	printf("UMA startup2 complete.\n");
1515 #endif
1516 }
1517 
1518 /*
1519  * Initialize our callout handle.
1520  */
1522 
1523 static void
1524 uma_startup3(void)
1525 {
1526 #ifdef UMA_DEBUG
1527 	printf("Starting callout.\n");
1528 #endif
1529 	callout_init(&uma_callout, CALLOUT_MPSAFE);
1530 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1531 #ifdef UMA_DEBUG
1532 	printf("UMA startup3 complete.\n");
1533 #endif
1534 }
1535 
1536 static void
1537 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1538 		int align, u_int16_t flags)
1539 {
1540 	struct uma_kctor_args args;
1541 
1542 	args.size = size;
1543 	args.uminit = uminit;
1544 	args.fini = fini;
1545 	args.align = align;
1546 	args.flags = flags;
1547 	args.zone = zone;
1548 	zone = uma_zalloc_internal(kegs, &args, M_WAITOK);
1549 }
1550 
1551 /* See uma.h */
1552 uma_zone_t
1553 uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1554 		uma_init uminit, uma_fini fini, int align, u_int16_t flags)
1556 {
1557 	struct uma_zctor_args args;
1558 
1559 	/* This stuff is essential for the zone ctor */
1560 	args.name = name;
1561 	args.size = size;
1562 	args.ctor = ctor;
1563 	args.dtor = dtor;
1564 	args.uminit = uminit;
1565 	args.fini = fini;
1566 	args.align = align;
1567 	args.flags = flags;
1568 	args.keg = NULL;
1569 
1570 	return (uma_zalloc_internal(zones, &args, M_WAITOK));
1571 }
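
/*
 * Example usage (illustrative only; the "foo" names are hypothetical):
 *
 *	static uma_zone_t foo_zone;
 *	struct foo *fp;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK);
 *	...
 *	uma_zfree(foo_zone, fp);
 */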
1572 
1573 /* See uma.h */
1574 uma_zone_t
1575 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1576 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
1577 {
1578 	struct uma_zctor_args args;
1579 
1580 	args.name = name;
1581 	args.size = master->uz_keg->uk_size;
1582 	args.ctor = ctor;
1583 	args.dtor = dtor;
1584 	args.uminit = zinit;
1585 	args.fini = zfini;
1586 	args.align = master->uz_keg->uk_align;
1587 	args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY;
1588 	args.keg = master->uz_keg;
1589 
1590 	return (uma_zalloc_internal(zones, &args, M_WAITOK));
1591 }
1592 
1593 /* See uma.h */
1594 void
1595 uma_zdestroy(uma_zone_t zone)
1596 {
1597 	uma_zfree_internal(zones, zone, NULL, 0);
1598 }
1599 
1600 /* See uma.h */
1601 void *
1602 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
1603 {
1604 	void *item;
1605 	uma_cache_t cache;
1606 	uma_bucket_t bucket;
1607 	int cpu;
1608 	int badness;
1609 
1610 	/* This is the fast path allocation */
1611 #ifdef UMA_DEBUG_ALLOC_1
1612 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
1613 #endif
1614 
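	/*
	 * For M_WAITOK requests, downgrade to M_NOWAIT if sleeping here
	 * would be unsafe (WITNESS reports a lock held, or nosleepwithlocks
	 * is set), rather than sleeping with locks held.
	 */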
1615 	if (!(flags & M_NOWAIT)) {
1616 		KASSERT(curthread->td_intr_nesting_level == 0,
1617 		   ("malloc(M_WAITOK) in interrupt context"));
1618 		badness = nosleepwithlocks;
1619 #ifdef WITNESS
1620 		badness = WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
1621 		    NULL,
1622 		    "malloc(M_WAITOK) of \"%s\", forcing M_NOWAIT",
1623 		    zone->uz_name);
1624 #endif
1625 		if (badness) {
1626 			flags &= ~M_WAITOK;
1627 			flags |= M_NOWAIT;
1628 		}
1629 	}
1630 
1631 zalloc_restart:
1632 	cpu = PCPU_GET(cpuid);
1633 	CPU_LOCK(cpu);
1634 	cache = &zone->uz_cpu[cpu];
1635 
1636 zalloc_start:
1637 	bucket = cache->uc_allocbucket;
1638 
1639 	if (bucket) {
1640 		if (bucket->ub_cnt > 0) {
1641 			bucket->ub_cnt--;
1642 			item = bucket->ub_bucket[bucket->ub_cnt];
1643 #ifdef INVARIANTS
1644 			bucket->ub_bucket[bucket->ub_cnt] = NULL;
1645 #endif
1646 			KASSERT(item != NULL,
1647 			    ("uma_zalloc: Bucket pointer mangled."));
1648 			cache->uc_allocs++;
1649 #ifdef INVARIANTS
1650 			ZONE_LOCK(zone);
1651 			uma_dbg_alloc(zone, NULL, item);
1652 			ZONE_UNLOCK(zone);
1653 #endif
1654 			CPU_UNLOCK(cpu);
1655 			if (zone->uz_ctor)
1656 				zone->uz_ctor(item,zone->uz_keg->uk_size,udata);
1657 			if (flags & M_ZERO)
1658 				bzero(item, zone->uz_keg->uk_size);
1659 			return (item);
1660 		} else if (cache->uc_freebucket) {
1661 			/*
1662 			 * We have run out of items in our allocbucket.
1663 			 * See if we can switch with our free bucket.
1664 			 */
1665 			if (cache->uc_freebucket->ub_cnt > 0) {
1666 #ifdef UMA_DEBUG_ALLOC
1667 				printf("uma_zalloc: Swapping empty with"
1668 				    " alloc.\n");
1669 #endif
1670 				bucket = cache->uc_freebucket;
1671 				cache->uc_freebucket = cache->uc_allocbucket;
1672 				cache->uc_allocbucket = bucket;
1673 
1674 				goto zalloc_start;
1675 			}
1676 		}
1677 	}
1678 	ZONE_LOCK(zone);
1679 	/* Since we have locked the zone we may as well send back our stats */
1680 	zone->uz_allocs += cache->uc_allocs;
1681 	cache->uc_allocs = 0;
1682 
1683 	/* Our old one is now a free bucket */
1684 	if (cache->uc_allocbucket) {
1685 		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
1686 		    ("uma_zalloc_arg: Freeing a non free bucket."));
1687 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
1688 		    cache->uc_allocbucket, ub_link);
1689 		cache->uc_allocbucket = NULL;
1690 	}
1691 
1692 	/* Check the free list for a new alloc bucket */
1693 	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
1694 		KASSERT(bucket->ub_cnt != 0,
1695 		    ("uma_zalloc_arg: Returning an empty bucket."));
1696 
1697 		LIST_REMOVE(bucket, ub_link);
1698 		cache->uc_allocbucket = bucket;
1699 		ZONE_UNLOCK(zone);
1700 		goto zalloc_start;
1701 	}
1702 	/* We are no longer associated with this cpu!!! */
1703 	CPU_UNLOCK(cpu);
1704 
1705 	/* Bump up our uz_count so we get here less */
1706 	if (zone->uz_count < BUCKET_MAX)
1707 		zone->uz_count++;
1708 
1709 	/*
1710 	 * Now let's just fill a bucket and put it on the free list.  If that
1711 	 * works we'll restart the allocation from the beginning.
1712 	 */
1713 	if (uma_zalloc_bucket(zone, flags)) {
1714 		ZONE_UNLOCK(zone);
1715 		goto zalloc_restart;
1716 	}
1717 	ZONE_UNLOCK(zone);
1718 	/*
1719 	 * We may not be able to get a bucket so return an actual item.
1720 	 */
1721 #ifdef UMA_DEBUG
1722 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
1723 #endif
1724 
1725 	return (uma_zalloc_internal(zone, udata, flags));
1726 }
1727 
1728 static uma_slab_t
1729 uma_zone_slab(uma_zone_t zone, int flags)
1730 {
1731 	uma_slab_t slab;
1732 	uma_keg_t keg;
1733 
1734 	keg = zone->uz_keg;
1735 
1736 	/*
1737 	 * This is to prevent us from recursively trying to allocate
1738 	 * buckets.  The problem is that if an allocation forces us to
1739 	 * grab a new bucket we will call page_alloc, which will go off
1740 	 * and cause the vm to allocate vm_map_entries.  If we need new
1741 	 * buckets there too we will recurse in kmem_alloc and bad
1742 	 * things happen.  So instead we return a NULL bucket, and make
1743 	 * the code that allocates buckets smart enough to deal with it
1744 	 */
1745 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)
1746 		return (NULL);
1747 
1748 	slab = NULL;
1749 
1750 	for (;;) {
1751 		/*
1752 		 * Find a slab with some space.  Prefer slabs that are partially
1753 		 * used over those that are totally full.  This helps to reduce
1754 		 * fragmentation.
1755 		 */
1756 		if (keg->uk_free != 0) {
1757 			if (!LIST_EMPTY(&keg->uk_part_slab)) {
1758 				slab = LIST_FIRST(&keg->uk_part_slab);
1759 			} else {
1760 				slab = LIST_FIRST(&keg->uk_free_slab);
1761 				LIST_REMOVE(slab, us_link);
1762 				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
1763 				    us_link);
1764 			}
1765 			return (slab);
1766 		}
1767 
1768 		/*
1769 		 * M_NOVM means don't ask at all!
1770 		 */
1771 		if (flags & M_NOVM)
1772 			break;
1773 
1774 		if (keg->uk_maxpages &&
1775 		    keg->uk_pages >= keg->uk_maxpages) {
1776 			keg->uk_flags |= UMA_ZFLAG_FULL;
1777 
1778 			if (flags & M_NOWAIT)
1779 				break;
1780 			else
1781 				msleep(keg, &keg->uk_lock, PVM,
1782 				    "zonelimit", 0);
1783 			continue;
1784 		}
1785 		keg->uk_recurse++;
1786 		slab = slab_zalloc(zone, flags);
1787 		keg->uk_recurse--;
1788 
1789 		/*
1790 		 * If we got a slab here it's safe to mark it partially used
1791 		 * and return.  We assume that the caller is going to remove
1792 		 * at least one item.
1793 		 */
1794 		if (slab) {
1795 			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
1796 			return (slab);
1797 		}
1798 		/*
1799 		 * We might not have been able to get a slab but another cpu
1800 		 * could have while we were unlocked.  Check again before we
1801 		 * fail.
1802 		 */
1803 		if (flags & M_NOWAIT)
1804 			flags |= M_NOVM;
1805 	}
1806 	return (slab);
1807 }
1808 
1809 static void *
1810 uma_slab_alloc(uma_zone_t zone, uma_slab_t slab)
1811 {
1812 	uma_keg_t keg;
1813 	void *item;
1814 	u_int8_t freei;
1815 
1816 	keg = zone->uz_keg;
1817 
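	/* Pop the first free item index off the slab's embedded free list. */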
1818 	freei = slab->us_firstfree;
1819 	slab->us_firstfree = slab->us_freelist[freei].us_item;
1820 	item = slab->us_data + (keg->uk_rsize * freei);
1821 
1822 	slab->us_freecount--;
1823 	keg->uk_free--;
1824 #ifdef INVARIANTS
1825 	uma_dbg_alloc(zone, slab, item);
1826 #endif
1827 	/* Move this slab to the full list */
1828 	if (slab->us_freecount == 0) {
1829 		LIST_REMOVE(slab, us_link);
1830 		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
1831 	}
1832 
1833 	return (item);
1834 }
1835 
1836 static int
1837 uma_zalloc_bucket(uma_zone_t zone, int flags)
1838 {
1839 	uma_bucket_t bucket;
1840 	uma_slab_t slab;
1841 	int16_t saved;
1842 	int max;
1843 
1844 	/*
1845 	 * Try this zone's free list first so we don't allocate extra buckets.
1846 	 */
1847 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
1848 		KASSERT(bucket->ub_cnt == 0,
1849 		    ("uma_zalloc_bucket: Bucket on free list is not empty."));
1850 		LIST_REMOVE(bucket, ub_link);
1851 	} else {
1852 		int bflags;
1853 
1854 		bflags = (flags & ~M_ZERO);
1855 		if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)
1856 			bflags |= M_NOVM;
1857 
1858 		ZONE_UNLOCK(zone);
1859 		bucket = bucket_alloc(zone->uz_count, bflags);
1860 		ZONE_LOCK(zone);
1861 	}
1862 
1863 	if (bucket == NULL)
1864 		return (0);
1865 
1866 #ifdef SMP
1867 	/*
1868 	 * This code is here to limit the number of simultaneous bucket fills
1869 	 * for any given zone to the number of per cpu caches in this zone. This
1870 	 * is done so that we don't allocate more memory than we really need.
1871 	 */
1872 	if (zone->uz_fills >= mp_ncpus)
1873 		goto done;
1874 
1875 #endif
1876 	zone->uz_fills++;
1877 
1878 	max = MIN(bucket->ub_entries, zone->uz_count);
1879 	/* Try to keep the buckets totally full */
1880 	saved = bucket->ub_cnt;
1881 	while (bucket->ub_cnt < max &&
1882 	    (slab = uma_zone_slab(zone, flags)) != NULL) {
1883 		while (slab->us_freecount && bucket->ub_cnt < max) {
1884 			bucket->ub_bucket[bucket->ub_cnt++] =
1885 			    uma_slab_alloc(zone, slab);
1886 		}
1887 
1888 		/* Don't block on the next fill */
1889 		flags |= M_NOWAIT;
1890 	}
1891 
1892 	/*
1893 	 * We unlock here because we need to call the zone's init.
1894 	 * It should be safe to unlock because the slab dealt with
1895 	 * above is already on the appropriate list within the keg
1896 	 * and the bucket we filled is not yet on any list, so we
1897 	 * own it.
1898 	 */
1899 	if (zone->uz_init != NULL) {
1900 		int i;
1901 
1902 		ZONE_UNLOCK(zone);
1903 		for (i = saved; i < bucket->ub_cnt; i++)
1904 			zone->uz_init(bucket->ub_bucket[i],
1905 			    zone->uz_keg->uk_size);
1906 		ZONE_LOCK(zone);
1907 	}
1908 
1909 	zone->uz_fills--;
1910 	if (bucket->ub_cnt != 0) {
1911 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
1912 		    bucket, ub_link);
1913 		return (1);
1914 	}
1915 #ifdef SMP
1916 done:
1917 #endif
1918 	bucket_free(bucket);
1919 
1920 	return (0);
1921 }
1922 /*
1923  * Allocates an item for an internal zone
1924  *
1925  * Arguments
1926  *	zone   The zone to alloc for.
1927  *	udata  The data to be passed to the constructor.
1928  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
1929  *
1930  * Returns
1931  *	NULL if there is no memory and M_NOWAIT is set
1932  *	An item if successful
1933  */
1934 
1935 static void *
1936 uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)
1937 {
1938 	uma_keg_t keg;
1939 	uma_slab_t slab;
1940 	void *item;
1941 
1942 	item = NULL;
1943 	keg = zone->uz_keg;
1944 
1945 #ifdef UMA_DEBUG_ALLOC
1946 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
1947 #endif
1948 	ZONE_LOCK(zone);
1949 
1950 	slab = uma_zone_slab(zone, flags);
1951 	if (slab == NULL) {
1952 		ZONE_UNLOCK(zone);
1953 		return (NULL);
1954 	}
1955 
1956 	item = uma_slab_alloc(zone, slab);
1957 
1958 	ZONE_UNLOCK(zone);
1959 
1960 	/*
1961 	 * We have to call both the zone's init (not the keg's init)
1962 	 * and the zone's ctor.  This is because the item is going from
1963 	 * a keg slab directly to the user, and the user is expecting it
1964 	 * to be both zone-init'd as well as zone-ctor'd.
1965 	 */
1966 	if (zone->uz_init != NULL)
1967 		zone->uz_init(item, keg->uk_size);
1968 	if (zone->uz_ctor != NULL)
1969 		zone->uz_ctor(item, keg->uk_size, udata);
1970 	if (flags & M_ZERO)
1971 		bzero(item, keg->uk_size);
1972 
1973 	return (item);
1974 }
1975 
1976 /* See uma.h */
1977 void
1978 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
1979 {
1980 	uma_keg_t keg;
1981 	uma_cache_t cache;
1982 	uma_bucket_t bucket;
1983 	int bflags;
1984 	int cpu;
1985 	int skip;
1986 
1987 	/* This is the fast path free */
1988 	skip = 0;
1989 	keg = zone->uz_keg;
1990 
1991 #ifdef UMA_DEBUG_ALLOC_1
1992 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
1993 #endif
1994 	/*
1995 	 * The race here is acceptable.  If we miss it we'll just have to wait
1996 	 * a little longer for the limits to be reset.
1997 	 */
1998 
1999 	if (keg->uk_flags & UMA_ZFLAG_FULL)
2000 		goto zfree_internal;
2001 
2002 	if (zone->uz_dtor) {
2003 		zone->uz_dtor(item, keg->uk_size, udata);
2004 		skip = 1;
2005 	}
2006 
2007 zfree_restart:
2008 	cpu = PCPU_GET(cpuid);
2009 	CPU_LOCK(cpu);
2010 	cache = &zone->uz_cpu[cpu];
2011 
2012 zfree_start:
2013 	bucket = cache->uc_freebucket;
2014 
2015 	if (bucket) {
2016 		/*
2017 		 * Do we have room in our free bucket?  It is OK for this
2018 		 * count check to be slightly out of sync.
2019 		 */
2020 
2021 		if (bucket->ub_cnt < bucket->ub_entries) {
2022 			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2023 			    ("uma_zfree: Freeing to non free bucket index."));
2024 			bucket->ub_bucket[bucket->ub_cnt] = item;
2025 			bucket->ub_cnt++;
2026 #ifdef INVARIANTS
2027 			ZONE_LOCK(zone);
2028 			if (keg->uk_flags & UMA_ZONE_MALLOC)
2029 				uma_dbg_free(zone, udata, item);
2030 			else
2031 				uma_dbg_free(zone, NULL, item);
2032 			ZONE_UNLOCK(zone);
2033 #endif
2034 			CPU_UNLOCK(cpu);
2035 			return;
2036 		} else if (cache->uc_allocbucket) {
2037 #ifdef UMA_DEBUG_ALLOC
2038 			printf("uma_zfree: Swapping buckets.\n");
2039 #endif
2040 			/*
2041 			 * We have run out of space in our freebucket.
2042 			 * See if we can switch with our alloc bucket.
2043 			 */
2044 			if (cache->uc_allocbucket->ub_cnt <
2045 			    cache->uc_freebucket->ub_cnt) {
2046 				bucket = cache->uc_freebucket;
2047 				cache->uc_freebucket = cache->uc_allocbucket;
2048 				cache->uc_allocbucket = bucket;
2049 				goto zfree_start;
2050 			}
2051 		}
2052 	}
2053 	/*
2054 	 * We can get here for two reasons:
2055 	 *
2056 	 * 1) The buckets are NULL
2057 	 * 2) The alloc and free buckets are both somewhat full.
2058 	 */
2059 
2060 	ZONE_LOCK(zone);
2061 
2062 	bucket = cache->uc_freebucket;
2063 	cache->uc_freebucket = NULL;
2064 
2065 	/* Can we throw this on the zone full list? */
2066 	if (bucket != NULL) {
2067 #ifdef UMA_DEBUG_ALLOC
2068 		printf("uma_zfree: Putting old bucket on the free list.\n");
2069 #endif
2070 		/* ub_cnt counts cached items; never insert an empty bucket. */
2071 		KASSERT(bucket->ub_cnt != 0,
2072 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list."));
2073 		LIST_INSERT_HEAD(&zone->uz_full_bucket,
2074 		    bucket, ub_link);
2075 	}
2076 	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
2077 		LIST_REMOVE(bucket, ub_link);
2078 		ZONE_UNLOCK(zone);
2079 		cache->uc_freebucket = bucket;
2080 		goto zfree_start;
2081 	}
2082 	/* We're done with this CPU now */
2083 	CPU_UNLOCK(cpu);
2084 
2085 	/* And the zone.. */
2086 	ZONE_UNLOCK(zone);
2087 
2088 #ifdef UMA_DEBUG_ALLOC
2089 	printf("uma_zfree: Allocating new free bucket.\n");
2090 #endif
2091 	bflags = M_NOWAIT;
2092 
2093 	if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)
2094 		bflags |= M_NOVM;
2095 	bucket = bucket_alloc(zone->uz_count, bflags);
2096 	if (bucket) {
2097 		ZONE_LOCK(zone);
2098 		LIST_INSERT_HEAD(&zone->uz_free_bucket,
2099 		    bucket, ub_link);
2100 		ZONE_UNLOCK(zone);
2101 		goto zfree_restart;
2102 	}
2103 
2104 	/*
2105 	 * If nothing else caught this, we'll just do an internal free.
2106 	 */
2107 
2108 zfree_internal:
2109 
2110 #ifdef INVARIANTS
2111 	/*
2112 	 * We have already called the dtor above, so uma_zfree_internal
2113 	 * will be told to skip it, which also skips its uma_dbg_free
2114 	 * call.  Since we ended up here anyway, do the uma_dbg_free
2115 	 * bookkeeping for this item now.
2116 	 */
2117 	if (skip) {
2118 		ZONE_LOCK(zone);
2119 		if (keg->uk_flags & UMA_ZONE_MALLOC)
2120 			uma_dbg_free(zone, udata, item);
2121 		else
2122 			uma_dbg_free(zone, NULL, item);
2123 		ZONE_UNLOCK(zone);
2124 	}
2125 #endif
2126 	uma_zfree_internal(zone, item, udata, skip);
2127 
2128 	return;
2129 }
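/*
 * Informal sketch of the fast path above (descriptive only):
 *
 *	if the keg is marked UMA_ZFLAG_FULL, free via uma_zfree_internal();
 *	otherwise run the dtor, then:
 *	  if the per-CPU free bucket has room, stash the item and return;
 *	  else if the alloc bucket is emptier, swap the two buckets and retry;
 *	  else move the full free bucket to uz_full_bucket and take a
 *	      replacement from uz_free_bucket, or allocate one (M_NOWAIT);
 *	  if all of that fails, fall back to uma_zfree_internal().
 */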
2130 
2131 /*
2132  * Frees an item to an INTERNAL zone or allocates a free bucket
2133  *
2134  * Arguments:
2135  *	zone   The zone to free to
2136  *	item   The item we're freeing
2137  *	udata  User supplied data for the dtor
2138  *	skip   Skip the dtor, it was done in uma_zfree_arg
2139  */
2140 static void
2141 uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
2142 {
2143 	uma_slab_t slab;
2144 	uma_keg_t keg;
2145 	u_int8_t *mem;
2146 	u_int8_t freei;
2147 
2148 	keg = zone->uz_keg;
2149 
2150 	if (!skip && zone->uz_dtor)
2151 		zone->uz_dtor(item, keg->uk_size, udata);
2152 	if (zone->uz_fini)
2153 		zone->uz_fini(item, keg->uk_size);
2154 
2155 	ZONE_LOCK(zone);
2156 
2157 	if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {
2158 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
2159 		if (keg->uk_flags & UMA_ZONE_HASH)
2160 			slab = hash_sfind(&keg->uk_hash, mem);
2161 		else {
2162 			mem += keg->uk_pgoff;
2163 			slab = (uma_slab_t)mem;
2164 		}
2165 	} else {
2166 		slab = (uma_slab_t)udata;
2167 	}
2168 
2169 	/* Do we need to remove from any lists? */
2170 	if (slab->us_freecount+1 == keg->uk_ipers) {
2171 		LIST_REMOVE(slab, us_link);
2172 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2173 	} else if (slab->us_freecount == 0) {
2174 		LIST_REMOVE(slab, us_link);
2175 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2176 	}
2177 
2178 	/* Return the item to the slab's free list. */
2179 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
2180 		/ keg->uk_rsize;
2181 
2182 #ifdef INVARIANTS
2183 	if (!skip)
2184 		uma_dbg_free(zone, slab, item);
2185 #endif
2186 
2187 	slab->us_freelist[freei].us_item = slab->us_firstfree;
2188 	slab->us_firstfree = freei;
2189 	slab->us_freecount++;
2190 
2191 	/* Zone statistics */
2192 	keg->uk_free++;
2193 
2194 	if (keg->uk_flags & UMA_ZFLAG_FULL) {
2195 		if (keg->uk_pages < keg->uk_maxpages)
2196 			keg->uk_flags &= ~UMA_ZFLAG_FULL;
2197 
2198 		/* We can handle one more allocation */
2199 		wakeup_one(keg);
2200 	}
2201 
2202 	ZONE_UNLOCK(zone);
2203 }
2204 
2205 /* See uma.h */
2206 void
2207 uma_zone_set_max(uma_zone_t zone, int nitems)
2208 {
2209 	uma_keg_t keg;
2210 
2211 	keg = zone->uz_keg;
2212 	ZONE_LOCK(zone);
2213 	if (keg->uk_ppera > 1)
2214 		keg->uk_maxpages = nitems * keg->uk_ppera;
2215 	else
2216 		keg->uk_maxpages = nitems / keg->uk_ipers;
2217 
2218 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
2219 		keg->uk_maxpages++;
2220 
2221 	ZONE_UNLOCK(zone);
2222 }
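/*
 * Worked example (illustrative): for a keg with uk_ipers == 10 and
 * uk_ppera == 1, uma_zone_set_max(zone, 25) yields uk_maxpages = 2, which
 * is then bumped to 3 because 2 * 10 < 25.  The effective limit is thus
 * 30 items: limits are rounded up to a whole number of slabs.
 */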
2223 
2224 /* See uma.h */
2225 void
2226 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2227 {
2228 	ZONE_LOCK(zone);
2229 	KASSERT(zone->uz_keg->uk_pages == 0,
2230 	    ("uma_zone_set_init on non-empty keg"));
2231 	zone->uz_keg->uk_init = uminit;
2232 	ZONE_UNLOCK(zone);
2233 }
2234 
2235 /* See uma.h */
2236 void
2237 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2238 {
2239 	ZONE_LOCK(zone);
2240 	KASSERT(zone->uz_keg->uk_pages == 0,
2241 	    ("uma_zone_set_fini on non-empty keg"));
2242 	zone->uz_keg->uk_fini = fini;
2243 	ZONE_UNLOCK(zone);
2244 }
2245 
2246 /* See uma.h */
2247 void
2248 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2249 {
2250 	ZONE_LOCK(zone);
2251 	KASSERT(zone->uz_keg->uk_pages == 0,
2252 	    ("uma_zone_set_zinit on non-empty keg"));
2253 	zone->uz_init = zinit;
2254 	ZONE_UNLOCK(zone);
2255 }
2256 
2257 /* See uma.h */
2258 void
2259 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
2260 {
2261 	ZONE_LOCK(zone);
2262 	KASSERT(zone->uz_keg->uk_pages == 0,
2263 	    ("uma_zone_set_zfini on non-empty keg"));
2264 	zone->uz_fini = zfini;
2265 	ZONE_UNLOCK(zone);
2266 }
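/*
 * Usage sketch for the init/fini setters above (hypothetical zone "foo",
 * for illustration only): each setter asserts that the keg has no pages
 * yet, so they may only be called between uma_zcreate() and the first
 * allocation, e.g.:
 *
 *	zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	uma_zone_set_init(zone, foo_init);
 *	uma_zone_set_fini(zone, foo_fini);
 */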
2267 
2268 /* See uma.h */
2269 void
2270 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
2271 {
2272 	ZONE_LOCK(zone);
2273 	zone->uz_keg->uk_freef = freef;
2274 	ZONE_UNLOCK(zone);
2275 }
2276 
2277 /* See uma.h */
2278 void
2279 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
2280 {
2281 	ZONE_LOCK(zone);
2282 	zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
2283 	zone->uz_keg->uk_allocf = allocf;
2284 	ZONE_UNLOCK(zone);
2285 }
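/*
 * Sketch of a custom back-end allocator (hypothetical, for illustration):
 * a function with the same shape as page_alloc() can be installed with
 * uma_zone_set_allocf(), e.g.:
 *
 *	static void *
 *	foo_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
 *	{
 *		*pflag = UMA_SLAB_KMEM;
 *		return ((void *)kmem_malloc(kmem_map, bytes, wait));
 *	}
 *
 *	uma_zone_set_allocf(zone, foo_alloc);
 */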
2286 
2287 /* See uma.h */
2288 int
2289 uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
2290 {
2291 	uma_keg_t keg;
2292 	vm_offset_t kva;
2293 	int pages;
2294 
2295 	keg = zone->uz_keg;
2296 	pages = count / keg->uk_ipers;
2297 
2298 	if (pages * keg->uk_ipers < count)
2299 		pages++;
2300 
2301 	kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
2302 
2303 	if (kva == 0)
2304 		return (0);
2305 	if (obj == NULL) {
2306 		obj = vm_object_allocate(OBJT_DEFAULT,
2307 		    pages);
2308 	} else {
2309 		VM_OBJECT_LOCK_INIT(obj);
2310 		_vm_object_allocate(OBJT_DEFAULT,
2311 		    pages, obj);
2312 	}
2313 	ZONE_LOCK(zone);
2314 	keg->uk_kva = kva;
2315 	keg->uk_obj = obj;
2316 	keg->uk_maxpages = pages;
2317 	keg->uk_allocf = obj_alloc;
2318 	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
2319 	ZONE_UNLOCK(zone);
2320 	return (1);
2321 }
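/*
 * Illustrative use (hypothetical): give a zone a dedicated, preallocated
 * KVA range and backing VM object sized for roughly `count' items
 * (rounded up to whole slabs); the routine returns 0 if the KVA could
 * not be reserved:
 *
 *	if (uma_zone_set_obj(foozone, NULL, 1000) == 0)
 *		panic("foozone: could not reserve KVA");
 *
 * Passing obj == NULL lets the routine allocate the backing object itself.
 */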
2322 
2323 /* See uma.h */
2324 void
2325 uma_prealloc(uma_zone_t zone, int items)
2326 {
2327 	int slabs;
2328 	uma_slab_t slab;
2329 	uma_keg_t keg;
2330 
2331 	keg = zone->uz_keg;
2332 	ZONE_LOCK(zone);
2333 	slabs = items / keg->uk_ipers;
2334 	if (slabs * keg->uk_ipers < items)
2335 		slabs++;
2336 	while (slabs > 0) {
2337 		slab = slab_zalloc(zone, M_WAITOK);
2338 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2339 		slabs--;
2340 	}
2341 	ZONE_UNLOCK(zone);
2342 }
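/*
 * Worked example (illustrative): with uk_ipers == 8, uma_prealloc(zone, 20)
 * rounds up to 3 slabs (24 items), allocating each slab with M_WAITOK and
 * putting it on the keg's free-slab list.
 */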
2343 
2344 /* See uma.h */
2345 u_int32_t *
2346 uma_find_refcnt(uma_zone_t zone, void *item)
2347 {
2348 	uma_slabrefcnt_t slab;
2349 	uma_keg_t keg;
2350 	u_int32_t *refcnt;
2351 	int idx;
2352 
2353 	keg = zone->uz_keg;
2354 	slab = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
2355 	KASSERT(slab != NULL,
2356 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
2357 	idx = ((unsigned long)item - (unsigned long)slab->us_data)
2358 	    / keg->uk_rsize;
2359 	refcnt = &(slab->us_freelist[idx].us_refcnt);
2360 	return (refcnt);
2361 }
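/*
 * Usage sketch (illustrative): for a zone created with UMA_ZONE_REFCNT,
 * the caller owns the per-item reference counter returned here, e.g.:
 *
 *	u_int32_t *refcnt;
 *
 *	refcnt = uma_find_refcnt(zone, item);
 *	*refcnt = 1;
 */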
2362 
2363 /* See uma.h */
2364 void
2365 uma_reclaim(void)
2366 {
2367 #ifdef UMA_DEBUG
2368 	printf("UMA: vm asked us to release pages!\n");
2369 #endif
2370 	bucket_enable();
2371 	zone_foreach(zone_drain);
2372 	/*
2373 	 * Some slabs may already have been freed, but the slab zones were
2374 	 * visited early in the walk above; drain them again so that pages
2375 	 * emptied by the other zones can be freed.  Do the same for buckets.
2376 	 */
2377 	zone_drain(slabzone);
2378 	zone_drain(slabrefzone);
2379 	bucket_zone_drain();
2380 }
2381 
2382 void *
2383 uma_large_malloc(int size, int wait)
2384 {
2385 	void *mem;
2386 	uma_slab_t slab;
2387 	u_int8_t flags;
2388 
2389 	slab = uma_zalloc_internal(slabzone, NULL, wait);
2390 	if (slab == NULL)
2391 		return (NULL);
2392 	mem = page_alloc(NULL, size, &flags, wait);
2393 	if (mem) {
2394 		vsetslab((vm_offset_t)mem, slab);
2395 		slab->us_data = mem;
2396 		slab->us_flags = flags | UMA_SLAB_MALLOC;
2397 		slab->us_size = size;
2398 	} else {
2399 		uma_zfree_internal(slabzone, slab, NULL, 0);
2400 	}
2401 
2402 	return (mem);
2403 }
2404 
2405 void
2406 uma_large_free(uma_slab_t slab)
2407 {
2408 	vsetobj((vm_offset_t)slab->us_data, kmem_object);
2409 	page_free(slab->us_data, slab->us_size, slab->us_flags);
2410 	uma_zfree_internal(slabzone, slab, NULL, 0);
2411 }
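/*
 * Note (descriptive only): uma_large_malloc()/uma_large_free() carry
 * allocations that do not fit in a regular slab.  Only the slab header
 * comes from slabzone; the data pages are allocated separately through
 * page_alloc() and tracked via us_data/us_size, with UMA_SLAB_MALLOC set
 * so the free path knows how to give the pages back.
 */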
2412 
2413 void
2414 uma_print_stats(void)
2415 {
2416 	zone_foreach(uma_print_zone);
2417 }
2418 
2419 static void
2420 slab_print(uma_slab_t slab)
2421 {
2422 	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
2423 		slab->us_keg, slab->us_data, slab->us_freecount,
2424 		slab->us_firstfree);
2425 }
2426 
2427 static void
2428 cache_print(uma_cache_t cache)
2429 {
2430 	printf("alloc: %p(%d), free: %p(%d)\n",
2431 		cache->uc_allocbucket,
2432 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
2433 		cache->uc_freebucket,
2434 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
2435 }
2436 
2437 void
2438 uma_print_zone(uma_zone_t zone)
2439 {
2440 	uma_cache_t cache;
2441 	uma_keg_t keg;
2442 	uma_slab_t slab;
2443 	int i;
2444 
2445 	keg = zone->uz_keg;
2446 	printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
2447 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
2448 	    keg->uk_ipers, keg->uk_ppera,
2449 	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
2450 	printf("Part slabs:\n");
2451 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
2452 		slab_print(slab);
2453 	printf("Free slabs:\n");
2454 	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
2455 		slab_print(slab);
2456 	printf("Full slabs:\n");
2457 	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
2458 		slab_print(slab);
2459 	for (i = 0; i <= mp_maxid; i++) {
2460 		if (CPU_ABSENT(i))
2461 			continue;
2462 		cache = &zone->uz_cpu[i];
2463 		printf("CPU %d Cache:\n", i);
2464 		cache_print(cache);
2465 	}
2466 }
2467 
2468 /*
2469  * Sysctl handler for vm.zone
2470  *
2471  * stolen from vm_zone.c
2472  */
2473 static int
2474 sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
2475 {
2476 	int error, len, cnt;
2477 	const int linesize = 128;	/* conservative */
2478 	int totalfree;
2479 	char *tmpbuf, *offset;
2480 	uma_zone_t z;
2481 	uma_keg_t zk;
2482 	char *p;
2483 	int cpu;
2484 	int cachefree;
2485 	uma_bucket_t bucket;
2486 	uma_cache_t cache;
2487 
2488 	cnt = 0;
2489 	mtx_lock(&uma_mtx);
2490 	LIST_FOREACH(zk, &uma_kegs, uk_link) {
2491 		LIST_FOREACH(z, &zk->uk_zones, uz_link)
2492 			cnt++;
2493 	}
2494 	mtx_unlock(&uma_mtx);
2495 	MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
2496 			M_TEMP, M_WAITOK);
2497 	len = snprintf(tmpbuf, linesize,
2498 	    "\nITEM            SIZE     LIMIT     USED    FREE  REQUESTS\n\n");
2499 	if (cnt == 0)
2500 		tmpbuf[len - 1] = '\0';
2501 	error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
2502 	if (error || cnt == 0)
2503 		goto out;
2504 	offset = tmpbuf;
2505 	mtx_lock(&uma_mtx);
2506 	LIST_FOREACH(zk, &uma_kegs, uk_link) {
2507 	  LIST_FOREACH(z, &zk->uk_zones, uz_link) {
2508 		if (cnt == 0)	/* list may have changed size */
2509 			break;
2510 		if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
2511 			for (cpu = 0; cpu <= mp_maxid; cpu++) {
2512 				if (CPU_ABSENT(cpu))
2513 					continue;
2514 				CPU_LOCK(cpu);
2515 			}
2516 		}
2517 		ZONE_LOCK(z);
2518 		cachefree = 0;
2519 		if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
2520 			for (cpu = 0; cpu <= mp_maxid; cpu++) {
2521 				if (CPU_ABSENT(cpu))
2522 					continue;
2523 				cache = &z->uz_cpu[cpu];
2524 				if (cache->uc_allocbucket != NULL)
2525 					cachefree += cache->uc_allocbucket->ub_cnt;
2526 				if (cache->uc_freebucket != NULL)
2527 					cachefree += cache->uc_freebucket->ub_cnt;
2528 				CPU_UNLOCK(cpu);
2529 			}
2530 		}
2531 		LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
2532 			cachefree += bucket->ub_cnt;
2533 		}
2534 		totalfree = zk->uk_free + cachefree;
2535 		len = snprintf(offset, linesize,
2536 		    "%-12.12s  %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
2537 		    z->uz_name, zk->uk_size,
2538 		    zk->uk_maxpages * zk->uk_ipers,
2539 		    (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree,
2540 		    totalfree,
2541 		    (unsigned long long)z->uz_allocs);
2542 		ZONE_UNLOCK(z);
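		/*
		 * Walk back over the padding emitted by the %-12.12s format
		 * and place a ':' immediately after the zone name.
		 */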
2543 		for (p = offset + 12; p > offset && *p == ' '; --p)
2544 			/* nothing */ ;
2545 		p[1] = ':';
2546 		cnt--;
2547 		offset += len;
2548 	  }
2549 	}
2550 	mtx_unlock(&uma_mtx);
2551 	*offset++ = '\0';
2552 	error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
2553 out:
2554 	FREE(tmpbuf, M_TEMP);
2555 	return (error);
2556 }
2557