xref: /freebsd/sys/vm/uma_core.c (revision 42c159fe388a3765f69860c84183700af37aca8a)
1 /*
2  * Copyright (c) 2002, Jeffrey Roberson <jroberson@chesapeake.net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  *
28  */
29 
30 /*
31  * uma_core.c  Implementation of the Universal Memory allocator
32  *
33  * This allocator is intended to replace the multitude of similar object caches
34  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
35  * efficient.  A primary design goal is to return unused memory to the rest of
36  * the system.  This will make the system as a whole more flexible due to the
37  * ability to move memory to subsystems which most need it instead of leaving
38  * pools of reserved memory unused.
39  *
40  * The basic ideas stem from similar slab/zone based allocators whose algorithms
41  * are well known.
42  *
43  */
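
/*
 * Illustrative sketch of the consumer-facing API implemented below, kept
 * under #if 0 so it is never compiled.  The zone name, struct foo, and the
 * 1024 item limit are invented for this example and are not part of UMA.
 */
#if 0
struct foo {
	int	f_refs;
};

static uma_zone_t foo_zone;

static void
foo_example(void)
{
	struct foo *f;

	foo_zone = uma_zcreate("example foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(foo_zone, 1024);

	f = uma_zalloc_arg(foo_zone, NULL, M_WAITOK);
	/* ... use f ... */
	uma_zfree_arg(foo_zone, f, NULL);
}
#endif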
44 
45 /*
46  * TODO:
47  *	- Improve memory usage for large allocations
48  *	- Improve INVARIANTS (0xdeadc0de write out)
49  *	- Investigate cache size adjustments
50  */
51 
52 /* I should really use ktr.. */
53 /*
54 #define UMA_DEBUG 1
55 #define UMA_DEBUG_ALLOC 1
56 #define UMA_DEBUG_ALLOC_1 1
57 */
58 
59 
60 #include "opt_param.h"
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/kernel.h>
64 #include <sys/types.h>
65 #include <sys/queue.h>
66 #include <sys/malloc.h>
67 #include <sys/lock.h>
68 #include <sys/sysctl.h>
69 #include <machine/types.h>
70 #include <sys/mutex.h>
71 #include <sys/smp.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_object.h>
75 #include <vm/vm_page.h>
76 #include <vm/vm_param.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_extern.h>
80 #include <vm/uma.h>
81 #include <vm/uma_int.h>
82 
83 /*
84  * This is the zone from which all zones are spawned.  The idea is that even
85  * the zone heads are allocated from the allocator, so we use the bss section
86  * to bootstrap us.
87  */
88 static struct uma_zone master_zone;
89 static uma_zone_t zones = &master_zone;
90 
91 /* This is the zone from which all of uma_slab_t's are allocated. */
92 static uma_zone_t slabzone;
93 
94 /*
95  * The initial hash tables come out of this zone so they can be allocated
96  * prior to malloc coming up.
97  */
98 static uma_zone_t hashzone;
99 
100 /*
101  * Zone that buckets come from.
102  */
103 static uma_zone_t bucketzone;
104 
105 /* Linked list of all zones in the system */
106 static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);
107 
108 /* This mutex protects the zone list */
109 static struct mtx uma_mtx;
110 
111 /* Linked list of boot time pages */
112 static LIST_HEAD(,uma_slab) uma_boot_pages =
113     LIST_HEAD_INITIALIZER(&uma_boot_pages);
114 
115 /* Count of free boottime pages */
116 static int uma_boot_free = 0;
117 
118 /* Is the VM done starting up? */
119 static int booted = 0;
120 
121 /* This is the handle used to schedule our working set calculator */
122 static struct callout uma_callout;
123 
124 /* This is mp_maxid + 1, for use while looping over each cpu */
125 static int maxcpu;
126 
127 /*
128  * This structure is passed as the zone ctor arg so that I don't have to create
129  * a special allocation function just for zones.
130  */
131 struct uma_zctor_args {
132 	char *name;
133 	int size;
134 	uma_ctor ctor;
135 	uma_dtor dtor;
136 	uma_init uminit;
137 	uma_fini fini;
138 	int align;
139 	u_int16_t flags;
140 };
141 
142 /*
143  * This is the malloc hash table which is used to find the zone that a
144  * malloc allocation came from.  It is not currently resizeable.  The
145  * memory for the actual hash bucket is allocated in kmeminit.
146  */
147 struct uma_hash mhash;
148 struct uma_hash *mallochash = &mhash;
149 
150 /* Prototypes.. */
151 
152 static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
153 static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
154 static void page_free(void *, int, u_int8_t);
155 static uma_slab_t slab_zalloc(uma_zone_t, int);
156 static void cache_drain(uma_zone_t);
157 static void bucket_drain(uma_zone_t, uma_bucket_t);
158 static void zone_drain(uma_zone_t);
159 static void zone_ctor(void *, int, void *);
160 static void zero_init(void *, int);
161 static void zone_small_init(uma_zone_t zone);
162 static void zone_large_init(uma_zone_t zone);
163 static void zone_foreach(void (*zfunc)(uma_zone_t));
164 static void zone_timeout(uma_zone_t zone);
165 static void hash_expand(struct uma_hash *);
166 static void uma_timeout(void *);
167 static void uma_startup3(void);
168 static void *uma_zalloc_internal(uma_zone_t, void *, int, int *, int);
169 static void uma_zfree_internal(uma_zone_t,
170     void *, void *, int);
171 void uma_print_zone(uma_zone_t);
172 void uma_print_stats(void);
173 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
174 
175 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
176     NULL, 0, sysctl_vm_zone, "A", "Zone Info");
177 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
178 
179 
180 /*
181  * Routine called by timeout which is used to fire off some time interval
182  * based calculations.  (working set, stats, etc.)
183  *
184  * Arguments:
185  *	arg   Unused
186  *
187  * Returns:
188  *	Nothing
189  */
190 static void
191 uma_timeout(void *unused)
192 {
193 	zone_foreach(zone_timeout);
194 
195 	/* Reschedule this event */
196 	callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
197 }
198 
199 /*
200  * Routine to perform timeout driven calculations.  This does the working set
201  * as well as hash expanding, and per cpu statistics aggregation.
202  *
203  *  Arguments:
204  *	zone  The zone to operate on
205  *
206  *  Returns:
207  *	Nothing
208  */
209 static void
210 zone_timeout(uma_zone_t zone)
211 {
212 	uma_cache_t cache;
213 	u_int64_t alloc;
214 	int free;
215 	int cpu;
216 
217 	alloc = 0;
218 	free = 0;
219 
220 	/*
221 	 * Aggregate per cpu cache statistics back to the zone.
222 	 *
223 	 * I may rewrite this to set a flag in the per cpu cache instead of
224 	 * locking.  If the flag is not cleared on the next round I will have
225 	 * to lock and do it here instead so that the statistics don't get too
226 	 * far out of sync.
227 	 */
228 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
229 		for (cpu = 0; cpu < maxcpu; cpu++) {
230 			if (CPU_ABSENT(cpu))
231 				continue;
232 			CPU_LOCK(zone, cpu);
233 			cache = &zone->uz_cpu[cpu];
234 			/* Add them up, and reset */
235 			alloc += cache->uc_allocs;
236 			cache->uc_allocs = 0;
237 			if (cache->uc_allocbucket)
238 				free += cache->uc_allocbucket->ub_ptr + 1;
239 			if (cache->uc_freebucket)
240 				free += cache->uc_freebucket->ub_ptr + 1;
241 			CPU_UNLOCK(zone, cpu);
242 		}
243 	}
244 
245 	/* Now push these stats back into the zone.. */
246 	ZONE_LOCK(zone);
247 	zone->uz_allocs += alloc;
248 
249 	/*
250 	 * cachefree is an instantaneous snapshot of what is in the per cpu
251 	 * caches, not an accurate counter
252 	 */
253 	zone->uz_cachefree = free;
254 
255 	/*
256 	 * Expand the zone hash table.
257 	 *
258 	 * This is done if the number of slabs is larger than the hash size.
259 	 * What I'm trying to do here is eliminate collisions entirely.  This
260 	 * may be a little aggressive.  Should I allow for two collisions max?
261 	 */
262 
263 	if ((zone->uz_flags & UMA_ZFLAG_OFFPAGE) &&
264 	    !(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
265 		if (zone->uz_pages / zone->uz_ppera
266 		    >= zone->uz_hash.uh_hashsize)
267 			hash_expand(&zone->uz_hash);
268 	}
269 
270 	/*
271 	 * Here we compute the working set size as the total number of items
272 	 * allocated since the last time interval.  This is slightly
273 	 * suboptimal. What we really want is the highest number of outstanding
274 	 * items during the last time quantum.  This should be close enough.
275 	 *
276 	 * The working set size is used to throttle the zone_drain function.
277 	 * We don't want to return memory that we may need again immediately.
278 	 */
279 	alloc = zone->uz_allocs - zone->uz_oallocs;
280 	zone->uz_oallocs = zone->uz_allocs;
281 	zone->uz_wssize = alloc;
282 
283 	ZONE_UNLOCK(zone);
284 }
285 
286 /*
287  * Expands the hash table for OFFPAGE zones.  This is done from zone_timeout
288  * to reduce collisions.  This must not be done in the regular allocation path,
289  * otherwise, we can recurse on the vm while allocating pages.
290  *
291  * Arguments:
292  *	hash  The hash you want to expand by a factor of two.
293  *
294  * Returns:
295  * 	Nothing
296  *
297  * Discussion:
298  */
299 static void
300 hash_expand(struct uma_hash *hash)
301 {
302 	struct slabhead *newhash;
303 	struct slabhead *oldhash;
304 	uma_slab_t slab;
305 	int hzonefree;
306 	int hashsize;
307 	int alloc;
308 	int hval;
309 	int i;
310 
311 
312 	/*
313 	 * Remember the old hash size and see if it has to go back to the
314 	 * hash zone, or malloc.  The hash zone is used for the initial hash table.
315 	 */
316 
317 	hashsize = hash->uh_hashsize;
318 	oldhash = hash->uh_slab_hash;
319 
320 	if (hashsize == UMA_HASH_SIZE_INIT)
321 		hzonefree = 1;
322 	else
323 		hzonefree = 0;
324 
325 
326 	/* We're just going to grow to the next power of two */
327 	if (hash->uh_hashsize)  {
328 		alloc = sizeof(hash->uh_slab_hash[0]) * (hash->uh_hashsize * 2);
329 		/* XXX Shouldn't be abusing DEVBUF here */
330 		newhash = (struct slabhead *)malloc(alloc, M_DEVBUF, M_NOWAIT);
331 		if (newhash == NULL) {
332 			return;
333 		}
334 		hash->uh_hashsize *= 2;
335 	} else {
336 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
337 		newhash = uma_zalloc_internal(hashzone, NULL, M_WAITOK, NULL, -1);
338 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
339 	}
340 
341 	bzero(newhash, alloc);
342 
343 	hash->uh_hashmask = hash->uh_hashsize - 1;
344 
345 	/*
346 	 * I need to investigate hash algorithms for resizing without a
347 	 * full rehash.
348 	 */
349 
350 	for (i = 0; i < hashsize; i++)
351 		while (!SLIST_EMPTY(&hash->uh_slab_hash[i])) {
352 			slab = SLIST_FIRST(&hash->uh_slab_hash[i]);
353 			SLIST_REMOVE_HEAD(&hash->uh_slab_hash[i], us_hlink);
354 			hval = UMA_HASH(hash, slab->us_data);
355 			SLIST_INSERT_HEAD(&newhash[hval], slab, us_hlink);
356 		}
357 
358 	if (hash->uh_slab_hash) {
359 		if (hzonefree)
360 			uma_zfree_internal(hashzone,
361 			    hash->uh_slab_hash, NULL, 0);
362 		else
363 			free(hash->uh_slab_hash, M_DEVBUF);
364 	}
365 	hash->uh_slab_hash = newhash;
366 
367 	return;
368 }
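
/*
 * Note on hash_expand():  uh_hashsize is always a power of two, starting at
 * UMA_HASH_SIZE_INIT and doubling on each expansion, and uh_hashmask is kept
 * at uh_hashsize - 1, the mask the UMA_HASH() macro (from the UMA headers)
 * uses to pick a bucket rather than doing a modulo.
 */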
369 
370 /*
371  * Frees all outstanding items in a bucket
372  *
373  * Arguments:
374  *	zone   The zone to free to, must be unlocked.
375  *	bucket The free/alloc bucket with items, cpu queue must be locked.
376  *
377  * Returns:
378  *	Nothing
379  */
380 
381 static void
382 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
383 {
384 	uma_slab_t slab;
385 	int mzone;
386 	void *item;
387 
388 	if (bucket == NULL)
389 		return;
390 
391 	slab = NULL;
392 	mzone = 0;
393 
394 	/* We have to lookup the slab again for malloc.. */
395 	if (zone->uz_flags & UMA_ZFLAG_MALLOC)
396 		mzone = 1;
397 
398 	while (bucket->ub_ptr > -1)  {
399 		item = bucket->ub_bucket[bucket->ub_ptr];
400 #ifdef INVARIANTS
401 		bucket->ub_bucket[bucket->ub_ptr] = NULL;
402 		KASSERT(item != NULL,
403 		    ("bucket_drain: botched ptr, item is NULL"));
404 #endif
405 		bucket->ub_ptr--;
406 		/*
407 		 * This is extremely inefficient.  The slab pointer was passed
408 		 * to uma_zfree_arg, but we lost it because the buckets don't
409 		 * hold them.  This will go away when free() gets a size passed
410 		 * to it.
411 		 */
412 		if (mzone)
413 			slab = hash_sfind(mallochash,
414 			    (u_int8_t *)((unsigned long)item &
415 			   (~UMA_SLAB_MASK)));
416 		uma_zfree_internal(zone, item, slab, 1);
417 	}
418 }
419 
420 /*
421  * Drains the per cpu caches for a zone.
422  *
423  * Arguments:
424  *	zone  The zone to drain, must be unlocked.
425  *
426  * Returns:
427  *	Nothing
428  *
429  * This function returns with the zone locked so that the per cpu queues can
430  * not be filled until zone_drain is finished.
431  *
432  */
433 static void
434 cache_drain(uma_zone_t zone)
435 {
436 	uma_bucket_t bucket;
437 	uma_cache_t cache;
438 	int cpu;
439 
440 	/*
441 	 * Flush out the per cpu queues.
442 	 *
443 	 * XXX This causes unnecessary thrashing due to immediately having
444 	 * empty per cpu queues.  I need to improve this.
445 	 */
446 
447 	/*
448 	 * We have to lock each cpu cache before locking the zone
449 	 */
450 	ZONE_UNLOCK(zone);
451 
452 	for (cpu = 0; cpu < maxcpu; cpu++) {
453 		if (CPU_ABSENT(cpu))
454 			continue;
455 		CPU_LOCK(zone, cpu);
456 		cache = &zone->uz_cpu[cpu];
457 		bucket_drain(zone, cache->uc_allocbucket);
458 		bucket_drain(zone, cache->uc_freebucket);
459 	}
460 
461 	/*
462 	 * Drain the bucket queues and free the buckets, we just keep two per
463 	 * cpu (alloc/free).
464 	 */
465 	ZONE_LOCK(zone);
466 	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
467 		LIST_REMOVE(bucket, ub_link);
468 		ZONE_UNLOCK(zone);
469 		bucket_drain(zone, bucket);
470 		uma_zfree_internal(bucketzone, bucket, NULL, 0);
471 		ZONE_LOCK(zone);
472 	}
473 
474 	/* Now we do the free queue.. */
475 	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
476 		LIST_REMOVE(bucket, ub_link);
477 		uma_zfree_internal(bucketzone, bucket, NULL, 0);
478 	}
479 
480 	/* We unlock here, but they will all block until the zone is unlocked */
481 	for (cpu = 0; cpu < maxcpu; cpu++) {
482 		if (CPU_ABSENT(cpu))
483 			continue;
484 		CPU_UNLOCK(zone, cpu);
485 	}
486 
487 	zone->uz_cachefree = 0;
488 }
489 
490 /*
491  * Frees pages from a zone back to the system.  This is done on demand from
492  * the pageout daemon.
493  *
494  * Arguments:
495  *	zone  The zone to free pages from
496  *
497  * Returns:
498  *	Nothing.
499  */
500 static void
501 zone_drain(uma_zone_t zone)
502 {
503 	uma_slab_t slab;
504 	uma_slab_t n;
505 	u_int64_t extra;
506 	u_int8_t flags;
507 	u_int8_t *mem;
508 	int i;
509 
510 	/*
511 	 * We don't want to take pages from statically allocated zones at this
512 	 * time.
513 	 */
514 	if (zone->uz_flags & UMA_ZFLAG_NOFREE || zone->uz_freef == NULL)
515 		return;
516 
517 	ZONE_LOCK(zone);
518 
519 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
520 		cache_drain(zone);
521 
522 	if (zone->uz_free < zone->uz_wssize)
523 		goto finished;
524 #ifdef UMA_DEBUG
525 	printf("%s working set size: %llu free items: %u\n",
526 	    zone->uz_name, (unsigned long long)zone->uz_wssize, zone->uz_free);
527 #endif
528 	extra = zone->uz_free - zone->uz_wssize;
529 	extra /= zone->uz_ipers;
530 
531 	/* extra is now the number of extra slabs that we can free */
532 
533 	if (extra == 0)
534 		goto finished;
535 
536 	slab = LIST_FIRST(&zone->uz_free_slab);
537 	while (slab && extra) {
538 		n = LIST_NEXT(slab, us_link);
539 
540 		/* We have nowhere to free these to */
541 		if (slab->us_flags & UMA_SLAB_BOOT) {
542 			slab = n;
543 			continue;
544 		}
545 
546 		LIST_REMOVE(slab, us_link);
547 		zone->uz_pages -= zone->uz_ppera;
548 		zone->uz_free -= zone->uz_ipers;
549 		if (zone->uz_fini)
550 			for (i = 0; i < zone->uz_ipers; i++)
551 				zone->uz_fini(
552 				    slab->us_data + (zone->uz_rsize * i),
553 				    zone->uz_size);
554 		flags = slab->us_flags;
555 		mem = slab->us_data;
556 		if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) {
557 			if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
558 				UMA_HASH_REMOVE(mallochash,
559 				    slab, slab->us_data);
560 			} else {
561 				UMA_HASH_REMOVE(&zone->uz_hash,
562 				    slab, slab->us_data);
563 			}
564 			uma_zfree_internal(slabzone, slab, NULL, 0);
565 		} else if (zone->uz_flags & UMA_ZFLAG_MALLOC)
566 			UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
567 #ifdef UMA_DEBUG
568 		printf("%s: Returning %d bytes.\n",
569 		    zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
570 #endif
571 		zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
572 
573 		slab = n;
574 		extra--;
575 	}
576 
577 finished:
578 	ZONE_UNLOCK(zone);
579 }
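
/*
 * Worked example for the working set check above (hypothetical numbers):
 * with uz_wssize == 100, uz_free == 190 and uz_ipers == 30, extra is
 * (190 - 100) / 30 == 3, so at most three completely free slabs are handed
 * back through uz_freef (fewer if some of them are boot pages).
 */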
580 
581 /*
582  * Allocate a new slab for a zone.  This does not insert the slab onto a list.
583  *
584  * Arguments:
585  *	zone  The zone to allocate slabs for
586  *	wait  Shall we wait?
587  *
588  * Returns:
589  *	The slab that was allocated or NULL if there is no memory and the
590  *	caller specified M_NOWAIT.
591  *
592  */
593 static uma_slab_t
594 slab_zalloc(uma_zone_t zone, int wait)
595 {
596 	uma_slab_t slab;	/* Starting slab */
597 	u_int8_t *mem;
598 	u_int8_t flags;
599 	int i;
600 
601 #ifdef UMA_DEBUG
602 	printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
603 #endif
604 	if (zone->uz_maxpages &&
605 	    zone->uz_pages + zone->uz_ppera > zone->uz_maxpages)
606 		return (NULL);
607 
608 	if (booted || (zone->uz_flags & UMA_ZFLAG_PRIVALLOC)) {
609 		ZONE_UNLOCK(zone);
610 		mtx_lock(&Giant);
611 		slab = (uma_slab_t )zone->uz_allocf(zone,
612 		    zone->uz_ppera * UMA_SLAB_SIZE, &flags, wait);
613 		mtx_unlock(&Giant);
614 		ZONE_LOCK(zone);
615 		if (slab != NULL)
616 			slab->us_data = (u_int8_t *)slab;
617 		else
618 			return (NULL);
619 	} else {
620 
621 		if (zone->uz_ppera > 1)
622 			panic("UMA: Attempting to allocate multiple pages before vm has started.\n");
623 		if (zone->uz_flags & UMA_ZFLAG_MALLOC)
624 			panic("Mallocing before uma_startup2 has been called.\n");
625 		if (uma_boot_free == 0)
626 			panic("UMA: Ran out of pre init pages, increase UMA_BOOT_PAGES\n");
627 		slab = LIST_FIRST(&uma_boot_pages);
628 		LIST_REMOVE(slab, us_link);
629 		uma_boot_free--;
630 	}
631 
632 	mem = slab->us_data;
633 
634 	/* Alloc slab structure for offpage, otherwise adjust its position */
635 	if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
636 		slab = (uma_slab_t )(mem + zone->uz_pgoff);
637 	} else  {
638 		slab = uma_zalloc_internal(slabzone, NULL, wait, NULL, -1);
639 		if (slab == NULL)	/* XXX This should go away */
640 			panic("UMA: No free slab structures");
641 		if (!(zone->uz_flags & UMA_ZFLAG_MALLOC))
642 			UMA_HASH_INSERT(&zone->uz_hash, slab, mem);
643 	}
644 	if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
645 #ifdef UMA_DEBUG
646 		printf("Inserting %p into malloc hash from slab %p\n",
647 		    mem, slab);
648 #endif
649 		UMA_HASH_INSERT(mallochash, slab, mem);
650 	}
651 
652 	slab->us_zone = zone;
653 	slab->us_data = mem;
654 
655 	/*
656 	 * This is intended to spread data out across cache lines.
657 	 *
658 	 * This code doesn't seem to work properly on x86, and on alpha
659 	 * it makes absolutely no performance difference. I'm sure it could
660 	 * use some tuning, but sun makes outrageous claims about its
661 	 * performance.
662 	 */
663 #if 0
664 	if (zone->uz_cachemax) {
665 		slab->us_data += zone->uz_cacheoff;
666 		zone->uz_cacheoff += UMA_CACHE_INC;
667 		if (zone->uz_cacheoff > zone->uz_cachemax)
668 			zone->uz_cacheoff = 0;
669 	}
670 #endif
671 
672 	slab->us_freecount = zone->uz_ipers;
673 	slab->us_firstfree = 0;
674 	slab->us_flags = flags;
675 	for (i = 0; i < zone->uz_ipers; i++)
676 		slab->us_freelist[i] = i+1;
677 
678 	if (zone->uz_init)
679 		for (i = 0; i < zone->uz_ipers; i++)
680 			zone->uz_init(slab->us_data + (zone->uz_rsize * i),
681 			    zone->uz_size);
682 
683 	zone->uz_pages += zone->uz_ppera;
684 	zone->uz_free += zone->uz_ipers;
685 
686 	return (slab);
687 }
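
/*
 * Example of the embedded free list initialized above (a sketch, assuming
 * uz_ipers == 4):  us_firstfree starts at 0 and us_freelist is {1, 2, 3, 4}.
 * uma_zalloc_internal() hands out item 0 and advances us_firstfree to
 * us_freelist[0] == 1; uma_zfree_internal() later pushes a freed index back
 * by setting us_freelist[idx] = us_firstfree and us_firstfree = idx.
 */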
688 
689 /*
690  * Allocates a number of pages from the system
691  *
692  * Arguments:
693  *	zone  Unused
694  *	bytes  The number of bytes requested
695  *	wait  Shall we wait?
696  *
697  * Returns:
698  *	A pointer to the alloced memory or possibly
699  *	NULL if M_NOWAIT is set.
700  */
701 static void *
702 page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
703 {
704 	void *p;	/* Returned page */
705 
706 	/*
707 	 * XXX The original zone allocator did this, but I don't think it's
708 	 * necessary in current.
709 	 */
710 
711 	if (lockstatus(&kernel_map->lock, NULL)) {
712 		*pflag = UMA_SLAB_KMEM;
713 		p = (void *) kmem_malloc(kmem_map, bytes, wait);
714 	} else {
715 		*pflag = UMA_SLAB_KMAP;
716 		p = (void *) kmem_alloc(kernel_map, bytes);
717 	}
718 
719 	return (p);
720 }
721 
722 /*
723  * Allocates a number of pages from within an object
724  *
725  * Arguments:
726  *	zone   Unused
727  *	bytes  The number of bytes requested
728  *	wait   Shall we wait?
729  *
730  * Returns:
731  *	A pointer to the alloced memory or possibly
732  *	NULL if M_NOWAIT is set.
733  */
734 static void *
735 obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
736 {
737 	vm_offset_t zkva;
738 	vm_offset_t retkva;
739 	vm_page_t p;
740 	int pages;
741 
742 	retkva = NULL;
743 	pages = zone->uz_pages;
744 
745 	/*
746 	 * This looks a little weird since we're getting one page at a time
747 	 */
748 	while (bytes > 0) {
749 		p = vm_page_alloc(zone->uz_obj, pages,
750 		    VM_ALLOC_INTERRUPT);
751 		if (p == NULL)
752 			return (NULL);
753 
754 		zkva = zone->uz_kva + pages * PAGE_SIZE;
755 		if (retkva == NULL)
756 			retkva = zkva;
757 		pmap_qenter(zkva, &p, 1);
758 		bytes -= PAGE_SIZE;
759 		pages += 1;
760 	}
761 
762 	*flags = UMA_SLAB_PRIV;
763 
764 	return ((void *)retkva);
765 }
766 
767 /*
768  * Frees a number of pages to the system
769  *
770  * Arguments:
771  *	mem   A pointer to the memory to be freed
772  *	size  The size of the memory being freed
773  *	flags The original p->us_flags field
774  *
775  * Returns:
776  *	Nothing
777  *
778  */
779 static void
780 page_free(void *mem, int size, u_int8_t flags)
781 {
782 	vm_map_t map;
783 	if (flags & UMA_SLAB_KMEM)
784 		map = kmem_map;
785 	else if (flags & UMA_SLAB_KMAP)
786 		map = kernel_map;
787 	else
788 		panic("UMA: page_free used with invalid flags %d\n", flags);
789 
790 	kmem_free(map, (vm_offset_t)mem, size);
791 }
792 
793 /*
794  * Zero fill initializer
795  *
796  * Arguments/Returns follow uma_init specifications
797  *
798  */
799 static void
800 zero_init(void *mem, int size)
801 {
802 	bzero(mem, size);
803 }
804 
805 /*
806  * Finish creating a small uma zone.  This calculates ipers, and the zone size.
807  *
808  * Arguments
809  *	zone  The zone we should initialize
810  *
811  * Returns
812  *	Nothing
813  */
814 static void
815 zone_small_init(uma_zone_t zone)
816 {
817 	int rsize;
818 	int memused;
819 	int ipers;
820 
821 	rsize = zone->uz_size;
822 
823 	if (rsize < UMA_SMALLEST_UNIT)
824 		rsize = UMA_SMALLEST_UNIT;
825 
826 	if (rsize & zone->uz_align)
827 		rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);
828 
829 	zone->uz_rsize = rsize;
830 
831 	rsize += 1;	/* Account for the byte of linkage */
832 	zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
833 	zone->uz_ppera = 1;
834 
835 	memused = zone->uz_ipers * zone->uz_rsize;
836 
837 	/* Can we do any better? */
838 	if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
839 		if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
840 			return;
841 		ipers = UMA_SLAB_SIZE / zone->uz_rsize;
842 		if (ipers > zone->uz_ipers) {
843 			zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
844 			zone->uz_ipers = ipers;
845 		}
846 	}
847 
848 }
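
/*
 * Worked example for zone_small_init() (assuming UMA_SLAB_SIZE is a single
 * 4K page and a 64 byte slab header; the real values come from uma_int.h):
 * a 100 byte item with 8 byte alignment gets uz_rsize == 104 and costs 105
 * bytes with its free list byte, so uz_ipers == (4096 - 64) / 105 == 38 and
 * 4096 - 38 * 104 == 144 bytes are wasted.  Only if that waste reached
 * UMA_MAX_WASTE would the zone go OFFPAGE to fit 4096 / 104 == 39 items.
 */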
849 
850 /*
851  * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
852  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
853  * more complicated.
854  *
855  * Arguments
856  *	zone  The zone we should initialize
857  *
858  * Returns
859  *	Nothing
860  */
861 static void
862 zone_large_init(uma_zone_t zone)
863 {
864 	int pages;
865 
866 	pages = zone->uz_size / UMA_SLAB_SIZE;
867 
868 	/* Account for remainder */
869 	if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
870 		pages++;
871 
872 	zone->uz_ppera = pages;
873 	zone->uz_ipers = 1;
874 
875 	zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
876 	zone->uz_rsize = zone->uz_size;
877 }
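
/*
 * Worked example for zone_large_init() (again assuming 4K slabs):  a 5000
 * byte item needs uz_ppera == 2 pages and gets uz_ipers == 1, leaving
 * 2 * 4096 - 5000 == 3192 bytes unused per allocation, which is the sort of
 * waste the "Improve memory usage for large allocations" TODO refers to.
 */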
878 
879 /*
880  * Zone header ctor.  This initializes all fields, locks, etc.  And inserts
881  * the zone onto the global zone list.
882  *
883  * Arguments/Returns follow uma_ctor specifications
884  *	udata  Actually uma_zctor_args
885  *
886  */
887 
888 static void
889 zone_ctor(void *mem, int size, void *udata)
890 {
891 	struct uma_zctor_args *arg = udata;
892 	uma_zone_t zone = mem;
893 	int cplen;
894 	int cpu;
895 
896 	bzero(zone, size);
897 	zone->uz_name = arg->name;
898 	zone->uz_size = arg->size;
899 	zone->uz_ctor = arg->ctor;
900 	zone->uz_dtor = arg->dtor;
901 	zone->uz_init = arg->uminit;
902 	zone->uz_align = arg->align;
903 	zone->uz_free = 0;
904 	zone->uz_pages = 0;
905 	zone->uz_flags = 0;
906 	zone->uz_allocf = page_alloc;
907 	zone->uz_freef = page_free;
908 
909 	if (arg->flags & UMA_ZONE_ZINIT)
910 		zone->uz_init = zero_init;
911 
912 	if (arg->flags & UMA_ZONE_INTERNAL)
913 		zone->uz_flags |= UMA_ZFLAG_INTERNAL;
914 
915 	if (arg->flags & UMA_ZONE_MALLOC)
916 		zone->uz_flags |= UMA_ZFLAG_MALLOC;
917 
918 	if (arg->flags & UMA_ZONE_NOFREE)
919 		zone->uz_flags |= UMA_ZFLAG_NOFREE;
920 
921 	if (zone->uz_size > UMA_SLAB_SIZE)
922 		zone_large_init(zone);
923 	else
924 		zone_small_init(zone);
925 
926 	/* We do this so that the per cpu lock name is unique for each zone */
927 	memcpy(zone->uz_lname, "PCPU ", 5);
928 	cplen = min(strlen(zone->uz_name) + 1, LOCKNAME_LEN - 6);
929 	memcpy(zone->uz_lname+5, zone->uz_name, cplen);
930 	zone->uz_lname[LOCKNAME_LEN - 1] = '\0';
931 
932 	/*
933 	 * If we're putting the slab header in the actual page we need to
934 	 * figure out where in each page it goes.  This calculates a right
935 	 * justified offset into the memory on a UMA_ALIGN_PTR boundary.
936 	 */
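	/*
	 * Worked example (hypothetical sizes, assuming a 4K slab and a 64
	 * byte struct uma_slab):  with uz_ipers == 32 the header plus free
	 * list is 96 bytes, already a multiple of the pointer alignment, so
	 * uz_pgoff == 4096 - 96 == 4000 and the slab header occupies the
	 * last 96 bytes of the page.
	 */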
937 	if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
938 		int totsize;
939 		int waste;
940 
941 		/* Size of the slab struct and free list */
942 		totsize = sizeof(struct uma_slab) + zone->uz_ipers;
943 		if (totsize & UMA_ALIGN_PTR)
944 			totsize = (totsize & ~UMA_ALIGN_PTR) +
945 			    (UMA_ALIGN_PTR + 1);
946 		zone->uz_pgoff = UMA_SLAB_SIZE - totsize;
947 
948 		waste = zone->uz_pgoff;
949 		waste -= (zone->uz_ipers * zone->uz_rsize);
950 
951 		/*
952 		 * This calculates how much space we have for cache line size
953 	 * optimizations.  It works by offsetting each slab slightly.
954 		 * Currently it breaks on x86, and so it is disabled.
955 		 */
956 
957 		if (zone->uz_align < UMA_CACHE_INC && waste > UMA_CACHE_INC) {
958 			zone->uz_cachemax = waste - UMA_CACHE_INC;
959 			zone->uz_cacheoff = 0;
960 		}
961 
962 		totsize = zone->uz_pgoff + sizeof(struct uma_slab)
963 		    + zone->uz_ipers;
964 		/* I don't think it's possible, but I'll make sure anyway */
965 		if (totsize > UMA_SLAB_SIZE) {
966 			printf("zone %s ipers %d rsize %d size %d\n",
967 			    zone->uz_name, zone->uz_ipers, zone->uz_rsize,
968 			    zone->uz_size);
969 			panic("UMA slab won't fit.\n");
970 		}
971 	} else {
972 		/* hash_expand here to allocate the initial hash table */
973 		hash_expand(&zone->uz_hash);
974 		zone->uz_pgoff = 0;
975 	}
976 
977 #ifdef UMA_DEBUG
978 	printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
979 	    zone->uz_name, zone,
980 	    zone->uz_size, zone->uz_ipers,
981 	    zone->uz_ppera, zone->uz_pgoff);
982 #endif
983 	ZONE_LOCK_INIT(zone);
984 
985 	mtx_lock(&uma_mtx);
986 	LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
987 	mtx_unlock(&uma_mtx);
988 
989 	/*
990 	 * Some internal zones don't have room allocated for the per cpu
991 	 * caches.  If we're internal, bail out here.
992 	 */
993 
994 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
995 		return;
996 
997 	for (cpu = 0; cpu < maxcpu; cpu++) {
998 		if (zone->uz_ipers < UMA_BUCKET_SIZE)
999 			zone->uz_cpu[cpu].uc_count = zone->uz_ipers - 1;
1000 		else
1001 			zone->uz_cpu[cpu].uc_count = UMA_BUCKET_SIZE - 1;
1002 		CPU_LOCK_INIT(zone, cpu);
1003 	}
1004 }
1005 
1006 /*
1007  * Traverses every zone in the system and calls a callback
1008  *
1009  * Arguments:
1010  *	zfunc  A pointer to a function which accepts a zone
1011  *		as an argument.
1012  *
1013  * Returns:
1014  *	Nothing
1015  */
1016 static void
1017 zone_foreach(void (*zfunc)(uma_zone_t))
1018 {
1019 	uma_zone_t zone;
1020 
1021 	mtx_lock(&uma_mtx);
1022 	LIST_FOREACH(zone, &uma_zones, uz_link) {
1023 		zfunc(zone);
1024 	}
1025 	mtx_unlock(&uma_mtx);
1026 }
1027 
1028 /* Public functions */
1029 /* See uma.h */
1030 void
1031 uma_startup(void *bootmem)
1032 {
1033 	struct uma_zctor_args args;
1034 	uma_slab_t slab;
1035 	int slabsize;
1036 	int i;
1037 
1038 #ifdef UMA_DEBUG
1039 	printf("Creating uma zone headers zone.\n");
1040 #endif
1041 #ifdef SMP
1042 	maxcpu = mp_maxid + 1;
1043 #else
1044 	maxcpu = 1;
1045 #endif
1046 #ifdef UMA_DEBUG
1047 	printf("Max cpu = %d, mp_maxid = %d\n", maxcpu, mp_maxid);
1048 	Debugger("stop");
1049 #endif
1050 	mtx_init(&uma_mtx, "UMA lock", MTX_DEF);
1051 	/* "Manually" create the initial zone */
1052 	args.name = "UMA Zones";
1053 	args.size = sizeof(struct uma_zone) +
1054 	    (sizeof(struct uma_cache) * (maxcpu - 1));
1055 	args.ctor = zone_ctor;
1056 	args.dtor = NULL;
1057 	args.uminit = zero_init;
1058 	args.fini = NULL;
1059 	args.align = 32 - 1;
1060 	args.flags = UMA_ZONE_INTERNAL;
1061 	/* The initial zone has no per cpu queues so it's smaller */
1062 	zone_ctor(zones, sizeof(struct uma_zone), &args);
1063 
1064 #ifdef UMA_DEBUG
1065 	printf("Filling boot free list.\n");
1066 #endif
1067 	for (i = 0; i < UMA_BOOT_PAGES; i++) {
1068 		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
1069 		slab->us_data = (u_int8_t *)slab;
1070 		slab->us_flags = UMA_SLAB_BOOT;
1071 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1072 		uma_boot_free++;
1073 	}
1074 
1075 #ifdef UMA_DEBUG
1076 	printf("Creating slab zone.\n");
1077 #endif
1078 
1079 	/*
1080 	 * This is the max number of free list items we'll have with
1081 	 * offpage slabs.
1082 	 */
1083 
1084 	slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab);
1085 	slabsize /= UMA_MAX_WASTE;
1086 	slabsize++;			/* In case the division truncated */
1087 	slabsize += sizeof(struct uma_slab);
1088 
1089 	/* Now make a zone for slab headers */
1090 	slabzone = uma_zcreate("UMA Slabs",
1091 				slabsize,
1092 				NULL, NULL, NULL, NULL,
1093 				UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1094 
1095 	hashzone = uma_zcreate("UMA Hash",
1096 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1097 	    NULL, NULL, NULL, NULL,
1098 	    UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1099 
1100 	bucketzone = uma_zcreate("UMA Buckets", sizeof(struct uma_bucket),
1101 	    NULL, NULL, NULL, NULL,
1102 	    UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);
1103 
1104 
1105 #ifdef UMA_DEBUG
1106 	printf("UMA startup complete.\n");
1107 #endif
1108 }
1109 
1110 /* see uma.h */
1111 void
1112 uma_startup2(void *hashmem, u_long elems)
1113 {
1114 	bzero(hashmem, elems * sizeof(void *));
1115 	mallochash->uh_slab_hash = hashmem;
1116 	mallochash->uh_hashsize = elems;
1117 	mallochash->uh_hashmask = elems - 1;
1118 	booted = 1;
1119 #ifdef UMA_DEBUG
1120 	printf("UMA startup2 complete.\n");
1121 #endif
1122 }
1123 
1124 /*
1125  * Initialize our callout handle
1126  *
1127  */
1128 
1129 static void
1130 uma_startup3(void)
1131 {
1132 #ifdef UMA_DEBUG
1133 	printf("Starting callout.\n");
1134 #endif
1135 	/* We'll be mpsafe once the vm is locked. */
1136 	callout_init(&uma_callout, 0);
1137 	callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
1138 #ifdef UMA_DEBUG
1139 	printf("UMA startup3 complete.\n");
1140 #endif
1141 }
1142 
1143 /* See uma.h */
1144 uma_zone_t
1145 uma_zcreate(char *name, int size, uma_ctor ctor, uma_dtor dtor, uma_init uminit,
1146 		     uma_fini fini, int align, u_int16_t flags)
1147 
1148 {
1149 	struct uma_zctor_args args;
1150 
1151 	/* This stuff is essential for the zone ctor */
1152 	args.name = name;
1153 	args.size = size;
1154 	args.ctor = ctor;
1155 	args.dtor = dtor;
1156 	args.uminit = uminit;
1157 	args.fini = fini;
1158 	args.align = align;
1159 	args.flags = flags;
1160 
1161 	return (uma_zalloc_internal(zones, &args, M_WAITOK, NULL, -1));
1162 }
1163 
1164 /* See uma.h */
1165 void *
1166 uma_zalloc_arg(uma_zone_t zone, void *udata, int wait)
1167 {
1168 	void *item;
1169 	uma_cache_t cache;
1170 	uma_bucket_t bucket;
1171 	int isitem;
1172 	int cpu;
1173 
1174 	/* This is the fast path allocation */
1175 #ifdef UMA_DEBUG_ALLOC_1
1176 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
1177 #endif
1178 	cpu = PCPU_GET(cpuid);
1179 	CPU_LOCK(zone, cpu);
1180 	cache = &zone->uz_cpu[cpu];
1181 	cache->uc_allocs++;
1182 
1183 zalloc_start:
1184 	bucket = cache->uc_allocbucket;
1185 
1186 	if (bucket) {
1187 		if (bucket->ub_ptr > -1) {
1188 			item = bucket->ub_bucket[bucket->ub_ptr];
1189 #ifdef INVARIANTS
1190 			bucket->ub_bucket[bucket->ub_ptr] = NULL;
1191 #endif
1192 			bucket->ub_ptr--;
1193 			KASSERT(item != NULL,
1194 			    ("uma_zalloc: Bucket pointer mangled."));
1195 			cache->uc_allocs++;
1196 			CPU_UNLOCK(zone, cpu);
1197 			if (zone->uz_ctor)
1198 				zone->uz_ctor(item, zone->uz_size, udata);
1199 			return (item);
1200 		} else if (cache->uc_freebucket) {
1201 			/*
1202 			 * We have run out of items in our allocbucket.
1203 			 * See if we can switch with our free bucket.
1204 			 */
1205 			if (cache->uc_freebucket->ub_ptr > -1) {
1206 				uma_bucket_t swap;
1207 
1208 #ifdef UMA_DEBUG_ALLOC
1209 				printf("uma_zalloc: Swapping empty with alloc.\n");
1210 #endif
1211 				swap = cache->uc_freebucket;
1212 				cache->uc_freebucket = cache->uc_allocbucket;
1213 				cache->uc_allocbucket = swap;
1214 
1215 				goto zalloc_start;
1216 			}
1217 		}
1218 	}
1219 	/*
1220 	 * We can get here for three reasons:
1221 	 *
1222 	 * 1) The buckets are NULL
1223 	 * 2) The zone is INTERNAL, and so it has no buckets.
1224 	 * 3) The alloc and free buckets are both empty.
1225 	 *
1226 	 * Just hand off to uma_zalloc_internal to do the hard stuff
1227 	 *
1228 	 */
1229 #ifdef UMA_DEBUG_ALLOC
1230 	printf("uma_zalloc: Falling back to zalloc_internal.\n");
1231 #endif
1232 
1233 	item = uma_zalloc_internal(zone, udata, wait, &isitem, cpu);
1234 
1235 #ifdef UMA_DEBUG
1236 	printf("uma_zalloc: zalloc_internal completed.\n");
1237 #endif
1238 
1239 	if (item && isitem == 0)
1240 		goto zalloc_start;
1241 
1242 	/*
1243 	 * If isitem is set then we should just return it. The cpu lock
1244 	 * was unlocked when we couldn't get a bucket.
1245 	 */
1246 	return item;
1247 }
1248 
1249 /*
1250  * Allocates an item for an internal zone OR fills a bucket
1251  *
1252  * Arguments
1253  *	zone   The zone to alloc for.
1254  *	udata  The data to be passed to the constructor.
1255  *	wait   M_WAITOK or M_NOWAIT.
1256  *	isitem The returned value is an item if this is true.
1257  *	cpu    The cpu # of the cache that we should use, or -1.
1258  *
1259  * Returns
1260  *	NULL if there is no memory and M_NOWAIT is set
1261  *	An item if called on an internal zone
1262  *	Non NULL if called to fill a bucket and it was successful.
1263  *
1264  * Discussion:
1265  *	This was much cleaner before it had to do per cpu caches.  It is
1266  *	complicated now because it has to handle the simple internal case, and
1267  *	the more involved bucket filling and allocation.  The isitem is there
1268  *	to remove a failure case.  You shouldn't fail on allocating from a zone
1269  *	because there were no buckets.  This allows the exported zalloc to just
1270  *	return the item.
1271  *
1272  */
1273 
1274 static void *
1275 uma_zalloc_internal(uma_zone_t zone, void *udata, int wait, int *isitem, int cpu)
1276 {
1277 	uma_bucket_t bucket;
1278 	uma_cache_t cache;
1279 	uma_slab_t slab;
1280 	u_int8_t freei;
1281 	void *item;
1282 
1283 	bucket = NULL;
1284 	cache = NULL;
1285 	item = NULL;
1286 
1287 	/*
1288 	 * This is to stop us from allocating per cpu buckets while we're running
1289 	 * out of UMA_BOOT_PAGES.  Otherwise, we would exhaust the boot pages.
1290 	 */
1291 
1292 	if (!booted && zone == bucketzone)
1293 		return (NULL);
1294 
1295 #ifdef UMA_DEBUG_ALLOC
1296 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
1297 #endif
1298 	if (isitem != NULL)
1299 		*isitem = 0;
1300 
1301 	ZONE_LOCK(zone);
1302 
1303         /* We got here because we need to fill some buckets */
1304 	if (cpu != -1) {
1305 		cache = &zone->uz_cpu[cpu];
1306 
1307 		zone->uz_allocs += cache->uc_allocs;
1308 		/* Check the free list */
1309                 bucket = LIST_FIRST(&zone->uz_full_bucket);
1310 		if (bucket) {
1311 			LIST_REMOVE(bucket, ub_link);
1312 			/* Our old one is now a free bucket */
1313 			if (cache->uc_allocbucket) {
1314 				KASSERT(cache->uc_allocbucket->ub_ptr == -1,
1315 				    ("uma_zalloc_internal: Freeing a non free bucket."));
1316 				LIST_INSERT_HEAD(&zone->uz_free_bucket,
1317 				    cache->uc_allocbucket, ub_link);
1318 			}
1319 			KASSERT(bucket->ub_ptr != -1,
1320 			    ("uma_zalloc_internal: Returning an empty bucket."));
1321 			/*zone->uz_free -= bucket->ub_ptr + 1;*/
1322 			cache->uc_allocbucket = bucket;
1323 			ZONE_UNLOCK(zone);
1324 			return (bucket);
1325 		}
1326 		/* Bump up our uc_count so we get here less */
1327 		if (cache->uc_count < UMA_BUCKET_SIZE - 1)
1328 			cache->uc_count++;
1329 		/* Nothing on the free list, try to re-use the old one */
1330 		bucket = cache->uc_allocbucket;
1331 		if (bucket == NULL) {
1332 			/* Nope, we need a new one */
1333 			CPU_UNLOCK(zone, cpu);
1334 			ZONE_UNLOCK(zone);
1335 			bucket = uma_zalloc_internal(bucketzone,
1336 			    NULL, wait, NULL, -1);
1337 			CPU_LOCK(zone, cpu);
1338 			ZONE_LOCK(zone);
1339 			/* Did we lose the race? */
1340 			if (cache->uc_allocbucket) {
1341 #ifdef UMA_DEBUG
1342 				printf("uma_zalloc_internal: Lost race with another CPU.\n");
1343 #endif
1344 				if (bucket)
1345 					uma_zfree_internal(bucketzone,
1346 					    bucket, NULL, 0);
1347 				ZONE_UNLOCK(zone);
1348 				return (cache->uc_allocbucket);
1349 			}
1350 			cache->uc_allocbucket = bucket;
1351 
1352 			if (bucket) {
1353 #ifdef INVARIANTS
1354 				bzero(bucket, bucketzone->uz_size);
1355 #endif
1356 				bucket->ub_ptr = -1;
1357 			} else {
1358 				/*
1359 				 * We may not get a bucket if we recurse, so
1360 				 * return an actual item. The rest of this code
1361 				 * does the right thing if the cache is NULL.
1362 				 */
1363 #ifdef UMA_DEBUG
1364 				printf("uma_zalloc_internal: Bucketzone returned NULL\n");
1365 #endif
1366 				CPU_UNLOCK(zone, cpu);
1367 				cache = NULL;
1368 				cpu = -1;
1369 			}
1370 		}
1371 	}
1372 
1373 new_slab:
1374 
1375 	/* Find a slab with some space */
1376 	if (zone->uz_free) {
1377 		if (!LIST_EMPTY(&zone->uz_part_slab)) {
1378 			slab = LIST_FIRST(&zone->uz_part_slab);
1379 		} else {
1380 			slab = LIST_FIRST(&zone->uz_free_slab);
1381 			LIST_REMOVE(slab, us_link);
1382 			LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1383 		}
1384 	} else {
1385 		/*
1386 		 * This is to prevent us from recursively trying to allocate
1387 		 * buckets.  The problem is that if an allocation forces us to
1388 		 * grab a new bucket we will call page_alloc, which will go off
1389 		 * and cause the vm to allocate vm_map_entries.  If we need new
1390 		 * buckets there too we will recurse in kmem_alloc and bad
1391 		 * things happen.  So instead we return a NULL bucket, and make
1392 		 * the code that allocates buckets smart enough to deal with it.  */
1393 		if (zone == bucketzone && zone->uz_recurse != 0) {
1394 			ZONE_UNLOCK(zone);
1395 			return (NULL);
1396 		}
1397 		zone->uz_recurse++;
1398 		slab = slab_zalloc(zone, wait);
1399 		zone->uz_recurse--;
1400 		if (slab)  {
1401 			LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1402 		/*
1403 		 * We might not have been able to get a page, but another cpu
1404 		 * could have while we were unlocked.
1405 		 */
1406 		} else if (zone->uz_free == 0) {
1407 			ZONE_UNLOCK(zone);
1408 			/* If we're filling a bucket return what we have */
1409 			if (bucket != NULL && bucket->ub_ptr != -1) {
1410 				return (bucket);
1411 			} else
1412 				return (NULL);
1413 		} else {
1414 			/* Another cpu must have succeeded */
1415 			if ((slab = LIST_FIRST(&zone->uz_part_slab)) == NULL) {
1416 				slab = LIST_FIRST(&zone->uz_free_slab);
1417 				LIST_REMOVE(slab, us_link);
1418 				LIST_INSERT_HEAD(&zone->uz_part_slab,
1419 				    slab, us_link);
1420 			}
1421 		}
1422 	}
1423 
1424 	while (slab->us_freecount) {
1425 		freei = slab->us_firstfree;
1426 		slab->us_firstfree = slab->us_freelist[freei];
1427 #ifdef INVARIANTS
1428 		slab->us_freelist[freei] = 255;
1429 #endif
1430 		slab->us_freecount--;
1431 		zone->uz_free--;
1432 		item = slab->us_data + (zone->uz_rsize * freei);
1433 
1434 		if (cache == NULL) {
1435 			zone->uz_allocs++;
1436 			break;
1437 		}
1438 
1439 		bucket->ub_bucket[++bucket->ub_ptr] = item;
1440 
1441 		/* Don't overfill the bucket! */
1442 		if (bucket->ub_ptr == cache->uc_count)
1443 			break;
1444 	}
1445 
1446 	/* Move this slab to the full list */
1447 	if (slab->us_freecount == 0) {
1448 		LIST_REMOVE(slab, us_link);
1449 		LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
1450 	}
1451 
1452 	if (cache != NULL) {
1453 		/* Try to keep the buckets totally full, but don't block */
1454 		if (bucket->ub_ptr < cache->uc_count) {
1455 			wait = M_NOWAIT;
1456 			goto new_slab;
1457 		}
1458 	}
1459 
1460 	ZONE_UNLOCK(zone);
1461 
1462 	/* Only construct at this time if we're not filling a bucket */
1463 	if (cache == NULL)  {
1464 		if (zone->uz_ctor)
1465 			zone->uz_ctor(item, zone->uz_size, udata);
1466 
1467 		if (isitem != NULL)
1468 			*isitem = 1;
1469 	}
1470 
1471 	return (item);
1472 }
1473 
1474 /* See uma.h */
1475 void
1476 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
1477 {
1478 	uma_cache_t cache;
1479 	uma_bucket_t bucket;
1480 	int cpu;
1481 
1482 	/* This is the fast path free */
1483 #ifdef UMA_DEBUG_ALLOC_1
1484 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
1485 #endif
1486 	cpu = PCPU_GET(cpuid);
1487 	CPU_LOCK(zone, cpu);
1488 	cache = &zone->uz_cpu[cpu];
1489 
1490 zfree_start:
1491 	bucket = cache->uc_freebucket;
1492 
1493 	if (bucket) {
1494 		/* Do we have room in our bucket? */
1495 		if (bucket->ub_ptr < cache->uc_count) {
1496 			bucket->ub_ptr++;
1497 			KASSERT(bucket->ub_bucket[bucket->ub_ptr] == NULL,
1498 			    ("uma_zfree: Freeing to non free bucket index."));
1499 			bucket->ub_bucket[bucket->ub_ptr] = item;
1500 			CPU_UNLOCK(zone, cpu);
1501 			if (zone->uz_dtor)
1502 				zone->uz_dtor(item, zone->uz_size, udata);
1503 			return;
1504 		} else if (cache->uc_allocbucket) {
1505 #ifdef UMA_DEBUG_ALLOC
1506 			printf("uma_zfree: Swapping buckets.\n");
1507 #endif
1508 			/*
1509 			 * We have run out of space in our freebucket.
1510 			 * See if we can switch with our alloc bucket.
1511 			 */
1512 			if (cache->uc_allocbucket->ub_ptr <
1513 			    cache->uc_freebucket->ub_ptr) {
1514 				uma_bucket_t swap;
1515 
1516 				swap = cache->uc_freebucket;
1517 				cache->uc_freebucket = cache->uc_allocbucket;
1518 				cache->uc_allocbucket = swap;
1519 
1520 				goto zfree_start;
1521 			}
1522 		}
1523 	}
1524 
1525 	/*
1526 	 * We can get here for three reasons:
1527 	 *
1528 	 * 1) The buckets are NULL
1529 	 * 2) The zone is INTERNAL, and so it has no buckets.
1530 	 * 3) The alloc and free buckets are both somewhat full.
1531 	 *
1532 	 */
1533 
1534 	ZONE_LOCK(zone);
1535 
1536 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
1537 		bucket = cache->uc_freebucket;
1538 		cache->uc_freebucket = NULL;
1539 
1540 		/* Can we throw this on the zone full list? */
1541 		if (bucket != NULL) {
1542 #ifdef UMA_DEBUG_ALLOC
1543 			printf("uma_zfree: Putting old bucket on the free list.\n");
1544 #endif
1545 			/* ub_ptr is pointing to the last free item */
1546 			KASSERT(bucket->ub_ptr != -1,
1547 			    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
1548 			/*zone->uz_free += bucket->ub_ptr + 1;*/
1549 			LIST_INSERT_HEAD(&zone->uz_full_bucket,
1550 			    bucket, ub_link);
1551 			bucket = LIST_FIRST(&zone->uz_free_bucket);
1552 			if (bucket)
1553 				LIST_REMOVE(bucket, ub_link);
1554 		}
1555 		/*
1556 		 * Do we need to alloc one? Either the freebucket was NULL
1557 		 * or the free_bucket list was empty.
1558 		 */
1559 		if (bucket == NULL) {
1560 #ifdef UMA_DEBUG_ALLOC
1561 			printf("uma_zfree: Allocating new free bucket.\n");
1562 #endif
1563 			/* This has to be done so we don't recurse on a lock */
1564 			ZONE_UNLOCK(zone);
1565 			CPU_UNLOCK(zone, cpu);
1566 			bucket = uma_zalloc_internal(bucketzone,
1567 			    NULL, M_NOWAIT, NULL, -1);
1568 			CPU_LOCK(zone, cpu);
1569 			ZONE_LOCK(zone);
1570 			if (bucket) {
1571 #ifdef INVARIANTS
1572 				bzero(bucket, bucketzone->uz_size);
1573 #endif
1574 				bucket->ub_ptr = -1;
1575 			}
1576 			/* Did we lose the race? */
1577 			if (cache->uc_freebucket != NULL) {
1578 				if (bucket)
1579 					uma_zfree_internal(bucketzone,
1580 					    bucket, NULL, 0);
1581 				ZONE_UNLOCK(zone);
1582 				goto zfree_start;
1583 			}
1584 			/* If we couldn't get one just free directly */
1585 			if (bucket == NULL)
1586 				goto zfree_internal;
1587 		}
1588 		cache->uc_freebucket = bucket;
1589 		ZONE_UNLOCK(zone);
1590 		goto zfree_start;
1591 	}
1592 
1593 zfree_internal:
1594 
1595 	CPU_UNLOCK(zone, cpu);
1596 	ZONE_UNLOCK(zone);
1597 	uma_zfree_internal(zone, item, udata, 0);
1598 
1599 	return;
1600 
1601 }
1602 
1603 /*
1604  * Frees an item to an INTERNAL zone or allocates a free bucket
1605  *
1606  * Arguments:
1607  *	zone   The zone to free to
1608  *	item   The item we're freeing
1609  *	udata  User supplied data for the dtor
1610  *	skip   Skip the dtor, it was done in uma_zfree_arg
1611  */
1612 
1613 static void
1614 uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
1615 {
1616 	uma_slab_t slab;
1617 	u_int8_t *mem;
1618 	u_int8_t freei;
1619 
1620 	ZONE_LOCK(zone);
1621 
1622 	if (!(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
1623 		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
1624 		if (zone->uz_flags & UMA_ZFLAG_OFFPAGE)
1625 			slab = hash_sfind(&zone->uz_hash, mem);
1626 		else {
1627 			mem += zone->uz_pgoff;
1628 			slab = (uma_slab_t)mem;
1629 		}
1630 	} else {
1631 		slab = (uma_slab_t)udata;
1632 	}
1633 
1634 	/* Do we need to remove from any lists? */
1635 	if (slab->us_freecount+1 == zone->uz_ipers) {
1636 		LIST_REMOVE(slab, us_link);
1637 		LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
1638 	} else if (slab->us_freecount == 0) {
1639 		LIST_REMOVE(slab, us_link);
1640 		LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
1641 	}
1642 
1643 	/* Slab management stuff */
1644 	freei = ((unsigned long)item - (unsigned long)slab->us_data)
1645 		/ zone->uz_rsize;
1646 #ifdef INVARIANTS
1647 	if (((freei * zone->uz_rsize) + slab->us_data) != item)
1648 		panic("zone: %s(%p) slab %p freed address %p unaligned.\n",
1649 		    zone->uz_name, zone, slab, item);
1650 	if (freei >= zone->uz_ipers)
1651 		panic("zone: %s(%p) slab %p freelist %i out of range 0-%d\n",
1652 		    zone->uz_name, zone, slab, freei, zone->uz_ipers-1);
1653 
1654 	if (slab->us_freelist[freei] != 255) {
1655 		printf("Slab at %p, freei %d = %d.\n",
1656 		    slab, freei, slab->us_freelist[freei]);
1657 		panic("Duplicate free of item %p from zone %p(%s)\n",
1658 		    item, zone, zone->uz_name);
1659 	}
1660 #endif
1661 	slab->us_freelist[freei] = slab->us_firstfree;
1662 	slab->us_firstfree = freei;
1663 	slab->us_freecount++;
1664 
1665 	/* Zone statistics */
1666 	zone->uz_free++;
1667 
1668 	ZONE_UNLOCK(zone);
1669 
1670 	if (!skip && zone->uz_dtor)
1671 		zone->uz_dtor(item, zone->uz_size, udata);
1672 }
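
/*
 * Example of the index recovery above (hypothetical numbers):  with
 * uz_rsize == 104 and an item starting 312 bytes past us_data, the item
 * frees as freei == 312 / 104 == 3; the INVARIANTS block checks that the
 * address sat on an item boundary and that index 3 was not already free.
 */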
1673 
1674 /* See uma.h */
1675 void
1676 uma_zone_set_max(uma_zone_t zone, int nitems)
1677 {
1678 	ZONE_LOCK(zone);
1679 	if (zone->uz_ppera > 1)
1680 		zone->uz_maxpages = nitems / zone->uz_ppera;
1681 	else
1682 		zone->uz_maxpages = nitems / zone->uz_ipers;
1683 	ZONE_UNLOCK(zone);
1684 }
1685 
1686 /* See uma.h */
1687 void
1688 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
1689 {
1690 	ZONE_LOCK(zone);
1691 
1692 	zone->uz_freef = freef;
1693 
1694 	ZONE_UNLOCK(zone);
1695 }
1696 
1697 /* See uma.h */
1698 void
1699 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
1700 {
1701 	ZONE_LOCK(zone);
1702 
1703 	zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
1704 	zone->uz_allocf = allocf;
1705 
1706 	ZONE_UNLOCK(zone);
1707 }
1708 
1709 /* See uma.h */
1710 int
1711 uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
1712 {
1713 	int pages;
1714 	vm_offset_t kva;
1715 
1716 	ZONE_LOCK(zone);
1717 	mtx_lock(&Giant);
1718 
1719 	zone->uz_obj = obj;
1720 	pages = count / zone->uz_ipers;
1721 
1722 	if (pages * zone->uz_ipers < count)
1723 		pages++;
1724 	zone->uz_kva = NULL;
1725 	ZONE_UNLOCK(zone);
1726 	kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);
1727 	ZONE_LOCK(zone);
1728 
1729 	zone->uz_kva = kva;
1730 
1731 	if (zone->uz_kva == 0) {
1732 		ZONE_UNLOCK(zone);
1733 		return (0);
1734 	}
1735 
1736 	zone->uz_maxpages = pages;
1737 
1738 	if (zone->uz_obj == NULL)
1739 		zone->uz_obj = vm_object_allocate(OBJT_DEFAULT,
1740 		    zone->uz_maxpages);
1741 	else
1742 		_vm_object_allocate(OBJT_DEFAULT,
1743 		    zone->uz_maxpages, zone->uz_obj);
1744 
1745 	zone->uz_allocf = obj_alloc;
1746 	zone->uz_flags |= UMA_ZFLAG_NOFREE | UMA_ZFLAG_PRIVALLOC;
1747 
1748 	mtx_unlock(&Giant);
1749 	ZONE_UNLOCK(zone);
1750 
1751 	return (1);
1752 }
1753 
1754 /* See uma.h */
1755 void
1756 uma_prealloc(uma_zone_t zone, int items)
1757 {
1758 	int slabs;
1759 	uma_slab_t slab;
1760 
1761 	ZONE_LOCK(zone);
1762 	slabs = items / zone->uz_ipers;
1763 	if (slabs * zone->uz_ipers < items)
1764 		slabs++;
1765 
1766 	while (slabs > 0) {
1767 		slab = slab_zalloc(zone, M_WAITOK);
1768 		LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
1769 		slabs--;
1770 	}
1771 	ZONE_UNLOCK(zone);
1772 }
1773 
1774 /* See uma.h */
1775 void
1776 uma_reclaim(void)
1777 {
1778 	/*
1779 	 * You might think that the delay below would improve performance since
1780 	 * the allocator will give away memory that it may ask for immediately.
1781 	 * Really, it makes things worse, since cpu cycles are so much cheaper
1782 	 * than disk activity.
1783 	 */
1784 #if 0
1785 	static struct timeval tv = {0};
1786 	struct timeval now;
1787 	getmicrouptime(&now);
1788 	if (now.tv_sec > tv.tv_sec + 30)
1789 		tv = now;
1790 	else
1791 		return;
1792 #endif
1793 #ifdef UMA_DEBUG
1794 	printf("UMA: vm asked us to release pages!\n");
1795 #endif
1796 	zone_foreach(zone_drain);
1797 
1798 	/*
1799 	 * The slab and bucket zones are visited early in the pass above, but
1800 	 * draining the other zones frees slab headers and buckets back into
1801 	 * them, so drain them again to release pages that only now became empty.
1802 	 */
1803 	zone_drain(slabzone);
1804 	zone_drain(bucketzone);
1805 }
1806 
1807 void *
1808 uma_large_malloc(int size, int wait)
1809 {
1810 	void *mem;
1811 	uma_slab_t slab;
1812 	u_int8_t flags;
1813 
1814 	slab = uma_zalloc_internal(slabzone, NULL, wait, NULL, -1);
1815 	if (slab == NULL)
1816 		return (NULL);
1817 
1818 	mem = page_alloc(NULL, size, &flags, wait);
1819 	if (mem) {
1820 		slab->us_data = mem;
1821 		slab->us_flags = flags | UMA_SLAB_MALLOC;
1822 		slab->us_size = size;
1823 		UMA_HASH_INSERT(mallochash, slab, mem);
1824 	} else {
1825 		uma_zfree_internal(slabzone, slab, NULL, 0);
1826 	}
1827 
1828 
1829 	return (mem);
1830 }
1831 
1832 void
1833 uma_large_free(uma_slab_t slab)
1834 {
1835 	UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
1836 	page_free(slab->us_data, slab->us_size, slab->us_flags);
1837 	uma_zfree_internal(slabzone, slab, NULL, 0);
1838 }
1839 
1840 void
1841 uma_print_stats(void)
1842 {
1843 	zone_foreach(uma_print_zone);
1844 }
1845 
1846 void
1847 uma_print_zone(uma_zone_t zone)
1848 {
1849 	printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
1850 	    zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
1851 	    zone->uz_ipers, zone->uz_ppera,
1852 	    (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
1853 }
1854 
1855 /*
1856  * Sysctl handler for vm.zone
1857  *
1858  * stolen from vm_zone.c
1859  */
1860 static int
1861 sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
1862 {
1863 	int error, len, cnt;
1864 	const int linesize = 128;	/* conservative */
1865 	int totalfree;
1866 	char *tmpbuf, *offset;
1867 	uma_zone_t z;
1868 	char *p;
1869 
1870 	cnt = 0;
1871 	LIST_FOREACH(z, &uma_zones, uz_link)
1872 		cnt++;
1873 	MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
1874 			M_TEMP, M_WAITOK);
1875 	len = snprintf(tmpbuf, linesize,
1876 	    "\nITEM            SIZE     LIMIT     USED    FREE  REQUESTS\n\n");
1877 	if (cnt == 0)
1878 		tmpbuf[len - 1] = '\0';
1879 	error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
1880 	if (error || cnt == 0)
1881 		goto out;
1882 	offset = tmpbuf;
1883 	mtx_lock(&uma_mtx);
1884 	LIST_FOREACH(z, &uma_zones, uz_link) {
1885 		if (cnt == 0)	/* list may have changed size */
1886 			break;
1887 		ZONE_LOCK(z);
1888 		totalfree = z->uz_free + z->uz_cachefree;
1889 		len = snprintf(offset, linesize,
1890 		    "%-12.12s  %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
1891 		    z->uz_name, z->uz_size,
1892 		    z->uz_maxpages * z->uz_ipers,
1893 		    (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
1894 		    totalfree,
1895 		    (unsigned long long)z->uz_allocs);
1896 		ZONE_UNLOCK(z);
1897 		for (p = offset + 12; p > offset && *p == ' '; --p)
1898 			/* nothing */ ;
1899 		p[1] = ':';
1900 		cnt--;
1901 		offset += len;
1902 	}
1903 	mtx_unlock(&uma_mtx);
1904 	*offset++ = '\0';
1905 	error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
1906 out:
1907 	FREE(tmpbuf, M_TEMP);
1908 	return (error);
1909 }
1910