/*
 * Copyright (c) 2002, Jeffrey Roberson <jroberson@chesapeake.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */

/*
 * uma_core.c  Implementation of the Universal Memory allocator
 *
 * This allocator is intended to replace the multitude of similar object caches
 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
 * efficient.  A primary design goal is to return unused memory to the rest of
 * the system.  This will make the system as a whole more flexible due to the
 * ability to move memory to subsystems which most need it instead of leaving
 * pools of reserved memory unused.
 *
 * The basic ideas stem from similar slab/zone based allocators whose algorithms
 * are well known.
 *
 */

/*
 * TODO:
 *	- Improve memory usage for large allocations
 *	- Investigate cache size adjustments
 */

/* I should really use ktr.. */
/*
#define UMA_DEBUG 1
#define UMA_DEBUG_ALLOC 1
#define UMA_DEBUG_ALLOC_1 1
*/


#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>

#include <machine/types.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>

/*
 * This is the zone from which all zones are spawned.  The idea is that even
 * the zone heads are allocated from the allocator, so we use the bss section
 * to bootstrap us.
 */
static struct uma_zone masterzone;
static uma_zone_t zones = &masterzone;

/* This is the zone from which all of uma_slab_t's are allocated. */
static uma_zone_t slabzone;

/*
 * The initial hash tables come out of this zone so they can be allocated
 * prior to malloc coming up.
 */
static uma_zone_t hashzone;

/*
 * Zone that buckets come from.
 */
static uma_zone_t bucketzone;

/*
 * Are we allowed to allocate buckets?
 */
static int bucketdisable = 1;

/* Linked list of all zones in the system */
static LIST_HEAD(,uma_zone) uma_zones = LIST_HEAD_INITIALIZER(&uma_zones);

/* This mutex protects the zone list */
static struct mtx uma_mtx;

/* Linked list of boot time pages */
static LIST_HEAD(,uma_slab) uma_boot_pages =
    LIST_HEAD_INITIALIZER(&uma_boot_pages);

/* Count of free boot time pages */
static int uma_boot_free = 0;

/* Is the VM done starting up? */
static int booted = 0;

/* This is the handle used to schedule our working set calculator */
static struct callout uma_callout;

/* This is mp_maxid + 1, for use while looping over each cpu */
static int maxcpu;

/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 */
struct uma_zctor_args {
	char *name;
	size_t size;
	uma_ctor ctor;
	uma_dtor dtor;
	uma_init uminit;
	uma_fini fini;
	int align;
	u_int16_t flags;
};

/*
 * This is the malloc hash table which is used to find the zone that a
 * malloc allocation came from.  It is not currently resizeable.  The
 * memory for the actual hash bucket is allocated in kmeminit.
 */
struct uma_hash mhash;
struct uma_hash *mallochash = &mhash;

/* Prototypes.. */

static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
static void page_free(void *, int, u_int8_t);
static uma_slab_t slab_zalloc(uma_zone_t, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void zone_drain(uma_zone_t);
static void zone_ctor(void *, int, void *);
static void zone_dtor(void *, int, void *);
static void zero_init(void *, int);
static void zone_small_init(uma_zone_t zone);
static void zone_large_init(uma_zone_t zone);
static void zone_foreach(void (*zfunc)(uma_zone_t));
static void zone_timeout(uma_zone_t zone);
static int hash_alloc(struct uma_hash *);
static int hash_expand(struct uma_hash *, struct uma_hash *);
static void hash_free(struct uma_hash *hash);
static void uma_timeout(void *);
static void uma_startup3(void);
static void *uma_zalloc_internal(uma_zone_t, void *, int, uma_bucket_t);
static void uma_zfree_internal(uma_zone_t, void *, void *, int);
static void bucket_enable(void);
void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_vm_zone, "A", "Zone Info");
SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

/*
 * This routine checks to see whether or not it's safe to enable buckets.
 */

static void
bucket_enable(void)
{
	if (cnt.v_free_count < cnt.v_free_min)
		bucketdisable = 1;
	else
		bucketdisable = 0;
}


/*
 * Routine called by timeout which is used to fire off some time interval
 * based calculations. (working set, stats, etc.)
 *
 * Arguments:
 *	arg   Unused
 *
 * Returns:
 *	Nothing
 */
static void
uma_timeout(void *unused)
{
	bucket_enable();
	zone_foreach(zone_timeout);

	/* Reschedule this event */
	callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
}

/*
 * Routine to perform timeout driven calculations.  This does the working set
 * as well as hash expanding, and per cpu statistics aggregation.
 *
 * Arguments:
 *	zone  The zone to operate on
 *
 * Returns:
 *	Nothing
 */
static void
zone_timeout(uma_zone_t zone)
{
	uma_cache_t cache;
	u_int64_t alloc;
	int free;
	int cpu;

	alloc = 0;
	free = 0;

	/*
	 * Aggregate per cpu cache statistics back to the zone.
	 *
	 * I may rewrite this to set a flag in the per cpu cache instead of
	 * locking.  If the flag is not cleared on the next round I will have
	 * to lock and do it here instead so that the statistics don't get too
	 * far out of sync.
	 */
	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) {
		for (cpu = 0; cpu < maxcpu; cpu++) {
			if (CPU_ABSENT(cpu))
				continue;
			CPU_LOCK(zone, cpu);
			cache = &zone->uz_cpu[cpu];
			/* Add them up, and reset */
			alloc += cache->uc_allocs;
			cache->uc_allocs = 0;
			if (cache->uc_allocbucket)
				free += cache->uc_allocbucket->ub_ptr + 1;
			if (cache->uc_freebucket)
				free += cache->uc_freebucket->ub_ptr + 1;
			CPU_UNLOCK(zone, cpu);
		}
	}

	/* Now push these stats back into the zone.. */
	ZONE_LOCK(zone);
	zone->uz_allocs += alloc;

	/*
	 * cachefree is an instantaneous snapshot of what is in the per cpu
	 * caches, not an accurate counter
	 */
	zone->uz_cachefree = free;

	/*
	 * Expand the zone hash table.
	 *
	 * This is done if the number of slabs is larger than the hash size.
	 * What I'm trying to do here is eliminate collisions entirely.  This
	 * may be a little aggressive.  Should I allow for two collisions max?
	 */

	if ((zone->uz_flags & UMA_ZFLAG_OFFPAGE) &&
	    !(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
		if (zone->uz_pages / zone->uz_ppera
		    >= zone->uz_hash.uh_hashsize) {
			struct uma_hash newhash;
			struct uma_hash oldhash;
			int ret;

			/*
			 * This is so involved because allocating and freeing
			 * while the zone lock is held will lead to deadlock.
			 * I have to do everything in stages and check for
			 * races.
			 */
			newhash = zone->uz_hash;
			ZONE_UNLOCK(zone);
			ret = hash_alloc(&newhash);
			ZONE_LOCK(zone);
			if (ret) {
				if (hash_expand(&zone->uz_hash, &newhash)) {
					oldhash = zone->uz_hash;
					zone->uz_hash = newhash;
				} else
					oldhash = newhash;

				ZONE_UNLOCK(zone);
				hash_free(&oldhash);
				ZONE_LOCK(zone);
			}
		}
	}

	/*
	 * Here we compute the working set size as the total number of items
	 * left outstanding since the last time interval.  This is slightly
	 * suboptimal.  What we really want is the highest number of outstanding
	 * items during the last time quantum.  This should be close enough.
	 *
	 * The working set size is used to throttle the zone_drain function.
	 * We don't want to return memory that we may need again immediately.
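	 *
	 * As a rough illustration (numbers invented for the example): if
	 * uz_allocs has grown from 5000 to 5300 since the last tick, the
	 * code below records a working set of 300 items, and zone_drain
	 * will then decline to free slabs while fewer than 300 items sit
	 * on this zone's free lists.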
	 */
	alloc = zone->uz_allocs - zone->uz_oallocs;
	zone->uz_oallocs = zone->uz_allocs;
	zone->uz_wssize = alloc;

	ZONE_UNLOCK(zone);
}

/*
 * Allocate and zero fill the next sized hash table from the appropriate
 * backing store.
 *
 * Arguments:
 *	hash  A new hash structure with the old hash size in uh_hashsize
 *
 * Returns:
 *	1 on success and 0 on failure.
 */
int
hash_alloc(struct uma_hash *hash)
{
	int oldsize;
	int alloc;

	oldsize = hash->uh_hashsize;

	/* We're just going to go to a power of two greater */
	if (oldsize) {
		hash->uh_hashsize = oldsize * 2;
		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
		/* XXX Shouldn't be abusing DEVBUF here */
		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
		    M_DEVBUF, M_NOWAIT);
	} else {
		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
		hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL,
		    M_WAITOK, NULL);
		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
	}
	if (hash->uh_slab_hash) {
		bzero(hash->uh_slab_hash, alloc);
		hash->uh_hashmask = hash->uh_hashsize - 1;
		return (1);
	}

	return (0);
}

/*
 * Expands the hash table for OFFPAGE zones.  This is done from zone_timeout
 * to reduce collisions.  This must not be done in the regular allocation path,
 * otherwise, we can recurse on the vm while allocating pages.
 *
 * Arguments:
 *	oldhash  The hash you want to expand
 *	newhash  The hash structure for the new table
 *
 * Returns:
 *	1 if the items were moved to the new table, 0 if it was not used.
 *
 * Discussion:
 */
static int
hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
{
	uma_slab_t slab;
	int hval;
	int i;

	if (!newhash->uh_slab_hash)
		return (0);

	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
		return (0);

	/*
	 * I need to investigate hash algorithms for resizing without a
	 * full rehash.
	 */

	for (i = 0; i < oldhash->uh_hashsize; i++)
		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
			hval = UMA_HASH(newhash, slab->us_data);
			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
			    slab, us_hlink);
		}

	return (1);
}

/*
 * Free the hash bucket to the appropriate backing store.
 *
 * Arguments:
 *	hash  The hash table whose bucket array we're freeing
 *
 * Returns:
 *	Nothing
 */
static void
hash_free(struct uma_hash *hash)
{
	if (hash->uh_slab_hash == NULL)
		return;
	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
		uma_zfree_internal(hashzone,
		    hash->uh_slab_hash, NULL, 0);
	else
		free(hash->uh_slab_hash, M_DEVBUF);
}

/*
 * Frees all outstanding items in a bucket
 *
 * Arguments:
 *	zone    The zone to free to, must be unlocked.
 *	bucket  The free/alloc bucket with items, cpu queue must be locked.
 *
 * Returns:
 *	Nothing
 */

static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
	uma_slab_t slab;
	int mzone;
	void *item;

	if (bucket == NULL)
		return;

	slab = NULL;
	mzone = 0;

	/*
	 * We have to look up the slab again for malloc..
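	 * The bucket only stores item pointers, so for a malloc zone we
	 * recover the slab by masking the item address down to its slab
	 * boundary and consulting mallochash; e.g. with 4 KB slabs an item
	 * at 0xc12345a0 would be looked up under the base address
	 * 0xc1234000.  (Addresses here are invented for illustration.)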
	 */
	if (zone->uz_flags & UMA_ZFLAG_MALLOC)
		mzone = 1;

	while (bucket->ub_ptr > -1) {
		item = bucket->ub_bucket[bucket->ub_ptr];
#ifdef INVARIANTS
		bucket->ub_bucket[bucket->ub_ptr] = NULL;
		KASSERT(item != NULL,
		    ("bucket_drain: botched ptr, item is NULL"));
#endif
		bucket->ub_ptr--;
		/*
		 * This is extremely inefficient.  The slab pointer was passed
		 * to uma_zfree_arg, but we lost it because the buckets don't
		 * hold them.  This will go away when free() gets a size passed
		 * to it.
		 */
		if (mzone) {
			mtx_lock(&malloc_mtx);
			slab = hash_sfind(mallochash,
			    (u_int8_t *)((unsigned long)item &
			    (~UMA_SLAB_MASK)));
			mtx_unlock(&malloc_mtx);
		}
		uma_zfree_internal(zone, item, slab, 1);
	}
}

/*
 * Drains the per cpu caches for a zone.
 *
 * Arguments:
 *	zone  The zone to drain, must be unlocked.
 *
 * Returns:
 *	Nothing
 *
 * This function returns with the zone locked so that the per cpu queues can
 * not be filled until zone_drain is finished.
 *
 */
static void
cache_drain(uma_zone_t zone)
{
	uma_bucket_t bucket;
	uma_cache_t cache;
	int cpu;

	/*
	 * Flush out the per cpu queues.
	 *
	 * XXX This causes unnecessary thrashing due to immediately having
	 * empty per cpu queues.  I need to improve this.
	 */

	/*
	 * We have to lock each cpu cache before locking the zone
	 */
	ZONE_UNLOCK(zone);

	for (cpu = 0; cpu < maxcpu; cpu++) {
		if (CPU_ABSENT(cpu))
			continue;
		CPU_LOCK(zone, cpu);
		cache = &zone->uz_cpu[cpu];
		bucket_drain(zone, cache->uc_allocbucket);
		bucket_drain(zone, cache->uc_freebucket);
	}

	/*
	 * Drain the bucket queues and free the buckets, we just keep two per
	 * cpu (alloc/free).
	 */
	ZONE_LOCK(zone);
	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
		LIST_REMOVE(bucket, ub_link);
		ZONE_UNLOCK(zone);
		bucket_drain(zone, bucket);
		uma_zfree_internal(bucketzone, bucket, NULL, 0);
		ZONE_LOCK(zone);
	}

	/* Now we do the free queue.. */
	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
		LIST_REMOVE(bucket, ub_link);
		uma_zfree_internal(bucketzone, bucket, NULL, 0);
	}

	/* We unlock here, but they will all block until the zone is unlocked */
	for (cpu = 0; cpu < maxcpu; cpu++) {
		if (CPU_ABSENT(cpu))
			continue;
		CPU_UNLOCK(zone, cpu);
	}

	zone->uz_cachefree = 0;
}

/*
 * Frees pages from a zone back to the system.  This is done on demand from
 * the pageout daemon.
 *
 * Arguments:
 *	zone  The zone to free pages from
 *
 * Returns:
 *	Nothing.
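 *
 * Rough example (values invented): with 50 items per slab, 500 free
 * items, and a working set of 200, the code below keeps the working
 * set and computes extra = (500 - 200) / 50, so at most six completely
 * free slabs are handed back to the page allocator.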
 */
static void
zone_drain(uma_zone_t zone)
{
	struct slabhead freeslabs = {};
	uma_slab_t slab;
	uma_slab_t n;
	u_int64_t extra;
	u_int8_t flags;
	u_int8_t *mem;
	int i;

	/*
	 * We don't want to take pages from statically allocated zones at this
	 * time
	 */
	if (zone->uz_flags & UMA_ZFLAG_NOFREE || zone->uz_freef == NULL)
		return;

	ZONE_LOCK(zone);

	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
		cache_drain(zone);

	if (zone->uz_free < zone->uz_wssize)
		goto finished;
#ifdef UMA_DEBUG
	printf("%s working set size: %llu free items: %u\n",
	    zone->uz_name, (unsigned long long)zone->uz_wssize, zone->uz_free);
#endif
	extra = zone->uz_free - zone->uz_wssize;
	extra /= zone->uz_ipers;

	/* extra is now the number of extra slabs that we can free */

	if (extra == 0)
		goto finished;

	slab = LIST_FIRST(&zone->uz_free_slab);
	while (slab && extra) {
		n = LIST_NEXT(slab, us_link);

		/* We have nowhere to free these to */
		if (slab->us_flags & UMA_SLAB_BOOT) {
			slab = n;
			continue;
		}

		LIST_REMOVE(slab, us_link);
		zone->uz_pages -= zone->uz_ppera;
		zone->uz_free -= zone->uz_ipers;

		if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
			mtx_lock(&malloc_mtx);
			UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
			mtx_unlock(&malloc_mtx);
		}
		if (zone->uz_flags & UMA_ZFLAG_OFFPAGE &&
		    !(zone->uz_flags & UMA_ZFLAG_MALLOC))
			UMA_HASH_REMOVE(&zone->uz_hash, slab, slab->us_data);

		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);

		slab = n;
		extra--;
	}
finished:
	ZONE_UNLOCK(zone);

	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
		if (zone->uz_fini)
			for (i = 0; i < zone->uz_ipers; i++)
				zone->uz_fini(
				    slab->us_data + (zone->uz_rsize * i),
				    zone->uz_size);
		flags = slab->us_flags;
		mem = slab->us_data;
		if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) {
			uma_zfree_internal(slabzone, slab, NULL, 0);
		}
#ifdef UMA_DEBUG
		printf("%s: Returning %d bytes.\n",
		    zone->uz_name, UMA_SLAB_SIZE * zone->uz_ppera);
#endif
		zone->uz_freef(mem, UMA_SLAB_SIZE * zone->uz_ppera, flags);
	}

}

/*
 * Allocate a new slab for a zone.  This does not insert the slab onto a list.
 *
 * Arguments:
 *	zone  The zone to allocate slabs for
 *	wait  Shall we wait?
 *
 * Returns:
 *	The slab that was allocated or NULL if there is no memory and the
 *	caller specified M_NOWAIT.
 *
 */
static uma_slab_t
slab_zalloc(uma_zone_t zone, int wait)
{
	uma_slab_t slab;	/* Starting slab */
	u_int8_t *mem;
	u_int8_t flags;
	int i;

	slab = NULL;

#ifdef UMA_DEBUG
	printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);
#endif
	ZONE_UNLOCK(zone);

	if (zone->uz_flags & UMA_ZFLAG_OFFPAGE) {
		slab = uma_zalloc_internal(slabzone, NULL, wait, NULL);
		if (slab == NULL) {
			ZONE_LOCK(zone);
			return NULL;
		}
	}

	/*
	 * This reproduces the old vm_zone behavior of zero filling pages the
	 * first time they are added to a zone.
	 *
	 * Malloced items are zeroed in uma_zalloc.
	 */

	if ((zone->uz_flags & UMA_ZFLAG_MALLOC) == 0)
		wait |= M_ZERO;
	else
		wait &= ~M_ZERO;

	if (booted || (zone->uz_flags & UMA_ZFLAG_PRIVALLOC)) {
		mtx_lock(&Giant);
		mem = zone->uz_allocf(zone,
		    zone->uz_ppera * UMA_SLAB_SIZE, &flags, wait);
		mtx_unlock(&Giant);
		if (mem == NULL) {
			ZONE_LOCK(zone);
			return (NULL);
		}
	} else {
		uma_slab_t tmps;

		if (zone->uz_ppera > 1)
			panic("UMA: Attempting to allocate multiple pages before vm has started.\n");
		if (zone->uz_flags & UMA_ZFLAG_MALLOC)
			panic("Mallocing before uma_startup2 has been called.\n");
		if (uma_boot_free == 0)
			panic("UMA: Ran out of pre init pages, increase UMA_BOOT_PAGES\n");
		tmps = LIST_FIRST(&uma_boot_pages);
		LIST_REMOVE(tmps, us_link);
		uma_boot_free--;
		mem = tmps->us_data;
	}

	/* Point the slab into the allocated memory */
	if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
		slab = (uma_slab_t )(mem + zone->uz_pgoff);
	}

	if (zone->uz_flags & UMA_ZFLAG_MALLOC) {
#ifdef UMA_DEBUG
		printf("Inserting %p into malloc hash from slab %p\n",
		    mem, slab);
#endif
		mtx_lock(&malloc_mtx);
		UMA_HASH_INSERT(mallochash, slab, mem);
		mtx_unlock(&malloc_mtx);
	}

	slab->us_zone = zone;
	slab->us_data = mem;

	/*
	 * This is intended to spread data out across cache lines.
	 *
	 * This code doesn't seem to work properly on x86, and on alpha
	 * it makes absolutely no performance difference.  I'm sure it could
	 * use some tuning, but sun makes outrageous claims about its
	 * performance.
	 */
#if 0
	if (zone->uz_cachemax) {
		slab->us_data += zone->uz_cacheoff;
		zone->uz_cacheoff += UMA_CACHE_INC;
		if (zone->uz_cacheoff > zone->uz_cachemax)
			zone->uz_cacheoff = 0;
	}
#endif

	slab->us_freecount = zone->uz_ipers;
	slab->us_firstfree = 0;
	slab->us_flags = flags;
	for (i = 0; i < zone->uz_ipers; i++)
		slab->us_freelist[i] = i+1;

	if (zone->uz_init)
		for (i = 0; i < zone->uz_ipers; i++)
			zone->uz_init(slab->us_data + (zone->uz_rsize * i),
			    zone->uz_size);
	ZONE_LOCK(zone);

	if ((zone->uz_flags & (UMA_ZFLAG_OFFPAGE|UMA_ZFLAG_MALLOC)) ==
	    UMA_ZFLAG_OFFPAGE)
		UMA_HASH_INSERT(&zone->uz_hash, slab, mem);

	zone->uz_pages += zone->uz_ppera;
	zone->uz_free += zone->uz_ipers;


	return (slab);
}

/*
 * Allocates a number of pages from the system
 *
 * Arguments:
 *	zone   Unused
 *	bytes  The number of bytes requested
 *	wait   Shall we wait?
 *
 * Returns:
 *	A pointer to the allocated memory or possibly
 *	NULL if M_NOWAIT is set.
 */
static void *
page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
{
	void *p;	/* Returned page */

	*pflag = UMA_SLAB_KMEM;
	p = (void *) kmem_malloc(kmem_map, bytes, wait);

	return (p);
}

/*
 * Allocates a number of pages from within an object
 *
 * Arguments:
 *	zone   Unused
 *	bytes  The number of bytes requested
 *	wait   Shall we wait?
 *
 * Returns:
 *	A pointer to the allocated memory or possibly
 *	NULL if M_NOWAIT is set.
 *
 * TODO: If we fail during a multi-page allocation release the pages that have
 *	already been allocated.
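 *
 * As a sketch of the loop below (sizes invented): a 16 KB request with
 * 4 KB pages makes four passes, allocating object pages uz_pages,
 * uz_pages + 1, ... and entering each at uz_kva + index * PAGE_SIZE,
 * so the caller gets back the kva of the first page.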
 */
static void *
obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{
	vm_offset_t zkva;
	vm_offset_t retkva;
	vm_page_t p;
	int pages;

	retkva = NULL;
	pages = zone->uz_pages;

	/*
	 * This looks a little weird since we're getting one page at a time
	 */
	while (bytes > 0) {
		p = vm_page_alloc(zone->uz_obj, pages,
		    VM_ALLOC_INTERRUPT);
		if (p == NULL)
			return (NULL);

		zkva = zone->uz_kva + pages * PAGE_SIZE;
		if (retkva == NULL)
			retkva = zkva;
		pmap_qenter(zkva, &p, 1);
		bytes -= PAGE_SIZE;
		pages += 1;
	}

	*flags = UMA_SLAB_PRIV;

	return ((void *)retkva);
}

/*
 * Frees a number of pages to the system
 *
 * Arguments:
 *	mem    A pointer to the memory to be freed
 *	size   The size of the memory being freed
 *	flags  The original p->us_flags field
 *
 * Returns:
 *	Nothing
 *
 */
static void
page_free(void *mem, int size, u_int8_t flags)
{
	vm_map_t map;

	if (flags & UMA_SLAB_KMEM)
		map = kmem_map;
	else
		panic("UMA: page_free used with invalid flags %d\n", flags);

	kmem_free(map, (vm_offset_t)mem, size);
}

/*
 * Zero fill initializer
 *
 * Arguments/Returns follow uma_init specifications
 *
 */
static void
zero_init(void *mem, int size)
{
	bzero(mem, size);
}

/*
 * Finish creating a small uma zone.  This calculates ipers, and the zone size.
 *
 * Arguments
 *	zone  The zone we should initialize
 *
 * Returns
 *	Nothing
 */
static void
zone_small_init(uma_zone_t zone)
{
	int rsize;
	int memused;
	int ipers;

	rsize = zone->uz_size;

	if (rsize < UMA_SMALLEST_UNIT)
		rsize = UMA_SMALLEST_UNIT;

	if (rsize & zone->uz_align)
		rsize = (rsize & ~zone->uz_align) + (zone->uz_align + 1);

	zone->uz_rsize = rsize;

	rsize += 1;	/* Account for the byte of linkage */
	zone->uz_ipers = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / rsize;
	zone->uz_ppera = 1;

	memused = zone->uz_ipers * zone->uz_rsize;

	/* Can we do any better? */
	if ((UMA_SLAB_SIZE - memused) >= UMA_MAX_WASTE) {
		if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
			return;
		ipers = UMA_SLAB_SIZE / zone->uz_rsize;
		if (ipers > zone->uz_ipers) {
			zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
			zone->uz_ipers = ipers;
		}
	}

}

/*
 * Finish creating a large (> UMA_SLAB_SIZE) uma zone.  Just give in and do
 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
 * more complicated.
 *
 * Arguments
 *	zone  The zone we should initialize
 *
 * Returns
 *	Nothing
 */
static void
zone_large_init(uma_zone_t zone)
{
	int pages;

	pages = zone->uz_size / UMA_SLAB_SIZE;

	/* Account for remainder */
	if ((pages * UMA_SLAB_SIZE) < zone->uz_size)
		pages++;

	zone->uz_ppera = pages;
	zone->uz_ipers = 1;

	zone->uz_flags |= UMA_ZFLAG_OFFPAGE;
	zone->uz_rsize = zone->uz_size;
}
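/*
 * A rough worked example of the two sizing paths above (numbers are
 * illustrative; the real constants live in uma_int.h): with 4 KB slabs,
 * a 256 byte item that is already pointer aligned keeps rsize = 256,
 * the byte of free list linkage makes each unit 257 bytes, and
 * zone_small_init fits (4096 - sizeof(struct uma_slab)) / 257 items in
 * a slab.  A 10000 byte item instead goes through zone_large_init and
 * gets a three page, one item, offpage slab.
 */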
/*
 * Zone header ctor.  This initializes all fields, locks, etc.  And inserts
 * the zone onto the global zone list.
 *
 * Arguments/Returns follow uma_ctor specifications
 *	udata  Actually uma_zctor_args
 *
 */

static void
zone_ctor(void *mem, int size, void *udata)
{
	struct uma_zctor_args *arg = udata;
	uma_zone_t zone = mem;
	int privlc;
	int cplen;
	int cpu;

	bzero(zone, size);
	zone->uz_name = arg->name;
	zone->uz_size = arg->size;
	zone->uz_ctor = arg->ctor;
	zone->uz_dtor = arg->dtor;
	zone->uz_init = arg->uminit;
	zone->uz_align = arg->align;
	zone->uz_free = 0;
	zone->uz_pages = 0;
	zone->uz_flags = 0;
	zone->uz_allocf = page_alloc;
	zone->uz_freef = page_free;

	if (arg->flags & UMA_ZONE_ZINIT)
		zone->uz_init = zero_init;

	if (arg->flags & UMA_ZONE_INTERNAL)
		zone->uz_flags |= UMA_ZFLAG_INTERNAL;

	if (arg->flags & UMA_ZONE_MALLOC)
		zone->uz_flags |= UMA_ZFLAG_MALLOC;

	if (arg->flags & UMA_ZONE_NOFREE)
		zone->uz_flags |= UMA_ZFLAG_NOFREE;

	if (arg->flags & UMA_ZONE_VM)
		zone->uz_flags |= UMA_ZFLAG_BUCKETCACHE;

	if (zone->uz_size > UMA_SLAB_SIZE)
		zone_large_init(zone);
	else
		zone_small_init(zone);

	if (arg->flags & UMA_ZONE_MTXCLASS)
		privlc = 1;
	else
		privlc = 0;

	/* We do this so that the per cpu lock name is unique for each zone */
	memcpy(zone->uz_lname, "PCPU ", 5);
	cplen = min(strlen(zone->uz_name) + 1, LOCKNAME_LEN - 6);
	memcpy(zone->uz_lname+5, zone->uz_name, cplen);
	zone->uz_lname[LOCKNAME_LEN - 1] = '\0';

	/*
	 * If we're putting the slab header in the actual page we need to
	 * figure out where in each page it goes.  This calculates a right
	 * justified offset into the memory on an ALIGN_PTR boundary.
	 */
	if (!(zone->uz_flags & UMA_ZFLAG_OFFPAGE)) {
		int totsize;
		int waste;

		/* Size of the slab struct and free list */
		totsize = sizeof(struct uma_slab) + zone->uz_ipers;
		if (totsize & UMA_ALIGN_PTR)
			totsize = (totsize & ~UMA_ALIGN_PTR) +
			    (UMA_ALIGN_PTR + 1);
		zone->uz_pgoff = UMA_SLAB_SIZE - totsize;

		waste = zone->uz_pgoff;
		waste -= (zone->uz_ipers * zone->uz_rsize);

		/*
		 * This calculates how much space we have for cache line size
		 * optimizations.  It works by offsetting each slab slightly.
		 * Currently it breaks on x86, and so it is disabled.
		 */

		if (zone->uz_align < UMA_CACHE_INC && waste > UMA_CACHE_INC) {
			zone->uz_cachemax = waste - UMA_CACHE_INC;
			zone->uz_cacheoff = 0;
		}

		totsize = zone->uz_pgoff + sizeof(struct uma_slab)
		    + zone->uz_ipers;
		/* I don't think it's possible, but I'll make sure anyway */
		if (totsize > UMA_SLAB_SIZE) {
			printf("zone %s ipers %d rsize %d size %d\n",
			    zone->uz_name, zone->uz_ipers, zone->uz_rsize,
			    zone->uz_size);
			panic("UMA slab won't fit.\n");
		}
	} else {
		hash_alloc(&zone->uz_hash);
		zone->uz_pgoff = 0;
	}

#ifdef UMA_DEBUG
	printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n",
	    zone->uz_name, zone,
	    zone->uz_size, zone->uz_ipers,
	    zone->uz_ppera, zone->uz_pgoff);
#endif
	ZONE_LOCK_INIT(zone, privlc);

	mtx_lock(&uma_mtx);
	LIST_INSERT_HEAD(&uma_zones, zone, uz_link);
	mtx_unlock(&uma_mtx);

	/*
	 * Some internal zones don't have room allocated for the per cpu
	 * caches.  If we're internal, bail out here.
	 */

	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
		return;

	if (zone->uz_ipers < UMA_BUCKET_SIZE)
		zone->uz_count = zone->uz_ipers - 1;
	else
		zone->uz_count = UMA_BUCKET_SIZE - 1;

	for (cpu = 0; cpu < maxcpu; cpu++)
		CPU_LOCK_INIT(zone, cpu, privlc);
}

/*
 * Zone header dtor.  This frees all data, destroys locks, frees the hash table
 * and removes the zone from the global list.
 *
 * Arguments/Returns follow uma_dtor specifications
 *	udata  unused
 */

static void
zone_dtor(void *arg, int size, void *udata)
{
	uma_zone_t zone;
	int cpu;

	zone = (uma_zone_t)arg;

	mtx_lock(&uma_mtx);
	LIST_REMOVE(zone, uz_link);
	mtx_unlock(&uma_mtx);

	ZONE_LOCK(zone);
	zone->uz_wssize = 0;
	ZONE_UNLOCK(zone);

	zone_drain(zone);
	ZONE_LOCK(zone);
	if (zone->uz_free != 0)
		printf("Zone %s was not empty.  Lost %d pages of memory.\n",
		    zone->uz_name, zone->uz_pages);

	if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0)
		for (cpu = 0; cpu < maxcpu; cpu++)
			CPU_LOCK_FINI(zone, cpu);

	ZONE_UNLOCK(zone);
	if ((zone->uz_flags & UMA_ZFLAG_OFFPAGE) != 0)
		hash_free(&zone->uz_hash);

	ZONE_LOCK_FINI(zone);
}
/*
 * Traverses every zone in the system and calls a callback
 *
 * Arguments:
 *	zfunc  A pointer to a function which accepts a zone
 *		as an argument.
 *
 * Returns:
 *	Nothing
 */
static void
zone_foreach(void (*zfunc)(uma_zone_t))
{
	uma_zone_t zone;

	mtx_lock(&uma_mtx);
	LIST_FOREACH(zone, &uma_zones, uz_link) {
		zfunc(zone);
	}
	mtx_unlock(&uma_mtx);
}

/* Public functions */
/* See uma.h */
void
uma_startup(void *bootmem)
{
	struct uma_zctor_args args;
	uma_slab_t slab;
	int slabsize;
	int i;

#ifdef UMA_DEBUG
	printf("Creating uma zone headers zone.\n");
#endif
#ifdef SMP
	maxcpu = mp_maxid + 1;
#else
	maxcpu = 1;
#endif
#ifdef UMA_DEBUG
	printf("Max cpu = %d, mp_maxid = %d\n", maxcpu, mp_maxid);
	Debugger("stop");
#endif
	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
	/* "manually" Create the initial zone */
	args.name = "UMA Zones";
	args.size = sizeof(struct uma_zone) +
	    (sizeof(struct uma_cache) * (maxcpu - 1));
	args.ctor = zone_ctor;
	args.dtor = zone_dtor;
	args.uminit = zero_init;
	args.fini = NULL;
	args.align = 32 - 1;
	args.flags = UMA_ZONE_INTERNAL;
	/* The initial zone has no Per cpu queues so it's smaller */
	zone_ctor(zones, sizeof(struct uma_zone), &args);

#ifdef UMA_DEBUG
	printf("Filling boot free list.\n");
#endif
	for (i = 0; i < UMA_BOOT_PAGES; i++) {
		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
		slab->us_data = (u_int8_t *)slab;
		slab->us_flags = UMA_SLAB_BOOT;
		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
		uma_boot_free++;
	}

#ifdef UMA_DEBUG
	printf("Creating slab zone.\n");
#endif

	/*
	 * This is the max number of free list items we'll have with
	 * offpage slabs.
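	 *
	 * For a feel for the numbers (purely illustrative; the real values
	 * come from uma_int.h): if slabs are 4 KB, the header is about 100
	 * bytes, and UMA_MAX_WASTE allows a tenth of a slab to go unused,
	 * the computation below reserves room for roughly
	 * (4096 - 100) / 409 + 1, i.e. about ten, free list entries on top
	 * of the struct uma_slab itself.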
	 */

	slabsize = UMA_SLAB_SIZE - sizeof(struct uma_slab);
	slabsize /= UMA_MAX_WASTE;
	slabsize++;			/* In case it was rounded down */
	slabsize += sizeof(struct uma_slab);

	/* Now make a zone for slab headers */
	slabzone = uma_zcreate("UMA Slabs",
	    slabsize,
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);

	hashzone = uma_zcreate("UMA Hash",
	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);

	bucketzone = uma_zcreate("UMA Buckets", sizeof(struct uma_bucket),
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_INTERNAL);


#ifdef UMA_DEBUG
	printf("UMA startup complete.\n");
#endif
}

/* see uma.h */
void
uma_startup2(void *hashmem, u_long elems)
{
	bzero(hashmem, elems * sizeof(void *));
	mallochash->uh_slab_hash = hashmem;
	mallochash->uh_hashsize = elems;
	mallochash->uh_hashmask = elems - 1;
	booted = 1;
	bucket_enable();
#ifdef UMA_DEBUG
	printf("UMA startup2 complete.\n");
#endif
}

/*
 * Initialize our callout handle
 *
 */

static void
uma_startup3(void)
{
#ifdef UMA_DEBUG
	printf("Starting callout.\n");
#endif
	callout_init(&uma_callout, 0);
	callout_reset(&uma_callout, UMA_WORKING_TIME * hz, uma_timeout, NULL);
#ifdef UMA_DEBUG
	printf("UMA startup3 complete.\n");
#endif
}

/* See uma.h */
uma_zone_t
uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
		uma_init uminit, uma_fini fini, int align, u_int16_t flags)

{
	struct uma_zctor_args args;

	/* This stuff is essential for the zone ctor */
	args.name = name;
	args.size = size;
	args.ctor = ctor;
	args.dtor = dtor;
	args.uminit = uminit;
	args.fini = fini;
	args.align = align;
	args.flags = flags;

	return (uma_zalloc_internal(zones, &args, M_WAITOK, NULL));
}

/* See uma.h */
void
uma_zdestroy(uma_zone_t zone)
{
	uma_zfree_internal(zones, zone, NULL, 0);
}

/* See uma.h */
void *
uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
{
	void *item;
	uma_cache_t cache;
	uma_bucket_t bucket;
	int cpu;

	/* This is the fast path allocation */
#ifdef UMA_DEBUG_ALLOC_1
	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif

	if (!(flags & M_NOWAIT)) {
		KASSERT(curthread->td_intr_nesting_level == 0,
		    ("malloc(M_WAITOK) in interrupt context"));
		WITNESS_SLEEP(1, NULL);
	}

zalloc_restart:
	cpu = PCPU_GET(cpuid);
	CPU_LOCK(zone, cpu);
	cache = &zone->uz_cpu[cpu];

zalloc_start:
	bucket = cache->uc_allocbucket;

	if (bucket) {
		if (bucket->ub_ptr > -1) {
			item = bucket->ub_bucket[bucket->ub_ptr];
#ifdef INVARIANTS
			bucket->ub_bucket[bucket->ub_ptr] = NULL;
#endif
			bucket->ub_ptr--;
			KASSERT(item != NULL,
			    ("uma_zalloc: Bucket pointer mangled."));
			cache->uc_allocs++;
#ifdef INVARIANTS
			uma_dbg_alloc(zone, NULL, item);
#endif
			CPU_UNLOCK(zone, cpu);
			if (zone->uz_ctor)
				zone->uz_ctor(item, zone->uz_size, udata);
			if (flags & M_ZERO)
				bzero(item, zone->uz_size);
			return (item);
		} else if (cache->uc_freebucket) {
			/*
			 * We have run out of items in our allocbucket.
			 * See if we can switch with our free bucket.
			 */
			if (cache->uc_freebucket->ub_ptr > -1) {
				uma_bucket_t swap;

#ifdef UMA_DEBUG_ALLOC
				printf("uma_zalloc: Swapping empty with alloc.\n");
#endif
				swap = cache->uc_freebucket;
				cache->uc_freebucket = cache->uc_allocbucket;
				cache->uc_allocbucket = swap;

				goto zalloc_start;
			}
		}
	}
	ZONE_LOCK(zone);
	/* Since we have locked the zone we may as well send back our stats */
	zone->uz_allocs += cache->uc_allocs;
	cache->uc_allocs = 0;

	/* Our old one is now a free bucket */
	if (cache->uc_allocbucket) {
		KASSERT(cache->uc_allocbucket->ub_ptr == -1,
		    ("uma_zalloc_arg: Freeing a non free bucket."));
		LIST_INSERT_HEAD(&zone->uz_free_bucket,
		    cache->uc_allocbucket, ub_link);
		cache->uc_allocbucket = NULL;
	}

	/* Check the free list for a new alloc bucket */
	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
		KASSERT(bucket->ub_ptr != -1,
		    ("uma_zalloc_arg: Returning an empty bucket."));

		LIST_REMOVE(bucket, ub_link);
		cache->uc_allocbucket = bucket;
		ZONE_UNLOCK(zone);
		goto zalloc_start;
	}
	/* Bump up our uz_count so we get here less */
	if (zone->uz_count < UMA_BUCKET_SIZE - 1)
		zone->uz_count++;

	/* We are no longer associated with this cpu!!! */
	CPU_UNLOCK(zone, cpu);

	/*
	 * Now let's just fill a bucket and put it on the free list.  If that
	 * works we'll restart the allocation from the beginning.
	 *
	 * Try this zone's free list first so we don't allocate extra buckets.
	 */

	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL)
		LIST_REMOVE(bucket, ub_link);

	/* Now we no longer need the zone lock. */
	ZONE_UNLOCK(zone);

	if (bucket == NULL) {
		int bflags;

		bflags = flags;
		if (zone->uz_flags & UMA_ZFLAG_BUCKETCACHE)
			bflags |= M_NOVM;

		bucket = uma_zalloc_internal(bucketzone,
		    NULL, bflags, NULL);
	}

	if (bucket != NULL) {
#ifdef INVARIANTS
		bzero(bucket, bucketzone->uz_size);
#endif
		bucket->ub_ptr = -1;

		if (uma_zalloc_internal(zone, udata, flags, bucket))
			goto zalloc_restart;
		else
			uma_zfree_internal(bucketzone, bucket, NULL, 0);
	}
	/*
	 * We may not get a bucket if we recurse, so
	 * return an actual item.
	 */
#ifdef UMA_DEBUG
	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
#endif

	return (uma_zalloc_internal(zone, udata, flags, NULL));
}

/*
 * Allocates an item for an internal zone OR fills a bucket
 *
 * Arguments
 *	zone    The zone to alloc for.
 *	udata   The data to be passed to the constructor.
 *	flags   M_WAITOK, M_NOWAIT, M_ZERO.
 *	bucket  The bucket to fill or NULL
 *
 * Returns
 *	NULL if there is no memory and M_NOWAIT is set
 *	An item if called on an internal zone
 *	Non NULL if called to fill a bucket and it was successful.
 *
 * Discussion:
 *	This was much cleaner before it had to do per cpu caches.  It is
 *	complicated now because it has to handle the simple internal case, and
 *	the more involved bucket filling and allocation.
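 *
 *	For instance, uma_zalloc_arg() above hands in a freshly emptied
 *	bucket to have it filled from slabs, while uma_zcreate() passes a
 *	NULL bucket and gets back a single constructed zone header from
 *	the internal "UMA Zones" zone.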
 */

static void *
uma_zalloc_internal(uma_zone_t zone, void *udata, int flags, uma_bucket_t bucket)
{
	uma_slab_t slab;
	u_int8_t freei;
	void *item;

	item = NULL;

	/*
	 * This is to stop us from allocating per cpu buckets while we're
	 * running out of UMA_BOOT_PAGES.  Otherwise, we would exhaust the
	 * boot pages.
	 */

	if (bucketdisable && zone == bucketzone)
		return (NULL);

#ifdef UMA_DEBUG_ALLOC
	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
#endif
	ZONE_LOCK(zone);

	/*
	 * This code is here to limit the number of simultaneous bucket fills
	 * for any given zone to the number of per cpu caches in this zone.  This
	 * is done so that we don't allocate more memory than we really need.
	 */

	if (bucket) {
#ifdef SMP
		if (zone->uz_fills >= mp_ncpus) {
#else
		if (zone->uz_fills > 1) {
#endif
			ZONE_UNLOCK(zone);
			return (NULL);
		}

		zone->uz_fills++;
	}

new_slab:

	/* Find a slab with some space */
	if (zone->uz_free) {
		if (!LIST_EMPTY(&zone->uz_part_slab)) {
			slab = LIST_FIRST(&zone->uz_part_slab);
		} else {
			slab = LIST_FIRST(&zone->uz_free_slab);
			LIST_REMOVE(slab, us_link);
			LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
		}
	} else {
		/*
		 * This is to prevent us from recursively trying to allocate
		 * buckets.  The problem is that if an allocation forces us to
		 * grab a new bucket we will call page_alloc, which will go off
		 * and cause the vm to allocate vm_map_entries.  If we need new
		 * buckets there too we will recurse in kmem_alloc and bad
		 * things happen.  So instead we return a NULL bucket, and make
		 * the code that allocates buckets smart enough to deal with it
		 */
		if (zone == bucketzone && zone->uz_recurse != 0) {
			ZONE_UNLOCK(zone);
			return (NULL);
		}
		while (zone->uz_maxpages &&
		    zone->uz_pages >= zone->uz_maxpages) {
			zone->uz_flags |= UMA_ZFLAG_FULL;

			if (flags & M_WAITOK)
				msleep(zone, &zone->uz_lock, PVM, "zonelimit", 0);
			else
				goto alloc_fail;

			goto new_slab;
		}

		if (flags & M_NOVM)
			goto alloc_fail;

		zone->uz_recurse++;
		slab = slab_zalloc(zone, flags);
		zone->uz_recurse--;
		/*
		 * We might not have been able to get a slab but another cpu
		 * could have while we were unlocked.  If we did get a slab put
		 * it on the partially used slab list.  If not check the free
		 * count and restart or fail accordingly.
		 */
		if (slab)
			LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
		else if (zone->uz_free == 0)
			goto alloc_fail;
		else
			goto new_slab;
	}
	/*
	 * If this is our first time through put this guy on the list.
	 */
	if (bucket != NULL && bucket->ub_ptr == -1)
		LIST_INSERT_HEAD(&zone->uz_full_bucket,
		    bucket, ub_link);


	while (slab->us_freecount) {
		freei = slab->us_firstfree;
		slab->us_firstfree = slab->us_freelist[freei];

		item = slab->us_data + (zone->uz_rsize * freei);

		slab->us_freecount--;
		zone->uz_free--;
#ifdef INVARIANTS
		uma_dbg_alloc(zone, slab, item);
#endif
		if (bucket == NULL) {
			zone->uz_allocs++;
			break;
		}
		bucket->ub_bucket[++bucket->ub_ptr] = item;

		/*
		 * Don't overfill the bucket!
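		 * ub_ptr indexes the last filled slot, so with uz_count at
		 * 15, for example, the check below stops the fill once the
		 * bucket holds 16 items (slots 0 through 15).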
		 */
		if (bucket->ub_ptr == zone->uz_count)
			break;
	}

	/* Move this slab to the full list */
	if (slab->us_freecount == 0) {
		LIST_REMOVE(slab, us_link);
		LIST_INSERT_HEAD(&zone->uz_full_slab, slab, us_link);
	}

	if (bucket != NULL) {
		/* Try to keep the buckets totally full, but don't block */
		if (bucket->ub_ptr < zone->uz_count) {
			flags |= M_NOWAIT;
			flags &= ~M_WAITOK;
			goto new_slab;
		} else
			zone->uz_fills--;
	}

	ZONE_UNLOCK(zone);

	/* Only construct at this time if we're not filling a bucket */
	if (bucket == NULL) {
		if (zone->uz_ctor != NULL)
			zone->uz_ctor(item, zone->uz_size, udata);
		if (flags & M_ZERO)
			bzero(item, zone->uz_size);
	}

	return (item);

alloc_fail:
	if (bucket != NULL)
		zone->uz_fills--;
	ZONE_UNLOCK(zone);

	if (bucket != NULL && bucket->ub_ptr != -1)
		return (bucket);

	return (NULL);
}

/* See uma.h */
void
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
{
	uma_cache_t cache;
	uma_bucket_t bucket;
	int bflags;
	int cpu;

	/* This is the fast path free */
#ifdef UMA_DEBUG_ALLOC_1
	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
#endif
	/*
	 * The race here is acceptable.  If we miss it we'll just have to wait
	 * a little longer for the limits to be reset.
	 */

	if (zone->uz_flags & UMA_ZFLAG_FULL)
		goto zfree_internal;

zfree_restart:
	cpu = PCPU_GET(cpuid);
	CPU_LOCK(zone, cpu);
	cache = &zone->uz_cpu[cpu];

zfree_start:
	bucket = cache->uc_freebucket;

	if (bucket) {
		/*
		 * Do we have room in our bucket?  It is OK for this uz count
		 * check to be slightly out of sync.
		 */

		if (bucket->ub_ptr < zone->uz_count) {
			bucket->ub_ptr++;
			KASSERT(bucket->ub_bucket[bucket->ub_ptr] == NULL,
			    ("uma_zfree: Freeing to non free bucket index."));
			bucket->ub_bucket[bucket->ub_ptr] = item;
			if (zone->uz_dtor)
				zone->uz_dtor(item, zone->uz_size, udata);
#ifdef INVARIANTS
			if (zone->uz_flags & UMA_ZFLAG_MALLOC)
				uma_dbg_free(zone, udata, item);
			else
				uma_dbg_free(zone, NULL, item);
#endif
			CPU_UNLOCK(zone, cpu);
			return;
		} else if (cache->uc_allocbucket) {
#ifdef UMA_DEBUG_ALLOC
			printf("uma_zfree: Swapping buckets.\n");
#endif
			/*
			 * We have run out of space in our freebucket.
			 * See if we can switch with our alloc bucket.
			 */
			if (cache->uc_allocbucket->ub_ptr <
			    cache->uc_freebucket->ub_ptr) {
				uma_bucket_t swap;

				swap = cache->uc_freebucket;
				cache->uc_freebucket = cache->uc_allocbucket;
				cache->uc_allocbucket = swap;

				goto zfree_start;
			}
		}
	}

	/*
	 * We can get here for two reasons:
	 *
	 * 1) The buckets are NULL
	 * 2) The alloc and free buckets are both somewhat full.
	 *
	 */

	ZONE_LOCK(zone);

	bucket = cache->uc_freebucket;
	cache->uc_freebucket = NULL;

	/* Can we throw this on the zone full list? */
	if (bucket != NULL) {
#ifdef UMA_DEBUG_ALLOC
		printf("uma_zfree: Putting old bucket on the free list.\n");
#endif
		/* ub_ptr is pointing to the last free item */
		KASSERT(bucket->ub_ptr != -1,
		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
		LIST_INSERT_HEAD(&zone->uz_full_bucket,
		    bucket, ub_link);
	}
	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
		LIST_REMOVE(bucket, ub_link);
		ZONE_UNLOCK(zone);
		cache->uc_freebucket = bucket;
		goto zfree_start;
	}
	/* We're done with this CPU now */
	CPU_UNLOCK(zone, cpu);

	/* And the zone.. */
	ZONE_UNLOCK(zone);

#ifdef UMA_DEBUG_ALLOC
	printf("uma_zfree: Allocating new free bucket.\n");
#endif
	bflags = M_NOWAIT;

	if (zone->uz_flags & UMA_ZFLAG_BUCKETCACHE)
		bflags |= M_NOVM;
#ifdef INVARIANTS
	bflags |= M_ZERO;
#endif
	bucket = uma_zalloc_internal(bucketzone,
	    NULL, bflags, NULL);
	if (bucket) {
		bucket->ub_ptr = -1;
		ZONE_LOCK(zone);
		LIST_INSERT_HEAD(&zone->uz_free_bucket,
		    bucket, ub_link);
		ZONE_UNLOCK(zone);
		goto zfree_restart;
	}

	/*
	 * If nothing else caught this, we'll just do an internal free.
	 */

zfree_internal:

	uma_zfree_internal(zone, item, udata, 0);

	return;

}

/*
 * Frees an item to an INTERNAL zone or allocates a free bucket
 *
 * Arguments:
 *	zone   The zone to free to
 *	item   The item we're freeing
 *	udata  User supplied data for the dtor
 *	skip   Skip the dtor, it was done in uma_zfree_arg
 */

static void
uma_zfree_internal(uma_zone_t zone, void *item, void *udata, int skip)
{
	uma_slab_t slab;
	u_int8_t *mem;
	u_int8_t freei;

	ZONE_LOCK(zone);

	if (!(zone->uz_flags & UMA_ZFLAG_MALLOC)) {
		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
		if (zone->uz_flags & UMA_ZFLAG_OFFPAGE)
			slab = hash_sfind(&zone->uz_hash, mem);
		else {
			mem += zone->uz_pgoff;
			slab = (uma_slab_t)mem;
		}
	} else {
		slab = (uma_slab_t)udata;
	}

	/*
	 * Do we need to remove from any lists?
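	 * For example, on a slab with ten items per slab and nine already
	 * free, this free makes the slab completely idle and it migrates to
	 * the free-slab list; conversely a slab that had zero free items
	 * was sitting on the full list and moves back to the partial list.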
	 */
	if (slab->us_freecount+1 == zone->uz_ipers) {
		LIST_REMOVE(slab, us_link);
		LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
	} else if (slab->us_freecount == 0) {
		LIST_REMOVE(slab, us_link);
		LIST_INSERT_HEAD(&zone->uz_part_slab, slab, us_link);
	}

	/* Slab management stuff */
	freei = ((unsigned long)item - (unsigned long)slab->us_data)
	    / zone->uz_rsize;

#ifdef INVARIANTS
	if (!skip)
		uma_dbg_free(zone, slab, item);
#endif

	slab->us_freelist[freei] = slab->us_firstfree;
	slab->us_firstfree = freei;
	slab->us_freecount++;

	/* Zone statistics */
	zone->uz_free++;

	if (!skip && zone->uz_dtor)
		zone->uz_dtor(item, zone->uz_size, udata);

	if (zone->uz_flags & UMA_ZFLAG_FULL) {
		if (zone->uz_pages < zone->uz_maxpages)
			zone->uz_flags &= ~UMA_ZFLAG_FULL;

		/* We can handle one more allocation */
		wakeup_one(zone);
	}

	ZONE_UNLOCK(zone);
}

/* See uma.h */
void
uma_zone_set_max(uma_zone_t zone, int nitems)
{
	ZONE_LOCK(zone);
	if (zone->uz_ppera > 1)
		zone->uz_maxpages = nitems * zone->uz_ppera;
	else
		zone->uz_maxpages = nitems / zone->uz_ipers;

	if (zone->uz_maxpages * zone->uz_ipers < nitems)
		zone->uz_maxpages++;

	ZONE_UNLOCK(zone);
}

/* See uma.h */
void
uma_zone_set_freef(uma_zone_t zone, uma_free freef)
{
	ZONE_LOCK(zone);

	zone->uz_freef = freef;

	ZONE_UNLOCK(zone);
}

/* See uma.h */
void
uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
{
	ZONE_LOCK(zone);

	zone->uz_flags |= UMA_ZFLAG_PRIVALLOC;
	zone->uz_allocf = allocf;

	ZONE_UNLOCK(zone);
}

/* See uma.h */
int
uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
{
	int pages;
	vm_offset_t kva;

	mtx_lock(&Giant);

	pages = count / zone->uz_ipers;

	if (pages * zone->uz_ipers < count)
		pages++;

	kva = kmem_alloc_pageable(kernel_map, pages * UMA_SLAB_SIZE);

	if (kva == 0) {
		mtx_unlock(&Giant);
		return (0);
	}


	if (obj == NULL)
		obj = vm_object_allocate(OBJT_DEFAULT,
		    pages);
	else
		_vm_object_allocate(OBJT_DEFAULT,
		    pages, obj);

	ZONE_LOCK(zone);
	zone->uz_kva = kva;
	zone->uz_obj = obj;
	zone->uz_maxpages = pages;

	zone->uz_allocf = obj_alloc;
	zone->uz_flags |= UMA_ZFLAG_NOFREE | UMA_ZFLAG_PRIVALLOC;

	ZONE_UNLOCK(zone);
	mtx_unlock(&Giant);

	return (1);
}

/* See uma.h */
void
uma_prealloc(uma_zone_t zone, int items)
{
	int slabs;
	uma_slab_t slab;

	ZONE_LOCK(zone);
	slabs = items / zone->uz_ipers;
	if (slabs * zone->uz_ipers < items)
		slabs++;

	while (slabs > 0) {
		slab = slab_zalloc(zone, M_WAITOK);
		LIST_INSERT_HEAD(&zone->uz_free_slab, slab, us_link);
		slabs--;
	}
	ZONE_UNLOCK(zone);
}

/* See uma.h */
void
uma_reclaim(void)
{
	/*
	 * You might think that the delay below would improve performance since
	 * the allocator will give away memory that it may ask for immediately.
	 * Really, it makes things worse, since cpu cycles are so much cheaper
	 * than disk activity.
	 */
#if 0
	static struct timeval tv = {0};
	struct timeval now;
	getmicrouptime(&now);
	if (now.tv_sec > tv.tv_sec + 30)
		tv = now;
	else
		return;
#endif
#ifdef UMA_DEBUG
	printf("UMA: vm asked us to release pages!\n");
#endif
	bucket_enable();
	zone_foreach(zone_drain);

	/*
	 * Some slabs may have been freed to the slab zone above, but since
	 * that zone is visited early in the loop, we visit it again here so
	 * that pages which became empty only after the other zones were
	 * drained can be freed.  We have to do the same for buckets.
	 */
	zone_drain(slabzone);
	zone_drain(bucketzone);
}

void *
uma_large_malloc(int size, int wait)
{
	void *mem;
	uma_slab_t slab;
	u_int8_t flags;

	slab = uma_zalloc_internal(slabzone, NULL, wait, NULL);
	if (slab == NULL)
		return (NULL);

	mem = page_alloc(NULL, size, &flags, wait);
	if (mem) {
		slab->us_data = mem;
		slab->us_flags = flags | UMA_SLAB_MALLOC;
		slab->us_size = size;
		mtx_lock(&malloc_mtx);
		UMA_HASH_INSERT(mallochash, slab, mem);
		mtx_unlock(&malloc_mtx);
	} else {
		uma_zfree_internal(slabzone, slab, NULL, 0);
	}


	return (mem);
}

void
uma_large_free(uma_slab_t slab)
{
	mtx_lock(&malloc_mtx);
	UMA_HASH_REMOVE(mallochash, slab, slab->us_data);
	mtx_unlock(&malloc_mtx);
	page_free(slab->us_data, slab->us_size, slab->us_flags);
	uma_zfree_internal(slabzone, slab, NULL, 0);
}

void
uma_print_stats(void)
{
	zone_foreach(uma_print_zone);
}

void
uma_print_zone(uma_zone_t zone)
{
	printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n",
	    zone->uz_name, zone, zone->uz_size, zone->uz_rsize, zone->uz_flags,
	    zone->uz_ipers, zone->uz_ppera,
	    (zone->uz_ipers * zone->uz_pages) - zone->uz_free, zone->uz_free);
}

/*
 * Sysctl handler for vm.zone
 *
 * stolen from vm_zone.c
 */
static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
	int error, len, cnt;
	const int linesize = 128;	/* conservative */
	int totalfree;
	char *tmpbuf, *offset;
	uma_zone_t z;
	char *p;

	cnt = 0;
	mtx_lock(&uma_mtx);
	LIST_FOREACH(z, &uma_zones, uz_link)
		cnt++;
	mtx_unlock(&uma_mtx);
	MALLOC(tmpbuf, char *, (cnt == 0 ? 1 : cnt) * linesize,
	    M_TEMP, M_WAITOK);
	len = snprintf(tmpbuf, linesize,
	    "\nITEM            SIZE     LIMIT     USED    FREE  REQUESTS\n\n");
	if (cnt == 0)
		tmpbuf[len - 1] = '\0';
	error = SYSCTL_OUT(req, tmpbuf, cnt == 0 ? len-1 : len);
	if (error || cnt == 0)
		goto out;
	offset = tmpbuf;
	mtx_lock(&uma_mtx);
	LIST_FOREACH(z, &uma_zones, uz_link) {
		if (cnt == 0)	/* list may have changed size */
			break;
		ZONE_LOCK(z);
		totalfree = z->uz_free + z->uz_cachefree;
		len = snprintf(offset, linesize,
		    "%-12.12s  %6.6u, %8.8u, %6.6u, %6.6u, %8.8llu\n",
		    z->uz_name, z->uz_size,
		    z->uz_maxpages * z->uz_ipers,
		    (z->uz_ipers * (z->uz_pages / z->uz_ppera)) - totalfree,
		    totalfree,
		    (unsigned long long)z->uz_allocs);
		ZONE_UNLOCK(z);
		for (p = offset + 12; p > offset && *p == ' '; --p)
			/* nothing */ ;
		p[1] = ':';
		cnt--;
		offset += len;
	}
	mtx_unlock(&uma_mtx);
	*offset++ = '\0';
	error = SYSCTL_OUT(req, tmpbuf, offset - tmpbuf);
out:
	FREE(tmpbuf, M_TEMP);
	return (error);
}