/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks and only
 * uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/kallsyms.h>

/*
 * Lock order:
 *   1. slab_lock(page)
 *   2. slab->list_lock
 *
 * The slab_lock protects operations on the objects of a particular
 * slab and its metadata in the page struct. If the slab lock
 * has been taken then no allocations nor frees can be performed
 * on the objects in the slab nor can the slab be added to or removed
 * from the partial or full lists since this would mean modifying
 * the page struct of the slab.
 *
 * The list_lock protects the partial and full lists on each node and
 * the partial slab counter. If taken then no new slabs may be added or
 * removed from the lists nor can the number of partial slabs be modified.
 * (Note that the total number of slabs is an atomic value that may be
 * modified without taking the list lock).
 *
 * The list_lock is a centralized lock and thus we avoid taking it as
 * much as possible. As long as SLUB does not have to handle partial
 * slabs, operations can continue without any centralized lock. F.e.
 * allocating a long series of objects that fill up slabs does not require
 * the list lock.
 *
 * The lock order is sometimes inverted when we are trying to get a slab
 * off a list. We take the list_lock and then look for a page on the list
 * to use. While we do that objects in the slabs may be freed. We can
 * only operate on the slab if we have also taken the slab_lock. So we use
 * a slab_trylock() on the slab. If trylock was successful then no frees
 * can occur anymore and we can use the slab for allocations etc. If the
 * slab_trylock() does not succeed then frees are in progress in the slab and
 * we must stay away from it for a while since we may cause a bouncing
 * cacheline if we try to acquire the lock. So go on to the next slab.
 * If all pages are busy then we may allocate a new slab instead of reusing
 * a partial slab. A new slab has no one operating on it and thus there is
 * no danger of cacheline contention.
 *
 * Interrupts are disabled during allocation and deallocation in order to
 * make the slab allocator safe to use in the context of an irq. In addition
 * interrupts are disabled to ensure that the processor does not change
 * while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list.
 * There is no list for full slabs. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * Otherwise there is no need to track full slabs unless we have to
 * track full slabs for debugging purposes.
 *
 * Slabs are freed when they become empty. Teardown and setup are
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is used as a cpu cache. Allocations
 *			may be performed from the slab. The slab is not
 *			on any slab list and cannot be moved onto one.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path.
 */

/*
 * Issues still to be resolved:
 *
 * - The per cpu array is updated for each new slab and is a remote
 *   cacheline for most nodes. This could become a bouncing cacheline given
 *   enough frequent updates. There are 16 pointers in a cacheline, so at
 *   most 16 cpus could compete. Likely okay.
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

#if PAGE_SHIFT <= 12

/*
 * Small page size. Make sure that we do not fragment memory
 */
#define DEFAULT_MAX_ORDER 1
#define DEFAULT_MIN_OBJECTS 4

#else

/*
 * Large page machines are customarily able to handle larger
 * page orders.
 */
#define DEFAULT_MAX_ORDER 2
#define DEFAULT_MIN_OBJECTS 8

#endif

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 2

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
135 */ 136 #define MAX_PARTIAL 10 137 138 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 139 SLAB_POISON | SLAB_STORE_USER) 140 /* 141 * Set of flags that will prevent slab merging 142 */ 143 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 144 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 145 146 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 147 SLAB_CACHE_DMA) 148 149 #ifndef ARCH_KMALLOC_MINALIGN 150 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 151 #endif 152 153 #ifndef ARCH_SLAB_MINALIGN 154 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 155 #endif 156 157 /* Internal SLUB flags */ 158 #define __OBJECT_POISON 0x80000000 /* Poison object */ 159 160 static int kmem_size = sizeof(struct kmem_cache); 161 162 #ifdef CONFIG_SMP 163 static struct notifier_block slab_notifier; 164 #endif 165 166 static enum { 167 DOWN, /* No slab functionality available */ 168 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 169 UP, /* Everything works */ 170 SYSFS /* Sysfs up */ 171 } slab_state = DOWN; 172 173 /* A list of all slab caches on the system */ 174 static DECLARE_RWSEM(slub_lock); 175 LIST_HEAD(slab_caches); 176 177 #ifdef CONFIG_SYSFS 178 static int sysfs_slab_add(struct kmem_cache *); 179 static int sysfs_slab_alias(struct kmem_cache *, const char *); 180 static void sysfs_slab_remove(struct kmem_cache *); 181 #else 182 static int sysfs_slab_add(struct kmem_cache *s) { return 0; } 183 static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } 184 static void sysfs_slab_remove(struct kmem_cache *s) {} 185 #endif 186 187 /******************************************************************** 188 * Core slab cache functions 189 *******************************************************************/ 190 191 int slab_is_available(void) 192 { 193 return slab_state >= UP; 194 } 195 196 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 197 { 198 #ifdef CONFIG_NUMA 199 return s->node[node]; 200 #else 201 return &s->local_node; 202 #endif 203 } 204 205 /* 206 * Object debugging 207 */ 208 static void print_section(char *text, u8 *addr, unsigned int length) 209 { 210 int i, offset; 211 int newline = 1; 212 char ascii[17]; 213 214 ascii[16] = 0; 215 216 for (i = 0; i < length; i++) { 217 if (newline) { 218 printk(KERN_ERR "%10s 0x%p: ", text, addr + i); 219 newline = 0; 220 } 221 printk(" %02x", addr[i]); 222 offset = i % 16; 223 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 224 if (offset == 15) { 225 printk(" %s\n",ascii); 226 newline = 1; 227 } 228 } 229 if (!newline) { 230 i %= 16; 231 while (i < 16) { 232 printk(" "); 233 ascii[i] = ' '; 234 i++; 235 } 236 printk(" %s\n", ascii); 237 } 238 } 239 240 /* 241 * Slow version of get and set free pointer. 242 * 243 * This requires touching the cache lines of kmem_cache. 244 * The offset can also be obtained from the page. In that 245 * case it is in the cacheline that we already need to touch. 246 */ 247 static void *get_freepointer(struct kmem_cache *s, void *object) 248 { 249 return *(void **)(object + s->offset); 250 } 251 252 static void set_freepointer(struct kmem_cache *s, void *object, void *fp) 253 { 254 *(void **)(object + s->offset) = fp; 255 } 256 257 /* 258 * Tracking user of a slab. 
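 *
 * Illustrative example (not an additional API; it simply spells out what
 * get_track() below computes): for a cache with SLAB_STORE_USER set,
 * s->offset == 0 and s->inuse == 64, the two struct track records of an
 * object live directly behind the in-use area:
 *
 *	object + 64				TRACK_ALLOC record
 *	object + 64 + sizeof(struct track)	TRACK_FREE record
 *
 * With a separate free pointer (s->offset != 0) both records move up by
 * another sizeof(void *) to make room for it.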
259 */ 260 struct track { 261 void *addr; /* Called from address */ 262 int cpu; /* Was running on cpu */ 263 int pid; /* Pid context */ 264 unsigned long when; /* When did the operation occur */ 265 }; 266 267 enum track_item { TRACK_ALLOC, TRACK_FREE }; 268 269 static struct track *get_track(struct kmem_cache *s, void *object, 270 enum track_item alloc) 271 { 272 struct track *p; 273 274 if (s->offset) 275 p = object + s->offset + sizeof(void *); 276 else 277 p = object + s->inuse; 278 279 return p + alloc; 280 } 281 282 static void set_track(struct kmem_cache *s, void *object, 283 enum track_item alloc, void *addr) 284 { 285 struct track *p; 286 287 if (s->offset) 288 p = object + s->offset + sizeof(void *); 289 else 290 p = object + s->inuse; 291 292 p += alloc; 293 if (addr) { 294 p->addr = addr; 295 p->cpu = smp_processor_id(); 296 p->pid = current ? current->pid : -1; 297 p->when = jiffies; 298 } else 299 memset(p, 0, sizeof(struct track)); 300 } 301 302 static void init_tracking(struct kmem_cache *s, void *object) 303 { 304 if (s->flags & SLAB_STORE_USER) { 305 set_track(s, object, TRACK_FREE, NULL); 306 set_track(s, object, TRACK_ALLOC, NULL); 307 } 308 } 309 310 static void print_track(const char *s, struct track *t) 311 { 312 if (!t->addr) 313 return; 314 315 printk(KERN_ERR "%s: ", s); 316 __print_symbol("%s", (unsigned long)t->addr); 317 printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 318 } 319 320 static void print_trailer(struct kmem_cache *s, u8 *p) 321 { 322 unsigned int off; /* Offset of last byte */ 323 324 if (s->flags & SLAB_RED_ZONE) 325 print_section("Redzone", p + s->objsize, 326 s->inuse - s->objsize); 327 328 printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", 329 p + s->offset, 330 get_freepointer(s, p)); 331 332 if (s->offset) 333 off = s->offset + sizeof(void *); 334 else 335 off = s->inuse; 336 337 if (s->flags & SLAB_STORE_USER) { 338 print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); 339 print_track("Last free ", get_track(s, p, TRACK_FREE)); 340 off += 2 * sizeof(struct track); 341 } 342 343 if (off != s->size) 344 /* Beginning of the filler is the free pointer */ 345 print_section("Filler", p + off, s->size - off); 346 } 347 348 static void object_err(struct kmem_cache *s, struct page *page, 349 u8 *object, char *reason) 350 { 351 u8 *addr = page_address(page); 352 353 printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", 354 s->name, reason, object, page); 355 printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", 356 object - addr, page->flags, page->inuse, page->freelist); 357 if (object > addr + 16) 358 print_section("Bytes b4", object - 16, 16); 359 print_section("Object", object, min(s->objsize, 128)); 360 print_trailer(s, object); 361 dump_stack(); 362 } 363 364 static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) 365 { 366 va_list args; 367 char buf[100]; 368 369 va_start(args, reason); 370 vsnprintf(buf, sizeof(buf), reason, args); 371 va_end(args); 372 printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, 373 page); 374 dump_stack(); 375 } 376 377 static void init_object(struct kmem_cache *s, void *object, int active) 378 { 379 u8 *p = object; 380 381 if (s->flags & __OBJECT_POISON) { 382 memset(p, POISON_FREE, s->objsize - 1); 383 p[s->objsize -1] = POISON_END; 384 } 385 386 if (s->flags & SLAB_RED_ZONE) 387 memset(p + s->objsize, 388 active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 389 s->inuse - s->objsize); 390 } 391 392 static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) 393 { 394 while (bytes) { 395 if (*start != (u8)value) 396 return 0; 397 start++; 398 bytes--; 399 } 400 return 1; 401 } 402 403 404 static int check_valid_pointer(struct kmem_cache *s, struct page *page, 405 void *object) 406 { 407 void *base; 408 409 if (!object) 410 return 1; 411 412 base = page_address(page); 413 if (object < base || object >= base + s->objects * s->size || 414 (object - base) % s->size) { 415 return 0; 416 } 417 418 return 1; 419 } 420 421 /* 422 * Object layout: 423 * 424 * object address 425 * Bytes of the object to be managed. 426 * If the freepointer may overlay the object then the free 427 * pointer is the first word of the object. 428 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 429 * 0xa5 (POISON_END) 430 * 431 * object + s->objsize 432 * Padding to reach word boundary. This is also used for Redzoning. 433 * Padding is extended to word size if Redzoning is enabled 434 * and objsize == inuse. 435 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 436 * 0xcc (RED_ACTIVE) for objects in use. 437 * 438 * object + s->inuse 439 * A. Free pointer (if we cannot overwrite object on free) 440 * B. Tracking data for SLAB_STORE_USER 441 * C. Padding to reach required alignment boundary 442 * Padding is done using 0x5a (POISON_INUSE) 443 * 444 * object + s->size 445 * 446 * If slabcaches are merged then the objsize and inuse boundaries are to 447 * be ignored. And therefore no slab options that rely on these boundaries 448 * may be used with merged slabcaches. 449 */ 450 451 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 452 void *from, void *to) 453 { 454 printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", 455 s->name, message, data, from, to - 1); 456 memset(from, data, to - from); 457 } 458 459 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 460 { 461 unsigned long off = s->inuse; /* The end of info */ 462 463 if (s->offset) 464 /* Freepointer is placed after the object. */ 465 off += sizeof(void *); 466 467 if (s->flags & SLAB_STORE_USER) 468 /* We also have user information there */ 469 off += 2 * sizeof(struct track); 470 471 if (s->size == off) 472 return 1; 473 474 if (check_bytes(p + off, POISON_INUSE, s->size - off)) 475 return 1; 476 477 object_err(s, page, p, "Object padding check fails"); 478 479 /* 480 * Restore padding 481 */ 482 restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); 483 return 0; 484 } 485 486 static int slab_pad_check(struct kmem_cache *s, struct page *page) 487 { 488 u8 *p; 489 int length, remainder; 490 491 if (!(s->flags & SLAB_POISON)) 492 return 1; 493 494 p = page_address(page); 495 length = s->objects * s->size; 496 remainder = (PAGE_SIZE << s->order) - length; 497 if (!remainder) 498 return 1; 499 500 if (!check_bytes(p + length, POISON_INUSE, remainder)) { 501 slab_err(s, page, "Padding check failed"); 502 restore_bytes(s, "slab padding", POISON_INUSE, p + length, 503 p + length + remainder); 504 return 0; 505 } 506 return 1; 507 } 508 509 static int check_object(struct kmem_cache *s, struct page *page, 510 void *object, int active) 511 { 512 u8 *p = object; 513 u8 *endobject = object + s->objsize; 514 515 if (s->flags & SLAB_RED_ZONE) { 516 unsigned int red = 517 active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; 518 519 if (!check_bytes(endobject, red, s->inuse - s->objsize)) { 520 object_err(s, page, object, 521 active ? "Redzone Active" : "Redzone Inactive"); 522 restore_bytes(s, "redzone", red, 523 endobject, object + s->inuse); 524 return 0; 525 } 526 } else { 527 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && 528 !check_bytes(endobject, POISON_INUSE, 529 s->inuse - s->objsize)) { 530 object_err(s, page, p, "Alignment padding check fails"); 531 /* 532 * Fix it so that there will not be another report. 533 * 534 * Hmmm... We may be corrupting an object that now expects 535 * to be longer than allowed. 536 */ 537 restore_bytes(s, "alignment padding", POISON_INUSE, 538 endobject, object + s->inuse); 539 } 540 } 541 542 if (s->flags & SLAB_POISON) { 543 if (!active && (s->flags & __OBJECT_POISON) && 544 (!check_bytes(p, POISON_FREE, s->objsize - 1) || 545 p[s->objsize - 1] != POISON_END)) { 546 547 object_err(s, page, p, "Poison check failed"); 548 restore_bytes(s, "Poison", POISON_FREE, 549 p, p + s->objsize -1); 550 restore_bytes(s, "Poison", POISON_END, 551 p + s->objsize - 1, p + s->objsize); 552 return 0; 553 } 554 /* 555 * check_pad_bytes cleans up on its own. 556 */ 557 check_pad_bytes(s, page, p); 558 } 559 560 if (!s->offset && active) 561 /* 562 * Object and freepointer overlap. Cannot check 563 * freepointer while object is allocated. 564 */ 565 return 1; 566 567 /* Check free pointer validity */ 568 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 569 object_err(s, page, p, "Freepointer corrupt"); 570 /* 571 * No choice but to zap it and thus loose the remainder 572 * of the free objects in this slab. May cause 573 * another error because the object count maybe 574 * wrong now. 575 */ 576 set_freepointer(s, p, NULL); 577 return 0; 578 } 579 return 1; 580 } 581 582 static int check_slab(struct kmem_cache *s, struct page *page) 583 { 584 VM_BUG_ON(!irqs_disabled()); 585 586 if (!PageSlab(page)) { 587 slab_err(s, page, "Not a valid slab page flags=%lx " 588 "mapping=0x%p count=%d", page->flags, page->mapping, 589 page_count(page)); 590 return 0; 591 } 592 if (page->offset * sizeof(void *) != s->offset) { 593 slab_err(s, page, "Corrupted offset %lu flags=0x%lx " 594 "mapping=0x%p count=%d", 595 (unsigned long)(page->offset * sizeof(void *)), 596 page->flags, 597 page->mapping, 598 page_count(page)); 599 return 0; 600 } 601 if (page->inuse > s->objects) { 602 slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " 603 "mapping=0x%p count=%d", 604 s->name, page->inuse, s->objects, page->flags, 605 page->mapping, page_count(page)); 606 return 0; 607 } 608 /* Slab_pad_check fixes things up after itself */ 609 slab_pad_check(s, page); 610 return 1; 611 } 612 613 /* 614 * Determine if a certain object on a page is on the freelist and 615 * therefore free. Must hold the slab lock for cpu slabs to 616 * guarantee that the chains are consistent. 617 */ 618 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 619 { 620 int nr = 0; 621 void *fp = page->freelist; 622 void *object = NULL; 623 624 while (fp && nr <= s->objects) { 625 if (fp == search) 626 return 1; 627 if (!check_valid_pointer(s, page, fp)) { 628 if (object) { 629 object_err(s, page, object, 630 "Freechain corrupt"); 631 set_freepointer(s, object, NULL); 632 break; 633 } else { 634 slab_err(s, page, "Freepointer 0x%p corrupt", 635 fp); 636 page->freelist = NULL; 637 page->inuse = s->objects; 638 printk(KERN_ERR "@@@ SLUB %s: Freelist " 639 "cleared. 
Slab 0x%p\n", 640 s->name, page); 641 return 0; 642 } 643 break; 644 } 645 object = fp; 646 fp = get_freepointer(s, object); 647 nr++; 648 } 649 650 if (page->inuse != s->objects - nr) { 651 slab_err(s, page, "Wrong object count. Counter is %d but " 652 "counted were %d", s, page, page->inuse, 653 s->objects - nr); 654 page->inuse = s->objects - nr; 655 printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " 656 "Slab @0x%p\n", s->name, page); 657 } 658 return search == NULL; 659 } 660 661 /* 662 * Tracking of fully allocated slabs for debugging 663 */ 664 static void add_full(struct kmem_cache_node *n, struct page *page) 665 { 666 spin_lock(&n->list_lock); 667 list_add(&page->lru, &n->full); 668 spin_unlock(&n->list_lock); 669 } 670 671 static void remove_full(struct kmem_cache *s, struct page *page) 672 { 673 struct kmem_cache_node *n; 674 675 if (!(s->flags & SLAB_STORE_USER)) 676 return; 677 678 n = get_node(s, page_to_nid(page)); 679 680 spin_lock(&n->list_lock); 681 list_del(&page->lru); 682 spin_unlock(&n->list_lock); 683 } 684 685 static int alloc_object_checks(struct kmem_cache *s, struct page *page, 686 void *object) 687 { 688 if (!check_slab(s, page)) 689 goto bad; 690 691 if (object && !on_freelist(s, page, object)) { 692 slab_err(s, page, "Object 0x%p already allocated", object); 693 goto bad; 694 } 695 696 if (!check_valid_pointer(s, page, object)) { 697 object_err(s, page, object, "Freelist Pointer check fails"); 698 goto bad; 699 } 700 701 if (!object) 702 return 1; 703 704 if (!check_object(s, page, object, 0)) 705 goto bad; 706 707 return 1; 708 bad: 709 if (PageSlab(page)) { 710 /* 711 * If this is a slab page then lets do the best we can 712 * to avoid issues in the future. Marking all objects 713 * as used avoids touching the remainder. 714 */ 715 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. 
Marking all objects used.\n", 716 s->name, page); 717 page->inuse = s->objects; 718 page->freelist = NULL; 719 /* Fix up fields that may be corrupted */ 720 page->offset = s->offset / sizeof(void *); 721 } 722 return 0; 723 } 724 725 static int free_object_checks(struct kmem_cache *s, struct page *page, 726 void *object) 727 { 728 if (!check_slab(s, page)) 729 goto fail; 730 731 if (!check_valid_pointer(s, page, object)) { 732 slab_err(s, page, "Invalid object pointer 0x%p", object); 733 goto fail; 734 } 735 736 if (on_freelist(s, page, object)) { 737 slab_err(s, page, "Object 0x%p already free", object); 738 goto fail; 739 } 740 741 if (!check_object(s, page, object, 1)) 742 return 0; 743 744 if (unlikely(s != page->slab)) { 745 if (!PageSlab(page)) 746 slab_err(s, page, "Attempt to free object(0x%p) " 747 "outside of slab", object); 748 else 749 if (!page->slab) { 750 printk(KERN_ERR 751 "SLUB <none>: no slab for object 0x%p.\n", 752 object); 753 dump_stack(); 754 } 755 else 756 slab_err(s, page, "object at 0x%p belongs " 757 "to slab %s", object, page->slab->name); 758 goto fail; 759 } 760 return 1; 761 fail: 762 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", 763 s->name, page, object); 764 return 0; 765 } 766 767 /* 768 * Slab allocation and freeing 769 */ 770 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 771 { 772 struct page * page; 773 int pages = 1 << s->order; 774 775 if (s->order) 776 flags |= __GFP_COMP; 777 778 if (s->flags & SLAB_CACHE_DMA) 779 flags |= SLUB_DMA; 780 781 if (node == -1) 782 page = alloc_pages(flags, s->order); 783 else 784 page = alloc_pages_node(node, flags, s->order); 785 786 if (!page) 787 return NULL; 788 789 mod_zone_page_state(page_zone(page), 790 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
791 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 792 pages); 793 794 return page; 795 } 796 797 static void setup_object(struct kmem_cache *s, struct page *page, 798 void *object) 799 { 800 if (PageError(page)) { 801 init_object(s, object, 0); 802 init_tracking(s, object); 803 } 804 805 if (unlikely(s->ctor)) 806 s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); 807 } 808 809 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 810 { 811 struct page *page; 812 struct kmem_cache_node *n; 813 void *start; 814 void *end; 815 void *last; 816 void *p; 817 818 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); 819 820 if (flags & __GFP_WAIT) 821 local_irq_enable(); 822 823 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 824 if (!page) 825 goto out; 826 827 n = get_node(s, page_to_nid(page)); 828 if (n) 829 atomic_long_inc(&n->nr_slabs); 830 page->offset = s->offset / sizeof(void *); 831 page->slab = s; 832 page->flags |= 1 << PG_slab; 833 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 834 SLAB_STORE_USER | SLAB_TRACE)) 835 page->flags |= 1 << PG_error; 836 837 start = page_address(page); 838 end = start + s->objects * s->size; 839 840 if (unlikely(s->flags & SLAB_POISON)) 841 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 842 843 last = start; 844 for (p = start + s->size; p < end; p += s->size) { 845 setup_object(s, page, last); 846 set_freepointer(s, last, p); 847 last = p; 848 } 849 setup_object(s, page, last); 850 set_freepointer(s, last, NULL); 851 852 page->freelist = start; 853 page->inuse = 0; 854 out: 855 if (flags & __GFP_WAIT) 856 local_irq_disable(); 857 return page; 858 } 859 860 static void __free_slab(struct kmem_cache *s, struct page *page) 861 { 862 int pages = 1 << s->order; 863 864 if (unlikely(PageError(page) || s->dtor)) { 865 void *start = page_address(page); 866 void *end = start + (pages << PAGE_SHIFT); 867 void *p; 868 869 slab_pad_check(s, page); 870 for (p = start; p <= end - s->size; p += s->size) { 871 if (s->dtor) 872 s->dtor(p, s, 0); 873 check_object(s, page, p, 0); 874 } 875 } 876 877 mod_zone_page_state(page_zone(page), 878 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
879 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 880 - pages); 881 882 page->mapping = NULL; 883 __free_pages(page, s->order); 884 } 885 886 static void rcu_free_slab(struct rcu_head *h) 887 { 888 struct page *page; 889 890 page = container_of((struct list_head *)h, struct page, lru); 891 __free_slab(page->slab, page); 892 } 893 894 static void free_slab(struct kmem_cache *s, struct page *page) 895 { 896 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 897 /* 898 * RCU free overloads the RCU head over the LRU 899 */ 900 struct rcu_head *head = (void *)&page->lru; 901 902 call_rcu(head, rcu_free_slab); 903 } else 904 __free_slab(s, page); 905 } 906 907 static void discard_slab(struct kmem_cache *s, struct page *page) 908 { 909 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 910 911 atomic_long_dec(&n->nr_slabs); 912 reset_page_mapcount(page); 913 page->flags &= ~(1 << PG_slab | 1 << PG_error); 914 free_slab(s, page); 915 } 916 917 /* 918 * Per slab locking using the pagelock 919 */ 920 static __always_inline void slab_lock(struct page *page) 921 { 922 bit_spin_lock(PG_locked, &page->flags); 923 } 924 925 static __always_inline void slab_unlock(struct page *page) 926 { 927 bit_spin_unlock(PG_locked, &page->flags); 928 } 929 930 static __always_inline int slab_trylock(struct page *page) 931 { 932 int rc = 1; 933 934 rc = bit_spin_trylock(PG_locked, &page->flags); 935 return rc; 936 } 937 938 /* 939 * Management of partially allocated slabs 940 */ 941 static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 942 { 943 spin_lock(&n->list_lock); 944 n->nr_partial++; 945 list_add_tail(&page->lru, &n->partial); 946 spin_unlock(&n->list_lock); 947 } 948 949 static void add_partial(struct kmem_cache_node *n, struct page *page) 950 { 951 spin_lock(&n->list_lock); 952 n->nr_partial++; 953 list_add(&page->lru, &n->partial); 954 spin_unlock(&n->list_lock); 955 } 956 957 static void remove_partial(struct kmem_cache *s, 958 struct page *page) 959 { 960 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 961 962 spin_lock(&n->list_lock); 963 list_del(&page->lru); 964 n->nr_partial--; 965 spin_unlock(&n->list_lock); 966 } 967 968 /* 969 * Lock page and remove it from the partial list 970 * 971 * Must hold list_lock 972 */ 973 static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) 974 { 975 if (slab_trylock(page)) { 976 list_del(&page->lru); 977 n->nr_partial--; 978 return 1; 979 } 980 return 0; 981 } 982 983 /* 984 * Try to get a partial slab from a specific node 985 */ 986 static struct page *get_partial_node(struct kmem_cache_node *n) 987 { 988 struct page *page; 989 990 /* 991 * Racy check. If we mistakenly see no partial slabs then we 992 * just allocate an empty slab. If we mistakenly try to get a 993 * partial slab then get_partials() will return NULL. 994 */ 995 if (!n || !n->nr_partial) 996 return NULL; 997 998 spin_lock(&n->list_lock); 999 list_for_each_entry(page, &n->partial, lru) 1000 if (lock_and_del_slab(n, page)) 1001 goto out; 1002 page = NULL; 1003 out: 1004 spin_unlock(&n->list_lock); 1005 return page; 1006 } 1007 1008 /* 1009 * Get a page from somewhere. Search in increasing NUMA 1010 * distances. 1011 */ 1012 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1013 { 1014 #ifdef CONFIG_NUMA 1015 struct zonelist *zonelist; 1016 struct zone **z; 1017 struct page *page; 1018 1019 /* 1020 * The defrag ratio allows to configure the tradeoffs between 1021 * inter node defragmentation and node local allocations. 
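	 *
	 * (Worked example, assuming the low bits of get_cycles() are
	 * roughly uniform: the check below only goes on to scan remote
	 * nodes when get_cycles() % 1024 <= defrag_ratio, i.e. in about
	 * defrag_ratio/1024 of the cases. With the default defrag_ratio
	 * of 100 set in kmem_cache_open() that is roughly one in ten of
	 * the allocations that miss the local partial lists.)
	 *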
1022 * A lower defrag_ratio increases the tendency to do local 1023 * allocations instead of scanning throught the partial 1024 * lists on other nodes. 1025 * 1026 * If defrag_ratio is set to 0 then kmalloc() always 1027 * returns node local objects. If its higher then kmalloc() 1028 * may return off node objects in order to avoid fragmentation. 1029 * 1030 * A higher ratio means slabs may be taken from other nodes 1031 * thus reducing the number of partial slabs on those nodes. 1032 * 1033 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1034 * defrag_ratio = 1000) then every (well almost) allocation 1035 * will first attempt to defrag slab caches on other nodes. This 1036 * means scanning over all nodes to look for partial slabs which 1037 * may be a bit expensive to do on every slab allocation. 1038 */ 1039 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1040 return NULL; 1041 1042 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1043 ->node_zonelists[gfp_zone(flags)]; 1044 for (z = zonelist->zones; *z; z++) { 1045 struct kmem_cache_node *n; 1046 1047 n = get_node(s, zone_to_nid(*z)); 1048 1049 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1050 n->nr_partial > MIN_PARTIAL) { 1051 page = get_partial_node(n); 1052 if (page) 1053 return page; 1054 } 1055 } 1056 #endif 1057 return NULL; 1058 } 1059 1060 /* 1061 * Get a partial page, lock it and return it. 1062 */ 1063 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1064 { 1065 struct page *page; 1066 int searchnode = (node == -1) ? numa_node_id() : node; 1067 1068 page = get_partial_node(get_node(s, searchnode)); 1069 if (page || (flags & __GFP_THISNODE)) 1070 return page; 1071 1072 return get_any_partial(s, flags); 1073 } 1074 1075 /* 1076 * Move a page back to the lists. 1077 * 1078 * Must be called with the slab lock held. 1079 * 1080 * On exit the slab lock will have been dropped. 1081 */ 1082 static void putback_slab(struct kmem_cache *s, struct page *page) 1083 { 1084 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1085 1086 if (page->inuse) { 1087 1088 if (page->freelist) 1089 add_partial(n, page); 1090 else if (PageError(page) && (s->flags & SLAB_STORE_USER)) 1091 add_full(n, page); 1092 slab_unlock(page); 1093 1094 } else { 1095 if (n->nr_partial < MIN_PARTIAL) { 1096 /* 1097 * Adding an empty page to the partial slabs in order 1098 * to avoid page allocator overhead. This page needs to 1099 * come after all the others that are not fully empty 1100 * in order to make sure that we do maximum 1101 * defragmentation. 1102 */ 1103 add_partial_tail(n, page); 1104 slab_unlock(page); 1105 } else { 1106 slab_unlock(page); 1107 discard_slab(s, page); 1108 } 1109 } 1110 } 1111 1112 /* 1113 * Remove the cpu slab 1114 */ 1115 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1116 { 1117 s->cpu_slab[cpu] = NULL; 1118 ClearPageActive(page); 1119 1120 putback_slab(s, page); 1121 } 1122 1123 static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1124 { 1125 slab_lock(page); 1126 deactivate_slab(s, page, cpu); 1127 } 1128 1129 /* 1130 * Flush cpu slab. 1131 * Called from IPI handler with interrupts disabled. 
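 *
 * flush_all() below runs this on every processor via on_each_cpu().
 * kmem_cache_close() and kmem_cache_shrink() depend on it, since cpu
 * slabs never appear on the per node partial lists and would otherwise
 * be missed when walking them.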
1132 */ 1133 static void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1134 { 1135 struct page *page = s->cpu_slab[cpu]; 1136 1137 if (likely(page)) 1138 flush_slab(s, page, cpu); 1139 } 1140 1141 static void flush_cpu_slab(void *d) 1142 { 1143 struct kmem_cache *s = d; 1144 int cpu = smp_processor_id(); 1145 1146 __flush_cpu_slab(s, cpu); 1147 } 1148 1149 static void flush_all(struct kmem_cache *s) 1150 { 1151 #ifdef CONFIG_SMP 1152 on_each_cpu(flush_cpu_slab, s, 1, 1); 1153 #else 1154 unsigned long flags; 1155 1156 local_irq_save(flags); 1157 flush_cpu_slab(s); 1158 local_irq_restore(flags); 1159 #endif 1160 } 1161 1162 /* 1163 * slab_alloc is optimized to only modify two cachelines on the fast path 1164 * (aside from the stack): 1165 * 1166 * 1. The page struct 1167 * 2. The first cacheline of the object to be allocated. 1168 * 1169 * The only cache lines that are read (apart from code) is the 1170 * per cpu array in the kmem_cache struct. 1171 * 1172 * Fastpath is not possible if we need to get a new slab or have 1173 * debugging enabled (which means all slabs are marked with PageError) 1174 */ 1175 static void *slab_alloc(struct kmem_cache *s, 1176 gfp_t gfpflags, int node, void *addr) 1177 { 1178 struct page *page; 1179 void **object; 1180 unsigned long flags; 1181 int cpu; 1182 1183 local_irq_save(flags); 1184 cpu = smp_processor_id(); 1185 page = s->cpu_slab[cpu]; 1186 if (!page) 1187 goto new_slab; 1188 1189 slab_lock(page); 1190 if (unlikely(node != -1 && page_to_nid(page) != node)) 1191 goto another_slab; 1192 redo: 1193 object = page->freelist; 1194 if (unlikely(!object)) 1195 goto another_slab; 1196 if (unlikely(PageError(page))) 1197 goto debug; 1198 1199 have_object: 1200 page->inuse++; 1201 page->freelist = object[page->offset]; 1202 slab_unlock(page); 1203 local_irq_restore(flags); 1204 return object; 1205 1206 another_slab: 1207 deactivate_slab(s, page, cpu); 1208 1209 new_slab: 1210 page = get_partial(s, gfpflags, node); 1211 if (likely(page)) { 1212 have_slab: 1213 s->cpu_slab[cpu] = page; 1214 SetPageActive(page); 1215 goto redo; 1216 } 1217 1218 page = new_slab(s, gfpflags, node); 1219 if (page) { 1220 cpu = smp_processor_id(); 1221 if (s->cpu_slab[cpu]) { 1222 /* 1223 * Someone else populated the cpu_slab while we enabled 1224 * interrupts, or we have got scheduled on another cpu. 1225 * The page may not be on the requested node. 
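	 *
	 * (This can happen because new_slab() re-enables interrupts
	 * for __GFP_WAIT allocations: an interrupt handler may install
	 * its own cpu slab meanwhile, or this thread may be preempted
	 * and resume on a cpu whose slot is already populated. That is
	 * why cpu and s->cpu_slab[cpu] are re-examined here.)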
1226 */ 1227 if (node == -1 || 1228 page_to_nid(s->cpu_slab[cpu]) == node) { 1229 /* 1230 * Current cpuslab is acceptable and we 1231 * want the current one since its cache hot 1232 */ 1233 discard_slab(s, page); 1234 page = s->cpu_slab[cpu]; 1235 slab_lock(page); 1236 goto redo; 1237 } 1238 /* Dump the current slab */ 1239 flush_slab(s, s->cpu_slab[cpu], cpu); 1240 } 1241 slab_lock(page); 1242 goto have_slab; 1243 } 1244 local_irq_restore(flags); 1245 return NULL; 1246 debug: 1247 if (!alloc_object_checks(s, page, object)) 1248 goto another_slab; 1249 if (s->flags & SLAB_STORE_USER) 1250 set_track(s, object, TRACK_ALLOC, addr); 1251 if (s->flags & SLAB_TRACE) { 1252 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", 1253 s->name, object, page->inuse, 1254 page->freelist); 1255 dump_stack(); 1256 } 1257 init_object(s, object, 1); 1258 goto have_object; 1259 } 1260 1261 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1262 { 1263 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1264 } 1265 EXPORT_SYMBOL(kmem_cache_alloc); 1266 1267 #ifdef CONFIG_NUMA 1268 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1269 { 1270 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1271 } 1272 EXPORT_SYMBOL(kmem_cache_alloc_node); 1273 #endif 1274 1275 /* 1276 * The fastpath only writes the cacheline of the page struct and the first 1277 * cacheline of the object. 1278 * 1279 * No special cachelines need to be read 1280 */ 1281 static void slab_free(struct kmem_cache *s, struct page *page, 1282 void *x, void *addr) 1283 { 1284 void *prior; 1285 void **object = (void *)x; 1286 unsigned long flags; 1287 1288 local_irq_save(flags); 1289 slab_lock(page); 1290 1291 if (unlikely(PageError(page))) 1292 goto debug; 1293 checks_ok: 1294 prior = object[page->offset] = page->freelist; 1295 page->freelist = object; 1296 page->inuse--; 1297 1298 if (unlikely(PageActive(page))) 1299 /* 1300 * Cpu slabs are never on partial lists and are 1301 * never freed. 1302 */ 1303 goto out_unlock; 1304 1305 if (unlikely(!page->inuse)) 1306 goto slab_empty; 1307 1308 /* 1309 * Objects left in the slab. If it 1310 * was not on the partial list before 1311 * then add it. 1312 */ 1313 if (unlikely(!prior)) 1314 add_partial(get_node(s, page_to_nid(page)), page); 1315 1316 out_unlock: 1317 slab_unlock(page); 1318 local_irq_restore(flags); 1319 return; 1320 1321 slab_empty: 1322 if (prior) 1323 /* 1324 * Slab on the partial list. 
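	 *
	 * prior (saved above) is the freelist as it was before this
	 * free. If it is non-NULL the slab already had free objects
	 * and therefore sat on the partial list, so it has to be
	 * unlinked before the now empty slab is discarded.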
1325 */ 1326 remove_partial(s, page); 1327 1328 slab_unlock(page); 1329 discard_slab(s, page); 1330 local_irq_restore(flags); 1331 return; 1332 1333 debug: 1334 if (!free_object_checks(s, page, x)) 1335 goto out_unlock; 1336 if (!PageActive(page) && !page->freelist) 1337 remove_full(s, page); 1338 if (s->flags & SLAB_STORE_USER) 1339 set_track(s, x, TRACK_FREE, addr); 1340 if (s->flags & SLAB_TRACE) { 1341 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", 1342 s->name, object, page->inuse, 1343 page->freelist); 1344 print_section("Object", (void *)object, s->objsize); 1345 dump_stack(); 1346 } 1347 init_object(s, object, 0); 1348 goto checks_ok; 1349 } 1350 1351 void kmem_cache_free(struct kmem_cache *s, void *x) 1352 { 1353 struct page *page; 1354 1355 page = virt_to_head_page(x); 1356 1357 slab_free(s, page, x, __builtin_return_address(0)); 1358 } 1359 EXPORT_SYMBOL(kmem_cache_free); 1360 1361 /* Figure out on which slab object the object resides */ 1362 static struct page *get_object_page(const void *x) 1363 { 1364 struct page *page = virt_to_head_page(x); 1365 1366 if (!PageSlab(page)) 1367 return NULL; 1368 1369 return page; 1370 } 1371 1372 /* 1373 * kmem_cache_open produces objects aligned at "size" and the first object 1374 * is placed at offset 0 in the slab (We have no metainformation on the 1375 * slab, all slabs are in essence "off slab"). 1376 * 1377 * In order to get the desired alignment one just needs to align the 1378 * size. 1379 * 1380 * Notice that the allocation order determines the sizes of the per cpu 1381 * caches. Each processor has always one slab available for allocations. 1382 * Increasing the allocation order reduces the number of times that slabs 1383 * must be moved on and off the partial lists and therefore may influence 1384 * locking overhead. 1385 * 1386 * The offset is used to relocate the free list link in each object. It is 1387 * therefore possible to move the free list link behind the object. This 1388 * is necessary for RCU to work properly and also useful for debugging. 1389 */ 1390 1391 /* 1392 * Mininum / Maximum order of slab pages. This influences locking overhead 1393 * and slab fragmentation. A higher order reduces the number of partial slabs 1394 * and increases the number of allocations possible without having to 1395 * take the list_lock. 1396 */ 1397 static int slub_min_order; 1398 static int slub_max_order = DEFAULT_MAX_ORDER; 1399 1400 /* 1401 * Minimum number of objects per slab. This is necessary in order to 1402 * reduce locking overhead. Similar to the queue size in SLAB. 1403 */ 1404 static int slub_min_objects = DEFAULT_MIN_OBJECTS; 1405 1406 /* 1407 * Merge control. If this is set then no merging of slab caches will occur. 1408 */ 1409 static int slub_nomerge; 1410 1411 /* 1412 * Debug settings: 1413 */ 1414 static int slub_debug; 1415 1416 static char *slub_debug_slabs; 1417 1418 /* 1419 * Calculate the order of allocation given an slab object size. 1420 * 1421 * The order of allocation has significant impact on other elements 1422 * of the system. Generally order 0 allocations should be preferred 1423 * since they do not cause fragmentation in the page allocator. Larger 1424 * objects may have problems with order 0 because there may be too much 1425 * space left unused in a slab. We go to a higher order if more than 1/8th 1426 * of the slab would be wasted. 1427 * 1428 * In order to reach satisfactory performance we must ensure that 1429 * a minimum number of objects is in one slab. 
Otherwise we may 1430 * generate too much activity on the partial lists. This is less a 1431 * concern for large slabs though. slub_max_order specifies the order 1432 * where we begin to stop considering the number of objects in a slab. 1433 * 1434 * Higher order allocations also allow the placement of more objects 1435 * in a slab and thereby reduce object handling overhead. If the user 1436 * has requested a higher mininum order then we start with that one 1437 * instead of zero. 1438 */ 1439 static int calculate_order(int size) 1440 { 1441 int order; 1442 int rem; 1443 1444 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); 1445 order < MAX_ORDER; order++) { 1446 unsigned long slab_size = PAGE_SIZE << order; 1447 1448 if (slub_max_order > order && 1449 slab_size < slub_min_objects * size) 1450 continue; 1451 1452 if (slab_size < size) 1453 continue; 1454 1455 rem = slab_size % size; 1456 1457 if (rem <= (PAGE_SIZE << order) / 8) 1458 break; 1459 1460 } 1461 if (order >= MAX_ORDER) 1462 return -E2BIG; 1463 return order; 1464 } 1465 1466 /* 1467 * Function to figure out which alignment to use from the 1468 * various ways of specifying it. 1469 */ 1470 static unsigned long calculate_alignment(unsigned long flags, 1471 unsigned long align, unsigned long size) 1472 { 1473 /* 1474 * If the user wants hardware cache aligned objects then 1475 * follow that suggestion if the object is sufficiently 1476 * large. 1477 * 1478 * The hardware cache alignment cannot override the 1479 * specified alignment though. If that is greater 1480 * then use it. 1481 */ 1482 if ((flags & SLAB_HWCACHE_ALIGN) && 1483 size > L1_CACHE_BYTES / 2) 1484 return max_t(unsigned long, align, L1_CACHE_BYTES); 1485 1486 if (align < ARCH_SLAB_MINALIGN) 1487 return ARCH_SLAB_MINALIGN; 1488 1489 return ALIGN(align, sizeof(void *)); 1490 } 1491 1492 static void init_kmem_cache_node(struct kmem_cache_node *n) 1493 { 1494 n->nr_partial = 0; 1495 atomic_long_set(&n->nr_slabs, 0); 1496 spin_lock_init(&n->list_lock); 1497 INIT_LIST_HEAD(&n->partial); 1498 INIT_LIST_HEAD(&n->full); 1499 } 1500 1501 #ifdef CONFIG_NUMA 1502 /* 1503 * No kmalloc_node yet so do it by hand. We know that this is the first 1504 * slab on the node for this slabcache. There are no concurrent accesses 1505 * possible. 1506 * 1507 * Note that this function only works on the kmalloc_node_cache 1508 * when allocating for the kmalloc_node_cache. 
1509 */ 1510 static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 1511 int node) 1512 { 1513 struct page *page; 1514 struct kmem_cache_node *n; 1515 1516 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 1517 1518 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); 1519 /* new_slab() disables interupts */ 1520 local_irq_enable(); 1521 1522 BUG_ON(!page); 1523 n = page->freelist; 1524 BUG_ON(!n); 1525 page->freelist = get_freepointer(kmalloc_caches, n); 1526 page->inuse++; 1527 kmalloc_caches->node[node] = n; 1528 init_object(kmalloc_caches, n, 1); 1529 init_kmem_cache_node(n); 1530 atomic_long_inc(&n->nr_slabs); 1531 add_partial(n, page); 1532 return n; 1533 } 1534 1535 static void free_kmem_cache_nodes(struct kmem_cache *s) 1536 { 1537 int node; 1538 1539 for_each_online_node(node) { 1540 struct kmem_cache_node *n = s->node[node]; 1541 if (n && n != &s->local_node) 1542 kmem_cache_free(kmalloc_caches, n); 1543 s->node[node] = NULL; 1544 } 1545 } 1546 1547 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1548 { 1549 int node; 1550 int local_node; 1551 1552 if (slab_state >= UP) 1553 local_node = page_to_nid(virt_to_page(s)); 1554 else 1555 local_node = 0; 1556 1557 for_each_online_node(node) { 1558 struct kmem_cache_node *n; 1559 1560 if (local_node == node) 1561 n = &s->local_node; 1562 else { 1563 if (slab_state == DOWN) { 1564 n = early_kmem_cache_node_alloc(gfpflags, 1565 node); 1566 continue; 1567 } 1568 n = kmem_cache_alloc_node(kmalloc_caches, 1569 gfpflags, node); 1570 1571 if (!n) { 1572 free_kmem_cache_nodes(s); 1573 return 0; 1574 } 1575 1576 } 1577 s->node[node] = n; 1578 init_kmem_cache_node(n); 1579 } 1580 return 1; 1581 } 1582 #else 1583 static void free_kmem_cache_nodes(struct kmem_cache *s) 1584 { 1585 } 1586 1587 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1588 { 1589 init_kmem_cache_node(&s->local_node); 1590 return 1; 1591 } 1592 #endif 1593 1594 /* 1595 * calculate_sizes() determines the order and the distribution of data within 1596 * a slab object. 1597 */ 1598 static int calculate_sizes(struct kmem_cache *s) 1599 { 1600 unsigned long flags = s->flags; 1601 unsigned long size = s->objsize; 1602 unsigned long align = s->align; 1603 1604 /* 1605 * Determine if we can poison the object itself. If the user of 1606 * the slab may touch the object after free or before allocation 1607 * then we should never poison the object itself. 1608 */ 1609 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 1610 !s->ctor && !s->dtor) 1611 s->flags |= __OBJECT_POISON; 1612 else 1613 s->flags &= ~__OBJECT_POISON; 1614 1615 /* 1616 * Round up object size to the next word boundary. We can only 1617 * place the free pointer at word boundaries and this determines 1618 * the possible location of the free pointer. 1619 */ 1620 size = ALIGN(size, sizeof(void *)); 1621 1622 /* 1623 * If we are redzoning then check if there is some space between the 1624 * end of the object and the free pointer. If not then add an 1625 * additional word, so that we can establish a redzone between 1626 * the object and the freepointer to be able to check for overwrites. 1627 */ 1628 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1629 size += sizeof(void *); 1630 1631 /* 1632 * With that we have determined how much of the slab is in actual 1633 * use by the object. This is the potential offset to the free 1634 * pointer. 
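	 *
	 * Worked example (illustrative, 64 bit, no debug flags): a
	 * SLAB_DESTROY_BY_RCU cache with objsize 20 is rounded up to
	 * 24 bytes, so inuse becomes 24; the free pointer may not
	 * overlay the object, so offset is set to 24 below and size
	 * grows to 32 before the final alignment step.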
1635 */ 1636 s->inuse = size; 1637 1638 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 1639 s->ctor || s->dtor)) { 1640 /* 1641 * Relocate free pointer after the object if it is not 1642 * permitted to overwrite the first word of the object on 1643 * kmem_cache_free. 1644 * 1645 * This is the case if we do RCU, have a constructor or 1646 * destructor or are poisoning the objects. 1647 */ 1648 s->offset = size; 1649 size += sizeof(void *); 1650 } 1651 1652 if (flags & SLAB_STORE_USER) 1653 /* 1654 * Need to store information about allocs and frees after 1655 * the object. 1656 */ 1657 size += 2 * sizeof(struct track); 1658 1659 if (flags & DEBUG_DEFAULT_FLAGS) 1660 /* 1661 * Add some empty padding so that we can catch 1662 * overwrites from earlier objects rather than let 1663 * tracking information or the free pointer be 1664 * corrupted if an user writes before the start 1665 * of the object. 1666 */ 1667 size += sizeof(void *); 1668 /* 1669 * Determine the alignment based on various parameters that the 1670 * user specified (this is unecessarily complex due to the attempt 1671 * to be compatible with SLAB. Should be cleaned up some day). 1672 */ 1673 align = calculate_alignment(flags, align, s->objsize); 1674 1675 /* 1676 * SLUB stores one object immediately after another beginning from 1677 * offset 0. In order to align the objects we have to simply size 1678 * each object to conform to the alignment. 1679 */ 1680 size = ALIGN(size, align); 1681 s->size = size; 1682 1683 s->order = calculate_order(size); 1684 if (s->order < 0) 1685 return 0; 1686 1687 /* 1688 * Determine the number of objects per slab 1689 */ 1690 s->objects = (PAGE_SIZE << s->order) / size; 1691 1692 /* 1693 * Verify that the number of objects is within permitted limits. 1694 * The page->inuse field is only 16 bit wide! So we cannot have 1695 * more than 64k objects per slab. 1696 */ 1697 if (!s->objects || s->objects > 65535) 1698 return 0; 1699 return 1; 1700 1701 } 1702 1703 static int __init finish_bootstrap(void) 1704 { 1705 struct list_head *h; 1706 int err; 1707 1708 slab_state = SYSFS; 1709 1710 list_for_each(h, &slab_caches) { 1711 struct kmem_cache *s = 1712 container_of(h, struct kmem_cache, list); 1713 1714 err = sysfs_slab_add(s); 1715 BUG_ON(err); 1716 } 1717 return 0; 1718 } 1719 1720 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 1721 const char *name, size_t size, 1722 size_t align, unsigned long flags, 1723 void (*ctor)(void *, struct kmem_cache *, unsigned long), 1724 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 1725 { 1726 memset(s, 0, kmem_size); 1727 s->name = name; 1728 s->ctor = ctor; 1729 s->dtor = dtor; 1730 s->objsize = size; 1731 s->flags = flags; 1732 s->align = align; 1733 1734 /* 1735 * The page->offset field is only 16 bit wide. This is an offset 1736 * in units of words from the beginning of an object. If the slab 1737 * size is bigger then we cannot move the free pointer behind the 1738 * object anymore. 1739 * 1740 * On 32 bit platforms the limit is 256k. On 64bit platforms 1741 * the limit is 512k. 1742 * 1743 * Debugging or ctor/dtors may create a need to move the free 1744 * pointer. Fail if this happens. 1745 */ 1746 if (s->size >= 65535 * sizeof(void *)) { 1747 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | 1748 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1749 BUG_ON(ctor || dtor); 1750 } 1751 else 1752 /* 1753 * Enable debugging if selected on the kernel commandline. 
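	 *
	 * For example (derived from setup_slub_debug() further down):
	 * booting with
	 *
	 *	slub_debug=ZPU,kmalloc-
	 *
	 * enables red zoning, poisoning and user tracking only for
	 * caches whose name starts with "kmalloc-", while a bare
	 * "slub_debug" enables DEBUG_DEFAULT_FLAGS for all caches.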
1754 */ 1755 if (slub_debug && (!slub_debug_slabs || 1756 strncmp(slub_debug_slabs, name, 1757 strlen(slub_debug_slabs)) == 0)) 1758 s->flags |= slub_debug; 1759 1760 if (!calculate_sizes(s)) 1761 goto error; 1762 1763 s->refcount = 1; 1764 #ifdef CONFIG_NUMA 1765 s->defrag_ratio = 100; 1766 #endif 1767 1768 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 1769 return 1; 1770 error: 1771 if (flags & SLAB_PANIC) 1772 panic("Cannot create slab %s size=%lu realsize=%u " 1773 "order=%u offset=%u flags=%lx\n", 1774 s->name, (unsigned long)size, s->size, s->order, 1775 s->offset, flags); 1776 return 0; 1777 } 1778 EXPORT_SYMBOL(kmem_cache_open); 1779 1780 /* 1781 * Check if a given pointer is valid 1782 */ 1783 int kmem_ptr_validate(struct kmem_cache *s, const void *object) 1784 { 1785 struct page * page; 1786 void *addr; 1787 1788 page = get_object_page(object); 1789 1790 if (!page || s != page->slab) 1791 /* No slab or wrong slab */ 1792 return 0; 1793 1794 addr = page_address(page); 1795 if (object < addr || object >= addr + s->objects * s->size) 1796 /* Out of bounds */ 1797 return 0; 1798 1799 if ((object - addr) % s->size) 1800 /* Improperly aligned */ 1801 return 0; 1802 1803 /* 1804 * We could also check if the object is on the slabs freelist. 1805 * But this would be too expensive and it seems that the main 1806 * purpose of kmem_ptr_valid is to check if the object belongs 1807 * to a certain slab. 1808 */ 1809 return 1; 1810 } 1811 EXPORT_SYMBOL(kmem_ptr_validate); 1812 1813 /* 1814 * Determine the size of a slab object 1815 */ 1816 unsigned int kmem_cache_size(struct kmem_cache *s) 1817 { 1818 return s->objsize; 1819 } 1820 EXPORT_SYMBOL(kmem_cache_size); 1821 1822 const char *kmem_cache_name(struct kmem_cache *s) 1823 { 1824 return s->name; 1825 } 1826 EXPORT_SYMBOL(kmem_cache_name); 1827 1828 /* 1829 * Attempt to free all slabs on a node 1830 */ 1831 static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 1832 struct list_head *list) 1833 { 1834 int slabs_inuse = 0; 1835 unsigned long flags; 1836 struct page *page, *h; 1837 1838 spin_lock_irqsave(&n->list_lock, flags); 1839 list_for_each_entry_safe(page, h, list, lru) 1840 if (!page->inuse) { 1841 list_del(&page->lru); 1842 discard_slab(s, page); 1843 } else 1844 slabs_inuse++; 1845 spin_unlock_irqrestore(&n->list_lock, flags); 1846 return slabs_inuse; 1847 } 1848 1849 /* 1850 * Release all resources used by slab cache 1851 */ 1852 static int kmem_cache_close(struct kmem_cache *s) 1853 { 1854 int node; 1855 1856 flush_all(s); 1857 1858 /* Attempt to free all objects */ 1859 for_each_online_node(node) { 1860 struct kmem_cache_node *n = get_node(s, node); 1861 1862 n->nr_partial -= free_list(s, n, &n->partial); 1863 if (atomic_long_read(&n->nr_slabs)) 1864 return 1; 1865 } 1866 free_kmem_cache_nodes(s); 1867 return 0; 1868 } 1869 1870 /* 1871 * Close a cache and release the kmem_cache structure 1872 * (must be used for caches created using kmem_cache_create) 1873 */ 1874 void kmem_cache_destroy(struct kmem_cache *s) 1875 { 1876 down_write(&slub_lock); 1877 s->refcount--; 1878 if (!s->refcount) { 1879 list_del(&s->list); 1880 if (kmem_cache_close(s)) 1881 WARN_ON(1); 1882 sysfs_slab_remove(s); 1883 kfree(s); 1884 } 1885 up_write(&slub_lock); 1886 } 1887 EXPORT_SYMBOL(kmem_cache_destroy); 1888 1889 /******************************************************************** 1890 * Kmalloc subsystem 1891 *******************************************************************/ 1892 1893 struct kmem_cache 
kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 1894 EXPORT_SYMBOL(kmalloc_caches); 1895 1896 #ifdef CONFIG_ZONE_DMA 1897 static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 1898 #endif 1899 1900 static int __init setup_slub_min_order(char *str) 1901 { 1902 get_option (&str, &slub_min_order); 1903 1904 return 1; 1905 } 1906 1907 __setup("slub_min_order=", setup_slub_min_order); 1908 1909 static int __init setup_slub_max_order(char *str) 1910 { 1911 get_option (&str, &slub_max_order); 1912 1913 return 1; 1914 } 1915 1916 __setup("slub_max_order=", setup_slub_max_order); 1917 1918 static int __init setup_slub_min_objects(char *str) 1919 { 1920 get_option (&str, &slub_min_objects); 1921 1922 return 1; 1923 } 1924 1925 __setup("slub_min_objects=", setup_slub_min_objects); 1926 1927 static int __init setup_slub_nomerge(char *str) 1928 { 1929 slub_nomerge = 1; 1930 return 1; 1931 } 1932 1933 __setup("slub_nomerge", setup_slub_nomerge); 1934 1935 static int __init setup_slub_debug(char *str) 1936 { 1937 if (!str || *str != '=') 1938 slub_debug = DEBUG_DEFAULT_FLAGS; 1939 else { 1940 str++; 1941 if (*str == 0 || *str == ',') 1942 slub_debug = DEBUG_DEFAULT_FLAGS; 1943 else 1944 for( ;*str && *str != ','; str++) 1945 switch (*str) { 1946 case 'f' : case 'F' : 1947 slub_debug |= SLAB_DEBUG_FREE; 1948 break; 1949 case 'z' : case 'Z' : 1950 slub_debug |= SLAB_RED_ZONE; 1951 break; 1952 case 'p' : case 'P' : 1953 slub_debug |= SLAB_POISON; 1954 break; 1955 case 'u' : case 'U' : 1956 slub_debug |= SLAB_STORE_USER; 1957 break; 1958 case 't' : case 'T' : 1959 slub_debug |= SLAB_TRACE; 1960 break; 1961 default: 1962 printk(KERN_ERR "slub_debug option '%c' " 1963 "unknown. skipped\n",*str); 1964 } 1965 } 1966 1967 if (*str == ',') 1968 slub_debug_slabs = str + 1; 1969 return 1; 1970 } 1971 1972 __setup("slub_debug", setup_slub_debug); 1973 1974 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 1975 const char *name, int size, gfp_t gfp_flags) 1976 { 1977 unsigned int flags = 0; 1978 1979 if (gfp_flags & SLUB_DMA) 1980 flags = SLAB_CACHE_DMA; 1981 1982 down_write(&slub_lock); 1983 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 1984 flags, NULL, NULL)) 1985 goto panic; 1986 1987 list_add(&s->list, &slab_caches); 1988 up_write(&slub_lock); 1989 if (sysfs_slab_add(s)) 1990 goto panic; 1991 return s; 1992 1993 panic: 1994 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 1995 } 1996 1997 static struct kmem_cache *get_slab(size_t size, gfp_t flags) 1998 { 1999 int index = kmalloc_index(size); 2000 2001 if (!index) 2002 return NULL; 2003 2004 /* Allocation too large? 
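	 *
	 * (kmalloc_index(), defined in the SLUB header, is assumed
	 * here to return 0 for a zero-sized request, handled by the
	 * NULL return above, and a negative value once the request
	 * exceeds the largest general cache, which trips the BUG_ON
	 * below. Valid indices select a kmalloc_caches[] slot; slots
	 * 1 and 2 hold the odd-sized 96 and 192 byte caches created
	 * in kmem_cache_init().)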
*/ 2005 BUG_ON(index < 0); 2006 2007 #ifdef CONFIG_ZONE_DMA 2008 if ((flags & SLUB_DMA)) { 2009 struct kmem_cache *s; 2010 struct kmem_cache *x; 2011 char *text; 2012 size_t realsize; 2013 2014 s = kmalloc_caches_dma[index]; 2015 if (s) 2016 return s; 2017 2018 /* Dynamically create dma cache */ 2019 x = kmalloc(kmem_size, flags & ~SLUB_DMA); 2020 if (!x) 2021 panic("Unable to allocate memory for dma cache\n"); 2022 2023 if (index <= KMALLOC_SHIFT_HIGH) 2024 realsize = 1 << index; 2025 else { 2026 if (index == 1) 2027 realsize = 96; 2028 else 2029 realsize = 192; 2030 } 2031 2032 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2033 (unsigned int)realsize); 2034 s = create_kmalloc_cache(x, text, realsize, flags); 2035 kmalloc_caches_dma[index] = s; 2036 return s; 2037 } 2038 #endif 2039 return &kmalloc_caches[index]; 2040 } 2041 2042 void *__kmalloc(size_t size, gfp_t flags) 2043 { 2044 struct kmem_cache *s = get_slab(size, flags); 2045 2046 if (s) 2047 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2048 return NULL; 2049 } 2050 EXPORT_SYMBOL(__kmalloc); 2051 2052 #ifdef CONFIG_NUMA 2053 void *__kmalloc_node(size_t size, gfp_t flags, int node) 2054 { 2055 struct kmem_cache *s = get_slab(size, flags); 2056 2057 if (s) 2058 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2059 return NULL; 2060 } 2061 EXPORT_SYMBOL(__kmalloc_node); 2062 #endif 2063 2064 size_t ksize(const void *object) 2065 { 2066 struct page *page = get_object_page(object); 2067 struct kmem_cache *s; 2068 2069 BUG_ON(!page); 2070 s = page->slab; 2071 BUG_ON(!s); 2072 2073 /* 2074 * Debugging requires use of the padding between object 2075 * and whatever may come after it. 2076 */ 2077 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 2078 return s->objsize; 2079 2080 /* 2081 * If we have the need to store the freelist pointer 2082 * back there or track user information then we can 2083 * only use the space before that information. 2084 */ 2085 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 2086 return s->inuse; 2087 2088 /* 2089 * Else we can use all the padding etc for the allocation 2090 */ 2091 return s->size; 2092 } 2093 EXPORT_SYMBOL(ksize); 2094 2095 void kfree(const void *x) 2096 { 2097 struct kmem_cache *s; 2098 struct page *page; 2099 2100 if (!x) 2101 return; 2102 2103 page = virt_to_head_page(x); 2104 s = page->slab; 2105 2106 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2107 } 2108 EXPORT_SYMBOL(kfree); 2109 2110 /* 2111 * kmem_cache_shrink removes empty slabs from the partial lists 2112 * and then sorts the partially allocated slabs by the number 2113 * of items in use. The slabs with the most items in use 2114 * come first. New allocations will remove these from the 2115 * partial list because they are full. The slabs with the 2116 * least items are placed last. If it happens that the objects 2117 * are freed then the page can be returned to the page allocator. 
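 *
 * Illustrative case: for a cache with s->objects == 4 and more than
 * MAX_PARTIAL slabs on a node's partial list, slabs with inuse 1, 3
 * and 2 are filed under slabs_by_inuse[1], [3] and [2]; the final
 * splice from index 3 down to 0 rebuilds the partial list fullest
 * first (3, 2, 1), empty slabs having been freed on the way.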
2118 */ 2119 int kmem_cache_shrink(struct kmem_cache *s) 2120 { 2121 int node; 2122 int i; 2123 struct kmem_cache_node *n; 2124 struct page *page; 2125 struct page *t; 2126 struct list_head *slabs_by_inuse = 2127 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2128 unsigned long flags; 2129 2130 if (!slabs_by_inuse) 2131 return -ENOMEM; 2132 2133 flush_all(s); 2134 for_each_online_node(node) { 2135 n = get_node(s, node); 2136 2137 if (!n->nr_partial) 2138 continue; 2139 2140 for (i = 0; i < s->objects; i++) 2141 INIT_LIST_HEAD(slabs_by_inuse + i); 2142 2143 spin_lock_irqsave(&n->list_lock, flags); 2144 2145 /* 2146 * Build lists indexed by the items in use in 2147 * each slab or free slabs if empty. 2148 * 2149 * Note that concurrent frees may occur while 2150 * we hold the list_lock. page->inuse here is 2151 * the upper limit. 2152 */ 2153 list_for_each_entry_safe(page, t, &n->partial, lru) { 2154 if (!page->inuse && slab_trylock(page)) { 2155 /* 2156 * Must hold slab lock here because slab_free 2157 * may have freed the last object and be 2158 * waiting to release the slab. 2159 */ 2160 list_del(&page->lru); 2161 n->nr_partial--; 2162 slab_unlock(page); 2163 discard_slab(s, page); 2164 } else { 2165 if (n->nr_partial > MAX_PARTIAL) 2166 list_move(&page->lru, 2167 slabs_by_inuse + page->inuse); 2168 } 2169 } 2170 2171 if (n->nr_partial <= MAX_PARTIAL) 2172 goto out; 2173 2174 /* 2175 * Rebuild the partial list with the slabs filled up 2176 * most first and the least used slabs at the end. 2177 */ 2178 for (i = s->objects - 1; i >= 0; i--) 2179 list_splice(slabs_by_inuse + i, n->partial.prev); 2180 2181 out: 2182 spin_unlock_irqrestore(&n->list_lock, flags); 2183 } 2184 2185 kfree(slabs_by_inuse); 2186 return 0; 2187 } 2188 EXPORT_SYMBOL(kmem_cache_shrink); 2189 2190 /** 2191 * krealloc - reallocate memory. The contents will remain unchanged. 2192 * 2193 * @p: object to reallocate memory for. 2194 * @new_size: how many bytes of memory are required. 2195 * @flags: the type of memory to allocate. 2196 * 2197 * The contents of the object pointed to are preserved up to the 2198 * lesser of the new and old sizes. If @p is %NULL, krealloc() 2199 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 2200 * %NULL pointer, the object pointed to is freed. 2201 */ 2202 void *krealloc(const void *p, size_t new_size, gfp_t flags) 2203 { 2204 struct kmem_cache *new_cache; 2205 void *ret; 2206 struct page *page; 2207 2208 if (unlikely(!p)) 2209 return kmalloc(new_size, flags); 2210 2211 if (unlikely(!new_size)) { 2212 kfree(p); 2213 return NULL; 2214 } 2215 2216 page = virt_to_head_page(p); 2217 2218 new_cache = get_slab(new_size, flags); 2219 2220 /* 2221 * If new size fits in the current cache, bail out. 2222 */ 2223 if (likely(page->slab == new_cache)) 2224 return (void *)p; 2225 2226 ret = kmalloc(new_size, flags); 2227 if (ret) { 2228 memcpy(ret, p, min(new_size, ksize(p))); 2229 kfree(p); 2230 } 2231 return ret; 2232 } 2233 EXPORT_SYMBOL(krealloc); 2234 2235 /******************************************************************** 2236 * Basic setup of slabs 2237 *******************************************************************/ 2238 2239 void __init kmem_cache_init(void) 2240 { 2241 int i; 2242 2243 #ifdef CONFIG_NUMA 2244 /* 2245 * Must first have the slab cache available for the allocations of the 2246 * struct kmalloc_cache_node's. There is special bootstrap code in 2247 * kmem_cache_open for slab_state == DOWN. 
2248 */ 2249 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2250 sizeof(struct kmem_cache_node), GFP_KERNEL); 2251 #endif 2252 2253 /* Able to allocate the per node structures */ 2254 slab_state = PARTIAL; 2255 2256 /* Caches that are not of the two-to-the-power-of size */ 2257 create_kmalloc_cache(&kmalloc_caches[1], 2258 "kmalloc-96", 96, GFP_KERNEL); 2259 create_kmalloc_cache(&kmalloc_caches[2], 2260 "kmalloc-192", 192, GFP_KERNEL); 2261 2262 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2263 create_kmalloc_cache(&kmalloc_caches[i], 2264 "kmalloc", 1 << i, GFP_KERNEL); 2265 2266 slab_state = UP; 2267 2268 /* Provide the correct kmalloc names now that the caches are up */ 2269 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2270 kmalloc_caches[i]. name = 2271 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2272 2273 #ifdef CONFIG_SMP 2274 register_cpu_notifier(&slab_notifier); 2275 #endif 2276 2277 if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ 2278 kmem_size = offsetof(struct kmem_cache, cpu_slab) 2279 + nr_cpu_ids * sizeof(struct page *); 2280 2281 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2282 " Processors=%d, Nodes=%d\n", 2283 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, 2284 slub_min_order, slub_max_order, slub_min_objects, 2285 nr_cpu_ids, nr_node_ids); 2286 } 2287 2288 /* 2289 * Find a mergeable slab cache 2290 */ 2291 static int slab_unmergeable(struct kmem_cache *s) 2292 { 2293 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2294 return 1; 2295 2296 if (s->ctor || s->dtor) 2297 return 1; 2298 2299 return 0; 2300 } 2301 2302 static struct kmem_cache *find_mergeable(size_t size, 2303 size_t align, unsigned long flags, 2304 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2305 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2306 { 2307 struct list_head *h; 2308 2309 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2310 return NULL; 2311 2312 if (ctor || dtor) 2313 return NULL; 2314 2315 size = ALIGN(size, sizeof(void *)); 2316 align = calculate_alignment(flags, align, size); 2317 size = ALIGN(size, align); 2318 2319 list_for_each(h, &slab_caches) { 2320 struct kmem_cache *s = 2321 container_of(h, struct kmem_cache, list); 2322 2323 if (slab_unmergeable(s)) 2324 continue; 2325 2326 if (size > s->size) 2327 continue; 2328 2329 if (((flags | slub_debug) & SLUB_MERGE_SAME) != 2330 (s->flags & SLUB_MERGE_SAME)) 2331 continue; 2332 /* 2333 * Check if alignment is compatible. 2334 * Courtesy of Adrian Drzewiecki 2335 */ 2336 if ((s->size & ~(align -1)) != s->size) 2337 continue; 2338 2339 if (s->size - size >= sizeof(void *)) 2340 continue; 2341 2342 return s; 2343 } 2344 return NULL; 2345 } 2346 2347 struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2348 size_t align, unsigned long flags, 2349 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2350 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2351 { 2352 struct kmem_cache *s; 2353 2354 down_write(&slub_lock); 2355 s = find_mergeable(size, align, flags, dtor, ctor); 2356 if (s) { 2357 s->refcount++; 2358 /* 2359 * Adjust the object sizes so that we clear 2360 * the complete object on kzalloc. 
		 */
		s->objsize = max(s->objsize, (int)size);
		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
		if (sysfs_slab_alias(s, name))
			goto err;
	} else {
		s = kmalloc(kmem_size, GFP_KERNEL);
		if (s && kmem_cache_open(s, GFP_KERNEL, name,
				size, align, flags, ctor, dtor)) {
			if (sysfs_slab_add(s)) {
				kfree(s);
				goto err;
			}
			list_add(&s->list, &slab_caches);
		} else
			kfree(s);
	}
	up_write(&slub_lock);
	return s;

err:
	up_write(&slub_lock);
	if (flags & SLAB_PANIC)
		panic("Cannot create slabcache %s\n", name);
	else
		s = NULL;
	return s;
}
EXPORT_SYMBOL(kmem_cache_create);

void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
{
	void *x;

	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
	if (x)
		memset(x, 0, s->objsize);
	return x;
}
EXPORT_SYMBOL(kmem_cache_zalloc);

#ifdef CONFIG_SMP
static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
{
	struct list_head *h;

	down_read(&slub_lock);
	list_for_each(h, &slab_caches) {
		struct kmem_cache *s =
			container_of(h, struct kmem_cache, list);

		func(s, cpu);
	}
	up_read(&slub_lock);
}

/*
 * Use the cpu notifier to ensure that the cpu slabs are flushed
 * when necessary.
 */
static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
		unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		for_all_slabs(__flush_cpu_slab, cpu);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata slab_notifier =
	{ &slab_cpuup_callback, NULL, 0 };

#endif

#ifdef CONFIG_NUMA

/*****************************************************************
 * Generic reaper used to support the page allocator
 * (the cpu slabs are reaped by a per slab workqueue).
 *
 * Maybe move this to the page allocator?
2449 ****************************************************************/ 2450 2451 static DEFINE_PER_CPU(unsigned long, reap_node); 2452 2453 static void init_reap_node(int cpu) 2454 { 2455 int node; 2456 2457 node = next_node(cpu_to_node(cpu), node_online_map); 2458 if (node == MAX_NUMNODES) 2459 node = first_node(node_online_map); 2460 2461 __get_cpu_var(reap_node) = node; 2462 } 2463 2464 static void next_reap_node(void) 2465 { 2466 int node = __get_cpu_var(reap_node); 2467 2468 /* 2469 * Also drain per cpu pages on remote zones 2470 */ 2471 if (node != numa_node_id()) 2472 drain_node_pages(node); 2473 2474 node = next_node(node, node_online_map); 2475 if (unlikely(node >= MAX_NUMNODES)) 2476 node = first_node(node_online_map); 2477 __get_cpu_var(reap_node) = node; 2478 } 2479 #else 2480 #define init_reap_node(cpu) do { } while (0) 2481 #define next_reap_node(void) do { } while (0) 2482 #endif 2483 2484 #define REAPTIMEOUT_CPUC (2*HZ) 2485 2486 #ifdef CONFIG_SMP 2487 static DEFINE_PER_CPU(struct delayed_work, reap_work); 2488 2489 static void cache_reap(struct work_struct *unused) 2490 { 2491 next_reap_node(); 2492 refresh_cpu_vm_stats(smp_processor_id()); 2493 schedule_delayed_work(&__get_cpu_var(reap_work), 2494 REAPTIMEOUT_CPUC); 2495 } 2496 2497 static void __devinit start_cpu_timer(int cpu) 2498 { 2499 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 2500 2501 /* 2502 * When this gets called from do_initcalls via cpucache_init(), 2503 * init_workqueues() has already run, so keventd will be setup 2504 * at that time. 2505 */ 2506 if (keventd_up() && reap_work->work.func == NULL) { 2507 init_reap_node(cpu); 2508 INIT_DELAYED_WORK(reap_work, cache_reap); 2509 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 2510 } 2511 } 2512 2513 static int __init cpucache_init(void) 2514 { 2515 int cpu; 2516 2517 /* 2518 * Register the timers that drain pcp pages and update vm statistics 2519 */ 2520 for_each_online_cpu(cpu) 2521 start_cpu_timer(cpu); 2522 return 0; 2523 } 2524 __initcall(cpucache_init); 2525 #endif 2526 2527 #ifdef SLUB_RESILIENCY_TEST 2528 static unsigned long validate_slab_cache(struct kmem_cache *s); 2529 2530 static void resiliency_test(void) 2531 { 2532 u8 *p; 2533 2534 printk(KERN_ERR "SLUB resiliency testing\n"); 2535 printk(KERN_ERR "-----------------------\n"); 2536 printk(KERN_ERR "A. Corruption after allocation\n"); 2537 2538 p = kzalloc(16, GFP_KERNEL); 2539 p[16] = 0x12; 2540 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 2541 " 0x12->0x%p\n\n", p + 16); 2542 2543 validate_slab_cache(kmalloc_caches + 4); 2544 2545 /* Hmmm... The next two are dangerous */ 2546 p = kzalloc(32, GFP_KERNEL); 2547 p[32 + sizeof(void *)] = 0x34; 2548 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 2549 " 0x34 -> -0x%p\n", p); 2550 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2551 2552 validate_slab_cache(kmalloc_caches + 5); 2553 p = kzalloc(64, GFP_KERNEL); 2554 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 2555 *p = 0x56; 2556 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 2557 p); 2558 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2559 validate_slab_cache(kmalloc_caches + 6); 2560 2561 printk(KERN_ERR "\nB. Corruption after free\n"); 2562 p = kzalloc(128, GFP_KERNEL); 2563 kfree(p); 2564 *p = 0x78; 2565 printk(KERN_ERR "1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 2566 validate_slab_cache(kmalloc_caches + 7); 2567 2568 p = kzalloc(256, GFP_KERNEL); 2569 kfree(p); 2570 p[50] = 0x9a; 2571 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 2572 validate_slab_cache(kmalloc_caches + 8); 2573 2574 p = kzalloc(512, GFP_KERNEL); 2575 kfree(p); 2576 p[512] = 0xab; 2577 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 2578 validate_slab_cache(kmalloc_caches + 9); 2579 } 2580 #else 2581 static void resiliency_test(void) {}; 2582 #endif 2583 2584 /* 2585 * These are not as efficient as kmalloc for the non debug case. 2586 * We do not have the page struct available so we have to touch one 2587 * cacheline in struct kmem_cache to check slab flags. 2588 */ 2589 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2590 { 2591 struct kmem_cache *s = get_slab(size, gfpflags); 2592 2593 if (!s) 2594 return NULL; 2595 2596 return slab_alloc(s, gfpflags, -1, caller); 2597 } 2598 2599 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2600 int node, void *caller) 2601 { 2602 struct kmem_cache *s = get_slab(size, gfpflags); 2603 2604 if (!s) 2605 return NULL; 2606 2607 return slab_alloc(s, gfpflags, node, caller); 2608 } 2609 2610 #ifdef CONFIG_SYSFS 2611 2612 static int validate_slab(struct kmem_cache *s, struct page *page) 2613 { 2614 void *p; 2615 void *addr = page_address(page); 2616 unsigned long map[BITS_TO_LONGS(s->objects)]; 2617 2618 if (!check_slab(s, page) || 2619 !on_freelist(s, page, NULL)) 2620 return 0; 2621 2622 /* Now we know that a valid freelist exists */ 2623 bitmap_zero(map, s->objects); 2624 2625 for(p = page->freelist; p; p = get_freepointer(s, p)) { 2626 set_bit((p - addr) / s->size, map); 2627 if (!check_object(s, page, p, 0)) 2628 return 0; 2629 } 2630 2631 for(p = addr; p < addr + s->objects * s->size; p += s->size) 2632 if (!test_bit((p - addr) / s->size, map)) 2633 if (!check_object(s, page, p, 1)) 2634 return 0; 2635 return 1; 2636 } 2637 2638 static void validate_slab_slab(struct kmem_cache *s, struct page *page) 2639 { 2640 if (slab_trylock(page)) { 2641 validate_slab(s, page); 2642 slab_unlock(page); 2643 } else 2644 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 2645 s->name, page); 2646 2647 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2648 if (!PageError(page)) 2649 printk(KERN_ERR "SLUB %s: PageError not set " 2650 "on slab 0x%p\n", s->name, page); 2651 } else { 2652 if (PageError(page)) 2653 printk(KERN_ERR "SLUB %s: PageError set on " 2654 "slab 0x%p\n", s->name, page); 2655 } 2656 } 2657 2658 static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) 2659 { 2660 unsigned long count = 0; 2661 struct page *page; 2662 unsigned long flags; 2663 2664 spin_lock_irqsave(&n->list_lock, flags); 2665 2666 list_for_each_entry(page, &n->partial, lru) { 2667 validate_slab_slab(s, page); 2668 count++; 2669 } 2670 if (count != n->nr_partial) 2671 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 2672 "counter=%ld\n", s->name, count, n->nr_partial); 2673 2674 if (!(s->flags & SLAB_STORE_USER)) 2675 goto out; 2676 2677 list_for_each_entry(page, &n->full, lru) { 2678 validate_slab_slab(s, page); 2679 count++; 2680 } 2681 if (count != atomic_long_read(&n->nr_slabs)) 2682 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 2683 "counter=%ld\n", s->name, count, 2684 atomic_long_read(&n->nr_slabs)); 2685 2686 out: 2687 spin_unlock_irqrestore(&n->list_lock, flags); 2688 return count; 2689 } 2690 2691 
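/*
 * Illustrative sketch (compiled out, not part of the allocator): the same
 * freelist-plus-bitmap walk that validate_slab() above and process_slab()
 * further down use, reduced to counting the objects still allocated in a
 * single slab. The helper name is made up for the example; it assumes the
 * caller holds the slab lock, as validate_slab_slab() does.
 */
#if 0
static unsigned long count_inuse_by_walk(struct kmem_cache *s,
						struct page *page)
{
	void *addr = page_address(page);
	unsigned long map[BITS_TO_LONGS(s->objects)];
	unsigned long inuse = 0;
	void *p;

	/* Mark every object reachable from the freelist as free */
	bitmap_zero(map, s->objects);
	for (p = page->freelist; p; p = get_freepointer(s, p))
		set_bit((p - addr) / s->size, map);

	/* Everything not on the freelist is currently allocated */
	for (p = addr; p < addr + s->objects * s->size; p += s->size)
		if (!test_bit((p - addr) / s->size, map))
			inuse++;

	return inuse;
}
#endif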
static unsigned long validate_slab_cache(struct kmem_cache *s) 2692 { 2693 int node; 2694 unsigned long count = 0; 2695 2696 flush_all(s); 2697 for_each_online_node(node) { 2698 struct kmem_cache_node *n = get_node(s, node); 2699 2700 count += validate_slab_node(s, n); 2701 } 2702 return count; 2703 } 2704 2705 /* 2706 * Generate lists of locations where slabcache objects are allocated 2707 * and freed. 2708 */ 2709 2710 struct location { 2711 unsigned long count; 2712 void *addr; 2713 }; 2714 2715 struct loc_track { 2716 unsigned long max; 2717 unsigned long count; 2718 struct location *loc; 2719 }; 2720 2721 static void free_loc_track(struct loc_track *t) 2722 { 2723 if (t->max) 2724 free_pages((unsigned long)t->loc, 2725 get_order(sizeof(struct location) * t->max)); 2726 } 2727 2728 static int alloc_loc_track(struct loc_track *t, unsigned long max) 2729 { 2730 struct location *l; 2731 int order; 2732 2733 if (!max) 2734 max = PAGE_SIZE / sizeof(struct location); 2735 2736 order = get_order(sizeof(struct location) * max); 2737 2738 l = (void *)__get_free_pages(GFP_KERNEL, order); 2739 2740 if (!l) 2741 return 0; 2742 2743 if (t->count) { 2744 memcpy(l, t->loc, sizeof(struct location) * t->count); 2745 free_loc_track(t); 2746 } 2747 t->max = max; 2748 t->loc = l; 2749 return 1; 2750 } 2751 2752 static int add_location(struct loc_track *t, struct kmem_cache *s, 2753 void *addr) 2754 { 2755 long start, end, pos; 2756 struct location *l; 2757 void *caddr; 2758 2759 start = -1; 2760 end = t->count; 2761 2762 for ( ; ; ) { 2763 pos = start + (end - start + 1) / 2; 2764 2765 /* 2766 * There is nothing at "end". If we end up there 2767 * we need to add something to before end. 2768 */ 2769 if (pos == end) 2770 break; 2771 2772 caddr = t->loc[pos].addr; 2773 if (addr == caddr) { 2774 t->loc[pos].count++; 2775 return 1; 2776 } 2777 2778 if (addr < caddr) 2779 end = pos; 2780 else 2781 start = pos; 2782 } 2783 2784 /* 2785 * Not found. 
Insert new tracking element 2786 */ 2787 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) 2788 return 0; 2789 2790 l = t->loc + pos; 2791 if (pos < t->count) 2792 memmove(l + 1, l, 2793 (t->count - pos) * sizeof(struct location)); 2794 t->count++; 2795 l->count = 1; 2796 l->addr = addr; 2797 return 1; 2798 } 2799 2800 static void process_slab(struct loc_track *t, struct kmem_cache *s, 2801 struct page *page, enum track_item alloc) 2802 { 2803 void *addr = page_address(page); 2804 unsigned long map[BITS_TO_LONGS(s->objects)]; 2805 void *p; 2806 2807 bitmap_zero(map, s->objects); 2808 for (p = page->freelist; p; p = get_freepointer(s, p)) 2809 set_bit((p - addr) / s->size, map); 2810 2811 for (p = addr; p < addr + s->objects * s->size; p += s->size) 2812 if (!test_bit((p - addr) / s->size, map)) { 2813 void *addr = get_track(s, p, alloc)->addr; 2814 2815 add_location(t, s, addr); 2816 } 2817 } 2818 2819 static int list_locations(struct kmem_cache *s, char *buf, 2820 enum track_item alloc) 2821 { 2822 int n = 0; 2823 unsigned long i; 2824 struct loc_track t; 2825 int node; 2826 2827 t.count = 0; 2828 t.max = 0; 2829 2830 /* Push back cpu slabs */ 2831 flush_all(s); 2832 2833 for_each_online_node(node) { 2834 struct kmem_cache_node *n = get_node(s, node); 2835 unsigned long flags; 2836 struct page *page; 2837 2838 if (!atomic_read(&n->nr_slabs)) 2839 continue; 2840 2841 spin_lock_irqsave(&n->list_lock, flags); 2842 list_for_each_entry(page, &n->partial, lru) 2843 process_slab(&t, s, page, alloc); 2844 list_for_each_entry(page, &n->full, lru) 2845 process_slab(&t, s, page, alloc); 2846 spin_unlock_irqrestore(&n->list_lock, flags); 2847 } 2848 2849 for (i = 0; i < t.count; i++) { 2850 void *addr = t.loc[i].addr; 2851 2852 if (n > PAGE_SIZE - 100) 2853 break; 2854 n += sprintf(buf + n, "%7ld ", t.loc[i].count); 2855 if (addr) 2856 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); 2857 else 2858 n += sprintf(buf + n, "<not-available>"); 2859 n += sprintf(buf + n, "\n"); 2860 } 2861 2862 free_loc_track(&t); 2863 if (!t.count) 2864 n += sprintf(buf, "No data\n"); 2865 return n; 2866 } 2867 2868 static unsigned long count_partial(struct kmem_cache_node *n) 2869 { 2870 unsigned long flags; 2871 unsigned long x = 0; 2872 struct page *page; 2873 2874 spin_lock_irqsave(&n->list_lock, flags); 2875 list_for_each_entry(page, &n->partial, lru) 2876 x += page->inuse; 2877 spin_unlock_irqrestore(&n->list_lock, flags); 2878 return x; 2879 } 2880 2881 enum slab_stat_type { 2882 SL_FULL, 2883 SL_PARTIAL, 2884 SL_CPU, 2885 SL_OBJECTS 2886 }; 2887 2888 #define SO_FULL (1 << SL_FULL) 2889 #define SO_PARTIAL (1 << SL_PARTIAL) 2890 #define SO_CPU (1 << SL_CPU) 2891 #define SO_OBJECTS (1 << SL_OBJECTS) 2892 2893 static unsigned long slab_objects(struct kmem_cache *s, 2894 char *buf, unsigned long flags) 2895 { 2896 unsigned long total = 0; 2897 int cpu; 2898 int node; 2899 int x; 2900 unsigned long *nodes; 2901 unsigned long *per_cpu; 2902 2903 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 2904 per_cpu = nodes + nr_node_ids; 2905 2906 for_each_possible_cpu(cpu) { 2907 struct page *page = s->cpu_slab[cpu]; 2908 int node; 2909 2910 if (page) { 2911 node = page_to_nid(page); 2912 if (flags & SO_CPU) { 2913 int x = 0; 2914 2915 if (flags & SO_OBJECTS) 2916 x = page->inuse; 2917 else 2918 x = 1; 2919 total += x; 2920 nodes[node] += x; 2921 } 2922 per_cpu[node]++; 2923 } 2924 } 2925 2926 for_each_online_node(node) { 2927 struct kmem_cache_node *n = get_node(s, node); 2928 2929 if 
(flags & SO_PARTIAL) { 2930 if (flags & SO_OBJECTS) 2931 x = count_partial(n); 2932 else 2933 x = n->nr_partial; 2934 total += x; 2935 nodes[node] += x; 2936 } 2937 2938 if (flags & SO_FULL) { 2939 int full_slabs = atomic_read(&n->nr_slabs) 2940 - per_cpu[node] 2941 - n->nr_partial; 2942 2943 if (flags & SO_OBJECTS) 2944 x = full_slabs * s->objects; 2945 else 2946 x = full_slabs; 2947 total += x; 2948 nodes[node] += x; 2949 } 2950 } 2951 2952 x = sprintf(buf, "%lu", total); 2953 #ifdef CONFIG_NUMA 2954 for_each_online_node(node) 2955 if (nodes[node]) 2956 x += sprintf(buf + x, " N%d=%lu", 2957 node, nodes[node]); 2958 #endif 2959 kfree(nodes); 2960 return x + sprintf(buf + x, "\n"); 2961 } 2962 2963 static int any_slab_objects(struct kmem_cache *s) 2964 { 2965 int node; 2966 int cpu; 2967 2968 for_each_possible_cpu(cpu) 2969 if (s->cpu_slab[cpu]) 2970 return 1; 2971 2972 for_each_node(node) { 2973 struct kmem_cache_node *n = get_node(s, node); 2974 2975 if (n->nr_partial || atomic_read(&n->nr_slabs)) 2976 return 1; 2977 } 2978 return 0; 2979 } 2980 2981 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 2982 #define to_slab(n) container_of(n, struct kmem_cache, kobj); 2983 2984 struct slab_attribute { 2985 struct attribute attr; 2986 ssize_t (*show)(struct kmem_cache *s, char *buf); 2987 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 2988 }; 2989 2990 #define SLAB_ATTR_RO(_name) \ 2991 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 2992 2993 #define SLAB_ATTR(_name) \ 2994 static struct slab_attribute _name##_attr = \ 2995 __ATTR(_name, 0644, _name##_show, _name##_store) 2996 2997 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 2998 { 2999 return sprintf(buf, "%d\n", s->size); 3000 } 3001 SLAB_ATTR_RO(slab_size); 3002 3003 static ssize_t align_show(struct kmem_cache *s, char *buf) 3004 { 3005 return sprintf(buf, "%d\n", s->align); 3006 } 3007 SLAB_ATTR_RO(align); 3008 3009 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3010 { 3011 return sprintf(buf, "%d\n", s->objsize); 3012 } 3013 SLAB_ATTR_RO(object_size); 3014 3015 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3016 { 3017 return sprintf(buf, "%d\n", s->objects); 3018 } 3019 SLAB_ATTR_RO(objs_per_slab); 3020 3021 static ssize_t order_show(struct kmem_cache *s, char *buf) 3022 { 3023 return sprintf(buf, "%d\n", s->order); 3024 } 3025 SLAB_ATTR_RO(order); 3026 3027 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3028 { 3029 if (s->ctor) { 3030 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3031 3032 return n + sprintf(buf + n, "\n"); 3033 } 3034 return 0; 3035 } 3036 SLAB_ATTR_RO(ctor); 3037 3038 static ssize_t dtor_show(struct kmem_cache *s, char *buf) 3039 { 3040 if (s->dtor) { 3041 int n = sprint_symbol(buf, (unsigned long)s->dtor); 3042 3043 return n + sprintf(buf + n, "\n"); 3044 } 3045 return 0; 3046 } 3047 SLAB_ATTR_RO(dtor); 3048 3049 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3050 { 3051 return sprintf(buf, "%d\n", s->refcount - 1); 3052 } 3053 SLAB_ATTR_RO(aliases); 3054 3055 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3056 { 3057 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3058 } 3059 SLAB_ATTR_RO(slabs); 3060 3061 static ssize_t partial_show(struct kmem_cache *s, char *buf) 3062 { 3063 return slab_objects(s, buf, SO_PARTIAL); 3064 } 3065 SLAB_ATTR_RO(partial); 3066 3067 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3068 { 3069 return 
slab_objects(s, buf, SO_CPU); 3070 } 3071 SLAB_ATTR_RO(cpu_slabs); 3072 3073 static ssize_t objects_show(struct kmem_cache *s, char *buf) 3074 { 3075 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3076 } 3077 SLAB_ATTR_RO(objects); 3078 3079 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3080 { 3081 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3082 } 3083 3084 static ssize_t sanity_checks_store(struct kmem_cache *s, 3085 const char *buf, size_t length) 3086 { 3087 s->flags &= ~SLAB_DEBUG_FREE; 3088 if (buf[0] == '1') 3089 s->flags |= SLAB_DEBUG_FREE; 3090 return length; 3091 } 3092 SLAB_ATTR(sanity_checks); 3093 3094 static ssize_t trace_show(struct kmem_cache *s, char *buf) 3095 { 3096 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3097 } 3098 3099 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3100 size_t length) 3101 { 3102 s->flags &= ~SLAB_TRACE; 3103 if (buf[0] == '1') 3104 s->flags |= SLAB_TRACE; 3105 return length; 3106 } 3107 SLAB_ATTR(trace); 3108 3109 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3110 { 3111 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3112 } 3113 3114 static ssize_t reclaim_account_store(struct kmem_cache *s, 3115 const char *buf, size_t length) 3116 { 3117 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3118 if (buf[0] == '1') 3119 s->flags |= SLAB_RECLAIM_ACCOUNT; 3120 return length; 3121 } 3122 SLAB_ATTR(reclaim_account); 3123 3124 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3125 { 3126 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3127 } 3128 SLAB_ATTR_RO(hwcache_align); 3129 3130 #ifdef CONFIG_ZONE_DMA 3131 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3132 { 3133 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3134 } 3135 SLAB_ATTR_RO(cache_dma); 3136 #endif 3137 3138 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3139 { 3140 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3141 } 3142 SLAB_ATTR_RO(destroy_by_rcu); 3143 3144 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3145 { 3146 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3147 } 3148 3149 static ssize_t red_zone_store(struct kmem_cache *s, 3150 const char *buf, size_t length) 3151 { 3152 if (any_slab_objects(s)) 3153 return -EBUSY; 3154 3155 s->flags &= ~SLAB_RED_ZONE; 3156 if (buf[0] == '1') 3157 s->flags |= SLAB_RED_ZONE; 3158 calculate_sizes(s); 3159 return length; 3160 } 3161 SLAB_ATTR(red_zone); 3162 3163 static ssize_t poison_show(struct kmem_cache *s, char *buf) 3164 { 3165 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3166 } 3167 3168 static ssize_t poison_store(struct kmem_cache *s, 3169 const char *buf, size_t length) 3170 { 3171 if (any_slab_objects(s)) 3172 return -EBUSY; 3173 3174 s->flags &= ~SLAB_POISON; 3175 if (buf[0] == '1') 3176 s->flags |= SLAB_POISON; 3177 calculate_sizes(s); 3178 return length; 3179 } 3180 SLAB_ATTR(poison); 3181 3182 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3183 { 3184 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3185 } 3186 3187 static ssize_t store_user_store(struct kmem_cache *s, 3188 const char *buf, size_t length) 3189 { 3190 if (any_slab_objects(s)) 3191 return -EBUSY; 3192 3193 s->flags &= ~SLAB_STORE_USER; 3194 if (buf[0] == '1') 3195 s->flags |= SLAB_STORE_USER; 3196 calculate_sizes(s); 3197 return length; 3198 } 3199 SLAB_ATTR(store_user); 3200 3201 static 
ssize_t validate_show(struct kmem_cache *s, char *buf) 3202 { 3203 return 0; 3204 } 3205 3206 static ssize_t validate_store(struct kmem_cache *s, 3207 const char *buf, size_t length) 3208 { 3209 if (buf[0] == '1') 3210 validate_slab_cache(s); 3211 else 3212 return -EINVAL; 3213 return length; 3214 } 3215 SLAB_ATTR(validate); 3216 3217 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3218 { 3219 return 0; 3220 } 3221 3222 static ssize_t shrink_store(struct kmem_cache *s, 3223 const char *buf, size_t length) 3224 { 3225 if (buf[0] == '1') { 3226 int rc = kmem_cache_shrink(s); 3227 3228 if (rc) 3229 return rc; 3230 } else 3231 return -EINVAL; 3232 return length; 3233 } 3234 SLAB_ATTR(shrink); 3235 3236 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3237 { 3238 if (!(s->flags & SLAB_STORE_USER)) 3239 return -ENOSYS; 3240 return list_locations(s, buf, TRACK_ALLOC); 3241 } 3242 SLAB_ATTR_RO(alloc_calls); 3243 3244 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3245 { 3246 if (!(s->flags & SLAB_STORE_USER)) 3247 return -ENOSYS; 3248 return list_locations(s, buf, TRACK_FREE); 3249 } 3250 SLAB_ATTR_RO(free_calls); 3251 3252 #ifdef CONFIG_NUMA 3253 static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3254 { 3255 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3256 } 3257 3258 static ssize_t defrag_ratio_store(struct kmem_cache *s, 3259 const char *buf, size_t length) 3260 { 3261 int n = simple_strtoul(buf, NULL, 10); 3262 3263 if (n < 100) 3264 s->defrag_ratio = n * 10; 3265 return length; 3266 } 3267 SLAB_ATTR(defrag_ratio); 3268 #endif 3269 3270 static struct attribute * slab_attrs[] = { 3271 &slab_size_attr.attr, 3272 &object_size_attr.attr, 3273 &objs_per_slab_attr.attr, 3274 &order_attr.attr, 3275 &objects_attr.attr, 3276 &slabs_attr.attr, 3277 &partial_attr.attr, 3278 &cpu_slabs_attr.attr, 3279 &ctor_attr.attr, 3280 &dtor_attr.attr, 3281 &aliases_attr.attr, 3282 &align_attr.attr, 3283 &sanity_checks_attr.attr, 3284 &trace_attr.attr, 3285 &hwcache_align_attr.attr, 3286 &reclaim_account_attr.attr, 3287 &destroy_by_rcu_attr.attr, 3288 &red_zone_attr.attr, 3289 &poison_attr.attr, 3290 &store_user_attr.attr, 3291 &validate_attr.attr, 3292 &shrink_attr.attr, 3293 &alloc_calls_attr.attr, 3294 &free_calls_attr.attr, 3295 #ifdef CONFIG_ZONE_DMA 3296 &cache_dma_attr.attr, 3297 #endif 3298 #ifdef CONFIG_NUMA 3299 &defrag_ratio_attr.attr, 3300 #endif 3301 NULL 3302 }; 3303 3304 static struct attribute_group slab_attr_group = { 3305 .attrs = slab_attrs, 3306 }; 3307 3308 static ssize_t slab_attr_show(struct kobject *kobj, 3309 struct attribute *attr, 3310 char *buf) 3311 { 3312 struct slab_attribute *attribute; 3313 struct kmem_cache *s; 3314 int err; 3315 3316 attribute = to_slab_attr(attr); 3317 s = to_slab(kobj); 3318 3319 if (!attribute->show) 3320 return -EIO; 3321 3322 err = attribute->show(s, buf); 3323 3324 return err; 3325 } 3326 3327 static ssize_t slab_attr_store(struct kobject *kobj, 3328 struct attribute *attr, 3329 const char *buf, size_t len) 3330 { 3331 struct slab_attribute *attribute; 3332 struct kmem_cache *s; 3333 int err; 3334 3335 attribute = to_slab_attr(attr); 3336 s = to_slab(kobj); 3337 3338 if (!attribute->store) 3339 return -EIO; 3340 3341 err = attribute->store(s, buf, len); 3342 3343 return err; 3344 } 3345 3346 static struct sysfs_ops slab_sysfs_ops = { 3347 .show = slab_attr_show, 3348 .store = slab_attr_store, 3349 }; 3350 3351 static struct kobj_type slab_ktype = { 3352 .sysfs_ops = &slab_sysfs_ops, 3353 }; 3354 
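/*
 * Illustrative sketch (compiled out): how another read-only file would be
 * added with the SLAB_ATTR_RO machinery above. The "overhead" attribute is
 * hypothetical; to become visible under /sys/slab/<cache>/ it would also
 * need an entry in slab_attrs[].
 */
#if 0
static ssize_t overhead_show(struct kmem_cache *s, char *buf)
{
	/* Bytes of metadata and padding added to each object slot */
	return sprintf(buf, "%d\n", s->size - s->objsize);
}
SLAB_ATTR_RO(overhead);
#endif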
3355 static int uevent_filter(struct kset *kset, struct kobject *kobj) 3356 { 3357 struct kobj_type *ktype = get_ktype(kobj); 3358 3359 if (ktype == &slab_ktype) 3360 return 1; 3361 return 0; 3362 } 3363 3364 static struct kset_uevent_ops slab_uevent_ops = { 3365 .filter = uevent_filter, 3366 }; 3367 3368 decl_subsys(slab, &slab_ktype, &slab_uevent_ops); 3369 3370 #define ID_STR_LENGTH 64 3371 3372 /* Create a unique string id for a slab cache: 3373 * format 3374 * :[flags-]size:[memory address of kmemcache] 3375 */ 3376 static char *create_unique_id(struct kmem_cache *s) 3377 { 3378 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3379 char *p = name; 3380 3381 BUG_ON(!name); 3382 3383 *p++ = ':'; 3384 /* 3385 * First flags affecting slabcache operations. We will only 3386 * get here for aliasable slabs so we do not need to support 3387 * too many flags. The flags here must cover all flags that 3388 * are matched during merging to guarantee that the id is 3389 * unique. 3390 */ 3391 if (s->flags & SLAB_CACHE_DMA) 3392 *p++ = 'd'; 3393 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3394 *p++ = 'a'; 3395 if (s->flags & SLAB_DEBUG_FREE) 3396 *p++ = 'F'; 3397 if (p != name + 1) 3398 *p++ = '-'; 3399 p += sprintf(p, "%07d", s->size); 3400 BUG_ON(p > name + ID_STR_LENGTH - 1); 3401 return name; 3402 } 3403 3404 static int sysfs_slab_add(struct kmem_cache *s) 3405 { 3406 int err; 3407 const char *name; 3408 int unmergeable; 3409 3410 if (slab_state < SYSFS) 3411 /* Defer until later */ 3412 return 0; 3413 3414 unmergeable = slab_unmergeable(s); 3415 if (unmergeable) { 3416 /* 3417 * Slabcache can never be merged so we can use the name proper. 3418 * This is typically the case for debug situations. In that 3419 * case we can catch duplicate names easily. 3420 */ 3421 sysfs_remove_link(&slab_subsys.kset.kobj, s->name); 3422 name = s->name; 3423 } else { 3424 /* 3425 * Create a unique name for the slab as a target 3426 * for the symlinks. 3427 */ 3428 name = create_unique_id(s); 3429 } 3430 3431 kobj_set_kset_s(s, slab_subsys); 3432 kobject_set_name(&s->kobj, name); 3433 kobject_init(&s->kobj); 3434 err = kobject_add(&s->kobj); 3435 if (err) 3436 return err; 3437 3438 err = sysfs_create_group(&s->kobj, &slab_attr_group); 3439 if (err) 3440 return err; 3441 kobject_uevent(&s->kobj, KOBJ_ADD); 3442 if (!unmergeable) { 3443 /* Setup first alias */ 3444 sysfs_slab_alias(s, s->name); 3445 kfree(name); 3446 } 3447 return 0; 3448 } 3449 3450 static void sysfs_slab_remove(struct kmem_cache *s) 3451 { 3452 kobject_uevent(&s->kobj, KOBJ_REMOVE); 3453 kobject_del(&s->kobj); 3454 } 3455 3456 /* 3457 * Need to buffer aliases during bootup until sysfs becomes 3458 * available lest we loose that information. 3459 */ 3460 struct saved_alias { 3461 struct kmem_cache *s; 3462 const char *name; 3463 struct saved_alias *next; 3464 }; 3465 3466 struct saved_alias *alias_list; 3467 3468 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 3469 { 3470 struct saved_alias *al; 3471 3472 if (slab_state == SYSFS) { 3473 /* 3474 * If we have a leftover link then remove it. 
3475 */ 3476 sysfs_remove_link(&slab_subsys.kset.kobj, name); 3477 return sysfs_create_link(&slab_subsys.kset.kobj, 3478 &s->kobj, name); 3479 } 3480 3481 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 3482 if (!al) 3483 return -ENOMEM; 3484 3485 al->s = s; 3486 al->name = name; 3487 al->next = alias_list; 3488 alias_list = al; 3489 return 0; 3490 } 3491 3492 static int __init slab_sysfs_init(void) 3493 { 3494 int err; 3495 3496 err = subsystem_register(&slab_subsys); 3497 if (err) { 3498 printk(KERN_ERR "Cannot register slab subsystem.\n"); 3499 return -ENOSYS; 3500 } 3501 3502 finish_bootstrap(); 3503 3504 while (alias_list) { 3505 struct saved_alias *al = alias_list; 3506 3507 alias_list = alias_list->next; 3508 err = sysfs_slab_alias(al->s, al->name); 3509 BUG_ON(err); 3510 kfree(al); 3511 } 3512 3513 resiliency_test(); 3514 return 0; 3515 } 3516 3517 __initcall(slab_sysfs_init); 3518 #else 3519 __initcall(finish_bootstrap); 3520 #endif 3521
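/*
 * Illustrative sketch (compiled out): typical use of the cache interface
 * implemented in this file from another subsystem. The struct, cache name
 * and function names are made up for the example.
 */
#if 0
struct example_object {
	int id;
	struct list_head list;
};

static struct kmem_cache *example_cache;

static int __init example_cache_init(void)
{
	struct example_object *obj;

	/*
	 * No ctor/dtor and no debug flags, so find_mergeable() may fold
	 * this cache into an existing one of compatible size.
	 */
	example_cache = kmem_cache_create("example_object",
				sizeof(struct example_object), 0,
				SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!example_cache)
		return -ENOMEM;

	obj = kmem_cache_zalloc(example_cache, GFP_KERNEL);
	if (!obj)
		return -ENOMEM;

	obj->id = 1;
	kmem_cache_free(example_cache, obj);
	return 0;
}
#endif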