/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks and only
 * uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/kallsyms.h>

/*
 * Lock order:
 *   1. slab_lock(page)
 *   2. slab->list_lock
 *
 * The slab_lock protects operations on the objects of a particular
 * slab and its metadata in the page struct. If the slab lock
 * has been taken then no allocations nor frees can be performed
 * on the objects in the slab nor can the slab be added or removed
 * from the partial or full lists since this would mean modifying
 * the page struct of the slab.
 *
 * The list_lock protects the partial and full list on each node and
 * the partial slab counter. If taken then no new slabs may be added or
 * removed from the lists nor can the number of partial slabs be modified.
 * (Note that the total number of slabs is an atomic value that may be
 * modified without taking the list lock).
 *
 * The list_lock is a centralized lock and thus we avoid taking it as
 * much as possible. As long as SLUB does not have to handle partial
 * slabs, operations can continue without any centralized lock. F.e.
 * allocating a long series of objects that fill up slabs does not require
 * the list lock.
 *
 * The lock order is sometimes inverted when we are trying to get a slab
 * off a list. We take the list_lock and then look for a page on the list
 * to use. While we do that objects in the slabs may be freed. We can
 * only operate on the slab if we have also taken the slab_lock. So we use
 * a slab_trylock() on the slab. If trylock was successful then no frees
 * can occur anymore and we can use the slab for allocations etc. If the
 * slab_trylock() does not succeed then frees are in progress in the slab and
 * we must stay away from it for a while since we may cause a bouncing
 * cacheline if we try to acquire the lock. So go onto the next slab.
 * If all pages are busy then we may allocate a new slab instead of reusing
 * a partial slab. A new slab has no one operating on it and thus there is
 * no danger of cacheline contention.
 *
 * Interrupts are disabled during allocation and deallocation in order to
 * make the slab allocator safe to use in the context of an irq. In addition
 * interrupts are disabled to ensure that the processor does not change
 * while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup is
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			lockless_freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */
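/*
 * A condensed sketch of the inverted lock order described above: scan the
 * partial list under list_lock and only touch a slab whose slab_lock can be
 * taken without waiting. The actual implementation is get_partial_node() and
 * lock_and_freeze_slab() further down in this file; this is only an
 * illustration of the pattern and omits error handling.
 *
 *	spin_lock(&n->list_lock);
 *	list_for_each_entry(page, &n->partial, lru)
 *		if (slab_trylock(page)) {
 *			list_del(&page->lru);
 *			n->nr_partial--;
 *			SetSlabFrozen(page);
 *			spin_unlock(&n->list_lock);
 *			return page;
 *		}
 *	spin_unlock(&n->list_lock);
 *	return NULL;
 */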
#define FROZEN (1 << PG_active)

#ifdef CONFIG_SLUB_DEBUG
#define SLABDEBUG (1 << PG_error)
#else
#define SLABDEBUG 0
#endif

static inline int SlabFrozen(struct page *page)
{
	return page->flags & FROZEN;
}

static inline void SetSlabFrozen(struct page *page)
{
	page->flags |= FROZEN;
}

static inline void ClearSlabFrozen(struct page *page)
{
	page->flags &= ~FROZEN;
}

static inline int SlabDebug(struct page *page)
{
	return page->flags & SLABDEBUG;
}

static inline void SetSlabDebug(struct page *page)
{
	page->flags |= SLABDEBUG;
}

static inline void ClearSlabDebug(struct page *page)
{
	page->flags &= ~SLABDEBUG;
}

/*
 * Issues still to be resolved:
 *
 * - The per cpu array is updated for each new slab and is a remote
 *   cacheline for most nodes. This could become a bouncing cacheline given
 *   enough frequent updates. There are 16 pointers in a cacheline, so at
 *   max 16 cpus could compete for the cacheline which may be okay.
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

#if PAGE_SHIFT <= 12

/*
 * Small page size. Make sure that we do not fragment memory
 */
#define DEFAULT_MAX_ORDER 1
#define DEFAULT_MIN_OBJECTS 4

#else

/*
 * Large page machines are customarily able to handle larger
 * page orders.
 */
#define DEFAULT_MAX_ORDER 2
#define DEFAULT_MIN_OBJECTS 8

#endif

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 2

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
185 */ 186 #define MAX_PARTIAL 10 187 188 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 189 SLAB_POISON | SLAB_STORE_USER) 190 191 /* 192 * Set of flags that will prevent slab merging 193 */ 194 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 195 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 196 197 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 198 SLAB_CACHE_DMA) 199 200 #ifndef ARCH_KMALLOC_MINALIGN 201 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 202 #endif 203 204 #ifndef ARCH_SLAB_MINALIGN 205 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 206 #endif 207 208 /* 209 * The page->inuse field is 16 bit thus we have this limitation 210 */ 211 #define MAX_OBJECTS_PER_SLAB 65535 212 213 /* Internal SLUB flags */ 214 #define __OBJECT_POISON 0x80000000 /* Poison object */ 215 216 /* Not all arches define cache_line_size */ 217 #ifndef cache_line_size 218 #define cache_line_size() L1_CACHE_BYTES 219 #endif 220 221 static int kmem_size = sizeof(struct kmem_cache); 222 223 #ifdef CONFIG_SMP 224 static struct notifier_block slab_notifier; 225 #endif 226 227 static enum { 228 DOWN, /* No slab functionality available */ 229 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 230 UP, /* Everything works but does not show up in sysfs */ 231 SYSFS /* Sysfs up */ 232 } slab_state = DOWN; 233 234 /* A list of all slab caches on the system */ 235 static DECLARE_RWSEM(slub_lock); 236 static LIST_HEAD(slab_caches); 237 238 /* 239 * Tracking user of a slab. 240 */ 241 struct track { 242 void *addr; /* Called from address */ 243 int cpu; /* Was running on cpu */ 244 int pid; /* Pid context */ 245 unsigned long when; /* When did the operation occur */ 246 }; 247 248 enum track_item { TRACK_ALLOC, TRACK_FREE }; 249 250 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 251 static int sysfs_slab_add(struct kmem_cache *); 252 static int sysfs_slab_alias(struct kmem_cache *, const char *); 253 static void sysfs_slab_remove(struct kmem_cache *); 254 #else 255 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 256 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 257 { return 0; } 258 static inline void sysfs_slab_remove(struct kmem_cache *s) {} 259 #endif 260 261 /******************************************************************** 262 * Core slab cache functions 263 *******************************************************************/ 264 265 int slab_is_available(void) 266 { 267 return slab_state >= UP; 268 } 269 270 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 271 { 272 #ifdef CONFIG_NUMA 273 return s->node[node]; 274 #else 275 return &s->local_node; 276 #endif 277 } 278 279 static inline int check_valid_pointer(struct kmem_cache *s, 280 struct page *page, const void *object) 281 { 282 void *base; 283 284 if (!object) 285 return 1; 286 287 base = page_address(page); 288 if (object < base || object >= base + s->objects * s->size || 289 (object - base) % s->size) { 290 return 0; 291 } 292 293 return 1; 294 } 295 296 /* 297 * Slow version of get and set free pointer. 298 * 299 * This version requires touching the cache lines of kmem_cache which 300 * we avoid to do in the fast alloc free paths. There we obtain the offset 301 * from the page struct. 
302 */ 303 static inline void *get_freepointer(struct kmem_cache *s, void *object) 304 { 305 return *(void **)(object + s->offset); 306 } 307 308 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 309 { 310 *(void **)(object + s->offset) = fp; 311 } 312 313 /* Loop over all objects in a slab */ 314 #define for_each_object(__p, __s, __addr) \ 315 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ 316 __p += (__s)->size) 317 318 /* Scan freelist */ 319 #define for_each_free_object(__p, __s, __free) \ 320 for (__p = (__free); __p; __p = get_freepointer((__s), __p)) 321 322 /* Determine object index from a given position */ 323 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 324 { 325 return (p - addr) / s->size; 326 } 327 328 #ifdef CONFIG_SLUB_DEBUG 329 /* 330 * Debug settings: 331 */ 332 #ifdef CONFIG_SLUB_DEBUG_ON 333 static int slub_debug = DEBUG_DEFAULT_FLAGS; 334 #else 335 static int slub_debug; 336 #endif 337 338 static char *slub_debug_slabs; 339 340 /* 341 * Object debugging 342 */ 343 static void print_section(char *text, u8 *addr, unsigned int length) 344 { 345 int i, offset; 346 int newline = 1; 347 char ascii[17]; 348 349 ascii[16] = 0; 350 351 for (i = 0; i < length; i++) { 352 if (newline) { 353 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 354 newline = 0; 355 } 356 printk(" %02x", addr[i]); 357 offset = i % 16; 358 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 359 if (offset == 15) { 360 printk(" %s\n",ascii); 361 newline = 1; 362 } 363 } 364 if (!newline) { 365 i %= 16; 366 while (i < 16) { 367 printk(" "); 368 ascii[i] = ' '; 369 i++; 370 } 371 printk(" %s\n", ascii); 372 } 373 } 374 375 static struct track *get_track(struct kmem_cache *s, void *object, 376 enum track_item alloc) 377 { 378 struct track *p; 379 380 if (s->offset) 381 p = object + s->offset + sizeof(void *); 382 else 383 p = object + s->inuse; 384 385 return p + alloc; 386 } 387 388 static void set_track(struct kmem_cache *s, void *object, 389 enum track_item alloc, void *addr) 390 { 391 struct track *p; 392 393 if (s->offset) 394 p = object + s->offset + sizeof(void *); 395 else 396 p = object + s->inuse; 397 398 p += alloc; 399 if (addr) { 400 p->addr = addr; 401 p->cpu = smp_processor_id(); 402 p->pid = current ? current->pid : -1; 403 p->when = jiffies; 404 } else 405 memset(p, 0, sizeof(struct track)); 406 } 407 408 static void init_tracking(struct kmem_cache *s, void *object) 409 { 410 if (!(s->flags & SLAB_STORE_USER)) 411 return; 412 413 set_track(s, object, TRACK_FREE, NULL); 414 set_track(s, object, TRACK_ALLOC, NULL); 415 } 416 417 static void print_track(const char *s, struct track *t) 418 { 419 if (!t->addr) 420 return; 421 422 printk(KERN_ERR "INFO: %s in ", s); 423 __print_symbol("%s", (unsigned long)t->addr); 424 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); 425 } 426 427 static void print_tracking(struct kmem_cache *s, void *object) 428 { 429 if (!(s->flags & SLAB_STORE_USER)) 430 return; 431 432 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 433 print_track("Freed", get_track(s, object, TRACK_FREE)); 434 } 435 436 static void print_page_info(struct page *page) 437 { 438 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", 439 page, page->inuse, page->freelist, page->flags); 440 441 } 442 443 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
444 { 445 va_list args; 446 char buf[100]; 447 448 va_start(args, fmt); 449 vsnprintf(buf, sizeof(buf), fmt, args); 450 va_end(args); 451 printk(KERN_ERR "========================================" 452 "=====================================\n"); 453 printk(KERN_ERR "BUG %s: %s\n", s->name, buf); 454 printk(KERN_ERR "----------------------------------------" 455 "-------------------------------------\n\n"); 456 } 457 458 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 459 { 460 va_list args; 461 char buf[100]; 462 463 va_start(args, fmt); 464 vsnprintf(buf, sizeof(buf), fmt, args); 465 va_end(args); 466 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 467 } 468 469 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 470 { 471 unsigned int off; /* Offset of last byte */ 472 u8 *addr = page_address(page); 473 474 print_tracking(s, p); 475 476 print_page_info(page); 477 478 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 479 p, p - addr, get_freepointer(s, p)); 480 481 if (p > addr + 16) 482 print_section("Bytes b4", p - 16, 16); 483 484 print_section("Object", p, min(s->objsize, 128)); 485 486 if (s->flags & SLAB_RED_ZONE) 487 print_section("Redzone", p + s->objsize, 488 s->inuse - s->objsize); 489 490 if (s->offset) 491 off = s->offset + sizeof(void *); 492 else 493 off = s->inuse; 494 495 if (s->flags & SLAB_STORE_USER) 496 off += 2 * sizeof(struct track); 497 498 if (off != s->size) 499 /* Beginning of the filler is the free pointer */ 500 print_section("Padding", p + off, s->size - off); 501 502 dump_stack(); 503 } 504 505 static void object_err(struct kmem_cache *s, struct page *page, 506 u8 *object, char *reason) 507 { 508 slab_bug(s, reason); 509 print_trailer(s, page, object); 510 } 511 512 static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 513 { 514 va_list args; 515 char buf[100]; 516 517 va_start(args, fmt); 518 vsnprintf(buf, sizeof(buf), fmt, args); 519 va_end(args); 520 slab_bug(s, fmt); 521 print_page_info(page); 522 dump_stack(); 523 } 524 525 static void init_object(struct kmem_cache *s, void *object, int active) 526 { 527 u8 *p = object; 528 529 if (s->flags & __OBJECT_POISON) { 530 memset(p, POISON_FREE, s->objsize - 1); 531 p[s->objsize -1] = POISON_END; 532 } 533 534 if (s->flags & SLAB_RED_ZONE) 535 memset(p + s->objsize, 536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, 537 s->inuse - s->objsize); 538 } 539 540 static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 541 { 542 while (bytes) { 543 if (*start != (u8)value) 544 return start; 545 start++; 546 bytes--; 547 } 548 return NULL; 549 } 550 551 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 552 void *from, void *to) 553 { 554 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 555 memset(from, data, to - from); 556 } 557 558 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 559 u8 *object, char *what, 560 u8* start, unsigned int value, unsigned int bytes) 561 { 562 u8 *fault; 563 u8 *end; 564 565 fault = check_bytes(start, value, bytes); 566 if (!fault) 567 return 1; 568 569 end = start + bytes; 570 while (end > fault && end[-1] == value) 571 end--; 572 573 slab_bug(s, "%s overwritten", what); 574 printk(KERN_ERR "INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n",
					fault, end - 1, fault[0], value);
	print_trailer(s, page, object);

	restore_bytes(s, what, value, fault, end);
	return 0;
}

/*
 * Object layout:
 *
 * object address
 *	Bytes of the object to be managed.
 *	If the freepointer may overlay the object then the free
 *	pointer is the first word of the object.
 *
 *	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 *	0xa5 (POISON_END)
 *
 * object + s->objsize
 *	Padding to reach word boundary. This is also used for Redzoning.
 *	Padding is extended by another word if Redzoning is enabled and
 *	objsize == inuse.
 *
 *	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
 *	0xcc (RED_ACTIVE) for objects in use.
 *
 * object + s->inuse
 *	Meta data starts here.
 *
 *	A. Free pointer (if we cannot overwrite object on free)
 *	B. Tracking data for SLAB_STORE_USER
 *	C. Padding to reach required alignment boundary or at minimum
 *		one word if debugging is on to be able to detect writes
 *		before the word boundary.
 *
 *	Padding is done using 0x5a (POISON_INUSE)
 *
 * object + s->size
 *	Nothing is used beyond s->size.
 *
 * If slabcaches are merged then the objsize and inuse boundaries are mostly
 * ignored. And therefore no slab options that rely on these boundaries
 * may be used with merged slabcaches.
 */

static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
	unsigned long off = s->inuse;	/* The end of info */

	if (s->offset)
		/* Freepointer is placed after the object. */
		off += sizeof(void *);

	if (s->flags & SLAB_STORE_USER)
		/* We also have user information there */
		off += 2 * sizeof(struct track);

	if (s->size == off)
		return 1;

	return check_bytes_and_report(s, page, p, "Object padding",
			p + off, POISON_INUSE, s->size - off);
}

static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
	u8 *start;
	u8 *fault;
	u8 *end;
	int length;
	int remainder;

	if (!(s->flags & SLAB_POISON))
		return 1;

	start = page_address(page);
	end = start + (PAGE_SIZE << s->order);
	length = s->objects * s->size;
	remainder = end - (start + length);
	if (!remainder)
		return 1;

	fault = check_bytes(start + length, POISON_INUSE, remainder);
	if (!fault)
		return 1;
	while (end > fault && end[-1] == POISON_INUSE)
		end--;

	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
	print_section("Padding", start, length);

	restore_bytes(s, "slab padding", POISON_INUSE, start, end);
	return 0;
}
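/*
 * A rough worked example of the layout above, with hypothetical numbers
 * (64-bit kernel, default alignment; see calculate_sizes() further down):
 * a cache with objsize = 40 and SLAB_RED_ZONE | SLAB_POISON |
 * SLAB_STORE_USER would end up with
 *
 *	bytes   0.. 39	object (poisoned with 0x6b on free, last byte 0xa5)
 *	bytes  40.. 47	red zone word (objsize already equals the
 *			word-aligned size, so one word is added)
 *	bytes  48.. 55	free pointer (s->offset = 48, s->inuse = 48)
 *	bytes  56..103	two struct track records (alloc and free)
 *	bytes 104..111	one word of padding filled with 0x5a
 *
 * giving s->size = 112. The exact numbers depend on the architecture and
 * flags; they are only meant to make the boundaries above concrete.
 */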
static int check_object(struct kmem_cache *s, struct page *page,
					void *object, int active)
{
	u8 *p = object;
	u8 *endobject = object + s->objsize;

	if (s->flags & SLAB_RED_ZONE) {
		unsigned int red = active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;

		if (!check_bytes_and_report(s, page, object, "Redzone",
			endobject, red, s->inuse - s->objsize))
			return 0;
	} else {
		if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
			check_bytes_and_report(s, page, p, "Alignment padding", endobject,
				POISON_INUSE, s->inuse - s->objsize);
	}

	if (s->flags & SLAB_POISON) {
		if (!active && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, page, p, "Poison", p,
					POISON_FREE, s->objsize - 1) ||
			 !check_bytes_and_report(s, page, p, "Poison",
				p + s->objsize - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, page, p);
	}

	if (!s->offset && active)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
		object_err(s, page, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}

static int check_slab(struct kmem_cache *s, struct page *page)
{
	VM_BUG_ON(!irqs_disabled());

	if (!PageSlab(page)) {
		slab_err(s, page, "Not a valid slab page");
		return 0;
	}
	if (page->offset * sizeof(void *) != s->offset) {
		slab_err(s, page, "Corrupted offset %lu",
			(unsigned long)(page->offset * sizeof(void *)));
		return 0;
	}
	if (page->inuse > s->objects) {
		slab_err(s, page, "inuse %u > max %u",
			page->inuse, s->objects);
		return 0;
	}
	/* Slab_pad_check fixes things up after itself */
	slab_pad_check(s, page);
	return 1;
}

/*
 * Determine if a certain object on a page is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 */
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
	int nr = 0;
	void *fp = page->freelist;
	void *object = NULL;

	while (fp && nr <= s->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, page, fp)) {
			if (object) {
				object_err(s, page, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				slab_err(s, page, "Freepointer corrupt");
				page->freelist = NULL;
				page->inuse = s->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	if (page->inuse != s->objects - nr) {
		slab_err(s, page, "Wrong object count. Counter is %d but "
			"counted were %d", page->inuse, s->objects - nr);
		page->inuse = s->objects - nr;
		slab_fix(s, "Object count adjusted.");
	}
	return search == NULL;
}

static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
{
	if (s->flags & SLAB_TRACE) {
		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
			s->name,
			alloc ? "alloc" : "free",
			object, page->inuse,
			page->freelist);

		if (!alloc)
			print_section("Object", (void *)object, s->objsize);

		dump_stack();
	}
}

/*
 * Tracking of fully allocated slabs for debugging purposes.
806 */ 807 static void add_full(struct kmem_cache_node *n, struct page *page) 808 { 809 spin_lock(&n->list_lock); 810 list_add(&page->lru, &n->full); 811 spin_unlock(&n->list_lock); 812 } 813 814 static void remove_full(struct kmem_cache *s, struct page *page) 815 { 816 struct kmem_cache_node *n; 817 818 if (!(s->flags & SLAB_STORE_USER)) 819 return; 820 821 n = get_node(s, page_to_nid(page)); 822 823 spin_lock(&n->list_lock); 824 list_del(&page->lru); 825 spin_unlock(&n->list_lock); 826 } 827 828 static void setup_object_debug(struct kmem_cache *s, struct page *page, 829 void *object) 830 { 831 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 832 return; 833 834 init_object(s, object, 0); 835 init_tracking(s, object); 836 } 837 838 static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 839 void *object, void *addr) 840 { 841 if (!check_slab(s, page)) 842 goto bad; 843 844 if (object && !on_freelist(s, page, object)) { 845 object_err(s, page, object, "Object already allocated"); 846 goto bad; 847 } 848 849 if (!check_valid_pointer(s, page, object)) { 850 object_err(s, page, object, "Freelist Pointer check fails"); 851 goto bad; 852 } 853 854 if (object && !check_object(s, page, object, 0)) 855 goto bad; 856 857 /* Success perform special debug activities for allocs */ 858 if (s->flags & SLAB_STORE_USER) 859 set_track(s, object, TRACK_ALLOC, addr); 860 trace(s, page, object, 1); 861 init_object(s, object, 1); 862 return 1; 863 864 bad: 865 if (PageSlab(page)) { 866 /* 867 * If this is a slab page then lets do the best we can 868 * to avoid issues in the future. Marking all objects 869 * as used avoids touching the remaining objects. 870 */ 871 slab_fix(s, "Marking all objects used"); 872 page->inuse = s->objects; 873 page->freelist = NULL; 874 /* Fix up fields that may be corrupted */ 875 page->offset = s->offset / sizeof(void *); 876 } 877 return 0; 878 } 879 880 static int free_debug_processing(struct kmem_cache *s, struct page *page, 881 void *object, void *addr) 882 { 883 if (!check_slab(s, page)) 884 goto fail; 885 886 if (!check_valid_pointer(s, page, object)) { 887 slab_err(s, page, "Invalid object pointer 0x%p", object); 888 goto fail; 889 } 890 891 if (on_freelist(s, page, object)) { 892 object_err(s, page, object, "Object already free"); 893 goto fail; 894 } 895 896 if (!check_object(s, page, object, 1)) 897 return 0; 898 899 if (unlikely(s != page->slab)) { 900 if (!PageSlab(page)) 901 slab_err(s, page, "Attempt to free object(0x%p) " 902 "outside of slab", object); 903 else 904 if (!page->slab) { 905 printk(KERN_ERR 906 "SLUB <none>: no slab for object 0x%p.\n", 907 object); 908 dump_stack(); 909 } 910 else 911 object_err(s, page, object, 912 "page slab pointer corrupt."); 913 goto fail; 914 } 915 916 /* Special debug activities for freeing objects */ 917 if (!SlabFrozen(page) && !page->freelist) 918 remove_full(s, page); 919 if (s->flags & SLAB_STORE_USER) 920 set_track(s, object, TRACK_FREE, addr); 921 trace(s, page, object, 0); 922 init_object(s, object, 0); 923 return 1; 924 925 fail: 926 slab_fix(s, "Object at 0x%p not freed", object); 927 return 0; 928 } 929 930 static int __init setup_slub_debug(char *str) 931 { 932 slub_debug = DEBUG_DEFAULT_FLAGS; 933 if (*str++ != '=' || !*str) 934 /* 935 * No options specified. Switch on full debugging. 936 */ 937 goto out; 938 939 if (*str == ',') 940 /* 941 * No options but restriction on slabs. This means full 942 * debugging for slabs matching a pattern. 
943 */ 944 goto check_slabs; 945 946 slub_debug = 0; 947 if (*str == '-') 948 /* 949 * Switch off all debugging measures. 950 */ 951 goto out; 952 953 /* 954 * Determine which debug features should be switched on 955 */ 956 for ( ;*str && *str != ','; str++) { 957 switch (tolower(*str)) { 958 case 'f': 959 slub_debug |= SLAB_DEBUG_FREE; 960 break; 961 case 'z': 962 slub_debug |= SLAB_RED_ZONE; 963 break; 964 case 'p': 965 slub_debug |= SLAB_POISON; 966 break; 967 case 'u': 968 slub_debug |= SLAB_STORE_USER; 969 break; 970 case 't': 971 slub_debug |= SLAB_TRACE; 972 break; 973 default: 974 printk(KERN_ERR "slub_debug option '%c' " 975 "unknown. skipped\n",*str); 976 } 977 } 978 979 check_slabs: 980 if (*str == ',') 981 slub_debug_slabs = str + 1; 982 out: 983 return 1; 984 } 985 986 __setup("slub_debug", setup_slub_debug); 987 988 static void kmem_cache_open_debug_check(struct kmem_cache *s) 989 { 990 /* 991 * The page->offset field is only 16 bit wide. This is an offset 992 * in units of words from the beginning of an object. If the slab 993 * size is bigger then we cannot move the free pointer behind the 994 * object anymore. 995 * 996 * On 32 bit platforms the limit is 256k. On 64bit platforms 997 * the limit is 512k. 998 * 999 * Debugging or ctor may create a need to move the free 1000 * pointer. Fail if this happens. 1001 */ 1002 if (s->objsize >= 65535 * sizeof(void *)) { 1003 BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | 1004 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); 1005 BUG_ON(s->ctor); 1006 } 1007 else 1008 /* 1009 * Enable debugging if selected on the kernel commandline. 1010 */ 1011 if (slub_debug && (!slub_debug_slabs || 1012 strncmp(slub_debug_slabs, s->name, 1013 strlen(slub_debug_slabs)) == 0)) 1014 s->flags |= slub_debug; 1015 } 1016 #else 1017 static inline void setup_object_debug(struct kmem_cache *s, 1018 struct page *page, void *object) {} 1019 1020 static inline int alloc_debug_processing(struct kmem_cache *s, 1021 struct page *page, void *object, void *addr) { return 0; } 1022 1023 static inline int free_debug_processing(struct kmem_cache *s, 1024 struct page *page, void *object, void *addr) { return 0; } 1025 1026 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1027 { return 1; } 1028 static inline int check_object(struct kmem_cache *s, struct page *page, 1029 void *object, int active) { return 1; } 1030 static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1031 static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} 1032 #define slub_debug 0 1033 #endif 1034 /* 1035 * Slab allocation and freeing 1036 */ 1037 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1038 { 1039 struct page * page; 1040 int pages = 1 << s->order; 1041 1042 if (s->order) 1043 flags |= __GFP_COMP; 1044 1045 if (s->flags & SLAB_CACHE_DMA) 1046 flags |= SLUB_DMA; 1047 1048 if (node == -1) 1049 page = alloc_pages(flags, s->order); 1050 else 1051 page = alloc_pages_node(node, flags, s->order); 1052 1053 if (!page) 1054 return NULL; 1055 1056 mod_zone_page_state(page_zone(page), 1057 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1058 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1059 pages); 1060 1061 return page; 1062 } 1063 1064 static void setup_object(struct kmem_cache *s, struct page *page, 1065 void *object) 1066 { 1067 setup_object_debug(s, page, object); 1068 if (unlikely(s->ctor)) 1069 s->ctor(object, s, 0); 1070 } 1071 1072 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1073 { 1074 struct page *page; 1075 struct kmem_cache_node *n; 1076 void *start; 1077 void *end; 1078 void *last; 1079 void *p; 1080 1081 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 1082 1083 if (flags & __GFP_WAIT) 1084 local_irq_enable(); 1085 1086 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 1087 if (!page) 1088 goto out; 1089 1090 n = get_node(s, page_to_nid(page)); 1091 if (n) 1092 atomic_long_inc(&n->nr_slabs); 1093 page->offset = s->offset / sizeof(void *); 1094 page->slab = s; 1095 page->flags |= 1 << PG_slab; 1096 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1097 SLAB_STORE_USER | SLAB_TRACE)) 1098 SetSlabDebug(page); 1099 1100 start = page_address(page); 1101 end = start + s->objects * s->size; 1102 1103 if (unlikely(s->flags & SLAB_POISON)) 1104 memset(start, POISON_INUSE, PAGE_SIZE << s->order); 1105 1106 last = start; 1107 for_each_object(p, s, start) { 1108 setup_object(s, page, last); 1109 set_freepointer(s, last, p); 1110 last = p; 1111 } 1112 setup_object(s, page, last); 1113 set_freepointer(s, last, NULL); 1114 1115 page->freelist = start; 1116 page->lockless_freelist = NULL; 1117 page->inuse = 0; 1118 out: 1119 if (flags & __GFP_WAIT) 1120 local_irq_disable(); 1121 return page; 1122 } 1123 1124 static void __free_slab(struct kmem_cache *s, struct page *page) 1125 { 1126 int pages = 1 << s->order; 1127 1128 if (unlikely(SlabDebug(page))) { 1129 void *p; 1130 1131 slab_pad_check(s, page); 1132 for_each_object(p, s, page_address(page)) 1133 check_object(s, page, p, 0); 1134 } 1135 1136 mod_zone_page_state(page_zone(page), 1137 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1138 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1139 - pages); 1140 1141 page->mapping = NULL; 1142 __free_pages(page, s->order); 1143 } 1144 1145 static void rcu_free_slab(struct rcu_head *h) 1146 { 1147 struct page *page; 1148 1149 page = container_of((struct list_head *)h, struct page, lru); 1150 __free_slab(page->slab, page); 1151 } 1152 1153 static void free_slab(struct kmem_cache *s, struct page *page) 1154 { 1155 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1156 /* 1157 * RCU free overloads the RCU head over the LRU 1158 */ 1159 struct rcu_head *head = (void *)&page->lru; 1160 1161 call_rcu(head, rcu_free_slab); 1162 } else 1163 __free_slab(s, page); 1164 } 1165 1166 static void discard_slab(struct kmem_cache *s, struct page *page) 1167 { 1168 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1169 1170 atomic_long_dec(&n->nr_slabs); 1171 reset_page_mapcount(page); 1172 ClearSlabDebug(page); 1173 __ClearPageSlab(page); 1174 free_slab(s, page); 1175 } 1176 1177 /* 1178 * Per slab locking using the pagelock 1179 */ 1180 static __always_inline void slab_lock(struct page *page) 1181 { 1182 bit_spin_lock(PG_locked, &page->flags); 1183 } 1184 1185 static __always_inline void slab_unlock(struct page *page) 1186 { 1187 bit_spin_unlock(PG_locked, &page->flags); 1188 } 1189 1190 static __always_inline int slab_trylock(struct page *page) 1191 { 1192 int rc = 1; 1193 1194 rc = bit_spin_trylock(PG_locked, &page->flags); 1195 return rc; 1196 } 1197 1198 /* 1199 * Management of partially allocated slabs 1200 */ 1201 static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 1202 { 1203 spin_lock(&n->list_lock); 1204 n->nr_partial++; 1205 list_add_tail(&page->lru, &n->partial); 1206 spin_unlock(&n->list_lock); 1207 } 1208 1209 static void add_partial(struct kmem_cache_node *n, struct page *page) 1210 { 1211 spin_lock(&n->list_lock); 1212 n->nr_partial++; 1213 list_add(&page->lru, &n->partial); 1214 spin_unlock(&n->list_lock); 1215 } 1216 1217 static void remove_partial(struct kmem_cache *s, 1218 struct page *page) 1219 { 1220 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1221 1222 spin_lock(&n->list_lock); 1223 list_del(&page->lru); 1224 n->nr_partial--; 1225 spin_unlock(&n->list_lock); 1226 } 1227 1228 /* 1229 * Lock slab and remove from the partial list. 1230 * 1231 * Must hold list_lock. 1232 */ 1233 static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) 1234 { 1235 if (slab_trylock(page)) { 1236 list_del(&page->lru); 1237 n->nr_partial--; 1238 SetSlabFrozen(page); 1239 return 1; 1240 } 1241 return 0; 1242 } 1243 1244 /* 1245 * Try to allocate a partial slab from a specific node. 1246 */ 1247 static struct page *get_partial_node(struct kmem_cache_node *n) 1248 { 1249 struct page *page; 1250 1251 /* 1252 * Racy check. If we mistakenly see no partial slabs then we 1253 * just allocate an empty slab. If we mistakenly try to get a 1254 * partial slab and there is none available then get_partials() 1255 * will return NULL. 1256 */ 1257 if (!n || !n->nr_partial) 1258 return NULL; 1259 1260 spin_lock(&n->list_lock); 1261 list_for_each_entry(page, &n->partial, lru) 1262 if (lock_and_freeze_slab(n, page)) 1263 goto out; 1264 page = NULL; 1265 out: 1266 spin_unlock(&n->list_lock); 1267 return page; 1268 } 1269 1270 /* 1271 * Get a page from somewhere. Search in increasing NUMA distances. 
1272 */ 1273 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1274 { 1275 #ifdef CONFIG_NUMA 1276 struct zonelist *zonelist; 1277 struct zone **z; 1278 struct page *page; 1279 1280 /* 1281 * The defrag ratio allows a configuration of the tradeoffs between 1282 * inter node defragmentation and node local allocations. A lower 1283 * defrag_ratio increases the tendency to do local allocations 1284 * instead of attempting to obtain partial slabs from other nodes. 1285 * 1286 * If the defrag_ratio is set to 0 then kmalloc() always 1287 * returns node local objects. If the ratio is higher then kmalloc() 1288 * may return off node objects because partial slabs are obtained 1289 * from other nodes and filled up. 1290 * 1291 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes 1292 * defrag_ratio = 1000) then every (well almost) allocation will 1293 * first attempt to defrag slab caches on other nodes. This means 1294 * scanning over all nodes to look for partial slabs which may be 1295 * expensive if we do it every time we are trying to find a slab 1296 * with available objects. 1297 */ 1298 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1299 return NULL; 1300 1301 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1302 ->node_zonelists[gfp_zone(flags)]; 1303 for (z = zonelist->zones; *z; z++) { 1304 struct kmem_cache_node *n; 1305 1306 n = get_node(s, zone_to_nid(*z)); 1307 1308 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1309 n->nr_partial > MIN_PARTIAL) { 1310 page = get_partial_node(n); 1311 if (page) 1312 return page; 1313 } 1314 } 1315 #endif 1316 return NULL; 1317 } 1318 1319 /* 1320 * Get a partial page, lock it and return it. 1321 */ 1322 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1323 { 1324 struct page *page; 1325 int searchnode = (node == -1) ? numa_node_id() : node; 1326 1327 page = get_partial_node(get_node(s, searchnode)); 1328 if (page || (flags & __GFP_THISNODE)) 1329 return page; 1330 1331 return get_any_partial(s, flags); 1332 } 1333 1334 /* 1335 * Move a page back to the lists. 1336 * 1337 * Must be called with the slab lock held. 1338 * 1339 * On exit the slab lock will have been dropped. 1340 */ 1341 static void unfreeze_slab(struct kmem_cache *s, struct page *page) 1342 { 1343 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1344 1345 ClearSlabFrozen(page); 1346 if (page->inuse) { 1347 1348 if (page->freelist) 1349 add_partial(n, page); 1350 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1351 add_full(n, page); 1352 slab_unlock(page); 1353 1354 } else { 1355 if (n->nr_partial < MIN_PARTIAL) { 1356 /* 1357 * Adding an empty slab to the partial slabs in order 1358 * to avoid page allocator overhead. This slab needs 1359 * to come after the other slabs with objects in 1360 * order to fill them up. That way the size of the 1361 * partial list stays small. kmem_cache_shrink can 1362 * reclaim empty slabs from the partial list. 1363 */ 1364 add_partial_tail(n, page); 1365 slab_unlock(page); 1366 } else { 1367 slab_unlock(page); 1368 discard_slab(s, page); 1369 } 1370 } 1371 } 1372 1373 /* 1374 * Remove the cpu slab 1375 */ 1376 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1377 { 1378 /* 1379 * Merge cpu freelist into freelist. Typically we get here 1380 * because both freelists are empty. So this is unlikely 1381 * to occur. 
1382 */ 1383 while (unlikely(page->lockless_freelist)) { 1384 void **object; 1385 1386 /* Retrieve object from cpu_freelist */ 1387 object = page->lockless_freelist; 1388 page->lockless_freelist = page->lockless_freelist[page->offset]; 1389 1390 /* And put onto the regular freelist */ 1391 object[page->offset] = page->freelist; 1392 page->freelist = object; 1393 page->inuse--; 1394 } 1395 s->cpu_slab[cpu] = NULL; 1396 unfreeze_slab(s, page); 1397 } 1398 1399 static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1400 { 1401 slab_lock(page); 1402 deactivate_slab(s, page, cpu); 1403 } 1404 1405 /* 1406 * Flush cpu slab. 1407 * Called from IPI handler with interrupts disabled. 1408 */ 1409 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1410 { 1411 struct page *page = s->cpu_slab[cpu]; 1412 1413 if (likely(page)) 1414 flush_slab(s, page, cpu); 1415 } 1416 1417 static void flush_cpu_slab(void *d) 1418 { 1419 struct kmem_cache *s = d; 1420 int cpu = smp_processor_id(); 1421 1422 __flush_cpu_slab(s, cpu); 1423 } 1424 1425 static void flush_all(struct kmem_cache *s) 1426 { 1427 #ifdef CONFIG_SMP 1428 on_each_cpu(flush_cpu_slab, s, 1, 1); 1429 #else 1430 unsigned long flags; 1431 1432 local_irq_save(flags); 1433 flush_cpu_slab(s); 1434 local_irq_restore(flags); 1435 #endif 1436 } 1437 1438 /* 1439 * Slow path. The lockless freelist is empty or we need to perform 1440 * debugging duties. 1441 * 1442 * Interrupts are disabled. 1443 * 1444 * Processing is still very fast if new objects have been freed to the 1445 * regular freelist. In that case we simply take over the regular freelist 1446 * as the lockless freelist and zap the regular freelist. 1447 * 1448 * If that is not working then we fall back to the partial lists. We take the 1449 * first element of the freelist as the object to allocate now and move the 1450 * rest of the freelist to the lockless freelist. 1451 * 1452 * And if we were unable to get a new slab from the partial slab lists then 1453 * we need to allocate a new slab. This is slowest path since we may sleep. 1454 */ 1455 static void *__slab_alloc(struct kmem_cache *s, 1456 gfp_t gfpflags, int node, void *addr, struct page *page) 1457 { 1458 void **object; 1459 int cpu = smp_processor_id(); 1460 1461 if (!page) 1462 goto new_slab; 1463 1464 slab_lock(page); 1465 if (unlikely(node != -1 && page_to_nid(page) != node)) 1466 goto another_slab; 1467 load_freelist: 1468 object = page->freelist; 1469 if (unlikely(!object)) 1470 goto another_slab; 1471 if (unlikely(SlabDebug(page))) 1472 goto debug; 1473 1474 object = page->freelist; 1475 page->lockless_freelist = object[page->offset]; 1476 page->inuse = s->objects; 1477 page->freelist = NULL; 1478 slab_unlock(page); 1479 return object; 1480 1481 another_slab: 1482 deactivate_slab(s, page, cpu); 1483 1484 new_slab: 1485 page = get_partial(s, gfpflags, node); 1486 if (page) { 1487 s->cpu_slab[cpu] = page; 1488 goto load_freelist; 1489 } 1490 1491 page = new_slab(s, gfpflags, node); 1492 if (page) { 1493 cpu = smp_processor_id(); 1494 if (s->cpu_slab[cpu]) { 1495 /* 1496 * Someone else populated the cpu_slab while we 1497 * enabled interrupts, or we have gotten scheduled 1498 * on another cpu. The page may not be on the 1499 * requested node even if __GFP_THISNODE was 1500 * specified. So we need to recheck. 
1501 */ 1502 if (node == -1 || 1503 page_to_nid(s->cpu_slab[cpu]) == node) { 1504 /* 1505 * Current cpuslab is acceptable and we 1506 * want the current one since its cache hot 1507 */ 1508 discard_slab(s, page); 1509 page = s->cpu_slab[cpu]; 1510 slab_lock(page); 1511 goto load_freelist; 1512 } 1513 /* New slab does not fit our expectations */ 1514 flush_slab(s, s->cpu_slab[cpu], cpu); 1515 } 1516 slab_lock(page); 1517 SetSlabFrozen(page); 1518 s->cpu_slab[cpu] = page; 1519 goto load_freelist; 1520 } 1521 return NULL; 1522 debug: 1523 object = page->freelist; 1524 if (!alloc_debug_processing(s, page, object, addr)) 1525 goto another_slab; 1526 1527 page->inuse++; 1528 page->freelist = object[page->offset]; 1529 slab_unlock(page); 1530 return object; 1531 } 1532 1533 /* 1534 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 1535 * have the fastpath folded into their functions. So no function call 1536 * overhead for requests that can be satisfied on the fastpath. 1537 * 1538 * The fastpath works by first checking if the lockless freelist can be used. 1539 * If not then __slab_alloc is called for slow processing. 1540 * 1541 * Otherwise we can simply pick the next object from the lockless free list. 1542 */ 1543 static void __always_inline *slab_alloc(struct kmem_cache *s, 1544 gfp_t gfpflags, int node, void *addr) 1545 { 1546 struct page *page; 1547 void **object; 1548 unsigned long flags; 1549 1550 local_irq_save(flags); 1551 page = s->cpu_slab[smp_processor_id()]; 1552 if (unlikely(!page || !page->lockless_freelist || 1553 (node != -1 && page_to_nid(page) != node))) 1554 1555 object = __slab_alloc(s, gfpflags, node, addr, page); 1556 1557 else { 1558 object = page->lockless_freelist; 1559 page->lockless_freelist = object[page->offset]; 1560 } 1561 local_irq_restore(flags); 1562 1563 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1564 memset(object, 0, s->objsize); 1565 1566 return object; 1567 } 1568 1569 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1570 { 1571 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); 1572 } 1573 EXPORT_SYMBOL(kmem_cache_alloc); 1574 1575 #ifdef CONFIG_NUMA 1576 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1577 { 1578 return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); 1579 } 1580 EXPORT_SYMBOL(kmem_cache_alloc_node); 1581 #endif 1582 1583 /* 1584 * Slow patch handling. This may still be called frequently since objects 1585 * have a longer lifetime than the cpu slabs in most processing loads. 1586 * 1587 * So we still attempt to reduce cache line usage. Just take the slab 1588 * lock and free the item. If there is no additional partial page 1589 * handling required then we can return immediately. 1590 */ 1591 static void __slab_free(struct kmem_cache *s, struct page *page, 1592 void *x, void *addr) 1593 { 1594 void *prior; 1595 void **object = (void *)x; 1596 1597 slab_lock(page); 1598 1599 if (unlikely(SlabDebug(page))) 1600 goto debug; 1601 checks_ok: 1602 prior = object[page->offset] = page->freelist; 1603 page->freelist = object; 1604 page->inuse--; 1605 1606 if (unlikely(SlabFrozen(page))) 1607 goto out_unlock; 1608 1609 if (unlikely(!page->inuse)) 1610 goto slab_empty; 1611 1612 /* 1613 * Objects left in the slab. If it 1614 * was not on the partial list before 1615 * then add it. 
 */
	if (unlikely(!prior))
		add_partial(get_node(s, page_to_nid(page)), page);

out_unlock:
	slab_unlock(page);
	return;

slab_empty:
	if (prior)
		/*
		 * Slab still on the partial list.
		 */
		remove_partial(s, page);

	slab_unlock(page);
	discard_slab(s, page);
	return;

debug:
	if (!free_debug_processing(s, page, x, addr))
		goto out_unlock;
	goto checks_ok;
}

/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 */
static void __always_inline slab_free(struct kmem_cache *s,
			struct page *page, void *x, void *addr)
{
	void **object = (void *)x;
	unsigned long flags;

	local_irq_save(flags);
	if (likely(page == s->cpu_slab[smp_processor_id()] &&
						!SlabDebug(page))) {
		object[page->offset] = page->lockless_freelist;
		page->lockless_freelist = object;
	} else
		__slab_free(s, page, x, addr);

	local_irq_restore(flags);
}

void kmem_cache_free(struct kmem_cache *s, void *x)
{
	struct page *page;

	page = virt_to_head_page(x);

	slab_free(s, page, x, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_free);

/* Figure out on which slab object the object resides */
static struct page *get_object_page(const void *x)
{
	struct page *page = virt_to_head_page(x);

	if (!PageSlab(page))
		return NULL;

	return page;
}

/*
 * Object placement in a slab is made very easy because we always start at
 * offset 0. If we tune the size of the object to the alignment then we can
 * get the required alignment by putting one properly sized object after
 * another.
 *
 * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor has always one slab available for allocations.
 * Increasing the allocation order reduces the number of times that slabs
 * must be moved on and off the partial lists and is therefore a factor in
 * locking overhead.
 */

/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;
static int slub_min_objects = DEFAULT_MIN_OBJECTS;

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 * (Could be removed. This was introduced to pacify the merge skeptics.)
 */
static int slub_nomerge;

/*
 * Calculate the order of allocation given a slab object size.
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * can be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/8th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
 */
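/*
 * A rough illustration of the waste rule, assuming 4KiB pages
 * (PAGE_SHIFT == 12) and the default slub_min_objects of 4: for a 520 byte
 * object an order 0 slab holds 7 objects and wastes 4096 - 7 * 520 = 456
 * bytes, which is below the 4096 / 8 = 512 byte limit, so order 0 is used.
 * For a 700 byte object order 0 would waste 4096 - 5 * 700 = 596 bytes,
 * which is above the limit, so slab_order() moves on to order 1 where the
 * waste of 8192 - 11 * 700 = 492 bytes is acceptable. calculate_order()
 * below may additionally relax the fraction and the minimum object count
 * if no fit is found within slub_max_order.
 */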
static inline int slab_order(int size, int min_objects,
				int max_order, int fract_leftover)
{
	int order;
	int rem;
	int min_order = slub_min_order;

	/*
	 * If we would create too many objects per slab then reduce
	 * the slab order even if it goes below slub_min_order.
	 */
	while (min_order > 0 &&
			(PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
		min_order--;

	for (order = max(min_order,
				fls(min_objects * size - 1) - PAGE_SHIFT);
			order <= max_order; order++) {

		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < min_objects * size)
			continue;

		rem = slab_size % size;

		if (rem <= slab_size / fract_leftover)
			break;

		/* If the next size is too high then exit now */
		if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
			break;
	}

	return order;
}

static inline int calculate_order(int size)
{
	int order;
	int min_objects;
	int fraction;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we reduce the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	while (min_objects > 1) {
		fraction = 8;
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
						slub_max_order, fraction);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects /= 2;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * let's see if we can place a single object there.
	 */
	order = slab_order(size, 1, slub_max_order, 1);
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
	order = slab_order(size, 1, MAX_ORDER, 1);
	if (order <= MAX_ORDER)
		return order;
	return -ENOSYS;
}
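/*
 * For experimentation, the core of the waste heuristic above can be
 * mirrored in a few lines of user space code. This is only a simplified
 * sketch (it ignores slub_min_order, the fls() based starting order and
 * the MAX_OBJECTS_PER_SLAB clamp); the constants and names below are made
 * up for the example and are not part of the kernel.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SIZE	4096UL	/* assume 4KiB pages */
#define EX_MAX_ORDER	1	/* DEFAULT_MAX_ORDER for 4KiB pages */

/* Accept an order once the leftover space is at most 1/fract of the slab. */
static int example_order(unsigned long size, unsigned long min_objects,
				unsigned long fract)
{
	int order;

	for (order = 0; order <= EX_MAX_ORDER; order++) {
		unsigned long slab_size = EX_PAGE_SIZE << order;

		if (slab_size < min_objects * size)
			continue;
		if (slab_size % size <= slab_size / fract)
			break;
	}
	return order;
}

int main(void)
{
	unsigned long size;

	for (size = 64; size <= 2048; size <<= 1)
		printf("size %4lu -> order %d\n", size,
			example_order(size, 4, 8));
	return 0;
}
#endif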
/*
 * Figure out what the alignment of the objects will be.
 */
static unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then
	 * follow that suggestion if the object is sufficiently
	 * large.
	 *
	 * The hardware cache alignment cannot override the
	 * specified alignment though. If that is greater
	 * then use it.
	 */
	if ((flags & SLAB_HWCACHE_ALIGN) &&
			size > cache_line_size() / 2)
		return max_t(unsigned long, align, cache_line_size());

	if (align < ARCH_SLAB_MINALIGN)
		return ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}

static void init_kmem_cache_node(struct kmem_cache_node *n)
{
	n->nr_partial = 0;
	atomic_long_set(&n->nr_slabs, 0);
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	INIT_LIST_HEAD(&n->full);
#endif
}

#ifdef CONFIG_NUMA
/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmalloc_node_cache
 * when allocating for the kmalloc_node_cache.
 */
static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
								int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

	page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);

	BUG_ON(!page);
	n = page->freelist;
	BUG_ON(!n);
	page->freelist = get_freepointer(kmalloc_caches, n);
	page->inuse++;
	kmalloc_caches->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmalloc_caches, n, 1);
	init_tracking(kmalloc_caches, n);
#endif
	init_kmem_cache_node(n);
	atomic_long_inc(&n->nr_slabs);
	add_partial(n, page);

	/*
	 * new_slab() disables interrupts. If we do not reenable interrupts here
	 * then bootup would continue with interrupts disabled.
1897 */ 1898 local_irq_enable(); 1899 return n; 1900 } 1901 1902 static void free_kmem_cache_nodes(struct kmem_cache *s) 1903 { 1904 int node; 1905 1906 for_each_online_node(node) { 1907 struct kmem_cache_node *n = s->node[node]; 1908 if (n && n != &s->local_node) 1909 kmem_cache_free(kmalloc_caches, n); 1910 s->node[node] = NULL; 1911 } 1912 } 1913 1914 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1915 { 1916 int node; 1917 int local_node; 1918 1919 if (slab_state >= UP) 1920 local_node = page_to_nid(virt_to_page(s)); 1921 else 1922 local_node = 0; 1923 1924 for_each_online_node(node) { 1925 struct kmem_cache_node *n; 1926 1927 if (local_node == node) 1928 n = &s->local_node; 1929 else { 1930 if (slab_state == DOWN) { 1931 n = early_kmem_cache_node_alloc(gfpflags, 1932 node); 1933 continue; 1934 } 1935 n = kmem_cache_alloc_node(kmalloc_caches, 1936 gfpflags, node); 1937 1938 if (!n) { 1939 free_kmem_cache_nodes(s); 1940 return 0; 1941 } 1942 1943 } 1944 s->node[node] = n; 1945 init_kmem_cache_node(n); 1946 } 1947 return 1; 1948 } 1949 #else 1950 static void free_kmem_cache_nodes(struct kmem_cache *s) 1951 { 1952 } 1953 1954 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 1955 { 1956 init_kmem_cache_node(&s->local_node); 1957 return 1; 1958 } 1959 #endif 1960 1961 /* 1962 * calculate_sizes() determines the order and the distribution of data within 1963 * a slab object. 1964 */ 1965 static int calculate_sizes(struct kmem_cache *s) 1966 { 1967 unsigned long flags = s->flags; 1968 unsigned long size = s->objsize; 1969 unsigned long align = s->align; 1970 1971 /* 1972 * Determine if we can poison the object itself. If the user of 1973 * the slab may touch the object after free or before allocation 1974 * then we should never poison the object itself. 1975 */ 1976 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 1977 !s->ctor) 1978 s->flags |= __OBJECT_POISON; 1979 else 1980 s->flags &= ~__OBJECT_POISON; 1981 1982 /* 1983 * Round up object size to the next word boundary. We can only 1984 * place the free pointer at word boundaries and this determines 1985 * the possible location of the free pointer. 1986 */ 1987 size = ALIGN(size, sizeof(void *)); 1988 1989 #ifdef CONFIG_SLUB_DEBUG 1990 /* 1991 * If we are Redzoning then check if there is some space between the 1992 * end of the object and the free pointer. If not then add an 1993 * additional word to have some bytes to store Redzone information. 1994 */ 1995 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 1996 size += sizeof(void *); 1997 #endif 1998 1999 /* 2000 * With that we have determined the number of bytes in actual use 2001 * by the object. This is the potential offset to the free pointer. 2002 */ 2003 s->inuse = size; 2004 2005 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2006 s->ctor)) { 2007 /* 2008 * Relocate free pointer after the object if it is not 2009 * permitted to overwrite the first word of the object on 2010 * kmem_cache_free. 2011 * 2012 * This is the case if we do RCU, have a constructor or 2013 * destructor or are poisoning the objects. 2014 */ 2015 s->offset = size; 2016 size += sizeof(void *); 2017 } 2018 2019 #ifdef CONFIG_SLUB_DEBUG 2020 if (flags & SLAB_STORE_USER) 2021 /* 2022 * Need to store information about allocs and frees after 2023 * the object. 
2024 */ 2025 size += 2 * sizeof(struct track); 2026 2027 if (flags & SLAB_RED_ZONE) 2028 /* 2029 * Add some empty padding so that we can catch 2030 * overwrites from earlier objects rather than let 2031 * tracking information or the free pointer be 2032 * corrupted if an user writes before the start 2033 * of the object. 2034 */ 2035 size += sizeof(void *); 2036 #endif 2037 2038 /* 2039 * Determine the alignment based on various parameters that the 2040 * user specified and the dynamic determination of cache line size 2041 * on bootup. 2042 */ 2043 align = calculate_alignment(flags, align, s->objsize); 2044 2045 /* 2046 * SLUB stores one object immediately after another beginning from 2047 * offset 0. In order to align the objects we have to simply size 2048 * each object to conform to the alignment. 2049 */ 2050 size = ALIGN(size, align); 2051 s->size = size; 2052 2053 s->order = calculate_order(size); 2054 if (s->order < 0) 2055 return 0; 2056 2057 /* 2058 * Determine the number of objects per slab 2059 */ 2060 s->objects = (PAGE_SIZE << s->order) / size; 2061 2062 /* 2063 * Verify that the number of objects is within permitted limits. 2064 * The page->inuse field is only 16 bit wide! So we cannot have 2065 * more than 64k objects per slab. 2066 */ 2067 if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) 2068 return 0; 2069 return 1; 2070 2071 } 2072 2073 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2074 const char *name, size_t size, 2075 size_t align, unsigned long flags, 2076 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2077 { 2078 memset(s, 0, kmem_size); 2079 s->name = name; 2080 s->ctor = ctor; 2081 s->objsize = size; 2082 s->flags = flags; 2083 s->align = align; 2084 kmem_cache_open_debug_check(s); 2085 2086 if (!calculate_sizes(s)) 2087 goto error; 2088 2089 s->refcount = 1; 2090 #ifdef CONFIG_NUMA 2091 s->defrag_ratio = 100; 2092 #endif 2093 2094 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2095 return 1; 2096 error: 2097 if (flags & SLAB_PANIC) 2098 panic("Cannot create slab %s size=%lu realsize=%u " 2099 "order=%u offset=%u flags=%lx\n", 2100 s->name, (unsigned long)size, s->size, s->order, 2101 s->offset, flags); 2102 return 0; 2103 } 2104 2105 /* 2106 * Check if a given pointer is valid 2107 */ 2108 int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2109 { 2110 struct page * page; 2111 2112 page = get_object_page(object); 2113 2114 if (!page || s != page->slab) 2115 /* No slab or wrong slab */ 2116 return 0; 2117 2118 if (!check_valid_pointer(s, page, object)) 2119 return 0; 2120 2121 /* 2122 * We could also check if the object is on the slabs freelist. 2123 * But this would be too expensive and it seems that the main 2124 * purpose of kmem_ptr_valid is to check if the object belongs 2125 * to a certain slab. 2126 */ 2127 return 1; 2128 } 2129 EXPORT_SYMBOL(kmem_ptr_validate); 2130 2131 /* 2132 * Determine the size of a slab object 2133 */ 2134 unsigned int kmem_cache_size(struct kmem_cache *s) 2135 { 2136 return s->objsize; 2137 } 2138 EXPORT_SYMBOL(kmem_cache_size); 2139 2140 const char *kmem_cache_name(struct kmem_cache *s) 2141 { 2142 return s->name; 2143 } 2144 EXPORT_SYMBOL(kmem_cache_name); 2145 2146 /* 2147 * Attempt to free all slabs on a node. Return the number of slabs we 2148 * were unable to free. 
2149 */ 2150 static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, 2151 struct list_head *list) 2152 { 2153 int slabs_inuse = 0; 2154 unsigned long flags; 2155 struct page *page, *h; 2156 2157 spin_lock_irqsave(&n->list_lock, flags); 2158 list_for_each_entry_safe(page, h, list, lru) 2159 if (!page->inuse) { 2160 list_del(&page->lru); 2161 discard_slab(s, page); 2162 } else 2163 slabs_inuse++; 2164 spin_unlock_irqrestore(&n->list_lock, flags); 2165 return slabs_inuse; 2166 } 2167 2168 /* 2169 * Release all resources used by a slab cache. 2170 */ 2171 static inline int kmem_cache_close(struct kmem_cache *s) 2172 { 2173 int node; 2174 2175 flush_all(s); 2176 2177 /* Attempt to free all objects */ 2178 for_each_online_node(node) { 2179 struct kmem_cache_node *n = get_node(s, node); 2180 2181 n->nr_partial -= free_list(s, n, &n->partial); 2182 if (atomic_long_read(&n->nr_slabs)) 2183 return 1; 2184 } 2185 free_kmem_cache_nodes(s); 2186 return 0; 2187 } 2188 2189 /* 2190 * Close a cache and release the kmem_cache structure 2191 * (must be used for caches created using kmem_cache_create) 2192 */ 2193 void kmem_cache_destroy(struct kmem_cache *s) 2194 { 2195 down_write(&slub_lock); 2196 s->refcount--; 2197 if (!s->refcount) { 2198 list_del(&s->list); 2199 up_write(&slub_lock); 2200 if (kmem_cache_close(s)) 2201 WARN_ON(1); 2202 sysfs_slab_remove(s); 2203 kfree(s); 2204 } else 2205 up_write(&slub_lock); 2206 } 2207 EXPORT_SYMBOL(kmem_cache_destroy); 2208 2209 /******************************************************************** 2210 * Kmalloc subsystem 2211 *******************************************************************/ 2212 2213 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 2214 EXPORT_SYMBOL(kmalloc_caches); 2215 2216 #ifdef CONFIG_ZONE_DMA 2217 static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 2218 #endif 2219 2220 static int __init setup_slub_min_order(char *str) 2221 { 2222 get_option (&str, &slub_min_order); 2223 2224 return 1; 2225 } 2226 2227 __setup("slub_min_order=", setup_slub_min_order); 2228 2229 static int __init setup_slub_max_order(char *str) 2230 { 2231 get_option (&str, &slub_max_order); 2232 2233 return 1; 2234 } 2235 2236 __setup("slub_max_order=", setup_slub_max_order); 2237 2238 static int __init setup_slub_min_objects(char *str) 2239 { 2240 get_option (&str, &slub_min_objects); 2241 2242 return 1; 2243 } 2244 2245 __setup("slub_min_objects=", setup_slub_min_objects); 2246 2247 static int __init setup_slub_nomerge(char *str) 2248 { 2249 slub_nomerge = 1; 2250 return 1; 2251 } 2252 2253 __setup("slub_nomerge", setup_slub_nomerge); 2254 2255 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2256 const char *name, int size, gfp_t gfp_flags) 2257 { 2258 unsigned int flags = 0; 2259 2260 if (gfp_flags & SLUB_DMA) 2261 flags = SLAB_CACHE_DMA; 2262 2263 down_write(&slub_lock); 2264 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2265 flags, NULL)) 2266 goto panic; 2267 2268 list_add(&s->list, &slab_caches); 2269 up_write(&slub_lock); 2270 if (sysfs_slab_add(s)) 2271 goto panic; 2272 return s; 2273 2274 panic: 2275 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2276 } 2277 2278 #ifdef CONFIG_ZONE_DMA 2279 static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) 2280 { 2281 struct kmem_cache *s; 2282 struct kmem_cache *x; 2283 char *text; 2284 size_t realsize; 2285 2286 s = kmalloc_caches_dma[index]; 2287 if (s) 2288 return s; 2289 
	/* Dynamically create dma cache */
	x = kmalloc(kmem_size, flags & ~SLUB_DMA);
	if (!x)
		panic("Unable to allocate memory for dma cache\n");

	realsize = kmalloc_caches[index].objsize;
	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
			(unsigned int)realsize);
	s = create_kmalloc_cache(x, text, realsize, flags);
	down_write(&slub_lock);
	if (!kmalloc_caches_dma[index]) {
		kmalloc_caches_dma[index] = s;
		up_write(&slub_lock);
		return s;
	}
	up_write(&slub_lock);
	kmem_cache_destroy(s);
	return kmalloc_caches_dma[index];
}
#endif

/*
 * Conversion table for small slab sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static s8 size_index[24] = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static struct kmem_cache *get_slab(size_t size, gfp_t flags)
{
	int index;

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[(size - 1) / 8];
	} else {
		if (size > KMALLOC_MAX_SIZE)
			return NULL;

		index = fls(size - 1);
	}

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & SLUB_DMA)))
		return dma_kmalloc_cache(index, flags);

#endif
	return &kmalloc_caches[index];
}

void *__kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *s = get_slab(size, flags);

	if (ZERO_OR_NULL_PTR(s))
		return s;

	return slab_alloc(s, flags, -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc);

#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
	struct kmem_cache *s = get_slab(size, flags);

	if (ZERO_OR_NULL_PTR(s))
		return s;

	return slab_alloc(s, flags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc_node);
#endif

size_t ksize(const void *object)
{
	struct page *page;
	struct kmem_cache *s;

	if (object == ZERO_SIZE_PTR)
		return 0;

	page = get_object_page(object);
	BUG_ON(!page);
	s = page->slab;
	BUG_ON(!s);

	/*
	 * Debugging requires use of the padding between object
	 * and whatever may come after it.
	 */
	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
		return s->objsize;

	/*
	 * If we have the need to store the freelist pointer
	 * back there or track user information then we can
	 * only use the space before that information.
	 */
	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
		return s->inuse;

	/*
	 * Else we can use all the padding etc for the allocation
	 */
	return s->size;
}
EXPORT_SYMBOL(ksize);

void kfree(const void *x)
{
	struct kmem_cache *s;
	struct page *page;

	/*
	 * This has to be an unsigned comparison. According to Linus
	 * some gcc versions treat a pointer as a signed entity.
Then 2435 * this comparison would be true for all "negative" pointers 2436 * (which would cover the whole upper half of the address space). 2437 */ 2438 if (ZERO_OR_NULL_PTR(x)) 2439 return; 2440 2441 page = virt_to_head_page(x); 2442 s = page->slab; 2443 2444 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2445 } 2446 EXPORT_SYMBOL(kfree); 2447 2448 /* 2449 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2450 * the remaining slabs by the number of items in use. The slabs with the 2451 * most items in use come first. New allocations will then fill those up 2452 * and thus they can be removed from the partial lists. 2453 * 2454 * The slabs with the least items are placed last. This results in them 2455 * being allocated from last increasing the chance that the last objects 2456 * are freed in them. 2457 */ 2458 int kmem_cache_shrink(struct kmem_cache *s) 2459 { 2460 int node; 2461 int i; 2462 struct kmem_cache_node *n; 2463 struct page *page; 2464 struct page *t; 2465 struct list_head *slabs_by_inuse = 2466 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); 2467 unsigned long flags; 2468 2469 if (!slabs_by_inuse) 2470 return -ENOMEM; 2471 2472 flush_all(s); 2473 for_each_online_node(node) { 2474 n = get_node(s, node); 2475 2476 if (!n->nr_partial) 2477 continue; 2478 2479 for (i = 0; i < s->objects; i++) 2480 INIT_LIST_HEAD(slabs_by_inuse + i); 2481 2482 spin_lock_irqsave(&n->list_lock, flags); 2483 2484 /* 2485 * Build lists indexed by the items in use in each slab. 2486 * 2487 * Note that concurrent frees may occur while we hold the 2488 * list_lock. page->inuse here is the upper limit. 2489 */ 2490 list_for_each_entry_safe(page, t, &n->partial, lru) { 2491 if (!page->inuse && slab_trylock(page)) { 2492 /* 2493 * Must hold slab lock here because slab_free 2494 * may have freed the last object and be 2495 * waiting to release the slab. 2496 */ 2497 list_del(&page->lru); 2498 n->nr_partial--; 2499 slab_unlock(page); 2500 discard_slab(s, page); 2501 } else { 2502 if (n->nr_partial > MAX_PARTIAL) 2503 list_move(&page->lru, 2504 slabs_by_inuse + page->inuse); 2505 } 2506 } 2507 2508 if (n->nr_partial <= MAX_PARTIAL) 2509 goto out; 2510 2511 /* 2512 * Rebuild the partial list with the slabs filled up most 2513 * first and the least used slabs at the end. 2514 */ 2515 for (i = s->objects - 1; i >= 0; i--) 2516 list_splice(slabs_by_inuse + i, n->partial.prev); 2517 2518 out: 2519 spin_unlock_irqrestore(&n->list_lock, flags); 2520 } 2521 2522 kfree(slabs_by_inuse); 2523 return 0; 2524 } 2525 EXPORT_SYMBOL(kmem_cache_shrink); 2526 2527 /******************************************************************** 2528 * Basic setup of slabs 2529 *******************************************************************/ 2530 2531 void __init kmem_cache_init(void) 2532 { 2533 int i; 2534 int caches = 0; 2535 2536 #ifdef CONFIG_NUMA 2537 /* 2538 * Must first have the slab cache available for the allocations of the 2539 * struct kmem_cache_node's. There is special bootstrap code in 2540 * kmem_cache_open for slab_state == DOWN. 
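	 *
	 * In other words, kmalloc_caches[0] provides the memory for the
	 * struct kmem_cache_node objects of all later caches, so it must
	 * be set up before anything else. Its refcount is set to -1
	 * below to keep it from ever being merged with another cache.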
2541 */ 2542 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2543 sizeof(struct kmem_cache_node), GFP_KERNEL); 2544 kmalloc_caches[0].refcount = -1; 2545 caches++; 2546 #endif 2547 2548 /* Able to allocate the per node structures */ 2549 slab_state = PARTIAL; 2550 2551 /* Caches that are not of the two-to-the-power-of size */ 2552 if (KMALLOC_MIN_SIZE <= 64) { 2553 create_kmalloc_cache(&kmalloc_caches[1], 2554 "kmalloc-96", 96, GFP_KERNEL); 2555 caches++; 2556 } 2557 if (KMALLOC_MIN_SIZE <= 128) { 2558 create_kmalloc_cache(&kmalloc_caches[2], 2559 "kmalloc-192", 192, GFP_KERNEL); 2560 caches++; 2561 } 2562 2563 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 2564 create_kmalloc_cache(&kmalloc_caches[i], 2565 "kmalloc", 1 << i, GFP_KERNEL); 2566 caches++; 2567 } 2568 2569 2570 /* 2571 * Patch up the size_index table if we have strange large alignment 2572 * requirements for the kmalloc array. This is only the case for 2573 * mips it seems. The standard arches will not generate any code here. 2574 * 2575 * Largest permitted alignment is 256 bytes due to the way we 2576 * handle the index determination for the smaller caches. 2577 * 2578 * Make sure that nothing crazy happens if someone starts tinkering 2579 * around with ARCH_KMALLOC_MINALIGN 2580 */ 2581 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 2582 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 2583 2584 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 2585 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 2586 2587 slab_state = UP; 2588 2589 /* Provide the correct kmalloc names now that the caches are up */ 2590 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2591 kmalloc_caches[i]. name = 2592 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2593 2594 #ifdef CONFIG_SMP 2595 register_cpu_notifier(&slab_notifier); 2596 #endif 2597 2598 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 2599 nr_cpu_ids * sizeof(struct page *); 2600 2601 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2602 " CPUs=%d, Nodes=%d\n", 2603 caches, cache_line_size(), 2604 slub_min_order, slub_max_order, slub_min_objects, 2605 nr_cpu_ids, nr_node_ids); 2606 } 2607 2608 /* 2609 * Find a mergeable slab cache 2610 */ 2611 static int slab_unmergeable(struct kmem_cache *s) 2612 { 2613 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 2614 return 1; 2615 2616 if (s->ctor) 2617 return 1; 2618 2619 /* 2620 * We may have set a slab to be unmergeable during bootstrap. 2621 */ 2622 if (s->refcount < 0) 2623 return 1; 2624 2625 return 0; 2626 } 2627 2628 static struct kmem_cache *find_mergeable(size_t size, 2629 size_t align, unsigned long flags, 2630 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2631 { 2632 struct kmem_cache *s; 2633 2634 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 2635 return NULL; 2636 2637 if (ctor) 2638 return NULL; 2639 2640 size = ALIGN(size, sizeof(void *)); 2641 align = calculate_alignment(flags, align, size); 2642 size = ALIGN(size, align); 2643 2644 list_for_each_entry(s, &slab_caches, list) { 2645 if (slab_unmergeable(s)) 2646 continue; 2647 2648 if (size > s->size) 2649 continue; 2650 2651 if (((flags | slub_debug) & SLUB_MERGE_SAME) != 2652 (s->flags & SLUB_MERGE_SAME)) 2653 continue; 2654 /* 2655 * Check if alignment is compatible. 
2656 * Courtesy of Adrian Drzewiecki 2657 */ 2658 if ((s->size & ~(align -1)) != s->size) 2659 continue; 2660 2661 if (s->size - size >= sizeof(void *)) 2662 continue; 2663 2664 return s; 2665 } 2666 return NULL; 2667 } 2668 2669 struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2670 size_t align, unsigned long flags, 2671 void (*ctor)(void *, struct kmem_cache *, unsigned long), 2672 void (*dtor)(void *, struct kmem_cache *, unsigned long)) 2673 { 2674 struct kmem_cache *s; 2675 2676 BUG_ON(dtor); 2677 down_write(&slub_lock); 2678 s = find_mergeable(size, align, flags, ctor); 2679 if (s) { 2680 s->refcount++; 2681 /* 2682 * Adjust the object sizes so that we clear 2683 * the complete object on kzalloc. 2684 */ 2685 s->objsize = max(s->objsize, (int)size); 2686 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2687 up_write(&slub_lock); 2688 if (sysfs_slab_alias(s, name)) 2689 goto err; 2690 return s; 2691 } 2692 s = kmalloc(kmem_size, GFP_KERNEL); 2693 if (s) { 2694 if (kmem_cache_open(s, GFP_KERNEL, name, 2695 size, align, flags, ctor)) { 2696 list_add(&s->list, &slab_caches); 2697 up_write(&slub_lock); 2698 if (sysfs_slab_add(s)) 2699 goto err; 2700 return s; 2701 } 2702 kfree(s); 2703 } 2704 up_write(&slub_lock); 2705 2706 err: 2707 if (flags & SLAB_PANIC) 2708 panic("Cannot create slabcache %s\n", name); 2709 else 2710 s = NULL; 2711 return s; 2712 } 2713 EXPORT_SYMBOL(kmem_cache_create); 2714 2715 #ifdef CONFIG_SMP 2716 /* 2717 * Use the cpu notifier to insure that the cpu slabs are flushed when 2718 * necessary. 2719 */ 2720 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 2721 unsigned long action, void *hcpu) 2722 { 2723 long cpu = (long)hcpu; 2724 struct kmem_cache *s; 2725 unsigned long flags; 2726 2727 switch (action) { 2728 case CPU_UP_CANCELED: 2729 case CPU_UP_CANCELED_FROZEN: 2730 case CPU_DEAD: 2731 case CPU_DEAD_FROZEN: 2732 down_read(&slub_lock); 2733 list_for_each_entry(s, &slab_caches, list) { 2734 local_irq_save(flags); 2735 __flush_cpu_slab(s, cpu); 2736 local_irq_restore(flags); 2737 } 2738 up_read(&slub_lock); 2739 break; 2740 default: 2741 break; 2742 } 2743 return NOTIFY_OK; 2744 } 2745 2746 static struct notifier_block __cpuinitdata slab_notifier = 2747 { &slab_cpuup_callback, NULL, 0 }; 2748 2749 #endif 2750 2751 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2752 { 2753 struct kmem_cache *s = get_slab(size, gfpflags); 2754 2755 if (ZERO_OR_NULL_PTR(s)) 2756 return s; 2757 2758 return slab_alloc(s, gfpflags, -1, caller); 2759 } 2760 2761 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2762 int node, void *caller) 2763 { 2764 struct kmem_cache *s = get_slab(size, gfpflags); 2765 2766 if (ZERO_OR_NULL_PTR(s)) 2767 return s; 2768 2769 return slab_alloc(s, gfpflags, node, caller); 2770 } 2771 2772 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 2773 static int validate_slab(struct kmem_cache *s, struct page *page, 2774 unsigned long *map) 2775 { 2776 void *p; 2777 void *addr = page_address(page); 2778 2779 if (!check_slab(s, page) || 2780 !on_freelist(s, page, NULL)) 2781 return 0; 2782 2783 /* Now we know that a valid freelist exists */ 2784 bitmap_zero(map, s->objects); 2785 2786 for_each_free_object(p, s, page->freelist) { 2787 set_bit(slab_index(p, s, addr), map); 2788 if (!check_object(s, page, p, 0)) 2789 return 0; 2790 } 2791 2792 for_each_object(p, s, addr) 2793 if (!test_bit(slab_index(p, s, addr), map)) 2794 if (!check_object(s, page, p, 1)) 2795 return 0; 
2796 return 1; 2797 } 2798 2799 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 2800 unsigned long *map) 2801 { 2802 if (slab_trylock(page)) { 2803 validate_slab(s, page, map); 2804 slab_unlock(page); 2805 } else 2806 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 2807 s->name, page); 2808 2809 if (s->flags & DEBUG_DEFAULT_FLAGS) { 2810 if (!SlabDebug(page)) 2811 printk(KERN_ERR "SLUB %s: SlabDebug not set " 2812 "on slab 0x%p\n", s->name, page); 2813 } else { 2814 if (SlabDebug(page)) 2815 printk(KERN_ERR "SLUB %s: SlabDebug set on " 2816 "slab 0x%p\n", s->name, page); 2817 } 2818 } 2819 2820 static int validate_slab_node(struct kmem_cache *s, 2821 struct kmem_cache_node *n, unsigned long *map) 2822 { 2823 unsigned long count = 0; 2824 struct page *page; 2825 unsigned long flags; 2826 2827 spin_lock_irqsave(&n->list_lock, flags); 2828 2829 list_for_each_entry(page, &n->partial, lru) { 2830 validate_slab_slab(s, page, map); 2831 count++; 2832 } 2833 if (count != n->nr_partial) 2834 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 2835 "counter=%ld\n", s->name, count, n->nr_partial); 2836 2837 if (!(s->flags & SLAB_STORE_USER)) 2838 goto out; 2839 2840 list_for_each_entry(page, &n->full, lru) { 2841 validate_slab_slab(s, page, map); 2842 count++; 2843 } 2844 if (count != atomic_long_read(&n->nr_slabs)) 2845 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 2846 "counter=%ld\n", s->name, count, 2847 atomic_long_read(&n->nr_slabs)); 2848 2849 out: 2850 spin_unlock_irqrestore(&n->list_lock, flags); 2851 return count; 2852 } 2853 2854 static long validate_slab_cache(struct kmem_cache *s) 2855 { 2856 int node; 2857 unsigned long count = 0; 2858 unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * 2859 sizeof(unsigned long), GFP_KERNEL); 2860 2861 if (!map) 2862 return -ENOMEM; 2863 2864 flush_all(s); 2865 for_each_online_node(node) { 2866 struct kmem_cache_node *n = get_node(s, node); 2867 2868 count += validate_slab_node(s, n, map); 2869 } 2870 kfree(map); 2871 return count; 2872 } 2873 2874 #ifdef SLUB_RESILIENCY_TEST 2875 static void resiliency_test(void) 2876 { 2877 u8 *p; 2878 2879 printk(KERN_ERR "SLUB resiliency testing\n"); 2880 printk(KERN_ERR "-----------------------\n"); 2881 printk(KERN_ERR "A. Corruption after allocation\n"); 2882 2883 p = kzalloc(16, GFP_KERNEL); 2884 p[16] = 0x12; 2885 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 2886 " 0x12->0x%p\n\n", p + 16); 2887 2888 validate_slab_cache(kmalloc_caches + 4); 2889 2890 /* Hmmm... The next two are dangerous */ 2891 p = kzalloc(32, GFP_KERNEL); 2892 p[32 + sizeof(void *)] = 0x34; 2893 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 2894 " 0x34 -> -0x%p\n", p); 2895 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2896 2897 validate_slab_cache(kmalloc_caches + 5); 2898 p = kzalloc(64, GFP_KERNEL); 2899 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 2900 *p = 0x56; 2901 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 2902 p); 2903 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); 2904 validate_slab_cache(kmalloc_caches + 6); 2905 2906 printk(KERN_ERR "\nB. Corruption after free\n"); 2907 p = kzalloc(128, GFP_KERNEL); 2908 kfree(p); 2909 *p = 0x78; 2910 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 2911 validate_slab_cache(kmalloc_caches + 7); 2912 2913 p = kzalloc(256, GFP_KERNEL); 2914 kfree(p); 2915 p[50] = 0x9a; 2916 printk(KERN_ERR "\n2. 
kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); 2917 validate_slab_cache(kmalloc_caches + 8); 2918 2919 p = kzalloc(512, GFP_KERNEL); 2920 kfree(p); 2921 p[512] = 0xab; 2922 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 2923 validate_slab_cache(kmalloc_caches + 9); 2924 } 2925 #else 2926 static void resiliency_test(void) {}; 2927 #endif 2928 2929 /* 2930 * Generate lists of code addresses where slabcache objects are allocated 2931 * and freed. 2932 */ 2933 2934 struct location { 2935 unsigned long count; 2936 void *addr; 2937 long long sum_time; 2938 long min_time; 2939 long max_time; 2940 long min_pid; 2941 long max_pid; 2942 cpumask_t cpus; 2943 nodemask_t nodes; 2944 }; 2945 2946 struct loc_track { 2947 unsigned long max; 2948 unsigned long count; 2949 struct location *loc; 2950 }; 2951 2952 static void free_loc_track(struct loc_track *t) 2953 { 2954 if (t->max) 2955 free_pages((unsigned long)t->loc, 2956 get_order(sizeof(struct location) * t->max)); 2957 } 2958 2959 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 2960 { 2961 struct location *l; 2962 int order; 2963 2964 order = get_order(sizeof(struct location) * max); 2965 2966 l = (void *)__get_free_pages(flags, order); 2967 if (!l) 2968 return 0; 2969 2970 if (t->count) { 2971 memcpy(l, t->loc, sizeof(struct location) * t->count); 2972 free_loc_track(t); 2973 } 2974 t->max = max; 2975 t->loc = l; 2976 return 1; 2977 } 2978 2979 static int add_location(struct loc_track *t, struct kmem_cache *s, 2980 const struct track *track) 2981 { 2982 long start, end, pos; 2983 struct location *l; 2984 void *caddr; 2985 unsigned long age = jiffies - track->when; 2986 2987 start = -1; 2988 end = t->count; 2989 2990 for ( ; ; ) { 2991 pos = start + (end - start + 1) / 2; 2992 2993 /* 2994 * There is nothing at "end". If we end up there 2995 * we need to add something to before end. 2996 */ 2997 if (pos == end) 2998 break; 2999 3000 caddr = t->loc[pos].addr; 3001 if (track->addr == caddr) { 3002 3003 l = &t->loc[pos]; 3004 l->count++; 3005 if (track->when) { 3006 l->sum_time += age; 3007 if (age < l->min_time) 3008 l->min_time = age; 3009 if (age > l->max_time) 3010 l->max_time = age; 3011 3012 if (track->pid < l->min_pid) 3013 l->min_pid = track->pid; 3014 if (track->pid > l->max_pid) 3015 l->max_pid = track->pid; 3016 3017 cpu_set(track->cpu, l->cpus); 3018 } 3019 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3020 return 1; 3021 } 3022 3023 if (track->addr < caddr) 3024 end = pos; 3025 else 3026 start = pos; 3027 } 3028 3029 /* 3030 * Not found. Insert new tracking element. 
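	 *
	 * The loop above is a binary search over the entries, which are
	 * kept sorted by address. When it falls through, pos is the index
	 * of the first entry with a larger address, so the memmove below
	 * opens a slot at pos and the ordering is preserved.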
3031 */ 3032 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 3033 return 0; 3034 3035 l = t->loc + pos; 3036 if (pos < t->count) 3037 memmove(l + 1, l, 3038 (t->count - pos) * sizeof(struct location)); 3039 t->count++; 3040 l->count = 1; 3041 l->addr = track->addr; 3042 l->sum_time = age; 3043 l->min_time = age; 3044 l->max_time = age; 3045 l->min_pid = track->pid; 3046 l->max_pid = track->pid; 3047 cpus_clear(l->cpus); 3048 cpu_set(track->cpu, l->cpus); 3049 nodes_clear(l->nodes); 3050 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3051 return 1; 3052 } 3053 3054 static void process_slab(struct loc_track *t, struct kmem_cache *s, 3055 struct page *page, enum track_item alloc) 3056 { 3057 void *addr = page_address(page); 3058 DECLARE_BITMAP(map, s->objects); 3059 void *p; 3060 3061 bitmap_zero(map, s->objects); 3062 for_each_free_object(p, s, page->freelist) 3063 set_bit(slab_index(p, s, addr), map); 3064 3065 for_each_object(p, s, addr) 3066 if (!test_bit(slab_index(p, s, addr), map)) 3067 add_location(t, s, get_track(s, p, alloc)); 3068 } 3069 3070 static int list_locations(struct kmem_cache *s, char *buf, 3071 enum track_item alloc) 3072 { 3073 int n = 0; 3074 unsigned long i; 3075 struct loc_track t = { 0, 0, NULL }; 3076 int node; 3077 3078 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3079 GFP_KERNEL)) 3080 return sprintf(buf, "Out of memory\n"); 3081 3082 /* Push back cpu slabs */ 3083 flush_all(s); 3084 3085 for_each_online_node(node) { 3086 struct kmem_cache_node *n = get_node(s, node); 3087 unsigned long flags; 3088 struct page *page; 3089 3090 if (!atomic_read(&n->nr_slabs)) 3091 continue; 3092 3093 spin_lock_irqsave(&n->list_lock, flags); 3094 list_for_each_entry(page, &n->partial, lru) 3095 process_slab(&t, s, page, alloc); 3096 list_for_each_entry(page, &n->full, lru) 3097 process_slab(&t, s, page, alloc); 3098 spin_unlock_irqrestore(&n->list_lock, flags); 3099 } 3100 3101 for (i = 0; i < t.count; i++) { 3102 struct location *l = &t.loc[i]; 3103 3104 if (n > PAGE_SIZE - 100) 3105 break; 3106 n += sprintf(buf + n, "%7ld ", l->count); 3107 3108 if (l->addr) 3109 n += sprint_symbol(buf + n, (unsigned long)l->addr); 3110 else 3111 n += sprintf(buf + n, "<not-available>"); 3112 3113 if (l->sum_time != l->min_time) { 3114 unsigned long remainder; 3115 3116 n += sprintf(buf + n, " age=%ld/%ld/%ld", 3117 l->min_time, 3118 div_long_long_rem(l->sum_time, l->count, &remainder), 3119 l->max_time); 3120 } else 3121 n += sprintf(buf + n, " age=%ld", 3122 l->min_time); 3123 3124 if (l->min_pid != l->max_pid) 3125 n += sprintf(buf + n, " pid=%ld-%ld", 3126 l->min_pid, l->max_pid); 3127 else 3128 n += sprintf(buf + n, " pid=%ld", 3129 l->min_pid); 3130 3131 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3132 n < PAGE_SIZE - 60) { 3133 n += sprintf(buf + n, " cpus="); 3134 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3135 l->cpus); 3136 } 3137 3138 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3139 n < PAGE_SIZE - 60) { 3140 n += sprintf(buf + n, " nodes="); 3141 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3142 l->nodes); 3143 } 3144 3145 n += sprintf(buf + n, "\n"); 3146 } 3147 3148 free_loc_track(&t); 3149 if (!t.count) 3150 n += sprintf(buf, "No data\n"); 3151 return n; 3152 } 3153 3154 static unsigned long count_partial(struct kmem_cache_node *n) 3155 { 3156 unsigned long flags; 3157 unsigned long x = 0; 3158 struct page *page; 3159 3160 spin_lock_irqsave(&n->list_lock, flags); 3161 list_for_each_entry(page, 
&n->partial, lru) 3162 x += page->inuse; 3163 spin_unlock_irqrestore(&n->list_lock, flags); 3164 return x; 3165 } 3166 3167 enum slab_stat_type { 3168 SL_FULL, 3169 SL_PARTIAL, 3170 SL_CPU, 3171 SL_OBJECTS 3172 }; 3173 3174 #define SO_FULL (1 << SL_FULL) 3175 #define SO_PARTIAL (1 << SL_PARTIAL) 3176 #define SO_CPU (1 << SL_CPU) 3177 #define SO_OBJECTS (1 << SL_OBJECTS) 3178 3179 static unsigned long slab_objects(struct kmem_cache *s, 3180 char *buf, unsigned long flags) 3181 { 3182 unsigned long total = 0; 3183 int cpu; 3184 int node; 3185 int x; 3186 unsigned long *nodes; 3187 unsigned long *per_cpu; 3188 3189 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 3190 per_cpu = nodes + nr_node_ids; 3191 3192 for_each_possible_cpu(cpu) { 3193 struct page *page = s->cpu_slab[cpu]; 3194 int node; 3195 3196 if (page) { 3197 node = page_to_nid(page); 3198 if (flags & SO_CPU) { 3199 int x = 0; 3200 3201 if (flags & SO_OBJECTS) 3202 x = page->inuse; 3203 else 3204 x = 1; 3205 total += x; 3206 nodes[node] += x; 3207 } 3208 per_cpu[node]++; 3209 } 3210 } 3211 3212 for_each_online_node(node) { 3213 struct kmem_cache_node *n = get_node(s, node); 3214 3215 if (flags & SO_PARTIAL) { 3216 if (flags & SO_OBJECTS) 3217 x = count_partial(n); 3218 else 3219 x = n->nr_partial; 3220 total += x; 3221 nodes[node] += x; 3222 } 3223 3224 if (flags & SO_FULL) { 3225 int full_slabs = atomic_read(&n->nr_slabs) 3226 - per_cpu[node] 3227 - n->nr_partial; 3228 3229 if (flags & SO_OBJECTS) 3230 x = full_slabs * s->objects; 3231 else 3232 x = full_slabs; 3233 total += x; 3234 nodes[node] += x; 3235 } 3236 } 3237 3238 x = sprintf(buf, "%lu", total); 3239 #ifdef CONFIG_NUMA 3240 for_each_online_node(node) 3241 if (nodes[node]) 3242 x += sprintf(buf + x, " N%d=%lu", 3243 node, nodes[node]); 3244 #endif 3245 kfree(nodes); 3246 return x + sprintf(buf + x, "\n"); 3247 } 3248 3249 static int any_slab_objects(struct kmem_cache *s) 3250 { 3251 int node; 3252 int cpu; 3253 3254 for_each_possible_cpu(cpu) 3255 if (s->cpu_slab[cpu]) 3256 return 1; 3257 3258 for_each_node(node) { 3259 struct kmem_cache_node *n = get_node(s, node); 3260 3261 if (n->nr_partial || atomic_read(&n->nr_slabs)) 3262 return 1; 3263 } 3264 return 0; 3265 } 3266 3267 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3268 #define to_slab(n) container_of(n, struct kmem_cache, kobj); 3269 3270 struct slab_attribute { 3271 struct attribute attr; 3272 ssize_t (*show)(struct kmem_cache *s, char *buf); 3273 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 3274 }; 3275 3276 #define SLAB_ATTR_RO(_name) \ 3277 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 3278 3279 #define SLAB_ATTR(_name) \ 3280 static struct slab_attribute _name##_attr = \ 3281 __ATTR(_name, 0644, _name##_show, _name##_store) 3282 3283 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 3284 { 3285 return sprintf(buf, "%d\n", s->size); 3286 } 3287 SLAB_ATTR_RO(slab_size); 3288 3289 static ssize_t align_show(struct kmem_cache *s, char *buf) 3290 { 3291 return sprintf(buf, "%d\n", s->align); 3292 } 3293 SLAB_ATTR_RO(align); 3294 3295 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 3296 { 3297 return sprintf(buf, "%d\n", s->objsize); 3298 } 3299 SLAB_ATTR_RO(object_size); 3300 3301 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 3302 { 3303 return sprintf(buf, "%d\n", s->objects); 3304 } 3305 SLAB_ATTR_RO(objs_per_slab); 3306 3307 static ssize_t order_show(struct kmem_cache *s, char 
*buf) 3308 { 3309 return sprintf(buf, "%d\n", s->order); 3310 } 3311 SLAB_ATTR_RO(order); 3312 3313 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3314 { 3315 if (s->ctor) { 3316 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3317 3318 return n + sprintf(buf + n, "\n"); 3319 } 3320 return 0; 3321 } 3322 SLAB_ATTR_RO(ctor); 3323 3324 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 3325 { 3326 return sprintf(buf, "%d\n", s->refcount - 1); 3327 } 3328 SLAB_ATTR_RO(aliases); 3329 3330 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 3331 { 3332 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); 3333 } 3334 SLAB_ATTR_RO(slabs); 3335 3336 static ssize_t partial_show(struct kmem_cache *s, char *buf) 3337 { 3338 return slab_objects(s, buf, SO_PARTIAL); 3339 } 3340 SLAB_ATTR_RO(partial); 3341 3342 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 3343 { 3344 return slab_objects(s, buf, SO_CPU); 3345 } 3346 SLAB_ATTR_RO(cpu_slabs); 3347 3348 static ssize_t objects_show(struct kmem_cache *s, char *buf) 3349 { 3350 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); 3351 } 3352 SLAB_ATTR_RO(objects); 3353 3354 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 3355 { 3356 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 3357 } 3358 3359 static ssize_t sanity_checks_store(struct kmem_cache *s, 3360 const char *buf, size_t length) 3361 { 3362 s->flags &= ~SLAB_DEBUG_FREE; 3363 if (buf[0] == '1') 3364 s->flags |= SLAB_DEBUG_FREE; 3365 return length; 3366 } 3367 SLAB_ATTR(sanity_checks); 3368 3369 static ssize_t trace_show(struct kmem_cache *s, char *buf) 3370 { 3371 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 3372 } 3373 3374 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 3375 size_t length) 3376 { 3377 s->flags &= ~SLAB_TRACE; 3378 if (buf[0] == '1') 3379 s->flags |= SLAB_TRACE; 3380 return length; 3381 } 3382 SLAB_ATTR(trace); 3383 3384 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 3385 { 3386 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 3387 } 3388 3389 static ssize_t reclaim_account_store(struct kmem_cache *s, 3390 const char *buf, size_t length) 3391 { 3392 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 3393 if (buf[0] == '1') 3394 s->flags |= SLAB_RECLAIM_ACCOUNT; 3395 return length; 3396 } 3397 SLAB_ATTR(reclaim_account); 3398 3399 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 3400 { 3401 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 3402 } 3403 SLAB_ATTR_RO(hwcache_align); 3404 3405 #ifdef CONFIG_ZONE_DMA 3406 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 3407 { 3408 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 3409 } 3410 SLAB_ATTR_RO(cache_dma); 3411 #endif 3412 3413 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 3414 { 3415 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 3416 } 3417 SLAB_ATTR_RO(destroy_by_rcu); 3418 3419 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 3420 { 3421 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 3422 } 3423 3424 static ssize_t red_zone_store(struct kmem_cache *s, 3425 const char *buf, size_t length) 3426 { 3427 if (any_slab_objects(s)) 3428 return -EBUSY; 3429 3430 s->flags &= ~SLAB_RED_ZONE; 3431 if (buf[0] == '1') 3432 s->flags |= SLAB_RED_ZONE; 3433 calculate_sizes(s); 3434 return length; 3435 } 3436 SLAB_ATTR(red_zone); 3437 3438 static ssize_t poison_show(struct 
kmem_cache *s, char *buf) 3439 { 3440 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 3441 } 3442 3443 static ssize_t poison_store(struct kmem_cache *s, 3444 const char *buf, size_t length) 3445 { 3446 if (any_slab_objects(s)) 3447 return -EBUSY; 3448 3449 s->flags &= ~SLAB_POISON; 3450 if (buf[0] == '1') 3451 s->flags |= SLAB_POISON; 3452 calculate_sizes(s); 3453 return length; 3454 } 3455 SLAB_ATTR(poison); 3456 3457 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 3458 { 3459 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 3460 } 3461 3462 static ssize_t store_user_store(struct kmem_cache *s, 3463 const char *buf, size_t length) 3464 { 3465 if (any_slab_objects(s)) 3466 return -EBUSY; 3467 3468 s->flags &= ~SLAB_STORE_USER; 3469 if (buf[0] == '1') 3470 s->flags |= SLAB_STORE_USER; 3471 calculate_sizes(s); 3472 return length; 3473 } 3474 SLAB_ATTR(store_user); 3475 3476 static ssize_t validate_show(struct kmem_cache *s, char *buf) 3477 { 3478 return 0; 3479 } 3480 3481 static ssize_t validate_store(struct kmem_cache *s, 3482 const char *buf, size_t length) 3483 { 3484 int ret = -EINVAL; 3485 3486 if (buf[0] == '1') { 3487 ret = validate_slab_cache(s); 3488 if (ret >= 0) 3489 ret = length; 3490 } 3491 return ret; 3492 } 3493 SLAB_ATTR(validate); 3494 3495 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 3496 { 3497 return 0; 3498 } 3499 3500 static ssize_t shrink_store(struct kmem_cache *s, 3501 const char *buf, size_t length) 3502 { 3503 if (buf[0] == '1') { 3504 int rc = kmem_cache_shrink(s); 3505 3506 if (rc) 3507 return rc; 3508 } else 3509 return -EINVAL; 3510 return length; 3511 } 3512 SLAB_ATTR(shrink); 3513 3514 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 3515 { 3516 if (!(s->flags & SLAB_STORE_USER)) 3517 return -ENOSYS; 3518 return list_locations(s, buf, TRACK_ALLOC); 3519 } 3520 SLAB_ATTR_RO(alloc_calls); 3521 3522 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 3523 { 3524 if (!(s->flags & SLAB_STORE_USER)) 3525 return -ENOSYS; 3526 return list_locations(s, buf, TRACK_FREE); 3527 } 3528 SLAB_ATTR_RO(free_calls); 3529 3530 #ifdef CONFIG_NUMA 3531 static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3532 { 3533 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3534 } 3535 3536 static ssize_t defrag_ratio_store(struct kmem_cache *s, 3537 const char *buf, size_t length) 3538 { 3539 int n = simple_strtoul(buf, NULL, 10); 3540 3541 if (n < 100) 3542 s->defrag_ratio = n * 10; 3543 return length; 3544 } 3545 SLAB_ATTR(defrag_ratio); 3546 #endif 3547 3548 static struct attribute * slab_attrs[] = { 3549 &slab_size_attr.attr, 3550 &object_size_attr.attr, 3551 &objs_per_slab_attr.attr, 3552 &order_attr.attr, 3553 &objects_attr.attr, 3554 &slabs_attr.attr, 3555 &partial_attr.attr, 3556 &cpu_slabs_attr.attr, 3557 &ctor_attr.attr, 3558 &aliases_attr.attr, 3559 &align_attr.attr, 3560 &sanity_checks_attr.attr, 3561 &trace_attr.attr, 3562 &hwcache_align_attr.attr, 3563 &reclaim_account_attr.attr, 3564 &destroy_by_rcu_attr.attr, 3565 &red_zone_attr.attr, 3566 &poison_attr.attr, 3567 &store_user_attr.attr, 3568 &validate_attr.attr, 3569 &shrink_attr.attr, 3570 &alloc_calls_attr.attr, 3571 &free_calls_attr.attr, 3572 #ifdef CONFIG_ZONE_DMA 3573 &cache_dma_attr.attr, 3574 #endif 3575 #ifdef CONFIG_NUMA 3576 &defrag_ratio_attr.attr, 3577 #endif 3578 NULL 3579 }; 3580 3581 static struct attribute_group slab_attr_group = { 3582 .attrs = slab_attrs, 3583 }; 3584 3585 static ssize_t slab_attr_show(struct 
kobject *kobj, 3586 struct attribute *attr, 3587 char *buf) 3588 { 3589 struct slab_attribute *attribute; 3590 struct kmem_cache *s; 3591 int err; 3592 3593 attribute = to_slab_attr(attr); 3594 s = to_slab(kobj); 3595 3596 if (!attribute->show) 3597 return -EIO; 3598 3599 err = attribute->show(s, buf); 3600 3601 return err; 3602 } 3603 3604 static ssize_t slab_attr_store(struct kobject *kobj, 3605 struct attribute *attr, 3606 const char *buf, size_t len) 3607 { 3608 struct slab_attribute *attribute; 3609 struct kmem_cache *s; 3610 int err; 3611 3612 attribute = to_slab_attr(attr); 3613 s = to_slab(kobj); 3614 3615 if (!attribute->store) 3616 return -EIO; 3617 3618 err = attribute->store(s, buf, len); 3619 3620 return err; 3621 } 3622 3623 static struct sysfs_ops slab_sysfs_ops = { 3624 .show = slab_attr_show, 3625 .store = slab_attr_store, 3626 }; 3627 3628 static struct kobj_type slab_ktype = { 3629 .sysfs_ops = &slab_sysfs_ops, 3630 }; 3631 3632 static int uevent_filter(struct kset *kset, struct kobject *kobj) 3633 { 3634 struct kobj_type *ktype = get_ktype(kobj); 3635 3636 if (ktype == &slab_ktype) 3637 return 1; 3638 return 0; 3639 } 3640 3641 static struct kset_uevent_ops slab_uevent_ops = { 3642 .filter = uevent_filter, 3643 }; 3644 3645 static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); 3646 3647 #define ID_STR_LENGTH 64 3648 3649 /* Create a unique string id for a slab cache: 3650 * format 3651 * :[flags-]size:[memory address of kmemcache] 3652 */ 3653 static char *create_unique_id(struct kmem_cache *s) 3654 { 3655 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 3656 char *p = name; 3657 3658 BUG_ON(!name); 3659 3660 *p++ = ':'; 3661 /* 3662 * First flags affecting slabcache operations. We will only 3663 * get here for aliasable slabs so we do not need to support 3664 * too many flags. The flags here must cover all flags that 3665 * are matched during merging to guarantee that the id is 3666 * unique. 3667 */ 3668 if (s->flags & SLAB_CACHE_DMA) 3669 *p++ = 'd'; 3670 if (s->flags & SLAB_RECLAIM_ACCOUNT) 3671 *p++ = 'a'; 3672 if (s->flags & SLAB_DEBUG_FREE) 3673 *p++ = 'F'; 3674 if (p != name + 1) 3675 *p++ = '-'; 3676 p += sprintf(p, "%07d", s->size); 3677 BUG_ON(p > name + ID_STR_LENGTH - 1); 3678 return name; 3679 } 3680 3681 static int sysfs_slab_add(struct kmem_cache *s) 3682 { 3683 int err; 3684 const char *name; 3685 int unmergeable; 3686 3687 if (slab_state < SYSFS) 3688 /* Defer until later */ 3689 return 0; 3690 3691 unmergeable = slab_unmergeable(s); 3692 if (unmergeable) { 3693 /* 3694 * Slabcache can never be merged so we can use the name proper. 3695 * This is typically the case for debug situations. In that 3696 * case we can catch duplicate names easily. 3697 */ 3698 sysfs_remove_link(&slab_subsys.kobj, s->name); 3699 name = s->name; 3700 } else { 3701 /* 3702 * Create a unique name for the slab as a target 3703 * for the symlinks. 
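		 *
		 * For example, create_unique_id() turns a plain mergeable
		 * cache of size 192 into ":0000192" and a DMA cache of
		 * size 64 into ":d-0000064".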
3704 */ 3705 name = create_unique_id(s); 3706 } 3707 3708 kobj_set_kset_s(s, slab_subsys); 3709 kobject_set_name(&s->kobj, name); 3710 kobject_init(&s->kobj); 3711 err = kobject_add(&s->kobj); 3712 if (err) 3713 return err; 3714 3715 err = sysfs_create_group(&s->kobj, &slab_attr_group); 3716 if (err) 3717 return err; 3718 kobject_uevent(&s->kobj, KOBJ_ADD); 3719 if (!unmergeable) { 3720 /* Setup first alias */ 3721 sysfs_slab_alias(s, s->name); 3722 kfree(name); 3723 } 3724 return 0; 3725 } 3726 3727 static void sysfs_slab_remove(struct kmem_cache *s) 3728 { 3729 kobject_uevent(&s->kobj, KOBJ_REMOVE); 3730 kobject_del(&s->kobj); 3731 } 3732 3733 /* 3734 * Need to buffer aliases during bootup until sysfs becomes 3735 * available lest we loose that information. 3736 */ 3737 struct saved_alias { 3738 struct kmem_cache *s; 3739 const char *name; 3740 struct saved_alias *next; 3741 }; 3742 3743 static struct saved_alias *alias_list; 3744 3745 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 3746 { 3747 struct saved_alias *al; 3748 3749 if (slab_state == SYSFS) { 3750 /* 3751 * If we have a leftover link then remove it. 3752 */ 3753 sysfs_remove_link(&slab_subsys.kobj, name); 3754 return sysfs_create_link(&slab_subsys.kobj, 3755 &s->kobj, name); 3756 } 3757 3758 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 3759 if (!al) 3760 return -ENOMEM; 3761 3762 al->s = s; 3763 al->name = name; 3764 al->next = alias_list; 3765 alias_list = al; 3766 return 0; 3767 } 3768 3769 static int __init slab_sysfs_init(void) 3770 { 3771 struct kmem_cache *s; 3772 int err; 3773 3774 err = subsystem_register(&slab_subsys); 3775 if (err) { 3776 printk(KERN_ERR "Cannot register slab subsystem.\n"); 3777 return -ENOSYS; 3778 } 3779 3780 slab_state = SYSFS; 3781 3782 list_for_each_entry(s, &slab_caches, list) { 3783 err = sysfs_slab_add(s); 3784 BUG_ON(err); 3785 } 3786 3787 while (alias_list) { 3788 struct saved_alias *al = alias_list; 3789 3790 alias_list = alias_list->next; 3791 err = sysfs_slab_alias(al->s, al->name); 3792 BUG_ON(err); 3793 kfree(al); 3794 } 3795 3796 resiliency_test(); 3797 return 0; 3798 } 3799 3800 __initcall(slab_sysfs_init); 3801 #endif 3802