1 /* 2 * SLUB: A slab allocator that limits cache line use instead of queuing 3 * objects in per cpu and per node lists. 4 * 5 * The allocator synchronizes using per slab locks or atomic operatios 6 * and only uses a centralized lock to manage a pool of partial slabs. 7 * 8 * (C) 2007 SGI, Christoph Lameter 9 * (C) 2011 Linux Foundation, Christoph Lameter 10 */ 11 12 #include <linux/mm.h> 13 #include <linux/swap.h> /* struct reclaim_state */ 14 #include <linux/module.h> 15 #include <linux/bit_spinlock.h> 16 #include <linux/interrupt.h> 17 #include <linux/bitops.h> 18 #include <linux/slab.h> 19 #include <linux/proc_fs.h> 20 #include <linux/seq_file.h> 21 #include <linux/kmemcheck.h> 22 #include <linux/cpu.h> 23 #include <linux/cpuset.h> 24 #include <linux/mempolicy.h> 25 #include <linux/ctype.h> 26 #include <linux/debugobjects.h> 27 #include <linux/kallsyms.h> 28 #include <linux/memory.h> 29 #include <linux/math64.h> 30 #include <linux/fault-inject.h> 31 #include <linux/stacktrace.h> 32 33 #include <trace/events/kmem.h> 34 35 /* 36 * Lock order: 37 * 1. slub_lock (Global Semaphore) 38 * 2. node->list_lock 39 * 3. slab_lock(page) (Only on some arches and for debugging) 40 * 41 * slub_lock 42 * 43 * The role of the slub_lock is to protect the list of all the slabs 44 * and to synchronize major metadata changes to slab cache structures. 45 * 46 * The slab_lock is only used for debugging and on arches that do not 47 * have the ability to do a cmpxchg_double. It only protects the second 48 * double word in the page struct. Meaning 49 * A. page->freelist -> List of object free in a page 50 * B. page->counters -> Counters of objects 51 * C. page->frozen -> frozen state 52 * 53 * If a slab is frozen then it is exempt from list management. It is not 54 * on any list. The processor that froze the slab is the one who can 55 * perform list operations on the page. 
Other processors may put objects 56 * onto the freelist but the processor that froze the slab is the only 57 * one that can retrieve the objects from the page's freelist. 58 * 59 * The list_lock protects the partial and full list on each node and 60 * the partial slab counter. If taken then no new slabs may be added or 61 * removed from the lists nor make the number of partial slabs be modified. 62 * (Note that the total number of slabs is an atomic value that may be 63 * modified without taking the list lock). 64 * 65 * The list_lock is a centralized lock and thus we avoid taking it as 66 * much as possible. As long as SLUB does not have to handle partial 67 * slabs, operations can continue without any centralized lock. F.e. 68 * allocating a long series of objects that fill up slabs does not require 69 * the list lock. 70 * Interrupts are disabled during allocation and deallocation in order to 71 * make the slab allocator safe to use in the context of an irq. In addition 72 * interrupts are disabled to ensure that the processor does not change 73 * while handling per_cpu slabs, due to kernel preemption. 74 * 75 * SLUB assigns one slab for allocation to each processor. 76 * Allocations only occur from these slabs called cpu slabs. 77 * 78 * Slabs with free elements are kept on a partial list and during regular 79 * operations no list for full slabs is used. If an object in a full slab is 80 * freed then the slab will show up again on the partial lists. 81 * We track full slabs for debugging purposes though because otherwise we 82 * cannot scan all objects. 83 * 84 * Slabs are freed when they become empty. Teardown and setup is 85 * minimal so we rely on the page allocators per cpu caches for 86 * fast frees and allocs. 87 * 88 * Overloading of page flags that are otherwise used for LRU management. 89 * 90 * PageActive The slab is frozen and exempt from list processing. 
91 * This means that the slab is dedicated to a purpose 92 * such as satisfying allocations for a specific 93 * processor. Objects may be freed in the slab while 94 * it is frozen but slab_free will then skip the usual 95 * list operations. It is up to the processor holding 96 * the slab to integrate the slab into the slab lists 97 * when the slab is no longer needed. 98 * 99 * One use of this flag is to mark slabs that are 100 * used for allocations. Then such a slab becomes a cpu 101 * slab. The cpu slab may be equipped with an additional 102 * freelist that allows lockless access to 103 * free objects in addition to the regular freelist 104 * that requires the slab lock. 105 * 106 * PageError Slab requires special handling due to debug 107 * options set. This moves slab handling out of 108 * the fast path and disables lockless freelists. 109 */ 110 111 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 112 SLAB_TRACE | SLAB_DEBUG_FREE) 113 114 static inline int kmem_cache_debug(struct kmem_cache *s) 115 { 116 #ifdef CONFIG_SLUB_DEBUG 117 return unlikely(s->flags & SLAB_DEBUG_FLAGS); 118 #else 119 return 0; 120 #endif 121 } 122 123 /* 124 * Issues still to be resolved: 125 * 126 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 127 * 128 * - Variable sizing of the per node arrays 129 */ 130 131 /* Enable to test recovery from slab corruption on boot */ 132 #undef SLUB_RESILIENCY_TEST 133 134 /* Enable to log cmpxchg failures */ 135 #undef SLUB_DEBUG_CMPXCHG 136 137 /* 138 * Mininum number of partial slabs. These will be left on the partial 139 * lists even if they are empty. kmem_cache_shrink may reclaim them. 140 */ 141 #define MIN_PARTIAL 5 142 143 /* 144 * Maximum number of desirable partial slabs. 145 * The existence of more partial slabs makes kmem_cache_shrink 146 * sort the partial list by the number of objects in the. 
147 */ 148 #define MAX_PARTIAL 10 149 150 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ 151 SLAB_POISON | SLAB_STORE_USER) 152 153 /* 154 * Debugging flags that require metadata to be stored in the slab. These get 155 * disabled when slub_debug=O is used and a cache's min order increases with 156 * metadata. 157 */ 158 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 159 160 /* 161 * Set of flags that will prevent slab merging 162 */ 163 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 164 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 165 SLAB_FAILSLAB) 166 167 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 168 SLAB_CACHE_DMA | SLAB_NOTRACK) 169 170 #define OO_SHIFT 16 171 #define OO_MASK ((1 << OO_SHIFT) - 1) 172 #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 173 174 /* Internal SLUB flags */ 175 #define __OBJECT_POISON 0x80000000UL /* Poison object */ 176 #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 177 178 static int kmem_size = sizeof(struct kmem_cache); 179 180 #ifdef CONFIG_SMP 181 static struct notifier_block slab_notifier; 182 #endif 183 184 static enum { 185 DOWN, /* No slab functionality available */ 186 PARTIAL, /* Kmem_cache_node works */ 187 UP, /* Everything works but does not show up in sysfs */ 188 SYSFS /* Sysfs up */ 189 } slab_state = DOWN; 190 191 /* A list of all slab caches on the system */ 192 static DECLARE_RWSEM(slub_lock); 193 static LIST_HEAD(slab_caches); 194 195 /* 196 * Tracking user of a slab. 
197 */ 198 #define TRACK_ADDRS_COUNT 16 199 struct track { 200 unsigned long addr; /* Called from address */ 201 #ifdef CONFIG_STACKTRACE 202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ 203 #endif 204 int cpu; /* Was running on cpu */ 205 int pid; /* Pid context */ 206 unsigned long when; /* When did the operation occur */ 207 }; 208 209 enum track_item { TRACK_ALLOC, TRACK_FREE }; 210 211 #ifdef CONFIG_SYSFS 212 static int sysfs_slab_add(struct kmem_cache *); 213 static int sysfs_slab_alias(struct kmem_cache *, const char *); 214 static void sysfs_slab_remove(struct kmem_cache *); 215 216 #else 217 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 218 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 219 { return 0; } 220 static inline void sysfs_slab_remove(struct kmem_cache *s) 221 { 222 kfree(s->name); 223 kfree(s); 224 } 225 226 #endif 227 228 static inline void stat(const struct kmem_cache *s, enum stat_item si) 229 { 230 #ifdef CONFIG_SLUB_STATS 231 __this_cpu_inc(s->cpu_slab->stat[si]); 232 #endif 233 } 234 235 /******************************************************************** 236 * Core slab cache functions 237 *******************************************************************/ 238 239 int slab_is_available(void) 240 { 241 return slab_state >= UP; 242 } 243 244 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 245 { 246 return s->node[node]; 247 } 248 249 /* Verify that a pointer has an address that is valid within a slab page */ 250 static inline int check_valid_pointer(struct kmem_cache *s, 251 struct page *page, const void *object) 252 { 253 void *base; 254 255 if (!object) 256 return 1; 257 258 base = page_address(page); 259 if (object < base || object >= base + page->objects * s->size || 260 (object - base) % s->size) { 261 return 0; 262 } 263 264 return 1; 265 } 266 267 static inline void *get_freepointer(struct kmem_cache *s, void *object) 268 { 269 
return *(void **)(object + s->offset); 270 } 271 272 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 273 { 274 void *p; 275 276 #ifdef CONFIG_DEBUG_PAGEALLOC 277 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 278 #else 279 p = get_freepointer(s, object); 280 #endif 281 return p; 282 } 283 284 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 285 { 286 *(void **)(object + s->offset) = fp; 287 } 288 289 /* Loop over all objects in a slab */ 290 #define for_each_object(__p, __s, __addr, __objects) \ 291 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 292 __p += (__s)->size) 293 294 /* Determine object index from a given position */ 295 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 296 { 297 return (p - addr) / s->size; 298 } 299 300 static inline size_t slab_ksize(const struct kmem_cache *s) 301 { 302 #ifdef CONFIG_SLUB_DEBUG 303 /* 304 * Debugging requires use of the padding between object 305 * and whatever may come after it. 306 */ 307 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 308 return s->objsize; 309 310 #endif 311 /* 312 * If we have the need to store the freelist pointer 313 * back there or track user information then we can 314 * only use the space before that information. 
315 */ 316 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 317 return s->inuse; 318 /* 319 * Else we can use all the padding etc for the allocation 320 */ 321 return s->size; 322 } 323 324 static inline int order_objects(int order, unsigned long size, int reserved) 325 { 326 return ((PAGE_SIZE << order) - reserved) / size; 327 } 328 329 static inline struct kmem_cache_order_objects oo_make(int order, 330 unsigned long size, int reserved) 331 { 332 struct kmem_cache_order_objects x = { 333 (order << OO_SHIFT) + order_objects(order, size, reserved) 334 }; 335 336 return x; 337 } 338 339 static inline int oo_order(struct kmem_cache_order_objects x) 340 { 341 return x.x >> OO_SHIFT; 342 } 343 344 static inline int oo_objects(struct kmem_cache_order_objects x) 345 { 346 return x.x & OO_MASK; 347 } 348 349 /* 350 * Per slab locking using the pagelock 351 */ 352 static __always_inline void slab_lock(struct page *page) 353 { 354 bit_spin_lock(PG_locked, &page->flags); 355 } 356 357 static __always_inline void slab_unlock(struct page *page) 358 { 359 __bit_spin_unlock(PG_locked, &page->flags); 360 } 361 362 /* Interrupts must be disabled (for the fallback code to work right) */ 363 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 364 void *freelist_old, unsigned long counters_old, 365 void *freelist_new, unsigned long counters_new, 366 const char *n) 367 { 368 VM_BUG_ON(!irqs_disabled()); 369 #ifdef CONFIG_CMPXCHG_DOUBLE 370 if (s->flags & __CMPXCHG_DOUBLE) { 371 if (cmpxchg_double(&page->freelist, 372 freelist_old, counters_old, 373 freelist_new, counters_new)) 374 return 1; 375 } else 376 #endif 377 { 378 slab_lock(page); 379 if (page->freelist == freelist_old && page->counters == counters_old) { 380 page->freelist = freelist_new; 381 page->counters = counters_new; 382 slab_unlock(page); 383 return 1; 384 } 385 slab_unlock(page); 386 } 387 388 cpu_relax(); 389 stat(s, CMPXCHG_DOUBLE_FAIL); 390 391 #ifdef SLUB_DEBUG_CMPXCHG 392 
printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 393 #endif 394 395 return 0; 396 } 397 398 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 399 void *freelist_old, unsigned long counters_old, 400 void *freelist_new, unsigned long counters_new, 401 const char *n) 402 { 403 #ifdef CONFIG_CMPXCHG_DOUBLE 404 if (s->flags & __CMPXCHG_DOUBLE) { 405 if (cmpxchg_double(&page->freelist, 406 freelist_old, counters_old, 407 freelist_new, counters_new)) 408 return 1; 409 } else 410 #endif 411 { 412 unsigned long flags; 413 414 local_irq_save(flags); 415 slab_lock(page); 416 if (page->freelist == freelist_old && page->counters == counters_old) { 417 page->freelist = freelist_new; 418 page->counters = counters_new; 419 slab_unlock(page); 420 local_irq_restore(flags); 421 return 1; 422 } 423 slab_unlock(page); 424 local_irq_restore(flags); 425 } 426 427 cpu_relax(); 428 stat(s, CMPXCHG_DOUBLE_FAIL); 429 430 #ifdef SLUB_DEBUG_CMPXCHG 431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 432 #endif 433 434 return 0; 435 } 436 437 #ifdef CONFIG_SLUB_DEBUG 438 /* 439 * Determine a map of object in use on a page. 440 * 441 * Node listlock must be held to guarantee that the page does 442 * not vanish from under us. 
443 */ 444 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 445 { 446 void *p; 447 void *addr = page_address(page); 448 449 for (p = page->freelist; p; p = get_freepointer(s, p)) 450 set_bit(slab_index(p, s, addr), map); 451 } 452 453 /* 454 * Debug settings: 455 */ 456 #ifdef CONFIG_SLUB_DEBUG_ON 457 static int slub_debug = DEBUG_DEFAULT_FLAGS; 458 #else 459 static int slub_debug; 460 #endif 461 462 static char *slub_debug_slabs; 463 static int disable_higher_order_debug; 464 465 /* 466 * Object debugging 467 */ 468 static void print_section(char *text, u8 *addr, unsigned int length) 469 { 470 int i, offset; 471 int newline = 1; 472 char ascii[17]; 473 474 ascii[16] = 0; 475 476 for (i = 0; i < length; i++) { 477 if (newline) { 478 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 479 newline = 0; 480 } 481 printk(KERN_CONT " %02x", addr[i]); 482 offset = i % 16; 483 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 484 if (offset == 15) { 485 printk(KERN_CONT " %s\n", ascii); 486 newline = 1; 487 } 488 } 489 if (!newline) { 490 i %= 16; 491 while (i < 16) { 492 printk(KERN_CONT " "); 493 ascii[i] = ' '; 494 i++; 495 } 496 printk(KERN_CONT " %s\n", ascii); 497 } 498 } 499 500 static struct track *get_track(struct kmem_cache *s, void *object, 501 enum track_item alloc) 502 { 503 struct track *p; 504 505 if (s->offset) 506 p = object + s->offset + sizeof(void *); 507 else 508 p = object + s->inuse; 509 510 return p + alloc; 511 } 512 513 static void set_track(struct kmem_cache *s, void *object, 514 enum track_item alloc, unsigned long addr) 515 { 516 struct track *p = get_track(s, object, alloc); 517 518 if (addr) { 519 #ifdef CONFIG_STACKTRACE 520 struct stack_trace trace; 521 int i; 522 523 trace.nr_entries = 0; 524 trace.max_entries = TRACK_ADDRS_COUNT; 525 trace.entries = p->addrs; 526 trace.skip = 3; 527 save_stack_trace(&trace); 528 529 /* See rant in lockdep.c */ 530 if (trace.nr_entries != 0 && 531 trace.entries[trace.nr_entries 
- 1] == ULONG_MAX) 532 trace.nr_entries--; 533 534 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) 535 p->addrs[i] = 0; 536 #endif 537 p->addr = addr; 538 p->cpu = smp_processor_id(); 539 p->pid = current->pid; 540 p->when = jiffies; 541 } else 542 memset(p, 0, sizeof(struct track)); 543 } 544 545 static void init_tracking(struct kmem_cache *s, void *object) 546 { 547 if (!(s->flags & SLAB_STORE_USER)) 548 return; 549 550 set_track(s, object, TRACK_FREE, 0UL); 551 set_track(s, object, TRACK_ALLOC, 0UL); 552 } 553 554 static void print_track(const char *s, struct track *t) 555 { 556 if (!t->addr) 557 return; 558 559 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 560 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 561 #ifdef CONFIG_STACKTRACE 562 { 563 int i; 564 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 565 if (t->addrs[i]) 566 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 567 else 568 break; 569 } 570 #endif 571 } 572 573 static void print_tracking(struct kmem_cache *s, void *object) 574 { 575 if (!(s->flags & SLAB_STORE_USER)) 576 return; 577 578 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 579 print_track("Freed", get_track(s, object, TRACK_FREE)); 580 } 581 582 static void print_page_info(struct page *page) 583 { 584 printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", 585 page, page->objects, page->inuse, page->freelist, page->flags); 586 587 } 588 589 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 590 { 591 va_list args; 592 char buf[100]; 593 594 va_start(args, fmt); 595 vsnprintf(buf, sizeof(buf), fmt, args); 596 va_end(args); 597 printk(KERN_ERR "========================================" 598 "=====================================\n"); 599 printk(KERN_ERR "BUG %s: %s\n", s->name, buf); 600 printk(KERN_ERR "----------------------------------------" 601 "-------------------------------------\n\n"); 602 } 603 604 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
605 { 606 va_list args; 607 char buf[100]; 608 609 va_start(args, fmt); 610 vsnprintf(buf, sizeof(buf), fmt, args); 611 va_end(args); 612 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 613 } 614 615 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 616 { 617 unsigned int off; /* Offset of last byte */ 618 u8 *addr = page_address(page); 619 620 print_tracking(s, p); 621 622 print_page_info(page); 623 624 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 625 p, p - addr, get_freepointer(s, p)); 626 627 if (p > addr + 16) 628 print_section("Bytes b4", p - 16, 16); 629 630 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); 631 632 if (s->flags & SLAB_RED_ZONE) 633 print_section("Redzone", p + s->objsize, 634 s->inuse - s->objsize); 635 636 if (s->offset) 637 off = s->offset + sizeof(void *); 638 else 639 off = s->inuse; 640 641 if (s->flags & SLAB_STORE_USER) 642 off += 2 * sizeof(struct track); 643 644 if (off != s->size) 645 /* Beginning of the filler is the free pointer */ 646 print_section("Padding", p + off, s->size - off); 647 648 dump_stack(); 649 } 650 651 static void object_err(struct kmem_cache *s, struct page *page, 652 u8 *object, char *reason) 653 { 654 slab_bug(s, "%s", reason); 655 print_trailer(s, page, object); 656 } 657 658 static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 
659 { 660 va_list args; 661 char buf[100]; 662 663 va_start(args, fmt); 664 vsnprintf(buf, sizeof(buf), fmt, args); 665 va_end(args); 666 slab_bug(s, "%s", buf); 667 print_page_info(page); 668 dump_stack(); 669 } 670 671 static void init_object(struct kmem_cache *s, void *object, u8 val) 672 { 673 u8 *p = object; 674 675 if (s->flags & __OBJECT_POISON) { 676 memset(p, POISON_FREE, s->objsize - 1); 677 p[s->objsize - 1] = POISON_END; 678 } 679 680 if (s->flags & SLAB_RED_ZONE) 681 memset(p + s->objsize, val, s->inuse - s->objsize); 682 } 683 684 static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) 685 { 686 while (bytes) { 687 if (*start != value) 688 return start; 689 start++; 690 bytes--; 691 } 692 return NULL; 693 } 694 695 static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) 696 { 697 u64 value64; 698 unsigned int words, prefix; 699 700 if (bytes <= 16) 701 return check_bytes8(start, value, bytes); 702 703 value64 = value | value << 8 | value << 16 | value << 24; 704 value64 = value64 | value64 << 32; 705 prefix = 8 - ((unsigned long)start) % 8; 706 707 if (prefix) { 708 u8 *r = check_bytes8(start, value, prefix); 709 if (r) 710 return r; 711 start += prefix; 712 bytes -= prefix; 713 } 714 715 words = bytes / 8; 716 717 while (words) { 718 if (*(u64 *)start != value64) 719 return check_bytes8(start, value, 8); 720 start += 8; 721 words--; 722 } 723 724 return check_bytes8(start, value, bytes % 8); 725 } 726 727 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 728 void *from, void *to) 729 { 730 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 731 memset(from, data, to - from); 732 } 733 734 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 735 u8 *object, char *what, 736 u8 *start, unsigned int value, unsigned int bytes) 737 { 738 u8 *fault; 739 u8 *end; 740 741 fault = check_bytes(start, value, bytes); 742 if (!fault) 743 return 1; 744 745 end = start + bytes; 746 while (end > 
fault && end[-1] == value) 747 end--; 748 749 slab_bug(s, "%s overwritten", what); 750 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", 751 fault, end - 1, fault[0], value); 752 print_trailer(s, page, object); 753 754 restore_bytes(s, what, value, fault, end); 755 return 0; 756 } 757 758 /* 759 * Object layout: 760 * 761 * object address 762 * Bytes of the object to be managed. 763 * If the freepointer may overlay the object then the free 764 * pointer is the first word of the object. 765 * 766 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 767 * 0xa5 (POISON_END) 768 * 769 * object + s->objsize 770 * Padding to reach word boundary. This is also used for Redzoning. 771 * Padding is extended by another word if Redzoning is enabled and 772 * objsize == inuse. 773 * 774 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 775 * 0xcc (RED_ACTIVE) for objects in use. 776 * 777 * object + s->inuse 778 * Meta data starts here. 779 * 780 * A. Free pointer (if we cannot overwrite object on free) 781 * B. Tracking data for SLAB_STORE_USER 782 * C. Padding to reach required alignment boundary or at mininum 783 * one word if debugging is on to be able to detect writes 784 * before the word boundary. 785 * 786 * Padding is done using 0x5a (POISON_INUSE) 787 * 788 * object + s->size 789 * Nothing is used beyond s->size. 790 * 791 * If slabcaches are merged then the objsize and inuse boundaries are mostly 792 * ignored. And therefore no slab options that rely on these boundaries 793 * may be used with merged slabcaches. 794 */ 795 796 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 797 { 798 unsigned long off = s->inuse; /* The end of info */ 799 800 if (s->offset) 801 /* Freepointer is placed after the object. 
*/ 802 off += sizeof(void *); 803 804 if (s->flags & SLAB_STORE_USER) 805 /* We also have user information there */ 806 off += 2 * sizeof(struct track); 807 808 if (s->size == off) 809 return 1; 810 811 return check_bytes_and_report(s, page, p, "Object padding", 812 p + off, POISON_INUSE, s->size - off); 813 } 814 815 /* Check the pad bytes at the end of a slab page */ 816 static int slab_pad_check(struct kmem_cache *s, struct page *page) 817 { 818 u8 *start; 819 u8 *fault; 820 u8 *end; 821 int length; 822 int remainder; 823 824 if (!(s->flags & SLAB_POISON)) 825 return 1; 826 827 start = page_address(page); 828 length = (PAGE_SIZE << compound_order(page)) - s->reserved; 829 end = start + length; 830 remainder = length % s->size; 831 if (!remainder) 832 return 1; 833 834 fault = check_bytes(end - remainder, POISON_INUSE, remainder); 835 if (!fault) 836 return 1; 837 while (end > fault && end[-1] == POISON_INUSE) 838 end--; 839 840 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 841 print_section("Padding", end - remainder, remainder); 842 843 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 844 return 0; 845 } 846 847 static int check_object(struct kmem_cache *s, struct page *page, 848 void *object, u8 val) 849 { 850 u8 *p = object; 851 u8 *endobject = object + s->objsize; 852 853 if (s->flags & SLAB_RED_ZONE) { 854 if (!check_bytes_and_report(s, page, object, "Redzone", 855 endobject, val, s->inuse - s->objsize)) 856 return 0; 857 } else { 858 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 859 check_bytes_and_report(s, page, p, "Alignment padding", 860 endobject, POISON_INUSE, s->inuse - s->objsize); 861 } 862 } 863 864 if (s->flags & SLAB_POISON) { 865 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 866 (!check_bytes_and_report(s, page, p, "Poison", p, 867 POISON_FREE, s->objsize - 1) || 868 !check_bytes_and_report(s, page, p, "Poison", 869 p + s->objsize - 1, POISON_END, 1))) 870 return 0; 871 /* 
872 * check_pad_bytes cleans up on its own. 873 */ 874 check_pad_bytes(s, page, p); 875 } 876 877 if (!s->offset && val == SLUB_RED_ACTIVE) 878 /* 879 * Object and freepointer overlap. Cannot check 880 * freepointer while object is allocated. 881 */ 882 return 1; 883 884 /* Check free pointer validity */ 885 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 886 object_err(s, page, p, "Freepointer corrupt"); 887 /* 888 * No choice but to zap it and thus lose the remainder 889 * of the free objects in this slab. May cause 890 * another error because the object count is now wrong. 891 */ 892 set_freepointer(s, p, NULL); 893 return 0; 894 } 895 return 1; 896 } 897 898 static int check_slab(struct kmem_cache *s, struct page *page) 899 { 900 int maxobj; 901 902 VM_BUG_ON(!irqs_disabled()); 903 904 if (!PageSlab(page)) { 905 slab_err(s, page, "Not a valid slab page"); 906 return 0; 907 } 908 909 maxobj = order_objects(compound_order(page), s->size, s->reserved); 910 if (page->objects > maxobj) { 911 slab_err(s, page, "objects %u > max %u", 912 s->name, page->objects, maxobj); 913 return 0; 914 } 915 if (page->inuse > page->objects) { 916 slab_err(s, page, "inuse %u > max %u", 917 s->name, page->inuse, page->objects); 918 return 0; 919 } 920 /* Slab_pad_check fixes things up after itself */ 921 slab_pad_check(s, page); 922 return 1; 923 } 924 925 /* 926 * Determine if a certain object on a page is on the freelist. Must hold the 927 * slab lock to guarantee that the chains are in a consistent state. 
928 */ 929 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 930 { 931 int nr = 0; 932 void *fp; 933 void *object = NULL; 934 unsigned long max_objects; 935 936 fp = page->freelist; 937 while (fp && nr <= page->objects) { 938 if (fp == search) 939 return 1; 940 if (!check_valid_pointer(s, page, fp)) { 941 if (object) { 942 object_err(s, page, object, 943 "Freechain corrupt"); 944 set_freepointer(s, object, NULL); 945 break; 946 } else { 947 slab_err(s, page, "Freepointer corrupt"); 948 page->freelist = NULL; 949 page->inuse = page->objects; 950 slab_fix(s, "Freelist cleared"); 951 return 0; 952 } 953 break; 954 } 955 object = fp; 956 fp = get_freepointer(s, object); 957 nr++; 958 } 959 960 max_objects = order_objects(compound_order(page), s->size, s->reserved); 961 if (max_objects > MAX_OBJS_PER_PAGE) 962 max_objects = MAX_OBJS_PER_PAGE; 963 964 if (page->objects != max_objects) { 965 slab_err(s, page, "Wrong number of objects. Found %d but " 966 "should be %d", page->objects, max_objects); 967 page->objects = max_objects; 968 slab_fix(s, "Number of objects adjusted."); 969 } 970 if (page->inuse != page->objects - nr) { 971 slab_err(s, page, "Wrong object count. Counter is %d but " 972 "counted were %d", page->inuse, page->objects - nr); 973 page->inuse = page->objects - nr; 974 slab_fix(s, "Object count adjusted."); 975 } 976 return search == NULL; 977 } 978 979 static void trace(struct kmem_cache *s, struct page *page, void *object, 980 int alloc) 981 { 982 if (s->flags & SLAB_TRACE) { 983 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 984 s->name, 985 alloc ? "alloc" : "free", 986 object, page->inuse, 987 page->freelist); 988 989 if (!alloc) 990 print_section("Object", (void *)object, s->objsize); 991 992 dump_stack(); 993 } 994 } 995 996 /* 997 * Hooks for other subsystems that check memory allocations. In a typical 998 * production configuration these hooks all should produce no code at all. 
999 */ 1000 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1001 { 1002 flags &= gfp_allowed_mask; 1003 lockdep_trace_alloc(flags); 1004 might_sleep_if(flags & __GFP_WAIT); 1005 1006 return should_failslab(s->objsize, flags, s->flags); 1007 } 1008 1009 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 1010 { 1011 flags &= gfp_allowed_mask; 1012 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1013 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 1014 } 1015 1016 static inline void slab_free_hook(struct kmem_cache *s, void *x) 1017 { 1018 kmemleak_free_recursive(x, s->flags); 1019 1020 /* 1021 * Trouble is that we may no longer disable interupts in the fast path 1022 * So in order to make the debug calls that expect irqs to be 1023 * disabled we need to disable interrupts temporarily. 1024 */ 1025 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 1026 { 1027 unsigned long flags; 1028 1029 local_irq_save(flags); 1030 kmemcheck_slab_free(s, x, s->objsize); 1031 debug_check_no_locks_freed(x, s->objsize); 1032 local_irq_restore(flags); 1033 } 1034 #endif 1035 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1036 debug_check_no_obj_freed(x, s->objsize); 1037 } 1038 1039 /* 1040 * Tracking of fully allocated slabs for debugging purposes. 1041 * 1042 * list_lock must be held. 1043 */ 1044 static void add_full(struct kmem_cache *s, 1045 struct kmem_cache_node *n, struct page *page) 1046 { 1047 if (!(s->flags & SLAB_STORE_USER)) 1048 return; 1049 1050 list_add(&page->lru, &n->full); 1051 } 1052 1053 /* 1054 * list_lock must be held. 
1055 */ 1056 static void remove_full(struct kmem_cache *s, struct page *page) 1057 { 1058 if (!(s->flags & SLAB_STORE_USER)) 1059 return; 1060 1061 list_del(&page->lru); 1062 } 1063 1064 /* Tracking of the number of slabs for debugging purposes */ 1065 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1066 { 1067 struct kmem_cache_node *n = get_node(s, node); 1068 1069 return atomic_long_read(&n->nr_slabs); 1070 } 1071 1072 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1073 { 1074 return atomic_long_read(&n->nr_slabs); 1075 } 1076 1077 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 1078 { 1079 struct kmem_cache_node *n = get_node(s, node); 1080 1081 /* 1082 * May be called early in order to allocate a slab for the 1083 * kmem_cache_node structure. Solve the chicken-egg 1084 * dilemma by deferring the increment of the count during 1085 * bootstrap (see early_kmem_cache_node_alloc). 1086 */ 1087 if (n) { 1088 atomic_long_inc(&n->nr_slabs); 1089 atomic_long_add(objects, &n->total_objects); 1090 } 1091 } 1092 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1093 { 1094 struct kmem_cache_node *n = get_node(s, node); 1095 1096 atomic_long_dec(&n->nr_slabs); 1097 atomic_long_sub(objects, &n->total_objects); 1098 } 1099 1100 /* Object debug checks for alloc/free paths */ 1101 static void setup_object_debug(struct kmem_cache *s, struct page *page, 1102 void *object) 1103 { 1104 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 1105 return; 1106 1107 init_object(s, object, SLUB_RED_INACTIVE); 1108 init_tracking(s, object); 1109 } 1110 1111 static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, 1112 void *object, unsigned long addr) 1113 { 1114 if (!check_slab(s, page)) 1115 goto bad; 1116 1117 if (!check_valid_pointer(s, page, object)) { 1118 object_err(s, page, object, "Freelist Pointer check fails"); 1119 goto bad; 1120 } 
1121 1122 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1123 goto bad; 1124 1125 /* Success perform special debug activities for allocs */ 1126 if (s->flags & SLAB_STORE_USER) 1127 set_track(s, object, TRACK_ALLOC, addr); 1128 trace(s, page, object, 1); 1129 init_object(s, object, SLUB_RED_ACTIVE); 1130 return 1; 1131 1132 bad: 1133 if (PageSlab(page)) { 1134 /* 1135 * If this is a slab page then lets do the best we can 1136 * to avoid issues in the future. Marking all objects 1137 * as used avoids touching the remaining objects. 1138 */ 1139 slab_fix(s, "Marking all objects used"); 1140 page->inuse = page->objects; 1141 page->freelist = NULL; 1142 } 1143 return 0; 1144 } 1145 1146 static noinline int free_debug_processing(struct kmem_cache *s, 1147 struct page *page, void *object, unsigned long addr) 1148 { 1149 unsigned long flags; 1150 int rc = 0; 1151 1152 local_irq_save(flags); 1153 slab_lock(page); 1154 1155 if (!check_slab(s, page)) 1156 goto fail; 1157 1158 if (!check_valid_pointer(s, page, object)) { 1159 slab_err(s, page, "Invalid object pointer 0x%p", object); 1160 goto fail; 1161 } 1162 1163 if (on_freelist(s, page, object)) { 1164 object_err(s, page, object, "Object already free"); 1165 goto fail; 1166 } 1167 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1169 goto out; 1170 1171 if (unlikely(s != page->slab)) { 1172 if (!PageSlab(page)) { 1173 slab_err(s, page, "Attempt to free object(0x%p) " 1174 "outside of slab", object); 1175 } else if (!page->slab) { 1176 printk(KERN_ERR 1177 "SLUB <none>: no slab for object 0x%p.\n", 1178 object); 1179 dump_stack(); 1180 } else 1181 object_err(s, page, object, 1182 "page slab pointer corrupt."); 1183 goto fail; 1184 } 1185 1186 if (s->flags & SLAB_STORE_USER) 1187 set_track(s, object, TRACK_FREE, addr); 1188 trace(s, page, object, 0); 1189 init_object(s, object, SLUB_RED_INACTIVE); 1190 rc = 1; 1191 out: 1192 slab_unlock(page); 1193 local_irq_restore(flags); 1194 return rc; 1195 1196 fail: 
1197 slab_fix(s, "Object at 0x%p not freed", object); 1198 goto out; 1199 } 1200 1201 static int __init setup_slub_debug(char *str) 1202 { 1203 slub_debug = DEBUG_DEFAULT_FLAGS; 1204 if (*str++ != '=' || !*str) 1205 /* 1206 * No options specified. Switch on full debugging. 1207 */ 1208 goto out; 1209 1210 if (*str == ',') 1211 /* 1212 * No options but restriction on slabs. This means full 1213 * debugging for slabs matching a pattern. 1214 */ 1215 goto check_slabs; 1216 1217 if (tolower(*str) == 'o') { 1218 /* 1219 * Avoid enabling debugging on caches if its minimum order 1220 * would increase as a result. 1221 */ 1222 disable_higher_order_debug = 1; 1223 goto out; 1224 } 1225 1226 slub_debug = 0; 1227 if (*str == '-') 1228 /* 1229 * Switch off all debugging measures. 1230 */ 1231 goto out; 1232 1233 /* 1234 * Determine which debug features should be switched on 1235 */ 1236 for (; *str && *str != ','; str++) { 1237 switch (tolower(*str)) { 1238 case 'f': 1239 slub_debug |= SLAB_DEBUG_FREE; 1240 break; 1241 case 'z': 1242 slub_debug |= SLAB_RED_ZONE; 1243 break; 1244 case 'p': 1245 slub_debug |= SLAB_POISON; 1246 break; 1247 case 'u': 1248 slub_debug |= SLAB_STORE_USER; 1249 break; 1250 case 't': 1251 slub_debug |= SLAB_TRACE; 1252 break; 1253 case 'a': 1254 slub_debug |= SLAB_FAILSLAB; 1255 break; 1256 default: 1257 printk(KERN_ERR "slub_debug option '%c' " 1258 "unknown. skipped\n", *str); 1259 } 1260 } 1261 1262 check_slabs: 1263 if (*str == ',') 1264 slub_debug_slabs = str + 1; 1265 out: 1266 return 1; 1267 } 1268 1269 __setup("slub_debug", setup_slub_debug); 1270 1271 static unsigned long kmem_cache_flags(unsigned long objsize, 1272 unsigned long flags, const char *name, 1273 void (*ctor)(void *)) 1274 { 1275 /* 1276 * Enable debugging if selected on the kernel commandline. 
1277 */ 1278 if (slub_debug && (!slub_debug_slabs || 1279 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) 1280 flags |= slub_debug; 1281 1282 return flags; 1283 } 1284 #else 1285 static inline void setup_object_debug(struct kmem_cache *s, 1286 struct page *page, void *object) {} 1287 1288 static inline int alloc_debug_processing(struct kmem_cache *s, 1289 struct page *page, void *object, unsigned long addr) { return 0; } 1290 1291 static inline int free_debug_processing(struct kmem_cache *s, 1292 struct page *page, void *object, unsigned long addr) { return 0; } 1293 1294 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1295 { return 1; } 1296 static inline int check_object(struct kmem_cache *s, struct page *page, 1297 void *object, u8 val) { return 1; } 1298 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1299 struct page *page) {} 1300 static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1301 static inline unsigned long kmem_cache_flags(unsigned long objsize, 1302 unsigned long flags, const char *name, 1303 void (*ctor)(void *)) 1304 { 1305 return flags; 1306 } 1307 #define slub_debug 0 1308 1309 #define disable_higher_order_debug 0 1310 1311 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1312 { return 0; } 1313 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1314 { return 0; } 1315 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1316 int objects) {} 1317 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1318 int objects) {} 1319 1320 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1321 { return 0; } 1322 1323 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1324 void *object) {} 1325 1326 static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1327 1328 #endif /* CONFIG_SLUB_DEBUG */ 1329 1330 /* 1331 * Slab allocation and freeing 1332 
*/ 1333 static inline struct page *alloc_slab_page(gfp_t flags, int node, 1334 struct kmem_cache_order_objects oo) 1335 { 1336 int order = oo_order(oo); 1337 1338 flags |= __GFP_NOTRACK; 1339 1340 if (node == NUMA_NO_NODE) 1341 return alloc_pages(flags, order); 1342 else 1343 return alloc_pages_exact_node(node, flags, order); 1344 } 1345 1346 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1347 { 1348 struct page *page; 1349 struct kmem_cache_order_objects oo = s->oo; 1350 gfp_t alloc_gfp; 1351 1352 flags &= gfp_allowed_mask; 1353 1354 if (flags & __GFP_WAIT) 1355 local_irq_enable(); 1356 1357 flags |= s->allocflags; 1358 1359 /* 1360 * Let the initial higher-order allocation fail under memory pressure 1361 * so we fall-back to the minimum order allocation. 1362 */ 1363 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1364 1365 page = alloc_slab_page(alloc_gfp, node, oo); 1366 if (unlikely(!page)) { 1367 oo = s->min; 1368 /* 1369 * Allocation may have failed due to fragmentation. 1370 * Try a lower order alloc if possible 1371 */ 1372 page = alloc_slab_page(flags, node, oo); 1373 1374 if (page) 1375 stat(s, ORDER_FALLBACK); 1376 } 1377 1378 if (flags & __GFP_WAIT) 1379 local_irq_disable(); 1380 1381 if (!page) 1382 return NULL; 1383 1384 if (kmemcheck_enabled 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1386 int pages = 1 << oo_order(oo); 1387 1388 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1389 1390 /* 1391 * Objects from caches that have a constructor don't get 1392 * cleared when they're allocated, so we need to do it here. 1393 */ 1394 if (s->ctor) 1395 kmemcheck_mark_uninitialized_pages(page, pages); 1396 else 1397 kmemcheck_mark_unallocated_pages(page, pages); 1398 } 1399 1400 page->objects = oo_objects(oo); 1401 mod_zone_page_state(page_zone(page), 1402 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1403 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1404 1 << oo_order(oo)); 1405 1406 return page; 1407 } 1408 1409 static void setup_object(struct kmem_cache *s, struct page *page, 1410 void *object) 1411 { 1412 setup_object_debug(s, page, object); 1413 if (unlikely(s->ctor)) 1414 s->ctor(object); 1415 } 1416 1417 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1418 { 1419 struct page *page; 1420 void *start; 1421 void *last; 1422 void *p; 1423 1424 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1425 1426 page = allocate_slab(s, 1427 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1428 if (!page) 1429 goto out; 1430 1431 inc_slabs_node(s, page_to_nid(page), page->objects); 1432 page->slab = s; 1433 page->flags |= 1 << PG_slab; 1434 1435 start = page_address(page); 1436 1437 if (unlikely(s->flags & SLAB_POISON)) 1438 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1439 1440 last = start; 1441 for_each_object(p, s, start, page->objects) { 1442 setup_object(s, page, last); 1443 set_freepointer(s, last, p); 1444 last = p; 1445 } 1446 setup_object(s, page, last); 1447 set_freepointer(s, last, NULL); 1448 1449 page->freelist = start; 1450 page->inuse = 0; 1451 page->frozen = 1; 1452 out: 1453 return page; 1454 } 1455 1456 static void __free_slab(struct kmem_cache *s, struct page *page) 1457 { 1458 int order = compound_order(page); 1459 int pages = 1 << order; 1460 1461 if (kmem_cache_debug(s)) { 1462 void *p; 1463 1464 slab_pad_check(s, page); 1465 for_each_object(p, s, page_address(page), 1466 page->objects) 1467 check_object(s, page, p, SLUB_RED_INACTIVE); 1468 } 1469 1470 kmemcheck_free_shadow(page, compound_order(page)); 1471 1472 mod_zone_page_state(page_zone(page), 1473 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1474 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1475 -pages); 1476 1477 __ClearPageSlab(page); 1478 reset_page_mapcount(page); 1479 if (current->reclaim_state) 1480 current->reclaim_state->reclaimed_slab += pages; 1481 __free_pages(page, order); 1482 } 1483 1484 #define need_reserve_slab_rcu \ 1485 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) 1486 1487 static void rcu_free_slab(struct rcu_head *h) 1488 { 1489 struct page *page; 1490 1491 if (need_reserve_slab_rcu) 1492 page = virt_to_head_page(h); 1493 else 1494 page = container_of((struct list_head *)h, struct page, lru); 1495 1496 __free_slab(page->slab, page); 1497 } 1498 1499 static void free_slab(struct kmem_cache *s, struct page *page) 1500 { 1501 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1502 struct rcu_head *head; 1503 1504 if (need_reserve_slab_rcu) { 1505 int order = compound_order(page); 1506 int offset = (PAGE_SIZE << order) - s->reserved; 1507 1508 VM_BUG_ON(s->reserved != sizeof(*head)); 1509 head = page_address(page) + offset; 1510 } else { 1511 /* 1512 * RCU free overloads the RCU head over the LRU 1513 */ 1514 head = (void *)&page->lru; 1515 } 1516 1517 call_rcu(head, rcu_free_slab); 1518 } else 1519 __free_slab(s, page); 1520 } 1521 1522 static void discard_slab(struct kmem_cache *s, struct page *page) 1523 { 1524 dec_slabs_node(s, page_to_nid(page), page->objects); 1525 free_slab(s, page); 1526 } 1527 1528 /* 1529 * Management of partially allocated slabs. 1530 * 1531 * list_lock must be held. 1532 */ 1533 static inline void add_partial(struct kmem_cache_node *n, 1534 struct page *page, int tail) 1535 { 1536 n->nr_partial++; 1537 if (tail) 1538 list_add_tail(&page->lru, &n->partial); 1539 else 1540 list_add(&page->lru, &n->partial); 1541 } 1542 1543 /* 1544 * list_lock must be held. 
1545 */ 1546 static inline void remove_partial(struct kmem_cache_node *n, 1547 struct page *page) 1548 { 1549 list_del(&page->lru); 1550 n->nr_partial--; 1551 } 1552 1553 /* 1554 * Lock slab, remove from the partial list and put the object into the 1555 * per cpu freelist. 1556 * 1557 * Must hold list_lock. 1558 */ 1559 static inline int acquire_slab(struct kmem_cache *s, 1560 struct kmem_cache_node *n, struct page *page) 1561 { 1562 void *freelist; 1563 unsigned long counters; 1564 struct page new; 1565 1566 /* 1567 * Zap the freelist and set the frozen bit. 1568 * The old freelist is the list of objects for the 1569 * per cpu allocation list. 1570 */ 1571 do { 1572 freelist = page->freelist; 1573 counters = page->counters; 1574 new.counters = counters; 1575 new.inuse = page->objects; 1576 1577 VM_BUG_ON(new.frozen); 1578 new.frozen = 1; 1579 1580 } while (!__cmpxchg_double_slab(s, page, 1581 freelist, counters, 1582 NULL, new.counters, 1583 "lock and freeze")); 1584 1585 remove_partial(n, page); 1586 1587 if (freelist) { 1588 /* Populate the per cpu freelist */ 1589 this_cpu_write(s->cpu_slab->freelist, freelist); 1590 this_cpu_write(s->cpu_slab->page, page); 1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page)); 1592 return 1; 1593 } else { 1594 /* 1595 * Slab page came from the wrong list. No object to allocate 1596 * from. Put it onto the correct list and continue partial 1597 * scan. 1598 */ 1599 printk(KERN_ERR "SLUB: %s : Page without available objects on" 1600 " partial list\n", s->name); 1601 return 0; 1602 } 1603 } 1604 1605 /* 1606 * Try to allocate a partial slab from a specific node. 1607 */ 1608 static struct page *get_partial_node(struct kmem_cache *s, 1609 struct kmem_cache_node *n) 1610 { 1611 struct page *page; 1612 1613 /* 1614 * Racy check. If we mistakenly see no partial slabs then we 1615 * just allocate an empty slab. 
If we mistakenly try to get a 1616 * partial slab and there is none available then get_partials() 1617 * will return NULL. 1618 */ 1619 if (!n || !n->nr_partial) 1620 return NULL; 1621 1622 spin_lock(&n->list_lock); 1623 list_for_each_entry(page, &n->partial, lru) 1624 if (acquire_slab(s, n, page)) 1625 goto out; 1626 page = NULL; 1627 out: 1628 spin_unlock(&n->list_lock); 1629 return page; 1630 } 1631 1632 /* 1633 * Get a page from somewhere. Search in increasing NUMA distances. 1634 */ 1635 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1636 { 1637 #ifdef CONFIG_NUMA 1638 struct zonelist *zonelist; 1639 struct zoneref *z; 1640 struct zone *zone; 1641 enum zone_type high_zoneidx = gfp_zone(flags); 1642 struct page *page; 1643 1644 /* 1645 * The defrag ratio allows a configuration of the tradeoffs between 1646 * inter node defragmentation and node local allocations. A lower 1647 * defrag_ratio increases the tendency to do local allocations 1648 * instead of attempting to obtain partial slabs from other nodes. 1649 * 1650 * If the defrag_ratio is set to 0 then kmalloc() always 1651 * returns node local objects. If the ratio is higher then kmalloc() 1652 * may return off node objects because partial slabs are obtained 1653 * from other nodes and filled up. 1654 * 1655 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes 1656 * defrag_ratio = 1000) then every (well almost) allocation will 1657 * first attempt to defrag slab caches on other nodes. This means 1658 * scanning over all nodes to look for partial slabs which may be 1659 * expensive if we do it every time we are trying to find a slab 1660 * with available objects. 
1661 */ 1662 if (!s->remote_node_defrag_ratio || 1663 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1664 return NULL; 1665 1666 get_mems_allowed(); 1667 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1668 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1669 struct kmem_cache_node *n; 1670 1671 n = get_node(s, zone_to_nid(zone)); 1672 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1674 n->nr_partial > s->min_partial) { 1675 page = get_partial_node(s, n); 1676 if (page) { 1677 put_mems_allowed(); 1678 return page; 1679 } 1680 } 1681 } 1682 put_mems_allowed(); 1683 #endif 1684 return NULL; 1685 } 1686 1687 /* 1688 * Get a partial page, lock it and return it. 1689 */ 1690 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1691 { 1692 struct page *page; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1694 1695 page = get_partial_node(s, get_node(s, searchnode)); 1696 if (page || node != NUMA_NO_NODE) 1697 return page; 1698 1699 return get_any_partial(s, flags); 1700 } 1701 1702 #ifdef CONFIG_PREEMPT 1703 /* 1704 * Calculate the next globally unique transaction for disambiguiation 1705 * during cmpxchg. The transactions start with the cpu number and are then 1706 * incremented by CONFIG_NR_CPUS. 1707 */ 1708 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 1709 #else 1710 /* 1711 * No preemption supported therefore also no need to check for 1712 * different cpus. 
/* Initial transaction id of a cpu: simply the cpu number itself. */
static inline unsigned int init_tid(int cpu)
{
	return (unsigned int)cpu;
}
Event %ld->%ld\n", 1753 tid_to_event(tid), tid_to_event(actual_tid)); 1754 else 1755 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1756 actual_tid, tid, next_tid(tid)); 1757 #endif 1758 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1759 } 1760 1761 void init_kmem_cache_cpus(struct kmem_cache *s) 1762 { 1763 int cpu; 1764 1765 for_each_possible_cpu(cpu) 1766 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1767 } 1768 /* 1769 * Remove the cpu slab 1770 */ 1771 1772 /* 1773 * Remove the cpu slab 1774 */ 1775 static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1776 { 1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1778 struct page *page = c->page; 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1780 int lock = 0; 1781 enum slab_modes l = M_NONE, m = M_NONE; 1782 void *freelist; 1783 void *nextfree; 1784 int tail = 0; 1785 struct page new; 1786 struct page old; 1787 1788 if (page->freelist) { 1789 stat(s, DEACTIVATE_REMOTE_FREES); 1790 tail = 1; 1791 } 1792 1793 c->tid = next_tid(c->tid); 1794 c->page = NULL; 1795 freelist = c->freelist; 1796 c->freelist = NULL; 1797 1798 /* 1799 * Stage one: Free all available per cpu objects back 1800 * to the page freelist while it is still frozen. Leave the 1801 * last one. 1802 * 1803 * There is no need to take the list->lock because the page 1804 * is still frozen. 
1805 */ 1806 while (freelist && (nextfree = get_freepointer(s, freelist))) { 1807 void *prior; 1808 unsigned long counters; 1809 1810 do { 1811 prior = page->freelist; 1812 counters = page->counters; 1813 set_freepointer(s, freelist, prior); 1814 new.counters = counters; 1815 new.inuse--; 1816 VM_BUG_ON(!new.frozen); 1817 1818 } while (!__cmpxchg_double_slab(s, page, 1819 prior, counters, 1820 freelist, new.counters, 1821 "drain percpu freelist")); 1822 1823 freelist = nextfree; 1824 } 1825 1826 /* 1827 * Stage two: Ensure that the page is unfrozen while the 1828 * list presence reflects the actual number of objects 1829 * during unfreeze. 1830 * 1831 * We setup the list membership and then perform a cmpxchg 1832 * with the count. If there is a mismatch then the page 1833 * is not unfrozen but the page is on the wrong list. 1834 * 1835 * Then we restart the process which may have to remove 1836 * the page from the list that we just put it on again 1837 * because the number of objects in the slab may have 1838 * changed. 1839 */ 1840 redo: 1841 1842 old.freelist = page->freelist; 1843 old.counters = page->counters; 1844 VM_BUG_ON(!old.frozen); 1845 1846 /* Determine target state of the slab */ 1847 new.counters = old.counters; 1848 if (freelist) { 1849 new.inuse--; 1850 set_freepointer(s, freelist, old.freelist); 1851 new.freelist = freelist; 1852 } else 1853 new.freelist = old.freelist; 1854 1855 new.frozen = 0; 1856 1857 if (!new.inuse && n->nr_partial < s->min_partial) 1858 m = M_FREE; 1859 else if (new.freelist) { 1860 m = M_PARTIAL; 1861 if (!lock) { 1862 lock = 1; 1863 /* 1864 * Taking the spinlock removes the possiblity 1865 * that acquire_slab() will see a slab page that 1866 * is frozen 1867 */ 1868 spin_lock(&n->list_lock); 1869 } 1870 } else { 1871 m = M_FULL; 1872 if (kmem_cache_debug(s) && !lock) { 1873 lock = 1; 1874 /* 1875 * This also ensures that the scanning of full 1876 * slabs from diagnostic functions will not see 1877 * any frozen slabs. 
1878 */ 1879 spin_lock(&n->list_lock); 1880 } 1881 } 1882 1883 if (l != m) { 1884 1885 if (l == M_PARTIAL) 1886 1887 remove_partial(n, page); 1888 1889 else if (l == M_FULL) 1890 1891 remove_full(s, page); 1892 1893 if (m == M_PARTIAL) { 1894 1895 add_partial(n, page, tail); 1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1897 1898 } else if (m == M_FULL) { 1899 1900 stat(s, DEACTIVATE_FULL); 1901 add_full(s, n, page); 1902 1903 } 1904 } 1905 1906 l = m; 1907 if (!__cmpxchg_double_slab(s, page, 1908 old.freelist, old.counters, 1909 new.freelist, new.counters, 1910 "unfreezing slab")) 1911 goto redo; 1912 1913 if (lock) 1914 spin_unlock(&n->list_lock); 1915 1916 if (m == M_FREE) { 1917 stat(s, DEACTIVATE_EMPTY); 1918 discard_slab(s, page); 1919 stat(s, FREE_SLAB); 1920 } 1921 } 1922 1923 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1924 { 1925 stat(s, CPUSLAB_FLUSH); 1926 deactivate_slab(s, c); 1927 } 1928 1929 /* 1930 * Flush cpu slab. 1931 * 1932 * Called from IPI handler with interrupts disabled. 1933 */ 1934 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1935 { 1936 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 1937 1938 if (likely(c && c->page)) 1939 flush_slab(s, c); 1940 } 1941 1942 static void flush_cpu_slab(void *d) 1943 { 1944 struct kmem_cache *s = d; 1945 1946 __flush_cpu_slab(s, smp_processor_id()); 1947 } 1948 1949 static void flush_all(struct kmem_cache *s) 1950 { 1951 on_each_cpu(flush_cpu_slab, s, 1); 1952 } 1953 1954 /* 1955 * Check if the objects in a per cpu structure fit numa 1956 * locality expectations. 
1957 */ 1958 static inline int node_match(struct kmem_cache_cpu *c, int node) 1959 { 1960 #ifdef CONFIG_NUMA 1961 if (node != NUMA_NO_NODE && c->node != node) 1962 return 0; 1963 #endif 1964 return 1; 1965 } 1966 1967 static int count_free(struct page *page) 1968 { 1969 return page->objects - page->inuse; 1970 } 1971 1972 static unsigned long count_partial(struct kmem_cache_node *n, 1973 int (*get_count)(struct page *)) 1974 { 1975 unsigned long flags; 1976 unsigned long x = 0; 1977 struct page *page; 1978 1979 spin_lock_irqsave(&n->list_lock, flags); 1980 list_for_each_entry(page, &n->partial, lru) 1981 x += get_count(page); 1982 spin_unlock_irqrestore(&n->list_lock, flags); 1983 return x; 1984 } 1985 1986 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 1987 { 1988 #ifdef CONFIG_SLUB_DEBUG 1989 return atomic_long_read(&n->total_objects); 1990 #else 1991 return 0; 1992 #endif 1993 } 1994 1995 static noinline void 1996 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 1997 { 1998 int node; 1999 2000 printk(KERN_WARNING 2001 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2002 nid, gfpflags); 2003 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2004 "default order: %d, min order: %d\n", s->name, s->objsize, 2005 s->size, oo_order(s->oo), oo_order(s->min)); 2006 2007 if (oo_order(s->min) > get_order(s->objsize)) 2008 printk(KERN_WARNING " %s debugging increased min order, use " 2009 "slub_debug=O to disable.\n", s->name); 2010 2011 for_each_online_node(node) { 2012 struct kmem_cache_node *n = get_node(s, node); 2013 unsigned long nr_slabs; 2014 unsigned long nr_objs; 2015 unsigned long nr_free; 2016 2017 if (!n) 2018 continue; 2019 2020 nr_free = count_partial(n, count_free); 2021 nr_slabs = node_nr_slabs(n); 2022 nr_objs = node_nr_objs(n); 2023 2024 printk(KERN_WARNING 2025 " node %d: slabs: %ld, objs: %ld, free: %ld\n", 2026 node, nr_slabs, nr_objs, nr_free); 2027 } 2028 } 2029 2030 /* 2031 * Slow 
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	/*
	 * Allocation slow path. Returns the allocated object, or NULL when
	 * neither a partial slab nor a new slab could be obtained.
	 * Interrupts are disabled for the whole function and restored on
	 * every return path.
	 */
	void **object;
	struct page *page;
	unsigned long flags;
	struct page new;
	unsigned long counters;

	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
	c = this_cpu_ptr(s->cpu_slab);
#endif

	/* We handle __GFP_ZERO in the caller */
	gfpflags &= ~__GFP_ZERO;

	/* No cpu slab at all: go straight to finding a new one. */
	page = c->page;
	if (!page)
		goto new_slab;

	/* The cpu slab sits on the wrong node for this request: drop it. */
	if (unlikely(!node_match(c, node))) {
		stat(s, ALLOC_NODE_MISMATCH);
		deactivate_slab(s, c);
		goto new_slab;
	}

	stat(s, ALLOC_SLOWPATH);

	/*
	 * Atomically take over the whole page freelist (it is replaced by
	 * NULL) and mark every object of the page as in use.
	 */
	do {
		object = page->freelist;
		counters = page->counters;
		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		/*
		 * If there is no object left then we use this loop to
		 * deactivate the slab which is simple since no objects
		 * are left in the slab and therefore we do not need to
		 * put the page back onto the partial list.
		 *
		 * If there are objects left then we retrieve them
		 * and use them to refill the per cpu queue.
		 */

		new.inuse = page->objects;
		new.frozen = object != NULL;

	} while (!__cmpxchg_double_slab(s, page,
			object, counters,
			NULL, new.counters,
			"__slab_alloc"));

	/* The page freelist was empty; the cmpxchg above unfroze the page. */
	if (unlikely(!object)) {
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:
	/*
	 * Hand out the first object; the rest of the list becomes the
	 * lockless per cpu freelist. The tid is advanced because the per
	 * cpu state changed.
	 */
	VM_BUG_ON(!page->frozen);
	c->freelist = get_freepointer(s, object);
	c->tid = next_tid(c->tid);
	local_irq_restore(flags);
	return object;

new_slab:
	page = get_partial(s, gfpflags, node);
	if (page) {
		stat(s, ALLOC_FROM_PARTIAL);
		/*
		 * acquire_slab() (reached through get_partial()) has already
		 * stored the page and its freelist in this cpu's
		 * kmem_cache_cpu, so the object is picked up from there.
		 */
		object = c->freelist;

		if (kmem_cache_debug(s))
			goto debug;
		goto load_freelist;
	}

	page = new_slab(s, gfpflags, node);

	if (page) {
		/*
		 * NOTE(review): the per cpu pointer is re-read here,
		 * presumably because allocate_slab() enables interrupts for
		 * __GFP_WAIT allocations, so this cpu's state (or the cpu
		 * itself) may have changed meanwhile - confirm.
		 */
		c = __this_cpu_ptr(s->cpu_slab);
		if (c->page)
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
		object = page->freelist;
		page->freelist = NULL;
		page->inuse = page->objects;

		stat(s, ALLOC_SLAB);
		c->node = page_to_nid(page);
		c->page = page;

		if (kmem_cache_debug(s))
			goto debug;
		goto load_freelist;
	}
	/* Out of memory: warn unless suppressed or rate limited. */
	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
		slab_out_of_memory(s, gfpflags, node);
	local_irq_restore(flags);
	return NULL;

debug:
	/* A missing or failing object sends us back to look for another slab. */
	if (!object || !alloc_debug_processing(s, page, object, addr))
		goto new_slab;

	/*
	 * Debug caches do not keep a cpu slab: the remaining objects are
	 * handed back right away via deactivate_slab() and the per cpu
	 * state is cleared.
	 */
	c->freelist = get_freepointer(s, object);
	deactivate_slab(s, c);
	c->page = NULL;
	c->node = NUMA_NO_NODE;
	local_irq_restore(flags);
	return object;
}
2207 */ 2208 tid = c->tid; 2209 barrier(); 2210 2211 object = c->freelist; 2212 if (unlikely(!object || !node_match(c, node))) 2213 2214 object = __slab_alloc(s, gfpflags, node, addr, c); 2215 2216 else { 2217 /* 2218 * The cmpxchg will only match if there was no additional 2219 * operation and if we are on the right processor. 2220 * 2221 * The cmpxchg does the following atomically (without lock semantics!) 2222 * 1. Relocate first pointer to the current per cpu area. 2223 * 2. Verify that tid and freelist have not been changed 2224 * 3. If they were not changed replace tid and freelist 2225 * 2226 * Since this is without lock semantics the protection is only against 2227 * code executing on this cpu *not* from access by other cpus. 2228 */ 2229 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2230 s->cpu_slab->freelist, s->cpu_slab->tid, 2231 object, tid, 2232 get_freepointer_safe(s, object), next_tid(tid)))) { 2233 2234 note_cmpxchg_failure("slab_alloc", s, tid); 2235 goto redo; 2236 } 2237 stat(s, ALLOC_FASTPATH); 2238 } 2239 2240 if (unlikely(gfpflags & __GFP_ZERO) && object) 2241 memset(object, 0, s->objsize); 2242 2243 slab_post_alloc_hook(s, gfpflags, object); 2244 2245 return object; 2246 } 2247 2248 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2249 { 2250 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2251 2252 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2253 2254 return ret; 2255 } 2256 EXPORT_SYMBOL(kmem_cache_alloc); 2257 2258 #ifdef CONFIG_TRACING 2259 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2260 { 2261 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2262 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2263 return ret; 2264 } 2265 EXPORT_SYMBOL(kmem_cache_alloc_trace); 2266 2267 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 2268 { 2269 void *ret = kmalloc_order(size, flags, order); 2270 trace_kmalloc(_RET_IP_, ret, 
size, PAGE_SIZE << order, flags); 2271 return ret; 2272 } 2273 EXPORT_SYMBOL(kmalloc_order_trace); 2274 #endif 2275 2276 #ifdef CONFIG_NUMA 2277 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2278 { 2279 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2280 2281 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2282 s->objsize, s->size, gfpflags, node); 2283 2284 return ret; 2285 } 2286 EXPORT_SYMBOL(kmem_cache_alloc_node); 2287 2288 #ifdef CONFIG_TRACING 2289 void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 2290 gfp_t gfpflags, 2291 int node, size_t size) 2292 { 2293 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2294 2295 trace_kmalloc_node(_RET_IP_, ret, 2296 size, s->size, gfpflags, node); 2297 return ret; 2298 } 2299 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2300 #endif 2301 #endif 2302 2303 /* 2304 * Slow patch handling. This may still be called frequently since objects 2305 * have a longer lifetime than the cpu slabs in most processing loads. 2306 * 2307 * So we still attempt to reduce cache line usage. Just take the slab 2308 * lock and free the item. If there is no additional partial page 2309 * handling required then we can return immediately. 
2310 */ 2311 static void __slab_free(struct kmem_cache *s, struct page *page, 2312 void *x, unsigned long addr) 2313 { 2314 void *prior; 2315 void **object = (void *)x; 2316 int was_frozen; 2317 int inuse; 2318 struct page new; 2319 unsigned long counters; 2320 struct kmem_cache_node *n = NULL; 2321 unsigned long uninitialized_var(flags); 2322 2323 stat(s, FREE_SLOWPATH); 2324 2325 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2326 return; 2327 2328 do { 2329 prior = page->freelist; 2330 counters = page->counters; 2331 set_freepointer(s, object, prior); 2332 new.counters = counters; 2333 was_frozen = new.frozen; 2334 new.inuse--; 2335 if ((!new.inuse || !prior) && !was_frozen && !n) { 2336 n = get_node(s, page_to_nid(page)); 2337 /* 2338 * Speculatively acquire the list_lock. 2339 * If the cmpxchg does not succeed then we may 2340 * drop the list_lock without any processing. 2341 * 2342 * Otherwise the list_lock will synchronize with 2343 * other processors updating the list of slabs. 2344 */ 2345 spin_lock_irqsave(&n->list_lock, flags); 2346 } 2347 inuse = new.inuse; 2348 2349 } while (!cmpxchg_double_slab(s, page, 2350 prior, counters, 2351 object, new.counters, 2352 "__slab_free")); 2353 2354 if (likely(!n)) { 2355 /* 2356 * The list lock was not taken therefore no list 2357 * activity can be necessary. 2358 */ 2359 if (was_frozen) 2360 stat(s, FREE_FROZEN); 2361 return; 2362 } 2363 2364 /* 2365 * was_frozen may have been set after we acquired the list_lock in 2366 * an earlier loop. So we need to check it here again. 2367 */ 2368 if (was_frozen) 2369 stat(s, FREE_FROZEN); 2370 else { 2371 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2372 goto slab_empty; 2373 2374 /* 2375 * Objects left in the slab. If it was not on the partial list before 2376 * then add it. 
2377 */ 2378 if (unlikely(!prior)) { 2379 remove_full(s, page); 2380 add_partial(n, page, 0); 2381 stat(s, FREE_ADD_PARTIAL); 2382 } 2383 } 2384 spin_unlock_irqrestore(&n->list_lock, flags); 2385 return; 2386 2387 slab_empty: 2388 if (prior) { 2389 /* 2390 * Slab still on the partial list. 2391 */ 2392 remove_partial(n, page); 2393 stat(s, FREE_REMOVE_PARTIAL); 2394 } 2395 2396 spin_unlock_irqrestore(&n->list_lock, flags); 2397 stat(s, FREE_SLAB); 2398 discard_slab(s, page); 2399 } 2400 2401 /* 2402 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 2403 * can perform fastpath freeing without additional function calls. 2404 * 2405 * The fastpath is only possible if we are freeing to the current cpu slab 2406 * of this processor. This typically the case if we have just allocated 2407 * the item before. 2408 * 2409 * If fastpath is not possible then fall back to __slab_free where we deal 2410 * with all sorts of special processing. 2411 */ 2412 static __always_inline void slab_free(struct kmem_cache *s, 2413 struct page *page, void *x, unsigned long addr) 2414 { 2415 void **object = (void *)x; 2416 struct kmem_cache_cpu *c; 2417 unsigned long tid; 2418 2419 slab_free_hook(s, x); 2420 2421 redo: 2422 2423 /* 2424 * Determine the currently cpus per cpu slab. 2425 * The cpu may change afterward. However that does not matter since 2426 * data is retrieved via this pointer. If we are on the same cpu 2427 * during the cmpxchg then the free will succedd. 
2428 */ 2429 c = __this_cpu_ptr(s->cpu_slab); 2430 2431 tid = c->tid; 2432 barrier(); 2433 2434 if (likely(page == c->page)) { 2435 set_freepointer(s, object, c->freelist); 2436 2437 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2438 s->cpu_slab->freelist, s->cpu_slab->tid, 2439 c->freelist, tid, 2440 object, next_tid(tid)))) { 2441 2442 note_cmpxchg_failure("slab_free", s, tid); 2443 goto redo; 2444 } 2445 stat(s, FREE_FASTPATH); 2446 } else 2447 __slab_free(s, page, x, addr); 2448 2449 } 2450 2451 void kmem_cache_free(struct kmem_cache *s, void *x) 2452 { 2453 struct page *page; 2454 2455 page = virt_to_head_page(x); 2456 2457 slab_free(s, page, x, _RET_IP_); 2458 2459 trace_kmem_cache_free(_RET_IP_, x); 2460 } 2461 EXPORT_SYMBOL(kmem_cache_free); 2462 2463 /* 2464 * Object placement in a slab is made very easy because we always start at 2465 * offset 0. If we tune the size of the object to the alignment then we can 2466 * get the required alignment by putting one properly sized object after 2467 * another. 2468 * 2469 * Notice that the allocation order determines the sizes of the per cpu 2470 * caches. Each processor has always one slab available for allocations. 2471 * Increasing the allocation order reduces the number of times that slabs 2472 * must be moved on and off the partial lists and is therefore a factor in 2473 * locking overhead. 2474 */ 2475 2476 /* 2477 * Mininum / Maximum order of slab pages. This influences locking overhead 2478 * and slab fragmentation. A higher order reduces the number of partial slabs 2479 * and increases the number of allocations possible without having to 2480 * take the list_lock. 2481 */ 2482 static int slub_min_order; 2483 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2484 static int slub_min_objects; 2485 2486 /* 2487 * Merge control. If this is set then no merging of slab caches will occur. 2488 * (Could be removed. This was introduced to pacify the merge skeptics.) 
*/
static int slub_nomerge;

/*
 * Calculate the order of allocation given a slab object size.
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * may be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/16th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less a concern for large slabs though which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
*/
static inline int slab_order(int size, int min_objects,
				int max_order, int fract_leftover, int reserved)
{
	int order;
	int rem;
	int min_order = slub_min_order;

	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
		return get_order(size * MAX_OBJS_PER_PAGE) - 1;

	for (order = max(min_order,
				fls(min_objects * size - 1) - PAGE_SHIFT);
			order <= max_order; order++) {

		unsigned long slab_size = PAGE_SIZE << order;

		/* Too small to hold min_objects plus the reserved bytes. */
		if (slab_size < min_objects * size + reserved)
			continue;

		rem = (slab_size - reserved) % size;

		/* Accept the order once the leftover is small enough. */
		if (rem <= slab_size / fract_leftover)
			break;

	}

	/* Larger than max_order if the loop ran out without a match. */
	return order;
}

/*
 * Pick the page order for slabs holding objects of @size bytes with
 * @reserved bytes set aside per slab.
 *
 * Returns the chosen order, or -ENOSYS if not even a single object can be
 * placed below MAX_ORDER.
 */
static inline int calculate_order(int size, int reserved)
{
	int order;
	int min_objects;
	int fraction;
	int max_objects;

	/*
	 * Attempt to find best configuration for a slab. This
	 * works by first attempting to generate a layout with
	 * the best configuration and backing off gradually.
	 *
	 * First we reduce the acceptable waste in a slab. Then
	 * we reduce the minimum objects required in a slab.
	 */
	min_objects = slub_min_objects;
	if (!min_objects)
		min_objects = 4 * (fls(nr_cpu_ids) + 1);
	max_objects = order_objects(slub_max_order, size, reserved);
	min_objects = min(min_objects, max_objects);

	while (min_objects > 1) {
		/* Allowed waste goes from 1/16th to 1/8th to 1/4th of the slab. */
		fraction = 16;
		while (fraction >= 4) {
			order = slab_order(size, min_objects,
					slub_max_order, fraction, reserved);
			if (order <= slub_max_order)
				return order;
			fraction /= 2;
		}
		min_objects--;
	}

	/*
	 * We were unable to place multiple objects in a slab. Now
	 * lets see if we can place a single object there.
	 */
	order = slab_order(size, 1, slub_max_order, 1, reserved);
	if (order <= slub_max_order)
		return order;

	/*
	 * Doh this slab cannot be placed using slub_max_order.
	 */
	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
	if (order < MAX_ORDER)
		return order;
	return -ENOSYS;
}

/*
 * Figure out what the alignment of the objects will be.
 */
static unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		/* Halve the alignment while the object fits in half of it. */
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}

/*
 * Put a per node structure into its initial empty state: no partial slabs,
 * fresh list_lock and, with CONFIG_SLUB_DEBUG, zeroed slab/object counters
 * and an empty full list. @s is not used by the code below.
 */
static void
init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
{
	n->nr_partial = 0;
	spin_lock_init(&n->list_lock);
	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_set(&n->nr_slabs, 0);
	atomic_long_set(&n->total_objects, 0);
	INIT_LIST_HEAD(&n->full);
#endif
}

/*
 * Allocate and initialize the per cpu structures of @s.
 *
 * Returns 1 on success and 0 if the per cpu allocation failed.
 */
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));

	/*
	 * Must align to double word boundary for the double cmpxchg
	 * instructions to work; see __pcpu_double_call_return_bool().
	 */
	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				     2 * sizeof(void *));

	if (!s->cpu_slab)
		return 0;

	init_kmem_cache_cpus(s);

	return 1;
}

/* Cache that the struct kmem_cache_node objects are allocated from. */
static struct kmem_cache *kmem_cache_node;

/*
 * No kmalloc_node yet so do it by hand. We know that this is the first
 * slab on the node for this slabcache. There are no concurrent accesses
 * possible.
 *
 * Note that this function only works on the kmalloc_node_cache
 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
 * memory on a fresh node that has no slab structures yet.
 */
static void early_kmem_cache_node_alloc(int node)
{
	struct page *page;
	struct kmem_cache_node *n;

	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));

	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);

	BUG_ON(!page);
	if (page_to_nid(page) != node) {
		printk(KERN_ERR "SLUB: Unable to allocate memory from "
				"node %d\n", node);
		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
				"in order to be able to continue\n");
	}

	/* Take the first free object of the new slab by hand. */
	n = page->freelist;
	BUG_ON(!n);
	page->freelist = get_freepointer(kmem_cache_node, n);
	page->inuse++;
	page->frozen = 0;
	kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
	init_tracking(kmem_cache_node, n);
#endif
	init_kmem_cache_node(n, kmem_cache_node);
	inc_slabs_node(kmem_cache_node, node, page->objects);

	add_partial(n, page, 0);
}

/*
 * Free the per node structures of @s on all nodes with normal memory and
 * clear the corresponding s->node[] slots.
 */
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n = s->node[node];

		if (n)
			kmem_cache_free(kmem_cache_node, n);

		s->node[node] = NULL;
	}
}

/*
 * Allocate and initialize a per node structure for every node with normal
 * memory. While slab_state is DOWN the early bootstrap allocator is used
 * instead of kmem_cache_alloc_node().
 *
 * Returns 1 on success. On allocation failure the structures allocated so
 * far are freed and 0 is returned.
 */
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);

		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
		}

		s->node[node] = n;
		init_kmem_cache_node(n, s);
	}
	return 1;
}

/*
 * Set s->min_partial to @min clamped to the range
 * [MIN_PARTIAL, MAX_PARTIAL].
 */
static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
	if (min < MIN_PARTIAL)
		min = MIN_PARTIAL;
	else if (min > MAX_PARTIAL)
		min = MAX_PARTIAL;
	s->min_partial = min;
}

/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 *
 * A @forced_order >= 0 is used as is, otherwise the order is computed with
 * calculate_order(). Returns 0 if no valid order was found or if no object
 * fits into a slab of the chosen order, nonzero otherwise.
 */
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
	unsigned long flags = s->flags;
	unsigned long size = s->objsize;
	unsigned long align = s->align;
	int order;

	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;


	/*
	 * If we are Redzoning then check if there is some space between the
	 * end of the object and the free pointer. If not then add an
	 * additional word to have some bytes to store Redzone information.
	 */
	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
		size += sizeof(void *);
#endif

	/*
	 * With that we have determined the number of bytes in actual use
	 * by the object. This is the potential offset to the free pointer.
	 */
	s->inuse = size;

	if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
		s->ctor)) {
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor or
		 * destructor or are poisoning the objects.
		 */
		s->offset = size;
		size += sizeof(void *);
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER)
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

	if (flags & SLAB_RED_ZONE)
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
		 * corrupted if a user writes before the start
		 * of the object.
		 */
		size += sizeof(void *);
#endif

	/*
	 * Determine the alignment based on various parameters that the
	 * user specified and the dynamic determination of cache line size
	 * on bootup.
	 */
	align = calculate_alignment(flags, align, s->objsize);
	s->align = align;

	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
	size = ALIGN(size, align);
	s->size = size;
	if (forced_order >= 0)
		order = forced_order;
	else
		order = calculate_order(size, s->reserved);

	if (order < 0)
		return 0;

	s->allocflags = 0;
	if (order)
		s->allocflags |= __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= SLUB_DMA;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	/*
	 * Determine the number of objects per slab
	 */
	s->oo = oo_make(order, size, s->reserved);
	s->min = oo_make(get_order(size), size, s->reserved);
	/* s->max only ever grows; it keeps the largest layout seen so far. */
	if (oo_objects(s->oo) > oo_objects(s->max))
		s->max = s->oo;

	return !!oo_objects(s->oo);

}

/*
 * Initialize the cache structure @s for objects of @size bytes: clear it,
 * record name/ctor/size/alignment/flags, compute the object layout and
 * allocate the per node and per cpu structures.
 *
 * Returns 1 on success and 0 on failure. If @flags contains SLAB_PANIC a
 * failure panics instead of returning.
 */
static int kmem_cache_open(struct kmem_cache *s,
		const char *name, size_t size,
		size_t align, unsigned long flags,
		void (*ctor)(void *))
{
	memset(s, 0, kmem_size);
	s->name = name;
	s->ctor = ctor;
	s->objsize = size;
	s->align = align;
	s->flags = kmem_cache_flags(size, flags, name, ctor);
	s->reserved = 0;

	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
		s->reserved = sizeof(struct rcu_head);

	if (!calculate_sizes(s, -1))
		goto error;
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
		if (get_order(s->size) > get_order(s->objsize)) {
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			if (!calculate_sizes(s, -1))
				goto error;
		}
	}

#ifdef CONFIG_CMPXCHG_DOUBLE
	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
#endif

	/*
	 * The larger the object size is, the more pages we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
	set_min_partial(s, ilog2(s->size));
	s->refcount = 1;
#ifdef CONFIG_NUMA
	s->remote_node_defrag_ratio = 1000;
#endif
	if (!init_kmem_cache_nodes(s))
		goto error;

	if (alloc_kmem_cache_cpus(s))
		return 1;

	/* The per cpu allocation failed: undo the per node allocations. */
	free_kmem_cache_nodes(s);
error:
	if (flags & SLAB_PANIC)
		panic("Cannot create slab %s size=%lu realsize=%u "
			"order=%u offset=%u flags=%lx\n",
			s->name, (unsigned long)size, s->size, oo_order(s->oo),
			s->offset, flags);
	return 0;
}

/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->objsize;
}
EXPORT_SYMBOL(kmem_cache_size);

/*
 * Report slab @page with message @text and print every object of the page
 * whose bit is not set in the map filled in by get_map(), together with
 * its tracking information.
 * NOTE(review): presumably get_map() marks the free objects, so the
 * objects listed are the ones still allocated - confirm against get_map().
 *
 * Does nothing without CONFIG_SLUB_DEBUG or if the map cannot be allocated.
 */
static void list_slab_objects(struct kmem_cache *s, struct page *page,
							const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *addr = page_address(page);
	void *p;
	unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
				     sizeof(long), GFP_ATOMIC);
	if (!map)
		return;
	slab_err(s, page, "%s", text);
	slab_lock(page);

	get_map(s, page, map);
	for_each_object(p, s, addr, page->objects) {

		if (!test_bit(slab_index(p, s, addr), map)) {
			printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
							p, p - addr);
			print_tracking(s, p);
		}
	}
	slab_unlock(page);
	kfree(map);
#endif
}

/*
 * Attempt to free all partial slabs on a node.
2971 */ 2972 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 2973 { 2974 unsigned long flags; 2975 struct page *page, *h; 2976 2977 spin_lock_irqsave(&n->list_lock, flags); 2978 list_for_each_entry_safe(page, h, &n->partial, lru) { 2979 if (!page->inuse) { 2980 remove_partial(n, page); 2981 discard_slab(s, page); 2982 } else { 2983 list_slab_objects(s, page, 2984 "Objects remaining on kmem_cache_close()"); 2985 } 2986 } 2987 spin_unlock_irqrestore(&n->list_lock, flags); 2988 } 2989 2990 /* 2991 * Release all resources used by a slab cache. 2992 */ 2993 static inline int kmem_cache_close(struct kmem_cache *s) 2994 { 2995 int node; 2996 2997 flush_all(s); 2998 free_percpu(s->cpu_slab); 2999 /* Attempt to free all objects */ 3000 for_each_node_state(node, N_NORMAL_MEMORY) { 3001 struct kmem_cache_node *n = get_node(s, node); 3002 3003 free_partial(s, n); 3004 if (n->nr_partial || slabs_node(s, node)) 3005 return 1; 3006 } 3007 free_kmem_cache_nodes(s); 3008 return 0; 3009 } 3010 3011 /* 3012 * Close a cache and release the kmem_cache structure 3013 * (must be used for caches created using kmem_cache_create) 3014 */ 3015 void kmem_cache_destroy(struct kmem_cache *s) 3016 { 3017 down_write(&slub_lock); 3018 s->refcount--; 3019 if (!s->refcount) { 3020 list_del(&s->list); 3021 if (kmem_cache_close(s)) { 3022 printk(KERN_ERR "SLUB %s: %s called for cache that " 3023 "still has objects.\n", s->name, __func__); 3024 dump_stack(); 3025 } 3026 if (s->flags & SLAB_DESTROY_BY_RCU) 3027 rcu_barrier(); 3028 sysfs_slab_remove(s); 3029 } 3030 up_write(&slub_lock); 3031 } 3032 EXPORT_SYMBOL(kmem_cache_destroy); 3033 3034 /******************************************************************** 3035 * Kmalloc subsystem 3036 *******************************************************************/ 3037 3038 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3039 EXPORT_SYMBOL(kmalloc_caches); 3040 3041 static struct kmem_cache *kmem_cache; 3042 3043 #ifdef 
CONFIG_ZONE_DMA 3044 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3045 #endif 3046 3047 static int __init setup_slub_min_order(char *str) 3048 { 3049 get_option(&str, &slub_min_order); 3050 3051 return 1; 3052 } 3053 3054 __setup("slub_min_order=", setup_slub_min_order); 3055 3056 static int __init setup_slub_max_order(char *str) 3057 { 3058 get_option(&str, &slub_max_order); 3059 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3060 3061 return 1; 3062 } 3063 3064 __setup("slub_max_order=", setup_slub_max_order); 3065 3066 static int __init setup_slub_min_objects(char *str) 3067 { 3068 get_option(&str, &slub_min_objects); 3069 3070 return 1; 3071 } 3072 3073 __setup("slub_min_objects=", setup_slub_min_objects); 3074 3075 static int __init setup_slub_nomerge(char *str) 3076 { 3077 slub_nomerge = 1; 3078 return 1; 3079 } 3080 3081 __setup("slub_nomerge", setup_slub_nomerge); 3082 3083 static struct kmem_cache *__init create_kmalloc_cache(const char *name, 3084 int size, unsigned int flags) 3085 { 3086 struct kmem_cache *s; 3087 3088 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3089 3090 /* 3091 * This function is called with IRQs disabled during early-boot on 3092 * single CPU so there's no need to take slub_lock here. 3093 */ 3094 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3095 flags, NULL)) 3096 goto panic; 3097 3098 list_add(&s->list, &slab_caches); 3099 return s; 3100 3101 panic: 3102 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 3103 return NULL; 3104 } 3105 3106 /* 3107 * Conversion table for small slabs sizes / 8 to the index in the 3108 * kmalloc array. This is necessary for slabs < 192 since we have non power 3109 * of two cache sizes there. The size of larger slabs can be determined using 3110 * fls. 
3111 */ 3112 static s8 size_index[24] = { 3113 3, /* 8 */ 3114 4, /* 16 */ 3115 5, /* 24 */ 3116 5, /* 32 */ 3117 6, /* 40 */ 3118 6, /* 48 */ 3119 6, /* 56 */ 3120 6, /* 64 */ 3121 1, /* 72 */ 3122 1, /* 80 */ 3123 1, /* 88 */ 3124 1, /* 96 */ 3125 7, /* 104 */ 3126 7, /* 112 */ 3127 7, /* 120 */ 3128 7, /* 128 */ 3129 2, /* 136 */ 3130 2, /* 144 */ 3131 2, /* 152 */ 3132 2, /* 160 */ 3133 2, /* 168 */ 3134 2, /* 176 */ 3135 2, /* 184 */ 3136 2 /* 192 */ 3137 }; 3138 3139 static inline int size_index_elem(size_t bytes) 3140 { 3141 return (bytes - 1) / 8; 3142 } 3143 3144 static struct kmem_cache *get_slab(size_t size, gfp_t flags) 3145 { 3146 int index; 3147 3148 if (size <= 192) { 3149 if (!size) 3150 return ZERO_SIZE_PTR; 3151 3152 index = size_index[size_index_elem(size)]; 3153 } else 3154 index = fls(size - 1); 3155 3156 #ifdef CONFIG_ZONE_DMA 3157 if (unlikely((flags & SLUB_DMA))) 3158 return kmalloc_dma_caches[index]; 3159 3160 #endif 3161 return kmalloc_caches[index]; 3162 } 3163 3164 void *__kmalloc(size_t size, gfp_t flags) 3165 { 3166 struct kmem_cache *s; 3167 void *ret; 3168 3169 if (unlikely(size > SLUB_MAX_SIZE)) 3170 return kmalloc_large(size, flags); 3171 3172 s = get_slab(size, flags); 3173 3174 if (unlikely(ZERO_OR_NULL_PTR(s))) 3175 return s; 3176 3177 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3178 3179 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3180 3181 return ret; 3182 } 3183 EXPORT_SYMBOL(__kmalloc); 3184 3185 #ifdef CONFIG_NUMA 3186 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3187 { 3188 struct page *page; 3189 void *ptr = NULL; 3190 3191 flags |= __GFP_COMP | __GFP_NOTRACK; 3192 page = alloc_pages_node(node, flags, get_order(size)); 3193 if (page) 3194 ptr = page_address(page); 3195 3196 kmemleak_alloc(ptr, size, 1, flags); 3197 return ptr; 3198 } 3199 3200 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3201 { 3202 struct kmem_cache *s; 3203 void *ret; 3204 3205 if (unlikely(size > 
SLUB_MAX_SIZE)) { 3206 ret = kmalloc_large_node(size, flags, node); 3207 3208 trace_kmalloc_node(_RET_IP_, ret, 3209 size, PAGE_SIZE << get_order(size), 3210 flags, node); 3211 3212 return ret; 3213 } 3214 3215 s = get_slab(size, flags); 3216 3217 if (unlikely(ZERO_OR_NULL_PTR(s))) 3218 return s; 3219 3220 ret = slab_alloc(s, flags, node, _RET_IP_); 3221 3222 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3223 3224 return ret; 3225 } 3226 EXPORT_SYMBOL(__kmalloc_node); 3227 #endif 3228 3229 size_t ksize(const void *object) 3230 { 3231 struct page *page; 3232 3233 if (unlikely(object == ZERO_SIZE_PTR)) 3234 return 0; 3235 3236 page = virt_to_head_page(object); 3237 3238 if (unlikely(!PageSlab(page))) { 3239 WARN_ON(!PageCompound(page)); 3240 return PAGE_SIZE << compound_order(page); 3241 } 3242 3243 return slab_ksize(page->slab); 3244 } 3245 EXPORT_SYMBOL(ksize); 3246 3247 #ifdef CONFIG_SLUB_DEBUG 3248 bool verify_mem_not_deleted(const void *x) 3249 { 3250 struct page *page; 3251 void *object = (void *)x; 3252 unsigned long flags; 3253 bool rv; 3254 3255 if (unlikely(ZERO_OR_NULL_PTR(x))) 3256 return false; 3257 3258 local_irq_save(flags); 3259 3260 page = virt_to_head_page(x); 3261 if (unlikely(!PageSlab(page))) { 3262 /* maybe it was from stack? 
*/ 3263 rv = true; 3264 goto out_unlock; 3265 } 3266 3267 slab_lock(page); 3268 if (on_freelist(page->slab, page, object)) { 3269 object_err(page->slab, page, object, "Object is on free-list"); 3270 rv = false; 3271 } else { 3272 rv = true; 3273 } 3274 slab_unlock(page); 3275 3276 out_unlock: 3277 local_irq_restore(flags); 3278 return rv; 3279 } 3280 EXPORT_SYMBOL(verify_mem_not_deleted); 3281 #endif 3282 3283 void kfree(const void *x) 3284 { 3285 struct page *page; 3286 void *object = (void *)x; 3287 3288 trace_kfree(_RET_IP_, x); 3289 3290 if (unlikely(ZERO_OR_NULL_PTR(x))) 3291 return; 3292 3293 page = virt_to_head_page(x); 3294 if (unlikely(!PageSlab(page))) { 3295 BUG_ON(!PageCompound(page)); 3296 kmemleak_free(x); 3297 put_page(page); 3298 return; 3299 } 3300 slab_free(page->slab, page, object, _RET_IP_); 3301 } 3302 EXPORT_SYMBOL(kfree); 3303 3304 /* 3305 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3306 * the remaining slabs by the number of items in use. The slabs with the 3307 * most items in use come first. New allocations will then fill those up 3308 * and thus they can be removed from the partial lists. 3309 * 3310 * The slabs with the least items are placed last. This results in them 3311 * being allocated from last increasing the chance that the last objects 3312 * are freed in them. 
3313 */ 3314 int kmem_cache_shrink(struct kmem_cache *s) 3315 { 3316 int node; 3317 int i; 3318 struct kmem_cache_node *n; 3319 struct page *page; 3320 struct page *t; 3321 int objects = oo_objects(s->max); 3322 struct list_head *slabs_by_inuse = 3323 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3324 unsigned long flags; 3325 3326 if (!slabs_by_inuse) 3327 return -ENOMEM; 3328 3329 flush_all(s); 3330 for_each_node_state(node, N_NORMAL_MEMORY) { 3331 n = get_node(s, node); 3332 3333 if (!n->nr_partial) 3334 continue; 3335 3336 for (i = 0; i < objects; i++) 3337 INIT_LIST_HEAD(slabs_by_inuse + i); 3338 3339 spin_lock_irqsave(&n->list_lock, flags); 3340 3341 /* 3342 * Build lists indexed by the items in use in each slab. 3343 * 3344 * Note that concurrent frees may occur while we hold the 3345 * list_lock. page->inuse here is the upper limit. 3346 */ 3347 list_for_each_entry_safe(page, t, &n->partial, lru) { 3348 if (!page->inuse) { 3349 remove_partial(n, page); 3350 discard_slab(s, page); 3351 } else { 3352 list_move(&page->lru, 3353 slabs_by_inuse + page->inuse); 3354 } 3355 } 3356 3357 /* 3358 * Rebuild the partial list with the slabs filled up most 3359 * first and the least used slabs at the end. 
3360 */ 3361 for (i = objects - 1; i >= 0; i--) 3362 list_splice(slabs_by_inuse + i, n->partial.prev); 3363 3364 spin_unlock_irqrestore(&n->list_lock, flags); 3365 } 3366 3367 kfree(slabs_by_inuse); 3368 return 0; 3369 } 3370 EXPORT_SYMBOL(kmem_cache_shrink); 3371 3372 #if defined(CONFIG_MEMORY_HOTPLUG) 3373 static int slab_mem_going_offline_callback(void *arg) 3374 { 3375 struct kmem_cache *s; 3376 3377 down_read(&slub_lock); 3378 list_for_each_entry(s, &slab_caches, list) 3379 kmem_cache_shrink(s); 3380 up_read(&slub_lock); 3381 3382 return 0; 3383 } 3384 3385 static void slab_mem_offline_callback(void *arg) 3386 { 3387 struct kmem_cache_node *n; 3388 struct kmem_cache *s; 3389 struct memory_notify *marg = arg; 3390 int offline_node; 3391 3392 offline_node = marg->status_change_nid; 3393 3394 /* 3395 * If the node still has available memory. we need kmem_cache_node 3396 * for it yet. 3397 */ 3398 if (offline_node < 0) 3399 return; 3400 3401 down_read(&slub_lock); 3402 list_for_each_entry(s, &slab_caches, list) { 3403 n = get_node(s, offline_node); 3404 if (n) { 3405 /* 3406 * if n->nr_slabs > 0, slabs still exist on the node 3407 * that is going down. We were unable to free them, 3408 * and offline_pages() function shouldn't call this 3409 * callback. So, we must fail. 3410 */ 3411 BUG_ON(slabs_node(s, offline_node)); 3412 3413 s->node[offline_node] = NULL; 3414 kmem_cache_free(kmem_cache_node, n); 3415 } 3416 } 3417 up_read(&slub_lock); 3418 } 3419 3420 static int slab_mem_going_online_callback(void *arg) 3421 { 3422 struct kmem_cache_node *n; 3423 struct kmem_cache *s; 3424 struct memory_notify *marg = arg; 3425 int nid = marg->status_change_nid; 3426 int ret = 0; 3427 3428 /* 3429 * If the node's memory is already available, then kmem_cache_node is 3430 * already created. Nothing to do. 3431 */ 3432 if (nid < 0) 3433 return 0; 3434 3435 /* 3436 * We are bringing a node online. No memory is available yet. 
We must 3437 * allocate a kmem_cache_node structure in order to bring the node 3438 * online. 3439 */ 3440 down_read(&slub_lock); 3441 list_for_each_entry(s, &slab_caches, list) { 3442 /* 3443 * XXX: kmem_cache_alloc_node will fallback to other nodes 3444 * since memory is not yet available from the node that 3445 * is brought up. 3446 */ 3447 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3448 if (!n) { 3449 ret = -ENOMEM; 3450 goto out; 3451 } 3452 init_kmem_cache_node(n, s); 3453 s->node[nid] = n; 3454 } 3455 out: 3456 up_read(&slub_lock); 3457 return ret; 3458 } 3459 3460 static int slab_memory_callback(struct notifier_block *self, 3461 unsigned long action, void *arg) 3462 { 3463 int ret = 0; 3464 3465 switch (action) { 3466 case MEM_GOING_ONLINE: 3467 ret = slab_mem_going_online_callback(arg); 3468 break; 3469 case MEM_GOING_OFFLINE: 3470 ret = slab_mem_going_offline_callback(arg); 3471 break; 3472 case MEM_OFFLINE: 3473 case MEM_CANCEL_ONLINE: 3474 slab_mem_offline_callback(arg); 3475 break; 3476 case MEM_ONLINE: 3477 case MEM_CANCEL_OFFLINE: 3478 break; 3479 } 3480 if (ret) 3481 ret = notifier_from_errno(ret); 3482 else 3483 ret = NOTIFY_OK; 3484 return ret; 3485 } 3486 3487 #endif /* CONFIG_MEMORY_HOTPLUG */ 3488 3489 /******************************************************************** 3490 * Basic setup of slabs 3491 *******************************************************************/ 3492 3493 /* 3494 * Used for early kmem_cache structures that were allocated using 3495 * the page allocator 3496 */ 3497 3498 static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3499 { 3500 int node; 3501 3502 list_add(&s->list, &slab_caches); 3503 s->refcount = -1; 3504 3505 for_each_node_state(node, N_NORMAL_MEMORY) { 3506 struct kmem_cache_node *n = get_node(s, node); 3507 struct page *p; 3508 3509 if (n) { 3510 list_for_each_entry(p, &n->partial, lru) 3511 p->slab = s; 3512 3513 #ifdef CONFIG_SLUB_DEBUG 3514 list_for_each_entry(p, &n->full, lru) 
3515 p->slab = s; 3516 #endif 3517 } 3518 } 3519 } 3520 3521 void __init kmem_cache_init(void) 3522 { 3523 int i; 3524 int caches = 0; 3525 struct kmem_cache *temp_kmem_cache; 3526 int order; 3527 struct kmem_cache *temp_kmem_cache_node; 3528 unsigned long kmalloc_size; 3529 3530 kmem_size = offsetof(struct kmem_cache, node) + 3531 nr_node_ids * sizeof(struct kmem_cache_node *); 3532 3533 /* Allocate two kmem_caches from the page allocator */ 3534 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3535 order = get_order(2 * kmalloc_size); 3536 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3537 3538 /* 3539 * Must first have the slab cache available for the allocations of the 3540 * struct kmem_cache_node's. There is special bootstrap code in 3541 * kmem_cache_open for slab_state == DOWN. 3542 */ 3543 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3544 3545 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3546 sizeof(struct kmem_cache_node), 3547 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3548 3549 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3550 3551 /* Able to allocate the per node structures */ 3552 slab_state = PARTIAL; 3553 3554 temp_kmem_cache = kmem_cache; 3555 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3556 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3557 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3558 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3559 3560 /* 3561 * Allocate kmem_cache_node properly from the kmem_cache slab. 3562 * kmem_cache_node is separately allocated so no need to 3563 * update any list pointers. 
3564 */ 3565 temp_kmem_cache_node = kmem_cache_node; 3566 3567 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3568 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); 3569 3570 kmem_cache_bootstrap_fixup(kmem_cache_node); 3571 3572 caches++; 3573 kmem_cache_bootstrap_fixup(kmem_cache); 3574 caches++; 3575 /* Free temporary boot structure */ 3576 free_pages((unsigned long)temp_kmem_cache, order); 3577 3578 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3579 3580 /* 3581 * Patch up the size_index table if we have strange large alignment 3582 * requirements for the kmalloc array. This is only the case for 3583 * MIPS it seems. The standard arches will not generate any code here. 3584 * 3585 * Largest permitted alignment is 256 bytes due to the way we 3586 * handle the index determination for the smaller caches. 3587 * 3588 * Make sure that nothing crazy happens if someone starts tinkering 3589 * around with ARCH_KMALLOC_MINALIGN 3590 */ 3591 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3592 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3593 3594 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 3595 int elem = size_index_elem(i); 3596 if (elem >= ARRAY_SIZE(size_index)) 3597 break; 3598 size_index[elem] = KMALLOC_SHIFT_LOW; 3599 } 3600 3601 if (KMALLOC_MIN_SIZE == 64) { 3602 /* 3603 * The 96 byte size cache is not used if the alignment 3604 * is 64 byte. 3605 */ 3606 for (i = 64 + 8; i <= 96; i += 8) 3607 size_index[size_index_elem(i)] = 7; 3608 } else if (KMALLOC_MIN_SIZE == 128) { 3609 /* 3610 * The 192 byte sized cache is not used if the alignment 3611 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3612 * instead. 
3613 */ 3614 for (i = 128 + 8; i <= 192; i += 8) 3615 size_index[size_index_elem(i)] = 8; 3616 } 3617 3618 /* Caches that are not of the two-to-the-power-of size */ 3619 if (KMALLOC_MIN_SIZE <= 32) { 3620 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); 3621 caches++; 3622 } 3623 3624 if (KMALLOC_MIN_SIZE <= 64) { 3625 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); 3626 caches++; 3627 } 3628 3629 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3630 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); 3631 caches++; 3632 } 3633 3634 slab_state = UP; 3635 3636 /* Provide the correct kmalloc names now that the caches are up */ 3637 if (KMALLOC_MIN_SIZE <= 32) { 3638 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); 3639 BUG_ON(!kmalloc_caches[1]->name); 3640 } 3641 3642 if (KMALLOC_MIN_SIZE <= 64) { 3643 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); 3644 BUG_ON(!kmalloc_caches[2]->name); 3645 } 3646 3647 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3648 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3649 3650 BUG_ON(!s); 3651 kmalloc_caches[i]->name = s; 3652 } 3653 3654 #ifdef CONFIG_SMP 3655 register_cpu_notifier(&slab_notifier); 3656 #endif 3657 3658 #ifdef CONFIG_ZONE_DMA 3659 for (i = 0; i < SLUB_PAGE_SHIFT; i++) { 3660 struct kmem_cache *s = kmalloc_caches[i]; 3661 3662 if (s && s->size) { 3663 char *name = kasprintf(GFP_NOWAIT, 3664 "dma-kmalloc-%d", s->objsize); 3665 3666 BUG_ON(!name); 3667 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3668 s->objsize, SLAB_CACHE_DMA); 3669 } 3670 } 3671 #endif 3672 printk(KERN_INFO 3673 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3674 " CPUs=%d, Nodes=%d\n", 3675 caches, cache_line_size(), 3676 slub_min_order, slub_max_order, slub_min_objects, 3677 nr_cpu_ids, nr_node_ids); 3678 } 3679 3680 void __init kmem_cache_init_late(void) 3681 { 3682 } 3683 3684 /* 3685 * Find a mergeable slab 
cache 3686 */ 3687 static int slab_unmergeable(struct kmem_cache *s) 3688 { 3689 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3690 return 1; 3691 3692 if (s->ctor) 3693 return 1; 3694 3695 /* 3696 * We may have set a slab to be unmergeable during bootstrap. 3697 */ 3698 if (s->refcount < 0) 3699 return 1; 3700 3701 return 0; 3702 } 3703 3704 static struct kmem_cache *find_mergeable(size_t size, 3705 size_t align, unsigned long flags, const char *name, 3706 void (*ctor)(void *)) 3707 { 3708 struct kmem_cache *s; 3709 3710 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3711 return NULL; 3712 3713 if (ctor) 3714 return NULL; 3715 3716 size = ALIGN(size, sizeof(void *)); 3717 align = calculate_alignment(flags, align, size); 3718 size = ALIGN(size, align); 3719 flags = kmem_cache_flags(size, flags, name, NULL); 3720 3721 list_for_each_entry(s, &slab_caches, list) { 3722 if (slab_unmergeable(s)) 3723 continue; 3724 3725 if (size > s->size) 3726 continue; 3727 3728 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3729 continue; 3730 /* 3731 * Check if alignment is compatible. 3732 * Courtesy of Adrian Drzewiecki 3733 */ 3734 if ((s->size & ~(align - 1)) != s->size) 3735 continue; 3736 3737 if (s->size - size >= sizeof(void *)) 3738 continue; 3739 3740 return s; 3741 } 3742 return NULL; 3743 } 3744 3745 struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3746 size_t align, unsigned long flags, void (*ctor)(void *)) 3747 { 3748 struct kmem_cache *s; 3749 char *n; 3750 3751 if (WARN_ON(!name)) 3752 return NULL; 3753 3754 down_write(&slub_lock); 3755 s = find_mergeable(size, align, flags, name, ctor); 3756 if (s) { 3757 s->refcount++; 3758 /* 3759 * Adjust the object sizes so that we clear 3760 * the complete object on kzalloc. 
3761 */ 3762 s->objsize = max(s->objsize, (int)size); 3763 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3764 3765 if (sysfs_slab_alias(s, name)) { 3766 s->refcount--; 3767 goto err; 3768 } 3769 up_write(&slub_lock); 3770 return s; 3771 } 3772 3773 n = kstrdup(name, GFP_KERNEL); 3774 if (!n) 3775 goto err; 3776 3777 s = kmalloc(kmem_size, GFP_KERNEL); 3778 if (s) { 3779 if (kmem_cache_open(s, n, 3780 size, align, flags, ctor)) { 3781 list_add(&s->list, &slab_caches); 3782 if (sysfs_slab_add(s)) { 3783 list_del(&s->list); 3784 kfree(n); 3785 kfree(s); 3786 goto err; 3787 } 3788 up_write(&slub_lock); 3789 return s; 3790 } 3791 kfree(n); 3792 kfree(s); 3793 } 3794 err: 3795 up_write(&slub_lock); 3796 3797 if (flags & SLAB_PANIC) 3798 panic("Cannot create slabcache %s\n", name); 3799 else 3800 s = NULL; 3801 return s; 3802 } 3803 EXPORT_SYMBOL(kmem_cache_create); 3804 3805 #ifdef CONFIG_SMP 3806 /* 3807 * Use the cpu notifier to insure that the cpu slabs are flushed when 3808 * necessary. 
3809 */ 3810 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3811 unsigned long action, void *hcpu) 3812 { 3813 long cpu = (long)hcpu; 3814 struct kmem_cache *s; 3815 unsigned long flags; 3816 3817 switch (action) { 3818 case CPU_UP_CANCELED: 3819 case CPU_UP_CANCELED_FROZEN: 3820 case CPU_DEAD: 3821 case CPU_DEAD_FROZEN: 3822 down_read(&slub_lock); 3823 list_for_each_entry(s, &slab_caches, list) { 3824 local_irq_save(flags); 3825 __flush_cpu_slab(s, cpu); 3826 local_irq_restore(flags); 3827 } 3828 up_read(&slub_lock); 3829 break; 3830 default: 3831 break; 3832 } 3833 return NOTIFY_OK; 3834 } 3835 3836 static struct notifier_block __cpuinitdata slab_notifier = { 3837 .notifier_call = slab_cpuup_callback 3838 }; 3839 3840 #endif 3841 3842 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3843 { 3844 struct kmem_cache *s; 3845 void *ret; 3846 3847 if (unlikely(size > SLUB_MAX_SIZE)) 3848 return kmalloc_large(size, gfpflags); 3849 3850 s = get_slab(size, gfpflags); 3851 3852 if (unlikely(ZERO_OR_NULL_PTR(s))) 3853 return s; 3854 3855 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3856 3857 /* Honor the call site pointer we received. */ 3858 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3859 3860 return ret; 3861 } 3862 3863 #ifdef CONFIG_NUMA 3864 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3865 int node, unsigned long caller) 3866 { 3867 struct kmem_cache *s; 3868 void *ret; 3869 3870 if (unlikely(size > SLUB_MAX_SIZE)) { 3871 ret = kmalloc_large_node(size, gfpflags, node); 3872 3873 trace_kmalloc_node(caller, ret, 3874 size, PAGE_SIZE << get_order(size), 3875 gfpflags, node); 3876 3877 return ret; 3878 } 3879 3880 s = get_slab(size, gfpflags); 3881 3882 if (unlikely(ZERO_OR_NULL_PTR(s))) 3883 return s; 3884 3885 ret = slab_alloc(s, gfpflags, node, caller); 3886 3887 /* Honor the call site pointer we received. 
*/ 3888 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3889 3890 return ret; 3891 } 3892 #endif 3893 3894 #ifdef CONFIG_SYSFS 3895 static int count_inuse(struct page *page) 3896 { 3897 return page->inuse; 3898 } 3899 3900 static int count_total(struct page *page) 3901 { 3902 return page->objects; 3903 } 3904 #endif 3905 3906 #ifdef CONFIG_SLUB_DEBUG 3907 static int validate_slab(struct kmem_cache *s, struct page *page, 3908 unsigned long *map) 3909 { 3910 void *p; 3911 void *addr = page_address(page); 3912 3913 if (!check_slab(s, page) || 3914 !on_freelist(s, page, NULL)) 3915 return 0; 3916 3917 /* Now we know that a valid freelist exists */ 3918 bitmap_zero(map, page->objects); 3919 3920 get_map(s, page, map); 3921 for_each_object(p, s, addr, page->objects) { 3922 if (test_bit(slab_index(p, s, addr), map)) 3923 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3924 return 0; 3925 } 3926 3927 for_each_object(p, s, addr, page->objects) 3928 if (!test_bit(slab_index(p, s, addr), map)) 3929 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 3930 return 0; 3931 return 1; 3932 } 3933 3934 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3935 unsigned long *map) 3936 { 3937 slab_lock(page); 3938 validate_slab(s, page, map); 3939 slab_unlock(page); 3940 } 3941 3942 static int validate_slab_node(struct kmem_cache *s, 3943 struct kmem_cache_node *n, unsigned long *map) 3944 { 3945 unsigned long count = 0; 3946 struct page *page; 3947 unsigned long flags; 3948 3949 spin_lock_irqsave(&n->list_lock, flags); 3950 3951 list_for_each_entry(page, &n->partial, lru) { 3952 validate_slab_slab(s, page, map); 3953 count++; 3954 } 3955 if (count != n->nr_partial) 3956 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 3957 "counter=%ld\n", s->name, count, n->nr_partial); 3958 3959 if (!(s->flags & SLAB_STORE_USER)) 3960 goto out; 3961 3962 list_for_each_entry(page, &n->full, lru) { 3963 validate_slab_slab(s, page, map); 3964 count++; 3965 } 
3966 if (count != atomic_long_read(&n->nr_slabs)) 3967 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 3968 "counter=%ld\n", s->name, count, 3969 atomic_long_read(&n->nr_slabs)); 3970 3971 out: 3972 spin_unlock_irqrestore(&n->list_lock, flags); 3973 return count; 3974 } 3975 3976 static long validate_slab_cache(struct kmem_cache *s) 3977 { 3978 int node; 3979 unsigned long count = 0; 3980 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3981 sizeof(unsigned long), GFP_KERNEL); 3982 3983 if (!map) 3984 return -ENOMEM; 3985 3986 flush_all(s); 3987 for_each_node_state(node, N_NORMAL_MEMORY) { 3988 struct kmem_cache_node *n = get_node(s, node); 3989 3990 count += validate_slab_node(s, n, map); 3991 } 3992 kfree(map); 3993 return count; 3994 } 3995 /* 3996 * Generate lists of code addresses where slabcache objects are allocated 3997 * and freed. 3998 */ 3999 4000 struct location { 4001 unsigned long count; 4002 unsigned long addr; 4003 long long sum_time; 4004 long min_time; 4005 long max_time; 4006 long min_pid; 4007 long max_pid; 4008 DECLARE_BITMAP(cpus, NR_CPUS); 4009 nodemask_t nodes; 4010 }; 4011 4012 struct loc_track { 4013 unsigned long max; 4014 unsigned long count; 4015 struct location *loc; 4016 }; 4017 4018 static void free_loc_track(struct loc_track *t) 4019 { 4020 if (t->max) 4021 free_pages((unsigned long)t->loc, 4022 get_order(sizeof(struct location) * t->max)); 4023 } 4024 4025 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 4026 { 4027 struct location *l; 4028 int order; 4029 4030 order = get_order(sizeof(struct location) * max); 4031 4032 l = (void *)__get_free_pages(flags, order); 4033 if (!l) 4034 return 0; 4035 4036 if (t->count) { 4037 memcpy(l, t->loc, sizeof(struct location) * t->count); 4038 free_loc_track(t); 4039 } 4040 t->max = max; 4041 t->loc = l; 4042 return 1; 4043 } 4044 4045 static int add_location(struct loc_track *t, struct kmem_cache *s, 4046 const struct track *track) 4047 { 4048 
long start, end, pos; 4049 struct location *l; 4050 unsigned long caddr; 4051 unsigned long age = jiffies - track->when; 4052 4053 start = -1; 4054 end = t->count; 4055 4056 for ( ; ; ) { 4057 pos = start + (end - start + 1) / 2; 4058 4059 /* 4060 * There is nothing at "end". If we end up there 4061 * we need to add something to before end. 4062 */ 4063 if (pos == end) 4064 break; 4065 4066 caddr = t->loc[pos].addr; 4067 if (track->addr == caddr) { 4068 4069 l = &t->loc[pos]; 4070 l->count++; 4071 if (track->when) { 4072 l->sum_time += age; 4073 if (age < l->min_time) 4074 l->min_time = age; 4075 if (age > l->max_time) 4076 l->max_time = age; 4077 4078 if (track->pid < l->min_pid) 4079 l->min_pid = track->pid; 4080 if (track->pid > l->max_pid) 4081 l->max_pid = track->pid; 4082 4083 cpumask_set_cpu(track->cpu, 4084 to_cpumask(l->cpus)); 4085 } 4086 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4087 return 1; 4088 } 4089 4090 if (track->addr < caddr) 4091 end = pos; 4092 else 4093 start = pos; 4094 } 4095 4096 /* 4097 * Not found. Insert new tracking element. 
4098 */ 4099 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4100 return 0; 4101 4102 l = t->loc + pos; 4103 if (pos < t->count) 4104 memmove(l + 1, l, 4105 (t->count - pos) * sizeof(struct location)); 4106 t->count++; 4107 l->count = 1; 4108 l->addr = track->addr; 4109 l->sum_time = age; 4110 l->min_time = age; 4111 l->max_time = age; 4112 l->min_pid = track->pid; 4113 l->max_pid = track->pid; 4114 cpumask_clear(to_cpumask(l->cpus)); 4115 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4116 nodes_clear(l->nodes); 4117 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4118 return 1; 4119 } 4120 4121 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4122 struct page *page, enum track_item alloc, 4123 unsigned long *map) 4124 { 4125 void *addr = page_address(page); 4126 void *p; 4127 4128 bitmap_zero(map, page->objects); 4129 get_map(s, page, map); 4130 4131 for_each_object(p, s, addr, page->objects) 4132 if (!test_bit(slab_index(p, s, addr), map)) 4133 add_location(t, s, get_track(s, p, alloc)); 4134 } 4135 4136 static int list_locations(struct kmem_cache *s, char *buf, 4137 enum track_item alloc) 4138 { 4139 int len = 0; 4140 unsigned long i; 4141 struct loc_track t = { 0, 0, NULL }; 4142 int node; 4143 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4144 sizeof(unsigned long), GFP_KERNEL); 4145 4146 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4147 GFP_TEMPORARY)) { 4148 kfree(map); 4149 return sprintf(buf, "Out of memory\n"); 4150 } 4151 /* Push back cpu slabs */ 4152 flush_all(s); 4153 4154 for_each_node_state(node, N_NORMAL_MEMORY) { 4155 struct kmem_cache_node *n = get_node(s, node); 4156 unsigned long flags; 4157 struct page *page; 4158 4159 if (!atomic_long_read(&n->nr_slabs)) 4160 continue; 4161 4162 spin_lock_irqsave(&n->list_lock, flags); 4163 list_for_each_entry(page, &n->partial, lru) 4164 process_slab(&t, s, page, alloc, map); 4165 list_for_each_entry(page, 
&n->full, lru) 4166 process_slab(&t, s, page, alloc, map); 4167 spin_unlock_irqrestore(&n->list_lock, flags); 4168 } 4169 4170 for (i = 0; i < t.count; i++) { 4171 struct location *l = &t.loc[i]; 4172 4173 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4174 break; 4175 len += sprintf(buf + len, "%7ld ", l->count); 4176 4177 if (l->addr) 4178 len += sprintf(buf + len, "%pS", (void *)l->addr); 4179 else 4180 len += sprintf(buf + len, "<not-available>"); 4181 4182 if (l->sum_time != l->min_time) { 4183 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4184 l->min_time, 4185 (long)div_u64(l->sum_time, l->count), 4186 l->max_time); 4187 } else 4188 len += sprintf(buf + len, " age=%ld", 4189 l->min_time); 4190 4191 if (l->min_pid != l->max_pid) 4192 len += sprintf(buf + len, " pid=%ld-%ld", 4193 l->min_pid, l->max_pid); 4194 else 4195 len += sprintf(buf + len, " pid=%ld", 4196 l->min_pid); 4197 4198 if (num_online_cpus() > 1 && 4199 !cpumask_empty(to_cpumask(l->cpus)) && 4200 len < PAGE_SIZE - 60) { 4201 len += sprintf(buf + len, " cpus="); 4202 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4203 to_cpumask(l->cpus)); 4204 } 4205 4206 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4207 len < PAGE_SIZE - 60) { 4208 len += sprintf(buf + len, " nodes="); 4209 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4210 l->nodes); 4211 } 4212 4213 len += sprintf(buf + len, "\n"); 4214 } 4215 4216 free_loc_track(&t); 4217 kfree(map); 4218 if (!t.count) 4219 len += sprintf(buf, "No data\n"); 4220 return len; 4221 } 4222 #endif 4223 4224 #ifdef SLUB_RESILIENCY_TEST 4225 static void resiliency_test(void) 4226 { 4227 u8 *p; 4228 4229 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); 4230 4231 printk(KERN_ERR "SLUB resiliency testing\n"); 4232 printk(KERN_ERR "-----------------------\n"); 4233 printk(KERN_ERR "A. Corruption after allocation\n"); 4234 4235 p = kzalloc(16, GFP_KERNEL); 4236 p[16] = 0x12; 4237 printk(KERN_ERR "\n1. 
kmalloc-16: Clobber Redzone/next pointer" 4238 " 0x12->0x%p\n\n", p + 16); 4239 4240 validate_slab_cache(kmalloc_caches[4]); 4241 4242 /* Hmmm... The next two are dangerous */ 4243 p = kzalloc(32, GFP_KERNEL); 4244 p[32 + sizeof(void *)] = 0x34; 4245 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4246 " 0x34 -> -0x%p\n", p); 4247 printk(KERN_ERR 4248 "If allocated object is overwritten then not detectable\n\n"); 4249 4250 validate_slab_cache(kmalloc_caches[5]); 4251 p = kzalloc(64, GFP_KERNEL); 4252 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4253 *p = 0x56; 4254 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4255 p); 4256 printk(KERN_ERR 4257 "If allocated object is overwritten then not detectable\n\n"); 4258 validate_slab_cache(kmalloc_caches[6]); 4259 4260 printk(KERN_ERR "\nB. Corruption after free\n"); 4261 p = kzalloc(128, GFP_KERNEL); 4262 kfree(p); 4263 *p = 0x78; 4264 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4265 validate_slab_cache(kmalloc_caches[7]); 4266 4267 p = kzalloc(256, GFP_KERNEL); 4268 kfree(p); 4269 p[50] = 0x9a; 4270 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4271 p); 4272 validate_slab_cache(kmalloc_caches[8]); 4273 4274 p = kzalloc(512, GFP_KERNEL); 4275 kfree(p); 4276 p[512] = 0xab; 4277 printk(KERN_ERR "\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4278 validate_slab_cache(kmalloc_caches[9]); 4279 } 4280 #else 4281 #ifdef CONFIG_SYSFS 4282 static void resiliency_test(void) {}; 4283 #endif 4284 #endif 4285 4286 #ifdef CONFIG_SYSFS 4287 enum slab_stat_type { 4288 SL_ALL, /* All slabs */ 4289 SL_PARTIAL, /* Only partially allocated slabs */ 4290 SL_CPU, /* Only slabs used for cpu caches */ 4291 SL_OBJECTS, /* Determine allocated objects not slabs */ 4292 SL_TOTAL /* Determine object capacity not slabs */ 4293 }; 4294 4295 #define SO_ALL (1 << SL_ALL) 4296 #define SO_PARTIAL (1 << SL_PARTIAL) 4297 #define SO_CPU (1 << SL_CPU) 4298 #define SO_OBJECTS (1 << SL_OBJECTS) 4299 #define SO_TOTAL (1 << SL_TOTAL) 4300 4301 static ssize_t show_slab_objects(struct kmem_cache *s, 4302 char *buf, unsigned long flags) 4303 { 4304 unsigned long total = 0; 4305 int node; 4306 int x; 4307 unsigned long *nodes; 4308 unsigned long *per_cpu; 4309 4310 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4311 if (!nodes) 4312 return -ENOMEM; 4313 per_cpu = nodes + nr_node_ids; 4314 4315 if (flags & SO_CPU) { 4316 int cpu; 4317 4318 for_each_possible_cpu(cpu) { 4319 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4320 4321 if (!c || c->node < 0) 4322 continue; 4323 4324 if (c->page) { 4325 if (flags & SO_TOTAL) 4326 x = c->page->objects; 4327 else if (flags & SO_OBJECTS) 4328 x = c->page->inuse; 4329 else 4330 x = 1; 4331 4332 total += x; 4333 nodes[c->node] += x; 4334 } 4335 per_cpu[c->node]++; 4336 } 4337 } 4338 4339 lock_memory_hotplug(); 4340 #ifdef CONFIG_SLUB_DEBUG 4341 if (flags & SO_ALL) { 4342 for_each_node_state(node, N_NORMAL_MEMORY) { 4343 struct kmem_cache_node *n = get_node(s, node); 4344 4345 if (flags & SO_TOTAL) 4346 x = atomic_long_read(&n->total_objects); 4347 else if (flags & SO_OBJECTS) 4348 x = atomic_long_read(&n->total_objects) - 4349 count_partial(n, count_free); 4350 4351 else 4352 x = atomic_long_read(&n->nr_slabs); 4353 total += x; 
4354 nodes[node] += x; 4355 } 4356 4357 } else 4358 #endif 4359 if (flags & SO_PARTIAL) { 4360 for_each_node_state(node, N_NORMAL_MEMORY) { 4361 struct kmem_cache_node *n = get_node(s, node); 4362 4363 if (flags & SO_TOTAL) 4364 x = count_partial(n, count_total); 4365 else if (flags & SO_OBJECTS) 4366 x = count_partial(n, count_inuse); 4367 else 4368 x = n->nr_partial; 4369 total += x; 4370 nodes[node] += x; 4371 } 4372 } 4373 x = sprintf(buf, "%lu", total); 4374 #ifdef CONFIG_NUMA 4375 for_each_node_state(node, N_NORMAL_MEMORY) 4376 if (nodes[node]) 4377 x += sprintf(buf + x, " N%d=%lu", 4378 node, nodes[node]); 4379 #endif 4380 unlock_memory_hotplug(); 4381 kfree(nodes); 4382 return x + sprintf(buf + x, "\n"); 4383 } 4384 4385 #ifdef CONFIG_SLUB_DEBUG 4386 static int any_slab_objects(struct kmem_cache *s) 4387 { 4388 int node; 4389 4390 for_each_online_node(node) { 4391 struct kmem_cache_node *n = get_node(s, node); 4392 4393 if (!n) 4394 continue; 4395 4396 if (atomic_long_read(&n->total_objects)) 4397 return 1; 4398 } 4399 return 0; 4400 } 4401 #endif 4402 4403 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4404 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4405 4406 struct slab_attribute { 4407 struct attribute attr; 4408 ssize_t (*show)(struct kmem_cache *s, char *buf); 4409 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4410 }; 4411 4412 #define SLAB_ATTR_RO(_name) \ 4413 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4414 4415 #define SLAB_ATTR(_name) \ 4416 static struct slab_attribute _name##_attr = \ 4417 __ATTR(_name, 0644, _name##_show, _name##_store) 4418 4419 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4420 { 4421 return sprintf(buf, "%d\n", s->size); 4422 } 4423 SLAB_ATTR_RO(slab_size); 4424 4425 static ssize_t align_show(struct kmem_cache *s, char *buf) 4426 { 4427 return sprintf(buf, "%d\n", s->align); 4428 } 4429 SLAB_ATTR_RO(align); 4430 4431 static 
ssize_t object_size_show(struct kmem_cache *s, char *buf) 4432 { 4433 return sprintf(buf, "%d\n", s->objsize); 4434 } 4435 SLAB_ATTR_RO(object_size); 4436 4437 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4438 { 4439 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4440 } 4441 SLAB_ATTR_RO(objs_per_slab); 4442 4443 static ssize_t order_store(struct kmem_cache *s, 4444 const char *buf, size_t length) 4445 { 4446 unsigned long order; 4447 int err; 4448 4449 err = strict_strtoul(buf, 10, &order); 4450 if (err) 4451 return err; 4452 4453 if (order > slub_max_order || order < slub_min_order) 4454 return -EINVAL; 4455 4456 calculate_sizes(s, order); 4457 return length; 4458 } 4459 4460 static ssize_t order_show(struct kmem_cache *s, char *buf) 4461 { 4462 return sprintf(buf, "%d\n", oo_order(s->oo)); 4463 } 4464 SLAB_ATTR(order); 4465 4466 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4467 { 4468 return sprintf(buf, "%lu\n", s->min_partial); 4469 } 4470 4471 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4472 size_t length) 4473 { 4474 unsigned long min; 4475 int err; 4476 4477 err = strict_strtoul(buf, 10, &min); 4478 if (err) 4479 return err; 4480 4481 set_min_partial(s, min); 4482 return length; 4483 } 4484 SLAB_ATTR(min_partial); 4485 4486 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4487 { 4488 if (!s->ctor) 4489 return 0; 4490 return sprintf(buf, "%pS\n", s->ctor); 4491 } 4492 SLAB_ATTR_RO(ctor); 4493 4494 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4495 { 4496 return sprintf(buf, "%d\n", s->refcount - 1); 4497 } 4498 SLAB_ATTR_RO(aliases); 4499 4500 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4501 { 4502 return show_slab_objects(s, buf, SO_PARTIAL); 4503 } 4504 SLAB_ATTR_RO(partial); 4505 4506 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4507 { 4508 return show_slab_objects(s, buf, SO_CPU); 4509 } 4510 SLAB_ATTR_RO(cpu_slabs); 4511 4512 
static ssize_t objects_show(struct kmem_cache *s, char *buf) 4513 { 4514 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4515 } 4516 SLAB_ATTR_RO(objects); 4517 4518 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4519 { 4520 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4521 } 4522 SLAB_ATTR_RO(objects_partial); 4523 4524 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4525 { 4526 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4527 } 4528 4529 static ssize_t reclaim_account_store(struct kmem_cache *s, 4530 const char *buf, size_t length) 4531 { 4532 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4533 if (buf[0] == '1') 4534 s->flags |= SLAB_RECLAIM_ACCOUNT; 4535 return length; 4536 } 4537 SLAB_ATTR(reclaim_account); 4538 4539 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4540 { 4541 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4542 } 4543 SLAB_ATTR_RO(hwcache_align); 4544 4545 #ifdef CONFIG_ZONE_DMA 4546 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4547 { 4548 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4549 } 4550 SLAB_ATTR_RO(cache_dma); 4551 #endif 4552 4553 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4554 { 4555 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4556 } 4557 SLAB_ATTR_RO(destroy_by_rcu); 4558 4559 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4560 { 4561 return sprintf(buf, "%d\n", s->reserved); 4562 } 4563 SLAB_ATTR_RO(reserved); 4564 4565 #ifdef CONFIG_SLUB_DEBUG 4566 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4567 { 4568 return show_slab_objects(s, buf, SO_ALL); 4569 } 4570 SLAB_ATTR_RO(slabs); 4571 4572 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4573 { 4574 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4575 } 4576 SLAB_ATTR_RO(total_objects); 4577 4578 static ssize_t sanity_checks_show(struct kmem_cache *s, 
char *buf) 4579 { 4580 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4581 } 4582 4583 static ssize_t sanity_checks_store(struct kmem_cache *s, 4584 const char *buf, size_t length) 4585 { 4586 s->flags &= ~SLAB_DEBUG_FREE; 4587 if (buf[0] == '1') { 4588 s->flags &= ~__CMPXCHG_DOUBLE; 4589 s->flags |= SLAB_DEBUG_FREE; 4590 } 4591 return length; 4592 } 4593 SLAB_ATTR(sanity_checks); 4594 4595 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4596 { 4597 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4598 } 4599 4600 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4601 size_t length) 4602 { 4603 s->flags &= ~SLAB_TRACE; 4604 if (buf[0] == '1') { 4605 s->flags &= ~__CMPXCHG_DOUBLE; 4606 s->flags |= SLAB_TRACE; 4607 } 4608 return length; 4609 } 4610 SLAB_ATTR(trace); 4611 4612 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4613 { 4614 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4615 } 4616 4617 static ssize_t red_zone_store(struct kmem_cache *s, 4618 const char *buf, size_t length) 4619 { 4620 if (any_slab_objects(s)) 4621 return -EBUSY; 4622 4623 s->flags &= ~SLAB_RED_ZONE; 4624 if (buf[0] == '1') { 4625 s->flags &= ~__CMPXCHG_DOUBLE; 4626 s->flags |= SLAB_RED_ZONE; 4627 } 4628 calculate_sizes(s, -1); 4629 return length; 4630 } 4631 SLAB_ATTR(red_zone); 4632 4633 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4634 { 4635 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4636 } 4637 4638 static ssize_t poison_store(struct kmem_cache *s, 4639 const char *buf, size_t length) 4640 { 4641 if (any_slab_objects(s)) 4642 return -EBUSY; 4643 4644 s->flags &= ~SLAB_POISON; 4645 if (buf[0] == '1') { 4646 s->flags &= ~__CMPXCHG_DOUBLE; 4647 s->flags |= SLAB_POISON; 4648 } 4649 calculate_sizes(s, -1); 4650 return length; 4651 } 4652 SLAB_ATTR(poison); 4653 4654 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4655 { 4656 return sprintf(buf, "%d\n", !!(s->flags & 
SLAB_STORE_USER)); 4657 } 4658 4659 static ssize_t store_user_store(struct kmem_cache *s, 4660 const char *buf, size_t length) 4661 { 4662 if (any_slab_objects(s)) 4663 return -EBUSY; 4664 4665 s->flags &= ~SLAB_STORE_USER; 4666 if (buf[0] == '1') { 4667 s->flags &= ~__CMPXCHG_DOUBLE; 4668 s->flags |= SLAB_STORE_USER; 4669 } 4670 calculate_sizes(s, -1); 4671 return length; 4672 } 4673 SLAB_ATTR(store_user); 4674 4675 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4676 { 4677 return 0; 4678 } 4679 4680 static ssize_t validate_store(struct kmem_cache *s, 4681 const char *buf, size_t length) 4682 { 4683 int ret = -EINVAL; 4684 4685 if (buf[0] == '1') { 4686 ret = validate_slab_cache(s); 4687 if (ret >= 0) 4688 ret = length; 4689 } 4690 return ret; 4691 } 4692 SLAB_ATTR(validate); 4693 4694 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4695 { 4696 if (!(s->flags & SLAB_STORE_USER)) 4697 return -ENOSYS; 4698 return list_locations(s, buf, TRACK_ALLOC); 4699 } 4700 SLAB_ATTR_RO(alloc_calls); 4701 4702 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4703 { 4704 if (!(s->flags & SLAB_STORE_USER)) 4705 return -ENOSYS; 4706 return list_locations(s, buf, TRACK_FREE); 4707 } 4708 SLAB_ATTR_RO(free_calls); 4709 #endif /* CONFIG_SLUB_DEBUG */ 4710 4711 #ifdef CONFIG_FAILSLAB 4712 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4713 { 4714 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4715 } 4716 4717 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4718 size_t length) 4719 { 4720 s->flags &= ~SLAB_FAILSLAB; 4721 if (buf[0] == '1') 4722 s->flags |= SLAB_FAILSLAB; 4723 return length; 4724 } 4725 SLAB_ATTR(failslab); 4726 #endif 4727 4728 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4729 { 4730 return 0; 4731 } 4732 4733 static ssize_t shrink_store(struct kmem_cache *s, 4734 const char *buf, size_t length) 4735 { 4736 if (buf[0] == '1') { 4737 int rc = 
kmem_cache_shrink(s); 4738 4739 if (rc) 4740 return rc; 4741 } else 4742 return -EINVAL; 4743 return length; 4744 } 4745 SLAB_ATTR(shrink); 4746 4747 #ifdef CONFIG_NUMA 4748 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4749 { 4750 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4751 } 4752 4753 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4754 const char *buf, size_t length) 4755 { 4756 unsigned long ratio; 4757 int err; 4758 4759 err = strict_strtoul(buf, 10, &ratio); 4760 if (err) 4761 return err; 4762 4763 if (ratio <= 100) 4764 s->remote_node_defrag_ratio = ratio * 10; 4765 4766 return length; 4767 } 4768 SLAB_ATTR(remote_node_defrag_ratio); 4769 #endif 4770 4771 #ifdef CONFIG_SLUB_STATS 4772 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 4773 { 4774 unsigned long sum = 0; 4775 int cpu; 4776 int len; 4777 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4778 4779 if (!data) 4780 return -ENOMEM; 4781 4782 for_each_online_cpu(cpu) { 4783 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4784 4785 data[cpu] = x; 4786 sum += x; 4787 } 4788 4789 len = sprintf(buf, "%lu", sum); 4790 4791 #ifdef CONFIG_SMP 4792 for_each_online_cpu(cpu) { 4793 if (data[cpu] && len < PAGE_SIZE - 20) 4794 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 4795 } 4796 #endif 4797 kfree(data); 4798 return len + sprintf(buf + len, "\n"); 4799 } 4800 4801 static void clear_stat(struct kmem_cache *s, enum stat_item si) 4802 { 4803 int cpu; 4804 4805 for_each_online_cpu(cpu) 4806 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 4807 } 4808 4809 #define STAT_ATTR(si, text) \ 4810 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4811 { \ 4812 return show_stat(s, buf, si); \ 4813 } \ 4814 static ssize_t text##_store(struct kmem_cache *s, \ 4815 const char *buf, size_t length) \ 4816 { \ 4817 if (buf[0] != '0') \ 4818 return -EINVAL; \ 4819 clear_stat(s, si); \ 4820 return 
length; \ 4821 } \ 4822 SLAB_ATTR(text); \ 4823 4824 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4825 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4826 STAT_ATTR(FREE_FASTPATH, free_fastpath); 4827 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4828 STAT_ATTR(FREE_FROZEN, free_frozen); 4829 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4830 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4831 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4832 STAT_ATTR(ALLOC_SLAB, alloc_slab); 4833 STAT_ATTR(ALLOC_REFILL, alloc_refill); 4834 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 4835 STAT_ATTR(FREE_SLAB, free_slab); 4836 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4837 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4838 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4839 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4840 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4841 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4842 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 4843 STAT_ATTR(ORDER_FALLBACK, order_fallback); 4844 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4845 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4846 #endif 4847 4848 static struct attribute *slab_attrs[] = { 4849 &slab_size_attr.attr, 4850 &object_size_attr.attr, 4851 &objs_per_slab_attr.attr, 4852 &order_attr.attr, 4853 &min_partial_attr.attr, 4854 &objects_attr.attr, 4855 &objects_partial_attr.attr, 4856 &partial_attr.attr, 4857 &cpu_slabs_attr.attr, 4858 &ctor_attr.attr, 4859 &aliases_attr.attr, 4860 &align_attr.attr, 4861 &hwcache_align_attr.attr, 4862 &reclaim_account_attr.attr, 4863 &destroy_by_rcu_attr.attr, 4864 &shrink_attr.attr, 4865 &reserved_attr.attr, 4866 #ifdef CONFIG_SLUB_DEBUG 4867 &total_objects_attr.attr, 4868 &slabs_attr.attr, 4869 &sanity_checks_attr.attr, 4870 &trace_attr.attr, 4871 &red_zone_attr.attr, 4872 &poison_attr.attr, 4873 &store_user_attr.attr, 4874 &validate_attr.attr, 4875 &alloc_calls_attr.attr, 4876 
&free_calls_attr.attr, 4877 #endif 4878 #ifdef CONFIG_ZONE_DMA 4879 &cache_dma_attr.attr, 4880 #endif 4881 #ifdef CONFIG_NUMA 4882 &remote_node_defrag_ratio_attr.attr, 4883 #endif 4884 #ifdef CONFIG_SLUB_STATS 4885 &alloc_fastpath_attr.attr, 4886 &alloc_slowpath_attr.attr, 4887 &free_fastpath_attr.attr, 4888 &free_slowpath_attr.attr, 4889 &free_frozen_attr.attr, 4890 &free_add_partial_attr.attr, 4891 &free_remove_partial_attr.attr, 4892 &alloc_from_partial_attr.attr, 4893 &alloc_slab_attr.attr, 4894 &alloc_refill_attr.attr, 4895 &alloc_node_mismatch_attr.attr, 4896 &free_slab_attr.attr, 4897 &cpuslab_flush_attr.attr, 4898 &deactivate_full_attr.attr, 4899 &deactivate_empty_attr.attr, 4900 &deactivate_to_head_attr.attr, 4901 &deactivate_to_tail_attr.attr, 4902 &deactivate_remote_frees_attr.attr, 4903 &deactivate_bypass_attr.attr, 4904 &order_fallback_attr.attr, 4905 &cmpxchg_double_fail_attr.attr, 4906 &cmpxchg_double_cpu_fail_attr.attr, 4907 #endif 4908 #ifdef CONFIG_FAILSLAB 4909 &failslab_attr.attr, 4910 #endif 4911 4912 NULL 4913 }; 4914 4915 static struct attribute_group slab_attr_group = { 4916 .attrs = slab_attrs, 4917 }; 4918 4919 static ssize_t slab_attr_show(struct kobject *kobj, 4920 struct attribute *attr, 4921 char *buf) 4922 { 4923 struct slab_attribute *attribute; 4924 struct kmem_cache *s; 4925 int err; 4926 4927 attribute = to_slab_attr(attr); 4928 s = to_slab(kobj); 4929 4930 if (!attribute->show) 4931 return -EIO; 4932 4933 err = attribute->show(s, buf); 4934 4935 return err; 4936 } 4937 4938 static ssize_t slab_attr_store(struct kobject *kobj, 4939 struct attribute *attr, 4940 const char *buf, size_t len) 4941 { 4942 struct slab_attribute *attribute; 4943 struct kmem_cache *s; 4944 int err; 4945 4946 attribute = to_slab_attr(attr); 4947 s = to_slab(kobj); 4948 4949 if (!attribute->store) 4950 return -EIO; 4951 4952 err = attribute->store(s, buf, len); 4953 4954 return err; 4955 } 4956 4957 static void kmem_cache_release(struct kobject *kobj) 4958 
{ 4959 struct kmem_cache *s = to_slab(kobj); 4960 4961 kfree(s->name); 4962 kfree(s); 4963 } 4964 4965 static const struct sysfs_ops slab_sysfs_ops = { 4966 .show = slab_attr_show, 4967 .store = slab_attr_store, 4968 }; 4969 4970 static struct kobj_type slab_ktype = { 4971 .sysfs_ops = &slab_sysfs_ops, 4972 .release = kmem_cache_release 4973 }; 4974 4975 static int uevent_filter(struct kset *kset, struct kobject *kobj) 4976 { 4977 struct kobj_type *ktype = get_ktype(kobj); 4978 4979 if (ktype == &slab_ktype) 4980 return 1; 4981 return 0; 4982 } 4983 4984 static const struct kset_uevent_ops slab_uevent_ops = { 4985 .filter = uevent_filter, 4986 }; 4987 4988 static struct kset *slab_kset; 4989 4990 #define ID_STR_LENGTH 64 4991 4992 /* Create a unique string id for a slab cache: 4993 * 4994 * Format :[flags-]size 4995 */ 4996 static char *create_unique_id(struct kmem_cache *s) 4997 { 4998 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 4999 char *p = name; 5000 5001 BUG_ON(!name); 5002 5003 *p++ = ':'; 5004 /* 5005 * First flags affecting slabcache operations. We will only 5006 * get here for aliasable slabs so we do not need to support 5007 * too many flags. The flags here must cover all flags that 5008 * are matched during merging to guarantee that the id is 5009 * unique. 
5010 */ 5011 if (s->flags & SLAB_CACHE_DMA) 5012 *p++ = 'd'; 5013 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5014 *p++ = 'a'; 5015 if (s->flags & SLAB_DEBUG_FREE) 5016 *p++ = 'F'; 5017 if (!(s->flags & SLAB_NOTRACK)) 5018 *p++ = 't'; 5019 if (p != name + 1) 5020 *p++ = '-'; 5021 p += sprintf(p, "%07d", s->size); 5022 BUG_ON(p > name + ID_STR_LENGTH - 1); 5023 return name; 5024 } 5025 5026 static int sysfs_slab_add(struct kmem_cache *s) 5027 { 5028 int err; 5029 const char *name; 5030 int unmergeable; 5031 5032 if (slab_state < SYSFS) 5033 /* Defer until later */ 5034 return 0; 5035 5036 unmergeable = slab_unmergeable(s); 5037 if (unmergeable) { 5038 /* 5039 * Slabcache can never be merged so we can use the name proper. 5040 * This is typically the case for debug situations. In that 5041 * case we can catch duplicate names easily. 5042 */ 5043 sysfs_remove_link(&slab_kset->kobj, s->name); 5044 name = s->name; 5045 } else { 5046 /* 5047 * Create a unique name for the slab as a target 5048 * for the symlinks. 5049 */ 5050 name = create_unique_id(s); 5051 } 5052 5053 s->kobj.kset = slab_kset; 5054 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5055 if (err) { 5056 kobject_put(&s->kobj); 5057 return err; 5058 } 5059 5060 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5061 if (err) { 5062 kobject_del(&s->kobj); 5063 kobject_put(&s->kobj); 5064 return err; 5065 } 5066 kobject_uevent(&s->kobj, KOBJ_ADD); 5067 if (!unmergeable) { 5068 /* Setup first alias */ 5069 sysfs_slab_alias(s, s->name); 5070 kfree(name); 5071 } 5072 return 0; 5073 } 5074 5075 static void sysfs_slab_remove(struct kmem_cache *s) 5076 { 5077 if (slab_state < SYSFS) 5078 /* 5079 * Sysfs has not been setup yet so no need to remove the 5080 * cache from sysfs. 
5081 */ 5082 return; 5083 5084 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5085 kobject_del(&s->kobj); 5086 kobject_put(&s->kobj); 5087 } 5088 5089 /* 5090 * Need to buffer aliases during bootup until sysfs becomes 5091 * available lest we lose that information. 5092 */ 5093 struct saved_alias { 5094 struct kmem_cache *s; 5095 const char *name; 5096 struct saved_alias *next; 5097 }; 5098 5099 static struct saved_alias *alias_list; 5100 5101 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5102 { 5103 struct saved_alias *al; 5104 5105 if (slab_state == SYSFS) { 5106 /* 5107 * If we have a leftover link then remove it. 5108 */ 5109 sysfs_remove_link(&slab_kset->kobj, name); 5110 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 5111 } 5112 5113 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 5114 if (!al) 5115 return -ENOMEM; 5116 5117 al->s = s; 5118 al->name = name; 5119 al->next = alias_list; 5120 alias_list = al; 5121 return 0; 5122 } 5123 5124 static int __init slab_sysfs_init(void) 5125 { 5126 struct kmem_cache *s; 5127 int err; 5128 5129 down_write(&slub_lock); 5130 5131 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5132 if (!slab_kset) { 5133 up_write(&slub_lock); 5134 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5135 return -ENOSYS; 5136 } 5137 5138 slab_state = SYSFS; 5139 5140 list_for_each_entry(s, &slab_caches, list) { 5141 err = sysfs_slab_add(s); 5142 if (err) 5143 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5144 " to sysfs\n", s->name); 5145 } 5146 5147 while (alias_list) { 5148 struct saved_alias *al = alias_list; 5149 5150 alias_list = alias_list->next; 5151 err = sysfs_slab_alias(al->s, al->name); 5152 if (err) 5153 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5154 " %s to sysfs\n", s->name); 5155 kfree(al); 5156 } 5157 5158 up_write(&slub_lock); 5159 resiliency_test(); 5160 return 0; 5161 } 5162 5163 __initcall(slab_sysfs_init); 5164 #endif /* CONFIG_SYSFS */ 5165 
/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLABINFO
/* Write the version line and the column legend of /proc/slabinfo. */
static void print_slabinfo_header(struct seq_file *m)
{
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
		 "<objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}

/*
 * seq_file start: take slub_lock for reading (dropped again in s_stop)
 * and emit the header when the read begins at offset zero.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
{
	const loff_t offset = *pos;

	down_read(&slub_lock);
	if (offset == 0)
		print_slabinfo_header(m);

	return seq_list_start(&slab_caches, *pos);
}

/* seq_file next: advance to the following entry of slab_caches. */
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}

/* seq_file stop: release the read lock taken in s_start. */
static void s_stop(struct seq_file *m, void *p)
{
	up_read(&slub_lock);
}

/*
 * seq_file show: print one cache as a slabinfo row. The counters are
 * summed over every online node that has a kmem_cache_node for the
 * cache. The tunables columns and <sharedavail> are always printed as 0.
 */
static int s_show(struct seq_file *m, void *p)
{
	struct kmem_cache *cache = list_entry(p, struct kmem_cache, list);
	unsigned long partial_slabs = 0;
	unsigned long total_slabs = 0;
	unsigned long total_objs = 0;
	unsigned long free_objs = 0;
	unsigned long used_objs;
	int nid;

	for_each_online_node(nid) {
		struct kmem_cache_node *kn = get_node(cache, nid);

		if (kn) {
			partial_slabs += kn->nr_partial;
			total_slabs += atomic_long_read(&kn->nr_slabs);
			total_objs += atomic_long_read(&kn->total_objects);
			free_objs += count_partial(kn, count_free);
		}
	}

	/* Objects in use are whatever is not counted as free above. */
	used_objs = total_objs - free_objs;

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
		   cache->name, used_objs, total_objs, cache->size,
		   oo_objects(cache->oo), (1 << oo_order(cache->oo)));
	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
	seq_printf(m, " : slabdata %6lu %6lu %6lu",
		   total_slabs, total_slabs, 0UL);
	seq_putc(m, '\n');
	return 0;
}

static const struct seq_operations slabinfo_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

/* open() handler: attach the slabinfo iterator to the file. */
static int slabinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &slabinfo_op);
}

static const struct file_operations proc_slabinfo_operations = {
	.open		= slabinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

/* Create the world-readable /proc/slabinfo entry. */
static int __init slab_proc_init(void)
{
	proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
	return 0;
}
module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */