/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>

#include <trace/events/kmem.h>

/*
 * Lock order:
 *   1. slub_lock (Global Semaphore)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slub_lock
 *
 *   The role of the slub_lock is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct, meaning
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full lists on each node and
 *   the partial slab counter. While it is held no new slabs may be added
 *   to or removed from the lists, nor may the number of partial slabs be
 *   modified. (Note that the total number of slabs is an atomic value
 *   that may be modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. F.e.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs, called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty.
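 *
 * (Rough lifecycle sketch, only summarizing the paths implemented in
 * this file:
 *
 *	new_slab() -> frozen cpu slab, possibly parked on a per cpu
 *	partial list -> unfrozen onto a node partial list, or onto the
 *	full list when debugging is enabled -> discard_slab() once the
 *	slab is empty and the node already holds more than min_partial
 *	partial slabs.)
 *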
 * Teardown and setup is minimal so we rely on the page allocator's per
 * cpu caches for fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 * 			This means that the slab is dedicated to a purpose
 * 			such as satisfying allocations for a specific
 * 			processor. Objects may be freed in the slab while
 * 			it is frozen but slab_free will then skip the usual
 * 			list operations. It is up to the processor holding
 * 			the slab to integrate the slab into the slab lists
 * 			when the slab is no longer needed.
 *
 * 			One use of this flag is to mark slabs that are
 * 			used for allocations. Then such a slab becomes a cpu
 * 			slab. The cpu slab may be equipped with an additional
 * 			freelist that allows lockless access to
 * 			free objects in addition to the regular freelist
 * 			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 * 			options set. This moves slab handling out of
 * 			the fast path and disables lockless freelists.
 */

#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DEBUG_FREE)

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
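 * (For example, storing red zones or user tracking for a cache whose
 * objects already fill the usable page space can push the cache to a
 * higher page order; booting with slub_debug=O is intended to drop
 * such flags for those caches. The check itself sits in the cache
 * setup code further down this file.)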
157 */ 158 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 159 160 /* 161 * Set of flags that will prevent slab merging 162 */ 163 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 164 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 165 SLAB_FAILSLAB) 166 167 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 168 SLAB_CACHE_DMA | SLAB_NOTRACK) 169 170 #define OO_SHIFT 16 171 #define OO_MASK ((1 << OO_SHIFT) - 1) 172 #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 173 174 /* Internal SLUB flags */ 175 #define __OBJECT_POISON 0x80000000UL /* Poison object */ 176 #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 177 178 static int kmem_size = sizeof(struct kmem_cache); 179 180 #ifdef CONFIG_SMP 181 static struct notifier_block slab_notifier; 182 #endif 183 184 static enum { 185 DOWN, /* No slab functionality available */ 186 PARTIAL, /* Kmem_cache_node works */ 187 UP, /* Everything works but does not show up in sysfs */ 188 SYSFS /* Sysfs up */ 189 } slab_state = DOWN; 190 191 /* A list of all slab caches on the system */ 192 static DECLARE_RWSEM(slub_lock); 193 static LIST_HEAD(slab_caches); 194 195 /* 196 * Tracking user of a slab. 197 */ 198 #define TRACK_ADDRS_COUNT 16 199 struct track { 200 unsigned long addr; /* Called from address */ 201 #ifdef CONFIG_STACKTRACE 202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ 203 #endif 204 int cpu; /* Was running on cpu */ 205 int pid; /* Pid context */ 206 unsigned long when; /* When did the operation occur */ 207 }; 208 209 enum track_item { TRACK_ALLOC, TRACK_FREE }; 210 211 #ifdef CONFIG_SYSFS 212 static int sysfs_slab_add(struct kmem_cache *); 213 static int sysfs_slab_alias(struct kmem_cache *, const char *); 214 static void sysfs_slab_remove(struct kmem_cache *); 215 216 #else 217 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 218 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 219 { return 0; } 220 static inline void sysfs_slab_remove(struct kmem_cache *s) 221 { 222 kfree(s->name); 223 kfree(s); 224 } 225 226 #endif 227 228 static inline void stat(const struct kmem_cache *s, enum stat_item si) 229 { 230 #ifdef CONFIG_SLUB_STATS 231 __this_cpu_inc(s->cpu_slab->stat[si]); 232 #endif 233 } 234 235 /******************************************************************** 236 * Core slab cache functions 237 *******************************************************************/ 238 239 int slab_is_available(void) 240 { 241 return slab_state >= UP; 242 } 243 244 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 245 { 246 return s->node[node]; 247 } 248 249 /* Verify that a pointer has an address that is valid within a slab page */ 250 static inline int check_valid_pointer(struct kmem_cache *s, 251 struct page *page, const void *object) 252 { 253 void *base; 254 255 if (!object) 256 return 1; 257 258 base = page_address(page); 259 if (object < base || object >= base + page->objects * s->size || 260 (object - base) % s->size) { 261 return 0; 262 } 263 264 return 1; 265 } 266 267 static inline void *get_freepointer(struct kmem_cache *s, void *object) 268 { 269 return *(void **)(object + s->offset); 270 } 271 272 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 273 { 274 void *p; 275 276 #ifdef CONFIG_DEBUG_PAGEALLOC 277 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 278 #else 279 p = get_freepointer(s, object); 
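	/*
	 * A plain load suffices in this branch: without
	 * CONFIG_DEBUG_PAGEALLOC the slab page stays mapped, so a
	 * speculative read of a possibly stale free pointer from the
	 * lockless fastpath cannot fault. That is the usual reason for
	 * the probe_kernel_read() variant above.
	 */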
280 #endif 281 return p; 282 } 283 284 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 285 { 286 *(void **)(object + s->offset) = fp; 287 } 288 289 /* Loop over all objects in a slab */ 290 #define for_each_object(__p, __s, __addr, __objects) \ 291 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 292 __p += (__s)->size) 293 294 /* Determine object index from a given position */ 295 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 296 { 297 return (p - addr) / s->size; 298 } 299 300 static inline size_t slab_ksize(const struct kmem_cache *s) 301 { 302 #ifdef CONFIG_SLUB_DEBUG 303 /* 304 * Debugging requires use of the padding between object 305 * and whatever may come after it. 306 */ 307 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 308 return s->objsize; 309 310 #endif 311 /* 312 * If we have the need to store the freelist pointer 313 * back there or track user information then we can 314 * only use the space before that information. 315 */ 316 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 317 return s->inuse; 318 /* 319 * Else we can use all the padding etc for the allocation 320 */ 321 return s->size; 322 } 323 324 static inline int order_objects(int order, unsigned long size, int reserved) 325 { 326 return ((PAGE_SIZE << order) - reserved) / size; 327 } 328 329 static inline struct kmem_cache_order_objects oo_make(int order, 330 unsigned long size, int reserved) 331 { 332 struct kmem_cache_order_objects x = { 333 (order << OO_SHIFT) + order_objects(order, size, reserved) 334 }; 335 336 return x; 337 } 338 339 static inline int oo_order(struct kmem_cache_order_objects x) 340 { 341 return x.x >> OO_SHIFT; 342 } 343 344 static inline int oo_objects(struct kmem_cache_order_objects x) 345 { 346 return x.x & OO_MASK; 347 } 348 349 /* 350 * Per slab locking using the pagelock 351 */ 352 static __always_inline void slab_lock(struct page *page) 353 { 354 bit_spin_lock(PG_locked, &page->flags); 355 } 356 357 static __always_inline void slab_unlock(struct page *page) 358 { 359 __bit_spin_unlock(PG_locked, &page->flags); 360 } 361 362 /* Interrupts must be disabled (for the fallback code to work right) */ 363 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 364 void *freelist_old, unsigned long counters_old, 365 void *freelist_new, unsigned long counters_new, 366 const char *n) 367 { 368 VM_BUG_ON(!irqs_disabled()); 369 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 370 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 371 if (s->flags & __CMPXCHG_DOUBLE) { 372 if (cmpxchg_double(&page->freelist, &page->counters, 373 freelist_old, counters_old, 374 freelist_new, counters_new)) 375 return 1; 376 } else 377 #endif 378 { 379 slab_lock(page); 380 if (page->freelist == freelist_old && page->counters == counters_old) { 381 page->freelist = freelist_new; 382 page->counters = counters_new; 383 slab_unlock(page); 384 return 1; 385 } 386 slab_unlock(page); 387 } 388 389 cpu_relax(); 390 stat(s, CMPXCHG_DOUBLE_FAIL); 391 392 #ifdef SLUB_DEBUG_CMPXCHG 393 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 394 #endif 395 396 return 0; 397 } 398 399 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 400 void *freelist_old, unsigned long counters_old, 401 void *freelist_new, unsigned long counters_new, 402 const char *n) 403 { 404 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 405 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 406 if (s->flags & __CMPXCHG_DOUBLE) { 407 if 
(cmpxchg_double(&page->freelist, &page->counters, 408 freelist_old, counters_old, 409 freelist_new, counters_new)) 410 return 1; 411 } else 412 #endif 413 { 414 unsigned long flags; 415 416 local_irq_save(flags); 417 slab_lock(page); 418 if (page->freelist == freelist_old && page->counters == counters_old) { 419 page->freelist = freelist_new; 420 page->counters = counters_new; 421 slab_unlock(page); 422 local_irq_restore(flags); 423 return 1; 424 } 425 slab_unlock(page); 426 local_irq_restore(flags); 427 } 428 429 cpu_relax(); 430 stat(s, CMPXCHG_DOUBLE_FAIL); 431 432 #ifdef SLUB_DEBUG_CMPXCHG 433 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 434 #endif 435 436 return 0; 437 } 438 439 #ifdef CONFIG_SLUB_DEBUG 440 /* 441 * Determine a map of object in use on a page. 442 * 443 * Node listlock must be held to guarantee that the page does 444 * not vanish from under us. 445 */ 446 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 447 { 448 void *p; 449 void *addr = page_address(page); 450 451 for (p = page->freelist; p; p = get_freepointer(s, p)) 452 set_bit(slab_index(p, s, addr), map); 453 } 454 455 /* 456 * Debug settings: 457 */ 458 #ifdef CONFIG_SLUB_DEBUG_ON 459 static int slub_debug = DEBUG_DEFAULT_FLAGS; 460 #else 461 static int slub_debug; 462 #endif 463 464 static char *slub_debug_slabs; 465 static int disable_higher_order_debug; 466 467 /* 468 * Object debugging 469 */ 470 static void print_section(char *text, u8 *addr, unsigned int length) 471 { 472 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 473 length, 1); 474 } 475 476 static struct track *get_track(struct kmem_cache *s, void *object, 477 enum track_item alloc) 478 { 479 struct track *p; 480 481 if (s->offset) 482 p = object + s->offset + sizeof(void *); 483 else 484 p = object + s->inuse; 485 486 return p + alloc; 487 } 488 489 static void set_track(struct kmem_cache *s, void *object, 490 enum track_item alloc, unsigned long addr) 491 { 492 struct track *p = get_track(s, object, alloc); 493 494 if (addr) { 495 #ifdef CONFIG_STACKTRACE 496 struct stack_trace trace; 497 int i; 498 499 trace.nr_entries = 0; 500 trace.max_entries = TRACK_ADDRS_COUNT; 501 trace.entries = p->addrs; 502 trace.skip = 3; 503 save_stack_trace(&trace); 504 505 /* See rant in lockdep.c */ 506 if (trace.nr_entries != 0 && 507 trace.entries[trace.nr_entries - 1] == ULONG_MAX) 508 trace.nr_entries--; 509 510 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) 511 p->addrs[i] = 0; 512 #endif 513 p->addr = addr; 514 p->cpu = smp_processor_id(); 515 p->pid = current->pid; 516 p->when = jiffies; 517 } else 518 memset(p, 0, sizeof(struct track)); 519 } 520 521 static void init_tracking(struct kmem_cache *s, void *object) 522 { 523 if (!(s->flags & SLAB_STORE_USER)) 524 return; 525 526 set_track(s, object, TRACK_FREE, 0UL); 527 set_track(s, object, TRACK_ALLOC, 0UL); 528 } 529 530 static void print_track(const char *s, struct track *t) 531 { 532 if (!t->addr) 533 return; 534 535 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 536 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 537 #ifdef CONFIG_STACKTRACE 538 { 539 int i; 540 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 541 if (t->addrs[i]) 542 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 543 else 544 break; 545 } 546 #endif 547 } 548 549 static void print_tracking(struct kmem_cache *s, void *object) 550 { 551 if (!(s->flags & SLAB_STORE_USER)) 552 return; 553 554 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 555 
print_track("Freed", get_track(s, object, TRACK_FREE)); 556 } 557 558 static void print_page_info(struct page *page) 559 { 560 printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", 561 page, page->objects, page->inuse, page->freelist, page->flags); 562 563 } 564 565 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 566 { 567 va_list args; 568 char buf[100]; 569 570 va_start(args, fmt); 571 vsnprintf(buf, sizeof(buf), fmt, args); 572 va_end(args); 573 printk(KERN_ERR "========================================" 574 "=====================================\n"); 575 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 576 printk(KERN_ERR "----------------------------------------" 577 "-------------------------------------\n\n"); 578 } 579 580 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 581 { 582 va_list args; 583 char buf[100]; 584 585 va_start(args, fmt); 586 vsnprintf(buf, sizeof(buf), fmt, args); 587 va_end(args); 588 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 589 } 590 591 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 592 { 593 unsigned int off; /* Offset of last byte */ 594 u8 *addr = page_address(page); 595 596 print_tracking(s, p); 597 598 print_page_info(page); 599 600 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 601 p, p - addr, get_freepointer(s, p)); 602 603 if (p > addr + 16) 604 print_section("Bytes b4 ", p - 16, 16); 605 606 print_section("Object ", p, min_t(unsigned long, s->objsize, 607 PAGE_SIZE)); 608 if (s->flags & SLAB_RED_ZONE) 609 print_section("Redzone ", p + s->objsize, 610 s->inuse - s->objsize); 611 612 if (s->offset) 613 off = s->offset + sizeof(void *); 614 else 615 off = s->inuse; 616 617 if (s->flags & SLAB_STORE_USER) 618 off += 2 * sizeof(struct track); 619 620 if (off != s->size) 621 /* Beginning of the filler is the free pointer */ 622 print_section("Padding ", p + off, s->size - off); 623 624 dump_stack(); 625 } 626 627 static void object_err(struct kmem_cache *s, struct page *page, 628 u8 *object, char *reason) 629 { 630 slab_bug(s, "%s", reason); 631 print_trailer(s, page, object); 632 } 633 634 static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 635 { 636 va_list args; 637 char buf[100]; 638 639 va_start(args, fmt); 640 vsnprintf(buf, sizeof(buf), fmt, args); 641 va_end(args); 642 slab_bug(s, "%s", buf); 643 print_page_info(page); 644 dump_stack(); 645 } 646 647 static void init_object(struct kmem_cache *s, void *object, u8 val) 648 { 649 u8 *p = object; 650 651 if (s->flags & __OBJECT_POISON) { 652 memset(p, POISON_FREE, s->objsize - 1); 653 p[s->objsize - 1] = POISON_END; 654 } 655 656 if (s->flags & SLAB_RED_ZONE) 657 memset(p + s->objsize, val, s->inuse - s->objsize); 658 } 659 660 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 661 void *from, void *to) 662 { 663 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 664 memset(from, data, to - from); 665 } 666 667 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 668 u8 *object, char *what, 669 u8 *start, unsigned int value, unsigned int bytes) 670 { 671 u8 *fault; 672 u8 *end; 673 674 fault = memchr_inv(start, value, bytes); 675 if (!fault) 676 return 1; 677 678 end = start + bytes; 679 while (end > fault && end[-1] == value) 680 end--; 681 682 slab_bug(s, "%s overwritten", what); 683 printk(KERN_ERR "INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", 684 fault, end - 1, fault[0], value); 685 print_trailer(s, page, object); 686 687 restore_bytes(s, what, value, fault, end); 688 return 0; 689 } 690 691 /* 692 * Object layout: 693 * 694 * object address 695 * Bytes of the object to be managed. 696 * If the freepointer may overlay the object then the free 697 * pointer is the first word of the object. 698 * 699 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 700 * 0xa5 (POISON_END) 701 * 702 * object + s->objsize 703 * Padding to reach word boundary. This is also used for Redzoning. 704 * Padding is extended by another word if Redzoning is enabled and 705 * objsize == inuse. 706 * 707 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 708 * 0xcc (RED_ACTIVE) for objects in use. 709 * 710 * object + s->inuse 711 * Meta data starts here. 712 * 713 * A. Free pointer (if we cannot overwrite object on free) 714 * B. Tracking data for SLAB_STORE_USER 715 * C. Padding to reach required alignment boundary or at mininum 716 * one word if debugging is on to be able to detect writes 717 * before the word boundary. 718 * 719 * Padding is done using 0x5a (POISON_INUSE) 720 * 721 * object + s->size 722 * Nothing is used beyond s->size. 723 * 724 * If slabcaches are merged then the objsize and inuse boundaries are mostly 725 * ignored. And therefore no slab options that rely on these boundaries 726 * may be used with merged slabcaches. 727 */ 728 729 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 730 { 731 unsigned long off = s->inuse; /* The end of info */ 732 733 if (s->offset) 734 /* Freepointer is placed after the object. */ 735 off += sizeof(void *); 736 737 if (s->flags & SLAB_STORE_USER) 738 /* We also have user information there */ 739 off += 2 * sizeof(struct track); 740 741 if (s->size == off) 742 return 1; 743 744 return check_bytes_and_report(s, page, p, "Object padding", 745 p + off, POISON_INUSE, s->size - off); 746 } 747 748 /* Check the pad bytes at the end of a slab page */ 749 static int slab_pad_check(struct kmem_cache *s, struct page *page) 750 { 751 u8 *start; 752 u8 *fault; 753 u8 *end; 754 int length; 755 int remainder; 756 757 if (!(s->flags & SLAB_POISON)) 758 return 1; 759 760 start = page_address(page); 761 length = (PAGE_SIZE << compound_order(page)) - s->reserved; 762 end = start + length; 763 remainder = length % s->size; 764 if (!remainder) 765 return 1; 766 767 fault = memchr_inv(end - remainder, POISON_INUSE, remainder); 768 if (!fault) 769 return 1; 770 while (end > fault && end[-1] == POISON_INUSE) 771 end--; 772 773 slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); 774 print_section("Padding ", end - remainder, remainder); 775 776 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 777 return 0; 778 } 779 780 static int check_object(struct kmem_cache *s, struct page *page, 781 void *object, u8 val) 782 { 783 u8 *p = object; 784 u8 *endobject = object + s->objsize; 785 786 if (s->flags & SLAB_RED_ZONE) { 787 if (!check_bytes_and_report(s, page, object, "Redzone", 788 endobject, val, s->inuse - s->objsize)) 789 return 0; 790 } else { 791 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 792 check_bytes_and_report(s, page, p, "Alignment padding", 793 endobject, POISON_INUSE, s->inuse - s->objsize); 794 } 795 } 796 797 if (s->flags & SLAB_POISON) { 798 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 799 (!check_bytes_and_report(s, page, p, "Poison", p, 800 POISON_FREE, s->objsize - 1) || 801 !check_bytes_and_report(s, page, p, "Poison", 802 p + s->objsize - 1, POISON_END, 1))) 803 return 0; 804 /* 805 * check_pad_bytes cleans up on its own. 806 */ 807 check_pad_bytes(s, page, p); 808 } 809 810 if (!s->offset && val == SLUB_RED_ACTIVE) 811 /* 812 * Object and freepointer overlap. Cannot check 813 * freepointer while object is allocated. 814 */ 815 return 1; 816 817 /* Check free pointer validity */ 818 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 819 object_err(s, page, p, "Freepointer corrupt"); 820 /* 821 * No choice but to zap it and thus lose the remainder 822 * of the free objects in this slab. May cause 823 * another error because the object count is now wrong. 824 */ 825 set_freepointer(s, p, NULL); 826 return 0; 827 } 828 return 1; 829 } 830 831 static int check_slab(struct kmem_cache *s, struct page *page) 832 { 833 int maxobj; 834 835 VM_BUG_ON(!irqs_disabled()); 836 837 if (!PageSlab(page)) { 838 slab_err(s, page, "Not a valid slab page"); 839 return 0; 840 } 841 842 maxobj = order_objects(compound_order(page), s->size, s->reserved); 843 if (page->objects > maxobj) { 844 slab_err(s, page, "objects %u > max %u", 845 s->name, page->objects, maxobj); 846 return 0; 847 } 848 if (page->inuse > page->objects) { 849 slab_err(s, page, "inuse %u > max %u", 850 s->name, page->inuse, page->objects); 851 return 0; 852 } 853 /* Slab_pad_check fixes things up after itself */ 854 slab_pad_check(s, page); 855 return 1; 856 } 857 858 /* 859 * Determine if a certain object on a page is on the freelist. Must hold the 860 * slab lock to guarantee that the chains are in a consistent state. 861 */ 862 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 863 { 864 int nr = 0; 865 void *fp; 866 void *object = NULL; 867 unsigned long max_objects; 868 869 fp = page->freelist; 870 while (fp && nr <= page->objects) { 871 if (fp == search) 872 return 1; 873 if (!check_valid_pointer(s, page, fp)) { 874 if (object) { 875 object_err(s, page, object, 876 "Freechain corrupt"); 877 set_freepointer(s, object, NULL); 878 break; 879 } else { 880 slab_err(s, page, "Freepointer corrupt"); 881 page->freelist = NULL; 882 page->inuse = page->objects; 883 slab_fix(s, "Freelist cleared"); 884 return 0; 885 } 886 break; 887 } 888 object = fp; 889 fp = get_freepointer(s, object); 890 nr++; 891 } 892 893 max_objects = order_objects(compound_order(page), s->size, s->reserved); 894 if (max_objects > MAX_OBJS_PER_PAGE) 895 max_objects = MAX_OBJS_PER_PAGE; 896 897 if (page->objects != max_objects) { 898 slab_err(s, page, "Wrong number of objects. 
Found %d but " 899 "should be %d", page->objects, max_objects); 900 page->objects = max_objects; 901 slab_fix(s, "Number of objects adjusted."); 902 } 903 if (page->inuse != page->objects - nr) { 904 slab_err(s, page, "Wrong object count. Counter is %d but " 905 "counted were %d", page->inuse, page->objects - nr); 906 page->inuse = page->objects - nr; 907 slab_fix(s, "Object count adjusted."); 908 } 909 return search == NULL; 910 } 911 912 static void trace(struct kmem_cache *s, struct page *page, void *object, 913 int alloc) 914 { 915 if (s->flags & SLAB_TRACE) { 916 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 917 s->name, 918 alloc ? "alloc" : "free", 919 object, page->inuse, 920 page->freelist); 921 922 if (!alloc) 923 print_section("Object ", (void *)object, s->objsize); 924 925 dump_stack(); 926 } 927 } 928 929 /* 930 * Hooks for other subsystems that check memory allocations. In a typical 931 * production configuration these hooks all should produce no code at all. 932 */ 933 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 934 { 935 flags &= gfp_allowed_mask; 936 lockdep_trace_alloc(flags); 937 might_sleep_if(flags & __GFP_WAIT); 938 939 return should_failslab(s->objsize, flags, s->flags); 940 } 941 942 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 943 { 944 flags &= gfp_allowed_mask; 945 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 946 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 947 } 948 949 static inline void slab_free_hook(struct kmem_cache *s, void *x) 950 { 951 kmemleak_free_recursive(x, s->flags); 952 953 /* 954 * Trouble is that we may no longer disable interupts in the fast path 955 * So in order to make the debug calls that expect irqs to be 956 * disabled we need to disable interrupts temporarily. 957 */ 958 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 959 { 960 unsigned long flags; 961 962 local_irq_save(flags); 963 kmemcheck_slab_free(s, x, s->objsize); 964 debug_check_no_locks_freed(x, s->objsize); 965 local_irq_restore(flags); 966 } 967 #endif 968 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 969 debug_check_no_obj_freed(x, s->objsize); 970 } 971 972 /* 973 * Tracking of fully allocated slabs for debugging purposes. 974 * 975 * list_lock must be held. 976 */ 977 static void add_full(struct kmem_cache *s, 978 struct kmem_cache_node *n, struct page *page) 979 { 980 if (!(s->flags & SLAB_STORE_USER)) 981 return; 982 983 list_add(&page->lru, &n->full); 984 } 985 986 /* 987 * list_lock must be held. 988 */ 989 static void remove_full(struct kmem_cache *s, struct page *page) 990 { 991 if (!(s->flags & SLAB_STORE_USER)) 992 return; 993 994 list_del(&page->lru); 995 } 996 997 /* Tracking of the number of slabs for debugging purposes */ 998 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 999 { 1000 struct kmem_cache_node *n = get_node(s, node); 1001 1002 return atomic_long_read(&n->nr_slabs); 1003 } 1004 1005 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1006 { 1007 return atomic_long_read(&n->nr_slabs); 1008 } 1009 1010 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 1011 { 1012 struct kmem_cache_node *n = get_node(s, node); 1013 1014 /* 1015 * May be called early in order to allocate a slab for the 1016 * kmem_cache_node structure. Solve the chicken-egg 1017 * dilemma by deferring the increment of the count during 1018 * bootstrap (see early_kmem_cache_node_alloc). 
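	 *
	 * (Concretely, n is NULL only while the kmem_cache_node cache
	 * itself is being created; early_kmem_cache_node_alloc() is
	 * expected to account for that first slab explicitly once the
	 * node structure exists, so the deferred increment is not lost.)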
1019 */ 1020 if (n) { 1021 atomic_long_inc(&n->nr_slabs); 1022 atomic_long_add(objects, &n->total_objects); 1023 } 1024 } 1025 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1026 { 1027 struct kmem_cache_node *n = get_node(s, node); 1028 1029 atomic_long_dec(&n->nr_slabs); 1030 atomic_long_sub(objects, &n->total_objects); 1031 } 1032 1033 /* Object debug checks for alloc/free paths */ 1034 static void setup_object_debug(struct kmem_cache *s, struct page *page, 1035 void *object) 1036 { 1037 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 1038 return; 1039 1040 init_object(s, object, SLUB_RED_INACTIVE); 1041 init_tracking(s, object); 1042 } 1043 1044 static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, 1045 void *object, unsigned long addr) 1046 { 1047 if (!check_slab(s, page)) 1048 goto bad; 1049 1050 if (!check_valid_pointer(s, page, object)) { 1051 object_err(s, page, object, "Freelist Pointer check fails"); 1052 goto bad; 1053 } 1054 1055 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1056 goto bad; 1057 1058 /* Success perform special debug activities for allocs */ 1059 if (s->flags & SLAB_STORE_USER) 1060 set_track(s, object, TRACK_ALLOC, addr); 1061 trace(s, page, object, 1); 1062 init_object(s, object, SLUB_RED_ACTIVE); 1063 return 1; 1064 1065 bad: 1066 if (PageSlab(page)) { 1067 /* 1068 * If this is a slab page then lets do the best we can 1069 * to avoid issues in the future. Marking all objects 1070 * as used avoids touching the remaining objects. 1071 */ 1072 slab_fix(s, "Marking all objects used"); 1073 page->inuse = page->objects; 1074 page->freelist = NULL; 1075 } 1076 return 0; 1077 } 1078 1079 static noinline int free_debug_processing(struct kmem_cache *s, 1080 struct page *page, void *object, unsigned long addr) 1081 { 1082 unsigned long flags; 1083 int rc = 0; 1084 1085 local_irq_save(flags); 1086 slab_lock(page); 1087 1088 if (!check_slab(s, page)) 1089 goto fail; 1090 1091 if (!check_valid_pointer(s, page, object)) { 1092 slab_err(s, page, "Invalid object pointer 0x%p", object); 1093 goto fail; 1094 } 1095 1096 if (on_freelist(s, page, object)) { 1097 object_err(s, page, object, "Object already free"); 1098 goto fail; 1099 } 1100 1101 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1102 goto out; 1103 1104 if (unlikely(s != page->slab)) { 1105 if (!PageSlab(page)) { 1106 slab_err(s, page, "Attempt to free object(0x%p) " 1107 "outside of slab", object); 1108 } else if (!page->slab) { 1109 printk(KERN_ERR 1110 "SLUB <none>: no slab for object 0x%p.\n", 1111 object); 1112 dump_stack(); 1113 } else 1114 object_err(s, page, object, 1115 "page slab pointer corrupt."); 1116 goto fail; 1117 } 1118 1119 if (s->flags & SLAB_STORE_USER) 1120 set_track(s, object, TRACK_FREE, addr); 1121 trace(s, page, object, 0); 1122 init_object(s, object, SLUB_RED_INACTIVE); 1123 rc = 1; 1124 out: 1125 slab_unlock(page); 1126 local_irq_restore(flags); 1127 return rc; 1128 1129 fail: 1130 slab_fix(s, "Object at 0x%p not freed", object); 1131 goto out; 1132 } 1133 1134 static int __init setup_slub_debug(char *str) 1135 { 1136 slub_debug = DEBUG_DEFAULT_FLAGS; 1137 if (*str++ != '=' || !*str) 1138 /* 1139 * No options specified. Switch on full debugging. 1140 */ 1141 goto out; 1142 1143 if (*str == ',') 1144 /* 1145 * No options but restriction on slabs. This means full 1146 * debugging for slabs matching a pattern. 
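		 * (E.g. booting with "slub_debug=,dentry" enables full
		 * debugging only for caches whose name starts with
		 * "dentry"; the match in kmem_cache_flags() below is a
		 * prefix strncmp().)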
1147 */ 1148 goto check_slabs; 1149 1150 if (tolower(*str) == 'o') { 1151 /* 1152 * Avoid enabling debugging on caches if its minimum order 1153 * would increase as a result. 1154 */ 1155 disable_higher_order_debug = 1; 1156 goto out; 1157 } 1158 1159 slub_debug = 0; 1160 if (*str == '-') 1161 /* 1162 * Switch off all debugging measures. 1163 */ 1164 goto out; 1165 1166 /* 1167 * Determine which debug features should be switched on 1168 */ 1169 for (; *str && *str != ','; str++) { 1170 switch (tolower(*str)) { 1171 case 'f': 1172 slub_debug |= SLAB_DEBUG_FREE; 1173 break; 1174 case 'z': 1175 slub_debug |= SLAB_RED_ZONE; 1176 break; 1177 case 'p': 1178 slub_debug |= SLAB_POISON; 1179 break; 1180 case 'u': 1181 slub_debug |= SLAB_STORE_USER; 1182 break; 1183 case 't': 1184 slub_debug |= SLAB_TRACE; 1185 break; 1186 case 'a': 1187 slub_debug |= SLAB_FAILSLAB; 1188 break; 1189 default: 1190 printk(KERN_ERR "slub_debug option '%c' " 1191 "unknown. skipped\n", *str); 1192 } 1193 } 1194 1195 check_slabs: 1196 if (*str == ',') 1197 slub_debug_slabs = str + 1; 1198 out: 1199 return 1; 1200 } 1201 1202 __setup("slub_debug", setup_slub_debug); 1203 1204 static unsigned long kmem_cache_flags(unsigned long objsize, 1205 unsigned long flags, const char *name, 1206 void (*ctor)(void *)) 1207 { 1208 /* 1209 * Enable debugging if selected on the kernel commandline. 1210 */ 1211 if (slub_debug && (!slub_debug_slabs || 1212 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) 1213 flags |= slub_debug; 1214 1215 return flags; 1216 } 1217 #else 1218 static inline void setup_object_debug(struct kmem_cache *s, 1219 struct page *page, void *object) {} 1220 1221 static inline int alloc_debug_processing(struct kmem_cache *s, 1222 struct page *page, void *object, unsigned long addr) { return 0; } 1223 1224 static inline int free_debug_processing(struct kmem_cache *s, 1225 struct page *page, void *object, unsigned long addr) { return 0; } 1226 1227 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1228 { return 1; } 1229 static inline int check_object(struct kmem_cache *s, struct page *page, 1230 void *object, u8 val) { return 1; } 1231 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1232 struct page *page) {} 1233 static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1234 static inline unsigned long kmem_cache_flags(unsigned long objsize, 1235 unsigned long flags, const char *name, 1236 void (*ctor)(void *)) 1237 { 1238 return flags; 1239 } 1240 #define slub_debug 0 1241 1242 #define disable_higher_order_debug 0 1243 1244 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1245 { return 0; } 1246 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1247 { return 0; } 1248 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1249 int objects) {} 1250 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1251 int objects) {} 1252 1253 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1254 { return 0; } 1255 1256 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1257 void *object) {} 1258 1259 static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1260 1261 #endif /* CONFIG_SLUB_DEBUG */ 1262 1263 /* 1264 * Slab allocation and freeing 1265 */ 1266 static inline struct page *alloc_slab_page(gfp_t flags, int node, 1267 struct kmem_cache_order_objects oo) 1268 { 1269 int order = oo_order(oo); 1270 1271 flags |= 
__GFP_NOTRACK; 1272 1273 if (node == NUMA_NO_NODE) 1274 return alloc_pages(flags, order); 1275 else 1276 return alloc_pages_exact_node(node, flags, order); 1277 } 1278 1279 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1280 { 1281 struct page *page; 1282 struct kmem_cache_order_objects oo = s->oo; 1283 gfp_t alloc_gfp; 1284 1285 flags &= gfp_allowed_mask; 1286 1287 if (flags & __GFP_WAIT) 1288 local_irq_enable(); 1289 1290 flags |= s->allocflags; 1291 1292 /* 1293 * Let the initial higher-order allocation fail under memory pressure 1294 * so we fall-back to the minimum order allocation. 1295 */ 1296 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1297 1298 page = alloc_slab_page(alloc_gfp, node, oo); 1299 if (unlikely(!page)) { 1300 oo = s->min; 1301 /* 1302 * Allocation may have failed due to fragmentation. 1303 * Try a lower order alloc if possible 1304 */ 1305 page = alloc_slab_page(flags, node, oo); 1306 1307 if (page) 1308 stat(s, ORDER_FALLBACK); 1309 } 1310 1311 if (flags & __GFP_WAIT) 1312 local_irq_disable(); 1313 1314 if (!page) 1315 return NULL; 1316 1317 if (kmemcheck_enabled 1318 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1319 int pages = 1 << oo_order(oo); 1320 1321 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1322 1323 /* 1324 * Objects from caches that have a constructor don't get 1325 * cleared when they're allocated, so we need to do it here. 1326 */ 1327 if (s->ctor) 1328 kmemcheck_mark_uninitialized_pages(page, pages); 1329 else 1330 kmemcheck_mark_unallocated_pages(page, pages); 1331 } 1332 1333 page->objects = oo_objects(oo); 1334 mod_zone_page_state(page_zone(page), 1335 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1336 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1337 1 << oo_order(oo)); 1338 1339 return page; 1340 } 1341 1342 static void setup_object(struct kmem_cache *s, struct page *page, 1343 void *object) 1344 { 1345 setup_object_debug(s, page, object); 1346 if (unlikely(s->ctor)) 1347 s->ctor(object); 1348 } 1349 1350 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1351 { 1352 struct page *page; 1353 void *start; 1354 void *last; 1355 void *p; 1356 1357 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1358 1359 page = allocate_slab(s, 1360 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1361 if (!page) 1362 goto out; 1363 1364 inc_slabs_node(s, page_to_nid(page), page->objects); 1365 page->slab = s; 1366 page->flags |= 1 << PG_slab; 1367 1368 start = page_address(page); 1369 1370 if (unlikely(s->flags & SLAB_POISON)) 1371 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1372 1373 last = start; 1374 for_each_object(p, s, start, page->objects) { 1375 setup_object(s, page, last); 1376 set_freepointer(s, last, p); 1377 last = p; 1378 } 1379 setup_object(s, page, last); 1380 set_freepointer(s, last, NULL); 1381 1382 page->freelist = start; 1383 page->inuse = page->objects; 1384 page->frozen = 1; 1385 out: 1386 return page; 1387 } 1388 1389 static void __free_slab(struct kmem_cache *s, struct page *page) 1390 { 1391 int order = compound_order(page); 1392 int pages = 1 << order; 1393 1394 if (kmem_cache_debug(s)) { 1395 void *p; 1396 1397 slab_pad_check(s, page); 1398 for_each_object(p, s, page_address(page), 1399 page->objects) 1400 check_object(s, page, p, SLUB_RED_INACTIVE); 1401 } 1402 1403 kmemcheck_free_shadow(page, compound_order(page)); 1404 1405 mod_zone_page_state(page_zone(page), 1406 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1407 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1408 -pages); 1409 1410 __ClearPageSlab(page); 1411 reset_page_mapcount(page); 1412 if (current->reclaim_state) 1413 current->reclaim_state->reclaimed_slab += pages; 1414 __free_pages(page, order); 1415 } 1416 1417 #define need_reserve_slab_rcu \ 1418 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) 1419 1420 static void rcu_free_slab(struct rcu_head *h) 1421 { 1422 struct page *page; 1423 1424 if (need_reserve_slab_rcu) 1425 page = virt_to_head_page(h); 1426 else 1427 page = container_of((struct list_head *)h, struct page, lru); 1428 1429 __free_slab(page->slab, page); 1430 } 1431 1432 static void free_slab(struct kmem_cache *s, struct page *page) 1433 { 1434 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1435 struct rcu_head *head; 1436 1437 if (need_reserve_slab_rcu) { 1438 int order = compound_order(page); 1439 int offset = (PAGE_SIZE << order) - s->reserved; 1440 1441 VM_BUG_ON(s->reserved != sizeof(*head)); 1442 head = page_address(page) + offset; 1443 } else { 1444 /* 1445 * RCU free overloads the RCU head over the LRU 1446 */ 1447 head = (void *)&page->lru; 1448 } 1449 1450 call_rcu(head, rcu_free_slab); 1451 } else 1452 __free_slab(s, page); 1453 } 1454 1455 static void discard_slab(struct kmem_cache *s, struct page *page) 1456 { 1457 dec_slabs_node(s, page_to_nid(page), page->objects); 1458 free_slab(s, page); 1459 } 1460 1461 /* 1462 * Management of partially allocated slabs. 1463 * 1464 * list_lock must be held. 1465 */ 1466 static inline void add_partial(struct kmem_cache_node *n, 1467 struct page *page, int tail) 1468 { 1469 n->nr_partial++; 1470 if (tail == DEACTIVATE_TO_TAIL) 1471 list_add_tail(&page->lru, &n->partial); 1472 else 1473 list_add(&page->lru, &n->partial); 1474 } 1475 1476 /* 1477 * list_lock must be held. 1478 */ 1479 static inline void remove_partial(struct kmem_cache_node *n, 1480 struct page *page) 1481 { 1482 list_del(&page->lru); 1483 n->nr_partial--; 1484 } 1485 1486 /* 1487 * Lock slab, remove from the partial list and put the object into the 1488 * per cpu freelist. 1489 * 1490 * Returns a list of objects or NULL if it fails. 1491 * 1492 * Must hold list_lock. 1493 */ 1494 static inline void *acquire_slab(struct kmem_cache *s, 1495 struct kmem_cache_node *n, struct page *page, 1496 int mode) 1497 { 1498 void *freelist; 1499 unsigned long counters; 1500 struct page new; 1501 1502 /* 1503 * Zap the freelist and set the frozen bit. 1504 * The old freelist is the list of objects for the 1505 * per cpu allocation list. 1506 */ 1507 do { 1508 freelist = page->freelist; 1509 counters = page->counters; 1510 new.counters = counters; 1511 if (mode) 1512 new.inuse = page->objects; 1513 1514 VM_BUG_ON(new.frozen); 1515 new.frozen = 1; 1516 1517 } while (!__cmpxchg_double_slab(s, page, 1518 freelist, counters, 1519 NULL, new.counters, 1520 "lock and freeze")); 1521 1522 remove_partial(n, page); 1523 return freelist; 1524 } 1525 1526 static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1527 1528 /* 1529 * Try to allocate a partial slab from a specific node. 1530 */ 1531 static void *get_partial_node(struct kmem_cache *s, 1532 struct kmem_cache_node *n, struct kmem_cache_cpu *c) 1533 { 1534 struct page *page, *page2; 1535 void *object = NULL; 1536 1537 /* 1538 * Racy check. If we mistakenly see no partial slabs then we 1539 * just allocate an empty slab. If we mistakenly try to get a 1540 * partial slab and there is none available then get_partials() 1541 * will return NULL. 
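	 * (The unlocked peek at n->nr_partial below is only an
	 * optimization; the list walk under list_lock is what gives the
	 * authoritative answer.)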
1542 */ 1543 if (!n || !n->nr_partial) 1544 return NULL; 1545 1546 spin_lock(&n->list_lock); 1547 list_for_each_entry_safe(page, page2, &n->partial, lru) { 1548 void *t = acquire_slab(s, n, page, object == NULL); 1549 int available; 1550 1551 if (!t) 1552 break; 1553 1554 if (!object) { 1555 c->page = page; 1556 c->node = page_to_nid(page); 1557 stat(s, ALLOC_FROM_PARTIAL); 1558 object = t; 1559 available = page->objects - page->inuse; 1560 } else { 1561 page->freelist = t; 1562 available = put_cpu_partial(s, page, 0); 1563 } 1564 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1565 break; 1566 1567 } 1568 spin_unlock(&n->list_lock); 1569 return object; 1570 } 1571 1572 /* 1573 * Get a page from somewhere. Search in increasing NUMA distances. 1574 */ 1575 static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, 1576 struct kmem_cache_cpu *c) 1577 { 1578 #ifdef CONFIG_NUMA 1579 struct zonelist *zonelist; 1580 struct zoneref *z; 1581 struct zone *zone; 1582 enum zone_type high_zoneidx = gfp_zone(flags); 1583 void *object; 1584 1585 /* 1586 * The defrag ratio allows a configuration of the tradeoffs between 1587 * inter node defragmentation and node local allocations. A lower 1588 * defrag_ratio increases the tendency to do local allocations 1589 * instead of attempting to obtain partial slabs from other nodes. 1590 * 1591 * If the defrag_ratio is set to 0 then kmalloc() always 1592 * returns node local objects. If the ratio is higher then kmalloc() 1593 * may return off node objects because partial slabs are obtained 1594 * from other nodes and filled up. 1595 * 1596 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes 1597 * defrag_ratio = 1000) then every (well almost) allocation will 1598 * first attempt to defrag slab caches on other nodes. This means 1599 * scanning over all nodes to look for partial slabs which may be 1600 * expensive if we do it every time we are trying to find a slab 1601 * with available objects. 1602 */ 1603 if (!s->remote_node_defrag_ratio || 1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1605 return NULL; 1606 1607 get_mems_allowed(); 1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 struct kmem_cache_node *n; 1611 1612 n = get_node(s, zone_to_nid(zone)); 1613 1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1615 n->nr_partial > s->min_partial) { 1616 object = get_partial_node(s, n, c); 1617 if (object) { 1618 put_mems_allowed(); 1619 return object; 1620 } 1621 } 1622 } 1623 put_mems_allowed(); 1624 #endif 1625 return NULL; 1626 } 1627 1628 /* 1629 * Get a partial page, lock it and return it. 1630 */ 1631 static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, 1632 struct kmem_cache_cpu *c) 1633 { 1634 void *object; 1635 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1636 1637 object = get_partial_node(s, get_node(s, searchnode), c); 1638 if (object || node != NUMA_NO_NODE) 1639 return object; 1640 1641 return get_any_partial(s, flags, c); 1642 } 1643 1644 #ifdef CONFIG_PREEMPT 1645 /* 1646 * Calculate the next globally unique transaction for disambiguiation 1647 * during cmpxchg. The transactions start with the cpu number and are then 1648 * incremented by CONFIG_NR_CPUS. 1649 */ 1650 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 1651 #else 1652 /* 1653 * No preemption supported therefore also no need to check for 1654 * different cpus. 
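 *
 * (Worked example for the preemptible case above: with TID_STEP = 4,
 * cpu 2 issues the tids 2, 6, 10, ... so tid_to_cpu() = tid % TID_STEP
 * recovers the cpu and tid_to_event() = tid / TID_STEP the number of
 * operations. With TID_STEP = 1 the cpu component simply disappears.)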
1655 */ 1656 #define TID_STEP 1 1657 #endif 1658 1659 static inline unsigned long next_tid(unsigned long tid) 1660 { 1661 return tid + TID_STEP; 1662 } 1663 1664 static inline unsigned int tid_to_cpu(unsigned long tid) 1665 { 1666 return tid % TID_STEP; 1667 } 1668 1669 static inline unsigned long tid_to_event(unsigned long tid) 1670 { 1671 return tid / TID_STEP; 1672 } 1673 1674 static inline unsigned int init_tid(int cpu) 1675 { 1676 return cpu; 1677 } 1678 1679 static inline void note_cmpxchg_failure(const char *n, 1680 const struct kmem_cache *s, unsigned long tid) 1681 { 1682 #ifdef SLUB_DEBUG_CMPXCHG 1683 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 1684 1685 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); 1686 1687 #ifdef CONFIG_PREEMPT 1688 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 1689 printk("due to cpu change %d -> %d\n", 1690 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 1691 else 1692 #endif 1693 if (tid_to_event(tid) != tid_to_event(actual_tid)) 1694 printk("due to cpu running other code. Event %ld->%ld\n", 1695 tid_to_event(tid), tid_to_event(actual_tid)); 1696 else 1697 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1698 actual_tid, tid, next_tid(tid)); 1699 #endif 1700 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1701 } 1702 1703 void init_kmem_cache_cpus(struct kmem_cache *s) 1704 { 1705 int cpu; 1706 1707 for_each_possible_cpu(cpu) 1708 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1709 } 1710 1711 /* 1712 * Remove the cpu slab 1713 */ 1714 static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1715 { 1716 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1717 struct page *page = c->page; 1718 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1719 int lock = 0; 1720 enum slab_modes l = M_NONE, m = M_NONE; 1721 void *freelist; 1722 void *nextfree; 1723 int tail = DEACTIVATE_TO_HEAD; 1724 struct page new; 1725 struct page old; 1726 1727 if (page->freelist) { 1728 stat(s, DEACTIVATE_REMOTE_FREES); 1729 tail = DEACTIVATE_TO_TAIL; 1730 } 1731 1732 c->tid = next_tid(c->tid); 1733 c->page = NULL; 1734 freelist = c->freelist; 1735 c->freelist = NULL; 1736 1737 /* 1738 * Stage one: Free all available per cpu objects back 1739 * to the page freelist while it is still frozen. Leave the 1740 * last one. 1741 * 1742 * There is no need to take the list->lock because the page 1743 * is still frozen. 1744 */ 1745 while (freelist && (nextfree = get_freepointer(s, freelist))) { 1746 void *prior; 1747 unsigned long counters; 1748 1749 do { 1750 prior = page->freelist; 1751 counters = page->counters; 1752 set_freepointer(s, freelist, prior); 1753 new.counters = counters; 1754 new.inuse--; 1755 VM_BUG_ON(!new.frozen); 1756 1757 } while (!__cmpxchg_double_slab(s, page, 1758 prior, counters, 1759 freelist, new.counters, 1760 "drain percpu freelist")); 1761 1762 freelist = nextfree; 1763 } 1764 1765 /* 1766 * Stage two: Ensure that the page is unfrozen while the 1767 * list presence reflects the actual number of objects 1768 * during unfreeze. 1769 * 1770 * We setup the list membership and then perform a cmpxchg 1771 * with the count. If there is a mismatch then the page 1772 * is not unfrozen but the page is on the wrong list. 1773 * 1774 * Then we restart the process which may have to remove 1775 * the page from the list that we just put it on again 1776 * because the number of objects in the slab may have 1777 * changed. 
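	 *
	 * (In other words, the cmpxchg below publishes the merged
	 * freelist and clears the frozen bit in one atomic step; a
	 * concurrent remote free that changes page->counters merely
	 * forces another pass through the redo loop with a freshly
	 * computed target state.)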
1778 */ 1779 redo: 1780 1781 old.freelist = page->freelist; 1782 old.counters = page->counters; 1783 VM_BUG_ON(!old.frozen); 1784 1785 /* Determine target state of the slab */ 1786 new.counters = old.counters; 1787 if (freelist) { 1788 new.inuse--; 1789 set_freepointer(s, freelist, old.freelist); 1790 new.freelist = freelist; 1791 } else 1792 new.freelist = old.freelist; 1793 1794 new.frozen = 0; 1795 1796 if (!new.inuse && n->nr_partial > s->min_partial) 1797 m = M_FREE; 1798 else if (new.freelist) { 1799 m = M_PARTIAL; 1800 if (!lock) { 1801 lock = 1; 1802 /* 1803 * Taking the spinlock removes the possiblity 1804 * that acquire_slab() will see a slab page that 1805 * is frozen 1806 */ 1807 spin_lock(&n->list_lock); 1808 } 1809 } else { 1810 m = M_FULL; 1811 if (kmem_cache_debug(s) && !lock) { 1812 lock = 1; 1813 /* 1814 * This also ensures that the scanning of full 1815 * slabs from diagnostic functions will not see 1816 * any frozen slabs. 1817 */ 1818 spin_lock(&n->list_lock); 1819 } 1820 } 1821 1822 if (l != m) { 1823 1824 if (l == M_PARTIAL) 1825 1826 remove_partial(n, page); 1827 1828 else if (l == M_FULL) 1829 1830 remove_full(s, page); 1831 1832 if (m == M_PARTIAL) { 1833 1834 add_partial(n, page, tail); 1835 stat(s, tail); 1836 1837 } else if (m == M_FULL) { 1838 1839 stat(s, DEACTIVATE_FULL); 1840 add_full(s, n, page); 1841 1842 } 1843 } 1844 1845 l = m; 1846 if (!__cmpxchg_double_slab(s, page, 1847 old.freelist, old.counters, 1848 new.freelist, new.counters, 1849 "unfreezing slab")) 1850 goto redo; 1851 1852 if (lock) 1853 spin_unlock(&n->list_lock); 1854 1855 if (m == M_FREE) { 1856 stat(s, DEACTIVATE_EMPTY); 1857 discard_slab(s, page); 1858 stat(s, FREE_SLAB); 1859 } 1860 } 1861 1862 /* Unfreeze all the cpu partial slabs */ 1863 static void unfreeze_partials(struct kmem_cache *s) 1864 { 1865 struct kmem_cache_node *n = NULL; 1866 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1867 struct page *page, *discard_page = NULL; 1868 1869 while ((page = c->partial)) { 1870 enum slab_modes { M_PARTIAL, M_FREE }; 1871 enum slab_modes l, m; 1872 struct page new; 1873 struct page old; 1874 1875 c->partial = page->next; 1876 l = M_FREE; 1877 1878 do { 1879 1880 old.freelist = page->freelist; 1881 old.counters = page->counters; 1882 VM_BUG_ON(!old.frozen); 1883 1884 new.counters = old.counters; 1885 new.freelist = old.freelist; 1886 1887 new.frozen = 0; 1888 1889 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1890 m = M_FREE; 1891 else { 1892 struct kmem_cache_node *n2 = get_node(s, 1893 page_to_nid(page)); 1894 1895 m = M_PARTIAL; 1896 if (n != n2) { 1897 if (n) 1898 spin_unlock(&n->list_lock); 1899 1900 n = n2; 1901 spin_lock(&n->list_lock); 1902 } 1903 } 1904 1905 if (l != m) { 1906 if (l == M_PARTIAL) { 1907 remove_partial(n, page); 1908 stat(s, FREE_REMOVE_PARTIAL); 1909 } else { 1910 add_partial(n, page, 1911 DEACTIVATE_TO_TAIL); 1912 stat(s, FREE_ADD_PARTIAL); 1913 } 1914 1915 l = m; 1916 } 1917 1918 } while (!cmpxchg_double_slab(s, page, 1919 old.freelist, old.counters, 1920 new.freelist, new.counters, 1921 "unfreezing slab")); 1922 1923 if (m == M_FREE) { 1924 page->next = discard_page; 1925 discard_page = page; 1926 } 1927 } 1928 1929 if (n) 1930 spin_unlock(&n->list_lock); 1931 1932 while (discard_page) { 1933 page = discard_page; 1934 discard_page = discard_page->next; 1935 1936 stat(s, DEACTIVATE_EMPTY); 1937 discard_slab(s, page); 1938 stat(s, FREE_SLAB); 1939 } 1940 } 1941 1942 /* 1943 * Put a page that was just frozen (in __slab_free) into a partial page 
1944 * slot if available. This is done without interrupts disabled and without 1945 * preemption disabled. The cmpxchg is racy and may put the partial page 1946 * onto a random cpus partial slot. 1947 * 1948 * If we did not find a slot then simply move all the partials to the 1949 * per node partial list. 1950 */ 1951 int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1952 { 1953 struct page *oldpage; 1954 int pages; 1955 int pobjects; 1956 1957 do { 1958 pages = 0; 1959 pobjects = 0; 1960 oldpage = this_cpu_read(s->cpu_slab->partial); 1961 1962 if (oldpage) { 1963 pobjects = oldpage->pobjects; 1964 pages = oldpage->pages; 1965 if (drain && pobjects > s->cpu_partial) { 1966 unsigned long flags; 1967 /* 1968 * partial array is full. Move the existing 1969 * set to the per node partial list. 1970 */ 1971 local_irq_save(flags); 1972 unfreeze_partials(s); 1973 local_irq_restore(flags); 1974 pobjects = 0; 1975 pages = 0; 1976 } 1977 } 1978 1979 pages++; 1980 pobjects += page->objects - page->inuse; 1981 1982 page->pages = pages; 1983 page->pobjects = pobjects; 1984 page->next = oldpage; 1985 1986 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 1987 stat(s, CPU_PARTIAL_FREE); 1988 return pobjects; 1989 } 1990 1991 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1992 { 1993 stat(s, CPUSLAB_FLUSH); 1994 deactivate_slab(s, c); 1995 } 1996 1997 /* 1998 * Flush cpu slab. 1999 * 2000 * Called from IPI handler with interrupts disabled. 2001 */ 2002 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 2003 { 2004 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2005 2006 if (likely(c)) { 2007 if (c->page) 2008 flush_slab(s, c); 2009 2010 unfreeze_partials(s); 2011 } 2012 } 2013 2014 static void flush_cpu_slab(void *d) 2015 { 2016 struct kmem_cache *s = d; 2017 2018 __flush_cpu_slab(s, smp_processor_id()); 2019 } 2020 2021 static void flush_all(struct kmem_cache *s) 2022 { 2023 on_each_cpu(flush_cpu_slab, s, 1); 2024 } 2025 2026 /* 2027 * Check if the objects in a per cpu structure fit numa 2028 * locality expectations. 
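 * (node == NUMA_NO_NODE means the caller has no preference, so any cpu
 * slab matches; on !CONFIG_NUMA builds the check always succeeds.)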
2029 */ 2030 static inline int node_match(struct kmem_cache_cpu *c, int node) 2031 { 2032 #ifdef CONFIG_NUMA 2033 if (node != NUMA_NO_NODE && c->node != node) 2034 return 0; 2035 #endif 2036 return 1; 2037 } 2038 2039 static int count_free(struct page *page) 2040 { 2041 return page->objects - page->inuse; 2042 } 2043 2044 static unsigned long count_partial(struct kmem_cache_node *n, 2045 int (*get_count)(struct page *)) 2046 { 2047 unsigned long flags; 2048 unsigned long x = 0; 2049 struct page *page; 2050 2051 spin_lock_irqsave(&n->list_lock, flags); 2052 list_for_each_entry(page, &n->partial, lru) 2053 x += get_count(page); 2054 spin_unlock_irqrestore(&n->list_lock, flags); 2055 return x; 2056 } 2057 2058 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 2059 { 2060 #ifdef CONFIG_SLUB_DEBUG 2061 return atomic_long_read(&n->total_objects); 2062 #else 2063 return 0; 2064 #endif 2065 } 2066 2067 static noinline void 2068 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2069 { 2070 int node; 2071 2072 printk(KERN_WARNING 2073 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2074 nid, gfpflags); 2075 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2076 "default order: %d, min order: %d\n", s->name, s->objsize, 2077 s->size, oo_order(s->oo), oo_order(s->min)); 2078 2079 if (oo_order(s->min) > get_order(s->objsize)) 2080 printk(KERN_WARNING " %s debugging increased min order, use " 2081 "slub_debug=O to disable.\n", s->name); 2082 2083 for_each_online_node(node) { 2084 struct kmem_cache_node *n = get_node(s, node); 2085 unsigned long nr_slabs; 2086 unsigned long nr_objs; 2087 unsigned long nr_free; 2088 2089 if (!n) 2090 continue; 2091 2092 nr_free = count_partial(n, count_free); 2093 nr_slabs = node_nr_slabs(n); 2094 nr_objs = node_nr_objs(n); 2095 2096 printk(KERN_WARNING 2097 " node %d: slabs: %ld, objs: %ld, free: %ld\n", 2098 node, nr_slabs, nr_objs, nr_free); 2099 } 2100 } 2101 2102 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2103 int node, struct kmem_cache_cpu **pc) 2104 { 2105 void *object; 2106 struct kmem_cache_cpu *c; 2107 struct page *page = new_slab(s, flags, node); 2108 2109 if (page) { 2110 c = __this_cpu_ptr(s->cpu_slab); 2111 if (c->page) 2112 flush_slab(s, c); 2113 2114 /* 2115 * No other reference to the page yet so we can 2116 * muck around with it freely without cmpxchg 2117 */ 2118 object = page->freelist; 2119 page->freelist = NULL; 2120 2121 stat(s, ALLOC_SLAB); 2122 c->node = page_to_nid(page); 2123 c->page = page; 2124 *pc = c; 2125 } else 2126 object = NULL; 2127 2128 return object; 2129 } 2130 2131 /* 2132 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist 2133 * or deactivate the page. 2134 * 2135 * The page is still frozen if the return value is not NULL. 2136 * 2137 * If this function returns NULL then the page has been unfrozen. 2138 */ 2139 static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2140 { 2141 struct page new; 2142 unsigned long counters; 2143 void *freelist; 2144 2145 do { 2146 freelist = page->freelist; 2147 counters = page->counters; 2148 new.counters = counters; 2149 VM_BUG_ON(!new.frozen); 2150 2151 new.inuse = page->objects; 2152 new.frozen = freelist != NULL; 2153 2154 } while (!cmpxchg_double_slab(s, page, 2155 freelist, counters, 2156 NULL, new.counters, 2157 "get_freelist")); 2158 2159 return freelist; 2160 } 2161 2162 /* 2163 * Slow path. 
The lockless freelist is empty or we need to perform 2164 * debugging duties. 2165 * 2166 * Processing is still very fast if new objects have been freed to the 2167 * regular freelist. In that case we simply take over the regular freelist 2168 * as the lockless freelist and zap the regular freelist. 2169 * 2170 * If that is not working then we fall back to the partial lists. We take the 2171 * first element of the freelist as the object to allocate now and move the 2172 * rest of the freelist to the lockless freelist. 2173 * 2174 * And if we were unable to get a new slab from the partial slab lists then 2175 * we need to allocate a new slab. This is the slowest path since it involves 2176 * a call to the page allocator and the setup of a new slab. 2177 */ 2178 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2179 unsigned long addr, struct kmem_cache_cpu *c) 2180 { 2181 void **object; 2182 unsigned long flags; 2183 2184 local_irq_save(flags); 2185 #ifdef CONFIG_PREEMPT 2186 /* 2187 * We may have been preempted and rescheduled on a different 2188 * cpu before disabling interrupts. Need to reload cpu area 2189 * pointer. 2190 */ 2191 c = this_cpu_ptr(s->cpu_slab); 2192 #endif 2193 2194 if (!c->page) 2195 goto new_slab; 2196 redo: 2197 if (unlikely(!node_match(c, node))) { 2198 stat(s, ALLOC_NODE_MISMATCH); 2199 deactivate_slab(s, c); 2200 goto new_slab; 2201 } 2202 2203 /* must check again c->freelist in case of cpu migration or IRQ */ 2204 object = c->freelist; 2205 if (object) 2206 goto load_freelist; 2207 2208 stat(s, ALLOC_SLOWPATH); 2209 2210 object = get_freelist(s, c->page); 2211 2212 if (!object) { 2213 c->page = NULL; 2214 stat(s, DEACTIVATE_BYPASS); 2215 goto new_slab; 2216 } 2217 2218 stat(s, ALLOC_REFILL); 2219 2220 load_freelist: 2221 c->freelist = get_freepointer(s, object); 2222 c->tid = next_tid(c->tid); 2223 local_irq_restore(flags); 2224 return object; 2225 2226 new_slab: 2227 2228 if (c->partial) { 2229 c->page = c->partial; 2230 c->partial = c->page->next; 2231 c->node = page_to_nid(c->page); 2232 stat(s, CPU_PARTIAL_ALLOC); 2233 c->freelist = NULL; 2234 goto redo; 2235 } 2236 2237 /* Then do expensive stuff like retrieving pages from the partial lists */ 2238 object = get_partial(s, gfpflags, node, c); 2239 2240 if (unlikely(!object)) { 2241 2242 object = new_slab_objects(s, gfpflags, node, &c); 2243 2244 if (unlikely(!object)) { 2245 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2246 slab_out_of_memory(s, gfpflags, node); 2247 2248 local_irq_restore(flags); 2249 return NULL; 2250 } 2251 } 2252 2253 if (likely(!kmem_cache_debug(s))) 2254 goto load_freelist; 2255 2256 /* Only entered in the debug case */ 2257 if (!alloc_debug_processing(s, c->page, object, addr)) 2258 goto new_slab; /* Slab failed checks. Next slab needed */ 2259 2260 c->freelist = get_freepointer(s, object); 2261 deactivate_slab(s, c); 2262 c->node = NUMA_NO_NODE; 2263 local_irq_restore(flags); 2264 return object; 2265 } 2266 2267 /* 2268 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 2269 * have the fastpath folded into their functions. So no function call 2270 * overhead for requests that can be satisfied on the fastpath. 2271 * 2272 * The fastpath works by first checking if the lockless freelist can be used. 2273 * If not then __slab_alloc is called for slow processing. 2274 * 2275 * Otherwise we can simply pick the next object from the lockless free list. 
2276 */
2277 static __always_inline void *slab_alloc(struct kmem_cache *s,
2278 gfp_t gfpflags, int node, unsigned long addr)
2279 {
2280 void **object;
2281 struct kmem_cache_cpu *c;
2282 unsigned long tid;
2283 
2284 if (slab_pre_alloc_hook(s, gfpflags))
2285 return NULL;
2286 
2287 redo:
2288 
2289 /*
2290 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2291 * enabled. We may switch back and forth between cpus while
2292 * reading from one cpu area. That does not matter as long
2293 * as we end up on the original cpu again when doing the cmpxchg.
2294 */
2295 c = __this_cpu_ptr(s->cpu_slab);
2296 
2297 /*
2298 * The transaction ids are globally unique per cpu and per operation on
2299 * a per cpu queue. Thus they guarantee that the cmpxchg_double
2300 * occurs on the right processor and that there was no operation on the
2301 * linked list in between.
2302 */
2303 tid = c->tid;
2304 barrier();
2305 
2306 object = c->freelist;
2307 if (unlikely(!object || !node_match(c, node)))
2308 
2309 object = __slab_alloc(s, gfpflags, node, addr, c);
2310 
2311 else {
2312 /*
2313 * The cmpxchg will only match if there was no additional
2314 * operation and if we are on the right processor.
2315 *
2316 * The cmpxchg does the following atomically (without lock semantics!)
2317 * 1. Relocate first pointer to the current per cpu area.
2318 * 2. Verify that tid and freelist have not been changed
2319 * 3. If they were not changed replace tid and freelist
2320 *
2321 * Since this is without lock semantics the protection is only against
2322 * code executing on this cpu *not* from access by other cpus.
2323 */
2324 if (unlikely(!this_cpu_cmpxchg_double(
2325 s->cpu_slab->freelist, s->cpu_slab->tid,
2326 object, tid,
2327 get_freepointer_safe(s, object), next_tid(tid)))) {
2328 
2329 note_cmpxchg_failure("slab_alloc", s, tid);
2330 goto redo;
2331 }
2332 stat(s, ALLOC_FASTPATH);
2333 }
2334 
2335 if (unlikely(gfpflags & __GFP_ZERO) && object)
2336 memset(object, 0, s->objsize);
2337 
2338 slab_post_alloc_hook(s, gfpflags, object);
2339 
2340 return object;
2341 }
2342 
2343 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2344 {
2345 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2346 
2347 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
2348 
2349 return ret;
2350 }
2351 EXPORT_SYMBOL(kmem_cache_alloc);
2352 
2353 #ifdef CONFIG_TRACING
2354 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2355 {
2356 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2357 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2358 return ret;
2359 }
2360 EXPORT_SYMBOL(kmem_cache_alloc_trace);
2361 
2362 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
2363 {
2364 void *ret = kmalloc_order(size, flags, order);
2365 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
2366 return ret;
2367 }
2368 EXPORT_SYMBOL(kmalloc_order_trace);
2369 #endif
2370 
2371 #ifdef CONFIG_NUMA
2372 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2373 {
2374 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2375 
2376 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2377 s->objsize, s->size, gfpflags, node);
2378 
2379 return ret;
2380 }
2381 EXPORT_SYMBOL(kmem_cache_alloc_node);
2382 
2383 #ifdef CONFIG_TRACING
2384 void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2385 gfp_t gfpflags,
2386 int node, size_t size)
2387 {
2388 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2389 
2390 trace_kmalloc_node(_RET_IP_, ret,
2391 size, s->size, gfpflags, node);
2392 return ret;
2393 }
2394 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2395 #endif
2396 #endif
2397 
2398 /*
2399 * Slow path handling. This may still be called frequently since objects
2400 * have a longer lifetime than the cpu slabs in most processing loads.
2401 *
2402 * So we still attempt to reduce cache line usage. Just take the slab
2403 * lock and free the item. If there is no additional partial page
2404 * handling required then we can return immediately.
2405 */
2406 static void __slab_free(struct kmem_cache *s, struct page *page,
2407 void *x, unsigned long addr)
2408 {
2409 void *prior;
2410 void **object = (void *)x;
2411 int was_frozen;
2412 int inuse;
2413 struct page new;
2414 unsigned long counters;
2415 struct kmem_cache_node *n = NULL;
2416 unsigned long uninitialized_var(flags);
2417 
2418 stat(s, FREE_SLOWPATH);
2419 
2420 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2421 return;
2422 
2423 do {
2424 prior = page->freelist;
2425 counters = page->counters;
2426 set_freepointer(s, object, prior);
2427 new.counters = counters;
2428 was_frozen = new.frozen;
2429 new.inuse--;
2430 if ((!new.inuse || !prior) && !was_frozen && !n) {
2431 
2432 if (!kmem_cache_debug(s) && !prior)
2433 
2434 /*
2435 * Slab was on no list before and will be partially empty.
2436 * We can defer the list move and instead freeze it.
2437 */
2438 new.frozen = 1;
2439 
2440 else { /* Needs to be taken off a list */
2441 
2442 n = get_node(s, page_to_nid(page));
2443 /*
2444 * Speculatively acquire the list_lock.
2445 * If the cmpxchg does not succeed then we may
2446 * drop the list_lock without any processing.
2447 *
2448 * Otherwise the list_lock will synchronize with
2449 * other processors updating the list of slabs.
2450 */
2451 spin_lock_irqsave(&n->list_lock, flags);
2452 
2453 }
2454 }
2455 inuse = new.inuse;
2456 
2457 } while (!cmpxchg_double_slab(s, page,
2458 prior, counters,
2459 object, new.counters,
2460 "__slab_free"));
2461 
2462 if (likely(!n)) {
2463 
2464 /*
2465 * If we just froze the page then put it onto the
2466 * per cpu partial list.
2467 */
2468 if (new.frozen && !was_frozen)
2469 put_cpu_partial(s, page, 1);
2470 
2471 /*
2472 * The list lock was not taken therefore no list
2473 * activity can be necessary.
2474 */
2475 if (was_frozen)
2476 stat(s, FREE_FROZEN);
2477 return;
2478 }
2479 
2480 /*
2481 * was_frozen may have been set after we acquired the list_lock in
2482 * an earlier loop. So we need to check it here again.
2483 */
2484 if (was_frozen)
2485 stat(s, FREE_FROZEN);
2486 else {
2487 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2488 goto slab_empty;
2489 
2490 /*
2491 * Objects left in the slab. If it was not on the partial list before
2492 * then add it.
2493 */
2494 if (unlikely(!prior)) {
2495 remove_full(s, page);
2496 add_partial(n, page, DEACTIVATE_TO_TAIL);
2497 stat(s, FREE_ADD_PARTIAL);
2498 }
2499 }
2500 spin_unlock_irqrestore(&n->list_lock, flags);
2501 return;
2502 
2503 slab_empty:
2504 if (prior) {
2505 /*
2506 * Slab on the partial list.
2507 */
2508 remove_partial(n, page);
2509 stat(s, FREE_REMOVE_PARTIAL);
2510 } else
2511 /* Slab must be on the full list */
2512 remove_full(s, page);
2513 
2514 spin_unlock_irqrestore(&n->list_lock, flags);
2515 stat(s, FREE_SLAB);
2516 discard_slab(s, page);
2517 }
2518 
2519 /*
2520 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
2521 * can perform fastpath freeing without additional function calls.
2522 *
2523 * The fastpath is only possible if we are freeing to the current cpu slab
2524 * of this processor. This is typically the case if we have just allocated
2525 * the item before.
2526 *
2527 * If fastpath is not possible then fall back to __slab_free where we deal
2528 * with all sorts of special processing.
2529 */
2530 static __always_inline void slab_free(struct kmem_cache *s,
2531 struct page *page, void *x, unsigned long addr)
2532 {
2533 void **object = (void *)x;
2534 struct kmem_cache_cpu *c;
2535 unsigned long tid;
2536 
2537 slab_free_hook(s, x);
2538 
2539 redo:
2540 /*
2541 * Determine the current cpu's per cpu slab.
2542 * The cpu may change afterward. However that does not matter since
2543 * data is retrieved via this pointer. If we are on the same cpu
2544 * during the cmpxchg then the free will succeed.
2545 */
2546 c = __this_cpu_ptr(s->cpu_slab);
2547 
2548 tid = c->tid;
2549 barrier();
2550 
2551 if (likely(page == c->page)) {
2552 set_freepointer(s, object, c->freelist);
2553 
2554 if (unlikely(!this_cpu_cmpxchg_double(
2555 s->cpu_slab->freelist, s->cpu_slab->tid,
2556 c->freelist, tid,
2557 object, next_tid(tid)))) {
2558 
2559 note_cmpxchg_failure("slab_free", s, tid);
2560 goto redo;
2561 }
2562 stat(s, FREE_FASTPATH);
2563 } else
2564 __slab_free(s, page, x, addr);
2565 
2566 }
2567 
2568 void kmem_cache_free(struct kmem_cache *s, void *x)
2569 {
2570 struct page *page;
2571 
2572 page = virt_to_head_page(x);
2573 
2574 slab_free(s, page, x, _RET_IP_);
2575 
2576 trace_kmem_cache_free(_RET_IP_, x);
2577 }
2578 EXPORT_SYMBOL(kmem_cache_free);
2579 
2580 /*
2581 * Object placement in a slab is made very easy because we always start at
2582 * offset 0. If we tune the size of the object to the alignment then we can
2583 * get the required alignment by putting one properly sized object after
2584 * another.
2585 *
2586 * Notice that the allocation order determines the sizes of the per cpu
2587 * caches. Each processor always has one slab available for allocations.
2588 * Increasing the allocation order reduces the number of times that slabs
2589 * must be moved on and off the partial lists and is therefore a factor in
2590 * locking overhead.
2591 */
2592 
2593 /*
2594 * Minimum / Maximum order of slab pages. This influences locking overhead
2595 * and slab fragmentation. A higher order reduces the number of partial slabs
2596 * and increases the number of allocations possible without having to
2597 * take the list_lock.
2598 */
2599 static int slub_min_order;
2600 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2601 static int slub_min_objects;
2602 
2603 /*
2604 * Merge control. If this is set then no merging of slab caches will occur.
2605 * (Could be removed. This was introduced to pacify the merge skeptics.)
2606 */
2607 static int slub_nomerge;
2608 
2609 /*
2610 * Calculate the order of allocation given a slab object size.
2611 *
2612 * The order of allocation has significant impact on performance and other
2613 * system components. Generally order 0 allocations should be preferred since
2614 * order 0 does not cause fragmentation in the page allocator. Larger objects
2615 * can be problematic to put into order 0 slabs because there may be too much
2616 * unused space left. We go to a higher order if more than 1/16th of the slab
2617 * would be wasted.
2618 *
2619 * In order to reach satisfactory performance we must ensure that a minimum
2620 * number of objects is in one slab. Otherwise we may generate too much
2621 * activity on the partial lists which requires taking the list_lock. This is
2622 * less of a concern for large slabs though which are rarely used.
2623 *
2624 * slub_max_order specifies the order where we begin to stop considering the
2625 * number of objects in a slab as critical. If we reach slub_max_order then
2626 * we try to keep the page order as low as possible. So we accept more waste
2627 * of space in favor of a small page order.
2628 *
2629 * Higher order allocations also allow the placement of more objects in a
2630 * slab and thereby reduce object handling overhead. If the user has
2631 * requested a higher minimum order then we start with that one instead of
2632 * the smallest order which will fit the object.
2633 */
2634 static inline int slab_order(int size, int min_objects,
2635 int max_order, int fract_leftover, int reserved)
2636 {
2637 int order;
2638 int rem;
2639 int min_order = slub_min_order;
2640 
2641 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
2642 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
2643 
2644 for (order = max(min_order,
2645 fls(min_objects * size - 1) - PAGE_SHIFT);
2646 order <= max_order; order++) {
2647 
2648 unsigned long slab_size = PAGE_SIZE << order;
2649 
2650 if (slab_size < min_objects * size + reserved)
2651 continue;
2652 
2653 rem = (slab_size - reserved) % size;
2654 
2655 if (rem <= slab_size / fract_leftover)
2656 break;
2657 
2658 }
2659 
2660 return order;
2661 }
2662 
2663 static inline int calculate_order(int size, int reserved)
2664 {
2665 int order;
2666 int min_objects;
2667 int fraction;
2668 int max_objects;
2669 
2670 /*
2671 * Attempt to find best configuration for a slab. This
2672 * works by first attempting to generate a layout with
2673 * the best configuration and backing off gradually.
2674 *
2675 * First we reduce the acceptable waste in a slab. Then
2676 * we reduce the minimum objects required in a slab.
2677 */
2678 min_objects = slub_min_objects;
2679 if (!min_objects)
2680 min_objects = 4 * (fls(nr_cpu_ids) + 1);
2681 max_objects = order_objects(slub_max_order, size, reserved);
2682 min_objects = min(min_objects, max_objects);
2683 
2684 while (min_objects > 1) {
2685 fraction = 16;
2686 while (fraction >= 4) {
2687 order = slab_order(size, min_objects,
2688 slub_max_order, fraction, reserved);
2689 if (order <= slub_max_order)
2690 return order;
2691 fraction /= 2;
2692 }
2693 min_objects--;
2694 }
2695 
2696 /*
2697 * We were unable to place multiple objects in a slab. Now
2698 * let's see if we can place a single object there.
2699 */
2700 order = slab_order(size, 1, slub_max_order, 1, reserved);
2701 if (order <= slub_max_order)
2702 return order;
2703 
2704 /*
2705 * Doh, this slab cannot be placed using slub_max_order.
2706 */
2707 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2708 if (order < MAX_ORDER)
2709 return order;
2710 return -ENOSYS;
2711 }
2712 
2713 /*
2714 * Figure out what the alignment of the objects will be.
2715 */ 2716 static unsigned long calculate_alignment(unsigned long flags, 2717 unsigned long align, unsigned long size) 2718 { 2719 /* 2720 * If the user wants hardware cache aligned objects then follow that 2721 * suggestion if the object is sufficiently large. 2722 * 2723 * The hardware cache alignment cannot override the specified 2724 * alignment though. If that is greater then use it. 2725 */ 2726 if (flags & SLAB_HWCACHE_ALIGN) { 2727 unsigned long ralign = cache_line_size(); 2728 while (size <= ralign / 2) 2729 ralign /= 2; 2730 align = max(align, ralign); 2731 } 2732 2733 if (align < ARCH_SLAB_MINALIGN) 2734 align = ARCH_SLAB_MINALIGN; 2735 2736 return ALIGN(align, sizeof(void *)); 2737 } 2738 2739 static void 2740 init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2741 { 2742 n->nr_partial = 0; 2743 spin_lock_init(&n->list_lock); 2744 INIT_LIST_HEAD(&n->partial); 2745 #ifdef CONFIG_SLUB_DEBUG 2746 atomic_long_set(&n->nr_slabs, 0); 2747 atomic_long_set(&n->total_objects, 0); 2748 INIT_LIST_HEAD(&n->full); 2749 #endif 2750 } 2751 2752 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2753 { 2754 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2755 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2756 2757 /* 2758 * Must align to double word boundary for the double cmpxchg 2759 * instructions to work; see __pcpu_double_call_return_bool(). 2760 */ 2761 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2762 2 * sizeof(void *)); 2763 2764 if (!s->cpu_slab) 2765 return 0; 2766 2767 init_kmem_cache_cpus(s); 2768 2769 return 1; 2770 } 2771 2772 static struct kmem_cache *kmem_cache_node; 2773 2774 /* 2775 * No kmalloc_node yet so do it by hand. We know that this is the first 2776 * slab on the node for this slabcache. There are no concurrent accesses 2777 * possible. 2778 * 2779 * Note that this function only works on the kmalloc_node_cache 2780 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2781 * memory on a fresh node that has no slab structures yet. 
2782 */ 2783 static void early_kmem_cache_node_alloc(int node) 2784 { 2785 struct page *page; 2786 struct kmem_cache_node *n; 2787 2788 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2789 2790 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2791 2792 BUG_ON(!page); 2793 if (page_to_nid(page) != node) { 2794 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2795 "node %d\n", node); 2796 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2797 "in order to be able to continue\n"); 2798 } 2799 2800 n = page->freelist; 2801 BUG_ON(!n); 2802 page->freelist = get_freepointer(kmem_cache_node, n); 2803 page->inuse = 1; 2804 page->frozen = 0; 2805 kmem_cache_node->node[node] = n; 2806 #ifdef CONFIG_SLUB_DEBUG 2807 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2808 init_tracking(kmem_cache_node, n); 2809 #endif 2810 init_kmem_cache_node(n, kmem_cache_node); 2811 inc_slabs_node(kmem_cache_node, node, page->objects); 2812 2813 add_partial(n, page, DEACTIVATE_TO_HEAD); 2814 } 2815 2816 static void free_kmem_cache_nodes(struct kmem_cache *s) 2817 { 2818 int node; 2819 2820 for_each_node_state(node, N_NORMAL_MEMORY) { 2821 struct kmem_cache_node *n = s->node[node]; 2822 2823 if (n) 2824 kmem_cache_free(kmem_cache_node, n); 2825 2826 s->node[node] = NULL; 2827 } 2828 } 2829 2830 static int init_kmem_cache_nodes(struct kmem_cache *s) 2831 { 2832 int node; 2833 2834 for_each_node_state(node, N_NORMAL_MEMORY) { 2835 struct kmem_cache_node *n; 2836 2837 if (slab_state == DOWN) { 2838 early_kmem_cache_node_alloc(node); 2839 continue; 2840 } 2841 n = kmem_cache_alloc_node(kmem_cache_node, 2842 GFP_KERNEL, node); 2843 2844 if (!n) { 2845 free_kmem_cache_nodes(s); 2846 return 0; 2847 } 2848 2849 s->node[node] = n; 2850 init_kmem_cache_node(n, s); 2851 } 2852 return 1; 2853 } 2854 2855 static void set_min_partial(struct kmem_cache *s, unsigned long min) 2856 { 2857 if (min < MIN_PARTIAL) 2858 min = MIN_PARTIAL; 2859 else if (min > MAX_PARTIAL) 2860 min = MAX_PARTIAL; 2861 s->min_partial = min; 2862 } 2863 2864 /* 2865 * calculate_sizes() determines the order and the distribution of data within 2866 * a slab object. 2867 */ 2868 static int calculate_sizes(struct kmem_cache *s, int forced_order) 2869 { 2870 unsigned long flags = s->flags; 2871 unsigned long size = s->objsize; 2872 unsigned long align = s->align; 2873 int order; 2874 2875 /* 2876 * Round up object size to the next word boundary. We can only 2877 * place the free pointer at word boundaries and this determines 2878 * the possible location of the free pointer. 2879 */ 2880 size = ALIGN(size, sizeof(void *)); 2881 2882 #ifdef CONFIG_SLUB_DEBUG 2883 /* 2884 * Determine if we can poison the object itself. If the user of 2885 * the slab may touch the object after free or before allocation 2886 * then we should never poison the object itself. 2887 */ 2888 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2889 !s->ctor) 2890 s->flags |= __OBJECT_POISON; 2891 else 2892 s->flags &= ~__OBJECT_POISON; 2893 2894 2895 /* 2896 * If we are Redzoning then check if there is some space between the 2897 * end of the object and the free pointer. If not then add an 2898 * additional word to have some bytes to store Redzone information. 2899 */ 2900 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2901 size += sizeof(void *); 2902 #endif 2903 2904 /* 2905 * With that we have determined the number of bytes in actual use 2906 * by the object. This is the potential offset to the free pointer. 
2907 */ 2908 s->inuse = size; 2909 2910 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2911 s->ctor)) { 2912 /* 2913 * Relocate free pointer after the object if it is not 2914 * permitted to overwrite the first word of the object on 2915 * kmem_cache_free. 2916 * 2917 * This is the case if we do RCU, have a constructor or 2918 * destructor or are poisoning the objects. 2919 */ 2920 s->offset = size; 2921 size += sizeof(void *); 2922 } 2923 2924 #ifdef CONFIG_SLUB_DEBUG 2925 if (flags & SLAB_STORE_USER) 2926 /* 2927 * Need to store information about allocs and frees after 2928 * the object. 2929 */ 2930 size += 2 * sizeof(struct track); 2931 2932 if (flags & SLAB_RED_ZONE) 2933 /* 2934 * Add some empty padding so that we can catch 2935 * overwrites from earlier objects rather than let 2936 * tracking information or the free pointer be 2937 * corrupted if a user writes before the start 2938 * of the object. 2939 */ 2940 size += sizeof(void *); 2941 #endif 2942 2943 /* 2944 * Determine the alignment based on various parameters that the 2945 * user specified and the dynamic determination of cache line size 2946 * on bootup. 2947 */ 2948 align = calculate_alignment(flags, align, s->objsize); 2949 s->align = align; 2950 2951 /* 2952 * SLUB stores one object immediately after another beginning from 2953 * offset 0. In order to align the objects we have to simply size 2954 * each object to conform to the alignment. 2955 */ 2956 size = ALIGN(size, align); 2957 s->size = size; 2958 if (forced_order >= 0) 2959 order = forced_order; 2960 else 2961 order = calculate_order(size, s->reserved); 2962 2963 if (order < 0) 2964 return 0; 2965 2966 s->allocflags = 0; 2967 if (order) 2968 s->allocflags |= __GFP_COMP; 2969 2970 if (s->flags & SLAB_CACHE_DMA) 2971 s->allocflags |= SLUB_DMA; 2972 2973 if (s->flags & SLAB_RECLAIM_ACCOUNT) 2974 s->allocflags |= __GFP_RECLAIMABLE; 2975 2976 /* 2977 * Determine the number of objects per slab 2978 */ 2979 s->oo = oo_make(order, size, s->reserved); 2980 s->min = oo_make(get_order(size), size, s->reserved); 2981 if (oo_objects(s->oo) > oo_objects(s->max)) 2982 s->max = s->oo; 2983 2984 return !!oo_objects(s->oo); 2985 2986 } 2987 2988 static int kmem_cache_open(struct kmem_cache *s, 2989 const char *name, size_t size, 2990 size_t align, unsigned long flags, 2991 void (*ctor)(void *)) 2992 { 2993 memset(s, 0, kmem_size); 2994 s->name = name; 2995 s->ctor = ctor; 2996 s->objsize = size; 2997 s->align = align; 2998 s->flags = kmem_cache_flags(size, flags, name, ctor); 2999 s->reserved = 0; 3000 3001 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3002 s->reserved = sizeof(struct rcu_head); 3003 3004 if (!calculate_sizes(s, -1)) 3005 goto error; 3006 if (disable_higher_order_debug) { 3007 /* 3008 * Disable debugging flags that store metadata if the min slab 3009 * order increased. 3010 */ 3011 if (get_order(s->size) > get_order(s->objsize)) { 3012 s->flags &= ~DEBUG_METADATA_FLAGS; 3013 s->offset = 0; 3014 if (!calculate_sizes(s, -1)) 3015 goto error; 3016 } 3017 } 3018 3019 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3020 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3021 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3022 /* Enable fast mode */ 3023 s->flags |= __CMPXCHG_DOUBLE; 3024 #endif 3025 3026 /* 3027 * The larger the object size is, the more pages we want on the partial 3028 * list to avoid pounding the page allocator excessively. 
3029 */ 3030 set_min_partial(s, ilog2(s->size) / 2); 3031 3032 /* 3033 * cpu_partial determined the maximum number of objects kept in the 3034 * per cpu partial lists of a processor. 3035 * 3036 * Per cpu partial lists mainly contain slabs that just have one 3037 * object freed. If they are used for allocation then they can be 3038 * filled up again with minimal effort. The slab will never hit the 3039 * per node partial lists and therefore no locking will be required. 3040 * 3041 * This setting also determines 3042 * 3043 * A) The number of objects from per cpu partial slabs dumped to the 3044 * per node list when we reach the limit. 3045 * B) The number of objects in cpu partial slabs to extract from the 3046 * per node list when we run out of per cpu objects. We only fetch 50% 3047 * to keep some capacity around for frees. 3048 */ 3049 if (kmem_cache_debug(s)) 3050 s->cpu_partial = 0; 3051 else if (s->size >= PAGE_SIZE) 3052 s->cpu_partial = 2; 3053 else if (s->size >= 1024) 3054 s->cpu_partial = 6; 3055 else if (s->size >= 256) 3056 s->cpu_partial = 13; 3057 else 3058 s->cpu_partial = 30; 3059 3060 s->refcount = 1; 3061 #ifdef CONFIG_NUMA 3062 s->remote_node_defrag_ratio = 1000; 3063 #endif 3064 if (!init_kmem_cache_nodes(s)) 3065 goto error; 3066 3067 if (alloc_kmem_cache_cpus(s)) 3068 return 1; 3069 3070 free_kmem_cache_nodes(s); 3071 error: 3072 if (flags & SLAB_PANIC) 3073 panic("Cannot create slab %s size=%lu realsize=%u " 3074 "order=%u offset=%u flags=%lx\n", 3075 s->name, (unsigned long)size, s->size, oo_order(s->oo), 3076 s->offset, flags); 3077 return 0; 3078 } 3079 3080 /* 3081 * Determine the size of a slab object 3082 */ 3083 unsigned int kmem_cache_size(struct kmem_cache *s) 3084 { 3085 return s->objsize; 3086 } 3087 EXPORT_SYMBOL(kmem_cache_size); 3088 3089 static void list_slab_objects(struct kmem_cache *s, struct page *page, 3090 const char *text) 3091 { 3092 #ifdef CONFIG_SLUB_DEBUG 3093 void *addr = page_address(page); 3094 void *p; 3095 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 3096 sizeof(long), GFP_ATOMIC); 3097 if (!map) 3098 return; 3099 slab_err(s, page, "%s", text); 3100 slab_lock(page); 3101 3102 get_map(s, page, map); 3103 for_each_object(p, s, addr, page->objects) { 3104 3105 if (!test_bit(slab_index(p, s, addr), map)) { 3106 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3107 p, p - addr); 3108 print_tracking(s, p); 3109 } 3110 } 3111 slab_unlock(page); 3112 kfree(map); 3113 #endif 3114 } 3115 3116 /* 3117 * Attempt to free all partial slabs on a node. 3118 * This is called from kmem_cache_close(). We must be the last thread 3119 * using the cache and therefore we do not need to lock anymore. 3120 */ 3121 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3122 { 3123 struct page *page, *h; 3124 3125 list_for_each_entry_safe(page, h, &n->partial, lru) { 3126 if (!page->inuse) { 3127 remove_partial(n, page); 3128 discard_slab(s, page); 3129 } else { 3130 list_slab_objects(s, page, 3131 "Objects remaining on kmem_cache_close()"); 3132 } 3133 } 3134 } 3135 3136 /* 3137 * Release all resources used by a slab cache. 
3138 */ 3139 static inline int kmem_cache_close(struct kmem_cache *s) 3140 { 3141 int node; 3142 3143 flush_all(s); 3144 free_percpu(s->cpu_slab); 3145 /* Attempt to free all objects */ 3146 for_each_node_state(node, N_NORMAL_MEMORY) { 3147 struct kmem_cache_node *n = get_node(s, node); 3148 3149 free_partial(s, n); 3150 if (n->nr_partial || slabs_node(s, node)) 3151 return 1; 3152 } 3153 free_kmem_cache_nodes(s); 3154 return 0; 3155 } 3156 3157 /* 3158 * Close a cache and release the kmem_cache structure 3159 * (must be used for caches created using kmem_cache_create) 3160 */ 3161 void kmem_cache_destroy(struct kmem_cache *s) 3162 { 3163 down_write(&slub_lock); 3164 s->refcount--; 3165 if (!s->refcount) { 3166 list_del(&s->list); 3167 up_write(&slub_lock); 3168 if (kmem_cache_close(s)) { 3169 printk(KERN_ERR "SLUB %s: %s called for cache that " 3170 "still has objects.\n", s->name, __func__); 3171 dump_stack(); 3172 } 3173 if (s->flags & SLAB_DESTROY_BY_RCU) 3174 rcu_barrier(); 3175 sysfs_slab_remove(s); 3176 } else 3177 up_write(&slub_lock); 3178 } 3179 EXPORT_SYMBOL(kmem_cache_destroy); 3180 3181 /******************************************************************** 3182 * Kmalloc subsystem 3183 *******************************************************************/ 3184 3185 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3186 EXPORT_SYMBOL(kmalloc_caches); 3187 3188 static struct kmem_cache *kmem_cache; 3189 3190 #ifdef CONFIG_ZONE_DMA 3191 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3192 #endif 3193 3194 static int __init setup_slub_min_order(char *str) 3195 { 3196 get_option(&str, &slub_min_order); 3197 3198 return 1; 3199 } 3200 3201 __setup("slub_min_order=", setup_slub_min_order); 3202 3203 static int __init setup_slub_max_order(char *str) 3204 { 3205 get_option(&str, &slub_max_order); 3206 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3207 3208 return 1; 3209 } 3210 3211 __setup("slub_max_order=", setup_slub_max_order); 3212 3213 static int __init setup_slub_min_objects(char *str) 3214 { 3215 get_option(&str, &slub_min_objects); 3216 3217 return 1; 3218 } 3219 3220 __setup("slub_min_objects=", setup_slub_min_objects); 3221 3222 static int __init setup_slub_nomerge(char *str) 3223 { 3224 slub_nomerge = 1; 3225 return 1; 3226 } 3227 3228 __setup("slub_nomerge", setup_slub_nomerge); 3229 3230 static struct kmem_cache *__init create_kmalloc_cache(const char *name, 3231 int size, unsigned int flags) 3232 { 3233 struct kmem_cache *s; 3234 3235 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3236 3237 /* 3238 * This function is called with IRQs disabled during early-boot on 3239 * single CPU so there's no need to take slub_lock here. 3240 */ 3241 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3242 flags, NULL)) 3243 goto panic; 3244 3245 list_add(&s->list, &slab_caches); 3246 return s; 3247 3248 panic: 3249 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 3250 return NULL; 3251 } 3252 3253 /* 3254 * Conversion table for small slabs sizes / 8 to the index in the 3255 * kmalloc array. This is necessary for slabs < 192 since we have non power 3256 * of two cache sizes there. The size of larger slabs can be determined using 3257 * fls. 
3258 */ 3259 static s8 size_index[24] = { 3260 3, /* 8 */ 3261 4, /* 16 */ 3262 5, /* 24 */ 3263 5, /* 32 */ 3264 6, /* 40 */ 3265 6, /* 48 */ 3266 6, /* 56 */ 3267 6, /* 64 */ 3268 1, /* 72 */ 3269 1, /* 80 */ 3270 1, /* 88 */ 3271 1, /* 96 */ 3272 7, /* 104 */ 3273 7, /* 112 */ 3274 7, /* 120 */ 3275 7, /* 128 */ 3276 2, /* 136 */ 3277 2, /* 144 */ 3278 2, /* 152 */ 3279 2, /* 160 */ 3280 2, /* 168 */ 3281 2, /* 176 */ 3282 2, /* 184 */ 3283 2 /* 192 */ 3284 }; 3285 3286 static inline int size_index_elem(size_t bytes) 3287 { 3288 return (bytes - 1) / 8; 3289 } 3290 3291 static struct kmem_cache *get_slab(size_t size, gfp_t flags) 3292 { 3293 int index; 3294 3295 if (size <= 192) { 3296 if (!size) 3297 return ZERO_SIZE_PTR; 3298 3299 index = size_index[size_index_elem(size)]; 3300 } else 3301 index = fls(size - 1); 3302 3303 #ifdef CONFIG_ZONE_DMA 3304 if (unlikely((flags & SLUB_DMA))) 3305 return kmalloc_dma_caches[index]; 3306 3307 #endif 3308 return kmalloc_caches[index]; 3309 } 3310 3311 void *__kmalloc(size_t size, gfp_t flags) 3312 { 3313 struct kmem_cache *s; 3314 void *ret; 3315 3316 if (unlikely(size > SLUB_MAX_SIZE)) 3317 return kmalloc_large(size, flags); 3318 3319 s = get_slab(size, flags); 3320 3321 if (unlikely(ZERO_OR_NULL_PTR(s))) 3322 return s; 3323 3324 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3325 3326 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3327 3328 return ret; 3329 } 3330 EXPORT_SYMBOL(__kmalloc); 3331 3332 #ifdef CONFIG_NUMA 3333 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3334 { 3335 struct page *page; 3336 void *ptr = NULL; 3337 3338 flags |= __GFP_COMP | __GFP_NOTRACK; 3339 page = alloc_pages_node(node, flags, get_order(size)); 3340 if (page) 3341 ptr = page_address(page); 3342 3343 kmemleak_alloc(ptr, size, 1, flags); 3344 return ptr; 3345 } 3346 3347 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3348 { 3349 struct kmem_cache *s; 3350 void *ret; 3351 3352 if (unlikely(size > SLUB_MAX_SIZE)) { 3353 ret = kmalloc_large_node(size, flags, node); 3354 3355 trace_kmalloc_node(_RET_IP_, ret, 3356 size, PAGE_SIZE << get_order(size), 3357 flags, node); 3358 3359 return ret; 3360 } 3361 3362 s = get_slab(size, flags); 3363 3364 if (unlikely(ZERO_OR_NULL_PTR(s))) 3365 return s; 3366 3367 ret = slab_alloc(s, flags, node, _RET_IP_); 3368 3369 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3370 3371 return ret; 3372 } 3373 EXPORT_SYMBOL(__kmalloc_node); 3374 #endif 3375 3376 size_t ksize(const void *object) 3377 { 3378 struct page *page; 3379 3380 if (unlikely(object == ZERO_SIZE_PTR)) 3381 return 0; 3382 3383 page = virt_to_head_page(object); 3384 3385 if (unlikely(!PageSlab(page))) { 3386 WARN_ON(!PageCompound(page)); 3387 return PAGE_SIZE << compound_order(page); 3388 } 3389 3390 return slab_ksize(page->slab); 3391 } 3392 EXPORT_SYMBOL(ksize); 3393 3394 #ifdef CONFIG_SLUB_DEBUG 3395 bool verify_mem_not_deleted(const void *x) 3396 { 3397 struct page *page; 3398 void *object = (void *)x; 3399 unsigned long flags; 3400 bool rv; 3401 3402 if (unlikely(ZERO_OR_NULL_PTR(x))) 3403 return false; 3404 3405 local_irq_save(flags); 3406 3407 page = virt_to_head_page(x); 3408 if (unlikely(!PageSlab(page))) { 3409 /* maybe it was from stack? 
*/
3410 rv = true;
3411 goto out_unlock;
3412 }
3413 
3414 slab_lock(page);
3415 if (on_freelist(page->slab, page, object)) {
3416 object_err(page->slab, page, object, "Object is on free-list");
3417 rv = false;
3418 } else {
3419 rv = true;
3420 }
3421 slab_unlock(page);
3422 
3423 out_unlock:
3424 local_irq_restore(flags);
3425 return rv;
3426 }
3427 EXPORT_SYMBOL(verify_mem_not_deleted);
3428 #endif
3429 
3430 void kfree(const void *x)
3431 {
3432 struct page *page;
3433 void *object = (void *)x;
3434 
3435 trace_kfree(_RET_IP_, x);
3436 
3437 if (unlikely(ZERO_OR_NULL_PTR(x)))
3438 return;
3439 
3440 page = virt_to_head_page(x);
3441 if (unlikely(!PageSlab(page))) {
3442 BUG_ON(!PageCompound(page));
3443 kmemleak_free(x);
3444 put_page(page);
3445 return;
3446 }
3447 slab_free(page->slab, page, object, _RET_IP_);
3448 }
3449 EXPORT_SYMBOL(kfree);
3450 
3451 /*
3452 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3453 * the remaining slabs by the number of items in use. The slabs with the
3454 * most items in use come first. New allocations will then fill those up
3455 * and thus they can be removed from the partial lists.
3456 *
3457 * The slabs with the least items are placed last. This results in them
3458 * being allocated from last, increasing the chance that the last objects
3459 * are freed in them.
3460 */
3461 int kmem_cache_shrink(struct kmem_cache *s)
3462 {
3463 int node;
3464 int i;
3465 struct kmem_cache_node *n;
3466 struct page *page;
3467 struct page *t;
3468 int objects = oo_objects(s->max);
3469 struct list_head *slabs_by_inuse =
3470 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3471 unsigned long flags;
3472 
3473 if (!slabs_by_inuse)
3474 return -ENOMEM;
3475 
3476 flush_all(s);
3477 for_each_node_state(node, N_NORMAL_MEMORY) {
3478 n = get_node(s, node);
3479 
3480 if (!n->nr_partial)
3481 continue;
3482 
3483 for (i = 0; i < objects; i++)
3484 INIT_LIST_HEAD(slabs_by_inuse + i);
3485 
3486 spin_lock_irqsave(&n->list_lock, flags);
3487 
3488 /*
3489 * Build lists indexed by the items in use in each slab.
3490 *
3491 * Note that concurrent frees may occur while we hold the
3492 * list_lock. page->inuse here is the upper limit.
3493 */
3494 list_for_each_entry_safe(page, t, &n->partial, lru) {
3495 list_move(&page->lru, slabs_by_inuse + page->inuse);
3496 if (!page->inuse)
3497 n->nr_partial--;
3498 }
3499 
3500 /*
3501 * Rebuild the partial list with the slabs filled up most
3502 * first and the least used slabs at the end.
3503 */
3504 for (i = objects - 1; i > 0; i--)
3505 list_splice(slabs_by_inuse + i, n->partial.prev);
3506 
3507 spin_unlock_irqrestore(&n->list_lock, flags);
3508 
3509 /* Release empty slabs */
3510 list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3511 discard_slab(s, page);
3512 }
3513 
3514 kfree(slabs_by_inuse);
3515 return 0;
3516 }
3517 EXPORT_SYMBOL(kmem_cache_shrink);
3518 
3519 #if defined(CONFIG_MEMORY_HOTPLUG)
3520 static int slab_mem_going_offline_callback(void *arg)
3521 {
3522 struct kmem_cache *s;
3523 
3524 down_read(&slub_lock);
3525 list_for_each_entry(s, &slab_caches, list)
3526 kmem_cache_shrink(s);
3527 up_read(&slub_lock);
3528 
3529 return 0;
3530 }
3531 
3532 static void slab_mem_offline_callback(void *arg)
3533 {
3534 struct kmem_cache_node *n;
3535 struct kmem_cache *s;
3536 struct memory_notify *marg = arg;
3537 int offline_node;
3538 
3539 offline_node = marg->status_change_nid;
3540 
3541 /*
3542 * If the node still has available memory then we still need its
3543 * kmem_cache_node structure, so there is nothing to tear down here.
3544 */ 3545 if (offline_node < 0) 3546 return; 3547 3548 down_read(&slub_lock); 3549 list_for_each_entry(s, &slab_caches, list) { 3550 n = get_node(s, offline_node); 3551 if (n) { 3552 /* 3553 * if n->nr_slabs > 0, slabs still exist on the node 3554 * that is going down. We were unable to free them, 3555 * and offline_pages() function shouldn't call this 3556 * callback. So, we must fail. 3557 */ 3558 BUG_ON(slabs_node(s, offline_node)); 3559 3560 s->node[offline_node] = NULL; 3561 kmem_cache_free(kmem_cache_node, n); 3562 } 3563 } 3564 up_read(&slub_lock); 3565 } 3566 3567 static int slab_mem_going_online_callback(void *arg) 3568 { 3569 struct kmem_cache_node *n; 3570 struct kmem_cache *s; 3571 struct memory_notify *marg = arg; 3572 int nid = marg->status_change_nid; 3573 int ret = 0; 3574 3575 /* 3576 * If the node's memory is already available, then kmem_cache_node is 3577 * already created. Nothing to do. 3578 */ 3579 if (nid < 0) 3580 return 0; 3581 3582 /* 3583 * We are bringing a node online. No memory is available yet. We must 3584 * allocate a kmem_cache_node structure in order to bring the node 3585 * online. 3586 */ 3587 down_read(&slub_lock); 3588 list_for_each_entry(s, &slab_caches, list) { 3589 /* 3590 * XXX: kmem_cache_alloc_node will fallback to other nodes 3591 * since memory is not yet available from the node that 3592 * is brought up. 3593 */ 3594 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3595 if (!n) { 3596 ret = -ENOMEM; 3597 goto out; 3598 } 3599 init_kmem_cache_node(n, s); 3600 s->node[nid] = n; 3601 } 3602 out: 3603 up_read(&slub_lock); 3604 return ret; 3605 } 3606 3607 static int slab_memory_callback(struct notifier_block *self, 3608 unsigned long action, void *arg) 3609 { 3610 int ret = 0; 3611 3612 switch (action) { 3613 case MEM_GOING_ONLINE: 3614 ret = slab_mem_going_online_callback(arg); 3615 break; 3616 case MEM_GOING_OFFLINE: 3617 ret = slab_mem_going_offline_callback(arg); 3618 break; 3619 case MEM_OFFLINE: 3620 case MEM_CANCEL_ONLINE: 3621 slab_mem_offline_callback(arg); 3622 break; 3623 case MEM_ONLINE: 3624 case MEM_CANCEL_OFFLINE: 3625 break; 3626 } 3627 if (ret) 3628 ret = notifier_from_errno(ret); 3629 else 3630 ret = NOTIFY_OK; 3631 return ret; 3632 } 3633 3634 #endif /* CONFIG_MEMORY_HOTPLUG */ 3635 3636 /******************************************************************** 3637 * Basic setup of slabs 3638 *******************************************************************/ 3639 3640 /* 3641 * Used for early kmem_cache structures that were allocated using 3642 * the page allocator 3643 */ 3644 3645 static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3646 { 3647 int node; 3648 3649 list_add(&s->list, &slab_caches); 3650 s->refcount = -1; 3651 3652 for_each_node_state(node, N_NORMAL_MEMORY) { 3653 struct kmem_cache_node *n = get_node(s, node); 3654 struct page *p; 3655 3656 if (n) { 3657 list_for_each_entry(p, &n->partial, lru) 3658 p->slab = s; 3659 3660 #ifdef CONFIG_SLUB_DEBUG 3661 list_for_each_entry(p, &n->full, lru) 3662 p->slab = s; 3663 #endif 3664 } 3665 } 3666 } 3667 3668 void __init kmem_cache_init(void) 3669 { 3670 int i; 3671 int caches = 0; 3672 struct kmem_cache *temp_kmem_cache; 3673 int order; 3674 struct kmem_cache *temp_kmem_cache_node; 3675 unsigned long kmalloc_size; 3676 3677 if (debug_guardpage_minorder()) 3678 slub_max_order = 0; 3679 3680 kmem_size = offsetof(struct kmem_cache, node) + 3681 nr_node_ids * sizeof(struct kmem_cache_node *); 3682 3683 /* Allocate two kmem_caches from the page allocator */ 
3684 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3685 order = get_order(2 * kmalloc_size); 3686 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3687 3688 /* 3689 * Must first have the slab cache available for the allocations of the 3690 * struct kmem_cache_node's. There is special bootstrap code in 3691 * kmem_cache_open for slab_state == DOWN. 3692 */ 3693 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3694 3695 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3696 sizeof(struct kmem_cache_node), 3697 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3698 3699 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3700 3701 /* Able to allocate the per node structures */ 3702 slab_state = PARTIAL; 3703 3704 temp_kmem_cache = kmem_cache; 3705 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3706 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3707 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3708 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3709 3710 /* 3711 * Allocate kmem_cache_node properly from the kmem_cache slab. 3712 * kmem_cache_node is separately allocated so no need to 3713 * update any list pointers. 3714 */ 3715 temp_kmem_cache_node = kmem_cache_node; 3716 3717 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3718 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); 3719 3720 kmem_cache_bootstrap_fixup(kmem_cache_node); 3721 3722 caches++; 3723 kmem_cache_bootstrap_fixup(kmem_cache); 3724 caches++; 3725 /* Free temporary boot structure */ 3726 free_pages((unsigned long)temp_kmem_cache, order); 3727 3728 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3729 3730 /* 3731 * Patch up the size_index table if we have strange large alignment 3732 * requirements for the kmalloc array. This is only the case for 3733 * MIPS it seems. The standard arches will not generate any code here. 3734 * 3735 * Largest permitted alignment is 256 bytes due to the way we 3736 * handle the index determination for the smaller caches. 3737 * 3738 * Make sure that nothing crazy happens if someone starts tinkering 3739 * around with ARCH_KMALLOC_MINALIGN 3740 */ 3741 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3742 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3743 3744 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 3745 int elem = size_index_elem(i); 3746 if (elem >= ARRAY_SIZE(size_index)) 3747 break; 3748 size_index[elem] = KMALLOC_SHIFT_LOW; 3749 } 3750 3751 if (KMALLOC_MIN_SIZE == 64) { 3752 /* 3753 * The 96 byte size cache is not used if the alignment 3754 * is 64 byte. 3755 */ 3756 for (i = 64 + 8; i <= 96; i += 8) 3757 size_index[size_index_elem(i)] = 7; 3758 } else if (KMALLOC_MIN_SIZE == 128) { 3759 /* 3760 * The 192 byte sized cache is not used if the alignment 3761 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3762 * instead. 
3763 */ 3764 for (i = 128 + 8; i <= 192; i += 8) 3765 size_index[size_index_elem(i)] = 8; 3766 } 3767 3768 /* Caches that are not of the two-to-the-power-of size */ 3769 if (KMALLOC_MIN_SIZE <= 32) { 3770 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); 3771 caches++; 3772 } 3773 3774 if (KMALLOC_MIN_SIZE <= 64) { 3775 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); 3776 caches++; 3777 } 3778 3779 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3780 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); 3781 caches++; 3782 } 3783 3784 slab_state = UP; 3785 3786 /* Provide the correct kmalloc names now that the caches are up */ 3787 if (KMALLOC_MIN_SIZE <= 32) { 3788 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); 3789 BUG_ON(!kmalloc_caches[1]->name); 3790 } 3791 3792 if (KMALLOC_MIN_SIZE <= 64) { 3793 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); 3794 BUG_ON(!kmalloc_caches[2]->name); 3795 } 3796 3797 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3798 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3799 3800 BUG_ON(!s); 3801 kmalloc_caches[i]->name = s; 3802 } 3803 3804 #ifdef CONFIG_SMP 3805 register_cpu_notifier(&slab_notifier); 3806 #endif 3807 3808 #ifdef CONFIG_ZONE_DMA 3809 for (i = 0; i < SLUB_PAGE_SHIFT; i++) { 3810 struct kmem_cache *s = kmalloc_caches[i]; 3811 3812 if (s && s->size) { 3813 char *name = kasprintf(GFP_NOWAIT, 3814 "dma-kmalloc-%d", s->objsize); 3815 3816 BUG_ON(!name); 3817 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3818 s->objsize, SLAB_CACHE_DMA); 3819 } 3820 } 3821 #endif 3822 printk(KERN_INFO 3823 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3824 " CPUs=%d, Nodes=%d\n", 3825 caches, cache_line_size(), 3826 slub_min_order, slub_max_order, slub_min_objects, 3827 nr_cpu_ids, nr_node_ids); 3828 } 3829 3830 void __init kmem_cache_init_late(void) 3831 { 3832 } 3833 3834 /* 3835 * Find a mergeable slab cache 3836 */ 3837 static int slab_unmergeable(struct kmem_cache *s) 3838 { 3839 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3840 return 1; 3841 3842 if (s->ctor) 3843 return 1; 3844 3845 /* 3846 * We may have set a slab to be unmergeable during bootstrap. 3847 */ 3848 if (s->refcount < 0) 3849 return 1; 3850 3851 return 0; 3852 } 3853 3854 static struct kmem_cache *find_mergeable(size_t size, 3855 size_t align, unsigned long flags, const char *name, 3856 void (*ctor)(void *)) 3857 { 3858 struct kmem_cache *s; 3859 3860 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3861 return NULL; 3862 3863 if (ctor) 3864 return NULL; 3865 3866 size = ALIGN(size, sizeof(void *)); 3867 align = calculate_alignment(flags, align, size); 3868 size = ALIGN(size, align); 3869 flags = kmem_cache_flags(size, flags, name, NULL); 3870 3871 list_for_each_entry(s, &slab_caches, list) { 3872 if (slab_unmergeable(s)) 3873 continue; 3874 3875 if (size > s->size) 3876 continue; 3877 3878 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3879 continue; 3880 /* 3881 * Check if alignment is compatible. 
3882 * Courtesy of Adrian Drzewiecki 3883 */ 3884 if ((s->size & ~(align - 1)) != s->size) 3885 continue; 3886 3887 if (s->size - size >= sizeof(void *)) 3888 continue; 3889 3890 return s; 3891 } 3892 return NULL; 3893 } 3894 3895 struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3896 size_t align, unsigned long flags, void (*ctor)(void *)) 3897 { 3898 struct kmem_cache *s; 3899 char *n; 3900 3901 if (WARN_ON(!name)) 3902 return NULL; 3903 3904 down_write(&slub_lock); 3905 s = find_mergeable(size, align, flags, name, ctor); 3906 if (s) { 3907 s->refcount++; 3908 /* 3909 * Adjust the object sizes so that we clear 3910 * the complete object on kzalloc. 3911 */ 3912 s->objsize = max(s->objsize, (int)size); 3913 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3914 3915 if (sysfs_slab_alias(s, name)) { 3916 s->refcount--; 3917 goto err; 3918 } 3919 up_write(&slub_lock); 3920 return s; 3921 } 3922 3923 n = kstrdup(name, GFP_KERNEL); 3924 if (!n) 3925 goto err; 3926 3927 s = kmalloc(kmem_size, GFP_KERNEL); 3928 if (s) { 3929 if (kmem_cache_open(s, n, 3930 size, align, flags, ctor)) { 3931 list_add(&s->list, &slab_caches); 3932 if (sysfs_slab_add(s)) { 3933 list_del(&s->list); 3934 kfree(n); 3935 kfree(s); 3936 goto err; 3937 } 3938 up_write(&slub_lock); 3939 return s; 3940 } 3941 kfree(n); 3942 kfree(s); 3943 } 3944 err: 3945 up_write(&slub_lock); 3946 3947 if (flags & SLAB_PANIC) 3948 panic("Cannot create slabcache %s\n", name); 3949 else 3950 s = NULL; 3951 return s; 3952 } 3953 EXPORT_SYMBOL(kmem_cache_create); 3954 3955 #ifdef CONFIG_SMP 3956 /* 3957 * Use the cpu notifier to insure that the cpu slabs are flushed when 3958 * necessary. 3959 */ 3960 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3961 unsigned long action, void *hcpu) 3962 { 3963 long cpu = (long)hcpu; 3964 struct kmem_cache *s; 3965 unsigned long flags; 3966 3967 switch (action) { 3968 case CPU_UP_CANCELED: 3969 case CPU_UP_CANCELED_FROZEN: 3970 case CPU_DEAD: 3971 case CPU_DEAD_FROZEN: 3972 down_read(&slub_lock); 3973 list_for_each_entry(s, &slab_caches, list) { 3974 local_irq_save(flags); 3975 __flush_cpu_slab(s, cpu); 3976 local_irq_restore(flags); 3977 } 3978 up_read(&slub_lock); 3979 break; 3980 default: 3981 break; 3982 } 3983 return NOTIFY_OK; 3984 } 3985 3986 static struct notifier_block __cpuinitdata slab_notifier = { 3987 .notifier_call = slab_cpuup_callback 3988 }; 3989 3990 #endif 3991 3992 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3993 { 3994 struct kmem_cache *s; 3995 void *ret; 3996 3997 if (unlikely(size > SLUB_MAX_SIZE)) 3998 return kmalloc_large(size, gfpflags); 3999 4000 s = get_slab(size, gfpflags); 4001 4002 if (unlikely(ZERO_OR_NULL_PTR(s))) 4003 return s; 4004 4005 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 4006 4007 /* Honor the call site pointer we received. 
*/ 4008 trace_kmalloc(caller, ret, size, s->size, gfpflags); 4009 4010 return ret; 4011 } 4012 4013 #ifdef CONFIG_NUMA 4014 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 4015 int node, unsigned long caller) 4016 { 4017 struct kmem_cache *s; 4018 void *ret; 4019 4020 if (unlikely(size > SLUB_MAX_SIZE)) { 4021 ret = kmalloc_large_node(size, gfpflags, node); 4022 4023 trace_kmalloc_node(caller, ret, 4024 size, PAGE_SIZE << get_order(size), 4025 gfpflags, node); 4026 4027 return ret; 4028 } 4029 4030 s = get_slab(size, gfpflags); 4031 4032 if (unlikely(ZERO_OR_NULL_PTR(s))) 4033 return s; 4034 4035 ret = slab_alloc(s, gfpflags, node, caller); 4036 4037 /* Honor the call site pointer we received. */ 4038 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 4039 4040 return ret; 4041 } 4042 #endif 4043 4044 #ifdef CONFIG_SYSFS 4045 static int count_inuse(struct page *page) 4046 { 4047 return page->inuse; 4048 } 4049 4050 static int count_total(struct page *page) 4051 { 4052 return page->objects; 4053 } 4054 #endif 4055 4056 #ifdef CONFIG_SLUB_DEBUG 4057 static int validate_slab(struct kmem_cache *s, struct page *page, 4058 unsigned long *map) 4059 { 4060 void *p; 4061 void *addr = page_address(page); 4062 4063 if (!check_slab(s, page) || 4064 !on_freelist(s, page, NULL)) 4065 return 0; 4066 4067 /* Now we know that a valid freelist exists */ 4068 bitmap_zero(map, page->objects); 4069 4070 get_map(s, page, map); 4071 for_each_object(p, s, addr, page->objects) { 4072 if (test_bit(slab_index(p, s, addr), map)) 4073 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 4074 return 0; 4075 } 4076 4077 for_each_object(p, s, addr, page->objects) 4078 if (!test_bit(slab_index(p, s, addr), map)) 4079 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 4080 return 0; 4081 return 1; 4082 } 4083 4084 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 4085 unsigned long *map) 4086 { 4087 slab_lock(page); 4088 validate_slab(s, page, map); 4089 slab_unlock(page); 4090 } 4091 4092 static int validate_slab_node(struct kmem_cache *s, 4093 struct kmem_cache_node *n, unsigned long *map) 4094 { 4095 unsigned long count = 0; 4096 struct page *page; 4097 unsigned long flags; 4098 4099 spin_lock_irqsave(&n->list_lock, flags); 4100 4101 list_for_each_entry(page, &n->partial, lru) { 4102 validate_slab_slab(s, page, map); 4103 count++; 4104 } 4105 if (count != n->nr_partial) 4106 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 4107 "counter=%ld\n", s->name, count, n->nr_partial); 4108 4109 if (!(s->flags & SLAB_STORE_USER)) 4110 goto out; 4111 4112 list_for_each_entry(page, &n->full, lru) { 4113 validate_slab_slab(s, page, map); 4114 count++; 4115 } 4116 if (count != atomic_long_read(&n->nr_slabs)) 4117 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 4118 "counter=%ld\n", s->name, count, 4119 atomic_long_read(&n->nr_slabs)); 4120 4121 out: 4122 spin_unlock_irqrestore(&n->list_lock, flags); 4123 return count; 4124 } 4125 4126 static long validate_slab_cache(struct kmem_cache *s) 4127 { 4128 int node; 4129 unsigned long count = 0; 4130 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4131 sizeof(unsigned long), GFP_KERNEL); 4132 4133 if (!map) 4134 return -ENOMEM; 4135 4136 flush_all(s); 4137 for_each_node_state(node, N_NORMAL_MEMORY) { 4138 struct kmem_cache_node *n = get_node(s, node); 4139 4140 count += validate_slab_node(s, n, map); 4141 } 4142 kfree(map); 4143 return count; 4144 } 4145 /* 4146 * Generate lists of code addresses where slabcache objects 
are allocated 4147 * and freed. 4148 */ 4149 4150 struct location { 4151 unsigned long count; 4152 unsigned long addr; 4153 long long sum_time; 4154 long min_time; 4155 long max_time; 4156 long min_pid; 4157 long max_pid; 4158 DECLARE_BITMAP(cpus, NR_CPUS); 4159 nodemask_t nodes; 4160 }; 4161 4162 struct loc_track { 4163 unsigned long max; 4164 unsigned long count; 4165 struct location *loc; 4166 }; 4167 4168 static void free_loc_track(struct loc_track *t) 4169 { 4170 if (t->max) 4171 free_pages((unsigned long)t->loc, 4172 get_order(sizeof(struct location) * t->max)); 4173 } 4174 4175 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 4176 { 4177 struct location *l; 4178 int order; 4179 4180 order = get_order(sizeof(struct location) * max); 4181 4182 l = (void *)__get_free_pages(flags, order); 4183 if (!l) 4184 return 0; 4185 4186 if (t->count) { 4187 memcpy(l, t->loc, sizeof(struct location) * t->count); 4188 free_loc_track(t); 4189 } 4190 t->max = max; 4191 t->loc = l; 4192 return 1; 4193 } 4194 4195 static int add_location(struct loc_track *t, struct kmem_cache *s, 4196 const struct track *track) 4197 { 4198 long start, end, pos; 4199 struct location *l; 4200 unsigned long caddr; 4201 unsigned long age = jiffies - track->when; 4202 4203 start = -1; 4204 end = t->count; 4205 4206 for ( ; ; ) { 4207 pos = start + (end - start + 1) / 2; 4208 4209 /* 4210 * There is nothing at "end". If we end up there 4211 * we need to add something to before end. 4212 */ 4213 if (pos == end) 4214 break; 4215 4216 caddr = t->loc[pos].addr; 4217 if (track->addr == caddr) { 4218 4219 l = &t->loc[pos]; 4220 l->count++; 4221 if (track->when) { 4222 l->sum_time += age; 4223 if (age < l->min_time) 4224 l->min_time = age; 4225 if (age > l->max_time) 4226 l->max_time = age; 4227 4228 if (track->pid < l->min_pid) 4229 l->min_pid = track->pid; 4230 if (track->pid > l->max_pid) 4231 l->max_pid = track->pid; 4232 4233 cpumask_set_cpu(track->cpu, 4234 to_cpumask(l->cpus)); 4235 } 4236 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4237 return 1; 4238 } 4239 4240 if (track->addr < caddr) 4241 end = pos; 4242 else 4243 start = pos; 4244 } 4245 4246 /* 4247 * Not found. Insert new tracking element. 
4248 */ 4249 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4250 return 0; 4251 4252 l = t->loc + pos; 4253 if (pos < t->count) 4254 memmove(l + 1, l, 4255 (t->count - pos) * sizeof(struct location)); 4256 t->count++; 4257 l->count = 1; 4258 l->addr = track->addr; 4259 l->sum_time = age; 4260 l->min_time = age; 4261 l->max_time = age; 4262 l->min_pid = track->pid; 4263 l->max_pid = track->pid; 4264 cpumask_clear(to_cpumask(l->cpus)); 4265 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4266 nodes_clear(l->nodes); 4267 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4268 return 1; 4269 } 4270 4271 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4272 struct page *page, enum track_item alloc, 4273 unsigned long *map) 4274 { 4275 void *addr = page_address(page); 4276 void *p; 4277 4278 bitmap_zero(map, page->objects); 4279 get_map(s, page, map); 4280 4281 for_each_object(p, s, addr, page->objects) 4282 if (!test_bit(slab_index(p, s, addr), map)) 4283 add_location(t, s, get_track(s, p, alloc)); 4284 } 4285 4286 static int list_locations(struct kmem_cache *s, char *buf, 4287 enum track_item alloc) 4288 { 4289 int len = 0; 4290 unsigned long i; 4291 struct loc_track t = { 0, 0, NULL }; 4292 int node; 4293 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4294 sizeof(unsigned long), GFP_KERNEL); 4295 4296 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4297 GFP_TEMPORARY)) { 4298 kfree(map); 4299 return sprintf(buf, "Out of memory\n"); 4300 } 4301 /* Push back cpu slabs */ 4302 flush_all(s); 4303 4304 for_each_node_state(node, N_NORMAL_MEMORY) { 4305 struct kmem_cache_node *n = get_node(s, node); 4306 unsigned long flags; 4307 struct page *page; 4308 4309 if (!atomic_long_read(&n->nr_slabs)) 4310 continue; 4311 4312 spin_lock_irqsave(&n->list_lock, flags); 4313 list_for_each_entry(page, &n->partial, lru) 4314 process_slab(&t, s, page, alloc, map); 4315 list_for_each_entry(page, &n->full, lru) 4316 process_slab(&t, s, page, alloc, map); 4317 spin_unlock_irqrestore(&n->list_lock, flags); 4318 } 4319 4320 for (i = 0; i < t.count; i++) { 4321 struct location *l = &t.loc[i]; 4322 4323 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4324 break; 4325 len += sprintf(buf + len, "%7ld ", l->count); 4326 4327 if (l->addr) 4328 len += sprintf(buf + len, "%pS", (void *)l->addr); 4329 else 4330 len += sprintf(buf + len, "<not-available>"); 4331 4332 if (l->sum_time != l->min_time) { 4333 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4334 l->min_time, 4335 (long)div_u64(l->sum_time, l->count), 4336 l->max_time); 4337 } else 4338 len += sprintf(buf + len, " age=%ld", 4339 l->min_time); 4340 4341 if (l->min_pid != l->max_pid) 4342 len += sprintf(buf + len, " pid=%ld-%ld", 4343 l->min_pid, l->max_pid); 4344 else 4345 len += sprintf(buf + len, " pid=%ld", 4346 l->min_pid); 4347 4348 if (num_online_cpus() > 1 && 4349 !cpumask_empty(to_cpumask(l->cpus)) && 4350 len < PAGE_SIZE - 60) { 4351 len += sprintf(buf + len, " cpus="); 4352 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4353 to_cpumask(l->cpus)); 4354 } 4355 4356 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4357 len < PAGE_SIZE - 60) { 4358 len += sprintf(buf + len, " nodes="); 4359 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4360 l->nodes); 4361 } 4362 4363 len += sprintf(buf + len, "\n"); 4364 } 4365 4366 free_loc_track(&t); 4367 kfree(map); 4368 if (!t.count) 4369 len += sprintf(buf, "No data\n"); 4370 return len; 4371 } 4372 #endif 4373 
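/*
 * Illustration only, not part of slub.c: a minimal user space sketch of
 * driving the validation and call-site reporting implemented above, which
 * the sysfs code further below exposes as the per-cache "validate" and
 * "alloc_calls" files.  Writing "1" to /sys/kernel/slab/<cache>/validate
 * runs validate_slab_cache() and reading alloc_calls returns the
 * list_locations() output.  The cache name "kmalloc-64" is just an
 * example; alloc_calls needs SLAB_STORE_USER (e.g. a slub_debug=U boot)
 * or it returns -ENOSYS, and both files require root.
 */
#include <stdio.h>

int main(void)
{
	char path[128];
	char line[512];
	const char *cache = "kmalloc-64";	/* example cache name */
	FILE *f;

	/* Trigger a validation pass over every slab of the cache. */
	snprintf(path, sizeof(path), "/sys/kernel/slab/%s/validate", cache);
	f = fopen(path, "w");
	if (f) {
		fputs("1", f);
		fclose(f);
	} else {
		perror(path);
	}

	/* Dump the recorded allocation call sites, one location per line. */
	snprintf(path, sizeof(path), "/sys/kernel/slab/%s/alloc_calls", cache);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}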
4374 #ifdef SLUB_RESILIENCY_TEST 4375 static void resiliency_test(void) 4376 { 4377 u8 *p; 4378 4379 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); 4380 4381 printk(KERN_ERR "SLUB resiliency testing\n"); 4382 printk(KERN_ERR "-----------------------\n"); 4383 printk(KERN_ERR "A. Corruption after allocation\n"); 4384 4385 p = kzalloc(16, GFP_KERNEL); 4386 p[16] = 0x12; 4387 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4388 " 0x12->0x%p\n\n", p + 16); 4389 4390 validate_slab_cache(kmalloc_caches[4]); 4391 4392 /* Hmmm... The next two are dangerous */ 4393 p = kzalloc(32, GFP_KERNEL); 4394 p[32 + sizeof(void *)] = 0x34; 4395 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4396 " 0x34 -> -0x%p\n", p); 4397 printk(KERN_ERR 4398 "If allocated object is overwritten then not detectable\n\n"); 4399 4400 validate_slab_cache(kmalloc_caches[5]); 4401 p = kzalloc(64, GFP_KERNEL); 4402 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4403 *p = 0x56; 4404 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4405 p); 4406 printk(KERN_ERR 4407 "If allocated object is overwritten then not detectable\n\n"); 4408 validate_slab_cache(kmalloc_caches[6]); 4409 4410 printk(KERN_ERR "\nB. Corruption after free\n"); 4411 p = kzalloc(128, GFP_KERNEL); 4412 kfree(p); 4413 *p = 0x78; 4414 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4415 validate_slab_cache(kmalloc_caches[7]); 4416 4417 p = kzalloc(256, GFP_KERNEL); 4418 kfree(p); 4419 p[50] = 0x9a; 4420 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4421 p); 4422 validate_slab_cache(kmalloc_caches[8]); 4423 4424 p = kzalloc(512, GFP_KERNEL); 4425 kfree(p); 4426 p[512] = 0xab; 4427 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4428 validate_slab_cache(kmalloc_caches[9]); 4429 } 4430 #else 4431 #ifdef CONFIG_SYSFS 4432 static void resiliency_test(void) {}; 4433 #endif 4434 #endif 4435 4436 #ifdef CONFIG_SYSFS 4437 enum slab_stat_type { 4438 SL_ALL, /* All slabs */ 4439 SL_PARTIAL, /* Only partially allocated slabs */ 4440 SL_CPU, /* Only slabs used for cpu caches */ 4441 SL_OBJECTS, /* Determine allocated objects not slabs */ 4442 SL_TOTAL /* Determine object capacity not slabs */ 4443 }; 4444 4445 #define SO_ALL (1 << SL_ALL) 4446 #define SO_PARTIAL (1 << SL_PARTIAL) 4447 #define SO_CPU (1 << SL_CPU) 4448 #define SO_OBJECTS (1 << SL_OBJECTS) 4449 #define SO_TOTAL (1 << SL_TOTAL) 4450 4451 static ssize_t show_slab_objects(struct kmem_cache *s, 4452 char *buf, unsigned long flags) 4453 { 4454 unsigned long total = 0; 4455 int node; 4456 int x; 4457 unsigned long *nodes; 4458 unsigned long *per_cpu; 4459 4460 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4461 if (!nodes) 4462 return -ENOMEM; 4463 per_cpu = nodes + nr_node_ids; 4464 4465 if (flags & SO_CPU) { 4466 int cpu; 4467 4468 for_each_possible_cpu(cpu) { 4469 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4470 int node = ACCESS_ONCE(c->node); 4471 struct page *page; 4472 4473 if (node < 0) 4474 continue; 4475 page = ACCESS_ONCE(c->page); 4476 if (page) { 4477 if (flags & SO_TOTAL) 4478 x = page->objects; 4479 else if (flags & SO_OBJECTS) 4480 x = page->inuse; 4481 else 4482 x = 1; 4483 4484 total += x; 4485 nodes[node] += x; 4486 } 4487 page = c->partial; 4488 4489 if (page) { 4490 x = page->pobjects; 4491 total += x; 4492 nodes[node] += x; 4493 } 4494 per_cpu[node]++; 4495 } 4496 } 4497 4498 lock_memory_hotplug(); 4499 #ifdef 
CONFIG_SLUB_DEBUG 4500 if (flags & SO_ALL) { 4501 for_each_node_state(node, N_NORMAL_MEMORY) { 4502 struct kmem_cache_node *n = get_node(s, node); 4503 4504 if (flags & SO_TOTAL) 4505 x = atomic_long_read(&n->total_objects); 4506 else if (flags & SO_OBJECTS) 4507 x = atomic_long_read(&n->total_objects) - 4508 count_partial(n, count_free); 4509 4510 else 4511 x = atomic_long_read(&n->nr_slabs); 4512 total += x; 4513 nodes[node] += x; 4514 } 4515 4516 } else 4517 #endif 4518 if (flags & SO_PARTIAL) { 4519 for_each_node_state(node, N_NORMAL_MEMORY) { 4520 struct kmem_cache_node *n = get_node(s, node); 4521 4522 if (flags & SO_TOTAL) 4523 x = count_partial(n, count_total); 4524 else if (flags & SO_OBJECTS) 4525 x = count_partial(n, count_inuse); 4526 else 4527 x = n->nr_partial; 4528 total += x; 4529 nodes[node] += x; 4530 } 4531 } 4532 x = sprintf(buf, "%lu", total); 4533 #ifdef CONFIG_NUMA 4534 for_each_node_state(node, N_NORMAL_MEMORY) 4535 if (nodes[node]) 4536 x += sprintf(buf + x, " N%d=%lu", 4537 node, nodes[node]); 4538 #endif 4539 unlock_memory_hotplug(); 4540 kfree(nodes); 4541 return x + sprintf(buf + x, "\n"); 4542 } 4543 4544 #ifdef CONFIG_SLUB_DEBUG 4545 static int any_slab_objects(struct kmem_cache *s) 4546 { 4547 int node; 4548 4549 for_each_online_node(node) { 4550 struct kmem_cache_node *n = get_node(s, node); 4551 4552 if (!n) 4553 continue; 4554 4555 if (atomic_long_read(&n->total_objects)) 4556 return 1; 4557 } 4558 return 0; 4559 } 4560 #endif 4561 4562 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4563 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4564 4565 struct slab_attribute { 4566 struct attribute attr; 4567 ssize_t (*show)(struct kmem_cache *s, char *buf); 4568 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4569 }; 4570 4571 #define SLAB_ATTR_RO(_name) \ 4572 static struct slab_attribute _name##_attr = \ 4573 __ATTR(_name, 0400, _name##_show, NULL) 4574 4575 #define SLAB_ATTR(_name) \ 4576 static struct slab_attribute _name##_attr = \ 4577 __ATTR(_name, 0600, _name##_show, _name##_store) 4578 4579 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4580 { 4581 return sprintf(buf, "%d\n", s->size); 4582 } 4583 SLAB_ATTR_RO(slab_size); 4584 4585 static ssize_t align_show(struct kmem_cache *s, char *buf) 4586 { 4587 return sprintf(buf, "%d\n", s->align); 4588 } 4589 SLAB_ATTR_RO(align); 4590 4591 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4592 { 4593 return sprintf(buf, "%d\n", s->objsize); 4594 } 4595 SLAB_ATTR_RO(object_size); 4596 4597 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4598 { 4599 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4600 } 4601 SLAB_ATTR_RO(objs_per_slab); 4602 4603 static ssize_t order_store(struct kmem_cache *s, 4604 const char *buf, size_t length) 4605 { 4606 unsigned long order; 4607 int err; 4608 4609 err = strict_strtoul(buf, 10, &order); 4610 if (err) 4611 return err; 4612 4613 if (order > slub_max_order || order < slub_min_order) 4614 return -EINVAL; 4615 4616 calculate_sizes(s, order); 4617 return length; 4618 } 4619 4620 static ssize_t order_show(struct kmem_cache *s, char *buf) 4621 { 4622 return sprintf(buf, "%d\n", oo_order(s->oo)); 4623 } 4624 SLAB_ATTR(order); 4625 4626 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4627 { 4628 return sprintf(buf, "%lu\n", s->min_partial); 4629 } 4630 4631 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4632 size_t length) 4633 { 4634 
unsigned long min; 4635 int err; 4636 4637 err = strict_strtoul(buf, 10, &min); 4638 if (err) 4639 return err; 4640 4641 set_min_partial(s, min); 4642 return length; 4643 } 4644 SLAB_ATTR(min_partial); 4645 4646 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 4647 { 4648 return sprintf(buf, "%u\n", s->cpu_partial); 4649 } 4650 4651 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4652 size_t length) 4653 { 4654 unsigned long objects; 4655 int err; 4656 4657 err = strict_strtoul(buf, 10, &objects); 4658 if (err) 4659 return err; 4660 if (objects && kmem_cache_debug(s)) 4661 return -EINVAL; 4662 4663 s->cpu_partial = objects; 4664 flush_all(s); 4665 return length; 4666 } 4667 SLAB_ATTR(cpu_partial); 4668 4669 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4670 { 4671 if (!s->ctor) 4672 return 0; 4673 return sprintf(buf, "%pS\n", s->ctor); 4674 } 4675 SLAB_ATTR_RO(ctor); 4676 4677 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4678 { 4679 return sprintf(buf, "%d\n", s->refcount - 1); 4680 } 4681 SLAB_ATTR_RO(aliases); 4682 4683 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4684 { 4685 return show_slab_objects(s, buf, SO_PARTIAL); 4686 } 4687 SLAB_ATTR_RO(partial); 4688 4689 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4690 { 4691 return show_slab_objects(s, buf, SO_CPU); 4692 } 4693 SLAB_ATTR_RO(cpu_slabs); 4694 4695 static ssize_t objects_show(struct kmem_cache *s, char *buf) 4696 { 4697 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4698 } 4699 SLAB_ATTR_RO(objects); 4700 4701 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4702 { 4703 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4704 } 4705 SLAB_ATTR_RO(objects_partial); 4706 4707 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 4708 { 4709 int objects = 0; 4710 int pages = 0; 4711 int cpu; 4712 int len; 4713 4714 for_each_online_cpu(cpu) { 4715 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; 4716 4717 if (page) { 4718 pages += page->pages; 4719 objects += page->pobjects; 4720 } 4721 } 4722 4723 len = sprintf(buf, "%d(%d)", objects, pages); 4724 4725 #ifdef CONFIG_SMP 4726 for_each_online_cpu(cpu) { 4727 struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; 4728 4729 if (page && len < PAGE_SIZE - 20) 4730 len += sprintf(buf + len, " C%d=%d(%d)", cpu, 4731 page->pobjects, page->pages); 4732 } 4733 #endif 4734 return len + sprintf(buf + len, "\n"); 4735 } 4736 SLAB_ATTR_RO(slabs_cpu_partial); 4737 4738 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4739 { 4740 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4741 } 4742 4743 static ssize_t reclaim_account_store(struct kmem_cache *s, 4744 const char *buf, size_t length) 4745 { 4746 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4747 if (buf[0] == '1') 4748 s->flags |= SLAB_RECLAIM_ACCOUNT; 4749 return length; 4750 } 4751 SLAB_ATTR(reclaim_account); 4752 4753 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4754 { 4755 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4756 } 4757 SLAB_ATTR_RO(hwcache_align); 4758 4759 #ifdef CONFIG_ZONE_DMA 4760 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4761 { 4762 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4763 } 4764 SLAB_ATTR_RO(cache_dma); 4765 #endif 4766 4767 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4768 { 4769 return sprintf(buf, "%d\n", !!(s->flags & 
SLAB_DESTROY_BY_RCU)); 4770 } 4771 SLAB_ATTR_RO(destroy_by_rcu); 4772 4773 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4774 { 4775 return sprintf(buf, "%d\n", s->reserved); 4776 } 4777 SLAB_ATTR_RO(reserved); 4778 4779 #ifdef CONFIG_SLUB_DEBUG 4780 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4781 { 4782 return show_slab_objects(s, buf, SO_ALL); 4783 } 4784 SLAB_ATTR_RO(slabs); 4785 4786 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4787 { 4788 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4789 } 4790 SLAB_ATTR_RO(total_objects); 4791 4792 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4793 { 4794 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4795 } 4796 4797 static ssize_t sanity_checks_store(struct kmem_cache *s, 4798 const char *buf, size_t length) 4799 { 4800 s->flags &= ~SLAB_DEBUG_FREE; 4801 if (buf[0] == '1') { 4802 s->flags &= ~__CMPXCHG_DOUBLE; 4803 s->flags |= SLAB_DEBUG_FREE; 4804 } 4805 return length; 4806 } 4807 SLAB_ATTR(sanity_checks); 4808 4809 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4810 { 4811 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4812 } 4813 4814 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4815 size_t length) 4816 { 4817 s->flags &= ~SLAB_TRACE; 4818 if (buf[0] == '1') { 4819 s->flags &= ~__CMPXCHG_DOUBLE; 4820 s->flags |= SLAB_TRACE; 4821 } 4822 return length; 4823 } 4824 SLAB_ATTR(trace); 4825 4826 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4827 { 4828 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4829 } 4830 4831 static ssize_t red_zone_store(struct kmem_cache *s, 4832 const char *buf, size_t length) 4833 { 4834 if (any_slab_objects(s)) 4835 return -EBUSY; 4836 4837 s->flags &= ~SLAB_RED_ZONE; 4838 if (buf[0] == '1') { 4839 s->flags &= ~__CMPXCHG_DOUBLE; 4840 s->flags |= SLAB_RED_ZONE; 4841 } 4842 calculate_sizes(s, -1); 4843 return length; 4844 } 4845 SLAB_ATTR(red_zone); 4846 4847 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4848 { 4849 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4850 } 4851 4852 static ssize_t poison_store(struct kmem_cache *s, 4853 const char *buf, size_t length) 4854 { 4855 if (any_slab_objects(s)) 4856 return -EBUSY; 4857 4858 s->flags &= ~SLAB_POISON; 4859 if (buf[0] == '1') { 4860 s->flags &= ~__CMPXCHG_DOUBLE; 4861 s->flags |= SLAB_POISON; 4862 } 4863 calculate_sizes(s, -1); 4864 return length; 4865 } 4866 SLAB_ATTR(poison); 4867 4868 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4869 { 4870 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 4871 } 4872 4873 static ssize_t store_user_store(struct kmem_cache *s, 4874 const char *buf, size_t length) 4875 { 4876 if (any_slab_objects(s)) 4877 return -EBUSY; 4878 4879 s->flags &= ~SLAB_STORE_USER; 4880 if (buf[0] == '1') { 4881 s->flags &= ~__CMPXCHG_DOUBLE; 4882 s->flags |= SLAB_STORE_USER; 4883 } 4884 calculate_sizes(s, -1); 4885 return length; 4886 } 4887 SLAB_ATTR(store_user); 4888 4889 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4890 { 4891 return 0; 4892 } 4893 4894 static ssize_t validate_store(struct kmem_cache *s, 4895 const char *buf, size_t length) 4896 { 4897 int ret = -EINVAL; 4898 4899 if (buf[0] == '1') { 4900 ret = validate_slab_cache(s); 4901 if (ret >= 0) 4902 ret = length; 4903 } 4904 return ret; 4905 } 4906 SLAB_ATTR(validate); 4907 4908 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4909 { 4910 if 
(!(s->flags & SLAB_STORE_USER)) 4911 return -ENOSYS; 4912 return list_locations(s, buf, TRACK_ALLOC); 4913 } 4914 SLAB_ATTR_RO(alloc_calls); 4915 4916 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4917 { 4918 if (!(s->flags & SLAB_STORE_USER)) 4919 return -ENOSYS; 4920 return list_locations(s, buf, TRACK_FREE); 4921 } 4922 SLAB_ATTR_RO(free_calls); 4923 #endif /* CONFIG_SLUB_DEBUG */ 4924 4925 #ifdef CONFIG_FAILSLAB 4926 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4927 { 4928 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4929 } 4930 4931 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4932 size_t length) 4933 { 4934 s->flags &= ~SLAB_FAILSLAB; 4935 if (buf[0] == '1') 4936 s->flags |= SLAB_FAILSLAB; 4937 return length; 4938 } 4939 SLAB_ATTR(failslab); 4940 #endif 4941 4942 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4943 { 4944 return 0; 4945 } 4946 4947 static ssize_t shrink_store(struct kmem_cache *s, 4948 const char *buf, size_t length) 4949 { 4950 if (buf[0] == '1') { 4951 int rc = kmem_cache_shrink(s); 4952 4953 if (rc) 4954 return rc; 4955 } else 4956 return -EINVAL; 4957 return length; 4958 } 4959 SLAB_ATTR(shrink); 4960 4961 #ifdef CONFIG_NUMA 4962 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4963 { 4964 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4965 } 4966 4967 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4968 const char *buf, size_t length) 4969 { 4970 unsigned long ratio; 4971 int err; 4972 4973 err = strict_strtoul(buf, 10, &ratio); 4974 if (err) 4975 return err; 4976 4977 if (ratio <= 100) 4978 s->remote_node_defrag_ratio = ratio * 10; 4979 4980 return length; 4981 } 4982 SLAB_ATTR(remote_node_defrag_ratio); 4983 #endif 4984 4985 #ifdef CONFIG_SLUB_STATS 4986 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 4987 { 4988 unsigned long sum = 0; 4989 int cpu; 4990 int len; 4991 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4992 4993 if (!data) 4994 return -ENOMEM; 4995 4996 for_each_online_cpu(cpu) { 4997 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4998 4999 data[cpu] = x; 5000 sum += x; 5001 } 5002 5003 len = sprintf(buf, "%lu", sum); 5004 5005 #ifdef CONFIG_SMP 5006 for_each_online_cpu(cpu) { 5007 if (data[cpu] && len < PAGE_SIZE - 20) 5008 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 5009 } 5010 #endif 5011 kfree(data); 5012 return len + sprintf(buf + len, "\n"); 5013 } 5014 5015 static void clear_stat(struct kmem_cache *s, enum stat_item si) 5016 { 5017 int cpu; 5018 5019 for_each_online_cpu(cpu) 5020 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 5021 } 5022 5023 #define STAT_ATTR(si, text) \ 5024 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 5025 { \ 5026 return show_stat(s, buf, si); \ 5027 } \ 5028 static ssize_t text##_store(struct kmem_cache *s, \ 5029 const char *buf, size_t length) \ 5030 { \ 5031 if (buf[0] != '0') \ 5032 return -EINVAL; \ 5033 clear_stat(s, si); \ 5034 return length; \ 5035 } \ 5036 SLAB_ATTR(text); \ 5037 5038 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 5039 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 5040 STAT_ATTR(FREE_FASTPATH, free_fastpath); 5041 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 5042 STAT_ATTR(FREE_FROZEN, free_frozen); 5043 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 5044 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 5045 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 5046 STAT_ATTR(ALLOC_SLAB, 
alloc_slab); 5047 STAT_ATTR(ALLOC_REFILL, alloc_refill); 5048 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 5049 STAT_ATTR(FREE_SLAB, free_slab); 5050 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 5051 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 5052 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 5053 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 5054 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 5055 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 5056 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 5057 STAT_ATTR(ORDER_FALLBACK, order_fallback); 5058 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 5059 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 5060 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 5061 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 5062 #endif 5063 5064 static struct attribute *slab_attrs[] = { 5065 &slab_size_attr.attr, 5066 &object_size_attr.attr, 5067 &objs_per_slab_attr.attr, 5068 &order_attr.attr, 5069 &min_partial_attr.attr, 5070 &cpu_partial_attr.attr, 5071 &objects_attr.attr, 5072 &objects_partial_attr.attr, 5073 &partial_attr.attr, 5074 &cpu_slabs_attr.attr, 5075 &ctor_attr.attr, 5076 &aliases_attr.attr, 5077 &align_attr.attr, 5078 &hwcache_align_attr.attr, 5079 &reclaim_account_attr.attr, 5080 &destroy_by_rcu_attr.attr, 5081 &shrink_attr.attr, 5082 &reserved_attr.attr, 5083 &slabs_cpu_partial_attr.attr, 5084 #ifdef CONFIG_SLUB_DEBUG 5085 &total_objects_attr.attr, 5086 &slabs_attr.attr, 5087 &sanity_checks_attr.attr, 5088 &trace_attr.attr, 5089 &red_zone_attr.attr, 5090 &poison_attr.attr, 5091 &store_user_attr.attr, 5092 &validate_attr.attr, 5093 &alloc_calls_attr.attr, 5094 &free_calls_attr.attr, 5095 #endif 5096 #ifdef CONFIG_ZONE_DMA 5097 &cache_dma_attr.attr, 5098 #endif 5099 #ifdef CONFIG_NUMA 5100 &remote_node_defrag_ratio_attr.attr, 5101 #endif 5102 #ifdef CONFIG_SLUB_STATS 5103 &alloc_fastpath_attr.attr, 5104 &alloc_slowpath_attr.attr, 5105 &free_fastpath_attr.attr, 5106 &free_slowpath_attr.attr, 5107 &free_frozen_attr.attr, 5108 &free_add_partial_attr.attr, 5109 &free_remove_partial_attr.attr, 5110 &alloc_from_partial_attr.attr, 5111 &alloc_slab_attr.attr, 5112 &alloc_refill_attr.attr, 5113 &alloc_node_mismatch_attr.attr, 5114 &free_slab_attr.attr, 5115 &cpuslab_flush_attr.attr, 5116 &deactivate_full_attr.attr, 5117 &deactivate_empty_attr.attr, 5118 &deactivate_to_head_attr.attr, 5119 &deactivate_to_tail_attr.attr, 5120 &deactivate_remote_frees_attr.attr, 5121 &deactivate_bypass_attr.attr, 5122 &order_fallback_attr.attr, 5123 &cmpxchg_double_fail_attr.attr, 5124 &cmpxchg_double_cpu_fail_attr.attr, 5125 &cpu_partial_alloc_attr.attr, 5126 &cpu_partial_free_attr.attr, 5127 #endif 5128 #ifdef CONFIG_FAILSLAB 5129 &failslab_attr.attr, 5130 #endif 5131 5132 NULL 5133 }; 5134 5135 static struct attribute_group slab_attr_group = { 5136 .attrs = slab_attrs, 5137 }; 5138 5139 static ssize_t slab_attr_show(struct kobject *kobj, 5140 struct attribute *attr, 5141 char *buf) 5142 { 5143 struct slab_attribute *attribute; 5144 struct kmem_cache *s; 5145 int err; 5146 5147 attribute = to_slab_attr(attr); 5148 s = to_slab(kobj); 5149 5150 if (!attribute->show) 5151 return -EIO; 5152 5153 err = attribute->show(s, buf); 5154 5155 return err; 5156 } 5157 5158 static ssize_t slab_attr_store(struct kobject *kobj, 5159 struct attribute *attr, 5160 const char *buf, size_t len) 5161 { 5162 struct slab_attribute *attribute; 5163 struct kmem_cache *s; 5164 int err; 5165 5166 attribute = to_slab_attr(attr); 5167 s = to_slab(kobj); 
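	/*
	 * As in slab_attr_show() above: to_slab_attr() and to_slab() are
	 * container_of() helpers that recover our wrapper structures from
	 * the generic sysfs attribute and kobject embedded in them.
	 */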
5168 5169 if (!attribute->store) 5170 return -EIO; 5171 5172 err = attribute->store(s, buf, len); 5173 5174 return err; 5175 } 5176 5177 static void kmem_cache_release(struct kobject *kobj) 5178 { 5179 struct kmem_cache *s = to_slab(kobj); 5180 5181 kfree(s->name); 5182 kfree(s); 5183 } 5184 5185 static const struct sysfs_ops slab_sysfs_ops = { 5186 .show = slab_attr_show, 5187 .store = slab_attr_store, 5188 }; 5189 5190 static struct kobj_type slab_ktype = { 5191 .sysfs_ops = &slab_sysfs_ops, 5192 .release = kmem_cache_release 5193 }; 5194 5195 static int uevent_filter(struct kset *kset, struct kobject *kobj) 5196 { 5197 struct kobj_type *ktype = get_ktype(kobj); 5198 5199 if (ktype == &slab_ktype) 5200 return 1; 5201 return 0; 5202 } 5203 5204 static const struct kset_uevent_ops slab_uevent_ops = { 5205 .filter = uevent_filter, 5206 }; 5207 5208 static struct kset *slab_kset; 5209 5210 #define ID_STR_LENGTH 64 5211 5212 /* Create a unique string id for a slab cache: 5213 * 5214 * Format :[flags-]size 5215 */ 5216 static char *create_unique_id(struct kmem_cache *s) 5217 { 5218 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 5219 char *p = name; 5220 5221 BUG_ON(!name); 5222 5223 *p++ = ':'; 5224 /* 5225 * First flags affecting slabcache operations. We will only 5226 * get here for aliasable slabs so we do not need to support 5227 * too many flags. The flags here must cover all flags that 5228 * are matched during merging to guarantee that the id is 5229 * unique. 5230 */ 5231 if (s->flags & SLAB_CACHE_DMA) 5232 *p++ = 'd'; 5233 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5234 *p++ = 'a'; 5235 if (s->flags & SLAB_DEBUG_FREE) 5236 *p++ = 'F'; 5237 if (!(s->flags & SLAB_NOTRACK)) 5238 *p++ = 't'; 5239 if (p != name + 1) 5240 *p++ = '-'; 5241 p += sprintf(p, "%07d", s->size); 5242 BUG_ON(p > name + ID_STR_LENGTH - 1); 5243 return name; 5244 } 5245 5246 static int sysfs_slab_add(struct kmem_cache *s) 5247 { 5248 int err; 5249 const char *name; 5250 int unmergeable; 5251 5252 if (slab_state < SYSFS) 5253 /* Defer until later */ 5254 return 0; 5255 5256 unmergeable = slab_unmergeable(s); 5257 if (unmergeable) { 5258 /* 5259 * Slabcache can never be merged so we can use the name proper. 5260 * This is typically the case for debug situations. In that 5261 * case we can catch duplicate names easily. 5262 */ 5263 sysfs_remove_link(&slab_kset->kobj, s->name); 5264 name = s->name; 5265 } else { 5266 /* 5267 * Create a unique name for the slab as a target 5268 * for the symlinks. 5269 */ 5270 name = create_unique_id(s); 5271 } 5272 5273 s->kobj.kset = slab_kset; 5274 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5275 if (err) { 5276 kobject_put(&s->kobj); 5277 return err; 5278 } 5279 5280 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5281 if (err) { 5282 kobject_del(&s->kobj); 5283 kobject_put(&s->kobj); 5284 return err; 5285 } 5286 kobject_uevent(&s->kobj, KOBJ_ADD); 5287 if (!unmergeable) { 5288 /* Setup first alias */ 5289 sysfs_slab_alias(s, s->name); 5290 kfree(name); 5291 } 5292 return 0; 5293 } 5294 5295 static void sysfs_slab_remove(struct kmem_cache *s) 5296 { 5297 if (slab_state < SYSFS) 5298 /* 5299 * Sysfs has not been setup yet so no need to remove the 5300 * cache from sysfs. 5301 */ 5302 return; 5303 5304 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5305 kobject_del(&s->kobj); 5306 kobject_put(&s->kobj); 5307 } 5308 5309 /* 5310 * Need to buffer aliases during bootup until sysfs becomes 5311 * available lest we lose that information. 
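 * Buffered aliases are queued on alias_list and replayed by
 * slab_sysfs_init() once the slab kset has been created.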
5312 */ 5313 struct saved_alias { 5314 struct kmem_cache *s; 5315 const char *name; 5316 struct saved_alias *next; 5317 }; 5318 5319 static struct saved_alias *alias_list; 5320 5321 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5322 { 5323 struct saved_alias *al; 5324 5325 if (slab_state == SYSFS) { 5326 /* 5327 * If we have a leftover link then remove it. 5328 */ 5329 sysfs_remove_link(&slab_kset->kobj, name); 5330 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 5331 } 5332 5333 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 5334 if (!al) 5335 return -ENOMEM; 5336 5337 al->s = s; 5338 al->name = name; 5339 al->next = alias_list; 5340 alias_list = al; 5341 return 0; 5342 } 5343 5344 static int __init slab_sysfs_init(void) 5345 { 5346 struct kmem_cache *s; 5347 int err; 5348 5349 down_write(&slub_lock); 5350 5351 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5352 if (!slab_kset) { 5353 up_write(&slub_lock); 5354 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5355 return -ENOSYS; 5356 } 5357 5358 slab_state = SYSFS; 5359 5360 list_for_each_entry(s, &slab_caches, list) { 5361 err = sysfs_slab_add(s); 5362 if (err) 5363 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5364 " to sysfs\n", s->name); 5365 } 5366 5367 while (alias_list) { 5368 struct saved_alias *al = alias_list; 5369 5370 alias_list = alias_list->next; 5371 err = sysfs_slab_alias(al->s, al->name); 5372 if (err) 5373 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5374 " %s to sysfs\n", s->name); 5375 kfree(al); 5376 } 5377 5378 up_write(&slub_lock); 5379 resiliency_test(); 5380 return 0; 5381 } 5382 5383 __initcall(slab_sysfs_init); 5384 #endif /* CONFIG_SYSFS */ 5385 5386 /* 5387 * The /proc/slabinfo ABI 5388 */ 5389 #ifdef CONFIG_SLABINFO 5390 static void print_slabinfo_header(struct seq_file *m) 5391 { 5392 seq_puts(m, "slabinfo - version: 2.1\n"); 5393 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5394 "<objperslab> <pagesperslab>"); 5395 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5396 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5397 seq_putc(m, '\n'); 5398 } 5399 5400 static void *s_start(struct seq_file *m, loff_t *pos) 5401 { 5402 loff_t n = *pos; 5403 5404 down_read(&slub_lock); 5405 if (!n) 5406 print_slabinfo_header(m); 5407 5408 return seq_list_start(&slab_caches, *pos); 5409 } 5410 5411 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 5412 { 5413 return seq_list_next(p, &slab_caches, pos); 5414 } 5415 5416 static void s_stop(struct seq_file *m, void *p) 5417 { 5418 up_read(&slub_lock); 5419 } 5420 5421 static int s_show(struct seq_file *m, void *p) 5422 { 5423 unsigned long nr_partials = 0; 5424 unsigned long nr_slabs = 0; 5425 unsigned long nr_inuse = 0; 5426 unsigned long nr_objs = 0; 5427 unsigned long nr_free = 0; 5428 struct kmem_cache *s; 5429 int node; 5430 5431 s = list_entry(p, struct kmem_cache, list); 5432 5433 for_each_online_node(node) { 5434 struct kmem_cache_node *n = get_node(s, node); 5435 5436 if (!n) 5437 continue; 5438 5439 nr_partials += n->nr_partial; 5440 nr_slabs += atomic_long_read(&n->nr_slabs); 5441 nr_objs += atomic_long_read(&n->total_objects); 5442 nr_free += count_partial(n, count_free); 5443 } 5444 5445 nr_inuse = nr_objs - nr_free; 5446 5447 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 5448 nr_objs, s->size, oo_objects(s->oo), 5449 (1 << oo_order(s->oo))); 5450 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 5451 
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 5452 0UL); 5453 seq_putc(m, '\n'); 5454 return 0; 5455 } 5456 5457 static const struct seq_operations slabinfo_op = { 5458 .start = s_start, 5459 .next = s_next, 5460 .stop = s_stop, 5461 .show = s_show, 5462 }; 5463 5464 static int slabinfo_open(struct inode *inode, struct file *file) 5465 { 5466 return seq_open(file, &slabinfo_op); 5467 } 5468 5469 static const struct file_operations proc_slabinfo_operations = { 5470 .open = slabinfo_open, 5471 .read = seq_read, 5472 .llseek = seq_lseek, 5473 .release = seq_release, 5474 }; 5475 5476 static int __init slab_proc_init(void) 5477 { 5478 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5479 return 0; 5480 } 5481 module_init(slab_proc_init); 5482 #endif /* CONFIG_SLABINFO */ 5483
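/*
 * Illustration only, not part of slub.c: a small user space sketch of
 * consuming the /proc/slabinfo ABI emitted above.  It skips the two
 * header lines written by print_slabinfo_header() and pulls the first
 * six columns (name, active_objs, num_objs, objsize, objperslab,
 * pagesperslab) out of each line produced by s_show().
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f) {
		perror("/proc/slabinfo");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		char name[64];
		unsigned long active, total;
		unsigned int objsize, objs_per_slab;
		int pages_per_slab;

		/* The version line and the "# name ..." header carry no data. */
		if (line[0] == '#' || !strncmp(line, "slabinfo", 8))
			continue;

		if (sscanf(line, "%63s %lu %lu %u %u %d", name, &active,
			   &total, &objsize, &objs_per_slab,
			   &pages_per_slab) == 6)
			printf("%-20s %8lu / %-8lu objects (%u bytes, %u per %d-page slab)\n",
			       name, active, total, objsize, objs_per_slab,
			       pages_per_slab);
	}
	fclose(f);
	return 0;
}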