/*
 * SLUB: A slab allocator that limits cache line use instead of queuing
 * objects in per cpu and per node lists.
 *
 * The allocator synchronizes using per slab locks or atomic operations
 * and only uses a centralized lock to manage a pool of partial slabs.
 *
 * (C) 2007 SGI, Christoph Lameter
 * (C) 2011 Linux Foundation, Christoph Lameter
 */

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>

#include <trace/events/kmem.h>

#include "internal.h"

/*
 * Lock order:
 *   1. slab_mutex (Global Mutex)
 *   2. node->list_lock
 *   3. slab_lock(page) (Only on some arches and for debugging)
 *
 *   slab_mutex
 *
 *   The role of the slab_mutex is to protect the list of all the slabs
 *   and to synchronize major metadata changes to slab cache structures.
 *
 *   The slab_lock is only used for debugging and on arches that do not
 *   have the ability to do a cmpxchg_double. It only protects the second
 *   double word in the page struct. Meaning
 *	A. page->freelist	-> List of free objects in a page
 *	B. page->counters	-> Counters of objects
 *	C. page->frozen		-> frozen state
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list. The processor that froze the slab is the one who can
 *   perform list operations on the page. Other processors may put objects
 *   onto the freelist but the processor that froze the slab is the only
 *   one that can retrieve the objects from the page's freelist.
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists nor can the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. E.g.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *   Interrupts are disabled during allocation and deallocation in order to
 *   make the slab allocator safe to use in the context of an irq. In addition
 *   interrupts are disabled to ensure that the processor does not change
 *   while handling per_cpu slabs, due to kernel preemption.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty. Teardown and setup are
 * minimal so we rely on the page allocator's per cpu caches for
 * fast frees and allocs.
 *
 * Overloading of page flags that are otherwise used for LRU management.
 *
 * PageActive		The slab is frozen and exempt from list processing.
 *			This means that the slab is dedicated to a purpose
 *			such as satisfying allocations for a specific
 *			processor. Objects may be freed in the slab while
 *			it is frozen but slab_free will then skip the usual
 *			list operations. It is up to the processor holding
 *			the slab to integrate the slab into the slab lists
 *			when the slab is no longer needed.
 *
 *			One use of this flag is to mark slabs that are
 *			used for allocations. Then such a slab becomes a cpu
 *			slab. The cpu slab may be equipped with an additional
 *			freelist that allows lockless access to
 *			free objects in addition to the regular freelist
 *			that requires the slab lock.
 *
 * PageError		Slab requires special handling due to debug
 *			options set. This moves slab handling out of
 *			the fast path and disables lockless freelists.
 */

static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
	return 0;
#endif
}

/*
 * Issues still to be resolved:
 *
 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
 *
 * - Variable sizing of the per node arrays
 */

/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST

/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in them.
 */
#define MAX_PARTIAL 10

#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Debugging flags that require metadata to be stored in the slab. These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA | SLAB_NOTRACK)

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since page.objects is u15 */

/* Internal SLUB flags */
#define __OBJECT_POISON		0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */

#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif

/*
 * Tracking user of a slab.
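 */

/*
 * Illustrative sketch only, not used by SLUB itself: the external slab
 * interface that the design described at the top of this file serves.
 * The cache name, object size and flags below are arbitrary example
 * values chosen for the sketch.
 */
static inline void slub_usage_sketch(void)
{
	struct kmem_cache *cache;
	void *object;

	cache = kmem_cache_create("sketch-cache", 64, 0,
					SLAB_HWCACHE_ALIGN, NULL);
	if (!cache)
		return;

	/* Normally served locklessly from the current cpu slab */
	object = kmem_cache_alloc(cache, GFP_KERNEL);
	if (object)
		/* slab_free() returns the object, possibly to a partial slab */
		kmem_cache_free(cache, object);

	kmem_cache_destroy(cache);
}

/*
 * The record kept per object when SLAB_STORE_USER is set: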
186 */ 187 #define TRACK_ADDRS_COUNT 16 188 struct track { 189 unsigned long addr; /* Called from address */ 190 #ifdef CONFIG_STACKTRACE 191 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ 192 #endif 193 int cpu; /* Was running on cpu */ 194 int pid; /* Pid context */ 195 unsigned long when; /* When did the operation occur */ 196 }; 197 198 enum track_item { TRACK_ALLOC, TRACK_FREE }; 199 200 #ifdef CONFIG_SYSFS 201 static int sysfs_slab_add(struct kmem_cache *); 202 static int sysfs_slab_alias(struct kmem_cache *, const char *); 203 static void sysfs_slab_remove(struct kmem_cache *); 204 static void memcg_propagate_slab_attrs(struct kmem_cache *s); 205 #else 206 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 207 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 208 { return 0; } 209 static inline void sysfs_slab_remove(struct kmem_cache *s) { } 210 211 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } 212 #endif 213 214 static inline void stat(const struct kmem_cache *s, enum stat_item si) 215 { 216 #ifdef CONFIG_SLUB_STATS 217 __this_cpu_inc(s->cpu_slab->stat[si]); 218 #endif 219 } 220 221 /******************************************************************** 222 * Core slab cache functions 223 *******************************************************************/ 224 225 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 226 { 227 return s->node[node]; 228 } 229 230 /* Verify that a pointer has an address that is valid within a slab page */ 231 static inline int check_valid_pointer(struct kmem_cache *s, 232 struct page *page, const void *object) 233 { 234 void *base; 235 236 if (!object) 237 return 1; 238 239 base = page_address(page); 240 if (object < base || object >= base + page->objects * s->size || 241 (object - base) % s->size) { 242 return 0; 243 } 244 245 return 1; 246 } 247 248 static inline void *get_freepointer(struct kmem_cache *s, void *object) 249 { 250 return *(void **)(object + s->offset); 251 } 252 253 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 254 { 255 prefetch(object + s->offset); 256 } 257 258 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 259 { 260 void *p; 261 262 #ifdef CONFIG_DEBUG_PAGEALLOC 263 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); 264 #else 265 p = get_freepointer(s, object); 266 #endif 267 return p; 268 } 269 270 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 271 { 272 *(void **)(object + s->offset) = fp; 273 } 274 275 /* Loop over all objects in a slab */ 276 #define for_each_object(__p, __s, __addr, __objects) \ 277 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 278 __p += (__s)->size) 279 280 /* Determine object index from a given position */ 281 static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 282 { 283 return (p - addr) / s->size; 284 } 285 286 static inline size_t slab_ksize(const struct kmem_cache *s) 287 { 288 #ifdef CONFIG_SLUB_DEBUG 289 /* 290 * Debugging requires use of the padding between object 291 * and whatever may come after it. 292 */ 293 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 294 return s->object_size; 295 296 #endif 297 /* 298 * If we have the need to store the freelist pointer 299 * back there or track user information then we can 300 * only use the space before that information. 
301 */ 302 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 303 return s->inuse; 304 /* 305 * Else we can use all the padding etc for the allocation 306 */ 307 return s->size; 308 } 309 310 static inline int order_objects(int order, unsigned long size, int reserved) 311 { 312 return ((PAGE_SIZE << order) - reserved) / size; 313 } 314 315 static inline struct kmem_cache_order_objects oo_make(int order, 316 unsigned long size, int reserved) 317 { 318 struct kmem_cache_order_objects x = { 319 (order << OO_SHIFT) + order_objects(order, size, reserved) 320 }; 321 322 return x; 323 } 324 325 static inline int oo_order(struct kmem_cache_order_objects x) 326 { 327 return x.x >> OO_SHIFT; 328 } 329 330 static inline int oo_objects(struct kmem_cache_order_objects x) 331 { 332 return x.x & OO_MASK; 333 } 334 335 /* 336 * Per slab locking using the pagelock 337 */ 338 static __always_inline void slab_lock(struct page *page) 339 { 340 bit_spin_lock(PG_locked, &page->flags); 341 } 342 343 static __always_inline void slab_unlock(struct page *page) 344 { 345 __bit_spin_unlock(PG_locked, &page->flags); 346 } 347 348 /* Interrupts must be disabled (for the fallback code to work right) */ 349 static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 350 void *freelist_old, unsigned long counters_old, 351 void *freelist_new, unsigned long counters_new, 352 const char *n) 353 { 354 VM_BUG_ON(!irqs_disabled()); 355 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 356 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 357 if (s->flags & __CMPXCHG_DOUBLE) { 358 if (cmpxchg_double(&page->freelist, &page->counters, 359 freelist_old, counters_old, 360 freelist_new, counters_new)) 361 return 1; 362 } else 363 #endif 364 { 365 slab_lock(page); 366 if (page->freelist == freelist_old && page->counters == counters_old) { 367 page->freelist = freelist_new; 368 page->counters = counters_new; 369 slab_unlock(page); 370 return 1; 371 } 372 slab_unlock(page); 373 } 374 375 cpu_relax(); 376 stat(s, CMPXCHG_DOUBLE_FAIL); 377 378 #ifdef SLUB_DEBUG_CMPXCHG 379 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 380 #endif 381 382 return 0; 383 } 384 385 static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 386 void *freelist_old, unsigned long counters_old, 387 void *freelist_new, unsigned long counters_new, 388 const char *n) 389 { 390 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 391 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 392 if (s->flags & __CMPXCHG_DOUBLE) { 393 if (cmpxchg_double(&page->freelist, &page->counters, 394 freelist_old, counters_old, 395 freelist_new, counters_new)) 396 return 1; 397 } else 398 #endif 399 { 400 unsigned long flags; 401 402 local_irq_save(flags); 403 slab_lock(page); 404 if (page->freelist == freelist_old && page->counters == counters_old) { 405 page->freelist = freelist_new; 406 page->counters = counters_new; 407 slab_unlock(page); 408 local_irq_restore(flags); 409 return 1; 410 } 411 slab_unlock(page); 412 local_irq_restore(flags); 413 } 414 415 cpu_relax(); 416 stat(s, CMPXCHG_DOUBLE_FAIL); 417 418 #ifdef SLUB_DEBUG_CMPXCHG 419 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); 420 #endif 421 422 return 0; 423 } 424 425 #ifdef CONFIG_SLUB_DEBUG 426 /* 427 * Determine a map of object in use on a page. 428 * 429 * Node listlock must be held to guarantee that the page does 430 * not vanish from under us. 
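 */

/*
 * Illustrative sketch only: the free objects of a slab page are chained
 * through the word at offset s->offset inside each object, starting at
 * page->freelist. A real walker such as get_map() below must run under
 * the locking described above; no locking is shown here.
 */
static inline int count_freelist_sketch(struct kmem_cache *s,
					struct page *page)
{
	void *p;
	int nr = 0;

	for (p = page->freelist; p && nr <= page->objects;
					p = get_freepointer(s, p))
		nr++;

	return nr;
}

/*
 * get_map() below marks every object that is currently on the freelist: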
431 */ 432 static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 433 { 434 void *p; 435 void *addr = page_address(page); 436 437 for (p = page->freelist; p; p = get_freepointer(s, p)) 438 set_bit(slab_index(p, s, addr), map); 439 } 440 441 /* 442 * Debug settings: 443 */ 444 #ifdef CONFIG_SLUB_DEBUG_ON 445 static int slub_debug = DEBUG_DEFAULT_FLAGS; 446 #else 447 static int slub_debug; 448 #endif 449 450 static char *slub_debug_slabs; 451 static int disable_higher_order_debug; 452 453 /* 454 * Object debugging 455 */ 456 static void print_section(char *text, u8 *addr, unsigned int length) 457 { 458 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 459 length, 1); 460 } 461 462 static struct track *get_track(struct kmem_cache *s, void *object, 463 enum track_item alloc) 464 { 465 struct track *p; 466 467 if (s->offset) 468 p = object + s->offset + sizeof(void *); 469 else 470 p = object + s->inuse; 471 472 return p + alloc; 473 } 474 475 static void set_track(struct kmem_cache *s, void *object, 476 enum track_item alloc, unsigned long addr) 477 { 478 struct track *p = get_track(s, object, alloc); 479 480 if (addr) { 481 #ifdef CONFIG_STACKTRACE 482 struct stack_trace trace; 483 int i; 484 485 trace.nr_entries = 0; 486 trace.max_entries = TRACK_ADDRS_COUNT; 487 trace.entries = p->addrs; 488 trace.skip = 3; 489 save_stack_trace(&trace); 490 491 /* See rant in lockdep.c */ 492 if (trace.nr_entries != 0 && 493 trace.entries[trace.nr_entries - 1] == ULONG_MAX) 494 trace.nr_entries--; 495 496 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) 497 p->addrs[i] = 0; 498 #endif 499 p->addr = addr; 500 p->cpu = smp_processor_id(); 501 p->pid = current->pid; 502 p->when = jiffies; 503 } else 504 memset(p, 0, sizeof(struct track)); 505 } 506 507 static void init_tracking(struct kmem_cache *s, void *object) 508 { 509 if (!(s->flags & SLAB_STORE_USER)) 510 return; 511 512 set_track(s, object, TRACK_FREE, 0UL); 513 set_track(s, object, TRACK_ALLOC, 0UL); 514 } 515 516 static void print_track(const char *s, struct track *t) 517 { 518 if (!t->addr) 519 return; 520 521 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 522 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 523 #ifdef CONFIG_STACKTRACE 524 { 525 int i; 526 for (i = 0; i < TRACK_ADDRS_COUNT; i++) 527 if (t->addrs[i]) 528 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); 529 else 530 break; 531 } 532 #endif 533 } 534 535 static void print_tracking(struct kmem_cache *s, void *object) 536 { 537 if (!(s->flags & SLAB_STORE_USER)) 538 return; 539 540 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 541 print_track("Freed", get_track(s, object, TRACK_FREE)); 542 } 543 544 static void print_page_info(struct page *page) 545 { 546 printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", 547 page, page->objects, page->inuse, page->freelist, page->flags); 548 549 } 550 551 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 552 { 553 va_list args; 554 char buf[100]; 555 556 va_start(args, fmt); 557 vsnprintf(buf, sizeof(buf), fmt, args); 558 va_end(args); 559 printk(KERN_ERR "========================================" 560 "=====================================\n"); 561 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 562 printk(KERN_ERR "----------------------------------------" 563 "-------------------------------------\n\n"); 564 565 add_taint(TAINT_BAD_PAGE); 566 } 567 568 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
569 { 570 va_list args; 571 char buf[100]; 572 573 va_start(args, fmt); 574 vsnprintf(buf, sizeof(buf), fmt, args); 575 va_end(args); 576 printk(KERN_ERR "FIX %s: %s\n", s->name, buf); 577 } 578 579 static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) 580 { 581 unsigned int off; /* Offset of last byte */ 582 u8 *addr = page_address(page); 583 584 print_tracking(s, p); 585 586 print_page_info(page); 587 588 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", 589 p, p - addr, get_freepointer(s, p)); 590 591 if (p > addr + 16) 592 print_section("Bytes b4 ", p - 16, 16); 593 594 print_section("Object ", p, min_t(unsigned long, s->object_size, 595 PAGE_SIZE)); 596 if (s->flags & SLAB_RED_ZONE) 597 print_section("Redzone ", p + s->object_size, 598 s->inuse - s->object_size); 599 600 if (s->offset) 601 off = s->offset + sizeof(void *); 602 else 603 off = s->inuse; 604 605 if (s->flags & SLAB_STORE_USER) 606 off += 2 * sizeof(struct track); 607 608 if (off != s->size) 609 /* Beginning of the filler is the free pointer */ 610 print_section("Padding ", p + off, s->size - off); 611 612 dump_stack(); 613 } 614 615 static void object_err(struct kmem_cache *s, struct page *page, 616 u8 *object, char *reason) 617 { 618 slab_bug(s, "%s", reason); 619 print_trailer(s, page, object); 620 } 621 622 static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) 623 { 624 va_list args; 625 char buf[100]; 626 627 va_start(args, fmt); 628 vsnprintf(buf, sizeof(buf), fmt, args); 629 va_end(args); 630 slab_bug(s, "%s", buf); 631 print_page_info(page); 632 dump_stack(); 633 } 634 635 static void init_object(struct kmem_cache *s, void *object, u8 val) 636 { 637 u8 *p = object; 638 639 if (s->flags & __OBJECT_POISON) { 640 memset(p, POISON_FREE, s->object_size - 1); 641 p[s->object_size - 1] = POISON_END; 642 } 643 644 if (s->flags & SLAB_RED_ZONE) 645 memset(p + s->object_size, val, s->inuse - s->object_size); 646 } 647 648 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 649 void *from, void *to) 650 { 651 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); 652 memset(from, data, to - from); 653 } 654 655 static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 656 u8 *object, char *what, 657 u8 *start, unsigned int value, unsigned int bytes) 658 { 659 u8 *fault; 660 u8 *end; 661 662 fault = memchr_inv(start, value, bytes); 663 if (!fault) 664 return 1; 665 666 end = start + bytes; 667 while (end > fault && end[-1] == value) 668 end--; 669 670 slab_bug(s, "%s overwritten", what); 671 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", 672 fault, end - 1, fault[0], value); 673 print_trailer(s, page, object); 674 675 restore_bytes(s, what, value, fault, end); 676 return 0; 677 } 678 679 /* 680 * Object layout: 681 * 682 * object address 683 * Bytes of the object to be managed. 684 * If the freepointer may overlay the object then the free 685 * pointer is the first word of the object. 686 * 687 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 688 * 0xa5 (POISON_END) 689 * 690 * object + s->object_size 691 * Padding to reach word boundary. This is also used for Redzoning. 692 * Padding is extended by another word if Redzoning is enabled and 693 * object_size == inuse. 694 * 695 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 696 * 0xcc (RED_ACTIVE) for objects in use. 697 * 698 * object + s->inuse 699 * Meta data starts here. 700 * 701 * A. 
Free pointer (if we cannot overwrite object on free) 702 * B. Tracking data for SLAB_STORE_USER 703 * C. Padding to reach required alignment boundary or at mininum 704 * one word if debugging is on to be able to detect writes 705 * before the word boundary. 706 * 707 * Padding is done using 0x5a (POISON_INUSE) 708 * 709 * object + s->size 710 * Nothing is used beyond s->size. 711 * 712 * If slabcaches are merged then the object_size and inuse boundaries are mostly 713 * ignored. And therefore no slab options that rely on these boundaries 714 * may be used with merged slabcaches. 715 */ 716 717 static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) 718 { 719 unsigned long off = s->inuse; /* The end of info */ 720 721 if (s->offset) 722 /* Freepointer is placed after the object. */ 723 off += sizeof(void *); 724 725 if (s->flags & SLAB_STORE_USER) 726 /* We also have user information there */ 727 off += 2 * sizeof(struct track); 728 729 if (s->size == off) 730 return 1; 731 732 return check_bytes_and_report(s, page, p, "Object padding", 733 p + off, POISON_INUSE, s->size - off); 734 } 735 736 /* Check the pad bytes at the end of a slab page */ 737 static int slab_pad_check(struct kmem_cache *s, struct page *page) 738 { 739 u8 *start; 740 u8 *fault; 741 u8 *end; 742 int length; 743 int remainder; 744 745 if (!(s->flags & SLAB_POISON)) 746 return 1; 747 748 start = page_address(page); 749 length = (PAGE_SIZE << compound_order(page)) - s->reserved; 750 end = start + length; 751 remainder = length % s->size; 752 if (!remainder) 753 return 1; 754 755 fault = memchr_inv(end - remainder, POISON_INUSE, remainder); 756 if (!fault) 757 return 1; 758 while (end > fault && end[-1] == POISON_INUSE) 759 end--; 760 761 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 762 print_section("Padding ", end - remainder, remainder); 763 764 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 765 return 0; 766 } 767 768 static int check_object(struct kmem_cache *s, struct page *page, 769 void *object, u8 val) 770 { 771 u8 *p = object; 772 u8 *endobject = object + s->object_size; 773 774 if (s->flags & SLAB_RED_ZONE) { 775 if (!check_bytes_and_report(s, page, object, "Redzone", 776 endobject, val, s->inuse - s->object_size)) 777 return 0; 778 } else { 779 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { 780 check_bytes_and_report(s, page, p, "Alignment padding", 781 endobject, POISON_INUSE, s->inuse - s->object_size); 782 } 783 } 784 785 if (s->flags & SLAB_POISON) { 786 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 787 (!check_bytes_and_report(s, page, p, "Poison", p, 788 POISON_FREE, s->object_size - 1) || 789 !check_bytes_and_report(s, page, p, "Poison", 790 p + s->object_size - 1, POISON_END, 1))) 791 return 0; 792 /* 793 * check_pad_bytes cleans up on its own. 794 */ 795 check_pad_bytes(s, page, p); 796 } 797 798 if (!s->offset && val == SLUB_RED_ACTIVE) 799 /* 800 * Object and freepointer overlap. Cannot check 801 * freepointer while object is allocated. 802 */ 803 return 1; 804 805 /* Check free pointer validity */ 806 if (!check_valid_pointer(s, page, get_freepointer(s, p))) { 807 object_err(s, page, p, "Freepointer corrupt"); 808 /* 809 * No choice but to zap it and thus lose the remainder 810 * of the free objects in this slab. May cause 811 * another error because the object count is now wrong. 
812 */ 813 set_freepointer(s, p, NULL); 814 return 0; 815 } 816 return 1; 817 } 818 819 static int check_slab(struct kmem_cache *s, struct page *page) 820 { 821 int maxobj; 822 823 VM_BUG_ON(!irqs_disabled()); 824 825 if (!PageSlab(page)) { 826 slab_err(s, page, "Not a valid slab page"); 827 return 0; 828 } 829 830 maxobj = order_objects(compound_order(page), s->size, s->reserved); 831 if (page->objects > maxobj) { 832 slab_err(s, page, "objects %u > max %u", 833 s->name, page->objects, maxobj); 834 return 0; 835 } 836 if (page->inuse > page->objects) { 837 slab_err(s, page, "inuse %u > max %u", 838 s->name, page->inuse, page->objects); 839 return 0; 840 } 841 /* Slab_pad_check fixes things up after itself */ 842 slab_pad_check(s, page); 843 return 1; 844 } 845 846 /* 847 * Determine if a certain object on a page is on the freelist. Must hold the 848 * slab lock to guarantee that the chains are in a consistent state. 849 */ 850 static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 851 { 852 int nr = 0; 853 void *fp; 854 void *object = NULL; 855 unsigned long max_objects; 856 857 fp = page->freelist; 858 while (fp && nr <= page->objects) { 859 if (fp == search) 860 return 1; 861 if (!check_valid_pointer(s, page, fp)) { 862 if (object) { 863 object_err(s, page, object, 864 "Freechain corrupt"); 865 set_freepointer(s, object, NULL); 866 break; 867 } else { 868 slab_err(s, page, "Freepointer corrupt"); 869 page->freelist = NULL; 870 page->inuse = page->objects; 871 slab_fix(s, "Freelist cleared"); 872 return 0; 873 } 874 break; 875 } 876 object = fp; 877 fp = get_freepointer(s, object); 878 nr++; 879 } 880 881 max_objects = order_objects(compound_order(page), s->size, s->reserved); 882 if (max_objects > MAX_OBJS_PER_PAGE) 883 max_objects = MAX_OBJS_PER_PAGE; 884 885 if (page->objects != max_objects) { 886 slab_err(s, page, "Wrong number of objects. Found %d but " 887 "should be %d", page->objects, max_objects); 888 page->objects = max_objects; 889 slab_fix(s, "Number of objects adjusted."); 890 } 891 if (page->inuse != page->objects - nr) { 892 slab_err(s, page, "Wrong object count. Counter is %d but " 893 "counted were %d", page->inuse, page->objects - nr); 894 page->inuse = page->objects - nr; 895 slab_fix(s, "Object count adjusted."); 896 } 897 return search == NULL; 898 } 899 900 static void trace(struct kmem_cache *s, struct page *page, void *object, 901 int alloc) 902 { 903 if (s->flags & SLAB_TRACE) { 904 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 905 s->name, 906 alloc ? "alloc" : "free", 907 object, page->inuse, 908 page->freelist); 909 910 if (!alloc) 911 print_section("Object ", (void *)object, s->object_size); 912 913 dump_stack(); 914 } 915 } 916 917 /* 918 * Hooks for other subsystems that check memory allocations. In a typical 919 * production configuration these hooks all should produce no code at all. 
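 *
 * Specifically: slab_pre_alloc_hook() covers lockdep, might_sleep and fault
 * injection (should_failslab), slab_post_alloc_hook() informs kmemcheck and
 * kmemleak about the new object, and slab_free_hook() undoes that and lets
 * debugobjects check the object being freed. Each call compiles away when
 * the corresponding debug option is not configured.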
920 */ 921 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 922 { 923 flags &= gfp_allowed_mask; 924 lockdep_trace_alloc(flags); 925 might_sleep_if(flags & __GFP_WAIT); 926 927 return should_failslab(s->object_size, flags, s->flags); 928 } 929 930 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 931 { 932 flags &= gfp_allowed_mask; 933 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 934 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 935 } 936 937 static inline void slab_free_hook(struct kmem_cache *s, void *x) 938 { 939 kmemleak_free_recursive(x, s->flags); 940 941 /* 942 * Trouble is that we may no longer disable interupts in the fast path 943 * So in order to make the debug calls that expect irqs to be 944 * disabled we need to disable interrupts temporarily. 945 */ 946 #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) 947 { 948 unsigned long flags; 949 950 local_irq_save(flags); 951 kmemcheck_slab_free(s, x, s->object_size); 952 debug_check_no_locks_freed(x, s->object_size); 953 local_irq_restore(flags); 954 } 955 #endif 956 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 957 debug_check_no_obj_freed(x, s->object_size); 958 } 959 960 /* 961 * Tracking of fully allocated slabs for debugging purposes. 962 * 963 * list_lock must be held. 964 */ 965 static void add_full(struct kmem_cache *s, 966 struct kmem_cache_node *n, struct page *page) 967 { 968 if (!(s->flags & SLAB_STORE_USER)) 969 return; 970 971 list_add(&page->lru, &n->full); 972 } 973 974 /* 975 * list_lock must be held. 976 */ 977 static void remove_full(struct kmem_cache *s, struct page *page) 978 { 979 if (!(s->flags & SLAB_STORE_USER)) 980 return; 981 982 list_del(&page->lru); 983 } 984 985 /* Tracking of the number of slabs for debugging purposes */ 986 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 987 { 988 struct kmem_cache_node *n = get_node(s, node); 989 990 return atomic_long_read(&n->nr_slabs); 991 } 992 993 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 994 { 995 return atomic_long_read(&n->nr_slabs); 996 } 997 998 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 999 { 1000 struct kmem_cache_node *n = get_node(s, node); 1001 1002 /* 1003 * May be called early in order to allocate a slab for the 1004 * kmem_cache_node structure. Solve the chicken-egg 1005 * dilemma by deferring the increment of the count during 1006 * bootstrap (see early_kmem_cache_node_alloc). 
1007 */ 1008 if (n) { 1009 atomic_long_inc(&n->nr_slabs); 1010 atomic_long_add(objects, &n->total_objects); 1011 } 1012 } 1013 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1014 { 1015 struct kmem_cache_node *n = get_node(s, node); 1016 1017 atomic_long_dec(&n->nr_slabs); 1018 atomic_long_sub(objects, &n->total_objects); 1019 } 1020 1021 /* Object debug checks for alloc/free paths */ 1022 static void setup_object_debug(struct kmem_cache *s, struct page *page, 1023 void *object) 1024 { 1025 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 1026 return; 1027 1028 init_object(s, object, SLUB_RED_INACTIVE); 1029 init_tracking(s, object); 1030 } 1031 1032 static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, 1033 void *object, unsigned long addr) 1034 { 1035 if (!check_slab(s, page)) 1036 goto bad; 1037 1038 if (!check_valid_pointer(s, page, object)) { 1039 object_err(s, page, object, "Freelist Pointer check fails"); 1040 goto bad; 1041 } 1042 1043 if (!check_object(s, page, object, SLUB_RED_INACTIVE)) 1044 goto bad; 1045 1046 /* Success perform special debug activities for allocs */ 1047 if (s->flags & SLAB_STORE_USER) 1048 set_track(s, object, TRACK_ALLOC, addr); 1049 trace(s, page, object, 1); 1050 init_object(s, object, SLUB_RED_ACTIVE); 1051 return 1; 1052 1053 bad: 1054 if (PageSlab(page)) { 1055 /* 1056 * If this is a slab page then lets do the best we can 1057 * to avoid issues in the future. Marking all objects 1058 * as used avoids touching the remaining objects. 1059 */ 1060 slab_fix(s, "Marking all objects used"); 1061 page->inuse = page->objects; 1062 page->freelist = NULL; 1063 } 1064 return 0; 1065 } 1066 1067 static noinline struct kmem_cache_node *free_debug_processing( 1068 struct kmem_cache *s, struct page *page, void *object, 1069 unsigned long addr, unsigned long *flags) 1070 { 1071 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1072 1073 spin_lock_irqsave(&n->list_lock, *flags); 1074 slab_lock(page); 1075 1076 if (!check_slab(s, page)) 1077 goto fail; 1078 1079 if (!check_valid_pointer(s, page, object)) { 1080 slab_err(s, page, "Invalid object pointer 0x%p", object); 1081 goto fail; 1082 } 1083 1084 if (on_freelist(s, page, object)) { 1085 object_err(s, page, object, "Object already free"); 1086 goto fail; 1087 } 1088 1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1090 goto out; 1091 1092 if (unlikely(s != page->slab_cache)) { 1093 if (!PageSlab(page)) { 1094 slab_err(s, page, "Attempt to free object(0x%p) " 1095 "outside of slab", object); 1096 } else if (!page->slab_cache) { 1097 printk(KERN_ERR 1098 "SLUB <none>: no slab for object 0x%p.\n", 1099 object); 1100 dump_stack(); 1101 } else 1102 object_err(s, page, object, 1103 "page slab pointer corrupt."); 1104 goto fail; 1105 } 1106 1107 if (s->flags & SLAB_STORE_USER) 1108 set_track(s, object, TRACK_FREE, addr); 1109 trace(s, page, object, 0); 1110 init_object(s, object, SLUB_RED_INACTIVE); 1111 out: 1112 slab_unlock(page); 1113 /* 1114 * Keep node_lock to preserve integrity 1115 * until the object is actually freed 1116 */ 1117 return n; 1118 1119 fail: 1120 slab_unlock(page); 1121 spin_unlock_irqrestore(&n->list_lock, *flags); 1122 slab_fix(s, "Object at 0x%p not freed", object); 1123 return NULL; 1124 } 1125 1126 static int __init setup_slub_debug(char *str) 1127 { 1128 slub_debug = DEBUG_DEFAULT_FLAGS; 1129 if (*str++ != '=' || !*str) 1130 /* 1131 * No options specified. Switch on full debugging. 
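	 * A bare "slub_debug" on the command line therefore enables
	 * SLAB_DEBUG_FREE, SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER
	 * for every cache, while something like "slub_debug=P,kmalloc-64"
	 * would enable poisoning only for caches whose name starts with
	 * "kmalloc-64".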
1132 */ 1133 goto out; 1134 1135 if (*str == ',') 1136 /* 1137 * No options but restriction on slabs. This means full 1138 * debugging for slabs matching a pattern. 1139 */ 1140 goto check_slabs; 1141 1142 if (tolower(*str) == 'o') { 1143 /* 1144 * Avoid enabling debugging on caches if its minimum order 1145 * would increase as a result. 1146 */ 1147 disable_higher_order_debug = 1; 1148 goto out; 1149 } 1150 1151 slub_debug = 0; 1152 if (*str == '-') 1153 /* 1154 * Switch off all debugging measures. 1155 */ 1156 goto out; 1157 1158 /* 1159 * Determine which debug features should be switched on 1160 */ 1161 for (; *str && *str != ','; str++) { 1162 switch (tolower(*str)) { 1163 case 'f': 1164 slub_debug |= SLAB_DEBUG_FREE; 1165 break; 1166 case 'z': 1167 slub_debug |= SLAB_RED_ZONE; 1168 break; 1169 case 'p': 1170 slub_debug |= SLAB_POISON; 1171 break; 1172 case 'u': 1173 slub_debug |= SLAB_STORE_USER; 1174 break; 1175 case 't': 1176 slub_debug |= SLAB_TRACE; 1177 break; 1178 case 'a': 1179 slub_debug |= SLAB_FAILSLAB; 1180 break; 1181 default: 1182 printk(KERN_ERR "slub_debug option '%c' " 1183 "unknown. skipped\n", *str); 1184 } 1185 } 1186 1187 check_slabs: 1188 if (*str == ',') 1189 slub_debug_slabs = str + 1; 1190 out: 1191 return 1; 1192 } 1193 1194 __setup("slub_debug", setup_slub_debug); 1195 1196 static unsigned long kmem_cache_flags(unsigned long object_size, 1197 unsigned long flags, const char *name, 1198 void (*ctor)(void *)) 1199 { 1200 /* 1201 * Enable debugging if selected on the kernel commandline. 1202 */ 1203 if (slub_debug && (!slub_debug_slabs || 1204 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) 1205 flags |= slub_debug; 1206 1207 return flags; 1208 } 1209 #else 1210 static inline void setup_object_debug(struct kmem_cache *s, 1211 struct page *page, void *object) {} 1212 1213 static inline int alloc_debug_processing(struct kmem_cache *s, 1214 struct page *page, void *object, unsigned long addr) { return 0; } 1215 1216 static inline struct kmem_cache_node *free_debug_processing( 1217 struct kmem_cache *s, struct page *page, void *object, 1218 unsigned long addr, unsigned long *flags) { return NULL; } 1219 1220 static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1221 { return 1; } 1222 static inline int check_object(struct kmem_cache *s, struct page *page, 1223 void *object, u8 val) { return 1; } 1224 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1225 struct page *page) {} 1226 static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1227 static inline unsigned long kmem_cache_flags(unsigned long object_size, 1228 unsigned long flags, const char *name, 1229 void (*ctor)(void *)) 1230 { 1231 return flags; 1232 } 1233 #define slub_debug 0 1234 1235 #define disable_higher_order_debug 0 1236 1237 static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1238 { return 0; } 1239 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1240 { return 0; } 1241 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1242 int objects) {} 1243 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1244 int objects) {} 1245 1246 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1247 { return 0; } 1248 1249 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1250 void *object) {} 1251 1252 static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1253 1254 #endif /* CONFIG_SLUB_DEBUG */ 1255 1256 /* 
1257 * Slab allocation and freeing 1258 */ 1259 static inline struct page *alloc_slab_page(gfp_t flags, int node, 1260 struct kmem_cache_order_objects oo) 1261 { 1262 int order = oo_order(oo); 1263 1264 flags |= __GFP_NOTRACK; 1265 1266 if (node == NUMA_NO_NODE) 1267 return alloc_pages(flags, order); 1268 else 1269 return alloc_pages_exact_node(node, flags, order); 1270 } 1271 1272 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1273 { 1274 struct page *page; 1275 struct kmem_cache_order_objects oo = s->oo; 1276 gfp_t alloc_gfp; 1277 1278 flags &= gfp_allowed_mask; 1279 1280 if (flags & __GFP_WAIT) 1281 local_irq_enable(); 1282 1283 flags |= s->allocflags; 1284 1285 /* 1286 * Let the initial higher-order allocation fail under memory pressure 1287 * so we fall-back to the minimum order allocation. 1288 */ 1289 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 1290 1291 page = alloc_slab_page(alloc_gfp, node, oo); 1292 if (unlikely(!page)) { 1293 oo = s->min; 1294 /* 1295 * Allocation may have failed due to fragmentation. 1296 * Try a lower order alloc if possible 1297 */ 1298 page = alloc_slab_page(flags, node, oo); 1299 1300 if (page) 1301 stat(s, ORDER_FALLBACK); 1302 } 1303 1304 if (kmemcheck_enabled && page 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1306 int pages = 1 << oo_order(oo); 1307 1308 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1309 1310 /* 1311 * Objects from caches that have a constructor don't get 1312 * cleared when they're allocated, so we need to do it here. 1313 */ 1314 if (s->ctor) 1315 kmemcheck_mark_uninitialized_pages(page, pages); 1316 else 1317 kmemcheck_mark_unallocated_pages(page, pages); 1318 } 1319 1320 if (flags & __GFP_WAIT) 1321 local_irq_disable(); 1322 if (!page) 1323 return NULL; 1324 1325 page->objects = oo_objects(oo); 1326 mod_zone_page_state(page_zone(page), 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
1328 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1329 1 << oo_order(oo)); 1330 1331 return page; 1332 } 1333 1334 static void setup_object(struct kmem_cache *s, struct page *page, 1335 void *object) 1336 { 1337 setup_object_debug(s, page, object); 1338 if (unlikely(s->ctor)) 1339 s->ctor(object); 1340 } 1341 1342 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1343 { 1344 struct page *page; 1345 void *start; 1346 void *last; 1347 void *p; 1348 int order; 1349 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1351 1352 page = allocate_slab(s, 1353 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1354 if (!page) 1355 goto out; 1356 1357 order = compound_order(page); 1358 inc_slabs_node(s, page_to_nid(page), page->objects); 1359 memcg_bind_pages(s, order); 1360 page->slab_cache = s; 1361 __SetPageSlab(page); 1362 if (page->pfmemalloc) 1363 SetPageSlabPfmemalloc(page); 1364 1365 start = page_address(page); 1366 1367 if (unlikely(s->flags & SLAB_POISON)) 1368 memset(start, POISON_INUSE, PAGE_SIZE << order); 1369 1370 last = start; 1371 for_each_object(p, s, start, page->objects) { 1372 setup_object(s, page, last); 1373 set_freepointer(s, last, p); 1374 last = p; 1375 } 1376 setup_object(s, page, last); 1377 set_freepointer(s, last, NULL); 1378 1379 page->freelist = start; 1380 page->inuse = page->objects; 1381 page->frozen = 1; 1382 out: 1383 return page; 1384 } 1385 1386 static void __free_slab(struct kmem_cache *s, struct page *page) 1387 { 1388 int order = compound_order(page); 1389 int pages = 1 << order; 1390 1391 if (kmem_cache_debug(s)) { 1392 void *p; 1393 1394 slab_pad_check(s, page); 1395 for_each_object(p, s, page_address(page), 1396 page->objects) 1397 check_object(s, page, p, SLUB_RED_INACTIVE); 1398 } 1399 1400 kmemcheck_free_shadow(page, compound_order(page)); 1401 1402 mod_zone_page_state(page_zone(page), 1403 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1404 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1405 -pages); 1406 1407 __ClearPageSlabPfmemalloc(page); 1408 __ClearPageSlab(page); 1409 1410 memcg_release_pages(s, order); 1411 reset_page_mapcount(page); 1412 if (current->reclaim_state) 1413 current->reclaim_state->reclaimed_slab += pages; 1414 __free_memcg_kmem_pages(page, order); 1415 } 1416 1417 #define need_reserve_slab_rcu \ 1418 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) 1419 1420 static void rcu_free_slab(struct rcu_head *h) 1421 { 1422 struct page *page; 1423 1424 if (need_reserve_slab_rcu) 1425 page = virt_to_head_page(h); 1426 else 1427 page = container_of((struct list_head *)h, struct page, lru); 1428 1429 __free_slab(page->slab_cache, page); 1430 } 1431 1432 static void free_slab(struct kmem_cache *s, struct page *page) 1433 { 1434 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1435 struct rcu_head *head; 1436 1437 if (need_reserve_slab_rcu) { 1438 int order = compound_order(page); 1439 int offset = (PAGE_SIZE << order) - s->reserved; 1440 1441 VM_BUG_ON(s->reserved != sizeof(*head)); 1442 head = page_address(page) + offset; 1443 } else { 1444 /* 1445 * RCU free overloads the RCU head over the LRU 1446 */ 1447 head = (void *)&page->lru; 1448 } 1449 1450 call_rcu(head, rcu_free_slab); 1451 } else 1452 __free_slab(s, page); 1453 } 1454 1455 static void discard_slab(struct kmem_cache *s, struct page *page) 1456 { 1457 dec_slabs_node(s, page_to_nid(page), page->objects); 1458 free_slab(s, page); 1459 } 1460 1461 /* 1462 * Management of partially allocated slabs. 1463 * 1464 * list_lock must be held. 
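 */

/*
 * Illustrative sketch only: what the list_lock rule above looks like at a
 * call site. The helpers below (add_partial(), remove_partial()) assume
 * the caller already did this; they never take the lock themselves.
 */
static inline void partial_list_lock_sketch(struct kmem_cache_node *n)
{
	unsigned long flags;

	spin_lock_irqsave(&n->list_lock, flags);
	/* ... add_partial()/remove_partial()/list walk would go here ... */
	spin_unlock_irqrestore(&n->list_lock, flags);
}

/*
 * The partial list helpers: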
1465 */ 1466 static inline void add_partial(struct kmem_cache_node *n, 1467 struct page *page, int tail) 1468 { 1469 n->nr_partial++; 1470 if (tail == DEACTIVATE_TO_TAIL) 1471 list_add_tail(&page->lru, &n->partial); 1472 else 1473 list_add(&page->lru, &n->partial); 1474 } 1475 1476 /* 1477 * list_lock must be held. 1478 */ 1479 static inline void remove_partial(struct kmem_cache_node *n, 1480 struct page *page) 1481 { 1482 list_del(&page->lru); 1483 n->nr_partial--; 1484 } 1485 1486 /* 1487 * Remove slab from the partial list, freeze it and 1488 * return the pointer to the freelist. 1489 * 1490 * Returns a list of objects or NULL if it fails. 1491 * 1492 * Must hold list_lock since we modify the partial list. 1493 */ 1494 static inline void *acquire_slab(struct kmem_cache *s, 1495 struct kmem_cache_node *n, struct page *page, 1496 int mode) 1497 { 1498 void *freelist; 1499 unsigned long counters; 1500 struct page new; 1501 1502 /* 1503 * Zap the freelist and set the frozen bit. 1504 * The old freelist is the list of objects for the 1505 * per cpu allocation list. 1506 */ 1507 freelist = page->freelist; 1508 counters = page->counters; 1509 new.counters = counters; 1510 if (mode) { 1511 new.inuse = page->objects; 1512 new.freelist = NULL; 1513 } else { 1514 new.freelist = freelist; 1515 } 1516 1517 VM_BUG_ON(new.frozen); 1518 new.frozen = 1; 1519 1520 if (!__cmpxchg_double_slab(s, page, 1521 freelist, counters, 1522 new.freelist, new.counters, 1523 "acquire_slab")) 1524 return NULL; 1525 1526 remove_partial(n, page); 1527 WARN_ON(!freelist); 1528 return freelist; 1529 } 1530 1531 static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1532 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); 1533 1534 /* 1535 * Try to allocate a partial slab from a specific node. 1536 */ 1537 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, 1538 struct kmem_cache_cpu *c, gfp_t flags) 1539 { 1540 struct page *page, *page2; 1541 void *object = NULL; 1542 1543 /* 1544 * Racy check. If we mistakenly see no partial slabs then we 1545 * just allocate an empty slab. If we mistakenly try to get a 1546 * partial slab and there is none available then get_partials() 1547 * will return NULL. 1548 */ 1549 if (!n || !n->nr_partial) 1550 return NULL; 1551 1552 spin_lock(&n->list_lock); 1553 list_for_each_entry_safe(page, page2, &n->partial, lru) { 1554 void *t; 1555 int available; 1556 1557 if (!pfmemalloc_match(page, flags)) 1558 continue; 1559 1560 t = acquire_slab(s, n, page, object == NULL); 1561 if (!t) 1562 break; 1563 1564 if (!object) { 1565 c->page = page; 1566 stat(s, ALLOC_FROM_PARTIAL); 1567 object = t; 1568 available = page->objects - page->inuse; 1569 } else { 1570 available = put_cpu_partial(s, page, 0); 1571 stat(s, CPU_PARTIAL_NODE); 1572 } 1573 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1574 break; 1575 1576 } 1577 spin_unlock(&n->list_lock); 1578 return object; 1579 } 1580 1581 /* 1582 * Get a page from somewhere. Search in increasing NUMA distances. 1583 */ 1584 static void *get_any_partial(struct kmem_cache *s, gfp_t flags, 1585 struct kmem_cache_cpu *c) 1586 { 1587 #ifdef CONFIG_NUMA 1588 struct zonelist *zonelist; 1589 struct zoneref *z; 1590 struct zone *zone; 1591 enum zone_type high_zoneidx = gfp_zone(flags); 1592 void *object; 1593 unsigned int cpuset_mems_cookie; 1594 1595 /* 1596 * The defrag ratio allows a configuration of the tradeoffs between 1597 * inter node defragmentation and node local allocations. 
A lower
 * defrag_ratio increases the tendency to do local allocations
 * instead of attempting to obtain partial slabs from other nodes.
 *
 * If the defrag_ratio is set to 0 then kmalloc() always
 * returns node local objects. If the ratio is higher then kmalloc()
 * may return off node objects because partial slabs are obtained
 * from other nodes and filled up.
 *
 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
 * defrag_ratio = 1000) then every (well almost) allocation will
 * first attempt to defrag slab caches on other nodes. This means
 * scanning over all nodes to look for partial slabs which may be
 * expensive if we do it every time we are trying to find a slab
 * with available objects.
 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	do {
		cpuset_mems_cookie = get_mems_allowed();
		zonelist = node_zonelist(slab_node(), flags);
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, c, flags);
				if (object) {
					/*
					 * Return the object even if
					 * put_mems_allowed indicated that
					 * the cpuset mems_allowed was
					 * updated in parallel. It's a
					 * harmless race between the alloc
					 * and the cpuset update.
					 */
					put_mems_allowed(cpuset_mems_cookie);
					return object;
				}
			}
		}
	} while (!put_mems_allowed(cpuset_mems_cookie));
#endif
	return NULL;
}

/*
 * Get a partial page, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;

	object = get_partial_node(s, get_node(s, searchnode), c, flags);
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, flags, c);
}

#ifdef CONFIG_PREEMPT
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by CONFIG_NR_CPUS.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported, therefore there is also no need to check for
 * different cpus.
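 *
 * (Worked example: with CONFIG_PREEMPT and CONFIG_NR_CPUS rounding up to a
 * TID_STEP of 4, cpu 2 generates the tids 2, 6, 10, ..., so that both the
 * originating cpu (tid % TID_STEP) and the event count (tid / TID_STEP)
 * can be recovered later. Without preemption TID_STEP is 1 and the tid is
 * simply a per cpu event counter.)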
1674 */ 1675 #define TID_STEP 1 1676 #endif 1677 1678 static inline unsigned long next_tid(unsigned long tid) 1679 { 1680 return tid + TID_STEP; 1681 } 1682 1683 static inline unsigned int tid_to_cpu(unsigned long tid) 1684 { 1685 return tid % TID_STEP; 1686 } 1687 1688 static inline unsigned long tid_to_event(unsigned long tid) 1689 { 1690 return tid / TID_STEP; 1691 } 1692 1693 static inline unsigned int init_tid(int cpu) 1694 { 1695 return cpu; 1696 } 1697 1698 static inline void note_cmpxchg_failure(const char *n, 1699 const struct kmem_cache *s, unsigned long tid) 1700 { 1701 #ifdef SLUB_DEBUG_CMPXCHG 1702 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 1703 1704 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); 1705 1706 #ifdef CONFIG_PREEMPT 1707 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 1708 printk("due to cpu change %d -> %d\n", 1709 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 1710 else 1711 #endif 1712 if (tid_to_event(tid) != tid_to_event(actual_tid)) 1713 printk("due to cpu running other code. Event %ld->%ld\n", 1714 tid_to_event(tid), tid_to_event(actual_tid)); 1715 else 1716 printk("for unknown reason: actual=%lx was=%lx target=%lx\n", 1717 actual_tid, tid, next_tid(tid)); 1718 #endif 1719 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1720 } 1721 1722 static void init_kmem_cache_cpus(struct kmem_cache *s) 1723 { 1724 int cpu; 1725 1726 for_each_possible_cpu(cpu) 1727 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1728 } 1729 1730 /* 1731 * Remove the cpu slab 1732 */ 1733 static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) 1734 { 1735 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1736 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1737 int lock = 0; 1738 enum slab_modes l = M_NONE, m = M_NONE; 1739 void *nextfree; 1740 int tail = DEACTIVATE_TO_HEAD; 1741 struct page new; 1742 struct page old; 1743 1744 if (page->freelist) { 1745 stat(s, DEACTIVATE_REMOTE_FREES); 1746 tail = DEACTIVATE_TO_TAIL; 1747 } 1748 1749 /* 1750 * Stage one: Free all available per cpu objects back 1751 * to the page freelist while it is still frozen. Leave the 1752 * last one. 1753 * 1754 * There is no need to take the list->lock because the page 1755 * is still frozen. 1756 */ 1757 while (freelist && (nextfree = get_freepointer(s, freelist))) { 1758 void *prior; 1759 unsigned long counters; 1760 1761 do { 1762 prior = page->freelist; 1763 counters = page->counters; 1764 set_freepointer(s, freelist, prior); 1765 new.counters = counters; 1766 new.inuse--; 1767 VM_BUG_ON(!new.frozen); 1768 1769 } while (!__cmpxchg_double_slab(s, page, 1770 prior, counters, 1771 freelist, new.counters, 1772 "drain percpu freelist")); 1773 1774 freelist = nextfree; 1775 } 1776 1777 /* 1778 * Stage two: Ensure that the page is unfrozen while the 1779 * list presence reflects the actual number of objects 1780 * during unfreeze. 1781 * 1782 * We setup the list membership and then perform a cmpxchg 1783 * with the count. If there is a mismatch then the page 1784 * is not unfrozen but the page is on the wrong list. 1785 * 1786 * Then we restart the process which may have to remove 1787 * the page from the list that we just put it on again 1788 * because the number of objects in the slab may have 1789 * changed. 
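	 *
	 * In short: after the transfer the slab goes onto the node partial
	 * list (M_PARTIAL) if it still has free objects, is treated as a
	 * full slab (M_FULL, tracked on a list only for debug caches) if it
	 * has none, or is freed outright (M_FREE) if it is completely unused
	 * and the node already holds more than min_partial partial slabs.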
1790 */ 1791 redo: 1792 1793 old.freelist = page->freelist; 1794 old.counters = page->counters; 1795 VM_BUG_ON(!old.frozen); 1796 1797 /* Determine target state of the slab */ 1798 new.counters = old.counters; 1799 if (freelist) { 1800 new.inuse--; 1801 set_freepointer(s, freelist, old.freelist); 1802 new.freelist = freelist; 1803 } else 1804 new.freelist = old.freelist; 1805 1806 new.frozen = 0; 1807 1808 if (!new.inuse && n->nr_partial > s->min_partial) 1809 m = M_FREE; 1810 else if (new.freelist) { 1811 m = M_PARTIAL; 1812 if (!lock) { 1813 lock = 1; 1814 /* 1815 * Taking the spinlock removes the possiblity 1816 * that acquire_slab() will see a slab page that 1817 * is frozen 1818 */ 1819 spin_lock(&n->list_lock); 1820 } 1821 } else { 1822 m = M_FULL; 1823 if (kmem_cache_debug(s) && !lock) { 1824 lock = 1; 1825 /* 1826 * This also ensures that the scanning of full 1827 * slabs from diagnostic functions will not see 1828 * any frozen slabs. 1829 */ 1830 spin_lock(&n->list_lock); 1831 } 1832 } 1833 1834 if (l != m) { 1835 1836 if (l == M_PARTIAL) 1837 1838 remove_partial(n, page); 1839 1840 else if (l == M_FULL) 1841 1842 remove_full(s, page); 1843 1844 if (m == M_PARTIAL) { 1845 1846 add_partial(n, page, tail); 1847 stat(s, tail); 1848 1849 } else if (m == M_FULL) { 1850 1851 stat(s, DEACTIVATE_FULL); 1852 add_full(s, n, page); 1853 1854 } 1855 } 1856 1857 l = m; 1858 if (!__cmpxchg_double_slab(s, page, 1859 old.freelist, old.counters, 1860 new.freelist, new.counters, 1861 "unfreezing slab")) 1862 goto redo; 1863 1864 if (lock) 1865 spin_unlock(&n->list_lock); 1866 1867 if (m == M_FREE) { 1868 stat(s, DEACTIVATE_EMPTY); 1869 discard_slab(s, page); 1870 stat(s, FREE_SLAB); 1871 } 1872 } 1873 1874 /* 1875 * Unfreeze all the cpu partial slabs. 1876 * 1877 * This function must be called with interrupts disabled 1878 * for the cpu using c (or some other guarantee must be there 1879 * to guarantee no concurrent accesses). 1880 */ 1881 static void unfreeze_partials(struct kmem_cache *s, 1882 struct kmem_cache_cpu *c) 1883 { 1884 struct kmem_cache_node *n = NULL, *n2 = NULL; 1885 struct page *page, *discard_page = NULL; 1886 1887 while ((page = c->partial)) { 1888 struct page new; 1889 struct page old; 1890 1891 c->partial = page->next; 1892 1893 n2 = get_node(s, page_to_nid(page)); 1894 if (n != n2) { 1895 if (n) 1896 spin_unlock(&n->list_lock); 1897 1898 n = n2; 1899 spin_lock(&n->list_lock); 1900 } 1901 1902 do { 1903 1904 old.freelist = page->freelist; 1905 old.counters = page->counters; 1906 VM_BUG_ON(!old.frozen); 1907 1908 new.counters = old.counters; 1909 new.freelist = old.freelist; 1910 1911 new.frozen = 0; 1912 1913 } while (!__cmpxchg_double_slab(s, page, 1914 old.freelist, old.counters, 1915 new.freelist, new.counters, 1916 "unfreezing slab")); 1917 1918 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { 1919 page->next = discard_page; 1920 discard_page = page; 1921 } else { 1922 add_partial(n, page, DEACTIVATE_TO_TAIL); 1923 stat(s, FREE_ADD_PARTIAL); 1924 } 1925 } 1926 1927 if (n) 1928 spin_unlock(&n->list_lock); 1929 1930 while (discard_page) { 1931 page = discard_page; 1932 discard_page = discard_page->next; 1933 1934 stat(s, DEACTIVATE_EMPTY); 1935 discard_slab(s, page); 1936 stat(s, FREE_SLAB); 1937 } 1938 } 1939 1940 /* 1941 * Put a page that was just frozen (in __slab_free) into a partial page 1942 * slot if available. This is done without interrupts disabled and without 1943 * preemption disabled. 
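 */

/*
 * Illustrative sketch only: the kind of racy, preemption and irq safe
 * update that put_cpu_partial() below performs on the per cpu partial
 * list, shown here on a hypothetical per cpu counter. If another cpu or
 * an interrupt modifies the value between the read and the cmpxchg the
 * transaction simply retries.
 */
static DEFINE_PER_CPU(unsigned long, cmpxchg_sketch_counter);

static inline void this_cpu_cmpxchg_sketch(void)
{
	unsigned long old, new;

	do {
		old = this_cpu_read(cmpxchg_sketch_counter);
		new = old + 1;
	} while (this_cpu_cmpxchg(cmpxchg_sketch_counter, old, new) != old);
}

/*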
The cmpxchg is racy and may put the partial page 1944 * onto a random cpus partial slot. 1945 * 1946 * If we did not find a slot then simply move all the partials to the 1947 * per node partial list. 1948 */ 1949 static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1950 { 1951 struct page *oldpage; 1952 int pages; 1953 int pobjects; 1954 1955 do { 1956 pages = 0; 1957 pobjects = 0; 1958 oldpage = this_cpu_read(s->cpu_slab->partial); 1959 1960 if (oldpage) { 1961 pobjects = oldpage->pobjects; 1962 pages = oldpage->pages; 1963 if (drain && pobjects > s->cpu_partial) { 1964 unsigned long flags; 1965 /* 1966 * partial array is full. Move the existing 1967 * set to the per node partial list. 1968 */ 1969 local_irq_save(flags); 1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); 1971 local_irq_restore(flags); 1972 oldpage = NULL; 1973 pobjects = 0; 1974 pages = 0; 1975 stat(s, CPU_PARTIAL_DRAIN); 1976 } 1977 } 1978 1979 pages++; 1980 pobjects += page->objects - page->inuse; 1981 1982 page->pages = pages; 1983 page->pobjects = pobjects; 1984 page->next = oldpage; 1985 1986 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 1987 return pobjects; 1988 } 1989 1990 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1991 { 1992 stat(s, CPUSLAB_FLUSH); 1993 deactivate_slab(s, c->page, c->freelist); 1994 1995 c->tid = next_tid(c->tid); 1996 c->page = NULL; 1997 c->freelist = NULL; 1998 } 1999 2000 /* 2001 * Flush cpu slab. 2002 * 2003 * Called from IPI handler with interrupts disabled. 2004 */ 2005 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 2006 { 2007 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2008 2009 if (likely(c)) { 2010 if (c->page) 2011 flush_slab(s, c); 2012 2013 unfreeze_partials(s, c); 2014 } 2015 } 2016 2017 static void flush_cpu_slab(void *d) 2018 { 2019 struct kmem_cache *s = d; 2020 2021 __flush_cpu_slab(s, smp_processor_id()); 2022 } 2023 2024 static bool has_cpu_slab(int cpu, void *info) 2025 { 2026 struct kmem_cache *s = info; 2027 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2028 2029 return c->page || c->partial; 2030 } 2031 2032 static void flush_all(struct kmem_cache *s) 2033 { 2034 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); 2035 } 2036 2037 /* 2038 * Check if the objects in a per cpu structure fit numa 2039 * locality expectations. 
2040 */ 2041 static inline int node_match(struct page *page, int node) 2042 { 2043 #ifdef CONFIG_NUMA 2044 if (node != NUMA_NO_NODE && page_to_nid(page) != node) 2045 return 0; 2046 #endif 2047 return 1; 2048 } 2049 2050 static int count_free(struct page *page) 2051 { 2052 return page->objects - page->inuse; 2053 } 2054 2055 static unsigned long count_partial(struct kmem_cache_node *n, 2056 int (*get_count)(struct page *)) 2057 { 2058 unsigned long flags; 2059 unsigned long x = 0; 2060 struct page *page; 2061 2062 spin_lock_irqsave(&n->list_lock, flags); 2063 list_for_each_entry(page, &n->partial, lru) 2064 x += get_count(page); 2065 spin_unlock_irqrestore(&n->list_lock, flags); 2066 return x; 2067 } 2068 2069 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 2070 { 2071 #ifdef CONFIG_SLUB_DEBUG 2072 return atomic_long_read(&n->total_objects); 2073 #else 2074 return 0; 2075 #endif 2076 } 2077 2078 static noinline void 2079 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2080 { 2081 int node; 2082 2083 printk(KERN_WARNING 2084 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2085 nid, gfpflags); 2086 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2087 "default order: %d, min order: %d\n", s->name, s->object_size, 2088 s->size, oo_order(s->oo), oo_order(s->min)); 2089 2090 if (oo_order(s->min) > get_order(s->object_size)) 2091 printk(KERN_WARNING " %s debugging increased min order, use " 2092 "slub_debug=O to disable.\n", s->name); 2093 2094 for_each_online_node(node) { 2095 struct kmem_cache_node *n = get_node(s, node); 2096 unsigned long nr_slabs; 2097 unsigned long nr_objs; 2098 unsigned long nr_free; 2099 2100 if (!n) 2101 continue; 2102 2103 nr_free = count_partial(n, count_free); 2104 nr_slabs = node_nr_slabs(n); 2105 nr_objs = node_nr_objs(n); 2106 2107 printk(KERN_WARNING 2108 " node %d: slabs: %ld, objs: %ld, free: %ld\n", 2109 node, nr_slabs, nr_objs, nr_free); 2110 } 2111 } 2112 2113 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2114 int node, struct kmem_cache_cpu **pc) 2115 { 2116 void *freelist; 2117 struct kmem_cache_cpu *c = *pc; 2118 struct page *page; 2119 2120 freelist = get_partial(s, flags, node, c); 2121 2122 if (freelist) 2123 return freelist; 2124 2125 page = new_slab(s, flags, node); 2126 if (page) { 2127 c = __this_cpu_ptr(s->cpu_slab); 2128 if (c->page) 2129 flush_slab(s, c); 2130 2131 /* 2132 * No other reference to the page yet so we can 2133 * muck around with it freely without cmpxchg 2134 */ 2135 freelist = page->freelist; 2136 page->freelist = NULL; 2137 2138 stat(s, ALLOC_SLAB); 2139 c->page = page; 2140 *pc = c; 2141 } else 2142 freelist = NULL; 2143 2144 return freelist; 2145 } 2146 2147 static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) 2148 { 2149 if (unlikely(PageSlabPfmemalloc(page))) 2150 return gfp_pfmemalloc_allowed(gfpflags); 2151 2152 return true; 2153 } 2154 2155 /* 2156 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist 2157 * or deactivate the page. 2158 * 2159 * The page is still frozen if the return value is not NULL. 2160 * 2161 * If this function returns NULL then the page has been unfrozen. 2162 * 2163 * This function must be called with interrupt disabled. 
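 *
 * Illustration: if other cpus have meanwhile freed objects A and B back to
 * this (still frozen) cpu slab, page->freelist holds the chain A -> B -> NULL.
 * The cmpxchg below atomically takes that whole chain, leaving page->freelist
 * NULL and inuse raised to page->objects, so the chain can serve as the new
 * lockless per cpu freelist. If nothing was freed, the page is unfrozen here.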
2164 */ 2165 static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2166 { 2167 struct page new; 2168 unsigned long counters; 2169 void *freelist; 2170 2171 do { 2172 freelist = page->freelist; 2173 counters = page->counters; 2174 2175 new.counters = counters; 2176 VM_BUG_ON(!new.frozen); 2177 2178 new.inuse = page->objects; 2179 new.frozen = freelist != NULL; 2180 2181 } while (!__cmpxchg_double_slab(s, page, 2182 freelist, counters, 2183 NULL, new.counters, 2184 "get_freelist")); 2185 2186 return freelist; 2187 } 2188 2189 /* 2190 * Slow path. The lockless freelist is empty or we need to perform 2191 * debugging duties. 2192 * 2193 * Processing is still very fast if new objects have been freed to the 2194 * regular freelist. In that case we simply take over the regular freelist 2195 * as the lockless freelist and zap the regular freelist. 2196 * 2197 * If that is not working then we fall back to the partial lists. We take the 2198 * first element of the freelist as the object to allocate now and move the 2199 * rest of the freelist to the lockless freelist. 2200 * 2201 * And if we were unable to get a new slab from the partial slab lists then 2202 * we need to allocate a new slab. This is the slowest path since it involves 2203 * a call to the page allocator and the setup of a new slab. 2204 */ 2205 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2206 unsigned long addr, struct kmem_cache_cpu *c) 2207 { 2208 void *freelist; 2209 struct page *page; 2210 unsigned long flags; 2211 2212 local_irq_save(flags); 2213 #ifdef CONFIG_PREEMPT 2214 /* 2215 * We may have been preempted and rescheduled on a different 2216 * cpu before disabling interrupts. Need to reload cpu area 2217 * pointer. 2218 */ 2219 c = this_cpu_ptr(s->cpu_slab); 2220 #endif 2221 2222 page = c->page; 2223 if (!page) 2224 goto new_slab; 2225 redo: 2226 2227 if (unlikely(!node_match(page, node))) { 2228 stat(s, ALLOC_NODE_MISMATCH); 2229 deactivate_slab(s, page, c->freelist); 2230 c->page = NULL; 2231 c->freelist = NULL; 2232 goto new_slab; 2233 } 2234 2235 /* 2236 * By rights, we should be searching for a slab page that was 2237 * PFMEMALLOC but right now, we are losing the pfmemalloc 2238 * information when the page leaves the per-cpu allocator 2239 */ 2240 if (unlikely(!pfmemalloc_match(page, gfpflags))) { 2241 deactivate_slab(s, page, c->freelist); 2242 c->page = NULL; 2243 c->freelist = NULL; 2244 goto new_slab; 2245 } 2246 2247 /* must check again c->freelist in case of cpu migration or IRQ */ 2248 freelist = c->freelist; 2249 if (freelist) 2250 goto load_freelist; 2251 2252 stat(s, ALLOC_SLOWPATH); 2253 2254 freelist = get_freelist(s, page); 2255 2256 if (!freelist) { 2257 c->page = NULL; 2258 stat(s, DEACTIVATE_BYPASS); 2259 goto new_slab; 2260 } 2261 2262 stat(s, ALLOC_REFILL); 2263 2264 load_freelist: 2265 /* 2266 * freelist is pointing to the list of objects to be used. 2267 * page is pointing to the page from which the objects are obtained. 2268 * That page must be frozen for per cpu allocations to work. 
2269 */ 2270 VM_BUG_ON(!c->page->frozen); 2271 c->freelist = get_freepointer(s, freelist); 2272 c->tid = next_tid(c->tid); 2273 local_irq_restore(flags); 2274 return freelist; 2275 2276 new_slab: 2277 2278 if (c->partial) { 2279 page = c->page = c->partial; 2280 c->partial = page->next; 2281 stat(s, CPU_PARTIAL_ALLOC); 2282 c->freelist = NULL; 2283 goto redo; 2284 } 2285 2286 freelist = new_slab_objects(s, gfpflags, node, &c); 2287 2288 if (unlikely(!freelist)) { 2289 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2290 slab_out_of_memory(s, gfpflags, node); 2291 2292 local_irq_restore(flags); 2293 return NULL; 2294 } 2295 2296 page = c->page; 2297 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) 2298 goto load_freelist; 2299 2300 /* Only entered in the debug case */ 2301 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) 2302 goto new_slab; /* Slab failed checks. Next slab needed */ 2303 2304 deactivate_slab(s, page, get_freepointer(s, freelist)); 2305 c->page = NULL; 2306 c->freelist = NULL; 2307 local_irq_restore(flags); 2308 return freelist; 2309 } 2310 2311 /* 2312 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 2313 * have the fastpath folded into their functions. So no function call 2314 * overhead for requests that can be satisfied on the fastpath. 2315 * 2316 * The fastpath works by first checking if the lockless freelist can be used. 2317 * If not then __slab_alloc is called for slow processing. 2318 * 2319 * Otherwise we can simply pick the next object from the lockless free list. 2320 */ 2321 static __always_inline void *slab_alloc_node(struct kmem_cache *s, 2322 gfp_t gfpflags, int node, unsigned long addr) 2323 { 2324 void **object; 2325 struct kmem_cache_cpu *c; 2326 struct page *page; 2327 unsigned long tid; 2328 2329 if (slab_pre_alloc_hook(s, gfpflags)) 2330 return NULL; 2331 2332 s = memcg_kmem_get_cache(s, gfpflags); 2333 redo: 2334 2335 /* 2336 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 2337 * enabled. We may switch back and forth between cpus while 2338 * reading from one cpu area. That does not matter as long 2339 * as we end up on the original cpu again when doing the cmpxchg. 2340 */ 2341 c = __this_cpu_ptr(s->cpu_slab); 2342 2343 /* 2344 * The transaction ids are globally unique per cpu and per operation on 2345 * a per cpu queue. Thus they guarantee that the cmpxchg_double 2346 * occurs on the right processor and that there was no operation on the 2347 * linked list in between. 2348 */ 2349 tid = c->tid; 2350 barrier(); 2351 2352 object = c->freelist; 2353 page = c->page; 2354 if (unlikely(!object || !node_match(page, node))) 2355 object = __slab_alloc(s, gfpflags, node, addr, c); 2356 2357 else { 2358 void *next_object = get_freepointer_safe(s, object); 2359 2360 /* 2361 * The cmpxchg will only match if there was no additional 2362 * operation and if we are on the right processor. 2363 * 2364 * The cmpxchg does the following atomically (without lock semantics!) 2365 * 1. Relocate first pointer to the current per cpu area. 2366 * 2. Verify that tid and freelist have not been changed 2367 * 3. If they were not changed replace tid and freelist 2368 * 2369 * Since this is without lock semantics the protection is only against 2370 * code executing on this cpu *not* from access by other cpus.
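 *
 * Example: if this task is preempted after sampling tid and freelist and
 * another task allocates or frees on this cpu in the meantime, c->tid has
 * advanced (every operation moves it via next_tid()) and the cmpxchg_double
 * below fails, sending us back to the redo: label. Likewise, after a
 * migration the other cpu's tid cannot match the sampled one, since tids
 * are unique per cpu.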
2371 */ 2372 if (unlikely(!this_cpu_cmpxchg_double( 2373 s->cpu_slab->freelist, s->cpu_slab->tid, 2374 object, tid, 2375 next_object, next_tid(tid)))) { 2376 2377 note_cmpxchg_failure("slab_alloc", s, tid); 2378 goto redo; 2379 } 2380 prefetch_freepointer(s, next_object); 2381 stat(s, ALLOC_FASTPATH); 2382 } 2383 2384 if (unlikely(gfpflags & __GFP_ZERO) && object) 2385 memset(object, 0, s->object_size); 2386 2387 slab_post_alloc_hook(s, gfpflags, object); 2388 2389 return object; 2390 } 2391 2392 static __always_inline void *slab_alloc(struct kmem_cache *s, 2393 gfp_t gfpflags, unsigned long addr) 2394 { 2395 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); 2396 } 2397 2398 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2399 { 2400 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2401 2402 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); 2403 2404 return ret; 2405 } 2406 EXPORT_SYMBOL(kmem_cache_alloc); 2407 2408 #ifdef CONFIG_TRACING 2409 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2410 { 2411 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2412 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2413 return ret; 2414 } 2415 EXPORT_SYMBOL(kmem_cache_alloc_trace); 2416 2417 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 2418 { 2419 void *ret = kmalloc_order(size, flags, order); 2420 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); 2421 return ret; 2422 } 2423 EXPORT_SYMBOL(kmalloc_order_trace); 2424 #endif 2425 2426 #ifdef CONFIG_NUMA 2427 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2428 { 2429 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2430 2431 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2432 s->object_size, s->size, gfpflags, node); 2433 2434 return ret; 2435 } 2436 EXPORT_SYMBOL(kmem_cache_alloc_node); 2437 2438 #ifdef CONFIG_TRACING 2439 void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 2440 gfp_t gfpflags, 2441 int node, size_t size) 2442 { 2443 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2444 2445 trace_kmalloc_node(_RET_IP_, ret, 2446 size, s->size, gfpflags, node); 2447 return ret; 2448 } 2449 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2450 #endif 2451 #endif 2452 2453 /* 2454 * Slow path handling. This may still be called frequently since objects 2455 * have a longer lifetime than the cpu slabs in most processing loads. 2456 * 2457 * So we still attempt to reduce cache line usage. Just take the slab 2458 * lock and free the item. If there is no additional partial page 2459 * handling required then we can return immediately.
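 *
 * Possible outcomes, as implemented below: if the slab was frozen, only its
 * freelist is updated and the cpu that froze it stays responsible for it;
 * if this free turns a full slab partially empty on a non-debug cache, the
 * slab is frozen and parked on a per cpu partial list via put_cpu_partial();
 * if the free empties the slab and the node already holds more than
 * min_partial partial slabs, the slab is discarded; otherwise the slab is
 * added to (or stays on) the node partial list under list_lock.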
2460 */ 2461 static void __slab_free(struct kmem_cache *s, struct page *page, 2462 void *x, unsigned long addr) 2463 { 2464 void *prior; 2465 void **object = (void *)x; 2466 int was_frozen; 2467 struct page new; 2468 unsigned long counters; 2469 struct kmem_cache_node *n = NULL; 2470 unsigned long uninitialized_var(flags); 2471 2472 stat(s, FREE_SLOWPATH); 2473 2474 if (kmem_cache_debug(s) && 2475 !(n = free_debug_processing(s, page, x, addr, &flags))) 2476 return; 2477 2478 do { 2479 if (unlikely(n)) { 2480 spin_unlock_irqrestore(&n->list_lock, flags); 2481 n = NULL; 2482 } 2483 prior = page->freelist; 2484 counters = page->counters; 2485 set_freepointer(s, object, prior); 2486 new.counters = counters; 2487 was_frozen = new.frozen; 2488 new.inuse--; 2489 if ((!new.inuse || !prior) && !was_frozen) { 2490 2491 if (!kmem_cache_debug(s) && !prior) 2492 2493 /* 2494 * Slab was on no list before and will be partially empty 2495 * We can defer the list move and instead freeze it. 2496 */ 2497 new.frozen = 1; 2498 2499 else { /* Needs to be taken off a list */ 2500 2501 n = get_node(s, page_to_nid(page)); 2502 /* 2503 * Speculatively acquire the list_lock. 2504 * If the cmpxchg does not succeed then we may 2505 * drop the list_lock without any processing. 2506 * 2507 * Otherwise the list_lock will synchronize with 2508 * other processors updating the list of slabs. 2509 */ 2510 spin_lock_irqsave(&n->list_lock, flags); 2511 2512 } 2513 } 2514 2515 } while (!cmpxchg_double_slab(s, page, 2516 prior, counters, 2517 object, new.counters, 2518 "__slab_free")); 2519 2520 if (likely(!n)) { 2521 2522 /* 2523 * If we just froze the page then put it onto the 2524 * per cpu partial list. 2525 */ 2526 if (new.frozen && !was_frozen) { 2527 put_cpu_partial(s, page, 1); 2528 stat(s, CPU_PARTIAL_FREE); 2529 } 2530 /* 2531 * The list lock was not taken therefore no list 2532 * activity can be necessary. 2533 */ 2534 if (was_frozen) 2535 stat(s, FREE_FROZEN); 2536 return; 2537 } 2538 2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) 2540 goto slab_empty; 2541 2542 /* 2543 * Objects left in the slab. If it was not on the partial list before 2544 * then add it. 2545 */ 2546 if (kmem_cache_debug(s) && unlikely(!prior)) { 2547 remove_full(s, page); 2548 add_partial(n, page, DEACTIVATE_TO_TAIL); 2549 stat(s, FREE_ADD_PARTIAL); 2550 } 2551 spin_unlock_irqrestore(&n->list_lock, flags); 2552 return; 2553 2554 slab_empty: 2555 if (prior) { 2556 /* 2557 * Slab on the partial list. 2558 */ 2559 remove_partial(n, page); 2560 stat(s, FREE_REMOVE_PARTIAL); 2561 } else 2562 /* Slab must be on the full list */ 2563 remove_full(s, page); 2564 2565 spin_unlock_irqrestore(&n->list_lock, flags); 2566 stat(s, FREE_SLAB); 2567 discard_slab(s, page); 2568 } 2569 2570 /* 2571 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 2572 * can perform fastpath freeing without additional function calls. 2573 * 2574 * The fastpath is only possible if we are freeing to the current cpu slab 2575 * of this processor. This typically the case if we have just allocated 2576 * the item before. 2577 * 2578 * If fastpath is not possible then fall back to __slab_free where we deal 2579 * with all sorts of special processing. 
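 *
 * For example, a kmalloc() immediately followed by kfree() on the same cpu
 * normally stays on this path: page == c->page, so the object is simply
 * linked in front of c->freelist with a single this_cpu_cmpxchg_double().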
2580 */ 2581 static __always_inline void slab_free(struct kmem_cache *s, 2582 struct page *page, void *x, unsigned long addr) 2583 { 2584 void **object = (void *)x; 2585 struct kmem_cache_cpu *c; 2586 unsigned long tid; 2587 2588 slab_free_hook(s, x); 2589 2590 redo: 2591 /* 2592 * Determine the current cpu's per cpu slab. 2593 * The cpu may change afterward. However that does not matter since 2594 * data is retrieved via this pointer. If we are on the same cpu 2595 * during the cmpxchg then the free will succeed. 2596 */ 2597 c = __this_cpu_ptr(s->cpu_slab); 2598 2599 tid = c->tid; 2600 barrier(); 2601 2602 if (likely(page == c->page)) { 2603 set_freepointer(s, object, c->freelist); 2604 2605 if (unlikely(!this_cpu_cmpxchg_double( 2606 s->cpu_slab->freelist, s->cpu_slab->tid, 2607 c->freelist, tid, 2608 object, next_tid(tid)))) { 2609 2610 note_cmpxchg_failure("slab_free", s, tid); 2611 goto redo; 2612 } 2613 stat(s, FREE_FASTPATH); 2614 } else 2615 __slab_free(s, page, x, addr); 2616 2617 } 2618 2619 void kmem_cache_free(struct kmem_cache *s, void *x) 2620 { 2621 s = cache_from_obj(s, x); 2622 if (!s) 2623 return; 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_); 2625 trace_kmem_cache_free(_RET_IP_, x); 2626 } 2627 EXPORT_SYMBOL(kmem_cache_free); 2628 2629 /* 2630 * Object placement in a slab is made very easy because we always start at 2631 * offset 0. If we tune the size of the object to the alignment then we can 2632 * get the required alignment by putting one properly sized object after 2633 * another. 2634 * 2635 * Notice that the allocation order determines the sizes of the per cpu 2636 * caches. Each processor always has one slab available for allocations. 2637 * Increasing the allocation order reduces the number of times that slabs 2638 * must be moved on and off the partial lists and is therefore a factor in 2639 * locking overhead. 2640 */ 2641 2642 /* 2643 * Minimum / Maximum order of slab pages. This influences locking overhead 2644 * and slab fragmentation. A higher order reduces the number of partial slabs 2645 * and increases the number of allocations possible without having to 2646 * take the list_lock. 2647 */ 2648 static int slub_min_order; 2649 static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2650 static int slub_min_objects; 2651 2652 /* 2653 * Merge control. If this is set then no merging of slab caches will occur. 2654 * (Could be removed. This was introduced to pacify the merge skeptics.) 2655 */ 2656 static int slub_nomerge; 2657 2658 /* 2659 * Calculate the order of allocation given a slab object size. 2660 * 2661 * The order of allocation has significant impact on performance and other 2662 * system components. Generally order 0 allocations should be preferred since 2663 * order 0 does not cause fragmentation in the page allocator. Larger objects 2664 * can be problematic to put into order 0 slabs because there may be too much 2665 * unused space left. We go to a higher order if more than 1/16th of the slab 2666 * would be wasted. 2667 * 2668 * In order to reach satisfactory performance we must ensure that a minimum 2669 * number of objects is in one slab. Otherwise we may generate too much 2670 * activity on the partial lists which requires taking the list_lock. This is 2671 * less a concern for large slabs though which are rarely used. 2672 * 2673 * slub_max_order specifies the order where we begin to stop considering the 2674 * number of objects in a slab as critical.
If we reach slub_max_order then 2675 * we try to keep the page order as low as possible. So we accept more waste 2676 * of space in favor of a small page order. 2677 * 2678 * Higher order allocations also allow the placement of more objects in a 2679 * slab and thereby reduce object handling overhead. If the user has 2680 * requested a higher mininum order then we start with that one instead of 2681 * the smallest order which will fit the object. 2682 */ 2683 static inline int slab_order(int size, int min_objects, 2684 int max_order, int fract_leftover, int reserved) 2685 { 2686 int order; 2687 int rem; 2688 int min_order = slub_min_order; 2689 2690 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 2691 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2692 2693 for (order = max(min_order, 2694 fls(min_objects * size - 1) - PAGE_SHIFT); 2695 order <= max_order; order++) { 2696 2697 unsigned long slab_size = PAGE_SIZE << order; 2698 2699 if (slab_size < min_objects * size + reserved) 2700 continue; 2701 2702 rem = (slab_size - reserved) % size; 2703 2704 if (rem <= slab_size / fract_leftover) 2705 break; 2706 2707 } 2708 2709 return order; 2710 } 2711 2712 static inline int calculate_order(int size, int reserved) 2713 { 2714 int order; 2715 int min_objects; 2716 int fraction; 2717 int max_objects; 2718 2719 /* 2720 * Attempt to find best configuration for a slab. This 2721 * works by first attempting to generate a layout with 2722 * the best configuration and backing off gradually. 2723 * 2724 * First we reduce the acceptable waste in a slab. Then 2725 * we reduce the minimum objects required in a slab. 2726 */ 2727 min_objects = slub_min_objects; 2728 if (!min_objects) 2729 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2730 max_objects = order_objects(slub_max_order, size, reserved); 2731 min_objects = min(min_objects, max_objects); 2732 2733 while (min_objects > 1) { 2734 fraction = 16; 2735 while (fraction >= 4) { 2736 order = slab_order(size, min_objects, 2737 slub_max_order, fraction, reserved); 2738 if (order <= slub_max_order) 2739 return order; 2740 fraction /= 2; 2741 } 2742 min_objects--; 2743 } 2744 2745 /* 2746 * We were unable to place multiple objects in a slab. Now 2747 * lets see if we can place a single object there. 2748 */ 2749 order = slab_order(size, 1, slub_max_order, 1, reserved); 2750 if (order <= slub_max_order) 2751 return order; 2752 2753 /* 2754 * Doh this slab cannot be placed using slub_max_order. 2755 */ 2756 order = slab_order(size, 1, MAX_ORDER, 1, reserved); 2757 if (order < MAX_ORDER) 2758 return order; 2759 return -ENOSYS; 2760 } 2761 2762 static void 2763 init_kmem_cache_node(struct kmem_cache_node *n) 2764 { 2765 n->nr_partial = 0; 2766 spin_lock_init(&n->list_lock); 2767 INIT_LIST_HEAD(&n->partial); 2768 #ifdef CONFIG_SLUB_DEBUG 2769 atomic_long_set(&n->nr_slabs, 0); 2770 atomic_long_set(&n->total_objects, 0); 2771 INIT_LIST_HEAD(&n->full); 2772 #endif 2773 } 2774 2775 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 2776 { 2777 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2778 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2779 2780 /* 2781 * Must align to double word boundary for the double cmpxchg 2782 * instructions to work; see __pcpu_double_call_return_bool(). 
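 *
 * In other words, the freelist/tid pair updated by this_cpu_cmpxchg_double()
 * must sit on a 2 * sizeof(void *) boundary (16 bytes on 64-bit) as required
 * by cmpxchg16b style instructions, which is why that alignment is passed to
 * __alloc_percpu() below.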
2783 */ 2784 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2785 2 * sizeof(void *)); 2786 2787 if (!s->cpu_slab) 2788 return 0; 2789 2790 init_kmem_cache_cpus(s); 2791 2792 return 1; 2793 } 2794 2795 static struct kmem_cache *kmem_cache_node; 2796 2797 /* 2798 * No kmalloc_node yet so do it by hand. We know that this is the first 2799 * slab on the node for this slabcache. There are no concurrent accesses 2800 * possible. 2801 * 2802 * Note that this function only works on the kmalloc_node_cache 2803 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2804 * memory on a fresh node that has no slab structures yet. 2805 */ 2806 static void early_kmem_cache_node_alloc(int node) 2807 { 2808 struct page *page; 2809 struct kmem_cache_node *n; 2810 2811 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2812 2813 page = new_slab(kmem_cache_node, GFP_NOWAIT, node); 2814 2815 BUG_ON(!page); 2816 if (page_to_nid(page) != node) { 2817 printk(KERN_ERR "SLUB: Unable to allocate memory from " 2818 "node %d\n", node); 2819 printk(KERN_ERR "SLUB: Allocating a useless per node structure " 2820 "in order to be able to continue\n"); 2821 } 2822 2823 n = page->freelist; 2824 BUG_ON(!n); 2825 page->freelist = get_freepointer(kmem_cache_node, n); 2826 page->inuse = 1; 2827 page->frozen = 0; 2828 kmem_cache_node->node[node] = n; 2829 #ifdef CONFIG_SLUB_DEBUG 2830 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2831 init_tracking(kmem_cache_node, n); 2832 #endif 2833 init_kmem_cache_node(n); 2834 inc_slabs_node(kmem_cache_node, node, page->objects); 2835 2836 add_partial(n, page, DEACTIVATE_TO_HEAD); 2837 } 2838 2839 static void free_kmem_cache_nodes(struct kmem_cache *s) 2840 { 2841 int node; 2842 2843 for_each_node_state(node, N_NORMAL_MEMORY) { 2844 struct kmem_cache_node *n = s->node[node]; 2845 2846 if (n) 2847 kmem_cache_free(kmem_cache_node, n); 2848 2849 s->node[node] = NULL; 2850 } 2851 } 2852 2853 static int init_kmem_cache_nodes(struct kmem_cache *s) 2854 { 2855 int node; 2856 2857 for_each_node_state(node, N_NORMAL_MEMORY) { 2858 struct kmem_cache_node *n; 2859 2860 if (slab_state == DOWN) { 2861 early_kmem_cache_node_alloc(node); 2862 continue; 2863 } 2864 n = kmem_cache_alloc_node(kmem_cache_node, 2865 GFP_KERNEL, node); 2866 2867 if (!n) { 2868 free_kmem_cache_nodes(s); 2869 return 0; 2870 } 2871 2872 s->node[node] = n; 2873 init_kmem_cache_node(n); 2874 } 2875 return 1; 2876 } 2877 2878 static void set_min_partial(struct kmem_cache *s, unsigned long min) 2879 { 2880 if (min < MIN_PARTIAL) 2881 min = MIN_PARTIAL; 2882 else if (min > MAX_PARTIAL) 2883 min = MAX_PARTIAL; 2884 s->min_partial = min; 2885 } 2886 2887 /* 2888 * calculate_sizes() determines the order and the distribution of data within 2889 * a slab object. 2890 */ 2891 static int calculate_sizes(struct kmem_cache *s, int forced_order) 2892 { 2893 unsigned long flags = s->flags; 2894 unsigned long size = s->object_size; 2895 int order; 2896 2897 /* 2898 * Round up object size to the next word boundary. We can only 2899 * place the free pointer at word boundaries and this determines 2900 * the possible location of the free pointer. 2901 */ 2902 size = ALIGN(size, sizeof(void *)); 2903 2904 #ifdef CONFIG_SLUB_DEBUG 2905 /* 2906 * Determine if we can poison the object itself. If the user of 2907 * the slab may touch the object after free or before allocation 2908 * then we should never poison the object itself. 
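 *
 * That rules out SLAB_DESTROY_BY_RCU caches (objects may still be read after
 * kmem_cache_free() until the RCU grace period ends) and caches with a
 * constructor (the constructed state must survive free/alloc cycles).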
2909 */ 2910 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 2911 !s->ctor) 2912 s->flags |= __OBJECT_POISON; 2913 else 2914 s->flags &= ~__OBJECT_POISON; 2915 2916 2917 /* 2918 * If we are Redzoning then check if there is some space between the 2919 * end of the object and the free pointer. If not then add an 2920 * additional word to have some bytes to store Redzone information. 2921 */ 2922 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 2923 size += sizeof(void *); 2924 #endif 2925 2926 /* 2927 * With that we have determined the number of bytes in actual use 2928 * by the object. This is the potential offset to the free pointer. 2929 */ 2930 s->inuse = size; 2931 2932 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 2933 s->ctor)) { 2934 /* 2935 * Relocate free pointer after the object if it is not 2936 * permitted to overwrite the first word of the object on 2937 * kmem_cache_free. 2938 * 2939 * This is the case if we do RCU, have a constructor or 2940 * destructor or are poisoning the objects. 2941 */ 2942 s->offset = size; 2943 size += sizeof(void *); 2944 } 2945 2946 #ifdef CONFIG_SLUB_DEBUG 2947 if (flags & SLAB_STORE_USER) 2948 /* 2949 * Need to store information about allocs and frees after 2950 * the object. 2951 */ 2952 size += 2 * sizeof(struct track); 2953 2954 if (flags & SLAB_RED_ZONE) 2955 /* 2956 * Add some empty padding so that we can catch 2957 * overwrites from earlier objects rather than let 2958 * tracking information or the free pointer be 2959 * corrupted if a user writes before the start 2960 * of the object. 2961 */ 2962 size += sizeof(void *); 2963 #endif 2964 2965 /* 2966 * SLUB stores one object immediately after another beginning from 2967 * offset 0. In order to align the objects we have to simply size 2968 * each object to conform to the alignment. 2969 */ 2970 size = ALIGN(size, s->align); 2971 s->size = size; 2972 if (forced_order >= 0) 2973 order = forced_order; 2974 else 2975 order = calculate_order(size, s->reserved); 2976 2977 if (order < 0) 2978 return 0; 2979 2980 s->allocflags = 0; 2981 if (order) 2982 s->allocflags |= __GFP_COMP; 2983 2984 if (s->flags & SLAB_CACHE_DMA) 2985 s->allocflags |= SLUB_DMA; 2986 2987 if (s->flags & SLAB_RECLAIM_ACCOUNT) 2988 s->allocflags |= __GFP_RECLAIMABLE; 2989 2990 /* 2991 * Determine the number of objects per slab 2992 */ 2993 s->oo = oo_make(order, size, s->reserved); 2994 s->min = oo_make(get_order(size), size, s->reserved); 2995 if (oo_objects(s->oo) > oo_objects(s->max)) 2996 s->max = s->oo; 2997 2998 return !!oo_objects(s->oo); 2999 } 3000 3001 static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3002 { 3003 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3004 s->reserved = 0; 3005 3006 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3007 s->reserved = sizeof(struct rcu_head); 3008 3009 if (!calculate_sizes(s, -1)) 3010 goto error; 3011 if (disable_higher_order_debug) { 3012 /* 3013 * Disable debugging flags that store metadata if the min slab 3014 * order increased. 
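 *
 * (This is the slub_debug=O case: red zoning, poisoning and user tracking
 * are dropped again for caches where that metadata would force a higher
 * page order than the plain object size needs.)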
3015 */ 3016 if (get_order(s->size) > get_order(s->object_size)) { 3017 s->flags &= ~DEBUG_METADATA_FLAGS; 3018 s->offset = 0; 3019 if (!calculate_sizes(s, -1)) 3020 goto error; 3021 } 3022 } 3023 3024 #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 3025 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 3026 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 3027 /* Enable fast mode */ 3028 s->flags |= __CMPXCHG_DOUBLE; 3029 #endif 3030 3031 /* 3032 * The larger the object size is, the more pages we want on the partial 3033 * list to avoid pounding the page allocator excessively. 3034 */ 3035 set_min_partial(s, ilog2(s->size) / 2); 3036 3037 /* 3038 * cpu_partial determined the maximum number of objects kept in the 3039 * per cpu partial lists of a processor. 3040 * 3041 * Per cpu partial lists mainly contain slabs that just have one 3042 * object freed. If they are used for allocation then they can be 3043 * filled up again with minimal effort. The slab will never hit the 3044 * per node partial lists and therefore no locking will be required. 3045 * 3046 * This setting also determines 3047 * 3048 * A) The number of objects from per cpu partial slabs dumped to the 3049 * per node list when we reach the limit. 3050 * B) The number of objects in cpu partial slabs to extract from the 3051 * per node list when we run out of per cpu objects. We only fetch 50% 3052 * to keep some capacity around for frees. 3053 */ 3054 if (kmem_cache_debug(s)) 3055 s->cpu_partial = 0; 3056 else if (s->size >= PAGE_SIZE) 3057 s->cpu_partial = 2; 3058 else if (s->size >= 1024) 3059 s->cpu_partial = 6; 3060 else if (s->size >= 256) 3061 s->cpu_partial = 13; 3062 else 3063 s->cpu_partial = 30; 3064 3065 #ifdef CONFIG_NUMA 3066 s->remote_node_defrag_ratio = 1000; 3067 #endif 3068 if (!init_kmem_cache_nodes(s)) 3069 goto error; 3070 3071 if (alloc_kmem_cache_cpus(s)) 3072 return 0; 3073 3074 free_kmem_cache_nodes(s); 3075 error: 3076 if (flags & SLAB_PANIC) 3077 panic("Cannot create slab %s size=%lu realsize=%u " 3078 "order=%u offset=%u flags=%lx\n", 3079 s->name, (unsigned long)s->size, s->size, oo_order(s->oo), 3080 s->offset, flags); 3081 return -EINVAL; 3082 } 3083 3084 static void list_slab_objects(struct kmem_cache *s, struct page *page, 3085 const char *text) 3086 { 3087 #ifdef CONFIG_SLUB_DEBUG 3088 void *addr = page_address(page); 3089 void *p; 3090 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * 3091 sizeof(long), GFP_ATOMIC); 3092 if (!map) 3093 return; 3094 slab_err(s, page, text, s->name); 3095 slab_lock(page); 3096 3097 get_map(s, page, map); 3098 for_each_object(p, s, addr, page->objects) { 3099 3100 if (!test_bit(slab_index(p, s, addr), map)) { 3101 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", 3102 p, p - addr); 3103 print_tracking(s, p); 3104 } 3105 } 3106 slab_unlock(page); 3107 kfree(map); 3108 #endif 3109 } 3110 3111 /* 3112 * Attempt to free all partial slabs on a node. 3113 * This is called from kmem_cache_close(). We must be the last thread 3114 * using the cache and therefore we do not need to lock anymore. 3115 */ 3116 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3117 { 3118 struct page *page, *h; 3119 3120 list_for_each_entry_safe(page, h, &n->partial, lru) { 3121 if (!page->inuse) { 3122 remove_partial(n, page); 3123 discard_slab(s, page); 3124 } else { 3125 list_slab_objects(s, page, 3126 "Objects remaining in %s on kmem_cache_close()"); 3127 } 3128 } 3129 } 3130 3131 /* 3132 * Release all resources used by a slab cache. 
3133 */ 3134 static inline int kmem_cache_close(struct kmem_cache *s) 3135 { 3136 int node; 3137 3138 flush_all(s); 3139 /* Attempt to free all objects */ 3140 for_each_node_state(node, N_NORMAL_MEMORY) { 3141 struct kmem_cache_node *n = get_node(s, node); 3142 3143 free_partial(s, n); 3144 if (n->nr_partial || slabs_node(s, node)) 3145 return 1; 3146 } 3147 free_percpu(s->cpu_slab); 3148 free_kmem_cache_nodes(s); 3149 return 0; 3150 } 3151 3152 int __kmem_cache_shutdown(struct kmem_cache *s) 3153 { 3154 int rc = kmem_cache_close(s); 3155 3156 if (!rc) { 3157 /* 3158 * We do the same lock strategy around sysfs_slab_add, see 3159 * __kmem_cache_create. Because this is pretty much the last 3160 * operation we do and the lock will be released shortly after 3161 * that in slab_common.c, we could just move sysfs_slab_remove 3162 * to a later point in common code. We should do that when we 3163 * have a common sysfs framework for all allocators. 3164 */ 3165 mutex_unlock(&slab_mutex); 3166 sysfs_slab_remove(s); 3167 mutex_lock(&slab_mutex); 3168 } 3169 3170 return rc; 3171 } 3172 3173 /******************************************************************** 3174 * Kmalloc subsystem 3175 *******************************************************************/ 3176 3177 struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3178 EXPORT_SYMBOL(kmalloc_caches); 3179 3180 #ifdef CONFIG_ZONE_DMA 3181 static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3182 #endif 3183 3184 static int __init setup_slub_min_order(char *str) 3185 { 3186 get_option(&str, &slub_min_order); 3187 3188 return 1; 3189 } 3190 3191 __setup("slub_min_order=", setup_slub_min_order); 3192 3193 static int __init setup_slub_max_order(char *str) 3194 { 3195 get_option(&str, &slub_max_order); 3196 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3197 3198 return 1; 3199 } 3200 3201 __setup("slub_max_order=", setup_slub_max_order); 3202 3203 static int __init setup_slub_min_objects(char *str) 3204 { 3205 get_option(&str, &slub_min_objects); 3206 3207 return 1; 3208 } 3209 3210 __setup("slub_min_objects=", setup_slub_min_objects); 3211 3212 static int __init setup_slub_nomerge(char *str) 3213 { 3214 slub_nomerge = 1; 3215 return 1; 3216 } 3217 3218 __setup("slub_nomerge", setup_slub_nomerge); 3219 3220 /* 3221 * Conversion table for small slabs sizes / 8 to the index in the 3222 * kmalloc array. This is necessary for slabs < 192 since we have non power 3223 * of two cache sizes there. The size of larger slabs can be determined using 3224 * fls. 
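 *
 * Worked example: kmalloc(100) gives size_index_elem(100) == 12 and
 * size_index[12] == 7, i.e. the 128 byte cache. kmalloc(300) is above 192,
 * so fls(300 - 1) == 9 selects the 512 byte cache.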
3225 */ 3226 static s8 size_index[24] = { 3227 3, /* 8 */ 3228 4, /* 16 */ 3229 5, /* 24 */ 3230 5, /* 32 */ 3231 6, /* 40 */ 3232 6, /* 48 */ 3233 6, /* 56 */ 3234 6, /* 64 */ 3235 1, /* 72 */ 3236 1, /* 80 */ 3237 1, /* 88 */ 3238 1, /* 96 */ 3239 7, /* 104 */ 3240 7, /* 112 */ 3241 7, /* 120 */ 3242 7, /* 128 */ 3243 2, /* 136 */ 3244 2, /* 144 */ 3245 2, /* 152 */ 3246 2, /* 160 */ 3247 2, /* 168 */ 3248 2, /* 176 */ 3249 2, /* 184 */ 3250 2 /* 192 */ 3251 }; 3252 3253 static inline int size_index_elem(size_t bytes) 3254 { 3255 return (bytes - 1) / 8; 3256 } 3257 3258 static struct kmem_cache *get_slab(size_t size, gfp_t flags) 3259 { 3260 int index; 3261 3262 if (size <= 192) { 3263 if (!size) 3264 return ZERO_SIZE_PTR; 3265 3266 index = size_index[size_index_elem(size)]; 3267 } else 3268 index = fls(size - 1); 3269 3270 #ifdef CONFIG_ZONE_DMA 3271 if (unlikely((flags & SLUB_DMA))) 3272 return kmalloc_dma_caches[index]; 3273 3274 #endif 3275 return kmalloc_caches[index]; 3276 } 3277 3278 void *__kmalloc(size_t size, gfp_t flags) 3279 { 3280 struct kmem_cache *s; 3281 void *ret; 3282 3283 if (unlikely(size > SLUB_MAX_SIZE)) 3284 return kmalloc_large(size, flags); 3285 3286 s = get_slab(size, flags); 3287 3288 if (unlikely(ZERO_OR_NULL_PTR(s))) 3289 return s; 3290 3291 ret = slab_alloc(s, flags, _RET_IP_); 3292 3293 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3294 3295 return ret; 3296 } 3297 EXPORT_SYMBOL(__kmalloc); 3298 3299 #ifdef CONFIG_NUMA 3300 static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3301 { 3302 struct page *page; 3303 void *ptr = NULL; 3304 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3306 page = alloc_pages_node(node, flags, get_order(size)); 3307 if (page) 3308 ptr = page_address(page); 3309 3310 kmemleak_alloc(ptr, size, 1, flags); 3311 return ptr; 3312 } 3313 3314 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3315 { 3316 struct kmem_cache *s; 3317 void *ret; 3318 3319 if (unlikely(size > SLUB_MAX_SIZE)) { 3320 ret = kmalloc_large_node(size, flags, node); 3321 3322 trace_kmalloc_node(_RET_IP_, ret, 3323 size, PAGE_SIZE << get_order(size), 3324 flags, node); 3325 3326 return ret; 3327 } 3328 3329 s = get_slab(size, flags); 3330 3331 if (unlikely(ZERO_OR_NULL_PTR(s))) 3332 return s; 3333 3334 ret = slab_alloc_node(s, flags, node, _RET_IP_); 3335 3336 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3337 3338 return ret; 3339 } 3340 EXPORT_SYMBOL(__kmalloc_node); 3341 #endif 3342 3343 size_t ksize(const void *object) 3344 { 3345 struct page *page; 3346 3347 if (unlikely(object == ZERO_SIZE_PTR)) 3348 return 0; 3349 3350 page = virt_to_head_page(object); 3351 3352 if (unlikely(!PageSlab(page))) { 3353 WARN_ON(!PageCompound(page)); 3354 return PAGE_SIZE << compound_order(page); 3355 } 3356 3357 return slab_ksize(page->slab_cache); 3358 } 3359 EXPORT_SYMBOL(ksize); 3360 3361 #ifdef CONFIG_SLUB_DEBUG 3362 bool verify_mem_not_deleted(const void *x) 3363 { 3364 struct page *page; 3365 void *object = (void *)x; 3366 unsigned long flags; 3367 bool rv; 3368 3369 if (unlikely(ZERO_OR_NULL_PTR(x))) 3370 return false; 3371 3372 local_irq_save(flags); 3373 3374 page = virt_to_head_page(x); 3375 if (unlikely(!PageSlab(page))) { 3376 /* maybe it was from stack? 
*/ 3377 rv = true; 3378 goto out_unlock; 3379 } 3380 3381 slab_lock(page); 3382 if (on_freelist(page->slab_cache, page, object)) { 3383 object_err(page->slab_cache, page, object, "Object is on free-list"); 3384 rv = false; 3385 } else { 3386 rv = true; 3387 } 3388 slab_unlock(page); 3389 3390 out_unlock: 3391 local_irq_restore(flags); 3392 return rv; 3393 } 3394 EXPORT_SYMBOL(verify_mem_not_deleted); 3395 #endif 3396 3397 void kfree(const void *x) 3398 { 3399 struct page *page; 3400 void *object = (void *)x; 3401 3402 trace_kfree(_RET_IP_, x); 3403 3404 if (unlikely(ZERO_OR_NULL_PTR(x))) 3405 return; 3406 3407 page = virt_to_head_page(x); 3408 if (unlikely(!PageSlab(page))) { 3409 BUG_ON(!PageCompound(page)); 3410 kmemleak_free(x); 3411 __free_memcg_kmem_pages(page, compound_order(page)); 3412 return; 3413 } 3414 slab_free(page->slab_cache, page, object, _RET_IP_); 3415 } 3416 EXPORT_SYMBOL(kfree); 3417 3418 /* 3419 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3420 * the remaining slabs by the number of items in use. The slabs with the 3421 * most items in use come first. New allocations will then fill those up 3422 * and thus they can be removed from the partial lists. 3423 * 3424 * The slabs with the least items are placed last. This results in them 3425 * being allocated from last increasing the chance that the last objects 3426 * are freed in them. 3427 */ 3428 int kmem_cache_shrink(struct kmem_cache *s) 3429 { 3430 int node; 3431 int i; 3432 struct kmem_cache_node *n; 3433 struct page *page; 3434 struct page *t; 3435 int objects = oo_objects(s->max); 3436 struct list_head *slabs_by_inuse = 3437 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3438 unsigned long flags; 3439 3440 if (!slabs_by_inuse) 3441 return -ENOMEM; 3442 3443 flush_all(s); 3444 for_each_node_state(node, N_NORMAL_MEMORY) { 3445 n = get_node(s, node); 3446 3447 if (!n->nr_partial) 3448 continue; 3449 3450 for (i = 0; i < objects; i++) 3451 INIT_LIST_HEAD(slabs_by_inuse + i); 3452 3453 spin_lock_irqsave(&n->list_lock, flags); 3454 3455 /* 3456 * Build lists indexed by the items in use in each slab. 3457 * 3458 * Note that concurrent frees may occur while we hold the 3459 * list_lock. page->inuse here is the upper limit. 3460 */ 3461 list_for_each_entry_safe(page, t, &n->partial, lru) { 3462 list_move(&page->lru, slabs_by_inuse + page->inuse); 3463 if (!page->inuse) 3464 n->nr_partial--; 3465 } 3466 3467 /* 3468 * Rebuild the partial list with the slabs filled up most 3469 * first and the least used slabs at the end. 
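 *
 * slabs_by_inuse[i] collects the slabs that currently have i objects in use,
 * so splicing the buckets back from high i to low i produces that order.
 * Bucket 0 (completely empty slabs) is deliberately left off the list and
 * those slabs are freed below.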
3470 */ 3471 for (i = objects - 1; i > 0; i--) 3472 list_splice(slabs_by_inuse + i, n->partial.prev); 3473 3474 spin_unlock_irqrestore(&n->list_lock, flags); 3475 3476 /* Release empty slabs */ 3477 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3478 discard_slab(s, page); 3479 } 3480 3481 kfree(slabs_by_inuse); 3482 return 0; 3483 } 3484 EXPORT_SYMBOL(kmem_cache_shrink); 3485 3486 #if defined(CONFIG_MEMORY_HOTPLUG) 3487 static int slab_mem_going_offline_callback(void *arg) 3488 { 3489 struct kmem_cache *s; 3490 3491 mutex_lock(&slab_mutex); 3492 list_for_each_entry(s, &slab_caches, list) 3493 kmem_cache_shrink(s); 3494 mutex_unlock(&slab_mutex); 3495 3496 return 0; 3497 } 3498 3499 static void slab_mem_offline_callback(void *arg) 3500 { 3501 struct kmem_cache_node *n; 3502 struct kmem_cache *s; 3503 struct memory_notify *marg = arg; 3504 int offline_node; 3505 3506 offline_node = marg->status_change_nid_normal; 3507 3508 /* 3509 * If the node still has available memory then we still need the 3510 * kmem_cache_node structure for it. Nothing to tear down. 3511 */ 3512 if (offline_node < 0) 3513 return; 3514 3515 mutex_lock(&slab_mutex); 3516 list_for_each_entry(s, &slab_caches, list) { 3517 n = get_node(s, offline_node); 3518 if (n) { 3519 /* 3520 * If n->nr_slabs > 0, slabs still exist on the node 3521 * that is going down. We were unable to free them, 3522 * and offline_pages() should not have called this 3523 * callback in that case. So, we must fail. 3524 */ 3525 BUG_ON(slabs_node(s, offline_node)); 3526 3527 s->node[offline_node] = NULL; 3528 kmem_cache_free(kmem_cache_node, n); 3529 } 3530 } 3531 mutex_unlock(&slab_mutex); 3532 } 3533 3534 static int slab_mem_going_online_callback(void *arg) 3535 { 3536 struct kmem_cache_node *n; 3537 struct kmem_cache *s; 3538 struct memory_notify *marg = arg; 3539 int nid = marg->status_change_nid_normal; 3540 int ret = 0; 3541 3542 /* 3543 * If the node's memory is already available, then kmem_cache_node is 3544 * already created. Nothing to do. 3545 */ 3546 if (nid < 0) 3547 return 0; 3548 3549 /* 3550 * We are bringing a node online. No memory is available yet. We must 3551 * allocate a kmem_cache_node structure in order to bring the node 3552 * online. 3553 */ 3554 mutex_lock(&slab_mutex); 3555 list_for_each_entry(s, &slab_caches, list) { 3556 /* 3557 * XXX: kmem_cache_alloc_node will fall back to other nodes 3558 * since memory is not yet available from the node that 3559 * is brought up.
3560 */ 3561 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 3562 if (!n) { 3563 ret = -ENOMEM; 3564 goto out; 3565 } 3566 init_kmem_cache_node(n); 3567 s->node[nid] = n; 3568 } 3569 out: 3570 mutex_unlock(&slab_mutex); 3571 return ret; 3572 } 3573 3574 static int slab_memory_callback(struct notifier_block *self, 3575 unsigned long action, void *arg) 3576 { 3577 int ret = 0; 3578 3579 switch (action) { 3580 case MEM_GOING_ONLINE: 3581 ret = slab_mem_going_online_callback(arg); 3582 break; 3583 case MEM_GOING_OFFLINE: 3584 ret = slab_mem_going_offline_callback(arg); 3585 break; 3586 case MEM_OFFLINE: 3587 case MEM_CANCEL_ONLINE: 3588 slab_mem_offline_callback(arg); 3589 break; 3590 case MEM_ONLINE: 3591 case MEM_CANCEL_OFFLINE: 3592 break; 3593 } 3594 if (ret) 3595 ret = notifier_from_errno(ret); 3596 else 3597 ret = NOTIFY_OK; 3598 return ret; 3599 } 3600 3601 #endif /* CONFIG_MEMORY_HOTPLUG */ 3602 3603 /******************************************************************** 3604 * Basic setup of slabs 3605 *******************************************************************/ 3606 3607 /* 3608 * Used for early kmem_cache structures that were allocated using 3609 * the page allocator. Allocate them properly then fix up the pointers 3610 * that may be pointing to the wrong kmem_cache structure. 3611 */ 3612 3613 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) 3614 { 3615 int node; 3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 3617 3618 memcpy(s, static_cache, kmem_cache->object_size); 3619 3620 for_each_node_state(node, N_NORMAL_MEMORY) { 3621 struct kmem_cache_node *n = get_node(s, node); 3622 struct page *p; 3623 3624 if (n) { 3625 list_for_each_entry(p, &n->partial, lru) 3626 p->slab_cache = s; 3627 3628 #ifdef CONFIG_SLUB_DEBUG 3629 list_for_each_entry(p, &n->full, lru) 3630 p->slab_cache = s; 3631 #endif 3632 } 3633 } 3634 list_add(&s->list, &slab_caches); 3635 return s; 3636 } 3637 3638 void __init kmem_cache_init(void) 3639 { 3640 static __initdata struct kmem_cache boot_kmem_cache, 3641 boot_kmem_cache_node; 3642 int i; 3643 int caches = 2; 3644 3645 if (debug_guardpage_minorder()) 3646 slub_max_order = 0; 3647 3648 kmem_cache_node = &boot_kmem_cache_node; 3649 kmem_cache = &boot_kmem_cache; 3650 3651 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3653 3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3655 3656 /* Able to allocate the per node structures */ 3657 slab_state = PARTIAL; 3658 3659 create_boot_cache(kmem_cache, "kmem_cache", 3660 offsetof(struct kmem_cache, node) + 3661 nr_node_ids * sizeof(struct kmem_cache_node *), 3662 SLAB_HWCACHE_ALIGN); 3663 3664 kmem_cache = bootstrap(&boot_kmem_cache); 3665 3666 /* 3667 * Allocate kmem_cache_node properly from the kmem_cache slab. 3668 * kmem_cache_node is separately allocated so no need to 3669 * update any list pointers. 3670 */ 3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3672 3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3674 3675 /* 3676 * Patch up the size_index table if we have strange large alignment 3677 * requirements for the kmalloc array. This is only the case for 3678 * MIPS it seems. The standard arches will not generate any code here. 3679 * 3680 * Largest permitted alignment is 256 bytes due to the way we 3681 * handle the index determination for the smaller caches. 
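 *
 * Example of the patch-up below: if KMALLOC_MIN_SIZE were 64, the table
 * entries for sizes 8..56 would be redirected to the smallest kmalloc cache
 * (KMALLOC_SHIFT_LOW, the 64 byte cache in that case) and, further down,
 * sizes 72..96 to the 128 byte cache because the 96 byte cache is not
 * created at that alignment.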
3682 * 3683 * Make sure that nothing crazy happens if someone starts tinkering 3684 * around with ARCH_KMALLOC_MINALIGN 3685 */ 3686 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3687 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3688 3689 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 3690 int elem = size_index_elem(i); 3691 if (elem >= ARRAY_SIZE(size_index)) 3692 break; 3693 size_index[elem] = KMALLOC_SHIFT_LOW; 3694 } 3695 3696 if (KMALLOC_MIN_SIZE == 64) { 3697 /* 3698 * The 96 byte size cache is not used if the alignment 3699 * is 64 byte. 3700 */ 3701 for (i = 64 + 8; i <= 96; i += 8) 3702 size_index[size_index_elem(i)] = 7; 3703 } else if (KMALLOC_MIN_SIZE == 128) { 3704 /* 3705 * The 192 byte sized cache is not used if the alignment 3706 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3707 * instead. 3708 */ 3709 for (i = 128 + 8; i <= 192; i += 8) 3710 size_index[size_index_elem(i)] = 8; 3711 } 3712 3713 /* Caches that are not of the two-to-the-power-of size */ 3714 if (KMALLOC_MIN_SIZE <= 32) { 3715 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); 3716 caches++; 3717 } 3718 3719 if (KMALLOC_MIN_SIZE <= 64) { 3720 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); 3721 caches++; 3722 } 3723 3724 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3725 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); 3726 caches++; 3727 } 3728 3729 slab_state = UP; 3730 3731 /* Provide the correct kmalloc names now that the caches are up */ 3732 if (KMALLOC_MIN_SIZE <= 32) { 3733 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); 3734 BUG_ON(!kmalloc_caches[1]->name); 3735 } 3736 3737 if (KMALLOC_MIN_SIZE <= 64) { 3738 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); 3739 BUG_ON(!kmalloc_caches[2]->name); 3740 } 3741 3742 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3743 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3744 3745 BUG_ON(!s); 3746 kmalloc_caches[i]->name = s; 3747 } 3748 3749 #ifdef CONFIG_SMP 3750 register_cpu_notifier(&slab_notifier); 3751 #endif 3752 3753 #ifdef CONFIG_ZONE_DMA 3754 for (i = 0; i < SLUB_PAGE_SHIFT; i++) { 3755 struct kmem_cache *s = kmalloc_caches[i]; 3756 3757 if (s && s->size) { 3758 char *name = kasprintf(GFP_NOWAIT, 3759 "dma-kmalloc-%d", s->object_size); 3760 3761 BUG_ON(!name); 3762 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3763 s->object_size, SLAB_CACHE_DMA); 3764 } 3765 } 3766 #endif 3767 printk(KERN_INFO 3768 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3769 " CPUs=%d, Nodes=%d\n", 3770 caches, cache_line_size(), 3771 slub_min_order, slub_max_order, slub_min_objects, 3772 nr_cpu_ids, nr_node_ids); 3773 } 3774 3775 void __init kmem_cache_init_late(void) 3776 { 3777 } 3778 3779 /* 3780 * Find a mergeable slab cache 3781 */ 3782 static int slab_unmergeable(struct kmem_cache *s) 3783 { 3784 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3785 return 1; 3786 3787 if (s->ctor) 3788 return 1; 3789 3790 /* 3791 * We may have set a slab to be unmergeable during bootstrap. 
3792 */ 3793 if (s->refcount < 0) 3794 return 1; 3795 3796 return 0; 3797 } 3798 3799 static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, 3800 size_t align, unsigned long flags, const char *name, 3801 void (*ctor)(void *)) 3802 { 3803 struct kmem_cache *s; 3804 3805 if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) 3806 return NULL; 3807 3808 if (ctor) 3809 return NULL; 3810 3811 size = ALIGN(size, sizeof(void *)); 3812 align = calculate_alignment(flags, align, size); 3813 size = ALIGN(size, align); 3814 flags = kmem_cache_flags(size, flags, name, NULL); 3815 3816 list_for_each_entry(s, &slab_caches, list) { 3817 if (slab_unmergeable(s)) 3818 continue; 3819 3820 if (size > s->size) 3821 continue; 3822 3823 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3824 continue; 3825 /* 3826 * Check if alignment is compatible. 3827 * Courtesy of Adrian Drzewiecki 3828 */ 3829 if ((s->size & ~(align - 1)) != s->size) 3830 continue; 3831 3832 if (s->size - size >= sizeof(void *)) 3833 continue; 3834 3835 if (!cache_match_memcg(s, memcg)) 3836 continue; 3837 3838 return s; 3839 } 3840 return NULL; 3841 } 3842 3843 struct kmem_cache * 3844 __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 3845 size_t align, unsigned long flags, void (*ctor)(void *)) 3846 { 3847 struct kmem_cache *s; 3848 3849 s = find_mergeable(memcg, size, align, flags, name, ctor); 3850 if (s) { 3851 s->refcount++; 3852 /* 3853 * Adjust the object sizes so that we clear 3854 * the complete object on kzalloc. 3855 */ 3856 s->object_size = max(s->object_size, (int)size); 3857 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3858 3859 if (sysfs_slab_alias(s, name)) { 3860 s->refcount--; 3861 s = NULL; 3862 } 3863 } 3864 3865 return s; 3866 } 3867 3868 int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 3869 { 3870 int err; 3871 3872 err = kmem_cache_open(s, flags); 3873 if (err) 3874 return err; 3875 3876 /* Mutex is not taken during early boot */ 3877 if (slab_state <= UP) 3878 return 0; 3879 3880 memcg_propagate_slab_attrs(s); 3881 mutex_unlock(&slab_mutex); 3882 err = sysfs_slab_add(s); 3883 mutex_lock(&slab_mutex); 3884 3885 if (err) 3886 kmem_cache_close(s); 3887 3888 return err; 3889 } 3890 3891 #ifdef CONFIG_SMP 3892 /* 3893 * Use the cpu notifier to insure that the cpu slabs are flushed when 3894 * necessary. 
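 *
 * When a cpu goes away (CPU_DEAD / CPU_UP_CANCELED) every cache's cpu slab
 * and per cpu partial list on that cpu are pushed back to the node lists
 * (or freed if empty) via __flush_cpu_slab(), so no pages remain stranded
 * on an offline cpu.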
3895 */ 3896 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3897 unsigned long action, void *hcpu) 3898 { 3899 long cpu = (long)hcpu; 3900 struct kmem_cache *s; 3901 unsigned long flags; 3902 3903 switch (action) { 3904 case CPU_UP_CANCELED: 3905 case CPU_UP_CANCELED_FROZEN: 3906 case CPU_DEAD: 3907 case CPU_DEAD_FROZEN: 3908 mutex_lock(&slab_mutex); 3909 list_for_each_entry(s, &slab_caches, list) { 3910 local_irq_save(flags); 3911 __flush_cpu_slab(s, cpu); 3912 local_irq_restore(flags); 3913 } 3914 mutex_unlock(&slab_mutex); 3915 break; 3916 default: 3917 break; 3918 } 3919 return NOTIFY_OK; 3920 } 3921 3922 static struct notifier_block __cpuinitdata slab_notifier = { 3923 .notifier_call = slab_cpuup_callback 3924 }; 3925 3926 #endif 3927 3928 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3929 { 3930 struct kmem_cache *s; 3931 void *ret; 3932 3933 if (unlikely(size > SLUB_MAX_SIZE)) 3934 return kmalloc_large(size, gfpflags); 3935 3936 s = get_slab(size, gfpflags); 3937 3938 if (unlikely(ZERO_OR_NULL_PTR(s))) 3939 return s; 3940 3941 ret = slab_alloc(s, gfpflags, caller); 3942 3943 /* Honor the call site pointer we received. */ 3944 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3945 3946 return ret; 3947 } 3948 3949 #ifdef CONFIG_NUMA 3950 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3951 int node, unsigned long caller) 3952 { 3953 struct kmem_cache *s; 3954 void *ret; 3955 3956 if (unlikely(size > SLUB_MAX_SIZE)) { 3957 ret = kmalloc_large_node(size, gfpflags, node); 3958 3959 trace_kmalloc_node(caller, ret, 3960 size, PAGE_SIZE << get_order(size), 3961 gfpflags, node); 3962 3963 return ret; 3964 } 3965 3966 s = get_slab(size, gfpflags); 3967 3968 if (unlikely(ZERO_OR_NULL_PTR(s))) 3969 return s; 3970 3971 ret = slab_alloc_node(s, gfpflags, node, caller); 3972 3973 /* Honor the call site pointer we received. 
*/ 3974 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3975 3976 return ret; 3977 } 3978 #endif 3979 3980 #ifdef CONFIG_SYSFS 3981 static int count_inuse(struct page *page) 3982 { 3983 return page->inuse; 3984 } 3985 3986 static int count_total(struct page *page) 3987 { 3988 return page->objects; 3989 } 3990 #endif 3991 3992 #ifdef CONFIG_SLUB_DEBUG 3993 static int validate_slab(struct kmem_cache *s, struct page *page, 3994 unsigned long *map) 3995 { 3996 void *p; 3997 void *addr = page_address(page); 3998 3999 if (!check_slab(s, page) || 4000 !on_freelist(s, page, NULL)) 4001 return 0; 4002 4003 /* Now we know that a valid freelist exists */ 4004 bitmap_zero(map, page->objects); 4005 4006 get_map(s, page, map); 4007 for_each_object(p, s, addr, page->objects) { 4008 if (test_bit(slab_index(p, s, addr), map)) 4009 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 4010 return 0; 4011 } 4012 4013 for_each_object(p, s, addr, page->objects) 4014 if (!test_bit(slab_index(p, s, addr), map)) 4015 if (!check_object(s, page, p, SLUB_RED_ACTIVE)) 4016 return 0; 4017 return 1; 4018 } 4019 4020 static void validate_slab_slab(struct kmem_cache *s, struct page *page, 4021 unsigned long *map) 4022 { 4023 slab_lock(page); 4024 validate_slab(s, page, map); 4025 slab_unlock(page); 4026 } 4027 4028 static int validate_slab_node(struct kmem_cache *s, 4029 struct kmem_cache_node *n, unsigned long *map) 4030 { 4031 unsigned long count = 0; 4032 struct page *page; 4033 unsigned long flags; 4034 4035 spin_lock_irqsave(&n->list_lock, flags); 4036 4037 list_for_each_entry(page, &n->partial, lru) { 4038 validate_slab_slab(s, page, map); 4039 count++; 4040 } 4041 if (count != n->nr_partial) 4042 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " 4043 "counter=%ld\n", s->name, count, n->nr_partial); 4044 4045 if (!(s->flags & SLAB_STORE_USER)) 4046 goto out; 4047 4048 list_for_each_entry(page, &n->full, lru) { 4049 validate_slab_slab(s, page, map); 4050 count++; 4051 } 4052 if (count != atomic_long_read(&n->nr_slabs)) 4053 printk(KERN_ERR "SLUB: %s %ld slabs counted but " 4054 "counter=%ld\n", s->name, count, 4055 atomic_long_read(&n->nr_slabs)); 4056 4057 out: 4058 spin_unlock_irqrestore(&n->list_lock, flags); 4059 return count; 4060 } 4061 4062 static long validate_slab_cache(struct kmem_cache *s) 4063 { 4064 int node; 4065 unsigned long count = 0; 4066 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4067 sizeof(unsigned long), GFP_KERNEL); 4068 4069 if (!map) 4070 return -ENOMEM; 4071 4072 flush_all(s); 4073 for_each_node_state(node, N_NORMAL_MEMORY) { 4074 struct kmem_cache_node *n = get_node(s, node); 4075 4076 count += validate_slab_node(s, n, map); 4077 } 4078 kfree(map); 4079 return count; 4080 } 4081 /* 4082 * Generate lists of code addresses where slabcache objects are allocated 4083 * and freed. 
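 *
 * (These lists back the alloc_calls/free_calls sysfs attributes defined
 * elsewhere in this file.) Each line emitted by list_locations() below has
 * roughly the form
 *
 *	<count> <call site> age=<min>/<avg>/<max> pid=<min>-<max> cpus=... nodes=...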
4084 */ 4085 4086 struct location { 4087 unsigned long count; 4088 unsigned long addr; 4089 long long sum_time; 4090 long min_time; 4091 long max_time; 4092 long min_pid; 4093 long max_pid; 4094 DECLARE_BITMAP(cpus, NR_CPUS); 4095 nodemask_t nodes; 4096 }; 4097 4098 struct loc_track { 4099 unsigned long max; 4100 unsigned long count; 4101 struct location *loc; 4102 }; 4103 4104 static void free_loc_track(struct loc_track *t) 4105 { 4106 if (t->max) 4107 free_pages((unsigned long)t->loc, 4108 get_order(sizeof(struct location) * t->max)); 4109 } 4110 4111 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 4112 { 4113 struct location *l; 4114 int order; 4115 4116 order = get_order(sizeof(struct location) * max); 4117 4118 l = (void *)__get_free_pages(flags, order); 4119 if (!l) 4120 return 0; 4121 4122 if (t->count) { 4123 memcpy(l, t->loc, sizeof(struct location) * t->count); 4124 free_loc_track(t); 4125 } 4126 t->max = max; 4127 t->loc = l; 4128 return 1; 4129 } 4130 4131 static int add_location(struct loc_track *t, struct kmem_cache *s, 4132 const struct track *track) 4133 { 4134 long start, end, pos; 4135 struct location *l; 4136 unsigned long caddr; 4137 unsigned long age = jiffies - track->when; 4138 4139 start = -1; 4140 end = t->count; 4141 4142 for ( ; ; ) { 4143 pos = start + (end - start + 1) / 2; 4144 4145 /* 4146 * There is nothing at "end". If we end up there 4147 * we need to add something to before end. 4148 */ 4149 if (pos == end) 4150 break; 4151 4152 caddr = t->loc[pos].addr; 4153 if (track->addr == caddr) { 4154 4155 l = &t->loc[pos]; 4156 l->count++; 4157 if (track->when) { 4158 l->sum_time += age; 4159 if (age < l->min_time) 4160 l->min_time = age; 4161 if (age > l->max_time) 4162 l->max_time = age; 4163 4164 if (track->pid < l->min_pid) 4165 l->min_pid = track->pid; 4166 if (track->pid > l->max_pid) 4167 l->max_pid = track->pid; 4168 4169 cpumask_set_cpu(track->cpu, 4170 to_cpumask(l->cpus)); 4171 } 4172 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4173 return 1; 4174 } 4175 4176 if (track->addr < caddr) 4177 end = pos; 4178 else 4179 start = pos; 4180 } 4181 4182 /* 4183 * Not found. Insert new tracking element. 
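	 * The array is kept sorted by call-site address so that the binary
	 * search above remains valid. If the array is full it is doubled
	 * first; GFP_ATOMIC is used because process_slab() calls us with the
	 * node's list_lock held and interrupts disabled.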
4184 */ 4185 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 4186 return 0; 4187 4188 l = t->loc + pos; 4189 if (pos < t->count) 4190 memmove(l + 1, l, 4191 (t->count - pos) * sizeof(struct location)); 4192 t->count++; 4193 l->count = 1; 4194 l->addr = track->addr; 4195 l->sum_time = age; 4196 l->min_time = age; 4197 l->max_time = age; 4198 l->min_pid = track->pid; 4199 l->max_pid = track->pid; 4200 cpumask_clear(to_cpumask(l->cpus)); 4201 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 4202 nodes_clear(l->nodes); 4203 node_set(page_to_nid(virt_to_page(track)), l->nodes); 4204 return 1; 4205 } 4206 4207 static void process_slab(struct loc_track *t, struct kmem_cache *s, 4208 struct page *page, enum track_item alloc, 4209 unsigned long *map) 4210 { 4211 void *addr = page_address(page); 4212 void *p; 4213 4214 bitmap_zero(map, page->objects); 4215 get_map(s, page, map); 4216 4217 for_each_object(p, s, addr, page->objects) 4218 if (!test_bit(slab_index(p, s, addr), map)) 4219 add_location(t, s, get_track(s, p, alloc)); 4220 } 4221 4222 static int list_locations(struct kmem_cache *s, char *buf, 4223 enum track_item alloc) 4224 { 4225 int len = 0; 4226 unsigned long i; 4227 struct loc_track t = { 0, 0, NULL }; 4228 int node; 4229 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4230 sizeof(unsigned long), GFP_KERNEL); 4231 4232 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4233 GFP_TEMPORARY)) { 4234 kfree(map); 4235 return sprintf(buf, "Out of memory\n"); 4236 } 4237 /* Push back cpu slabs */ 4238 flush_all(s); 4239 4240 for_each_node_state(node, N_NORMAL_MEMORY) { 4241 struct kmem_cache_node *n = get_node(s, node); 4242 unsigned long flags; 4243 struct page *page; 4244 4245 if (!atomic_long_read(&n->nr_slabs)) 4246 continue; 4247 4248 spin_lock_irqsave(&n->list_lock, flags); 4249 list_for_each_entry(page, &n->partial, lru) 4250 process_slab(&t, s, page, alloc, map); 4251 list_for_each_entry(page, &n->full, lru) 4252 process_slab(&t, s, page, alloc, map); 4253 spin_unlock_irqrestore(&n->list_lock, flags); 4254 } 4255 4256 for (i = 0; i < t.count; i++) { 4257 struct location *l = &t.loc[i]; 4258 4259 if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) 4260 break; 4261 len += sprintf(buf + len, "%7ld ", l->count); 4262 4263 if (l->addr) 4264 len += sprintf(buf + len, "%pS", (void *)l->addr); 4265 else 4266 len += sprintf(buf + len, "<not-available>"); 4267 4268 if (l->sum_time != l->min_time) { 4269 len += sprintf(buf + len, " age=%ld/%ld/%ld", 4270 l->min_time, 4271 (long)div_u64(l->sum_time, l->count), 4272 l->max_time); 4273 } else 4274 len += sprintf(buf + len, " age=%ld", 4275 l->min_time); 4276 4277 if (l->min_pid != l->max_pid) 4278 len += sprintf(buf + len, " pid=%ld-%ld", 4279 l->min_pid, l->max_pid); 4280 else 4281 len += sprintf(buf + len, " pid=%ld", 4282 l->min_pid); 4283 4284 if (num_online_cpus() > 1 && 4285 !cpumask_empty(to_cpumask(l->cpus)) && 4286 len < PAGE_SIZE - 60) { 4287 len += sprintf(buf + len, " cpus="); 4288 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4289 to_cpumask(l->cpus)); 4290 } 4291 4292 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4293 len < PAGE_SIZE - 60) { 4294 len += sprintf(buf + len, " nodes="); 4295 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 4296 l->nodes); 4297 } 4298 4299 len += sprintf(buf + len, "\n"); 4300 } 4301 4302 free_loc_track(&t); 4303 kfree(map); 4304 if (!t.count) 4305 len += sprintf(buf, "No data\n"); 4306 return len; 4307 } 4308 #endif 4309 
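/*
 * Illustrative only (all values invented): a line emitted by
 * list_locations() for the alloc_calls/free_calls files looks roughly like
 *
 *     142 some_alloc_site+0x6a/0x110 age=4/2107/10940 pid=1-713 cpus=0-3 nodes=0
 *
 * i.e. hit count, recorded call site, min/avg/max object age in jiffies,
 * pid range, and the cpu/node masks seen for that call site; the cpus=
 * and nodes= parts are omitted on single-cpu/single-node systems.
 */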
4310 #ifdef SLUB_RESILIENCY_TEST 4311 static void resiliency_test(void) 4312 { 4313 u8 *p; 4314 4315 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); 4316 4317 printk(KERN_ERR "SLUB resiliency testing\n"); 4318 printk(KERN_ERR "-----------------------\n"); 4319 printk(KERN_ERR "A. Corruption after allocation\n"); 4320 4321 p = kzalloc(16, GFP_KERNEL); 4322 p[16] = 0x12; 4323 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" 4324 " 0x12->0x%p\n\n", p + 16); 4325 4326 validate_slab_cache(kmalloc_caches[4]); 4327 4328 /* Hmmm... The next two are dangerous */ 4329 p = kzalloc(32, GFP_KERNEL); 4330 p[32 + sizeof(void *)] = 0x34; 4331 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" 4332 " 0x34 -> -0x%p\n", p); 4333 printk(KERN_ERR 4334 "If allocated object is overwritten then not detectable\n\n"); 4335 4336 validate_slab_cache(kmalloc_caches[5]); 4337 p = kzalloc(64, GFP_KERNEL); 4338 p += 64 + (get_cycles() & 0xff) * sizeof(void *); 4339 *p = 0x56; 4340 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", 4341 p); 4342 printk(KERN_ERR 4343 "If allocated object is overwritten then not detectable\n\n"); 4344 validate_slab_cache(kmalloc_caches[6]); 4345 4346 printk(KERN_ERR "\nB. Corruption after free\n"); 4347 p = kzalloc(128, GFP_KERNEL); 4348 kfree(p); 4349 *p = 0x78; 4350 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); 4351 validate_slab_cache(kmalloc_caches[7]); 4352 4353 p = kzalloc(256, GFP_KERNEL); 4354 kfree(p); 4355 p[50] = 0x9a; 4356 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", 4357 p); 4358 validate_slab_cache(kmalloc_caches[8]); 4359 4360 p = kzalloc(512, GFP_KERNEL); 4361 kfree(p); 4362 p[512] = 0xab; 4363 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); 4364 validate_slab_cache(kmalloc_caches[9]); 4365 } 4366 #else 4367 #ifdef CONFIG_SYSFS 4368 static void resiliency_test(void) {}; 4369 #endif 4370 #endif 4371 4372 #ifdef CONFIG_SYSFS 4373 enum slab_stat_type { 4374 SL_ALL, /* All slabs */ 4375 SL_PARTIAL, /* Only partially allocated slabs */ 4376 SL_CPU, /* Only slabs used for cpu caches */ 4377 SL_OBJECTS, /* Determine allocated objects not slabs */ 4378 SL_TOTAL /* Determine object capacity not slabs */ 4379 }; 4380 4381 #define SO_ALL (1 << SL_ALL) 4382 #define SO_PARTIAL (1 << SL_PARTIAL) 4383 #define SO_CPU (1 << SL_CPU) 4384 #define SO_OBJECTS (1 << SL_OBJECTS) 4385 #define SO_TOTAL (1 << SL_TOTAL) 4386 4387 static ssize_t show_slab_objects(struct kmem_cache *s, 4388 char *buf, unsigned long flags) 4389 { 4390 unsigned long total = 0; 4391 int node; 4392 int x; 4393 unsigned long *nodes; 4394 unsigned long *per_cpu; 4395 4396 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); 4397 if (!nodes) 4398 return -ENOMEM; 4399 per_cpu = nodes + nr_node_ids; 4400 4401 if (flags & SO_CPU) { 4402 int cpu; 4403 4404 for_each_possible_cpu(cpu) { 4405 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4406 int node; 4407 struct page *page; 4408 4409 page = ACCESS_ONCE(c->page); 4410 if (!page) 4411 continue; 4412 4413 node = page_to_nid(page); 4414 if (flags & SO_TOTAL) 4415 x = page->objects; 4416 else if (flags & SO_OBJECTS) 4417 x = page->inuse; 4418 else 4419 x = 1; 4420 4421 total += x; 4422 nodes[node] += x; 4423 4424 page = ACCESS_ONCE(c->partial); 4425 if (page) { 4426 x = page->pobjects; 4427 total += x; 4428 nodes[node] += x; 4429 } 4430 4431 per_cpu[node]++; 4432 } 4433 } 4434 4435 lock_memory_hotplug(); 4436 #ifdef 
CONFIG_SLUB_DEBUG 4437 if (flags & SO_ALL) { 4438 for_each_node_state(node, N_NORMAL_MEMORY) { 4439 struct kmem_cache_node *n = get_node(s, node); 4440 4441 if (flags & SO_TOTAL) 4442 x = atomic_long_read(&n->total_objects); 4443 else if (flags & SO_OBJECTS) 4444 x = atomic_long_read(&n->total_objects) - 4445 count_partial(n, count_free); 4446 4447 else 4448 x = atomic_long_read(&n->nr_slabs); 4449 total += x; 4450 nodes[node] += x; 4451 } 4452 4453 } else 4454 #endif 4455 if (flags & SO_PARTIAL) { 4456 for_each_node_state(node, N_NORMAL_MEMORY) { 4457 struct kmem_cache_node *n = get_node(s, node); 4458 4459 if (flags & SO_TOTAL) 4460 x = count_partial(n, count_total); 4461 else if (flags & SO_OBJECTS) 4462 x = count_partial(n, count_inuse); 4463 else 4464 x = n->nr_partial; 4465 total += x; 4466 nodes[node] += x; 4467 } 4468 } 4469 x = sprintf(buf, "%lu", total); 4470 #ifdef CONFIG_NUMA 4471 for_each_node_state(node, N_NORMAL_MEMORY) 4472 if (nodes[node]) 4473 x += sprintf(buf + x, " N%d=%lu", 4474 node, nodes[node]); 4475 #endif 4476 unlock_memory_hotplug(); 4477 kfree(nodes); 4478 return x + sprintf(buf + x, "\n"); 4479 } 4480 4481 #ifdef CONFIG_SLUB_DEBUG 4482 static int any_slab_objects(struct kmem_cache *s) 4483 { 4484 int node; 4485 4486 for_each_online_node(node) { 4487 struct kmem_cache_node *n = get_node(s, node); 4488 4489 if (!n) 4490 continue; 4491 4492 if (atomic_long_read(&n->total_objects)) 4493 return 1; 4494 } 4495 return 0; 4496 } 4497 #endif 4498 4499 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4500 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 4501 4502 struct slab_attribute { 4503 struct attribute attr; 4504 ssize_t (*show)(struct kmem_cache *s, char *buf); 4505 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 4506 }; 4507 4508 #define SLAB_ATTR_RO(_name) \ 4509 static struct slab_attribute _name##_attr = \ 4510 __ATTR(_name, 0400, _name##_show, NULL) 4511 4512 #define SLAB_ATTR(_name) \ 4513 static struct slab_attribute _name##_attr = \ 4514 __ATTR(_name, 0600, _name##_show, _name##_store) 4515 4516 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4517 { 4518 return sprintf(buf, "%d\n", s->size); 4519 } 4520 SLAB_ATTR_RO(slab_size); 4521 4522 static ssize_t align_show(struct kmem_cache *s, char *buf) 4523 { 4524 return sprintf(buf, "%d\n", s->align); 4525 } 4526 SLAB_ATTR_RO(align); 4527 4528 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4529 { 4530 return sprintf(buf, "%d\n", s->object_size); 4531 } 4532 SLAB_ATTR_RO(object_size); 4533 4534 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4535 { 4536 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4537 } 4538 SLAB_ATTR_RO(objs_per_slab); 4539 4540 static ssize_t order_store(struct kmem_cache *s, 4541 const char *buf, size_t length) 4542 { 4543 unsigned long order; 4544 int err; 4545 4546 err = strict_strtoul(buf, 10, &order); 4547 if (err) 4548 return err; 4549 4550 if (order > slub_max_order || order < slub_min_order) 4551 return -EINVAL; 4552 4553 calculate_sizes(s, order); 4554 return length; 4555 } 4556 4557 static ssize_t order_show(struct kmem_cache *s, char *buf) 4558 { 4559 return sprintf(buf, "%d\n", oo_order(s->oo)); 4560 } 4561 SLAB_ATTR(order); 4562 4563 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 4564 { 4565 return sprintf(buf, "%lu\n", s->min_partial); 4566 } 4567 4568 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 4569 size_t length) 4570 { 4571 
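	/*
	 * Illustrative usage, assuming the usual layout for this kset
	 * ("slab" under kernel_kobj, i.e. /sys/kernel/slab/):
	 *
	 *   echo 10 > /sys/kernel/slab/<cache>/min_partial
	 *
	 * The decimal value is parsed below and applied with set_min_partial().
	 */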
unsigned long min; 4572 int err; 4573 4574 err = strict_strtoul(buf, 10, &min); 4575 if (err) 4576 return err; 4577 4578 set_min_partial(s, min); 4579 return length; 4580 } 4581 SLAB_ATTR(min_partial); 4582 4583 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 4584 { 4585 return sprintf(buf, "%u\n", s->cpu_partial); 4586 } 4587 4588 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4589 size_t length) 4590 { 4591 unsigned long objects; 4592 int err; 4593 4594 err = strict_strtoul(buf, 10, &objects); 4595 if (err) 4596 return err; 4597 if (objects && kmem_cache_debug(s)) 4598 return -EINVAL; 4599 4600 s->cpu_partial = objects; 4601 flush_all(s); 4602 return length; 4603 } 4604 SLAB_ATTR(cpu_partial); 4605 4606 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4607 { 4608 if (!s->ctor) 4609 return 0; 4610 return sprintf(buf, "%pS\n", s->ctor); 4611 } 4612 SLAB_ATTR_RO(ctor); 4613 4614 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4615 { 4616 return sprintf(buf, "%d\n", s->refcount - 1); 4617 } 4618 SLAB_ATTR_RO(aliases); 4619 4620 static ssize_t partial_show(struct kmem_cache *s, char *buf) 4621 { 4622 return show_slab_objects(s, buf, SO_PARTIAL); 4623 } 4624 SLAB_ATTR_RO(partial); 4625 4626 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 4627 { 4628 return show_slab_objects(s, buf, SO_CPU); 4629 } 4630 SLAB_ATTR_RO(cpu_slabs); 4631 4632 static ssize_t objects_show(struct kmem_cache *s, char *buf) 4633 { 4634 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 4635 } 4636 SLAB_ATTR_RO(objects); 4637 4638 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 4639 { 4640 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 4641 } 4642 SLAB_ATTR_RO(objects_partial); 4643 4644 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 4645 { 4646 int objects = 0; 4647 int pages = 0; 4648 int cpu; 4649 int len; 4650 4651 for_each_online_cpu(cpu) { 4652 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; 4653 4654 if (page) { 4655 pages += page->pages; 4656 objects += page->pobjects; 4657 } 4658 } 4659 4660 len = sprintf(buf, "%d(%d)", objects, pages); 4661 4662 #ifdef CONFIG_SMP 4663 for_each_online_cpu(cpu) { 4664 struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; 4665 4666 if (page && len < PAGE_SIZE - 20) 4667 len += sprintf(buf + len, " C%d=%d(%d)", cpu, 4668 page->pobjects, page->pages); 4669 } 4670 #endif 4671 return len + sprintf(buf + len, "\n"); 4672 } 4673 SLAB_ATTR_RO(slabs_cpu_partial); 4674 4675 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4676 { 4677 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4678 } 4679 4680 static ssize_t reclaim_account_store(struct kmem_cache *s, 4681 const char *buf, size_t length) 4682 { 4683 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4684 if (buf[0] == '1') 4685 s->flags |= SLAB_RECLAIM_ACCOUNT; 4686 return length; 4687 } 4688 SLAB_ATTR(reclaim_account); 4689 4690 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4691 { 4692 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4693 } 4694 SLAB_ATTR_RO(hwcache_align); 4695 4696 #ifdef CONFIG_ZONE_DMA 4697 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4698 { 4699 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4700 } 4701 SLAB_ATTR_RO(cache_dma); 4702 #endif 4703 4704 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4705 { 4706 return sprintf(buf, "%d\n", !!(s->flags & 
SLAB_DESTROY_BY_RCU)); 4707 } 4708 SLAB_ATTR_RO(destroy_by_rcu); 4709 4710 static ssize_t reserved_show(struct kmem_cache *s, char *buf) 4711 { 4712 return sprintf(buf, "%d\n", s->reserved); 4713 } 4714 SLAB_ATTR_RO(reserved); 4715 4716 #ifdef CONFIG_SLUB_DEBUG 4717 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4718 { 4719 return show_slab_objects(s, buf, SO_ALL); 4720 } 4721 SLAB_ATTR_RO(slabs); 4722 4723 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4724 { 4725 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 4726 } 4727 SLAB_ATTR_RO(total_objects); 4728 4729 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 4730 { 4731 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4732 } 4733 4734 static ssize_t sanity_checks_store(struct kmem_cache *s, 4735 const char *buf, size_t length) 4736 { 4737 s->flags &= ~SLAB_DEBUG_FREE; 4738 if (buf[0] == '1') { 4739 s->flags &= ~__CMPXCHG_DOUBLE; 4740 s->flags |= SLAB_DEBUG_FREE; 4741 } 4742 return length; 4743 } 4744 SLAB_ATTR(sanity_checks); 4745 4746 static ssize_t trace_show(struct kmem_cache *s, char *buf) 4747 { 4748 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4749 } 4750 4751 static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4752 size_t length) 4753 { 4754 s->flags &= ~SLAB_TRACE; 4755 if (buf[0] == '1') { 4756 s->flags &= ~__CMPXCHG_DOUBLE; 4757 s->flags |= SLAB_TRACE; 4758 } 4759 return length; 4760 } 4761 SLAB_ATTR(trace); 4762 4763 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4764 { 4765 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 4766 } 4767 4768 static ssize_t red_zone_store(struct kmem_cache *s, 4769 const char *buf, size_t length) 4770 { 4771 if (any_slab_objects(s)) 4772 return -EBUSY; 4773 4774 s->flags &= ~SLAB_RED_ZONE; 4775 if (buf[0] == '1') { 4776 s->flags &= ~__CMPXCHG_DOUBLE; 4777 s->flags |= SLAB_RED_ZONE; 4778 } 4779 calculate_sizes(s, -1); 4780 return length; 4781 } 4782 SLAB_ATTR(red_zone); 4783 4784 static ssize_t poison_show(struct kmem_cache *s, char *buf) 4785 { 4786 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); 4787 } 4788 4789 static ssize_t poison_store(struct kmem_cache *s, 4790 const char *buf, size_t length) 4791 { 4792 if (any_slab_objects(s)) 4793 return -EBUSY; 4794 4795 s->flags &= ~SLAB_POISON; 4796 if (buf[0] == '1') { 4797 s->flags &= ~__CMPXCHG_DOUBLE; 4798 s->flags |= SLAB_POISON; 4799 } 4800 calculate_sizes(s, -1); 4801 return length; 4802 } 4803 SLAB_ATTR(poison); 4804 4805 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 4806 { 4807 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 4808 } 4809 4810 static ssize_t store_user_store(struct kmem_cache *s, 4811 const char *buf, size_t length) 4812 { 4813 if (any_slab_objects(s)) 4814 return -EBUSY; 4815 4816 s->flags &= ~SLAB_STORE_USER; 4817 if (buf[0] == '1') { 4818 s->flags &= ~__CMPXCHG_DOUBLE; 4819 s->flags |= SLAB_STORE_USER; 4820 } 4821 calculate_sizes(s, -1); 4822 return length; 4823 } 4824 SLAB_ATTR(store_user); 4825 4826 static ssize_t validate_show(struct kmem_cache *s, char *buf) 4827 { 4828 return 0; 4829 } 4830 4831 static ssize_t validate_store(struct kmem_cache *s, 4832 const char *buf, size_t length) 4833 { 4834 int ret = -EINVAL; 4835 4836 if (buf[0] == '1') { 4837 ret = validate_slab_cache(s); 4838 if (ret >= 0) 4839 ret = length; 4840 } 4841 return ret; 4842 } 4843 SLAB_ATTR(validate); 4844 4845 static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) 4846 { 4847 if 
(!(s->flags & SLAB_STORE_USER)) 4848 return -ENOSYS; 4849 return list_locations(s, buf, TRACK_ALLOC); 4850 } 4851 SLAB_ATTR_RO(alloc_calls); 4852 4853 static ssize_t free_calls_show(struct kmem_cache *s, char *buf) 4854 { 4855 if (!(s->flags & SLAB_STORE_USER)) 4856 return -ENOSYS; 4857 return list_locations(s, buf, TRACK_FREE); 4858 } 4859 SLAB_ATTR_RO(free_calls); 4860 #endif /* CONFIG_SLUB_DEBUG */ 4861 4862 #ifdef CONFIG_FAILSLAB 4863 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 4864 { 4865 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4866 } 4867 4868 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4869 size_t length) 4870 { 4871 s->flags &= ~SLAB_FAILSLAB; 4872 if (buf[0] == '1') 4873 s->flags |= SLAB_FAILSLAB; 4874 return length; 4875 } 4876 SLAB_ATTR(failslab); 4877 #endif 4878 4879 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4880 { 4881 return 0; 4882 } 4883 4884 static ssize_t shrink_store(struct kmem_cache *s, 4885 const char *buf, size_t length) 4886 { 4887 if (buf[0] == '1') { 4888 int rc = kmem_cache_shrink(s); 4889 4890 if (rc) 4891 return rc; 4892 } else 4893 return -EINVAL; 4894 return length; 4895 } 4896 SLAB_ATTR(shrink); 4897 4898 #ifdef CONFIG_NUMA 4899 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4900 { 4901 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 4902 } 4903 4904 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 4905 const char *buf, size_t length) 4906 { 4907 unsigned long ratio; 4908 int err; 4909 4910 err = strict_strtoul(buf, 10, &ratio); 4911 if (err) 4912 return err; 4913 4914 if (ratio <= 100) 4915 s->remote_node_defrag_ratio = ratio * 10; 4916 4917 return length; 4918 } 4919 SLAB_ATTR(remote_node_defrag_ratio); 4920 #endif 4921 4922 #ifdef CONFIG_SLUB_STATS 4923 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 4924 { 4925 unsigned long sum = 0; 4926 int cpu; 4927 int len; 4928 int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); 4929 4930 if (!data) 4931 return -ENOMEM; 4932 4933 for_each_online_cpu(cpu) { 4934 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 4935 4936 data[cpu] = x; 4937 sum += x; 4938 } 4939 4940 len = sprintf(buf, "%lu", sum); 4941 4942 #ifdef CONFIG_SMP 4943 for_each_online_cpu(cpu) { 4944 if (data[cpu] && len < PAGE_SIZE - 20) 4945 len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); 4946 } 4947 #endif 4948 kfree(data); 4949 return len + sprintf(buf + len, "\n"); 4950 } 4951 4952 static void clear_stat(struct kmem_cache *s, enum stat_item si) 4953 { 4954 int cpu; 4955 4956 for_each_online_cpu(cpu) 4957 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 4958 } 4959 4960 #define STAT_ATTR(si, text) \ 4961 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4962 { \ 4963 return show_stat(s, buf, si); \ 4964 } \ 4965 static ssize_t text##_store(struct kmem_cache *s, \ 4966 const char *buf, size_t length) \ 4967 { \ 4968 if (buf[0] != '0') \ 4969 return -EINVAL; \ 4970 clear_stat(s, si); \ 4971 return length; \ 4972 } \ 4973 SLAB_ATTR(text); \ 4974 4975 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4976 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4977 STAT_ATTR(FREE_FASTPATH, free_fastpath); 4978 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 4979 STAT_ATTR(FREE_FROZEN, free_frozen); 4980 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 4981 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 4982 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4983 STAT_ATTR(ALLOC_SLAB, 
alloc_slab); 4984 STAT_ATTR(ALLOC_REFILL, alloc_refill); 4985 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 4986 STAT_ATTR(FREE_SLAB, free_slab); 4987 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4988 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4989 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 4990 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4991 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4992 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4993 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 4994 STAT_ATTR(ORDER_FALLBACK, order_fallback); 4995 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4996 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4997 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 4998 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 4999 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 5000 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 5001 #endif 5002 5003 static struct attribute *slab_attrs[] = { 5004 &slab_size_attr.attr, 5005 &object_size_attr.attr, 5006 &objs_per_slab_attr.attr, 5007 &order_attr.attr, 5008 &min_partial_attr.attr, 5009 &cpu_partial_attr.attr, 5010 &objects_attr.attr, 5011 &objects_partial_attr.attr, 5012 &partial_attr.attr, 5013 &cpu_slabs_attr.attr, 5014 &ctor_attr.attr, 5015 &aliases_attr.attr, 5016 &align_attr.attr, 5017 &hwcache_align_attr.attr, 5018 &reclaim_account_attr.attr, 5019 &destroy_by_rcu_attr.attr, 5020 &shrink_attr.attr, 5021 &reserved_attr.attr, 5022 &slabs_cpu_partial_attr.attr, 5023 #ifdef CONFIG_SLUB_DEBUG 5024 &total_objects_attr.attr, 5025 &slabs_attr.attr, 5026 &sanity_checks_attr.attr, 5027 &trace_attr.attr, 5028 &red_zone_attr.attr, 5029 &poison_attr.attr, 5030 &store_user_attr.attr, 5031 &validate_attr.attr, 5032 &alloc_calls_attr.attr, 5033 &free_calls_attr.attr, 5034 #endif 5035 #ifdef CONFIG_ZONE_DMA 5036 &cache_dma_attr.attr, 5037 #endif 5038 #ifdef CONFIG_NUMA 5039 &remote_node_defrag_ratio_attr.attr, 5040 #endif 5041 #ifdef CONFIG_SLUB_STATS 5042 &alloc_fastpath_attr.attr, 5043 &alloc_slowpath_attr.attr, 5044 &free_fastpath_attr.attr, 5045 &free_slowpath_attr.attr, 5046 &free_frozen_attr.attr, 5047 &free_add_partial_attr.attr, 5048 &free_remove_partial_attr.attr, 5049 &alloc_from_partial_attr.attr, 5050 &alloc_slab_attr.attr, 5051 &alloc_refill_attr.attr, 5052 &alloc_node_mismatch_attr.attr, 5053 &free_slab_attr.attr, 5054 &cpuslab_flush_attr.attr, 5055 &deactivate_full_attr.attr, 5056 &deactivate_empty_attr.attr, 5057 &deactivate_to_head_attr.attr, 5058 &deactivate_to_tail_attr.attr, 5059 &deactivate_remote_frees_attr.attr, 5060 &deactivate_bypass_attr.attr, 5061 &order_fallback_attr.attr, 5062 &cmpxchg_double_fail_attr.attr, 5063 &cmpxchg_double_cpu_fail_attr.attr, 5064 &cpu_partial_alloc_attr.attr, 5065 &cpu_partial_free_attr.attr, 5066 &cpu_partial_node_attr.attr, 5067 &cpu_partial_drain_attr.attr, 5068 #endif 5069 #ifdef CONFIG_FAILSLAB 5070 &failslab_attr.attr, 5071 #endif 5072 5073 NULL 5074 }; 5075 5076 static struct attribute_group slab_attr_group = { 5077 .attrs = slab_attrs, 5078 }; 5079 5080 static ssize_t slab_attr_show(struct kobject *kobj, 5081 struct attribute *attr, 5082 char *buf) 5083 { 5084 struct slab_attribute *attribute; 5085 struct kmem_cache *s; 5086 int err; 5087 5088 attribute = to_slab_attr(attr); 5089 s = to_slab(kobj); 5090 5091 if (!attribute->show) 5092 return -EIO; 5093 5094 err = attribute->show(s, buf); 5095 5096 return err; 5097 } 5098 5099 static ssize_t slab_attr_store(struct kobject *kobj, 5100 struct attribute *attr, 5101 const 
char *buf, size_t len) 5102 { 5103 struct slab_attribute *attribute; 5104 struct kmem_cache *s; 5105 int err; 5106 5107 attribute = to_slab_attr(attr); 5108 s = to_slab(kobj); 5109 5110 if (!attribute->store) 5111 return -EIO; 5112 5113 err = attribute->store(s, buf, len); 5114 #ifdef CONFIG_MEMCG_KMEM 5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 5116 int i; 5117 5118 mutex_lock(&slab_mutex); 5119 if (s->max_attr_size < len) 5120 s->max_attr_size = len; 5121 5122 /* 5123 * This is a best effort propagation, so this function's return 5124 * value will be determined by the parent cache only. This is 5125 * basically because not all attributes have well-defined 5126 * semantics for rollbacks - most of the actions will 5127 * have permanent effects. 5128 * 5129 * Returning the error value of any of the children that fail 5130 * is not well defined, in the sense that users seeing the 5131 * error code won't be able to know anything about the state of 5132 * the cache. 5133 * 5134 * Only returning the error code for the parent cache at least 5135 * has well-defined semantics: the cache being written to 5136 * directly either failed or succeeded, and if it succeeded we loop 5137 * through the descendants with best-effort propagation. 5138 */ 5139 for_each_memcg_cache_index(i) { 5140 struct kmem_cache *c = cache_from_memcg(s, i); 5141 if (c) 5142 attribute->store(c, buf, len); 5143 } 5144 mutex_unlock(&slab_mutex); 5145 } 5146 #endif 5147 return err; 5148 } 5149 5150 static void memcg_propagate_slab_attrs(struct kmem_cache *s) 5151 { 5152 #ifdef CONFIG_MEMCG_KMEM 5153 int i; 5154 char *buffer = NULL; 5155 5156 if (!is_root_cache(s)) 5157 return; 5158 5159 /* 5160 * This means the cache had no attribute written. Therefore, there is 5161 * no point in copying default values around. 5162 */ 5163 if (!s->max_attr_size) 5164 return; 5165 5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { 5167 char mbuf[64]; 5168 char *buf; 5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); 5170 5171 if (!attr || !attr->store || !attr->show) 5172 continue; 5173 5174 /* 5175 * It is really bad that we have to allocate here, so we will 5176 * do it only as a fallback. If we actually allocate, though, 5177 * we can just use the allocated buffer until the end. 5178 * 5179 * Most of the slub attributes will tend to be very small in 5180 * size, but sysfs allows buffers up to a page, so page-sized 5181 * values can theoretically happen.
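		 * In that case we simply fall back to a whole zeroed page below
		 * and keep using it for the remaining attributes.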
5182 */ 5183 if (buffer) 5184 buf = buffer; 5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf)) 5186 buf = mbuf; 5187 else { 5188 buffer = (char *) get_zeroed_page(GFP_KERNEL); 5189 if (WARN_ON(!buffer)) 5190 continue; 5191 buf = buffer; 5192 } 5193 5194 attr->show(s->memcg_params->root_cache, buf); 5195 attr->store(s, buf, strlen(buf)); 5196 } 5197 5198 if (buffer) 5199 free_page((unsigned long)buffer); 5200 #endif 5201 } 5202 5203 static const struct sysfs_ops slab_sysfs_ops = { 5204 .show = slab_attr_show, 5205 .store = slab_attr_store, 5206 }; 5207 5208 static struct kobj_type slab_ktype = { 5209 .sysfs_ops = &slab_sysfs_ops, 5210 }; 5211 5212 static int uevent_filter(struct kset *kset, struct kobject *kobj) 5213 { 5214 struct kobj_type *ktype = get_ktype(kobj); 5215 5216 if (ktype == &slab_ktype) 5217 return 1; 5218 return 0; 5219 } 5220 5221 static const struct kset_uevent_ops slab_uevent_ops = { 5222 .filter = uevent_filter, 5223 }; 5224 5225 static struct kset *slab_kset; 5226 5227 #define ID_STR_LENGTH 64 5228 5229 /* Create a unique string id for a slab cache: 5230 * 5231 * Format :[flags-]size 5232 */ 5233 static char *create_unique_id(struct kmem_cache *s) 5234 { 5235 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 5236 char *p = name; 5237 5238 BUG_ON(!name); 5239 5240 *p++ = ':'; 5241 /* 5242 * First flags affecting slabcache operations. We will only 5243 * get here for aliasable slabs so we do not need to support 5244 * too many flags. The flags here must cover all flags that 5245 * are matched during merging to guarantee that the id is 5246 * unique. 5247 */ 5248 if (s->flags & SLAB_CACHE_DMA) 5249 *p++ = 'd'; 5250 if (s->flags & SLAB_RECLAIM_ACCOUNT) 5251 *p++ = 'a'; 5252 if (s->flags & SLAB_DEBUG_FREE) 5253 *p++ = 'F'; 5254 if (!(s->flags & SLAB_NOTRACK)) 5255 *p++ = 't'; 5256 if (p != name + 1) 5257 *p++ = '-'; 5258 p += sprintf(p, "%07d", s->size); 5259 5260 #ifdef CONFIG_MEMCG_KMEM 5261 if (!is_root_cache(s)) 5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); 5263 #endif 5264 5265 BUG_ON(p > name + ID_STR_LENGTH - 1); 5266 return name; 5267 } 5268 5269 static int sysfs_slab_add(struct kmem_cache *s) 5270 { 5271 int err; 5272 const char *name; 5273 int unmergeable = slab_unmergeable(s); 5274 5275 if (unmergeable) { 5276 /* 5277 * Slabcache can never be merged so we can use the name proper. 5278 * This is typically the case for debug situations. In that 5279 * case we can catch duplicate names easily. 5280 */ 5281 sysfs_remove_link(&slab_kset->kobj, s->name); 5282 name = s->name; 5283 } else { 5284 /* 5285 * Create a unique name for the slab as a target 5286 * for the symlinks. 5287 */ 5288 name = create_unique_id(s); 5289 } 5290 5291 s->kobj.kset = slab_kset; 5292 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); 5293 if (err) { 5294 kobject_put(&s->kobj); 5295 return err; 5296 } 5297 5298 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5299 if (err) { 5300 kobject_del(&s->kobj); 5301 kobject_put(&s->kobj); 5302 return err; 5303 } 5304 kobject_uevent(&s->kobj, KOBJ_ADD); 5305 if (!unmergeable) { 5306 /* Setup first alias */ 5307 sysfs_slab_alias(s, s->name); 5308 kfree(name); 5309 } 5310 return 0; 5311 } 5312 5313 static void sysfs_slab_remove(struct kmem_cache *s) 5314 { 5315 if (slab_state < FULL) 5316 /* 5317 * Sysfs has not been setup yet so no need to remove the 5318 * cache from sysfs. 
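		 * Caches that still exist once sysfs comes up are registered
		 * by slab_sysfs_init() instead.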
5319 */ 5320 return; 5321 5322 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5323 kobject_del(&s->kobj); 5324 kobject_put(&s->kobj); 5325 } 5326 5327 /* 5328 * Need to buffer aliases during bootup until sysfs becomes 5329 * available lest we lose that information. 5330 */ 5331 struct saved_alias { 5332 struct kmem_cache *s; 5333 const char *name; 5334 struct saved_alias *next; 5335 }; 5336 5337 static struct saved_alias *alias_list; 5338 5339 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 5340 { 5341 struct saved_alias *al; 5342 5343 if (slab_state == FULL) { 5344 /* 5345 * If we have a leftover link then remove it. 5346 */ 5347 sysfs_remove_link(&slab_kset->kobj, name); 5348 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 5349 } 5350 5351 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 5352 if (!al) 5353 return -ENOMEM; 5354 5355 al->s = s; 5356 al->name = name; 5357 al->next = alias_list; 5358 alias_list = al; 5359 return 0; 5360 } 5361 5362 static int __init slab_sysfs_init(void) 5363 { 5364 struct kmem_cache *s; 5365 int err; 5366 5367 mutex_lock(&slab_mutex); 5368 5369 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5370 if (!slab_kset) { 5371 mutex_unlock(&slab_mutex); 5372 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5373 return -ENOSYS; 5374 } 5375 5376 slab_state = FULL; 5377 5378 list_for_each_entry(s, &slab_caches, list) { 5379 err = sysfs_slab_add(s); 5380 if (err) 5381 printk(KERN_ERR "SLUB: Unable to add boot slab %s" 5382 " to sysfs\n", s->name); 5383 } 5384 5385 while (alias_list) { 5386 struct saved_alias *al = alias_list; 5387 5388 alias_list = alias_list->next; 5389 err = sysfs_slab_alias(al->s, al->name); 5390 if (err) 5391 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5392 " %s to sysfs\n", al->name); 5393 kfree(al); 5394 } 5395 5396 mutex_unlock(&slab_mutex); 5397 resiliency_test(); 5398 return 0; 5399 } 5400 5401 __initcall(slab_sysfs_init); 5402 #endif /* CONFIG_SYSFS */ 5403 5404 /* 5405 * The /proc/slabinfo ABI 5406 */ 5407 #ifdef CONFIG_SLABINFO 5408 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5409 { 5410 unsigned long nr_partials = 0; 5411 unsigned long nr_slabs = 0; 5412 unsigned long nr_objs = 0; 5413 unsigned long nr_free = 0; 5414 int node; 5415 5416 for_each_online_node(node) { 5417 struct kmem_cache_node *n = get_node(s, node); 5418 5419 if (!n) 5420 continue; 5421 5422 nr_partials += n->nr_partial; 5423 nr_slabs += atomic_long_read(&n->nr_slabs); 5424 nr_objs += atomic_long_read(&n->total_objects); 5425 nr_free += count_partial(n, count_free); 5426 } 5427 5428 sinfo->active_objs = nr_objs - nr_free; 5429 sinfo->num_objs = nr_objs; 5430 sinfo->active_slabs = nr_slabs; 5431 sinfo->num_slabs = nr_slabs; 5432 sinfo->objects_per_slab = oo_objects(s->oo); 5433 sinfo->cache_order = oo_order(s->oo); 5434 } 5435 5436 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) 5437 { 5438 } 5439 5440 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 5441 size_t count, loff_t *ppos) 5442 { 5443 return -EIO; 5444 } 5445 #endif /* CONFIG_SLABINFO */ 5446
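/*
 * Worked example for get_slabinfo() (illustrative numbers only): a cache
 * whose nodes hold 10 slabs of 32 objects each, with 40 free objects
 * sitting on partial slabs, reports num_objs = 320 and
 * active_objs = 320 - 40 = 280, while active_slabs and num_slabs are both
 * reported as the total slab count of 10.
 */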