/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include "internal.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* for remembering the boot option */
#else
#define do_swap_account		(0)
#endif

/*
 * The per-memcg event counter is incremented at every pagein/pageout. This
 * counter is used to trigger some periodic events. This is straightforward
 * and better than using jiffies etc. to handle periodic memcg events.
 *
 * These values are used as !((event) & ((1 << (thresh)) - 1))
 */
#define THRESHOLDS_EVENTS_THRESH	(7)	/* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH		(10)	/* once in 1024 */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
	MEM_CGROUP_EVENTS,		/* incremented at every pagein/pageout */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
						/* use container_of	   */
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/*
	 * protect against reclaim related members.
	 */
	spinlock_t reclaim_param_lock;

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from.
	 */
	int last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	atomic_t	oom_lock;
	atomic_t	refcnt;

	unsigned int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup? And what type of charges should we move?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu *stat;
};

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON,
					&mc.to->move_charge_at_immigrate);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE,
					&mc.to->move_charge_at_immigrate);
}

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
/* Not used, but added here for completeness */
#define PCGF_ACCT	(1UL << PCG_ACCT)

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
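/*
 * For example, MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the counter type
 * into the upper 16 bits and the res_counter attribute into the lower 16
 * bits, so a single cft->private value identifies both; MEMFILE_TYPE() and
 * MEMFILE_ATTR() take them apart again.
 */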
/* Used for the OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
#define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)

static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(void);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
	return &mem->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(mem, mz, mctz);
	spin_unlock(&mctz->lock);
}

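/*
 * Called via memcg_check_events() when the softlimit event threshold fires:
 * reposition the per-zone RB-tree node of this memcg, and of each of its
 * ancestors, in the soft limit tree for the zone of @page.
 */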
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; mem; mem = parent_mem_cgroup(mem)) {
		mz = mem_cgroup_zoneinfo(mem, nid, zid);
		excess = res_counter_soft_limit_excess(&mem->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mem, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node_state(node, N_POSSIBLE) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(mem, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(mem, mz, mctz);
		}
	}
}

static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
{
	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 val = 0;

	for_each_possible_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
	return val;
}

static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
{
	s64 ret;

	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	return ret;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

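/*
 * Update the per-cpu statistics for a (un)charge: adjust the CACHE or RSS
 * counter by +1/-1, count the pagein/pageout, and bump the event counter
 * that drives memcg_check_events().
 */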
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;

	preempt_disable();

	if (PageCgroupCache(pc))
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);

	if (charge)
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
	else
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);

	preempt_enable();
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

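/*
 * True once every 2^event_mask_shift events on this cpu (e.g. once per 128
 * events for THRESHOLDS_EVENTS_THRESH), by testing the low bits of the
 * event counter.
 */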
static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
{
	s64 val;

	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);

	return !(val & ((1 << event_mask_shift) - 1));
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
		mem_cgroup_threshold(mem);
		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
			mem_cgroup_update_tree(mem, page);
	}
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}

/*
 * Call the callback function against all cgroups under the hierarchy tree.
 */
static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
			  int (*func)(struct mem_cgroup *, void *))
{
	int found, ret, nextid;
	struct cgroup_subsys_state *css;
	struct mem_cgroup *mem;

	if (!root->use_hierarchy)
		return (*func)(root, data);

	nextid = 1;
	do {
		ret = 0;
		mem = NULL;

		rcu_read_lock();
		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
				   &found);
		if (css && css_tryget(css))
			mem = container_of(css, struct mem_cgroup, css);
		rcu_read_unlock();

		if (mem) {
			ret = (*func)(mem, data);
			css_put(&mem->css);
		}
		nextid = found + 1;
	} while (!ret && css);

	return ret;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
	return (mem == root_mem_cgroup);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validness of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache: it is added to the LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (!TestClearPageCgroupAcctLRU(pc))
		return;
	VM_BUG_ON(!pc->mem_cgroup);
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	VM_BUG_ON(list_empty(&pc->lru));
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	VM_BUG_ON(PageCgroupAcctLRU(pc));
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * the LRU because the page may be reused after it's fully uncharged (because
 * of SwapCache behavior). To handle that, unlink page_cgroup from the LRU when
 * we charge it again. This function is only used to charge SwapCache. It's
 * done under lock_page and it is expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

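/*
 * Returns non-zero if @task's memcg is @mem itself or, when @mem uses
 * hierarchical accounting, any memcg within @mem's subtree.
 */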
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;

	task_lock(task);
	rcu_read_lock();
	curr = try_get_mem_cgroup_from_mm(task->mm);
	rcu_read_unlock();
	task_unlock(task);
	if (!curr)
		return 0;
	/*
	 * We should check use_hierarchy of "mem" not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * if hierarchy is enabled in "curr" and "curr" is a child of "mem"
	 * in the *cgroup* hierarchy (even if use_hierarchy is disabled in
	 * "mem").
	 */
	if (mem->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	css_put(&curr->css);
	return ret;
}

static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;

	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);

	return (active > inactive);
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

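/*
 * Isolate up to @nr_to_scan pages from this memcg's private per-zone LRU
 * list into @dst, the memcg counterpart of vmscan's isolate_lru_pages().
 * The number of entries actually looked at is returned through @scanned.
 */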
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;

		page = pc->page;
		if (unlikely(!PageCgroupUsed(pc)))
			continue;
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken++;
			break;
		case -EBUSY:
			/* we don't affect global LRU but rotate in our LRU */
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
	int *val = data;
	(*val)++;
	return 0;
}

/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for the memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;

	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information.
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level.
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;
	u64 memsw;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
			total_swap_pages;
	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	/*
	 * If memsw is finite and limits the amount of swap space available
	 * to this memcg, return that limit.
	 */
	return min(limit, memsw);
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we do
				 * not reclaim too much, nor so little that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone,
				zone->zone_pgdat->node_id);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't check here whether we should
		 * stop or reclaim more; that depends on the callers.
		 * last_scanned_child works well enough for keeping fairness
		 * under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
	int *val = (int *)data;
	int x;
	/*
	 * Logically, we can stop scanning immediately when we find
	 * a memcg that is already locked. But considering unlock ops and
	 * creation/removal of memcgs, scanning them all is the simpler
	 * operation.
	 */
	x = atomic_inc_return(&mem->oom_lock);
	*val = max(x, *val);
	return 0;
}
/*
 * Check whether an OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int lock_count = 0;

	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	atomic_add_unless(&mem->oom_lock, -1, 0);
	return 0;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
}

static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t	wait;
};

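/*
 * Wake-up callback for the OOM waitqueue: wake a waiter only if its memcg
 * is the OOMing memcg itself, or the two share a hierarchy (one is an
 * ancestor of the other).
 */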
static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);

	if (oom_wait_info->mem == wake_mem)
		goto wakeup;
	/* if no hierarchy, no match */
	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
		return 0;
	/*
	 * Both of oom_wait_info->mem and wake_mem are stable under us.
	 * Then we can use css_is_ancestor without taking care of RCU.
	 */
	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
		return 0;

wakeup:
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* for filtering, pass "mem" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	/* At first, try to OOM lock hierarchy under mem.*/
	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 */
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;

	lock_page_cgroup(pc);
	mem = pc->mem_cgroup;
	if (!mem || !PageCgroupUsed(pc))
		goto done;

	/*
	 * Preemption is already disabled. We can use __this_cpu_xxx
	 */
	if (val > 0) {
		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		SetPageCgroupFileMapped(pc);
	} else {
		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		ClearPageCgroupFileMapped(pc);
	}

done:
	unlock_page_cgroup(pc);
}

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

/*
 * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is
 * consumed from the local stock and true is returned. If the stock is 0 or
 * holds charges from a cgroup which is not the current target, false is
 * returned. This stock will be refilled.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Returns stocks cached in percpu to res_counter and resets cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

/*
 * This must be called with preemption disabled or must be called by
 * a thread which is pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges(val), which come from res_counter, to the local per_cpu area.
 * They will be consumed by the consume_stock() function, later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Tries to drain stocked charges on other cpus. This function is asynchronous
 * and just schedules a work item per cpu to drain locally on each cpu. The
 * caller can expect that some charges will be returned to the res_counter
 * later, but cannot wait for it.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function is for scheduling "drain" in asynchronous way.
	 * The result of "drain" is not directly handled by callers. Then,
	 * if someone is calling drain, we don't have to call drain more.
	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
	 * there is a race. We just do a loose check here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked. Returns 0 on success (with *memcg set to the
 * charged memcg, or to NULL if the charge was bypassed) and -ENOMEM on
 * failure.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	struct mem_cgroup *mem, *mem_over_limit;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct res_counter *fail_res;
	int csize = CHARGE_SIZE;

	/*
	 * Unlike the global VM's OOM-kill, we're not in a memory shortage
	 * at the system level. So, allow dying processes to go ahead, in
	 * addition to MEMDIE processes.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	mem = *memcg;
	if (likely(!mem)) {
		mem = try_get_mem_cgroup_from_mm(mm);
		*memcg = mem;
	} else {
		css_get(&mem->css);
	}
	if (unlikely(!mem))
		return 0;

	VM_BUG_ON(css_is_removed(&mem->css));
	if (mem_cgroup_is_root(mem))
		goto done;

	while (1) {
		int ret = 0;
		unsigned long flags = 0;

		if (consume_stock(mem))
			goto done;

		ret = res_counter_charge(&mem->res, csize, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, csize);
			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		} else
			/* mem counter fails */
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									res);

		/* reduce request size and retry */
		if (csize > PAGE_SIZE) {
			csize = PAGE_SIZE;
			continue;
		}
		if (!(gfp_mask & __GFP_WAIT))
			goto nomem;

		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
						      gfp_mask, flags);
		if (ret)
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (mem_cgroup_check_under_limit(mem_over_limit))
			continue;

		/* try to avoid oom while someone is moving charge */
		if (mc.moving_task && current != mc.moving_task) {
			struct mem_cgroup *from, *to;
			bool do_continue = false;
			/*
			 * There is a small race that "from" or "to" can be
			 * freed by rmdir, so we use css_tryget().
			 */
			from = mc.from;
			to = mc.to;
			if (from && css_tryget(&from->css)) {
				if (mem_over_limit->use_hierarchy)
					do_continue = css_is_ancestor(
							&from->css,
							&mem_over_limit->css);
				else
					do_continue = (from == mem_over_limit);
				css_put(&from->css);
			}
			if (!do_continue && to && css_tryget(&to->css)) {
				if (mem_over_limit->use_hierarchy)
					do_continue = css_is_ancestor(
							&to->css,
							&mem_over_limit->css);
				else
					do_continue = (to == mem_over_limit);
				css_put(&to->css);
			}
			if (do_continue) {
				DEFINE_WAIT(wait);
				prepare_to_wait(&mc.waitq, &wait,
							TASK_INTERRUPTIBLE);
				/* moving charge context might have finished. */
				if (mc.moving_task)
					schedule();
				finish_wait(&mc.waitq, &wait);
				continue;
			}
		}

		if (!nr_retries--) {
			if (!oom)
				goto nomem;
			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
				continue;
			}
			/* When we reach here, the current task is dying. */
			css_put(&mem->css);
			goto bypass;
		}
	}
	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
done:
	return 0;
nomem:
	css_put(&mem->css);
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}

/*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that: it does the uncharge and puts the css refcount
 * taken by try_charge().
 */
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
							unsigned long count)
{
	if (!mem_cgroup_is_root(mem)) {
		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
		WARN_ON_ONCE(count > INT_MAX);
		__css_put(&mem->css, (int)count);
	}
	/* we don't need css_put for root */
}

static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	__mem_cgroup_cancel_charge(mem, 1);
}

/*
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or the like if
 * that is its concern. (dropping the refcnt from swap can be called against
 * a removed memcg.)
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

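/*
 * Look up the memcg a page is charged to, or, for SwapCache that is not yet
 * charged, the memcg recorded in swap_cgroup, and take a css reference on
 * it. Returns NULL if no memcg can be pinned.
 */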
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

/*
 * commit a charge obtained by __mem_cgroup_try_charge() and mark the
 * page_cgroup as USED. If it is already USED, uncharge and return.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		mem_cgroup_cancel_charge(mem);
		return;
	}

	pc->mem_cgroup = mem;
	/*
	 * We access a page_cgroup asynchronously without lock_page_cgroup().
	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
	 * before USED bit, we need a memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, pc, true);

	unlock_page_cgroup(pc);
	/*
	 * "charge_statistics" updated the event counter, so check it, and
	 * insert the ancestor (and the ancestor's ancestors) into the
	 * softlimit RB-tree if they exceed their softlimit.
	 */
	memcg_check_events(mem, pc->page);
}

/**
 * __mem_cgroup_move_account - move account of the page
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should call uncharge and css_put against @from.
 *
 * The caller must confirm the following.
 * - page is not on LRU (isolate_page() is useful.)
 * - the pc is locked, used, and ->mem_cgroup points to @from.
 *
 * This function doesn't do "charge" nor css_get to the new cgroup. That should
 * be done by the caller (__mem_cgroup_try_charge would be useful). If
 * @uncharge is true, this function does "uncharge" from the old cgroup, but it
 * doesn't if @uncharge is false, so a caller should do the "uncharge".
 */
static void __mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));
	VM_BUG_ON(!PageCgroupLocked(pc));
	VM_BUG_ON(!PageCgroupUsed(pc));
	VM_BUG_ON(pc->mem_cgroup != from);

	if (PageCgroupFileMapped(pc)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, pc, false);
	if (uncharge)
		/* This is not "cancel", but cancel_charge does all we need. */
		mem_cgroup_cancel_charge(from);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, pc, true);
	/*
	 * We charge against "to", which may not have any tasks. Then, "to"
	 * can be under rmdir(). But in the current implementation, the
	 * callers of this function are just force_empty() and move charge,
	 * so it's guaranteed that "to" is never removed. So, we don't check
	 * the rmdir status here.
	 */
}

/*
 * check whether the @pc is valid for moving account and call
 * __mem_cgroup_move_account()
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
	int ret = -EINVAL;
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
		__mem_cgroup_move_account(pc, from, to, uncharge);
		ret = 0;
	}
	unlock_page_cgroup(pc);
	/*
	 * check events
	 */
	memcg_check_events(to, pc->page);
	memcg_check_events(from, pc->page);
	return ret;
}

/*
 * move charges to its parent.
 */
static int mem_cgroup_move_parent(struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct page *page = pc->page;
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	parent = mem_cgroup_from_cont(pcg);
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
	if (ret || !parent)
		goto put_back;

	ret = mem_cgroup_move_account(pc, child, parent, true);
	if (ret)
		mem_cgroup_cancel_charge(parent);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	int ret;

	pc = lookup_page_cgroup(page);
	/* can happen at boot */
	if (unlikely(!pc))
		return 0;
	prefetchw(pc);

	mem = memcg;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, pc, ctype);
	return 0;
}

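/*
 * Charge a page that is being newly mapped as anonymous memory. Compound
 * pages and pages that are already accounted for (mapped, or file cache
 * with a mapping) are skipped; without an mm the charge falls back to
 * init_mm.
 */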
*/ 1943 if (!pcg) 1944 return -EINVAL; 1945 1946 ret = -EBUSY; 1947 if (!get_page_unless_zero(page)) 1948 goto out; 1949 if (isolate_lru_page(page)) 1950 goto put; 1951 1952 parent = mem_cgroup_from_cont(pcg); 1953 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1954 if (ret || !parent) 1955 goto put_back; 1956 1957 ret = mem_cgroup_move_account(pc, child, parent, true); 1958 if (ret) 1959 mem_cgroup_cancel_charge(parent); 1960 put_back: 1961 putback_lru_page(page); 1962 put: 1963 put_page(page); 1964 out: 1965 return ret; 1966 } 1967 1968 /* 1969 * Charge the memory controller for page usage. 1970 * Return 1971 * 0 if the charge was successful 1972 * < 0 if the cgroup is over its limit 1973 */ 1974 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1975 gfp_t gfp_mask, enum charge_type ctype, 1976 struct mem_cgroup *memcg) 1977 { 1978 struct mem_cgroup *mem; 1979 struct page_cgroup *pc; 1980 int ret; 1981 1982 pc = lookup_page_cgroup(page); 1983 /* can happen at boot */ 1984 if (unlikely(!pc)) 1985 return 0; 1986 prefetchw(pc); 1987 1988 mem = memcg; 1989 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1990 if (ret || !mem) 1991 return ret; 1992 1993 __mem_cgroup_commit_charge(mem, pc, ctype); 1994 return 0; 1995 } 1996 1997 int mem_cgroup_newpage_charge(struct page *page, 1998 struct mm_struct *mm, gfp_t gfp_mask) 1999 { 2000 if (mem_cgroup_disabled()) 2001 return 0; 2002 if (PageCompound(page)) 2003 return 0; 2004 /* 2005 * If already mapped, we don't have to account. 2006 * If page cache, page->mapping has address_space. 2007 * But page->mapping may have out-of-use anon_vma pointer, 2008 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 2009 * is NULL. 2010 */ 2011 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2012 return 0; 2013 if (unlikely(!mm)) 2014 mm = &init_mm; 2015 return mem_cgroup_charge_common(page, mm, gfp_mask, 2016 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2017 } 2018 2019 static void 2020 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2021 enum charge_type ctype); 2022 2023 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2024 gfp_t gfp_mask) 2025 { 2026 struct mem_cgroup *mem = NULL; 2027 int ret; 2028 2029 if (mem_cgroup_disabled()) 2030 return 0; 2031 if (PageCompound(page)) 2032 return 0; 2033 /* 2034 * Corner case handling. This is called from add_to_page_cache() 2035 * in usual. But some FS (shmem) precharges this page before calling it 2036 * and call add_to_page_cache() with GFP_NOWAIT. 2037 * 2038 * For GFP_NOWAIT case, the page may be pre-charged before calling 2039 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 2040 * charge twice. (It works but has to pay a bit larger cost.) 2041 * And when the page is SwapCache, it should take swap information 2042 * into account. This is under lock_page() now. 
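 * In short: for the !__GFP_WAIT case we peek at the page_cgroup first
 * and return 0 if it is already marked USED, instead of charging the
 * same page a second time.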
2043 */ 2044 if (!(gfp_mask & __GFP_WAIT)) { 2045 struct page_cgroup *pc; 2046 2047 2048 pc = lookup_page_cgroup(page); 2049 if (!pc) 2050 return 0; 2051 lock_page_cgroup(pc); 2052 if (PageCgroupUsed(pc)) { 2053 unlock_page_cgroup(pc); 2054 return 0; 2055 } 2056 unlock_page_cgroup(pc); 2057 } 2058 2059 if (unlikely(!mm && !mem)) 2060 mm = &init_mm; 2061 2062 if (page_is_file_cache(page)) 2063 return mem_cgroup_charge_common(page, mm, gfp_mask, 2064 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2065 2066 /* shmem */ 2067 if (PageSwapCache(page)) { 2068 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2069 if (!ret) 2070 __mem_cgroup_commit_charge_swapin(page, mem, 2071 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2072 } else 2073 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2074 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2075 2076 return ret; 2077 } 2078 2079 /* 2080 * While swap-in, try_charge -> commit or cancel, the page is locked. 2081 * And when try_charge() successfully returns, one refcnt to memcg without 2082 * struct page_cgroup is acquired. This refcnt will be consumed by 2083 * "commit()" or removed by "cancel()" 2084 */ 2085 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2086 struct page *page, 2087 gfp_t mask, struct mem_cgroup **ptr) 2088 { 2089 struct mem_cgroup *mem; 2090 int ret; 2091 2092 if (mem_cgroup_disabled()) 2093 return 0; 2094 2095 if (!do_swap_account) 2096 goto charge_cur_mm; 2097 /* 2098 * A racing thread's fault, or swapoff, may have already updated 2099 * the pte, and even removed page from swap cache: in those cases 2100 * do_swap_page()'s pte_same() test will fail; but there's also a 2101 * KSM case which does need to charge the page. 2102 */ 2103 if (!PageSwapCache(page)) 2104 goto charge_cur_mm; 2105 mem = try_get_mem_cgroup_from_page(page); 2106 if (!mem) 2107 goto charge_cur_mm; 2108 *ptr = mem; 2109 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2110 /* drop extra refcnt from tryget */ 2111 css_put(&mem->css); 2112 return ret; 2113 charge_cur_mm: 2114 if (unlikely(!mm)) 2115 mm = &init_mm; 2116 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2117 } 2118 2119 static void 2120 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2121 enum charge_type ctype) 2122 { 2123 struct page_cgroup *pc; 2124 2125 if (mem_cgroup_disabled()) 2126 return; 2127 if (!ptr) 2128 return; 2129 cgroup_exclude_rmdir(&ptr->css); 2130 pc = lookup_page_cgroup(page); 2131 mem_cgroup_lru_del_before_commit_swapcache(page); 2132 __mem_cgroup_commit_charge(ptr, pc, ctype); 2133 mem_cgroup_lru_add_after_commit_swapcache(page); 2134 /* 2135 * Now swap is on-memory. This means this page may be 2136 * counted both as mem and swap....double count. 2137 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2138 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2139 * may call delete_from_swap_cache() before reach here. 2140 */ 2141 if (do_swap_account && PageSwapCache(page)) { 2142 swp_entry_t ent = {.val = page_private(page)}; 2143 unsigned short id; 2144 struct mem_cgroup *memcg; 2145 2146 id = swap_cgroup_record(ent, 0); 2147 rcu_read_lock(); 2148 memcg = mem_cgroup_lookup(id); 2149 if (memcg) { 2150 /* 2151 * This recorded memcg can be obsolete one. 
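 * (Its cgroup may already have been removed; the reference taken at
 * swap-out time by mem_cgroup_uncharge_swapcache() via mem_cgroup_get()
 * keeps the structure itself alive until the mem_cgroup_put() below.)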
So, avoid calling css_tryget.
			 */
			if (!mem_cgroup_is_root(memcg))
				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
			mem_cgroup_swap_statistics(memcg, false);
			mem_cgroup_put(memcg);
		}
		rcu_read_unlock();
	}
	/*
	 * At swapin, we may charge account against cgroup which has no tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. Check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&ptr->css);
}

void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
	__mem_cgroup_commit_charge_swapin(page, ptr,
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
	if (mem_cgroup_disabled())
		return;
	if (!mem)
		return;
	mem_cgroup_cancel_charge(mem);
}

static void
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
{
	struct memcg_batch_info *batch = NULL;
	bool uncharge_memsw = true;

	/* If swapout, usage of swap doesn't decrease */
	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
		uncharge_memsw = false;

	batch = &current->memcg_batch;
	/*
	 * Usually we do css_get() when we remember a memcg pointer.
	 * But in this case, we keep res->usage until the end of a series of
	 * uncharges. Then, it's ok to ignore memcg's refcnt.
	 */
	if (!batch->memcg)
		batch->memcg = mem;
	/*
	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
	 * In those cases, all pages freed continuously can be expected to be
	 * in the same cgroup and we have a chance to coalesce uncharges.
	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
	 * because we want to do uncharge as soon as possible.
	 */
	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
		goto direct_uncharge;

	/*
	 * In the typical case batch->memcg == mem. This means we can
	 * merge a series of uncharges into one uncharge of the res_counter.
	 * If not, we uncharge the res_counter one by one.
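	 * e.g. truncating one large file typically frees pages that all
	 * belong to a single memcg, so the whole series is handed back to
	 * the res_counter with one call from mem_cgroup_uncharge_end().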
2216 */ 2217 if (batch->memcg != mem) 2218 goto direct_uncharge; 2219 /* remember freed charge and uncharge it later */ 2220 batch->bytes += PAGE_SIZE; 2221 if (uncharge_memsw) 2222 batch->memsw_bytes += PAGE_SIZE; 2223 return; 2224 direct_uncharge: 2225 res_counter_uncharge(&mem->res, PAGE_SIZE); 2226 if (uncharge_memsw) 2227 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2228 if (unlikely(batch->memcg != mem)) 2229 memcg_oom_recover(mem); 2230 return; 2231 } 2232 2233 /* 2234 * uncharge if !page_mapped(page) 2235 */ 2236 static struct mem_cgroup * 2237 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2238 { 2239 struct page_cgroup *pc; 2240 struct mem_cgroup *mem = NULL; 2241 struct mem_cgroup_per_zone *mz; 2242 2243 if (mem_cgroup_disabled()) 2244 return NULL; 2245 2246 if (PageSwapCache(page)) 2247 return NULL; 2248 2249 /* 2250 * Check if our page_cgroup is valid 2251 */ 2252 pc = lookup_page_cgroup(page); 2253 if (unlikely(!pc || !PageCgroupUsed(pc))) 2254 return NULL; 2255 2256 lock_page_cgroup(pc); 2257 2258 mem = pc->mem_cgroup; 2259 2260 if (!PageCgroupUsed(pc)) 2261 goto unlock_out; 2262 2263 switch (ctype) { 2264 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2265 case MEM_CGROUP_CHARGE_TYPE_DROP: 2266 /* See mem_cgroup_prepare_migration() */ 2267 if (page_mapped(page) || PageCgroupMigration(pc)) 2268 goto unlock_out; 2269 break; 2270 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2271 if (!PageAnon(page)) { /* Shared memory */ 2272 if (page->mapping && !page_is_file_cache(page)) 2273 goto unlock_out; 2274 } else if (page_mapped(page)) /* Anon */ 2275 goto unlock_out; 2276 break; 2277 default: 2278 break; 2279 } 2280 2281 if (!mem_cgroup_is_root(mem)) 2282 __do_uncharge(mem, ctype); 2283 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2284 mem_cgroup_swap_statistics(mem, true); 2285 mem_cgroup_charge_statistics(mem, pc, false); 2286 2287 ClearPageCgroupUsed(pc); 2288 /* 2289 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2290 * freed from LRU. This is safe because uncharged page is expected not 2291 * to be reused (freed soon). Exception is SwapCache, it's handled by 2292 * special functions. 2293 */ 2294 2295 mz = page_cgroup_zoneinfo(pc); 2296 unlock_page_cgroup(pc); 2297 2298 memcg_check_events(mem, page); 2299 /* at swapout, this memcg will be accessed to record to swap */ 2300 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2301 css_put(&mem->css); 2302 2303 return mem; 2304 2305 unlock_out: 2306 unlock_page_cgroup(pc); 2307 return NULL; 2308 } 2309 2310 void mem_cgroup_uncharge_page(struct page *page) 2311 { 2312 /* early check. */ 2313 if (page_mapped(page)) 2314 return; 2315 if (page->mapping && !PageAnon(page)) 2316 return; 2317 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2318 } 2319 2320 void mem_cgroup_uncharge_cache_page(struct page *page) 2321 { 2322 VM_BUG_ON(page_mapped(page)); 2323 VM_BUG_ON(page->mapping); 2324 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2325 } 2326 2327 /* 2328 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 2329 * In that cases, pages are freed continuously and we can expect pages 2330 * are in the same memcg. All these calls itself limits the number of 2331 * pages freed at once, then uncharge_start/end() is called properly. 2332 * This may be called prural(2) times in a context, 2333 */ 2334 2335 void mem_cgroup_uncharge_start(void) 2336 { 2337 current->memcg_batch.do_batch++; 2338 /* We can do nest. 
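 * (uncharge_start()/end() pairs may be nested; only the outermost
 * start resets the batch below, and only the outermost end flushes it.)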
*/
	if (current->memcg_batch.do_batch == 1) {
		current->memcg_batch.memcg = NULL;
		current->memcg_batch.bytes = 0;
		current->memcg_batch.memsw_bytes = 0;
	}
}

void mem_cgroup_uncharge_end(void)
{
	struct memcg_batch_info *batch = &current->memcg_batch;

	if (!batch->do_batch)
		return;

	batch->do_batch--;
	if (batch->do_batch) /* If stacked, do nothing. */
		return;

	if (!batch->memcg)
		return;
	/*
	 * This "batch->memcg" is valid without any css_get/put etc...
	 * because we hide charges behind us.
	 */
	if (batch->bytes)
		res_counter_uncharge(&batch->memcg->res, batch->bytes);
	if (batch->memsw_bytes)
		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
	memcg_oom_recover(batch->memcg);
	/* forget this pointer (for sanity check) */
	batch->memcg = NULL;
}

#ifdef CONFIG_SWAP
/*
 * Called after __delete_from_swap_cache() to drop the "page" account.
 * The memcg information is recorded in the swap_cgroup of "ent".
 */
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
	struct mem_cgroup *memcg;
	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

	if (!swapout) /* this was a swap cache but the swap is unused ! */
		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

	memcg = __mem_cgroup_uncharge_common(page, ctype);

	/* record memcg information */
	if (do_swap_account && swapout && memcg) {
		swap_cgroup_record(ent, css_id(&memcg->css));
		mem_cgroup_get(memcg);
	}
	if (swapout && memcg)
		css_put(&memcg->css);
}
#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Called from swap_entry_free(). Remove the record in swap_cgroup and
 * uncharge the "memsw" account.
 */
void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(ent, 0);
	rcu_read_lock();
	memcg = mem_cgroup_lookup(id);
	if (memcg) {
		/*
		 * We uncharge this because the swap is freed.  This memcg can
		 * be an obsolete one, so we avoid calling css_tryget.
		 */
		if (!mem_cgroup_is_root(memcg))
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_swap_statistics(memcg, false);
		mem_cgroup_put(memcg);
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from: mem_cgroup which the entry is moved from
 * @to: mem_cgroup which the entry is moved to
 * @need_fixup: whether we should fixup res_counters and refcounts.
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called res_counter_charge() for
 * both res and memsw, and called css_get().
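 * On success the swap_cgroup record for @entry is atomically rewritten
 * from @from's css id to @to's id via swap_cgroup_cmpxchg().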
2441 */ 2442 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2443 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2444 { 2445 unsigned short old_id, new_id; 2446 2447 old_id = css_id(&from->css); 2448 new_id = css_id(&to->css); 2449 2450 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2451 mem_cgroup_swap_statistics(from, false); 2452 mem_cgroup_swap_statistics(to, true); 2453 /* 2454 * This function is only called from task migration context now. 2455 * It postpones res_counter and refcount handling till the end 2456 * of task migration(mem_cgroup_clear_mc()) for performance 2457 * improvement. But we cannot postpone mem_cgroup_get(to) 2458 * because if the process that has been moved to @to does 2459 * swap-in, the refcount of @to might be decreased to 0. 2460 */ 2461 mem_cgroup_get(to); 2462 if (need_fixup) { 2463 if (!mem_cgroup_is_root(from)) 2464 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2465 mem_cgroup_put(from); 2466 /* 2467 * we charged both to->res and to->memsw, so we should 2468 * uncharge to->res. 2469 */ 2470 if (!mem_cgroup_is_root(to)) 2471 res_counter_uncharge(&to->res, PAGE_SIZE); 2472 css_put(&to->css); 2473 } 2474 return 0; 2475 } 2476 return -EINVAL; 2477 } 2478 #else 2479 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2480 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2481 { 2482 return -EINVAL; 2483 } 2484 #endif 2485 2486 /* 2487 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2488 * page belongs to. 2489 */ 2490 int mem_cgroup_prepare_migration(struct page *page, 2491 struct page *newpage, struct mem_cgroup **ptr) 2492 { 2493 struct page_cgroup *pc; 2494 struct mem_cgroup *mem = NULL; 2495 enum charge_type ctype; 2496 int ret = 0; 2497 2498 if (mem_cgroup_disabled()) 2499 return 0; 2500 2501 pc = lookup_page_cgroup(page); 2502 lock_page_cgroup(pc); 2503 if (PageCgroupUsed(pc)) { 2504 mem = pc->mem_cgroup; 2505 css_get(&mem->css); 2506 /* 2507 * At migrating an anonymous page, its mapcount goes down 2508 * to 0 and uncharge() will be called. But, even if it's fully 2509 * unmapped, migration may fail and this page has to be 2510 * charged again. We set MIGRATION flag here and delay uncharge 2511 * until end_migration() is called 2512 * 2513 * Corner Case Thinking 2514 * A) 2515 * When the old page was mapped as Anon and it's unmap-and-freed 2516 * while migration was ongoing. 2517 * If unmap finds the old page, uncharge() of it will be delayed 2518 * until end_migration(). If unmap finds a new page, it's 2519 * uncharged when it make mapcount to be 1->0. If unmap code 2520 * finds swap_migration_entry, the new page will not be mapped 2521 * and end_migration() will find it(mapcount==0). 2522 * 2523 * B) 2524 * When the old page was mapped but migraion fails, the kernel 2525 * remaps it. A charge for it is kept by MIGRATION flag even 2526 * if mapcount goes down to 0. We can do remap successfully 2527 * without charging it again. 2528 * 2529 * C) 2530 * The "old" page is under lock_page() until the end of 2531 * migration, so, the old page itself will not be swapped-out. 2532 * If the new page is swapped out before end_migraton, our 2533 * hook to usual swap-out path will catch the event. 2534 */ 2535 if (PageAnon(page)) 2536 SetPageCgroupMigration(pc); 2537 } 2538 unlock_page_cgroup(pc); 2539 /* 2540 * If the page is not charged at this point, 2541 * we return here. 
2542 */ 2543 if (!mem) 2544 return 0; 2545 2546 *ptr = mem; 2547 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2548 css_put(&mem->css);/* drop extra refcnt */ 2549 if (ret || *ptr == NULL) { 2550 if (PageAnon(page)) { 2551 lock_page_cgroup(pc); 2552 ClearPageCgroupMigration(pc); 2553 unlock_page_cgroup(pc); 2554 /* 2555 * The old page may be fully unmapped while we kept it. 2556 */ 2557 mem_cgroup_uncharge_page(page); 2558 } 2559 return -ENOMEM; 2560 } 2561 /* 2562 * We charge new page before it's used/mapped. So, even if unlock_page() 2563 * is called before end_migration, we can catch all events on this new 2564 * page. In the case new page is migrated but not remapped, new page's 2565 * mapcount will be finally 0 and we call uncharge in end_migration(). 2566 */ 2567 pc = lookup_page_cgroup(newpage); 2568 if (PageAnon(page)) 2569 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2570 else if (page_is_file_cache(page)) 2571 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2572 else 2573 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2574 __mem_cgroup_commit_charge(mem, pc, ctype); 2575 return ret; 2576 } 2577 2578 /* remove redundant charge if migration failed*/ 2579 void mem_cgroup_end_migration(struct mem_cgroup *mem, 2580 struct page *oldpage, struct page *newpage) 2581 { 2582 struct page *used, *unused; 2583 struct page_cgroup *pc; 2584 2585 if (!mem) 2586 return; 2587 /* blocks rmdir() */ 2588 cgroup_exclude_rmdir(&mem->css); 2589 /* at migration success, oldpage->mapping is NULL. */ 2590 if (oldpage->mapping) { 2591 used = oldpage; 2592 unused = newpage; 2593 } else { 2594 used = newpage; 2595 unused = oldpage; 2596 } 2597 /* 2598 * We disallowed uncharge of pages under migration because mapcount 2599 * of the page goes down to zero, temporarly. 2600 * Clear the flag and check the page should be charged. 2601 */ 2602 pc = lookup_page_cgroup(oldpage); 2603 lock_page_cgroup(pc); 2604 ClearPageCgroupMigration(pc); 2605 unlock_page_cgroup(pc); 2606 2607 if (unused != oldpage) 2608 pc = lookup_page_cgroup(unused); 2609 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2610 2611 pc = lookup_page_cgroup(used); 2612 /* 2613 * If a page is a file cache, radix-tree replacement is very atomic 2614 * and we can skip this check. When it was an Anon page, its mapcount 2615 * goes down to 0. But because we added MIGRATION flage, it's not 2616 * uncharged yet. There are several case but page->mapcount check 2617 * and USED bit check in mem_cgroup_uncharge_page() will do enough 2618 * check. (see prepare_charge() also) 2619 */ 2620 if (PageAnon(used)) 2621 mem_cgroup_uncharge_page(used); 2622 /* 2623 * At migration, we may charge account against cgroup which has no 2624 * tasks. 2625 * So, rmdir()->pre_destroy() can be called while we do this charge. 2626 * In that case, we need to call pre_destroy() again. check it here. 2627 */ 2628 cgroup_release_and_wakeup_rmdir(&mem->css); 2629 } 2630 2631 /* 2632 * A call to try to shrink memory usage on charge failure at shmem's swapin. 2633 * Calling hierarchical_reclaim is not enough because we should update 2634 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 2635 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 2636 * not from the memcg which this page would be charged to. 2637 * try_charge_swapin does all of these works properly. 
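 * Calling it and then immediately cancelling the charge is therefore
 * enough here; the side effects (hierarchical reclaim, OOM bookkeeping)
 * are what we are after, not the charge itself.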
2638 */ 2639 int mem_cgroup_shmem_charge_fallback(struct page *page, 2640 struct mm_struct *mm, 2641 gfp_t gfp_mask) 2642 { 2643 struct mem_cgroup *mem = NULL; 2644 int ret; 2645 2646 if (mem_cgroup_disabled()) 2647 return 0; 2648 2649 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2650 if (!ret) 2651 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 2652 2653 return ret; 2654 } 2655 2656 static DEFINE_MUTEX(set_limit_mutex); 2657 2658 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 2659 unsigned long long val) 2660 { 2661 int retry_count; 2662 u64 memswlimit, memlimit; 2663 int ret = 0; 2664 int children = mem_cgroup_count_children(memcg); 2665 u64 curusage, oldusage; 2666 int enlarge; 2667 2668 /* 2669 * For keeping hierarchical_reclaim simple, how long we should retry 2670 * is depends on callers. We set our retry-count to be function 2671 * of # of children which we should visit in this loop. 2672 */ 2673 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 2674 2675 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2676 2677 enlarge = 0; 2678 while (retry_count) { 2679 if (signal_pending(current)) { 2680 ret = -EINTR; 2681 break; 2682 } 2683 /* 2684 * Rather than hide all in some function, I do this in 2685 * open coded manner. You see what this really does. 2686 * We have to guarantee mem->res.limit < mem->memsw.limit. 2687 */ 2688 mutex_lock(&set_limit_mutex); 2689 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2690 if (memswlimit < val) { 2691 ret = -EINVAL; 2692 mutex_unlock(&set_limit_mutex); 2693 break; 2694 } 2695 2696 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2697 if (memlimit < val) 2698 enlarge = 1; 2699 2700 ret = res_counter_set_limit(&memcg->res, val); 2701 if (!ret) { 2702 if (memswlimit == val) 2703 memcg->memsw_is_minimum = true; 2704 else 2705 memcg->memsw_is_minimum = false; 2706 } 2707 mutex_unlock(&set_limit_mutex); 2708 2709 if (!ret) 2710 break; 2711 2712 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2713 MEM_CGROUP_RECLAIM_SHRINK); 2714 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2715 /* Usage is reduced ? */ 2716 if (curusage >= oldusage) 2717 retry_count--; 2718 else 2719 oldusage = curusage; 2720 } 2721 if (!ret && enlarge) 2722 memcg_oom_recover(memcg); 2723 2724 return ret; 2725 } 2726 2727 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 2728 unsigned long long val) 2729 { 2730 int retry_count; 2731 u64 memlimit, memswlimit, oldusage, curusage; 2732 int children = mem_cgroup_count_children(memcg); 2733 int ret = -EBUSY; 2734 int enlarge = 0; 2735 2736 /* see mem_cgroup_resize_res_limit */ 2737 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 2738 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2739 while (retry_count) { 2740 if (signal_pending(current)) { 2741 ret = -EINTR; 2742 break; 2743 } 2744 /* 2745 * Rather than hide all in some function, I do this in 2746 * open coded manner. You see what this really does. 2747 * We have to guarantee mem->res.limit < mem->memsw.limit. 
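 * (Equal limits are accepted as well; in that case memsw_is_minimum is
 * set below so that reclaim does not bother trying to swap.)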
2748 */ 2749 mutex_lock(&set_limit_mutex); 2750 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2751 if (memlimit > val) { 2752 ret = -EINVAL; 2753 mutex_unlock(&set_limit_mutex); 2754 break; 2755 } 2756 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2757 if (memswlimit < val) 2758 enlarge = 1; 2759 ret = res_counter_set_limit(&memcg->memsw, val); 2760 if (!ret) { 2761 if (memlimit == val) 2762 memcg->memsw_is_minimum = true; 2763 else 2764 memcg->memsw_is_minimum = false; 2765 } 2766 mutex_unlock(&set_limit_mutex); 2767 2768 if (!ret) 2769 break; 2770 2771 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 2772 MEM_CGROUP_RECLAIM_NOSWAP | 2773 MEM_CGROUP_RECLAIM_SHRINK); 2774 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2775 /* Usage is reduced ? */ 2776 if (curusage >= oldusage) 2777 retry_count--; 2778 else 2779 oldusage = curusage; 2780 } 2781 if (!ret && enlarge) 2782 memcg_oom_recover(memcg); 2783 return ret; 2784 } 2785 2786 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2787 gfp_t gfp_mask, int nid, 2788 int zid) 2789 { 2790 unsigned long nr_reclaimed = 0; 2791 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2792 unsigned long reclaimed; 2793 int loop = 0; 2794 struct mem_cgroup_tree_per_zone *mctz; 2795 unsigned long long excess; 2796 2797 if (order > 0) 2798 return 0; 2799 2800 mctz = soft_limit_tree_node_zone(nid, zid); 2801 /* 2802 * This loop can run a while, specially if mem_cgroup's continuously 2803 * keep exceeding their soft limit and putting the system under 2804 * pressure 2805 */ 2806 do { 2807 if (next_mz) 2808 mz = next_mz; 2809 else 2810 mz = mem_cgroup_largest_soft_limit_node(mctz); 2811 if (!mz) 2812 break; 2813 2814 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 2815 gfp_mask, 2816 MEM_CGROUP_RECLAIM_SOFT); 2817 nr_reclaimed += reclaimed; 2818 spin_lock(&mctz->lock); 2819 2820 /* 2821 * If we failed to reclaim anything from this memory cgroup 2822 * it is time to move on to the next cgroup 2823 */ 2824 next_mz = NULL; 2825 if (!reclaimed) { 2826 do { 2827 /* 2828 * Loop until we find yet another one. 2829 * 2830 * By the time we get the soft_limit lock 2831 * again, someone might have aded the 2832 * group back on the RB tree. Iterate to 2833 * make sure we get a different mem. 2834 * mem_cgroup_largest_soft_limit_node returns 2835 * NULL if no other cgroup is present on 2836 * the tree 2837 */ 2838 next_mz = 2839 __mem_cgroup_largest_soft_limit_node(mctz); 2840 if (next_mz == mz) { 2841 css_put(&next_mz->mem->css); 2842 next_mz = NULL; 2843 } else /* next_mz == NULL or other memcg */ 2844 break; 2845 } while (1); 2846 } 2847 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2848 excess = res_counter_soft_limit_excess(&mz->mem->res); 2849 /* 2850 * One school of thought says that we should not add 2851 * back the node to the tree if reclaim returns 0. 2852 * But our reclaim could return 0, simply because due 2853 * to priority we are exposing a smaller subset of 2854 * memory to reclaim from. Consider this as a longer 2855 * term TODO. 2856 */ 2857 /* If excess == 0, no tree ops */ 2858 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 2859 spin_unlock(&mctz->lock); 2860 css_put(&mz->mem->css); 2861 loop++; 2862 /* 2863 * Could not reclaim anything and there are no more 2864 * mem cgroups to try or we seem to be looping without 2865 * reclaiming anything. 
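 * The MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS bound below keeps the
 * amount of work done per call finite even when the tree keeps
 * offering candidates.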
2866 */ 2867 if (!nr_reclaimed && 2868 (next_mz == NULL || 2869 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2870 break; 2871 } while (!nr_reclaimed); 2872 if (next_mz) 2873 css_put(&next_mz->mem->css); 2874 return nr_reclaimed; 2875 } 2876 2877 /* 2878 * This routine traverse page_cgroup in given list and drop them all. 2879 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2880 */ 2881 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2882 int node, int zid, enum lru_list lru) 2883 { 2884 struct zone *zone; 2885 struct mem_cgroup_per_zone *mz; 2886 struct page_cgroup *pc, *busy; 2887 unsigned long flags, loop; 2888 struct list_head *list; 2889 int ret = 0; 2890 2891 zone = &NODE_DATA(node)->node_zones[zid]; 2892 mz = mem_cgroup_zoneinfo(mem, node, zid); 2893 list = &mz->lists[lru]; 2894 2895 loop = MEM_CGROUP_ZSTAT(mz, lru); 2896 /* give some margin against EBUSY etc...*/ 2897 loop += 256; 2898 busy = NULL; 2899 while (loop--) { 2900 ret = 0; 2901 spin_lock_irqsave(&zone->lru_lock, flags); 2902 if (list_empty(list)) { 2903 spin_unlock_irqrestore(&zone->lru_lock, flags); 2904 break; 2905 } 2906 pc = list_entry(list->prev, struct page_cgroup, lru); 2907 if (busy == pc) { 2908 list_move(&pc->lru, list); 2909 busy = NULL; 2910 spin_unlock_irqrestore(&zone->lru_lock, flags); 2911 continue; 2912 } 2913 spin_unlock_irqrestore(&zone->lru_lock, flags); 2914 2915 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2916 if (ret == -ENOMEM) 2917 break; 2918 2919 if (ret == -EBUSY || ret == -EINVAL) { 2920 /* found lock contention or "pc" is obsolete. */ 2921 busy = pc; 2922 cond_resched(); 2923 } else 2924 busy = NULL; 2925 } 2926 2927 if (!ret && !list_empty(list)) 2928 return -EBUSY; 2929 return ret; 2930 } 2931 2932 /* 2933 * make mem_cgroup's charge to be 0 if there is no task. 2934 * This enables deleting this mem_cgroup. 2935 */ 2936 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2937 { 2938 int ret; 2939 int node, zid, shrink; 2940 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2941 struct cgroup *cgrp = mem->css.cgroup; 2942 2943 css_get(&mem->css); 2944 2945 shrink = 0; 2946 /* should free all ? */ 2947 if (free_all) 2948 goto try_to_free; 2949 move_account: 2950 do { 2951 ret = -EBUSY; 2952 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2953 goto out; 2954 ret = -EINTR; 2955 if (signal_pending(current)) 2956 goto out; 2957 /* This is for making all *used* pages to be on LRU. */ 2958 lru_add_drain_all(); 2959 drain_all_stock_sync(); 2960 ret = 0; 2961 for_each_node_state(node, N_HIGH_MEMORY) { 2962 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2963 enum lru_list l; 2964 for_each_lru(l) { 2965 ret = mem_cgroup_force_empty_list(mem, 2966 node, zid, l); 2967 if (ret) 2968 break; 2969 } 2970 } 2971 if (ret) 2972 break; 2973 } 2974 memcg_oom_recover(mem); 2975 /* it seems parent cgroup doesn't have enough mem */ 2976 if (ret == -ENOMEM) 2977 goto try_to_free; 2978 cond_resched(); 2979 /* "ret" should also be checked to ensure all lists are empty. */ 2980 } while (mem->res.usage > 0 || ret); 2981 out: 2982 css_put(&mem->css); 2983 return ret; 2984 2985 try_to_free: 2986 /* returns EBUSY if there is a task or if we come here twice. 
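 * ("twice" is tracked by the shrink flag: after one try-to-free pass we
 * jump back to move_account, and a second arrival here gives up.)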
*/ 2987 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 2988 ret = -EBUSY; 2989 goto out; 2990 } 2991 /* we call try-to-free pages for make this cgroup empty */ 2992 lru_add_drain_all(); 2993 /* try to free all pages in this cgroup */ 2994 shrink = 1; 2995 while (nr_retries && mem->res.usage > 0) { 2996 int progress; 2997 2998 if (signal_pending(current)) { 2999 ret = -EINTR; 3000 goto out; 3001 } 3002 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3003 false, get_swappiness(mem)); 3004 if (!progress) { 3005 nr_retries--; 3006 /* maybe some writeback is necessary */ 3007 congestion_wait(BLK_RW_ASYNC, HZ/10); 3008 } 3009 3010 } 3011 lru_add_drain(); 3012 /* try move_account...there may be some *locked* pages. */ 3013 goto move_account; 3014 } 3015 3016 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3017 { 3018 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3019 } 3020 3021 3022 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3023 { 3024 return mem_cgroup_from_cont(cont)->use_hierarchy; 3025 } 3026 3027 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3028 u64 val) 3029 { 3030 int retval = 0; 3031 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3032 struct cgroup *parent = cont->parent; 3033 struct mem_cgroup *parent_mem = NULL; 3034 3035 if (parent) 3036 parent_mem = mem_cgroup_from_cont(parent); 3037 3038 cgroup_lock(); 3039 /* 3040 * If parent's use_hierarchy is set, we can't make any modifications 3041 * in the child subtrees. If it is unset, then the change can 3042 * occur, provided the current cgroup has no children. 3043 * 3044 * For the root cgroup, parent_mem is NULL, we allow value to be 3045 * set if there are no children. 
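 * Usage sketch: 'echo 1 > memory.use_hierarchy' is only accepted while
 * this cgroup has no children; cgroups created under it afterwards
 * inherit the setting (see mem_cgroup_create()).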
3046 */ 3047 if ((!parent_mem || !parent_mem->use_hierarchy) && 3048 (val == 1 || val == 0)) { 3049 if (list_empty(&cont->children)) 3050 mem->use_hierarchy = val; 3051 else 3052 retval = -EBUSY; 3053 } else 3054 retval = -EINVAL; 3055 cgroup_unlock(); 3056 3057 return retval; 3058 } 3059 3060 struct mem_cgroup_idx_data { 3061 s64 val; 3062 enum mem_cgroup_stat_index idx; 3063 }; 3064 3065 static int 3066 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3067 { 3068 struct mem_cgroup_idx_data *d = data; 3069 d->val += mem_cgroup_read_stat(mem, d->idx); 3070 return 0; 3071 } 3072 3073 static void 3074 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3075 enum mem_cgroup_stat_index idx, s64 *val) 3076 { 3077 struct mem_cgroup_idx_data d; 3078 d.idx = idx; 3079 d.val = 0; 3080 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 3081 *val = d.val; 3082 } 3083 3084 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3085 { 3086 u64 idx_val, val; 3087 3088 if (!mem_cgroup_is_root(mem)) { 3089 if (!swap) 3090 return res_counter_read_u64(&mem->res, RES_USAGE); 3091 else 3092 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3093 } 3094 3095 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3096 val = idx_val; 3097 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); 3098 val += idx_val; 3099 3100 if (swap) { 3101 mem_cgroup_get_recursive_idx_stat(mem, 3102 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3103 val += idx_val; 3104 } 3105 3106 return val << PAGE_SHIFT; 3107 } 3108 3109 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3110 { 3111 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3112 u64 val; 3113 int type, name; 3114 3115 type = MEMFILE_TYPE(cft->private); 3116 name = MEMFILE_ATTR(cft->private); 3117 switch (type) { 3118 case _MEM: 3119 if (name == RES_USAGE) 3120 val = mem_cgroup_usage(mem, false); 3121 else 3122 val = res_counter_read_u64(&mem->res, name); 3123 break; 3124 case _MEMSWAP: 3125 if (name == RES_USAGE) 3126 val = mem_cgroup_usage(mem, true); 3127 else 3128 val = res_counter_read_u64(&mem->memsw, name); 3129 break; 3130 default: 3131 BUG(); 3132 break; 3133 } 3134 return val; 3135 } 3136 /* 3137 * The user of this function is... 3138 * RES_LIMIT. 3139 */ 3140 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3141 const char *buffer) 3142 { 3143 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3144 int type, name; 3145 unsigned long long val; 3146 int ret; 3147 3148 type = MEMFILE_TYPE(cft->private); 3149 name = MEMFILE_ATTR(cft->private); 3150 switch (name) { 3151 case RES_LIMIT: 3152 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3153 ret = -EINVAL; 3154 break; 3155 } 3156 /* This function does all necessary parse...reuse it */ 3157 ret = res_counter_memparse_write_strategy(buffer, &val); 3158 if (ret) 3159 break; 3160 if (type == _MEM) 3161 ret = mem_cgroup_resize_limit(memcg, val); 3162 else 3163 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3164 break; 3165 case RES_SOFT_LIMIT: 3166 ret = res_counter_memparse_write_strategy(buffer, &val); 3167 if (ret) 3168 break; 3169 /* 3170 * For memsw, soft limits are hard to implement in terms 3171 * of semantics, for now, we support soft limits for 3172 * control without swap 3173 */ 3174 if (type == _MEM) 3175 ret = res_counter_set_soft_limit(&memcg->res, val); 3176 else 3177 ret = -EINVAL; 3178 break; 3179 default: 3180 ret = -EINVAL; /* should be BUG() ? 
*/ 3181 break; 3182 } 3183 return ret; 3184 } 3185 3186 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3187 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3188 { 3189 struct cgroup *cgroup; 3190 unsigned long long min_limit, min_memsw_limit, tmp; 3191 3192 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3193 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3194 cgroup = memcg->css.cgroup; 3195 if (!memcg->use_hierarchy) 3196 goto out; 3197 3198 while (cgroup->parent) { 3199 cgroup = cgroup->parent; 3200 memcg = mem_cgroup_from_cont(cgroup); 3201 if (!memcg->use_hierarchy) 3202 break; 3203 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3204 min_limit = min(min_limit, tmp); 3205 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3206 min_memsw_limit = min(min_memsw_limit, tmp); 3207 } 3208 out: 3209 *mem_limit = min_limit; 3210 *memsw_limit = min_memsw_limit; 3211 return; 3212 } 3213 3214 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3215 { 3216 struct mem_cgroup *mem; 3217 int type, name; 3218 3219 mem = mem_cgroup_from_cont(cont); 3220 type = MEMFILE_TYPE(event); 3221 name = MEMFILE_ATTR(event); 3222 switch (name) { 3223 case RES_MAX_USAGE: 3224 if (type == _MEM) 3225 res_counter_reset_max(&mem->res); 3226 else 3227 res_counter_reset_max(&mem->memsw); 3228 break; 3229 case RES_FAILCNT: 3230 if (type == _MEM) 3231 res_counter_reset_failcnt(&mem->res); 3232 else 3233 res_counter_reset_failcnt(&mem->memsw); 3234 break; 3235 } 3236 3237 return 0; 3238 } 3239 3240 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3241 struct cftype *cft) 3242 { 3243 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3244 } 3245 3246 #ifdef CONFIG_MMU 3247 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3248 struct cftype *cft, u64 val) 3249 { 3250 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3251 3252 if (val >= (1 << NR_MOVE_TYPE)) 3253 return -EINVAL; 3254 /* 3255 * We check this value several times in both in can_attach() and 3256 * attach(), so we need cgroup lock to prevent this value from being 3257 * inconsistent. 
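 * Each bit in val selects one class of pages to move when a task
 * migrates into this cgroup; values with bits at or above NR_MOVE_TYPE
 * are rejected above.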
3258 */ 3259 cgroup_lock(); 3260 mem->move_charge_at_immigrate = val; 3261 cgroup_unlock(); 3262 3263 return 0; 3264 } 3265 #else 3266 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3267 struct cftype *cft, u64 val) 3268 { 3269 return -ENOSYS; 3270 } 3271 #endif 3272 3273 3274 /* For read statistics */ 3275 enum { 3276 MCS_CACHE, 3277 MCS_RSS, 3278 MCS_FILE_MAPPED, 3279 MCS_PGPGIN, 3280 MCS_PGPGOUT, 3281 MCS_SWAP, 3282 MCS_INACTIVE_ANON, 3283 MCS_ACTIVE_ANON, 3284 MCS_INACTIVE_FILE, 3285 MCS_ACTIVE_FILE, 3286 MCS_UNEVICTABLE, 3287 NR_MCS_STAT, 3288 }; 3289 3290 struct mcs_total_stat { 3291 s64 stat[NR_MCS_STAT]; 3292 }; 3293 3294 struct { 3295 char *local_name; 3296 char *total_name; 3297 } memcg_stat_strings[NR_MCS_STAT] = { 3298 {"cache", "total_cache"}, 3299 {"rss", "total_rss"}, 3300 {"mapped_file", "total_mapped_file"}, 3301 {"pgpgin", "total_pgpgin"}, 3302 {"pgpgout", "total_pgpgout"}, 3303 {"swap", "total_swap"}, 3304 {"inactive_anon", "total_inactive_anon"}, 3305 {"active_anon", "total_active_anon"}, 3306 {"inactive_file", "total_inactive_file"}, 3307 {"active_file", "total_active_file"}, 3308 {"unevictable", "total_unevictable"} 3309 }; 3310 3311 3312 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3313 { 3314 struct mcs_total_stat *s = data; 3315 s64 val; 3316 3317 /* per cpu stat */ 3318 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3319 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3320 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3321 s->stat[MCS_RSS] += val * PAGE_SIZE; 3322 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3323 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3324 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3325 s->stat[MCS_PGPGIN] += val; 3326 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3327 s->stat[MCS_PGPGOUT] += val; 3328 if (do_swap_account) { 3329 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3330 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3331 } 3332 3333 /* per zone stat */ 3334 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3335 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3336 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3337 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3338 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3339 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3340 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3341 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3342 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3343 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3344 return 0; 3345 } 3346 3347 static void 3348 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3349 { 3350 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3351 } 3352 3353 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3354 struct cgroup_map_cb *cb) 3355 { 3356 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3357 struct mcs_total_stat mystat; 3358 int i; 3359 3360 memset(&mystat, 0, sizeof(mystat)); 3361 mem_cgroup_get_local_stat(mem_cont, &mystat); 3362 3363 for (i = 0; i < NR_MCS_STAT; i++) { 3364 if (i == MCS_SWAP && !do_swap_account) 3365 continue; 3366 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3367 } 3368 3369 /* Hierarchical information */ 3370 { 3371 unsigned long long limit, memsw_limit; 3372 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3373 cb->fill(cb, "hierarchical_memory_limit", limit); 
3374 if (do_swap_account) 3375 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3376 } 3377 3378 memset(&mystat, 0, sizeof(mystat)); 3379 mem_cgroup_get_total_stat(mem_cont, &mystat); 3380 for (i = 0; i < NR_MCS_STAT; i++) { 3381 if (i == MCS_SWAP && !do_swap_account) 3382 continue; 3383 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3384 } 3385 3386 #ifdef CONFIG_DEBUG_VM 3387 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3388 3389 { 3390 int nid, zid; 3391 struct mem_cgroup_per_zone *mz; 3392 unsigned long recent_rotated[2] = {0, 0}; 3393 unsigned long recent_scanned[2] = {0, 0}; 3394 3395 for_each_online_node(nid) 3396 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3397 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3398 3399 recent_rotated[0] += 3400 mz->reclaim_stat.recent_rotated[0]; 3401 recent_rotated[1] += 3402 mz->reclaim_stat.recent_rotated[1]; 3403 recent_scanned[0] += 3404 mz->reclaim_stat.recent_scanned[0]; 3405 recent_scanned[1] += 3406 mz->reclaim_stat.recent_scanned[1]; 3407 } 3408 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3409 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3410 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3411 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3412 } 3413 #endif 3414 3415 return 0; 3416 } 3417 3418 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3419 { 3420 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3421 3422 return get_swappiness(memcg); 3423 } 3424 3425 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3426 u64 val) 3427 { 3428 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3429 struct mem_cgroup *parent; 3430 3431 if (val > 100) 3432 return -EINVAL; 3433 3434 if (cgrp->parent == NULL) 3435 return -EINVAL; 3436 3437 parent = mem_cgroup_from_cont(cgrp->parent); 3438 3439 cgroup_lock(); 3440 3441 /* If under hierarchy, only empty-root can set this value */ 3442 if ((parent->use_hierarchy) || 3443 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3444 cgroup_unlock(); 3445 return -EINVAL; 3446 } 3447 3448 spin_lock(&memcg->reclaim_param_lock); 3449 memcg->swappiness = val; 3450 spin_unlock(&memcg->reclaim_param_lock); 3451 3452 cgroup_unlock(); 3453 3454 return 0; 3455 } 3456 3457 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3458 { 3459 struct mem_cgroup_threshold_ary *t; 3460 u64 usage; 3461 int i; 3462 3463 rcu_read_lock(); 3464 if (!swap) 3465 t = rcu_dereference(memcg->thresholds.primary); 3466 else 3467 t = rcu_dereference(memcg->memsw_thresholds.primary); 3468 3469 if (!t) 3470 goto unlock; 3471 3472 usage = mem_cgroup_usage(memcg, swap); 3473 3474 /* 3475 * current_threshold points to threshold just below usage. 3476 * If it's not true, a threshold was crossed after last 3477 * call of __mem_cgroup_threshold(). 3478 */ 3479 i = t->current_threshold; 3480 3481 /* 3482 * Iterate backward over array of thresholds starting from 3483 * current_threshold and check if a threshold is crossed. 3484 * If none of thresholds below usage is crossed, we read 3485 * only one element of the array here. 3486 */ 3487 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3488 eventfd_signal(t->entries[i].eventfd, 1); 3489 3490 /* i = current_threshold + 1 */ 3491 i++; 3492 3493 /* 3494 * Iterate forward over array of thresholds starting from 3495 * current_threshold+1 and check if a threshold is crossed. 
3496 * If none of thresholds above usage is crossed, we read 3497 * only one element of the array here. 3498 */ 3499 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3500 eventfd_signal(t->entries[i].eventfd, 1); 3501 3502 /* Update current_threshold */ 3503 t->current_threshold = i - 1; 3504 unlock: 3505 rcu_read_unlock(); 3506 } 3507 3508 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3509 { 3510 __mem_cgroup_threshold(memcg, false); 3511 if (do_swap_account) 3512 __mem_cgroup_threshold(memcg, true); 3513 } 3514 3515 static int compare_thresholds(const void *a, const void *b) 3516 { 3517 const struct mem_cgroup_threshold *_a = a; 3518 const struct mem_cgroup_threshold *_b = b; 3519 3520 return _a->threshold - _b->threshold; 3521 } 3522 3523 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3524 { 3525 struct mem_cgroup_eventfd_list *ev; 3526 3527 list_for_each_entry(ev, &mem->oom_notify, list) 3528 eventfd_signal(ev->eventfd, 1); 3529 return 0; 3530 } 3531 3532 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3533 { 3534 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3535 } 3536 3537 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3538 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3539 { 3540 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3541 struct mem_cgroup_thresholds *thresholds; 3542 struct mem_cgroup_threshold_ary *new; 3543 int type = MEMFILE_TYPE(cft->private); 3544 u64 threshold, usage; 3545 int i, size, ret; 3546 3547 ret = res_counter_memparse_write_strategy(args, &threshold); 3548 if (ret) 3549 return ret; 3550 3551 mutex_lock(&memcg->thresholds_lock); 3552 3553 if (type == _MEM) 3554 thresholds = &memcg->thresholds; 3555 else if (type == _MEMSWAP) 3556 thresholds = &memcg->memsw_thresholds; 3557 else 3558 BUG(); 3559 3560 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3561 3562 /* Check if a threshold crossed before adding a new one */ 3563 if (thresholds->primary) 3564 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3565 3566 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3567 3568 /* Allocate memory for new array of thresholds */ 3569 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3570 GFP_KERNEL); 3571 if (!new) { 3572 ret = -ENOMEM; 3573 goto unlock; 3574 } 3575 new->size = size; 3576 3577 /* Copy thresholds (if any) to new array */ 3578 if (thresholds->primary) { 3579 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3580 sizeof(struct mem_cgroup_threshold)); 3581 } 3582 3583 /* Add new threshold */ 3584 new->entries[size - 1].eventfd = eventfd; 3585 new->entries[size - 1].threshold = threshold; 3586 3587 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3588 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3589 compare_thresholds, NULL); 3590 3591 /* Find current threshold */ 3592 new->current_threshold = -1; 3593 for (i = 0; i < size; i++) { 3594 if (new->entries[i].threshold < usage) { 3595 /* 3596 * new->current_threshold will not be used until 3597 * rcu_assign_pointer(), so it's safe to increment 3598 * it here. 
3599 */ 3600 ++new->current_threshold; 3601 } 3602 } 3603 3604 /* Free old spare buffer and save old primary buffer as spare */ 3605 kfree(thresholds->spare); 3606 thresholds->spare = thresholds->primary; 3607 3608 rcu_assign_pointer(thresholds->primary, new); 3609 3610 /* To be sure that nobody uses thresholds */ 3611 synchronize_rcu(); 3612 3613 unlock: 3614 mutex_unlock(&memcg->thresholds_lock); 3615 3616 return ret; 3617 } 3618 3619 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 3620 struct cftype *cft, struct eventfd_ctx *eventfd) 3621 { 3622 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3623 struct mem_cgroup_thresholds *thresholds; 3624 struct mem_cgroup_threshold_ary *new; 3625 int type = MEMFILE_TYPE(cft->private); 3626 u64 usage; 3627 int i, j, size; 3628 3629 mutex_lock(&memcg->thresholds_lock); 3630 if (type == _MEM) 3631 thresholds = &memcg->thresholds; 3632 else if (type == _MEMSWAP) 3633 thresholds = &memcg->memsw_thresholds; 3634 else 3635 BUG(); 3636 3637 /* 3638 * Something went wrong if we trying to unregister a threshold 3639 * if we don't have thresholds 3640 */ 3641 BUG_ON(!thresholds); 3642 3643 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 3644 3645 /* Check if a threshold crossed before removing */ 3646 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3647 3648 /* Calculate new number of threshold */ 3649 size = 0; 3650 for (i = 0; i < thresholds->primary->size; i++) { 3651 if (thresholds->primary->entries[i].eventfd != eventfd) 3652 size++; 3653 } 3654 3655 new = thresholds->spare; 3656 3657 /* Set thresholds array to NULL if we don't have thresholds */ 3658 if (!size) { 3659 kfree(new); 3660 new = NULL; 3661 goto swap_buffers; 3662 } 3663 3664 new->size = size; 3665 3666 /* Copy thresholds and find current threshold */ 3667 new->current_threshold = -1; 3668 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3669 if (thresholds->primary->entries[i].eventfd == eventfd) 3670 continue; 3671 3672 new->entries[j] = thresholds->primary->entries[i]; 3673 if (new->entries[j].threshold < usage) { 3674 /* 3675 * new->current_threshold will not be used 3676 * until rcu_assign_pointer(), so it's safe to increment 3677 * it here. 3678 */ 3679 ++new->current_threshold; 3680 } 3681 j++; 3682 } 3683 3684 swap_buffers: 3685 /* Swap primary and spare array */ 3686 thresholds->spare = thresholds->primary; 3687 rcu_assign_pointer(thresholds->primary, new); 3688 3689 /* To be sure that nobody uses thresholds */ 3690 synchronize_rcu(); 3691 3692 mutex_unlock(&memcg->thresholds_lock); 3693 } 3694 3695 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 3696 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 3697 { 3698 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3699 struct mem_cgroup_eventfd_list *event; 3700 int type = MEMFILE_TYPE(cft->private); 3701 3702 BUG_ON(type != _OOM_TYPE); 3703 event = kmalloc(sizeof(*event), GFP_KERNEL); 3704 if (!event) 3705 return -ENOMEM; 3706 3707 mutex_lock(&memcg_oom_mutex); 3708 3709 event->eventfd = eventfd; 3710 list_add(&event->list, &memcg->oom_notify); 3711 3712 /* already in OOM ? 
*/ 3713 if (atomic_read(&memcg->oom_lock)) 3714 eventfd_signal(eventfd, 1); 3715 mutex_unlock(&memcg_oom_mutex); 3716 3717 return 0; 3718 } 3719 3720 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 3721 struct cftype *cft, struct eventfd_ctx *eventfd) 3722 { 3723 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3724 struct mem_cgroup_eventfd_list *ev, *tmp; 3725 int type = MEMFILE_TYPE(cft->private); 3726 3727 BUG_ON(type != _OOM_TYPE); 3728 3729 mutex_lock(&memcg_oom_mutex); 3730 3731 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 3732 if (ev->eventfd == eventfd) { 3733 list_del(&ev->list); 3734 kfree(ev); 3735 } 3736 } 3737 3738 mutex_unlock(&memcg_oom_mutex); 3739 } 3740 3741 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 3742 struct cftype *cft, struct cgroup_map_cb *cb) 3743 { 3744 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3745 3746 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 3747 3748 if (atomic_read(&mem->oom_lock)) 3749 cb->fill(cb, "under_oom", 1); 3750 else 3751 cb->fill(cb, "under_oom", 0); 3752 return 0; 3753 } 3754 3755 /* 3756 */ 3757 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3758 struct cftype *cft, u64 val) 3759 { 3760 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3761 struct mem_cgroup *parent; 3762 3763 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3764 if (!cgrp->parent || !((val == 0) || (val == 1))) 3765 return -EINVAL; 3766 3767 parent = mem_cgroup_from_cont(cgrp->parent); 3768 3769 cgroup_lock(); 3770 /* oom-kill-disable is a flag for subhierarchy. */ 3771 if ((parent->use_hierarchy) || 3772 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 3773 cgroup_unlock(); 3774 return -EINVAL; 3775 } 3776 mem->oom_kill_disable = val; 3777 if (!val) 3778 memcg_oom_recover(mem); 3779 cgroup_unlock(); 3780 return 0; 3781 } 3782 3783 static struct cftype mem_cgroup_files[] = { 3784 { 3785 .name = "usage_in_bytes", 3786 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3787 .read_u64 = mem_cgroup_read, 3788 .register_event = mem_cgroup_usage_register_event, 3789 .unregister_event = mem_cgroup_usage_unregister_event, 3790 }, 3791 { 3792 .name = "max_usage_in_bytes", 3793 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 3794 .trigger = mem_cgroup_reset, 3795 .read_u64 = mem_cgroup_read, 3796 }, 3797 { 3798 .name = "limit_in_bytes", 3799 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 3800 .write_string = mem_cgroup_write, 3801 .read_u64 = mem_cgroup_read, 3802 }, 3803 { 3804 .name = "soft_limit_in_bytes", 3805 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 3806 .write_string = mem_cgroup_write, 3807 .read_u64 = mem_cgroup_read, 3808 }, 3809 { 3810 .name = "failcnt", 3811 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 3812 .trigger = mem_cgroup_reset, 3813 .read_u64 = mem_cgroup_read, 3814 }, 3815 { 3816 .name = "stat", 3817 .read_map = mem_control_stat_show, 3818 }, 3819 { 3820 .name = "force_empty", 3821 .trigger = mem_cgroup_force_empty_write, 3822 }, 3823 { 3824 .name = "use_hierarchy", 3825 .write_u64 = mem_cgroup_hierarchy_write, 3826 .read_u64 = mem_cgroup_hierarchy_read, 3827 }, 3828 { 3829 .name = "swappiness", 3830 .read_u64 = mem_cgroup_swappiness_read, 3831 .write_u64 = mem_cgroup_swappiness_write, 3832 }, 3833 { 3834 .name = "move_charge_at_immigrate", 3835 .read_u64 = mem_cgroup_move_charge_read, 3836 .write_u64 = mem_cgroup_move_charge_write, 3837 }, 3838 { 3839 .name = "oom_control", 3840 .read_map = mem_cgroup_oom_control_read, 3841 .write_u64 = 
mem_cgroup_oom_control_write, 3842 .register_event = mem_cgroup_oom_register_event, 3843 .unregister_event = mem_cgroup_oom_unregister_event, 3844 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 3845 }, 3846 }; 3847 3848 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3849 static struct cftype memsw_cgroup_files[] = { 3850 { 3851 .name = "memsw.usage_in_bytes", 3852 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3853 .read_u64 = mem_cgroup_read, 3854 .register_event = mem_cgroup_usage_register_event, 3855 .unregister_event = mem_cgroup_usage_unregister_event, 3856 }, 3857 { 3858 .name = "memsw.max_usage_in_bytes", 3859 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 3860 .trigger = mem_cgroup_reset, 3861 .read_u64 = mem_cgroup_read, 3862 }, 3863 { 3864 .name = "memsw.limit_in_bytes", 3865 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 3866 .write_string = mem_cgroup_write, 3867 .read_u64 = mem_cgroup_read, 3868 }, 3869 { 3870 .name = "memsw.failcnt", 3871 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 3872 .trigger = mem_cgroup_reset, 3873 .read_u64 = mem_cgroup_read, 3874 }, 3875 }; 3876 3877 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3878 { 3879 if (!do_swap_account) 3880 return 0; 3881 return cgroup_add_files(cont, ss, memsw_cgroup_files, 3882 ARRAY_SIZE(memsw_cgroup_files)); 3883 }; 3884 #else 3885 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 3886 { 3887 return 0; 3888 } 3889 #endif 3890 3891 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3892 { 3893 struct mem_cgroup_per_node *pn; 3894 struct mem_cgroup_per_zone *mz; 3895 enum lru_list l; 3896 int zone, tmp = node; 3897 /* 3898 * This routine is called against possible nodes. 3899 * But it's BUG to call kmalloc() against offline node. 3900 * 3901 * TODO: this routine can waste much memory for nodes which will 3902 * never be onlined. It's better to use memory hotplug callback 3903 * function. 3904 */ 3905 if (!node_state(node, N_NORMAL_MEMORY)) 3906 tmp = -1; 3907 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 3908 if (!pn) 3909 return 1; 3910 3911 mem->info.nodeinfo[node] = pn; 3912 memset(pn, 0, sizeof(*pn)); 3913 3914 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3915 mz = &pn->zoneinfo[zone]; 3916 for_each_lru(l) 3917 INIT_LIST_HEAD(&mz->lists[l]); 3918 mz->usage_in_excess = 0; 3919 mz->on_tree = false; 3920 mz->mem = mem; 3921 } 3922 return 0; 3923 } 3924 3925 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 3926 { 3927 kfree(mem->info.nodeinfo[node]); 3928 } 3929 3930 static struct mem_cgroup *mem_cgroup_alloc(void) 3931 { 3932 struct mem_cgroup *mem; 3933 int size = sizeof(struct mem_cgroup); 3934 3935 /* Can be very big if MAX_NUMNODES is very big */ 3936 if (size < PAGE_SIZE) 3937 mem = kmalloc(size, GFP_KERNEL); 3938 else 3939 mem = vmalloc(size); 3940 3941 if (!mem) 3942 return NULL; 3943 3944 memset(mem, 0, size); 3945 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3946 if (!mem->stat) { 3947 if (size < PAGE_SIZE) 3948 kfree(mem); 3949 else 3950 vfree(mem); 3951 mem = NULL; 3952 } 3953 return mem; 3954 } 3955 3956 /* 3957 * At destroying mem_cgroup, references from swap_cgroup can remain. 3958 * (scanning all at force_empty is too costly...) 3959 * 3960 * Instead of clearing all references at force_empty, we remember 3961 * the number of reference from swap_cgroup and free mem_cgroup when 3962 * it goes down to 0. 
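 * (mem_cgroup_get() and mem_cgroup_put() below implement that reference
 * count; it is separate from the css reference counting.)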
3963 * 3964 * Removal of cgroup itself succeeds regardless of refs from swap. 3965 */ 3966 3967 static void __mem_cgroup_free(struct mem_cgroup *mem) 3968 { 3969 int node; 3970 3971 mem_cgroup_remove_from_trees(mem); 3972 free_css_id(&mem_cgroup_subsys, &mem->css); 3973 3974 for_each_node_state(node, N_POSSIBLE) 3975 free_mem_cgroup_per_zone_info(mem, node); 3976 3977 free_percpu(mem->stat); 3978 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 3979 kfree(mem); 3980 else 3981 vfree(mem); 3982 } 3983 3984 static void mem_cgroup_get(struct mem_cgroup *mem) 3985 { 3986 atomic_inc(&mem->refcnt); 3987 } 3988 3989 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 3990 { 3991 if (atomic_sub_and_test(count, &mem->refcnt)) { 3992 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3993 __mem_cgroup_free(mem); 3994 if (parent) 3995 mem_cgroup_put(parent); 3996 } 3997 } 3998 3999 static void mem_cgroup_put(struct mem_cgroup *mem) 4000 { 4001 __mem_cgroup_put(mem, 1); 4002 } 4003 4004 /* 4005 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4006 */ 4007 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4008 { 4009 if (!mem->res.parent) 4010 return NULL; 4011 return mem_cgroup_from_res_counter(mem->res.parent, res); 4012 } 4013 4014 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4015 static void __init enable_swap_cgroup(void) 4016 { 4017 if (!mem_cgroup_disabled() && really_do_swap_account) 4018 do_swap_account = 1; 4019 } 4020 #else 4021 static void __init enable_swap_cgroup(void) 4022 { 4023 } 4024 #endif 4025 4026 static int mem_cgroup_soft_limit_tree_init(void) 4027 { 4028 struct mem_cgroup_tree_per_node *rtpn; 4029 struct mem_cgroup_tree_per_zone *rtpz; 4030 int tmp, node, zone; 4031 4032 for_each_node_state(node, N_POSSIBLE) { 4033 tmp = node; 4034 if (!node_state(node, N_NORMAL_MEMORY)) 4035 tmp = -1; 4036 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4037 if (!rtpn) 4038 return 1; 4039 4040 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4041 4042 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4043 rtpz = &rtpn->rb_tree_per_zone[zone]; 4044 rtpz->rb_root = RB_ROOT; 4045 spin_lock_init(&rtpz->lock); 4046 } 4047 } 4048 return 0; 4049 } 4050 4051 static struct cgroup_subsys_state * __ref 4052 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4053 { 4054 struct mem_cgroup *mem, *parent; 4055 long error = -ENOMEM; 4056 int node; 4057 4058 mem = mem_cgroup_alloc(); 4059 if (!mem) 4060 return ERR_PTR(error); 4061 4062 for_each_node_state(node, N_POSSIBLE) 4063 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4064 goto free_out; 4065 4066 /* root ? */ 4067 if (cont->parent == NULL) { 4068 int cpu; 4069 enable_swap_cgroup(); 4070 parent = NULL; 4071 root_mem_cgroup = mem; 4072 if (mem_cgroup_soft_limit_tree_init()) 4073 goto free_out; 4074 for_each_possible_cpu(cpu) { 4075 struct memcg_stock_pcp *stock = 4076 &per_cpu(memcg_stock, cpu); 4077 INIT_WORK(&stock->work, drain_local_stock); 4078 } 4079 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4080 } else { 4081 parent = mem_cgroup_from_cont(cont->parent); 4082 mem->use_hierarchy = parent->use_hierarchy; 4083 mem->oom_kill_disable = parent->oom_kill_disable; 4084 } 4085 4086 if (parent && parent->use_hierarchy) { 4087 res_counter_init(&mem->res, &parent->res); 4088 res_counter_init(&mem->memsw, &parent->memsw); 4089 /* 4090 * We increment refcnt of the parent to ensure that we can 4091 * safely access it on res_counter_charge/uncharge. 
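 * Without the extra reference the parent could be freed first, leaving
 * this child's res_counter chained to freed memory.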
4092 * This refcnt will be decremented when freeing this 4093 * mem_cgroup(see mem_cgroup_put). 4094 */ 4095 mem_cgroup_get(parent); 4096 } else { 4097 res_counter_init(&mem->res, NULL); 4098 res_counter_init(&mem->memsw, NULL); 4099 } 4100 mem->last_scanned_child = 0; 4101 spin_lock_init(&mem->reclaim_param_lock); 4102 INIT_LIST_HEAD(&mem->oom_notify); 4103 4104 if (parent) 4105 mem->swappiness = get_swappiness(parent); 4106 atomic_set(&mem->refcnt, 1); 4107 mem->move_charge_at_immigrate = 0; 4108 mutex_init(&mem->thresholds_lock); 4109 return &mem->css; 4110 free_out: 4111 __mem_cgroup_free(mem); 4112 root_mem_cgroup = NULL; 4113 return ERR_PTR(error); 4114 } 4115 4116 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4117 struct cgroup *cont) 4118 { 4119 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4120 4121 return mem_cgroup_force_empty(mem, false); 4122 } 4123 4124 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4125 struct cgroup *cont) 4126 { 4127 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4128 4129 mem_cgroup_put(mem); 4130 } 4131 4132 static int mem_cgroup_populate(struct cgroup_subsys *ss, 4133 struct cgroup *cont) 4134 { 4135 int ret; 4136 4137 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4138 ARRAY_SIZE(mem_cgroup_files)); 4139 4140 if (!ret) 4141 ret = register_memsw_files(cont, ss); 4142 return ret; 4143 } 4144 4145 #ifdef CONFIG_MMU 4146 /* Handlers for move charge at task migration. */ 4147 #define PRECHARGE_COUNT_AT_ONCE 256 4148 static int mem_cgroup_do_precharge(unsigned long count) 4149 { 4150 int ret = 0; 4151 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4152 struct mem_cgroup *mem = mc.to; 4153 4154 if (mem_cgroup_is_root(mem)) { 4155 mc.precharge += count; 4156 /* we don't need css_get for root */ 4157 return ret; 4158 } 4159 /* try to charge at once */ 4160 if (count > 1) { 4161 struct res_counter *dummy; 4162 /* 4163 * "mem" cannot be under rmdir() because we've already checked 4164 * by cgroup_lock_live_cgroup() that it is not removed and we 4165 * are still under the same cgroup_mutex. So we can postpone 4166 * css_get(). 4167 */ 4168 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4169 goto one_by_one; 4170 if (do_swap_account && res_counter_charge(&mem->memsw, 4171 PAGE_SIZE * count, &dummy)) { 4172 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 4173 goto one_by_one; 4174 } 4175 mc.precharge += count; 4176 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); 4177 WARN_ON_ONCE(count > INT_MAX); 4178 __css_get(&mem->css, (int)count); 4179 return ret; 4180 } 4181 one_by_one: 4182 /* fall back to one by one charge */ 4183 while (count--) { 4184 if (signal_pending(current)) { 4185 ret = -EINTR; 4186 break; 4187 } 4188 if (!batch_count--) { 4189 batch_count = PRECHARGE_COUNT_AT_ONCE; 4190 cond_resched(); 4191 } 4192 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4193 if (ret || !mem) 4194 /* mem_cgroup_clear_mc() will do uncharge later */ 4195 return -ENOMEM; 4196 mc.precharge++; 4197 } 4198 return ret; 4199 } 4200 4201 /** 4202 * is_target_pte_for_mc - check a pte whether it is valid for move charge 4203 * @vma: the vma the pte to be checked belongs 4204 * @addr: the address corresponding to the pte to be checked 4205 * @ptent: the pte to be checked 4206 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4207 * 4208 * Returns 4209 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 
4210 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4211 * move charge. if @target is not NULL, the page is stored in target->page 4212 * with extra refcnt got(Callers should handle it). 4213 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4214 * target for charge migration. if @target is not NULL, the entry is stored 4215 * in target->ent. 4216 * 4217 * Called with pte lock held. 4218 */ 4219 union mc_target { 4220 struct page *page; 4221 swp_entry_t ent; 4222 }; 4223 4224 enum mc_target_type { 4225 MC_TARGET_NONE, /* not used */ 4226 MC_TARGET_PAGE, 4227 MC_TARGET_SWAP, 4228 }; 4229 4230 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4231 unsigned long addr, pte_t ptent) 4232 { 4233 struct page *page = vm_normal_page(vma, addr, ptent); 4234 4235 if (!page || !page_mapped(page)) 4236 return NULL; 4237 if (PageAnon(page)) { 4238 /* we don't move shared anon */ 4239 if (!move_anon() || page_mapcount(page) > 2) 4240 return NULL; 4241 } else if (!move_file()) 4242 /* we ignore mapcount for file pages */ 4243 return NULL; 4244 if (!get_page_unless_zero(page)) 4245 return NULL; 4246 4247 return page; 4248 } 4249 4250 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4251 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4252 { 4253 int usage_count; 4254 struct page *page = NULL; 4255 swp_entry_t ent = pte_to_swp_entry(ptent); 4256 4257 if (!move_anon() || non_swap_entry(ent)) 4258 return NULL; 4259 usage_count = mem_cgroup_count_swap_user(ent, &page); 4260 if (usage_count > 1) { /* we don't move shared anon */ 4261 if (page) 4262 put_page(page); 4263 return NULL; 4264 } 4265 if (do_swap_account) 4266 entry->val = ent.val; 4267 4268 return page; 4269 } 4270 4271 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4272 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4273 { 4274 struct page *page = NULL; 4275 struct inode *inode; 4276 struct address_space *mapping; 4277 pgoff_t pgoff; 4278 4279 if (!vma->vm_file) /* anonymous vma */ 4280 return NULL; 4281 if (!move_file()) 4282 return NULL; 4283 4284 inode = vma->vm_file->f_path.dentry->d_inode; 4285 mapping = vma->vm_file->f_mapping; 4286 if (pte_none(ptent)) 4287 pgoff = linear_page_index(vma, addr); 4288 else /* pte_file(ptent) is true */ 4289 pgoff = pte_to_pgoff(ptent); 4290 4291 /* page is moved even if it's not RSS of this task(page-faulted). */ 4292 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 4293 page = find_get_page(mapping, pgoff); 4294 } else { /* shmem/tmpfs file. we should take account of swap too. */ 4295 swp_entry_t ent; 4296 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 4297 if (do_swap_account) 4298 entry->val = ent.val; 4299 } 4300 4301 return page; 4302 } 4303 4304 static int is_target_pte_for_mc(struct vm_area_struct *vma, 4305 unsigned long addr, pte_t ptent, union mc_target *target) 4306 { 4307 struct page *page = NULL; 4308 struct page_cgroup *pc; 4309 int ret = 0; 4310 swp_entry_t ent = { .val = 0 }; 4311 4312 if (pte_present(ptent)) 4313 page = mc_handle_present_pte(vma, addr, ptent); 4314 else if (is_swap_pte(ptent)) 4315 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4316 else if (pte_none(ptent) || pte_file(ptent)) 4317 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4318 4319 if (!page && !ent.val) 4320 return 0; 4321 if (page) { 4322 pc = lookup_page_cgroup(page); 4323 /* 4324 * Do only loose check w/o page_cgroup lock. 
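 * A stale answer here only affects which pages we try to move;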
4325 * mem_cgroup_move_account() checks the pc is valid or not under 4326 * the lock. 4327 */ 4328 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 4329 ret = MC_TARGET_PAGE; 4330 if (target) 4331 target->page = page; 4332 } 4333 if (!ret || !target) 4334 put_page(page); 4335 } 4336 /* There is a swap entry and a page doesn't exist or isn't charged */ 4337 if (ent.val && !ret && 4338 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 4339 ret = MC_TARGET_SWAP; 4340 if (target) 4341 target->ent = ent; 4342 } 4343 return ret; 4344 } 4345 4346 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4347 unsigned long addr, unsigned long end, 4348 struct mm_walk *walk) 4349 { 4350 struct vm_area_struct *vma = walk->private; 4351 pte_t *pte; 4352 spinlock_t *ptl; 4353 4354 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4355 for (; addr != end; pte++, addr += PAGE_SIZE) 4356 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4357 mc.precharge++; /* increment precharge temporarily */ 4358 pte_unmap_unlock(pte - 1, ptl); 4359 cond_resched(); 4360 4361 return 0; 4362 } 4363 4364 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4365 { 4366 unsigned long precharge; 4367 struct vm_area_struct *vma; 4368 4369 down_read(&mm->mmap_sem); 4370 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4371 struct mm_walk mem_cgroup_count_precharge_walk = { 4372 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4373 .mm = mm, 4374 .private = vma, 4375 }; 4376 if (is_vm_hugetlb_page(vma)) 4377 continue; 4378 walk_page_range(vma->vm_start, vma->vm_end, 4379 &mem_cgroup_count_precharge_walk); 4380 } 4381 up_read(&mm->mmap_sem); 4382 4383 precharge = mc.precharge; 4384 mc.precharge = 0; 4385 4386 return precharge; 4387 } 4388 4389 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4390 { 4391 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4392 } 4393 4394 static void mem_cgroup_clear_mc(void) 4395 { 4396 /* we must uncharge all the leftover precharges from mc.to */ 4397 if (mc.precharge) { 4398 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4399 mc.precharge = 0; 4400 memcg_oom_recover(mc.to); 4401 } 4402 /* 4403 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4404 * we must uncharge here. 4405 */ 4406 if (mc.moved_charge) { 4407 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4408 mc.moved_charge = 0; 4409 memcg_oom_recover(mc.from); 4410 } 4411 /* we must fixup refcnts and charges */ 4412 if (mc.moved_swap) { 4413 WARN_ON_ONCE(mc.moved_swap > INT_MAX); 4414 /* uncharge swap account from the old cgroup */ 4415 if (!mem_cgroup_is_root(mc.from)) 4416 res_counter_uncharge(&mc.from->memsw, 4417 PAGE_SIZE * mc.moved_swap); 4418 __mem_cgroup_put(mc.from, mc.moved_swap); 4419 4420 if (!mem_cgroup_is_root(mc.to)) { 4421 /* 4422 * we charged both to->res and to->memsw, so we should 4423 * uncharge to->res. 
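 * The memsw part stays charged because the entry remains in swap,
 * now accounted to mc.to.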
4424 */ 4425 res_counter_uncharge(&mc.to->res, 4426 PAGE_SIZE * mc.moved_swap); 4427 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); 4428 __css_put(&mc.to->css, mc.moved_swap); 4429 } 4430 /* we've already done mem_cgroup_get(mc.to) */ 4431 4432 mc.moved_swap = 0; 4433 } 4434 mc.from = NULL; 4435 mc.to = NULL; 4436 mc.moving_task = NULL; 4437 wake_up_all(&mc.waitq); 4438 } 4439 4440 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4441 struct cgroup *cgroup, 4442 struct task_struct *p, 4443 bool threadgroup) 4444 { 4445 int ret = 0; 4446 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 4447 4448 if (mem->move_charge_at_immigrate) { 4449 struct mm_struct *mm; 4450 struct mem_cgroup *from = mem_cgroup_from_task(p); 4451 4452 VM_BUG_ON(from == mem); 4453 4454 mm = get_task_mm(p); 4455 if (!mm) 4456 return 0; 4457 /* We move charges only when we move a owner of the mm */ 4458 if (mm->owner == p) { 4459 VM_BUG_ON(mc.from); 4460 VM_BUG_ON(mc.to); 4461 VM_BUG_ON(mc.precharge); 4462 VM_BUG_ON(mc.moved_charge); 4463 VM_BUG_ON(mc.moved_swap); 4464 VM_BUG_ON(mc.moving_task); 4465 mc.from = from; 4466 mc.to = mem; 4467 mc.precharge = 0; 4468 mc.moved_charge = 0; 4469 mc.moved_swap = 0; 4470 mc.moving_task = current; 4471 4472 ret = mem_cgroup_precharge_mc(mm); 4473 if (ret) 4474 mem_cgroup_clear_mc(); 4475 } 4476 mmput(mm); 4477 } 4478 return ret; 4479 } 4480 4481 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4482 struct cgroup *cgroup, 4483 struct task_struct *p, 4484 bool threadgroup) 4485 { 4486 mem_cgroup_clear_mc(); 4487 } 4488 4489 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4490 unsigned long addr, unsigned long end, 4491 struct mm_walk *walk) 4492 { 4493 int ret = 0; 4494 struct vm_area_struct *vma = walk->private; 4495 pte_t *pte; 4496 spinlock_t *ptl; 4497 4498 retry: 4499 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4500 for (; addr != end; addr += PAGE_SIZE) { 4501 pte_t ptent = *(pte++); 4502 union mc_target target; 4503 int type; 4504 struct page *page; 4505 struct page_cgroup *pc; 4506 swp_entry_t ent; 4507 4508 if (!mc.precharge) 4509 break; 4510 4511 type = is_target_pte_for_mc(vma, addr, ptent, &target); 4512 switch (type) { 4513 case MC_TARGET_PAGE: 4514 page = target.page; 4515 if (isolate_lru_page(page)) 4516 goto put; 4517 pc = lookup_page_cgroup(page); 4518 if (!mem_cgroup_move_account(pc, 4519 mc.from, mc.to, false)) { 4520 mc.precharge--; 4521 /* we uncharge from mc.from later. */ 4522 mc.moved_charge++; 4523 } 4524 putback_lru_page(page); 4525 put: /* is_target_pte_for_mc() gets the page */ 4526 put_page(page); 4527 break; 4528 case MC_TARGET_SWAP: 4529 ent = target.ent; 4530 if (!mem_cgroup_move_swap_account(ent, 4531 mc.from, mc.to, false)) { 4532 mc.precharge--; 4533 /* we fixup refcnts and charges later. */ 4534 mc.moved_swap++; 4535 } 4536 break; 4537 default: 4538 break; 4539 } 4540 } 4541 pte_unmap_unlock(pte - 1, ptl); 4542 cond_resched(); 4543 4544 if (addr != end) { 4545 /* 4546 * We have consumed all precharges we got in can_attach(). 4547 * We try charge one by one, but don't do any additional 4548 * charges to mc.to if we have failed in charge once in attach() 4549 * phase. 
4550 */ 4551 ret = mem_cgroup_do_precharge(1); 4552 if (!ret) 4553 goto retry; 4554 } 4555 4556 return ret; 4557 } 4558 4559 static void mem_cgroup_move_charge(struct mm_struct *mm) 4560 { 4561 struct vm_area_struct *vma; 4562 4563 lru_add_drain_all(); 4564 down_read(&mm->mmap_sem); 4565 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4566 int ret; 4567 struct mm_walk mem_cgroup_move_charge_walk = { 4568 .pmd_entry = mem_cgroup_move_charge_pte_range, 4569 .mm = mm, 4570 .private = vma, 4571 }; 4572 if (is_vm_hugetlb_page(vma)) 4573 continue; 4574 ret = walk_page_range(vma->vm_start, vma->vm_end, 4575 &mem_cgroup_move_charge_walk); 4576 if (ret) 4577 /* 4578 * means we have consumed all precharges and failed in 4579 * doing additional charge. Just abandon here. 4580 */ 4581 break; 4582 } 4583 up_read(&mm->mmap_sem); 4584 } 4585 4586 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4587 struct cgroup *cont, 4588 struct cgroup *old_cont, 4589 struct task_struct *p, 4590 bool threadgroup) 4591 { 4592 struct mm_struct *mm; 4593 4594 if (!mc.to) 4595 /* no need to move charge */ 4596 return; 4597 4598 mm = get_task_mm(p); 4599 if (mm) { 4600 mem_cgroup_move_charge(mm); 4601 mmput(mm); 4602 } 4603 mem_cgroup_clear_mc(); 4604 } 4605 #else /* !CONFIG_MMU */ 4606 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4607 struct cgroup *cgroup, 4608 struct task_struct *p, 4609 bool threadgroup) 4610 { 4611 return 0; 4612 } 4613 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4614 struct cgroup *cgroup, 4615 struct task_struct *p, 4616 bool threadgroup) 4617 { 4618 } 4619 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4620 struct cgroup *cont, 4621 struct cgroup *old_cont, 4622 struct task_struct *p, 4623 bool threadgroup) 4624 { 4625 } 4626 #endif 4627 4628 struct cgroup_subsys mem_cgroup_subsys = { 4629 .name = "memory", 4630 .subsys_id = mem_cgroup_subsys_id, 4631 .create = mem_cgroup_create, 4632 .pre_destroy = mem_cgroup_pre_destroy, 4633 .destroy = mem_cgroup_destroy, 4634 .populate = mem_cgroup_populate, 4635 .can_attach = mem_cgroup_can_attach, 4636 .cancel_attach = mem_cgroup_cancel_attach, 4637 .attach = mem_cgroup_move_task, 4638 .early_init = 0, 4639 .use_id = 1, 4640 }; 4641 4642 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4643 4644 static int __init disable_swap_account(char *s) 4645 { 4646 really_do_swap_account = 0; 4647 return 1; 4648 } 4649 __setup("noswapaccount", disable_swap_account); 4650 #endif 4651