1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24 #include <linux/res_counter.h> 25 #include <linux/memcontrol.h> 26 #include <linux/cgroup.h> 27 #include <linux/mm.h> 28 #include <linux/hugetlb.h> 29 #include <linux/pagemap.h> 30 #include <linux/smp.h> 31 #include <linux/page-flags.h> 32 #include <linux/backing-dev.h> 33 #include <linux/bit_spinlock.h> 34 #include <linux/rcupdate.h> 35 #include <linux/limits.h> 36 #include <linux/mutex.h> 37 #include <linux/rbtree.h> 38 #include <linux/slab.h> 39 #include <linux/swap.h> 40 #include <linux/swapops.h> 41 #include <linux/spinlock.h> 42 #include <linux/eventfd.h> 43 #include <linux/sort.h> 44 #include <linux/fs.h> 45 #include <linux/seq_file.h> 46 #include <linux/vmalloc.h> 47 #include <linux/mm_inline.h> 48 #include <linux/page_cgroup.h> 49 #include <linux/cpu.h> 50 #include <linux/oom.h> 51 #include "internal.h" 52 53 #include <asm/uaccess.h> 54 55 #include <trace/events/vmscan.h> 56 57 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 58 #define MEM_CGROUP_RECLAIM_RETRIES 5 59 struct mem_cgroup *root_mem_cgroup __read_mostly; 60 61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 63 int do_swap_account __read_mostly; 64 65 /* for remember boot option*/ 66 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 67 static int really_do_swap_account __initdata = 1; 68 #else 69 static int really_do_swap_account __initdata = 0; 70 #endif 71 72 #else 73 #define do_swap_account (0) 74 #endif 75 76 /* 77 * Per memcg event counter is incremented at every pagein/pageout. This counter 78 * is used for trigger some periodic events. This is straightforward and better 79 * than using jiffies etc. to handle periodic memcg event. 80 * 81 * These values will be used as !((event) & ((1 <<(thresh)) - 1)) 82 */ 83 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ 84 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ 85 86 /* 87 * Statistics for memory cgroup. 88 */ 89 enum mem_cgroup_stat_index { 90 /* 91 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 
92 */ 93 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 94 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 95 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 96 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 97 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 98 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 99 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 100 /* incremented at every pagein/pageout */ 101 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, 102 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 103 104 MEM_CGROUP_STAT_NSTATS, 105 }; 106 107 struct mem_cgroup_stat_cpu { 108 s64 count[MEM_CGROUP_STAT_NSTATS]; 109 }; 110 111 /* 112 * per-zone information in memory controller. 113 */ 114 struct mem_cgroup_per_zone { 115 /* 116 * spin_lock to protect the per cgroup LRU 117 */ 118 struct list_head lists[NR_LRU_LISTS]; 119 unsigned long count[NR_LRU_LISTS]; 120 121 struct zone_reclaim_stat reclaim_stat; 122 struct rb_node tree_node; /* RB tree node */ 123 unsigned long long usage_in_excess;/* Set to the value by which */ 124 /* the soft limit is exceeded*/ 125 bool on_tree; 126 struct mem_cgroup *mem; /* Back pointer, we cannot */ 127 /* use container_of */ 128 }; 129 /* Macro for accessing counter */ 130 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 131 132 struct mem_cgroup_per_node { 133 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 134 }; 135 136 struct mem_cgroup_lru_info { 137 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 138 }; 139 140 /* 141 * Cgroups above their limits are maintained in a RB-Tree, independent of 142 * their hierarchy representation 143 */ 144 145 struct mem_cgroup_tree_per_zone { 146 struct rb_root rb_root; 147 spinlock_t lock; 148 }; 149 150 struct mem_cgroup_tree_per_node { 151 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 152 }; 153 154 struct mem_cgroup_tree { 155 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 156 }; 157 158 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 159 160 struct mem_cgroup_threshold { 161 struct eventfd_ctx *eventfd; 162 u64 threshold; 163 }; 164 165 /* For threshold */ 166 struct mem_cgroup_threshold_ary { 167 /* An array index points to threshold just below usage. */ 168 int current_threshold; 169 /* Size of entries[] */ 170 unsigned int size; 171 /* Array of thresholds */ 172 struct mem_cgroup_threshold entries[0]; 173 }; 174 175 struct mem_cgroup_thresholds { 176 /* Primary thresholds array */ 177 struct mem_cgroup_threshold_ary *primary; 178 /* 179 * Spare threshold array. 180 * This is needed to make mem_cgroup_unregister_event() "never fail". 181 * It must be able to store at least primary->size - 1 entries. 182 */ 183 struct mem_cgroup_threshold_ary *spare; 184 }; 185 186 /* for OOM */ 187 struct mem_cgroup_eventfd_list { 188 struct list_head list; 189 struct eventfd_ctx *eventfd; 190 }; 191 192 static void mem_cgroup_threshold(struct mem_cgroup *mem); 193 static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 194 195 /* 196 * The memory controller data structure. The memory controller controls both 197 * page cache and RSS per cgroup. We would eventually like to provide 198 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 199 * to help the administrator determine what knobs to tune. 200 * 201 * TODO: Add a water mark for the memory controller. Reclaim will begin when 202 * we hit the water mark. 
May be even add a low water mark, such that 203 * no reclaim occurs from a cgroup at it's low water mark, this is 204 * a feature that will be implemented much later in the future. 205 */ 206 struct mem_cgroup { 207 struct cgroup_subsys_state css; 208 /* 209 * the counter to account for memory usage 210 */ 211 struct res_counter res; 212 /* 213 * the counter to account for mem+swap usage. 214 */ 215 struct res_counter memsw; 216 /* 217 * Per cgroup active and inactive list, similar to the 218 * per zone LRU lists. 219 */ 220 struct mem_cgroup_lru_info info; 221 222 /* 223 protect against reclaim related member. 224 */ 225 spinlock_t reclaim_param_lock; 226 227 /* 228 * While reclaiming in a hierarchy, we cache the last child we 229 * reclaimed from. 230 */ 231 int last_scanned_child; 232 /* 233 * Should the accounting and control be hierarchical, per subtree? 234 */ 235 bool use_hierarchy; 236 atomic_t oom_lock; 237 atomic_t refcnt; 238 239 unsigned int swappiness; 240 /* OOM-Killer disable */ 241 int oom_kill_disable; 242 243 /* set when res.limit == memsw.limit */ 244 bool memsw_is_minimum; 245 246 /* protect arrays of thresholds */ 247 struct mutex thresholds_lock; 248 249 /* thresholds for memory usage. RCU-protected */ 250 struct mem_cgroup_thresholds thresholds; 251 252 /* thresholds for mem+swap usage. RCU-protected */ 253 struct mem_cgroup_thresholds memsw_thresholds; 254 255 /* For oom notifier event fd */ 256 struct list_head oom_notify; 257 258 /* 259 * Should we move charges of a task when a task is moved into this 260 * mem_cgroup ? And what type of charges should we move ? 261 */ 262 unsigned long move_charge_at_immigrate; 263 /* 264 * percpu counter. 265 */ 266 struct mem_cgroup_stat_cpu *stat; 267 /* 268 * used when a cpu is offlined or other synchronizations 269 * See mem_cgroup_read_stat(). 270 */ 271 struct mem_cgroup_stat_cpu nocpu_base; 272 spinlock_t pcp_counter_lock; 273 }; 274 275 /* Stuffs for move charges at task migration. */ 276 /* 277 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 278 * left-shifted bitmap of these types. 279 */ 280 enum move_type { 281 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 282 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 283 NR_MOVE_TYPE, 284 }; 285 286 /* "mc" and its members are protected by cgroup_mutex */ 287 static struct move_charge_struct { 288 spinlock_t lock; /* for from, to */ 289 struct mem_cgroup *from; 290 struct mem_cgroup *to; 291 unsigned long precharge; 292 unsigned long moved_charge; 293 unsigned long moved_swap; 294 struct task_struct *moving_task; /* a task moving charges */ 295 wait_queue_head_t waitq; /* a waitq for other context */ 296 } mc = { 297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 298 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 299 }; 300 301 static bool move_anon(void) 302 { 303 return test_bit(MOVE_CHARGE_TYPE_ANON, 304 &mc.to->move_charge_at_immigrate); 305 } 306 307 static bool move_file(void) 308 { 309 return test_bit(MOVE_CHARGE_TYPE_FILE, 310 &mc.to->move_charge_at_immigrate); 311 } 312 313 /* 314 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 315 * limit reclaim to prevent infinite loops, if they ever occur. 
316 */ 317 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 318 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 319 320 enum charge_type { 321 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 322 MEM_CGROUP_CHARGE_TYPE_MAPPED, 323 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 324 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 325 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 326 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 327 NR_CHARGE_TYPE, 328 }; 329 330 /* only for here (for easy reading.) */ 331 #define PCGF_CACHE (1UL << PCG_CACHE) 332 #define PCGF_USED (1UL << PCG_USED) 333 #define PCGF_LOCK (1UL << PCG_LOCK) 334 /* Not used, but added here for completeness */ 335 #define PCGF_ACCT (1UL << PCG_ACCT) 336 337 /* for encoding cft->private value on file */ 338 #define _MEM (0) 339 #define _MEMSWAP (1) 340 #define _OOM_TYPE (2) 341 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 342 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 343 #define MEMFILE_ATTR(val) ((val) & 0xffff) 344 /* Used for OOM nofiier */ 345 #define OOM_CONTROL (0) 346 347 /* 348 * Reclaim flags for mem_cgroup_hierarchical_reclaim 349 */ 350 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 351 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 352 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 353 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 354 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 355 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 356 357 static void mem_cgroup_get(struct mem_cgroup *mem); 358 static void mem_cgroup_put(struct mem_cgroup *mem); 359 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 360 static void drain_all_stock_async(void); 361 362 static struct mem_cgroup_per_zone * 363 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 364 { 365 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 366 } 367 368 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 369 { 370 return &mem->css; 371 } 372 373 static struct mem_cgroup_per_zone * 374 page_cgroup_zoneinfo(struct page_cgroup *pc) 375 { 376 struct mem_cgroup *mem = pc->mem_cgroup; 377 int nid = page_cgroup_nid(pc); 378 int zid = page_cgroup_zid(pc); 379 380 if (!mem) 381 return NULL; 382 383 return mem_cgroup_zoneinfo(mem, nid, zid); 384 } 385 386 static struct mem_cgroup_tree_per_zone * 387 soft_limit_tree_node_zone(int nid, int zid) 388 { 389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 390 } 391 392 static struct mem_cgroup_tree_per_zone * 393 soft_limit_tree_from_page(struct page *page) 394 { 395 int nid = page_to_nid(page); 396 int zid = page_zonenum(page); 397 398 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 399 } 400 401 static void 402 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 403 struct mem_cgroup_per_zone *mz, 404 struct mem_cgroup_tree_per_zone *mctz, 405 unsigned long long new_usage_in_excess) 406 { 407 struct rb_node **p = &mctz->rb_root.rb_node; 408 struct rb_node *parent = NULL; 409 struct mem_cgroup_per_zone *mz_node; 410 411 if (mz->on_tree) 412 return; 413 414 mz->usage_in_excess = new_usage_in_excess; 415 if (!mz->usage_in_excess) 416 return; 417 while (*p) { 418 parent = *p; 419 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 420 tree_node); 421 if (mz->usage_in_excess < mz_node->usage_in_excess) 422 p = &(*p)->rb_left; 423 /* 424 * We can't avoid mem cgroups that are over their soft 425 * limit by the same amount 426 
*/ 427 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 428 p = &(*p)->rb_right; 429 } 430 rb_link_node(&mz->tree_node, parent, p); 431 rb_insert_color(&mz->tree_node, &mctz->rb_root); 432 mz->on_tree = true; 433 } 434 435 static void 436 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 437 struct mem_cgroup_per_zone *mz, 438 struct mem_cgroup_tree_per_zone *mctz) 439 { 440 if (!mz->on_tree) 441 return; 442 rb_erase(&mz->tree_node, &mctz->rb_root); 443 mz->on_tree = false; 444 } 445 446 static void 447 mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 448 struct mem_cgroup_per_zone *mz, 449 struct mem_cgroup_tree_per_zone *mctz) 450 { 451 spin_lock(&mctz->lock); 452 __mem_cgroup_remove_exceeded(mem, mz, mctz); 453 spin_unlock(&mctz->lock); 454 } 455 456 457 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 458 { 459 unsigned long long excess; 460 struct mem_cgroup_per_zone *mz; 461 struct mem_cgroup_tree_per_zone *mctz; 462 int nid = page_to_nid(page); 463 int zid = page_zonenum(page); 464 mctz = soft_limit_tree_from_page(page); 465 466 /* 467 * Necessary to update all ancestors when hierarchy is used. 468 * because their event counter is not touched. 469 */ 470 for (; mem; mem = parent_mem_cgroup(mem)) { 471 mz = mem_cgroup_zoneinfo(mem, nid, zid); 472 excess = res_counter_soft_limit_excess(&mem->res); 473 /* 474 * We have to update the tree if mz is on RB-tree or 475 * mem is over its softlimit. 476 */ 477 if (excess || mz->on_tree) { 478 spin_lock(&mctz->lock); 479 /* if on-tree, remove it */ 480 if (mz->on_tree) 481 __mem_cgroup_remove_exceeded(mem, mz, mctz); 482 /* 483 * Insert again. mz->usage_in_excess will be updated. 484 * If excess is 0, no tree ops. 485 */ 486 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 487 spin_unlock(&mctz->lock); 488 } 489 } 490 } 491 492 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 493 { 494 int node, zone; 495 struct mem_cgroup_per_zone *mz; 496 struct mem_cgroup_tree_per_zone *mctz; 497 498 for_each_node_state(node, N_POSSIBLE) { 499 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 500 mz = mem_cgroup_zoneinfo(mem, node, zone); 501 mctz = soft_limit_tree_node_zone(node, zone); 502 mem_cgroup_remove_exceeded(mem, mz, mctz); 503 } 504 } 505 } 506 507 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 508 { 509 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 510 } 511 512 static struct mem_cgroup_per_zone * 513 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 514 { 515 struct rb_node *rightmost = NULL; 516 struct mem_cgroup_per_zone *mz; 517 518 retry: 519 mz = NULL; 520 rightmost = rb_last(&mctz->rb_root); 521 if (!rightmost) 522 goto done; /* Nothing to reclaim from */ 523 524 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 525 /* 526 * Remove the node now but someone else can add it back, 527 * we will to add it back at the end of reclaim to its correct 528 * position in the tree. 
 */
	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
		!css_tryget(&mz->mem->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value, and we may have a chance to
 * implement a similar periodic synchronization of the counters in memcg.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because he accounts memory. Even if we provided a quick-and-fuzzy read, we
 * would always have to visit all online cpus and make a sum. So, for now,
 * unnecessary synchronization is not implemented. (It is only implemented for
 * cpu hotplug.)
 *
 * If there are kernel-internal actions which can make use of a not-exact
 * value, and reading all cpu values becomes a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
				enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 val = 0;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&mem->pcp_counter_lock);
	val += mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
{
	s64 ret;

	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	return ret;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					bool file, int nr_pages)
{
	preempt_disable();

	if (file)
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
	else
		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);

	/* pagein of a big page is an event.
So, ignore page size */ 613 if (nr_pages > 0) 614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 615 else { 616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 617 nr_pages = -nr_pages; /* for event */ 618 } 619 620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); 621 622 preempt_enable(); 623 } 624 625 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 626 enum lru_list idx) 627 { 628 int nid, zid; 629 struct mem_cgroup_per_zone *mz; 630 u64 total = 0; 631 632 for_each_online_node(nid) 633 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 634 mz = mem_cgroup_zoneinfo(mem, nid, zid); 635 total += MEM_CGROUP_ZSTAT(mz, idx); 636 } 637 return total; 638 } 639 640 static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 641 { 642 s64 val; 643 644 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 645 646 return !(val & ((1 << event_mask_shift) - 1)); 647 } 648 649 /* 650 * Check events in order. 651 * 652 */ 653 static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 654 { 655 /* threshold event is triggered in finer grain than soft limit */ 656 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 657 mem_cgroup_threshold(mem); 658 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 659 mem_cgroup_update_tree(mem, page); 660 } 661 } 662 663 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 664 { 665 return container_of(cgroup_subsys_state(cont, 666 mem_cgroup_subsys_id), struct mem_cgroup, 667 css); 668 } 669 670 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 671 { 672 /* 673 * mm_update_next_owner() may clear mm->owner to NULL 674 * if it races with swapoff, page migration, etc. 675 * So this can be called with p == NULL. 676 */ 677 if (unlikely(!p)) 678 return NULL; 679 680 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 681 struct mem_cgroup, css); 682 } 683 684 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 685 { 686 struct mem_cgroup *mem = NULL; 687 688 if (!mm) 689 return NULL; 690 /* 691 * Because we have no locks, mm->owner's may be being moved to other 692 * cgroup. We use css_tryget() here even if this looks 693 * pessimistic (rather than adding locks here). 694 */ 695 rcu_read_lock(); 696 do { 697 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 698 if (unlikely(!mem)) 699 break; 700 } while (!css_tryget(&mem->css)); 701 rcu_read_unlock(); 702 return mem; 703 } 704 705 /* The caller has to guarantee "mem" exists before calling this */ 706 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 707 { 708 struct cgroup_subsys_state *css; 709 int found; 710 711 if (!mem) /* ROOT cgroup has the smallest ID */ 712 return root_mem_cgroup; /*css_put/get against root is ignored*/ 713 if (!mem->use_hierarchy) { 714 if (css_tryget(&mem->css)) 715 return mem; 716 return NULL; 717 } 718 rcu_read_lock(); 719 /* 720 * searching a memory cgroup which has the smallest ID under given 721 * ROOT cgroup. 
(ID >= 1) 722 */ 723 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 724 if (css && css_tryget(css)) 725 mem = container_of(css, struct mem_cgroup, css); 726 else 727 mem = NULL; 728 rcu_read_unlock(); 729 return mem; 730 } 731 732 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 733 struct mem_cgroup *root, 734 bool cond) 735 { 736 int nextid = css_id(&iter->css) + 1; 737 int found; 738 int hierarchy_used; 739 struct cgroup_subsys_state *css; 740 741 hierarchy_used = iter->use_hierarchy; 742 743 css_put(&iter->css); 744 /* If no ROOT, walk all, ignore hierarchy */ 745 if (!cond || (root && !hierarchy_used)) 746 return NULL; 747 748 if (!root) 749 root = root_mem_cgroup; 750 751 do { 752 iter = NULL; 753 rcu_read_lock(); 754 755 css = css_get_next(&mem_cgroup_subsys, nextid, 756 &root->css, &found); 757 if (css && css_tryget(css)) 758 iter = container_of(css, struct mem_cgroup, css); 759 rcu_read_unlock(); 760 /* If css is NULL, no more cgroups will be found */ 761 nextid = found + 1; 762 } while (css && !iter); 763 764 return iter; 765 } 766 /* 767 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please 768 * be careful that "break" loop is not allowed. We have reference count. 769 * Instead of that modify "cond" to be false and "continue" to exit the loop. 770 */ 771 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ 772 for (iter = mem_cgroup_start_loop(root);\ 773 iter != NULL;\ 774 iter = mem_cgroup_get_next(iter, root, cond)) 775 776 #define for_each_mem_cgroup_tree(iter, root) \ 777 for_each_mem_cgroup_tree_cond(iter, root, true) 778 779 #define for_each_mem_cgroup_all(iter) \ 780 for_each_mem_cgroup_tree_cond(iter, NULL, true) 781 782 783 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 784 { 785 return (mem == root_mem_cgroup); 786 } 787 788 /* 789 * Following LRU functions are allowed to be used without PCG_LOCK. 790 * Operations are called by routine of global LRU independently from memcg. 791 * What we have to take care of here is validness of pc->mem_cgroup. 792 * 793 * Changes to pc->mem_cgroup happens when 794 * 1. charge 795 * 2. moving account 796 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 797 * It is added to LRU before charge. 798 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 799 * When moving account, the page is not on LRU. It's isolated. 800 */ 801 802 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 803 { 804 struct page_cgroup *pc; 805 struct mem_cgroup_per_zone *mz; 806 807 if (mem_cgroup_disabled()) 808 return; 809 pc = lookup_page_cgroup(page); 810 /* can happen while we handle swapcache. */ 811 if (!TestClearPageCgroupAcctLRU(pc)) 812 return; 813 VM_BUG_ON(!pc->mem_cgroup); 814 /* 815 * We don't check PCG_USED bit. It's cleared when the "page" is finally 816 * removed from global LRU. 817 */ 818 mz = page_cgroup_zoneinfo(pc); 819 /* huge page split is done under lru_lock. so, we have no races. */ 820 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 821 if (mem_cgroup_is_root(pc->mem_cgroup)) 822 return; 823 VM_BUG_ON(list_empty(&pc->lru)); 824 list_del_init(&pc->lru); 825 } 826 827 void mem_cgroup_del_lru(struct page *page) 828 { 829 mem_cgroup_del_lru_list(page, page_lru(page)); 830 } 831 832 /* 833 * Writeback is about to end against a page which has been marked for immediate 834 * reclaim. If it still appears to be reclaimable, move it to the tail of the 835 * inactive list. 
836 */ 837 void mem_cgroup_rotate_reclaimable_page(struct page *page) 838 { 839 struct mem_cgroup_per_zone *mz; 840 struct page_cgroup *pc; 841 enum lru_list lru = page_lru(page); 842 843 if (mem_cgroup_disabled()) 844 return; 845 846 pc = lookup_page_cgroup(page); 847 /* unused or root page is not rotated. */ 848 if (!PageCgroupUsed(pc)) 849 return; 850 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 851 smp_rmb(); 852 if (mem_cgroup_is_root(pc->mem_cgroup)) 853 return; 854 mz = page_cgroup_zoneinfo(pc); 855 list_move_tail(&pc->lru, &mz->lists[lru]); 856 } 857 858 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 859 { 860 struct mem_cgroup_per_zone *mz; 861 struct page_cgroup *pc; 862 863 if (mem_cgroup_disabled()) 864 return; 865 866 pc = lookup_page_cgroup(page); 867 /* unused or root page is not rotated. */ 868 if (!PageCgroupUsed(pc)) 869 return; 870 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 871 smp_rmb(); 872 if (mem_cgroup_is_root(pc->mem_cgroup)) 873 return; 874 mz = page_cgroup_zoneinfo(pc); 875 list_move(&pc->lru, &mz->lists[lru]); 876 } 877 878 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 879 { 880 struct page_cgroup *pc; 881 struct mem_cgroup_per_zone *mz; 882 883 if (mem_cgroup_disabled()) 884 return; 885 pc = lookup_page_cgroup(page); 886 VM_BUG_ON(PageCgroupAcctLRU(pc)); 887 if (!PageCgroupUsed(pc)) 888 return; 889 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 890 smp_rmb(); 891 mz = page_cgroup_zoneinfo(pc); 892 /* huge page split is done under lru_lock. so, we have no races. */ 893 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 894 SetPageCgroupAcctLRU(pc); 895 if (mem_cgroup_is_root(pc->mem_cgroup)) 896 return; 897 list_add(&pc->lru, &mz->lists[lru]); 898 } 899 900 /* 901 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 902 * lru because the page may.be reused after it's fully uncharged (because of 903 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 904 * it again. This function is only used to charge SwapCache. It's done under 905 * lock_page and expected that zone->lru_lock is never held. 906 */ 907 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 908 { 909 unsigned long flags; 910 struct zone *zone = page_zone(page); 911 struct page_cgroup *pc = lookup_page_cgroup(page); 912 913 spin_lock_irqsave(&zone->lru_lock, flags); 914 /* 915 * Forget old LRU when this page_cgroup is *not* used. This Used bit 916 * is guarded by lock_page() because the page is SwapCache. 
917 */ 918 if (!PageCgroupUsed(pc)) 919 mem_cgroup_del_lru_list(page, page_lru(page)); 920 spin_unlock_irqrestore(&zone->lru_lock, flags); 921 } 922 923 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 924 { 925 unsigned long flags; 926 struct zone *zone = page_zone(page); 927 struct page_cgroup *pc = lookup_page_cgroup(page); 928 929 spin_lock_irqsave(&zone->lru_lock, flags); 930 /* link when the page is linked to LRU but page_cgroup isn't */ 931 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 932 mem_cgroup_add_lru_list(page, page_lru(page)); 933 spin_unlock_irqrestore(&zone->lru_lock, flags); 934 } 935 936 937 void mem_cgroup_move_lists(struct page *page, 938 enum lru_list from, enum lru_list to) 939 { 940 if (mem_cgroup_disabled()) 941 return; 942 mem_cgroup_del_lru_list(page, from); 943 mem_cgroup_add_lru_list(page, to); 944 } 945 946 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 947 { 948 int ret; 949 struct mem_cgroup *curr = NULL; 950 struct task_struct *p; 951 952 p = find_lock_task_mm(task); 953 if (!p) 954 return 0; 955 curr = try_get_mem_cgroup_from_mm(p->mm); 956 task_unlock(p); 957 if (!curr) 958 return 0; 959 /* 960 * We should check use_hierarchy of "mem" not "curr". Because checking 961 * use_hierarchy of "curr" here make this function true if hierarchy is 962 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 963 * hierarchy(even if use_hierarchy is disabled in "mem"). 964 */ 965 if (mem->use_hierarchy) 966 ret = css_is_ancestor(&curr->css, &mem->css); 967 else 968 ret = (curr == mem); 969 css_put(&curr->css); 970 return ret; 971 } 972 973 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 974 { 975 unsigned long active; 976 unsigned long inactive; 977 unsigned long gb; 978 unsigned long inactive_ratio; 979 980 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 981 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 982 983 gb = (inactive + active) >> (30 - PAGE_SHIFT); 984 if (gb) 985 inactive_ratio = int_sqrt(10 * gb); 986 else 987 inactive_ratio = 1; 988 989 if (present_pages) { 990 present_pages[0] = inactive; 991 present_pages[1] = active; 992 } 993 994 return inactive_ratio; 995 } 996 997 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 998 { 999 unsigned long active; 1000 unsigned long inactive; 1001 unsigned long present_pages[2]; 1002 unsigned long inactive_ratio; 1003 1004 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1005 1006 inactive = present_pages[0]; 1007 active = present_pages[1]; 1008 1009 if (inactive * inactive_ratio < active) 1010 return 1; 1011 1012 return 0; 1013 } 1014 1015 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1016 { 1017 unsigned long active; 1018 unsigned long inactive; 1019 1020 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1021 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1022 1023 return (active > inactive); 1024 } 1025 1026 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1027 struct zone *zone, 1028 enum lru_list lru) 1029 { 1030 int nid = zone_to_nid(zone); 1031 int zid = zone_idx(zone); 1032 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1033 1034 return MEM_CGROUP_ZSTAT(mz, lru); 1035 } 1036 1037 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1038 struct zone *zone) 1039 { 1040 int nid = zone_to_nid(zone); 1041 int zid = zone_idx(zone); 1042 
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1043 1044 return &mz->reclaim_stat; 1045 } 1046 1047 struct zone_reclaim_stat * 1048 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1049 { 1050 struct page_cgroup *pc; 1051 struct mem_cgroup_per_zone *mz; 1052 1053 if (mem_cgroup_disabled()) 1054 return NULL; 1055 1056 pc = lookup_page_cgroup(page); 1057 if (!PageCgroupUsed(pc)) 1058 return NULL; 1059 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1060 smp_rmb(); 1061 mz = page_cgroup_zoneinfo(pc); 1062 if (!mz) 1063 return NULL; 1064 1065 return &mz->reclaim_stat; 1066 } 1067 1068 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1069 struct list_head *dst, 1070 unsigned long *scanned, int order, 1071 int mode, struct zone *z, 1072 struct mem_cgroup *mem_cont, 1073 int active, int file) 1074 { 1075 unsigned long nr_taken = 0; 1076 struct page *page; 1077 unsigned long scan; 1078 LIST_HEAD(pc_list); 1079 struct list_head *src; 1080 struct page_cgroup *pc, *tmp; 1081 int nid = zone_to_nid(z); 1082 int zid = zone_idx(z); 1083 struct mem_cgroup_per_zone *mz; 1084 int lru = LRU_FILE * file + active; 1085 int ret; 1086 1087 BUG_ON(!mem_cont); 1088 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1089 src = &mz->lists[lru]; 1090 1091 scan = 0; 1092 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1093 if (scan >= nr_to_scan) 1094 break; 1095 1096 page = pc->page; 1097 if (unlikely(!PageCgroupUsed(pc))) 1098 continue; 1099 if (unlikely(!PageLRU(page))) 1100 continue; 1101 1102 scan++; 1103 ret = __isolate_lru_page(page, mode, file); 1104 switch (ret) { 1105 case 0: 1106 list_move(&page->lru, dst); 1107 mem_cgroup_del_lru(page); 1108 nr_taken += hpage_nr_pages(page); 1109 break; 1110 case -EBUSY: 1111 /* we don't affect global LRU but rotate in our LRU */ 1112 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1113 break; 1114 default: 1115 break; 1116 } 1117 } 1118 1119 *scanned = scan; 1120 1121 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1122 0, 0, 0, mode); 1123 1124 return nr_taken; 1125 } 1126 1127 #define mem_cgroup_from_res_counter(counter, member) \ 1128 container_of(counter, struct mem_cgroup, member) 1129 1130 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1131 { 1132 if (do_swap_account) { 1133 if (res_counter_check_under_limit(&mem->res) && 1134 res_counter_check_under_limit(&mem->memsw)) 1135 return true; 1136 } else 1137 if (res_counter_check_under_limit(&mem->res)) 1138 return true; 1139 return false; 1140 } 1141 1142 /** 1143 * mem_cgroup_check_margin - check if the memory cgroup allows charging 1144 * @mem: memory cgroup to check 1145 * @bytes: the number of bytes the caller intends to charge 1146 * 1147 * Returns a boolean value on whether @mem can be charged @bytes or 1148 * whether this would exceed the limit. 1149 */ 1150 static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) 1151 { 1152 if (!res_counter_check_margin(&mem->res, bytes)) 1153 return false; 1154 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) 1155 return false; 1156 return true; 1157 } 1158 1159 static unsigned int get_swappiness(struct mem_cgroup *memcg) 1160 { 1161 struct cgroup *cgrp = memcg->css.cgroup; 1162 unsigned int swappiness; 1163 1164 /* root ? 
*/ 1165 if (cgrp->parent == NULL) 1166 return vm_swappiness; 1167 1168 spin_lock(&memcg->reclaim_param_lock); 1169 swappiness = memcg->swappiness; 1170 spin_unlock(&memcg->reclaim_param_lock); 1171 1172 return swappiness; 1173 } 1174 1175 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1176 { 1177 int cpu; 1178 1179 get_online_cpus(); 1180 spin_lock(&mem->pcp_counter_lock); 1181 for_each_online_cpu(cpu) 1182 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1183 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1184 spin_unlock(&mem->pcp_counter_lock); 1185 put_online_cpus(); 1186 1187 synchronize_rcu(); 1188 } 1189 1190 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1191 { 1192 int cpu; 1193 1194 if (!mem) 1195 return; 1196 get_online_cpus(); 1197 spin_lock(&mem->pcp_counter_lock); 1198 for_each_online_cpu(cpu) 1199 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1200 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1201 spin_unlock(&mem->pcp_counter_lock); 1202 put_online_cpus(); 1203 } 1204 /* 1205 * 2 routines for checking "mem" is under move_account() or not. 1206 * 1207 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1208 * for avoiding race in accounting. If true, 1209 * pc->mem_cgroup may be overwritten. 1210 * 1211 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1212 * under hierarchy of moving cgroups. This is for 1213 * waiting at hith-memory prressure caused by "move". 1214 */ 1215 1216 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1217 { 1218 VM_BUG_ON(!rcu_read_lock_held()); 1219 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1220 } 1221 1222 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1223 { 1224 struct mem_cgroup *from; 1225 struct mem_cgroup *to; 1226 bool ret = false; 1227 /* 1228 * Unlike task_move routines, we access mc.to, mc.from not under 1229 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1230 */ 1231 spin_lock(&mc.lock); 1232 from = mc.from; 1233 to = mc.to; 1234 if (!from) 1235 goto unlock; 1236 if (from == mem || to == mem 1237 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1238 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1239 ret = true; 1240 unlock: 1241 spin_unlock(&mc.lock); 1242 return ret; 1243 } 1244 1245 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1246 { 1247 if (mc.moving_task && current != mc.moving_task) { 1248 if (mem_cgroup_under_move(mem)) { 1249 DEFINE_WAIT(wait); 1250 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1251 /* moving charge context might have finished. */ 1252 if (mc.moving_task) 1253 schedule(); 1254 finish_wait(&mc.waitq, &wait); 1255 return true; 1256 } 1257 } 1258 return false; 1259 } 1260 1261 /** 1262 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1263 * @memcg: The memory cgroup that went over limit 1264 * @p: Task that is going to be killed 1265 * 1266 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1267 * enabled 1268 */ 1269 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1270 { 1271 struct cgroup *task_cgrp; 1272 struct cgroup *mem_cgrp; 1273 /* 1274 * Need a buffer in BSS, can't rely on allocations. The code relies 1275 * on the assumption that OOM is serialized for memory controller. 1276 * If this assumption is broken, revisit this code. 
1277 */ 1278 static char memcg_name[PATH_MAX]; 1279 int ret; 1280 1281 if (!memcg || !p) 1282 return; 1283 1284 1285 rcu_read_lock(); 1286 1287 mem_cgrp = memcg->css.cgroup; 1288 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1289 1290 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1291 if (ret < 0) { 1292 /* 1293 * Unfortunately, we are unable to convert to a useful name 1294 * But we'll still print out the usage information 1295 */ 1296 rcu_read_unlock(); 1297 goto done; 1298 } 1299 rcu_read_unlock(); 1300 1301 printk(KERN_INFO "Task in %s killed", memcg_name); 1302 1303 rcu_read_lock(); 1304 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1305 if (ret < 0) { 1306 rcu_read_unlock(); 1307 goto done; 1308 } 1309 rcu_read_unlock(); 1310 1311 /* 1312 * Continues from above, so we don't need an KERN_ level 1313 */ 1314 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1315 done: 1316 1317 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1318 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1319 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1320 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1321 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1322 "failcnt %llu\n", 1323 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1324 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1325 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1326 } 1327 1328 /* 1329 * This function returns the number of memcg under hierarchy tree. Returns 1330 * 1(self count) if no children. 1331 */ 1332 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1333 { 1334 int num = 0; 1335 struct mem_cgroup *iter; 1336 1337 for_each_mem_cgroup_tree(iter, mem) 1338 num++; 1339 return num; 1340 } 1341 1342 /* 1343 * Return the memory (and swap, if configured) limit for a memcg. 1344 */ 1345 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1346 { 1347 u64 limit; 1348 u64 memsw; 1349 1350 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1351 limit += total_swap_pages << PAGE_SHIFT; 1352 1353 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1354 /* 1355 * If memsw is finite and limits the amount of swap space available 1356 * to this memcg, return that limit. 1357 */ 1358 return min(limit, memsw); 1359 } 1360 1361 /* 1362 * Visit the first child (need not be the first child as per the ordering 1363 * of the cgroup list, since we track last_scanned_child) of @mem and use 1364 * that to reclaim free pages from. 1365 */ 1366 static struct mem_cgroup * 1367 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1368 { 1369 struct mem_cgroup *ret = NULL; 1370 struct cgroup_subsys_state *css; 1371 int nextid, found; 1372 1373 if (!root_mem->use_hierarchy) { 1374 css_get(&root_mem->css); 1375 ret = root_mem; 1376 } 1377 1378 while (!ret) { 1379 rcu_read_lock(); 1380 nextid = root_mem->last_scanned_child + 1; 1381 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1382 &found); 1383 if (css && css_tryget(css)) 1384 ret = container_of(css, struct mem_cgroup, css); 1385 1386 rcu_read_unlock(); 1387 /* Updates scanning parameter */ 1388 spin_lock(&root_mem->reclaim_param_lock); 1389 if (!css) { 1390 /* this means start scan from ID:1 */ 1391 root_mem->last_scanned_child = 0; 1392 } else 1393 root_mem->last_scanned_child = found; 1394 spin_unlock(&root_mem->reclaim_param_lock); 1395 } 1396 1397 return ret; 1398 } 1399 1400 /* 1401 * Scan the hierarchy if needed to reclaim memory. 
We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately to avoid freeing too much.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, nor so little that
				 * we keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
				noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * When shrinking usage, we can't check whether we should stop
		 * here or reclaim more; it depends on the callers.
		 * last_scanned_child will work well enough for keeping
		 * fairness under the tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}
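/*
 * Illustrative sketch (assumed helper, not used elsewhere in this file): how
 * a caller might pass the MEM_CGROUP_RECLAIM_* flags to the hierarchy walk
 * above for a soft-limit reclaim pass.  The wrapper name and the GFP_KERNEL
 * mask are assumptions made for this example only.
 */
static inline int example_soft_limit_reclaim(struct mem_cgroup *mem,
					     struct zone *zone)
{
	/*
	 * MEM_CGROUP_RECLAIM_SOFT selects the soft-limit variant of the walk;
	 * the return value is the number of pages reclaimed over the walk.
	 */
	return mem_cgroup_hierarchical_reclaim(mem, zone, GFP_KERNEL,
					       MEM_CGROUP_RECLAIM_SOFT);
}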

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone else is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int x, lock_count = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem) {
		x = atomic_inc_return(&iter->oom_lock);
		lock_count = max(x, lock_count);
	}

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, mem)
		atomic_add_unless(&iter->oom_lock, -1, 0);
	return 0;
}


static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);

	if (oom_wait_info->mem == wake_mem)
		goto wakeup;
	/* if no hierarchy, no match */
	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
		return 0;
	/*
	 * Both oom_wait_info->mem and wake_mem are stable under us.
	 * Then we can use css_is_ancestor() without taking care of RCU.
	 */
	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
		return 0;

wakeup:
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* for filtering, pass "mem" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (mem && atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

/*
 * Try to call the OOM killer. Returns false if we should exit the
 * memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	/* At first, try to OOM lock hierarchy under mem. */
	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, so use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give a chance to the dying process */
	schedule_timeout(1);
	return true;
}
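/*
 * Illustrative sketch (assumed helper, not used elsewhere in this file): the
 * calling convention of mem_cgroup_handle_oom() as seen from a charge loop.
 * A false return means the task has been killed (or has a fatal signal) and
 * the caller should abort the charge instead of retrying.
 */
static inline int example_charge_or_oom(struct mem_cgroup *mem, gfp_t mask)
{
	for (;;) {
		if (mem_cgroup_check_margin(mem, PAGE_SIZE))
			return 0;		/* room appeared, retry the charge */
		if (!mem_cgroup_handle_oom(mem, mask))
			return -ENOMEM;		/* give up, we are being killed */
	}
}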

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
 * it tends to be costly. Considering some conditions, though, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
 * There is no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
 * if there is a race with "uncharge". The statistics themselves are properly
 * handled by flags.
 *
 * Considering "move", this is the only case where we see a race. To make the
 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
 * whether there is a possibility of a race condition. If there is, we take
 * a lock.
 */

void mem_cgroup_update_page_stat(struct page *page,
				 enum mem_cgroup_page_stat_item idx, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	bool need_unlock = false;
	unsigned long uninitialized_var(flags);

	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
	if (unlikely(!mem || !PageCgroupUsed(pc)))
		goto out;
	/* pc->mem_cgroup is unstable ? */
	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
		/* take a lock to access pc->mem_cgroup safely */
		move_lock_page_cgroup(pc, &flags);
		need_unlock = true;
		mem = pc->mem_cgroup;
		if (!mem || !PageCgroupUsed(pc))
			goto out;
	}

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
		if (val > 0)
			SetPageCgroupFileMapped(pc);
		else if (!page_mapped(page))
			ClearPageCgroupFileMapped(pc);
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(mem->stat->count[idx], val);

out:
	if (unlikely(need_unlock))
		move_unlock_page_cgroup(pc, &flags);
	rcu_read_unlock();
	return;
}
EXPORT_SYMBOL(mem_cgroup_update_page_stat);

/*
 * size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big iron.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

/*
 * Try to consume stocked charge on this cpu. If successful, PAGE_SIZE is
 * consumed from the local stock and true is returned. If the stock is 0 or
 * holds charges from a cgroup which is not the current target, false is
 * returned. This stock will be refilled.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}
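/*
 * Illustrative sketch (assumed helper, not used elsewhere in this file): the
 * fast path the charge code takes with consume_stock().  Only single,
 * regular-page charges can be served from the per-cpu stock; anything larger
 * has to go to the res_counter.
 */
static inline bool example_try_fast_charge(struct mem_cgroup *mem,
					   int page_size)
{
	if (page_size != PAGE_SIZE)
		return false;
	return consume_stock(mem);
}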
/*
 * Return the stock cached in percpu to the res_counter and reset the cached
 * information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

/*
 * This must be called with preemption disabled, or by a thread which is
 * pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges (val) obtained from the res_counter into the local per-cpu
 * area. They will be consumed by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Tries to drain stocked charges on other cpus. This function is asynchronous
 * and just schedules a work item per cpu to drain locally on each cpu.
 * Callers can expect some charges to come back to the res_counter later, but
 * cannot wait for it.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function schedules "drain" in an asynchronous way.
	 * The result of "drain" is not directly handled by callers. So,
	 * if someone is already calling drain, we don't have to call it again.
	 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
	 * it if there is a race. We just do a loose check here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}
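/*
 * Illustrative sketch (assumed helper, not used elsewhere in this file): the
 * batching pattern behind the stock.  A caller charges CHARGE_SIZE up front,
 * keeps one page for itself and parks the surplus in the per-cpu stock so
 * that later single-page charges can be served by consume_stock() above.
 * Swap (memsw) accounting is left out of this sketch.
 */
static inline int example_batched_charge(struct mem_cgroup *mem)
{
	struct res_counter *fail_res;

	if (consume_stock(mem))
		return 0;			/* served from the stock */
	if (res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res))
		return -ENOMEM;
	refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
	return 0;
}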
/*
 * This function drains the percpu counter values from a DEAD cpu and
 * moves them to the local cpu. Note that this function can be preempted.
 */
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
	int i;

	spin_lock(&mem->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
		s64 x = per_cpu(mem->stat->count[i], cpu);

		per_cpu(mem->stat->count[i], cpu) = 0;
		mem->nocpu_base.count[i] += x;
	}
	/* need to clear the ON_MOVE value, it works as a kind of lock. */
	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
	spin_unlock(&mem->pcp_counter_lock);
}

static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
{
	int idx = MEM_CGROUP_ON_MOVE;

	spin_lock(&mem->pcp_counter_lock);
	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
	spin_unlock(&mem->pcp_counter_lock);
}

static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if (action == CPU_ONLINE) {
		for_each_mem_cgroup_all(iter)
			synchronize_mem_cgroup_on_move(iter, cpu);
		return NOTIFY_OK;
	}

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup_all(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}


/* See __mem_cgroup_try_charge() for details */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
	CHARGE_OOM_DIE,		/* current is killed because of OOM */
};
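/*
 * Illustrative sketch (assumed helper, not used elsewhere in this file): what
 * the CHARGE_* codes above mean to a caller, expressed as errno values.  The
 * real consumer, __mem_cgroup_try_charge() below, reacts with retries and
 * gotos instead of returning these directly.
 */
static inline int example_charge_result_to_errno(int charge_ret)
{
	switch (charge_ret) {
	case CHARGE_OK:
		return 0;
	case CHARGE_RETRY:		/* transient: caller should try again */
		return -EAGAIN;
	case CHARGE_WOULDBLOCK:		/* !__GFP_WAIT and not enough room */
	case CHARGE_NOMEM:		/* reclaim and OOM handling did not help */
		return -ENOMEM;
	case CHARGE_OOM_DIE:		/* current was killed by the OOM killer */
	default:
		return -EINTR;
	}
}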
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* If we don't need to call the oom-killer at all, return immediately */
	if (!oom_check)
		return CHARGE_NOMEM;
	/* check OOM */
	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}

/*
 * Unlike the exported interface, an "oom" parameter is added.  If oom==true,
 * the OOM killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
				   gfp_t gfp_mask,
				   struct mem_cgroup **memcg, bool oom,
				   int page_size)
{
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem = NULL;
	int ret;
	int csize = max(CHARGE_SIZE, (unsigned long) page_size);

	/*
	 * Unlike the global VM's OOM kill, we are not in a system-wide memory
	 * shortage here.  So, in addition to TIF_MEMDIE tasks, allow dying
	 * processes to go ahead and bypass the charge.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates.  It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	if (!*memcg && !mm)
		goto bypass;
again:
	if (*memcg) { /* css should be a valid one */
		mem = *memcg;
		VM_BUG_ON(css_is_removed(&mem->css));
		if (mem_cgroup_is_root(mem))
			goto done;
		if (page_size == PAGE_SIZE && consume_stock(mem))
			goto done;
		css_get(&mem->css);
	} else {
		struct task_struct *p;

		rcu_read_lock();
		p = rcu_dereference(mm->owner);
		/*
		 * Because we don't have task_lock(), "p" can exit.
		 * In that case, "mem" can point to root, or p can be NULL in
		 * a race with swapoff.  Then, we have a small risk of
		 * mis-accounting.  But this kind of mis-accounting race can
		 * always happen because we don't hold cgroup_mutex(); taking
		 * it would be overkill, so we tolerate the small race here.
		 * (*) swapoff et al. will charge against the mm_struct, not
		 * the task_struct, so mm->owner can be NULL.
		 */
		mem = mem_cgroup_from_task(p);
		if (!mem || mem_cgroup_is_root(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (page_size == PAGE_SIZE && consume_stock(mem)) {
			/*
			 * It looks dangerous to access the memcg without
			 * css_get(), but considering how consume_stock()
			 * works, it isn't: if consume_stock() succeeds, some
			 * charges from this memcg are already cached on this
			 * cpu, so we don't need css_get()/css_tryget() before
			 * calling consume_stock().
			 */
			rcu_read_unlock();
			goto done;
		}
		/* after here, we may be blocked.
we need to get refcnt */ 2005 if (!css_tryget(&mem->css)) { 2006 rcu_read_unlock(); 2007 goto again; 2008 } 2009 rcu_read_unlock(); 2010 } 2011 2012 do { 2013 bool oom_check; 2014 2015 /* If killed, bypass charge */ 2016 if (fatal_signal_pending(current)) { 2017 css_put(&mem->css); 2018 goto bypass; 2019 } 2020 2021 oom_check = false; 2022 if (oom && !nr_oom_retries) { 2023 oom_check = true; 2024 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2025 } 2026 2027 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 2028 2029 switch (ret) { 2030 case CHARGE_OK: 2031 break; 2032 case CHARGE_RETRY: /* not in OOM situation but retry */ 2033 csize = page_size; 2034 css_put(&mem->css); 2035 mem = NULL; 2036 goto again; 2037 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2038 css_put(&mem->css); 2039 goto nomem; 2040 case CHARGE_NOMEM: /* OOM routine works */ 2041 if (!oom) { 2042 css_put(&mem->css); 2043 goto nomem; 2044 } 2045 /* If oom, we never return -ENOMEM */ 2046 nr_oom_retries--; 2047 break; 2048 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2049 css_put(&mem->css); 2050 goto bypass; 2051 } 2052 } while (ret != CHARGE_OK); 2053 2054 if (csize > page_size) 2055 refill_stock(mem, csize - page_size); 2056 css_put(&mem->css); 2057 done: 2058 *memcg = mem; 2059 return 0; 2060 nomem: 2061 *memcg = NULL; 2062 return -ENOMEM; 2063 bypass: 2064 *memcg = NULL; 2065 return 0; 2066 } 2067 2068 /* 2069 * Somemtimes we have to undo a charge we got by try_charge(). 2070 * This function is for that and do uncharge, put css's refcnt. 2071 * gotten by try_charge(). 2072 */ 2073 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2074 unsigned long count) 2075 { 2076 if (!mem_cgroup_is_root(mem)) { 2077 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2078 if (do_swap_account) 2079 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2080 } 2081 } 2082 2083 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2084 int page_size) 2085 { 2086 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); 2087 } 2088 2089 /* 2090 * A helper function to get mem_cgroup from ID. must be called under 2091 * rcu_read_lock(). The caller must check css_is_removed() or some if 2092 * it's concern. (dropping refcnt from swap can be called against removed 2093 * memcg.) 
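 */

/*
 * Illustrative sketch (compiled out) of the caller contract implied above: a
 * successful __mem_cgroup_try_charge() must be followed either by a commit of
 * the page or by mem_cgroup_cancel_charge().  The helper name and the
 * GFP_KERNEL/MAPPED choices are assumptions for illustration; the real
 * callers are mem_cgroup_charge_common() and friends below.
 */
#if 0
static int charge_one_page_sketch(struct mm_struct *mm, struct page *page)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);
	struct mem_cgroup *mem = NULL;

	if (unlikely(!pc))	/* can happen at boot */
		return 0;
	if (__mem_cgroup_try_charge(mm, GFP_KERNEL, &mem, true, PAGE_SIZE))
		return -ENOMEM;	/* over limit; reclaim and OOM handling failed */
	/*
	 * Here "mem" is either a charged memcg or NULL when the charge was
	 * bypassed.  If it is non-NULL, an error path at this point would
	 * have to call mem_cgroup_cancel_charge(mem, PAGE_SIZE) instead of
	 * committing.
	 */
	__mem_cgroup_commit_charge(mem, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED,
				   PAGE_SIZE);
	return 0;
}
#endif

/*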
2094 */ 2095 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2096 { 2097 struct cgroup_subsys_state *css; 2098 2099 /* ID 0 is unused ID */ 2100 if (!id) 2101 return NULL; 2102 css = css_lookup(&mem_cgroup_subsys, id); 2103 if (!css) 2104 return NULL; 2105 return container_of(css, struct mem_cgroup, css); 2106 } 2107 2108 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2109 { 2110 struct mem_cgroup *mem = NULL; 2111 struct page_cgroup *pc; 2112 unsigned short id; 2113 swp_entry_t ent; 2114 2115 VM_BUG_ON(!PageLocked(page)); 2116 2117 pc = lookup_page_cgroup(page); 2118 lock_page_cgroup(pc); 2119 if (PageCgroupUsed(pc)) { 2120 mem = pc->mem_cgroup; 2121 if (mem && !css_tryget(&mem->css)) 2122 mem = NULL; 2123 } else if (PageSwapCache(page)) { 2124 ent.val = page_private(page); 2125 id = lookup_swap_cgroup(ent); 2126 rcu_read_lock(); 2127 mem = mem_cgroup_lookup(id); 2128 if (mem && !css_tryget(&mem->css)) 2129 mem = NULL; 2130 rcu_read_unlock(); 2131 } 2132 unlock_page_cgroup(pc); 2133 return mem; 2134 } 2135 2136 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2137 struct page_cgroup *pc, 2138 enum charge_type ctype, 2139 int page_size) 2140 { 2141 int nr_pages = page_size >> PAGE_SHIFT; 2142 2143 /* try_charge() can return NULL to *memcg, taking care of it. */ 2144 if (!mem) 2145 return; 2146 2147 lock_page_cgroup(pc); 2148 if (unlikely(PageCgroupUsed(pc))) { 2149 unlock_page_cgroup(pc); 2150 mem_cgroup_cancel_charge(mem, page_size); 2151 return; 2152 } 2153 /* 2154 * we don't need page_cgroup_lock about tail pages, becase they are not 2155 * accessed by any other context at this point. 2156 */ 2157 pc->mem_cgroup = mem; 2158 /* 2159 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2160 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2161 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2162 * before USED bit, we need memory barrier here. 2163 * See mem_cgroup_add_lru_list(), etc. 2164 */ 2165 smp_wmb(); 2166 switch (ctype) { 2167 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2168 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2169 SetPageCgroupCache(pc); 2170 SetPageCgroupUsed(pc); 2171 break; 2172 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2173 ClearPageCgroupCache(pc); 2174 SetPageCgroupUsed(pc); 2175 break; 2176 default: 2177 break; 2178 } 2179 2180 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2181 unlock_page_cgroup(pc); 2182 /* 2183 * "charge_statistics" updated event counter. Then, check it. 2184 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2185 * if they exceeds softlimit. 2186 */ 2187 memcg_check_events(mem, pc->page); 2188 } 2189 2190 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2191 2192 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2193 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2194 /* 2195 * Because tail pages are not marked as "used", set it. We're under 2196 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2197 */ 2198 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2199 { 2200 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2201 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2202 unsigned long flags; 2203 2204 if (mem_cgroup_disabled()) 2205 return; 2206 /* 2207 * We have no races with charge/uncharge but will have races with 2208 * page state accounting. 
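	 */

/*
 * Illustrative sketch (compiled out) of the reader side that the smp_wmb()
 * in __mem_cgroup_commit_charge() above pairs with.  Lockless readers such
 * as the LRU hooks are expected to test the USED bit first and then issue a
 * read barrier before dereferencing pc->mem_cgroup; the helper below is an
 * assumption for illustration, the real readers live elsewhere in this file.
 */
#if 0
static struct mem_cgroup *lockless_pc_memcg_sketch(struct page_cgroup *pc)
{
	if (!PageCgroupUsed(pc))
		return NULL;
	smp_rmb();	/* pairs with smp_wmb() in __mem_cgroup_commit_charge() */
	return pc->mem_cgroup;
}
#endif

	/*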
	 */
	move_lock_page_cgroup(head_pc, &flags);

	tail_pc->mem_cgroup = head_pc->mem_cgroup;
	smp_wmb(); /* see __commit_charge() */
	if (PageCgroupAcctLRU(head_pc)) {
		enum lru_list lru;
		struct mem_cgroup_per_zone *mz;

		/*
		 * LRU flags cannot be copied: the tail page must be added to
		 * the LRU by the generic call so that our hook is invoked.
		 * We hold lru_lock, so just decrement the counter directly.
		 */
		lru = page_lru(head);
		mz = page_cgroup_zoneinfo(head_pc);
		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	}
	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	move_unlock_page_cgroup(head_pc, &flags);
}
#endif

/**
 * __mem_cgroup_move_account - move account of the page
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should call uncharge and css_put against @from.
 * @charge_size: number of bytes being moved (PAGE_SIZE or HPAGE_SIZE).
 *
 * The caller must confirm the following:
 * - the page is not on the LRU (isolate_lru_page() is useful.)
 * - the pc is locked, used, and ->mem_cgroup points to @from.
 *
 * This function doesn't do "charge" nor css_get to the new cgroup.  That
 * should be done by the caller (__mem_cgroup_try_charge() would be useful).
 * If @uncharge is true, this function also does the "uncharge" from the old
 * cgroup; if it is false, the caller must do the uncharge itself.
 */
static void __mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
	int charge_size)
{
	int nr_pages = charge_size >> PAGE_SHIFT;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(pc->page));
	VM_BUG_ON(!page_is_cgroup_locked(pc));
	VM_BUG_ON(!PageCgroupUsed(pc));
	VM_BUG_ON(pc->mem_cgroup != from);

	if (PageCgroupFileMapped(pc)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
	if (uncharge)
		/* This is not "cancel", but cancel_charge does all we need. */
		mem_cgroup_cancel_charge(from, charge_size);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
	/*
	 * We charge against "to", which may not have any tasks, so "to" could
	 * be under rmdir().  But in the current implementation the only
	 * callers are force_empty() and move charge, so "to" is guaranteed
	 * not to be removed; hence we don't check rmdir status here.
	 */
}

/*
 * Check whether @pc is valid for moving the account and, if so, call
 * __mem_cgroup_move_account().
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
	struct mem_cgroup *from, struct mem_cgroup *to,
	bool uncharge, int charge_size)
{
	int ret = -EINVAL;
	unsigned long flags;
	/*
	 * The page is isolated from the LRU, so the collapse path will not
	 * handle it.  But page splitting can still happen; do this check
	 * under compound_page_lock(), which the caller must hold.
2300 */ 2301 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) 2302 return -EBUSY; 2303 2304 lock_page_cgroup(pc); 2305 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2306 move_lock_page_cgroup(pc, &flags); 2307 __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); 2308 move_unlock_page_cgroup(pc, &flags); 2309 ret = 0; 2310 } 2311 unlock_page_cgroup(pc); 2312 /* 2313 * check events 2314 */ 2315 memcg_check_events(to, pc->page); 2316 memcg_check_events(from, pc->page); 2317 return ret; 2318 } 2319 2320 /* 2321 * move charges to its parent. 2322 */ 2323 2324 static int mem_cgroup_move_parent(struct page_cgroup *pc, 2325 struct mem_cgroup *child, 2326 gfp_t gfp_mask) 2327 { 2328 struct page *page = pc->page; 2329 struct cgroup *cg = child->css.cgroup; 2330 struct cgroup *pcg = cg->parent; 2331 struct mem_cgroup *parent; 2332 int page_size = PAGE_SIZE; 2333 unsigned long flags; 2334 int ret; 2335 2336 /* Is ROOT ? */ 2337 if (!pcg) 2338 return -EINVAL; 2339 2340 ret = -EBUSY; 2341 if (!get_page_unless_zero(page)) 2342 goto out; 2343 if (isolate_lru_page(page)) 2344 goto put; 2345 2346 if (PageTransHuge(page)) 2347 page_size = HPAGE_SIZE; 2348 2349 parent = mem_cgroup_from_cont(pcg); 2350 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 2351 &parent, false, page_size); 2352 if (ret || !parent) 2353 goto put_back; 2354 2355 if (page_size > PAGE_SIZE) 2356 flags = compound_lock_irqsave(page); 2357 2358 ret = mem_cgroup_move_account(pc, child, parent, true, page_size); 2359 if (ret) 2360 mem_cgroup_cancel_charge(parent, page_size); 2361 2362 if (page_size > PAGE_SIZE) 2363 compound_unlock_irqrestore(page, flags); 2364 put_back: 2365 putback_lru_page(page); 2366 put: 2367 put_page(page); 2368 out: 2369 return ret; 2370 } 2371 2372 /* 2373 * Charge the memory controller for page usage. 2374 * Return 2375 * 0 if the charge was successful 2376 * < 0 if the cgroup is over its limit 2377 */ 2378 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2379 gfp_t gfp_mask, enum charge_type ctype) 2380 { 2381 struct mem_cgroup *mem = NULL; 2382 int page_size = PAGE_SIZE; 2383 struct page_cgroup *pc; 2384 bool oom = true; 2385 int ret; 2386 2387 if (PageTransHuge(page)) { 2388 page_size <<= compound_order(page); 2389 VM_BUG_ON(!PageTransHuge(page)); 2390 /* 2391 * Never OOM-kill a process for a huge page. The 2392 * fault handler will fall back to regular pages. 2393 */ 2394 oom = false; 2395 } 2396 2397 pc = lookup_page_cgroup(page); 2398 /* can happen at boot */ 2399 if (unlikely(!pc)) 2400 return 0; 2401 prefetchw(pc); 2402 2403 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); 2404 if (ret || !mem) 2405 return ret; 2406 2407 __mem_cgroup_commit_charge(mem, pc, ctype, page_size); 2408 return 0; 2409 } 2410 2411 int mem_cgroup_newpage_charge(struct page *page, 2412 struct mm_struct *mm, gfp_t gfp_mask) 2413 { 2414 if (mem_cgroup_disabled()) 2415 return 0; 2416 /* 2417 * If already mapped, we don't have to account. 2418 * If page cache, page->mapping has address_space. 2419 * But page->mapping may have out-of-use anon_vma pointer, 2420 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 2421 * is NULL. 
2422 */ 2423 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2424 return 0; 2425 if (unlikely(!mm)) 2426 mm = &init_mm; 2427 return mem_cgroup_charge_common(page, mm, gfp_mask, 2428 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2429 } 2430 2431 static void 2432 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2433 enum charge_type ctype); 2434 2435 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2436 gfp_t gfp_mask) 2437 { 2438 int ret; 2439 2440 if (mem_cgroup_disabled()) 2441 return 0; 2442 if (PageCompound(page)) 2443 return 0; 2444 /* 2445 * Corner case handling. This is called from add_to_page_cache() 2446 * in usual. But some FS (shmem) precharges this page before calling it 2447 * and call add_to_page_cache() with GFP_NOWAIT. 2448 * 2449 * For GFP_NOWAIT case, the page may be pre-charged before calling 2450 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 2451 * charge twice. (It works but has to pay a bit larger cost.) 2452 * And when the page is SwapCache, it should take swap information 2453 * into account. This is under lock_page() now. 2454 */ 2455 if (!(gfp_mask & __GFP_WAIT)) { 2456 struct page_cgroup *pc; 2457 2458 pc = lookup_page_cgroup(page); 2459 if (!pc) 2460 return 0; 2461 lock_page_cgroup(pc); 2462 if (PageCgroupUsed(pc)) { 2463 unlock_page_cgroup(pc); 2464 return 0; 2465 } 2466 unlock_page_cgroup(pc); 2467 } 2468 2469 if (unlikely(!mm)) 2470 mm = &init_mm; 2471 2472 if (page_is_file_cache(page)) 2473 return mem_cgroup_charge_common(page, mm, gfp_mask, 2474 MEM_CGROUP_CHARGE_TYPE_CACHE); 2475 2476 /* shmem */ 2477 if (PageSwapCache(page)) { 2478 struct mem_cgroup *mem = NULL; 2479 2480 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2481 if (!ret) 2482 __mem_cgroup_commit_charge_swapin(page, mem, 2483 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2484 } else 2485 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2486 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2487 2488 return ret; 2489 } 2490 2491 /* 2492 * While swap-in, try_charge -> commit or cancel, the page is locked. 2493 * And when try_charge() successfully returns, one refcnt to memcg without 2494 * struct page_cgroup is acquired. This refcnt will be consumed by 2495 * "commit()" or removed by "cancel()" 2496 */ 2497 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2498 struct page *page, 2499 gfp_t mask, struct mem_cgroup **ptr) 2500 { 2501 struct mem_cgroup *mem; 2502 int ret; 2503 2504 if (mem_cgroup_disabled()) 2505 return 0; 2506 2507 if (!do_swap_account) 2508 goto charge_cur_mm; 2509 /* 2510 * A racing thread's fault, or swapoff, may have already updated 2511 * the pte, and even removed page from swap cache: in those cases 2512 * do_swap_page()'s pte_same() test will fail; but there's also a 2513 * KSM case which does need to charge the page. 
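	 */

/*
 * Illustrative sketch (compiled out) of the swap-in protocol described
 * above, assuming the page is locked by the caller: one successful
 * try_charge must be matched by exactly one commit (on success) or cancel
 * (on any error path).  The helper name, the "map_failed" flag and the
 * -EAGAIN value are assumptions for illustration only.
 */
#if 0
static int swapin_charge_sketch(struct mm_struct *mm, struct page *page,
				gfp_t gfp_mask, bool map_failed)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
	if (ret)
		return ret;		/* nothing was charged, nothing to undo */
	if (map_failed) {
		/* drops the charge and the css refcount; handles mem == NULL */
		mem_cgroup_cancel_charge_swapin(mem);
		return -EAGAIN;
	}
	/* commits the charge; may also drop the duplicated memsw charge */
	mem_cgroup_commit_charge_swapin(page, mem);
	return 0;
}
#endif

	/*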
2514 */ 2515 if (!PageSwapCache(page)) 2516 goto charge_cur_mm; 2517 mem = try_get_mem_cgroup_from_page(page); 2518 if (!mem) 2519 goto charge_cur_mm; 2520 *ptr = mem; 2521 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); 2522 css_put(&mem->css); 2523 return ret; 2524 charge_cur_mm: 2525 if (unlikely(!mm)) 2526 mm = &init_mm; 2527 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); 2528 } 2529 2530 static void 2531 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2532 enum charge_type ctype) 2533 { 2534 struct page_cgroup *pc; 2535 2536 if (mem_cgroup_disabled()) 2537 return; 2538 if (!ptr) 2539 return; 2540 cgroup_exclude_rmdir(&ptr->css); 2541 pc = lookup_page_cgroup(page); 2542 mem_cgroup_lru_del_before_commit_swapcache(page); 2543 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); 2544 mem_cgroup_lru_add_after_commit_swapcache(page); 2545 /* 2546 * Now swap is on-memory. This means this page may be 2547 * counted both as mem and swap....double count. 2548 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2549 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2550 * may call delete_from_swap_cache() before reach here. 2551 */ 2552 if (do_swap_account && PageSwapCache(page)) { 2553 swp_entry_t ent = {.val = page_private(page)}; 2554 unsigned short id; 2555 struct mem_cgroup *memcg; 2556 2557 id = swap_cgroup_record(ent, 0); 2558 rcu_read_lock(); 2559 memcg = mem_cgroup_lookup(id); 2560 if (memcg) { 2561 /* 2562 * This recorded memcg can be obsolete one. So, avoid 2563 * calling css_tryget 2564 */ 2565 if (!mem_cgroup_is_root(memcg)) 2566 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2567 mem_cgroup_swap_statistics(memcg, false); 2568 mem_cgroup_put(memcg); 2569 } 2570 rcu_read_unlock(); 2571 } 2572 /* 2573 * At swapin, we may charge account against cgroup which has no tasks. 2574 * So, rmdir()->pre_destroy() can be called while we do this charge. 2575 * In that case, we need to call pre_destroy() again. check it here. 2576 */ 2577 cgroup_release_and_wakeup_rmdir(&ptr->css); 2578 } 2579 2580 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2581 { 2582 __mem_cgroup_commit_charge_swapin(page, ptr, 2583 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2584 } 2585 2586 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2587 { 2588 if (mem_cgroup_disabled()) 2589 return; 2590 if (!mem) 2591 return; 2592 mem_cgroup_cancel_charge(mem, PAGE_SIZE); 2593 } 2594 2595 static void 2596 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, 2597 int page_size) 2598 { 2599 struct memcg_batch_info *batch = NULL; 2600 bool uncharge_memsw = true; 2601 /* If swapout, usage of swap doesn't decrease */ 2602 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2603 uncharge_memsw = false; 2604 2605 batch = ¤t->memcg_batch; 2606 /* 2607 * In usual, we do css_get() when we remember memcg pointer. 2608 * But in this case, we keep res->usage until end of a series of 2609 * uncharges. Then, it's ok to ignore memcg's refcnt. 2610 */ 2611 if (!batch->memcg) 2612 batch->memcg = mem; 2613 /* 2614 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2615 * In those cases, all pages freed continously can be expected to be in 2616 * the same cgroup and we have chance to coalesce uncharges. 2617 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2618 * because we want to do uncharge as soon as possible. 
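	 */

/*
 * Illustrative sketch (compiled out) of the batching described above: a
 * truncate/unmap style loop brackets its per-page uncharges with
 * mem_cgroup_uncharge_start()/end() (defined below) so that __do_uncharge()
 * can coalesce them into one res_counter operation per memcg.  "pages" and
 * "nr" are placeholders for whatever the real caller iterates over.
 */
#if 0
static void uncharge_many_sketch(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();	/* current->memcg_batch.do_batch++ */
	for (i = 0; i < nr; i++)
		mem_cgroup_uncharge_page(pages[i]);
	mem_cgroup_uncharge_end();	/* flushes the coalesced uncharge */
}
#endif

	/*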
2619 */ 2620 2621 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2622 goto direct_uncharge; 2623 2624 if (page_size != PAGE_SIZE) 2625 goto direct_uncharge; 2626 2627 /* 2628 * In typical case, batch->memcg == mem. This means we can 2629 * merge a series of uncharges to an uncharge of res_counter. 2630 * If not, we uncharge res_counter ony by one. 2631 */ 2632 if (batch->memcg != mem) 2633 goto direct_uncharge; 2634 /* remember freed charge and uncharge it later */ 2635 batch->bytes += PAGE_SIZE; 2636 if (uncharge_memsw) 2637 batch->memsw_bytes += PAGE_SIZE; 2638 return; 2639 direct_uncharge: 2640 res_counter_uncharge(&mem->res, page_size); 2641 if (uncharge_memsw) 2642 res_counter_uncharge(&mem->memsw, page_size); 2643 if (unlikely(batch->memcg != mem)) 2644 memcg_oom_recover(mem); 2645 return; 2646 } 2647 2648 /* 2649 * uncharge if !page_mapped(page) 2650 */ 2651 static struct mem_cgroup * 2652 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2653 { 2654 int count; 2655 struct page_cgroup *pc; 2656 struct mem_cgroup *mem = NULL; 2657 int page_size = PAGE_SIZE; 2658 2659 if (mem_cgroup_disabled()) 2660 return NULL; 2661 2662 if (PageSwapCache(page)) 2663 return NULL; 2664 2665 if (PageTransHuge(page)) { 2666 page_size <<= compound_order(page); 2667 VM_BUG_ON(!PageTransHuge(page)); 2668 } 2669 2670 count = page_size >> PAGE_SHIFT; 2671 /* 2672 * Check if our page_cgroup is valid 2673 */ 2674 pc = lookup_page_cgroup(page); 2675 if (unlikely(!pc || !PageCgroupUsed(pc))) 2676 return NULL; 2677 2678 lock_page_cgroup(pc); 2679 2680 mem = pc->mem_cgroup; 2681 2682 if (!PageCgroupUsed(pc)) 2683 goto unlock_out; 2684 2685 switch (ctype) { 2686 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2687 case MEM_CGROUP_CHARGE_TYPE_DROP: 2688 /* See mem_cgroup_prepare_migration() */ 2689 if (page_mapped(page) || PageCgroupMigration(pc)) 2690 goto unlock_out; 2691 break; 2692 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2693 if (!PageAnon(page)) { /* Shared memory */ 2694 if (page->mapping && !page_is_file_cache(page)) 2695 goto unlock_out; 2696 } else if (page_mapped(page)) /* Anon */ 2697 goto unlock_out; 2698 break; 2699 default: 2700 break; 2701 } 2702 2703 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); 2704 2705 ClearPageCgroupUsed(pc); 2706 /* 2707 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2708 * freed from LRU. This is safe because uncharged page is expected not 2709 * to be reused (freed soon). Exception is SwapCache, it's handled by 2710 * special functions. 2711 */ 2712 2713 unlock_page_cgroup(pc); 2714 /* 2715 * even after unlock, we have mem->res.usage here and this memcg 2716 * will never be freed. 2717 */ 2718 memcg_check_events(mem, page); 2719 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 2720 mem_cgroup_swap_statistics(mem, true); 2721 mem_cgroup_get(mem); 2722 } 2723 if (!mem_cgroup_is_root(mem)) 2724 __do_uncharge(mem, ctype, page_size); 2725 2726 return mem; 2727 2728 unlock_out: 2729 unlock_page_cgroup(pc); 2730 return NULL; 2731 } 2732 2733 void mem_cgroup_uncharge_page(struct page *page) 2734 { 2735 /* early check. 
*/ 2736 if (page_mapped(page)) 2737 return; 2738 if (page->mapping && !PageAnon(page)) 2739 return; 2740 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2741 } 2742 2743 void mem_cgroup_uncharge_cache_page(struct page *page) 2744 { 2745 VM_BUG_ON(page_mapped(page)); 2746 VM_BUG_ON(page->mapping); 2747 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2748 } 2749 2750 /* 2751 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 2752 * In that cases, pages are freed continuously and we can expect pages 2753 * are in the same memcg. All these calls itself limits the number of 2754 * pages freed at once, then uncharge_start/end() is called properly. 2755 * This may be called prural(2) times in a context, 2756 */ 2757 2758 void mem_cgroup_uncharge_start(void) 2759 { 2760 current->memcg_batch.do_batch++; 2761 /* We can do nest. */ 2762 if (current->memcg_batch.do_batch == 1) { 2763 current->memcg_batch.memcg = NULL; 2764 current->memcg_batch.bytes = 0; 2765 current->memcg_batch.memsw_bytes = 0; 2766 } 2767 } 2768 2769 void mem_cgroup_uncharge_end(void) 2770 { 2771 struct memcg_batch_info *batch = ¤t->memcg_batch; 2772 2773 if (!batch->do_batch) 2774 return; 2775 2776 batch->do_batch--; 2777 if (batch->do_batch) /* If stacked, do nothing. */ 2778 return; 2779 2780 if (!batch->memcg) 2781 return; 2782 /* 2783 * This "batch->memcg" is valid without any css_get/put etc... 2784 * bacause we hide charges behind us. 2785 */ 2786 if (batch->bytes) 2787 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2788 if (batch->memsw_bytes) 2789 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2790 memcg_oom_recover(batch->memcg); 2791 /* forget this pointer (for sanity check) */ 2792 batch->memcg = NULL; 2793 } 2794 2795 #ifdef CONFIG_SWAP 2796 /* 2797 * called after __delete_from_swap_cache() and drop "page" account. 2798 * memcg information is recorded to swap_cgroup of "ent" 2799 */ 2800 void 2801 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2802 { 2803 struct mem_cgroup *memcg; 2804 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2805 2806 if (!swapout) /* this was a swap cache but the swap is unused ! */ 2807 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2808 2809 memcg = __mem_cgroup_uncharge_common(page, ctype); 2810 2811 /* 2812 * record memcg information, if swapout && memcg != NULL, 2813 * mem_cgroup_get() was called in uncharge(). 2814 */ 2815 if (do_swap_account && swapout && memcg) 2816 swap_cgroup_record(ent, css_id(&memcg->css)); 2817 } 2818 #endif 2819 2820 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2821 /* 2822 * called from swap_entry_free(). remove record in swap_cgroup and 2823 * uncharge "memsw" account. 2824 */ 2825 void mem_cgroup_uncharge_swap(swp_entry_t ent) 2826 { 2827 struct mem_cgroup *memcg; 2828 unsigned short id; 2829 2830 if (!do_swap_account) 2831 return; 2832 2833 id = swap_cgroup_record(ent, 0); 2834 rcu_read_lock(); 2835 memcg = mem_cgroup_lookup(id); 2836 if (memcg) { 2837 /* 2838 * We uncharge this because swap is freed. 2839 * This memcg can be obsolete one. We avoid calling css_tryget 2840 */ 2841 if (!mem_cgroup_is_root(memcg)) 2842 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2843 mem_cgroup_swap_statistics(memcg, false); 2844 mem_cgroup_put(memcg); 2845 } 2846 rcu_read_unlock(); 2847 } 2848 2849 /** 2850 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 
2851 * @entry: swap entry to be moved 2852 * @from: mem_cgroup which the entry is moved from 2853 * @to: mem_cgroup which the entry is moved to 2854 * @need_fixup: whether we should fixup res_counters and refcounts. 2855 * 2856 * It succeeds only when the swap_cgroup's record for this entry is the same 2857 * as the mem_cgroup's id of @from. 2858 * 2859 * Returns 0 on success, -EINVAL on failure. 2860 * 2861 * The caller must have charged to @to, IOW, called res_counter_charge() about 2862 * both res and memsw, and called css_get(). 2863 */ 2864 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2865 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2866 { 2867 unsigned short old_id, new_id; 2868 2869 old_id = css_id(&from->css); 2870 new_id = css_id(&to->css); 2871 2872 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2873 mem_cgroup_swap_statistics(from, false); 2874 mem_cgroup_swap_statistics(to, true); 2875 /* 2876 * This function is only called from task migration context now. 2877 * It postpones res_counter and refcount handling till the end 2878 * of task migration(mem_cgroup_clear_mc()) for performance 2879 * improvement. But we cannot postpone mem_cgroup_get(to) 2880 * because if the process that has been moved to @to does 2881 * swap-in, the refcount of @to might be decreased to 0. 2882 */ 2883 mem_cgroup_get(to); 2884 if (need_fixup) { 2885 if (!mem_cgroup_is_root(from)) 2886 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2887 mem_cgroup_put(from); 2888 /* 2889 * we charged both to->res and to->memsw, so we should 2890 * uncharge to->res. 2891 */ 2892 if (!mem_cgroup_is_root(to)) 2893 res_counter_uncharge(&to->res, PAGE_SIZE); 2894 } 2895 return 0; 2896 } 2897 return -EINVAL; 2898 } 2899 #else 2900 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2901 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2902 { 2903 return -EINVAL; 2904 } 2905 #endif 2906 2907 /* 2908 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2909 * page belongs to. 2910 */ 2911 int mem_cgroup_prepare_migration(struct page *page, 2912 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 2913 { 2914 struct page_cgroup *pc; 2915 struct mem_cgroup *mem = NULL; 2916 enum charge_type ctype; 2917 int ret = 0; 2918 2919 VM_BUG_ON(PageTransHuge(page)); 2920 if (mem_cgroup_disabled()) 2921 return 0; 2922 2923 pc = lookup_page_cgroup(page); 2924 lock_page_cgroup(pc); 2925 if (PageCgroupUsed(pc)) { 2926 mem = pc->mem_cgroup; 2927 css_get(&mem->css); 2928 /* 2929 * At migrating an anonymous page, its mapcount goes down 2930 * to 0 and uncharge() will be called. But, even if it's fully 2931 * unmapped, migration may fail and this page has to be 2932 * charged again. We set MIGRATION flag here and delay uncharge 2933 * until end_migration() is called 2934 * 2935 * Corner Case Thinking 2936 * A) 2937 * When the old page was mapped as Anon and it's unmap-and-freed 2938 * while migration was ongoing. 2939 * If unmap finds the old page, uncharge() of it will be delayed 2940 * until end_migration(). If unmap finds a new page, it's 2941 * uncharged when it make mapcount to be 1->0. If unmap code 2942 * finds swap_migration_entry, the new page will not be mapped 2943 * and end_migration() will find it(mapcount==0). 2944 * 2945 * B) 2946 * When the old page was mapped but migraion fails, the kernel 2947 * remaps it. A charge for it is kept by MIGRATION flag even 2948 * if mapcount goes down to 0. 
We can do remap successfully 2949 * without charging it again. 2950 * 2951 * C) 2952 * The "old" page is under lock_page() until the end of 2953 * migration, so, the old page itself will not be swapped-out. 2954 * If the new page is swapped out before end_migraton, our 2955 * hook to usual swap-out path will catch the event. 2956 */ 2957 if (PageAnon(page)) 2958 SetPageCgroupMigration(pc); 2959 } 2960 unlock_page_cgroup(pc); 2961 /* 2962 * If the page is not charged at this point, 2963 * we return here. 2964 */ 2965 if (!mem) 2966 return 0; 2967 2968 *ptr = mem; 2969 ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); 2970 css_put(&mem->css);/* drop extra refcnt */ 2971 if (ret || *ptr == NULL) { 2972 if (PageAnon(page)) { 2973 lock_page_cgroup(pc); 2974 ClearPageCgroupMigration(pc); 2975 unlock_page_cgroup(pc); 2976 /* 2977 * The old page may be fully unmapped while we kept it. 2978 */ 2979 mem_cgroup_uncharge_page(page); 2980 } 2981 return -ENOMEM; 2982 } 2983 /* 2984 * We charge new page before it's used/mapped. So, even if unlock_page() 2985 * is called before end_migration, we can catch all events on this new 2986 * page. In the case new page is migrated but not remapped, new page's 2987 * mapcount will be finally 0 and we call uncharge in end_migration(). 2988 */ 2989 pc = lookup_page_cgroup(newpage); 2990 if (PageAnon(page)) 2991 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2992 else if (page_is_file_cache(page)) 2993 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2994 else 2995 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2996 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); 2997 return ret; 2998 } 2999 3000 /* remove redundant charge if migration failed*/ 3001 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3002 struct page *oldpage, struct page *newpage, bool migration_ok) 3003 { 3004 struct page *used, *unused; 3005 struct page_cgroup *pc; 3006 3007 if (!mem) 3008 return; 3009 /* blocks rmdir() */ 3010 cgroup_exclude_rmdir(&mem->css); 3011 if (!migration_ok) { 3012 used = oldpage; 3013 unused = newpage; 3014 } else { 3015 used = newpage; 3016 unused = oldpage; 3017 } 3018 /* 3019 * We disallowed uncharge of pages under migration because mapcount 3020 * of the page goes down to zero, temporarly. 3021 * Clear the flag and check the page should be charged. 3022 */ 3023 pc = lookup_page_cgroup(oldpage); 3024 lock_page_cgroup(pc); 3025 ClearPageCgroupMigration(pc); 3026 unlock_page_cgroup(pc); 3027 3028 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3029 3030 /* 3031 * If a page is a file cache, radix-tree replacement is very atomic 3032 * and we can skip this check. When it was an Anon page, its mapcount 3033 * goes down to 0. But because we added MIGRATION flage, it's not 3034 * uncharged yet. There are several case but page->mapcount check 3035 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3036 * check. (see prepare_charge() also) 3037 */ 3038 if (PageAnon(used)) 3039 mem_cgroup_uncharge_page(used); 3040 /* 3041 * At migration, we may charge account against cgroup which has no 3042 * tasks. 3043 * So, rmdir()->pre_destroy() can be called while we do this charge. 3044 * In that case, we need to call pre_destroy() again. check it here. 3045 */ 3046 cgroup_release_and_wakeup_rmdir(&mem->css); 3047 } 3048 3049 /* 3050 * A call to try to shrink memory usage on charge failure at shmem's swapin. 
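 */

/*
 * Illustrative sketch (compiled out) of how a migration path is expected to
 * bracket the two hooks above: prepare_migration() pre-charges the new page,
 * end_migration() uncharges whichever page ends up unused.
 * "move_page_contents" is a placeholder, not a real helper.
 */
#if 0
static int migrate_one_page_sketch(struct page *page, struct page *newpage,
				   gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int err;

	err = mem_cgroup_prepare_migration(page, newpage, &mem, gfp_mask);
	if (err)
		return err;		/* charging the new page failed */

	err = move_page_contents(page, newpage);	/* placeholder */

	/* mem may be NULL when the old page was not charged at all */
	mem_cgroup_end_migration(mem, page, newpage, !err);
	return err;
}
#endif

/*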
3051 * Calling hierarchical_reclaim is not enough because we should update 3052 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3053 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 3054 * not from the memcg which this page would be charged to. 3055 * try_charge_swapin does all of these works properly. 3056 */ 3057 int mem_cgroup_shmem_charge_fallback(struct page *page, 3058 struct mm_struct *mm, 3059 gfp_t gfp_mask) 3060 { 3061 struct mem_cgroup *mem = NULL; 3062 int ret; 3063 3064 if (mem_cgroup_disabled()) 3065 return 0; 3066 3067 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3068 if (!ret) 3069 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3070 3071 return ret; 3072 } 3073 3074 static DEFINE_MUTEX(set_limit_mutex); 3075 3076 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3077 unsigned long long val) 3078 { 3079 int retry_count; 3080 u64 memswlimit, memlimit; 3081 int ret = 0; 3082 int children = mem_cgroup_count_children(memcg); 3083 u64 curusage, oldusage; 3084 int enlarge; 3085 3086 /* 3087 * For keeping hierarchical_reclaim simple, how long we should retry 3088 * is depends on callers. We set our retry-count to be function 3089 * of # of children which we should visit in this loop. 3090 */ 3091 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3092 3093 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3094 3095 enlarge = 0; 3096 while (retry_count) { 3097 if (signal_pending(current)) { 3098 ret = -EINTR; 3099 break; 3100 } 3101 /* 3102 * Rather than hide all in some function, I do this in 3103 * open coded manner. You see what this really does. 3104 * We have to guarantee mem->res.limit < mem->memsw.limit. 3105 */ 3106 mutex_lock(&set_limit_mutex); 3107 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3108 if (memswlimit < val) { 3109 ret = -EINVAL; 3110 mutex_unlock(&set_limit_mutex); 3111 break; 3112 } 3113 3114 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3115 if (memlimit < val) 3116 enlarge = 1; 3117 3118 ret = res_counter_set_limit(&memcg->res, val); 3119 if (!ret) { 3120 if (memswlimit == val) 3121 memcg->memsw_is_minimum = true; 3122 else 3123 memcg->memsw_is_minimum = false; 3124 } 3125 mutex_unlock(&set_limit_mutex); 3126 3127 if (!ret) 3128 break; 3129 3130 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3131 MEM_CGROUP_RECLAIM_SHRINK); 3132 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3133 /* Usage is reduced ? */ 3134 if (curusage >= oldusage) 3135 retry_count--; 3136 else 3137 oldusage = curusage; 3138 } 3139 if (!ret && enlarge) 3140 memcg_oom_recover(memcg); 3141 3142 return ret; 3143 } 3144 3145 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3146 unsigned long long val) 3147 { 3148 int retry_count; 3149 u64 memlimit, memswlimit, oldusage, curusage; 3150 int children = mem_cgroup_count_children(memcg); 3151 int ret = -EBUSY; 3152 int enlarge = 0; 3153 3154 /* see mem_cgroup_resize_res_limit */ 3155 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3156 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3157 while (retry_count) { 3158 if (signal_pending(current)) { 3159 ret = -EINTR; 3160 break; 3161 } 3162 /* 3163 * Rather than hide all in some function, I do this in 3164 * open coded manner. You see what this really does. 3165 * We have to guarantee mem->res.limit < mem->memsw.limit. 
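	 */

/*
 * Illustrative sketch (compiled out) of the ordering implied by the checks
 * above and in mem_cgroup_resize_limit() when both limits are raised: memsw
 * first, memory second, so that mem->res.limit <= mem->memsw.limit holds at
 * every step.  Userspace drives this by writing the cgroup files; the helper
 * below is an assumption for illustration only.
 */
#if 0
static int raise_both_limits_sketch(struct mem_cgroup *memcg,
				    unsigned long long new_mem,
				    unsigned long long new_memsw)
{
	int ret;

	ret = mem_cgroup_resize_memsw_limit(memcg, new_memsw);
	if (ret)
		return ret;
	/* new_mem is assumed to be <= new_memsw */
	return mem_cgroup_resize_limit(memcg, new_mem);
}
#endif

	/*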
3166 */ 3167 mutex_lock(&set_limit_mutex); 3168 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3169 if (memlimit > val) { 3170 ret = -EINVAL; 3171 mutex_unlock(&set_limit_mutex); 3172 break; 3173 } 3174 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3175 if (memswlimit < val) 3176 enlarge = 1; 3177 ret = res_counter_set_limit(&memcg->memsw, val); 3178 if (!ret) { 3179 if (memlimit == val) 3180 memcg->memsw_is_minimum = true; 3181 else 3182 memcg->memsw_is_minimum = false; 3183 } 3184 mutex_unlock(&set_limit_mutex); 3185 3186 if (!ret) 3187 break; 3188 3189 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3190 MEM_CGROUP_RECLAIM_NOSWAP | 3191 MEM_CGROUP_RECLAIM_SHRINK); 3192 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3193 /* Usage is reduced ? */ 3194 if (curusage >= oldusage) 3195 retry_count--; 3196 else 3197 oldusage = curusage; 3198 } 3199 if (!ret && enlarge) 3200 memcg_oom_recover(memcg); 3201 return ret; 3202 } 3203 3204 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3205 gfp_t gfp_mask) 3206 { 3207 unsigned long nr_reclaimed = 0; 3208 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3209 unsigned long reclaimed; 3210 int loop = 0; 3211 struct mem_cgroup_tree_per_zone *mctz; 3212 unsigned long long excess; 3213 3214 if (order > 0) 3215 return 0; 3216 3217 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3218 /* 3219 * This loop can run a while, specially if mem_cgroup's continuously 3220 * keep exceeding their soft limit and putting the system under 3221 * pressure 3222 */ 3223 do { 3224 if (next_mz) 3225 mz = next_mz; 3226 else 3227 mz = mem_cgroup_largest_soft_limit_node(mctz); 3228 if (!mz) 3229 break; 3230 3231 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3232 gfp_mask, 3233 MEM_CGROUP_RECLAIM_SOFT); 3234 nr_reclaimed += reclaimed; 3235 spin_lock(&mctz->lock); 3236 3237 /* 3238 * If we failed to reclaim anything from this memory cgroup 3239 * it is time to move on to the next cgroup 3240 */ 3241 next_mz = NULL; 3242 if (!reclaimed) { 3243 do { 3244 /* 3245 * Loop until we find yet another one. 3246 * 3247 * By the time we get the soft_limit lock 3248 * again, someone might have aded the 3249 * group back on the RB tree. Iterate to 3250 * make sure we get a different mem. 3251 * mem_cgroup_largest_soft_limit_node returns 3252 * NULL if no other cgroup is present on 3253 * the tree 3254 */ 3255 next_mz = 3256 __mem_cgroup_largest_soft_limit_node(mctz); 3257 if (next_mz == mz) { 3258 css_put(&next_mz->mem->css); 3259 next_mz = NULL; 3260 } else /* next_mz == NULL or other memcg */ 3261 break; 3262 } while (1); 3263 } 3264 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3265 excess = res_counter_soft_limit_excess(&mz->mem->res); 3266 /* 3267 * One school of thought says that we should not add 3268 * back the node to the tree if reclaim returns 0. 3269 * But our reclaim could return 0, simply because due 3270 * to priority we are exposing a smaller subset of 3271 * memory to reclaim from. Consider this as a longer 3272 * term TODO. 3273 */ 3274 /* If excess == 0, no tree ops */ 3275 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3276 spin_unlock(&mctz->lock); 3277 css_put(&mz->mem->css); 3278 loop++; 3279 /* 3280 * Could not reclaim anything and there are no more 3281 * mem cgroups to try or we seem to be looping without 3282 * reclaiming anything. 
3283 */ 3284 if (!nr_reclaimed && 3285 (next_mz == NULL || 3286 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3287 break; 3288 } while (!nr_reclaimed); 3289 if (next_mz) 3290 css_put(&next_mz->mem->css); 3291 return nr_reclaimed; 3292 } 3293 3294 /* 3295 * This routine traverse page_cgroup in given list and drop them all. 3296 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3297 */ 3298 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3299 int node, int zid, enum lru_list lru) 3300 { 3301 struct zone *zone; 3302 struct mem_cgroup_per_zone *mz; 3303 struct page_cgroup *pc, *busy; 3304 unsigned long flags, loop; 3305 struct list_head *list; 3306 int ret = 0; 3307 3308 zone = &NODE_DATA(node)->node_zones[zid]; 3309 mz = mem_cgroup_zoneinfo(mem, node, zid); 3310 list = &mz->lists[lru]; 3311 3312 loop = MEM_CGROUP_ZSTAT(mz, lru); 3313 /* give some margin against EBUSY etc...*/ 3314 loop += 256; 3315 busy = NULL; 3316 while (loop--) { 3317 ret = 0; 3318 spin_lock_irqsave(&zone->lru_lock, flags); 3319 if (list_empty(list)) { 3320 spin_unlock_irqrestore(&zone->lru_lock, flags); 3321 break; 3322 } 3323 pc = list_entry(list->prev, struct page_cgroup, lru); 3324 if (busy == pc) { 3325 list_move(&pc->lru, list); 3326 busy = NULL; 3327 spin_unlock_irqrestore(&zone->lru_lock, flags); 3328 continue; 3329 } 3330 spin_unlock_irqrestore(&zone->lru_lock, flags); 3331 3332 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3333 if (ret == -ENOMEM) 3334 break; 3335 3336 if (ret == -EBUSY || ret == -EINVAL) { 3337 /* found lock contention or "pc" is obsolete. */ 3338 busy = pc; 3339 cond_resched(); 3340 } else 3341 busy = NULL; 3342 } 3343 3344 if (!ret && !list_empty(list)) 3345 return -EBUSY; 3346 return ret; 3347 } 3348 3349 /* 3350 * make mem_cgroup's charge to be 0 if there is no task. 3351 * This enables deleting this mem_cgroup. 3352 */ 3353 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3354 { 3355 int ret; 3356 int node, zid, shrink; 3357 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3358 struct cgroup *cgrp = mem->css.cgroup; 3359 3360 css_get(&mem->css); 3361 3362 shrink = 0; 3363 /* should free all ? */ 3364 if (free_all) 3365 goto try_to_free; 3366 move_account: 3367 do { 3368 ret = -EBUSY; 3369 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3370 goto out; 3371 ret = -EINTR; 3372 if (signal_pending(current)) 3373 goto out; 3374 /* This is for making all *used* pages to be on LRU. */ 3375 lru_add_drain_all(); 3376 drain_all_stock_sync(); 3377 ret = 0; 3378 mem_cgroup_start_move(mem); 3379 for_each_node_state(node, N_HIGH_MEMORY) { 3380 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3381 enum lru_list l; 3382 for_each_lru(l) { 3383 ret = mem_cgroup_force_empty_list(mem, 3384 node, zid, l); 3385 if (ret) 3386 break; 3387 } 3388 } 3389 if (ret) 3390 break; 3391 } 3392 mem_cgroup_end_move(mem); 3393 memcg_oom_recover(mem); 3394 /* it seems parent cgroup doesn't have enough mem */ 3395 if (ret == -ENOMEM) 3396 goto try_to_free; 3397 cond_resched(); 3398 /* "ret" should also be checked to ensure all lists are empty. */ 3399 } while (mem->res.usage > 0 || ret); 3400 out: 3401 css_put(&mem->css); 3402 return ret; 3403 3404 try_to_free: 3405 /* returns EBUSY if there is a task or if we come here twice. 
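	 */

/*
 * Illustrative user-space trigger (compiled out, not kernel code): writing
 * to memory.force_empty reaches mem_cgroup_force_empty_write() below, which
 * calls this function with free_all == true.  The mount point and group
 * path are assumptions for illustration only.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int force_empty_sketch(void)
{
	int fd = open("/cgroup/memory/mygroup/memory.force_empty", O_WRONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = write(fd, "0", 1) == 1 ? 0 : -1;	/* "0" as used in the memcg docs */
	close(fd);
	return ret;
}
#endif

	/*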
*/ 3406 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3407 ret = -EBUSY; 3408 goto out; 3409 } 3410 /* we call try-to-free pages for make this cgroup empty */ 3411 lru_add_drain_all(); 3412 /* try to free all pages in this cgroup */ 3413 shrink = 1; 3414 while (nr_retries && mem->res.usage > 0) { 3415 int progress; 3416 3417 if (signal_pending(current)) { 3418 ret = -EINTR; 3419 goto out; 3420 } 3421 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3422 false, get_swappiness(mem)); 3423 if (!progress) { 3424 nr_retries--; 3425 /* maybe some writeback is necessary */ 3426 congestion_wait(BLK_RW_ASYNC, HZ/10); 3427 } 3428 3429 } 3430 lru_add_drain(); 3431 /* try move_account...there may be some *locked* pages. */ 3432 goto move_account; 3433 } 3434 3435 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3436 { 3437 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3438 } 3439 3440 3441 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3442 { 3443 return mem_cgroup_from_cont(cont)->use_hierarchy; 3444 } 3445 3446 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3447 u64 val) 3448 { 3449 int retval = 0; 3450 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3451 struct cgroup *parent = cont->parent; 3452 struct mem_cgroup *parent_mem = NULL; 3453 3454 if (parent) 3455 parent_mem = mem_cgroup_from_cont(parent); 3456 3457 cgroup_lock(); 3458 /* 3459 * If parent's use_hierarchy is set, we can't make any modifications 3460 * in the child subtrees. If it is unset, then the change can 3461 * occur, provided the current cgroup has no children. 3462 * 3463 * For the root cgroup, parent_mem is NULL, we allow value to be 3464 * set if there are no children. 3465 */ 3466 if ((!parent_mem || !parent_mem->use_hierarchy) && 3467 (val == 1 || val == 0)) { 3468 if (list_empty(&cont->children)) 3469 mem->use_hierarchy = val; 3470 else 3471 retval = -EBUSY; 3472 } else 3473 retval = -EINVAL; 3474 cgroup_unlock(); 3475 3476 return retval; 3477 } 3478 3479 3480 static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3481 enum mem_cgroup_stat_index idx) 3482 { 3483 struct mem_cgroup *iter; 3484 s64 val = 0; 3485 3486 /* each per cpu's value can be minus.Then, use s64 */ 3487 for_each_mem_cgroup_tree(iter, mem) 3488 val += mem_cgroup_read_stat(iter, idx); 3489 3490 if (val < 0) /* race ? 
*/ 3491 val = 0; 3492 return val; 3493 } 3494 3495 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3496 { 3497 u64 val; 3498 3499 if (!mem_cgroup_is_root(mem)) { 3500 if (!swap) 3501 return res_counter_read_u64(&mem->res, RES_USAGE); 3502 else 3503 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3504 } 3505 3506 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); 3507 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); 3508 3509 if (swap) 3510 val += mem_cgroup_get_recursive_idx_stat(mem, 3511 MEM_CGROUP_STAT_SWAPOUT); 3512 3513 return val << PAGE_SHIFT; 3514 } 3515 3516 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3517 { 3518 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3519 u64 val; 3520 int type, name; 3521 3522 type = MEMFILE_TYPE(cft->private); 3523 name = MEMFILE_ATTR(cft->private); 3524 switch (type) { 3525 case _MEM: 3526 if (name == RES_USAGE) 3527 val = mem_cgroup_usage(mem, false); 3528 else 3529 val = res_counter_read_u64(&mem->res, name); 3530 break; 3531 case _MEMSWAP: 3532 if (name == RES_USAGE) 3533 val = mem_cgroup_usage(mem, true); 3534 else 3535 val = res_counter_read_u64(&mem->memsw, name); 3536 break; 3537 default: 3538 BUG(); 3539 break; 3540 } 3541 return val; 3542 } 3543 /* 3544 * The user of this function is... 3545 * RES_LIMIT. 3546 */ 3547 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3548 const char *buffer) 3549 { 3550 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3551 int type, name; 3552 unsigned long long val; 3553 int ret; 3554 3555 type = MEMFILE_TYPE(cft->private); 3556 name = MEMFILE_ATTR(cft->private); 3557 switch (name) { 3558 case RES_LIMIT: 3559 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3560 ret = -EINVAL; 3561 break; 3562 } 3563 /* This function does all necessary parse...reuse it */ 3564 ret = res_counter_memparse_write_strategy(buffer, &val); 3565 if (ret) 3566 break; 3567 if (type == _MEM) 3568 ret = mem_cgroup_resize_limit(memcg, val); 3569 else 3570 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3571 break; 3572 case RES_SOFT_LIMIT: 3573 ret = res_counter_memparse_write_strategy(buffer, &val); 3574 if (ret) 3575 break; 3576 /* 3577 * For memsw, soft limits are hard to implement in terms 3578 * of semantics, for now, we support soft limits for 3579 * control without swap 3580 */ 3581 if (type == _MEM) 3582 ret = res_counter_set_soft_limit(&memcg->res, val); 3583 else 3584 ret = -EINVAL; 3585 break; 3586 default: 3587 ret = -EINVAL; /* should be BUG() ? 
*/ 3588 break; 3589 } 3590 return ret; 3591 } 3592 3593 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3594 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3595 { 3596 struct cgroup *cgroup; 3597 unsigned long long min_limit, min_memsw_limit, tmp; 3598 3599 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3600 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3601 cgroup = memcg->css.cgroup; 3602 if (!memcg->use_hierarchy) 3603 goto out; 3604 3605 while (cgroup->parent) { 3606 cgroup = cgroup->parent; 3607 memcg = mem_cgroup_from_cont(cgroup); 3608 if (!memcg->use_hierarchy) 3609 break; 3610 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3611 min_limit = min(min_limit, tmp); 3612 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3613 min_memsw_limit = min(min_memsw_limit, tmp); 3614 } 3615 out: 3616 *mem_limit = min_limit; 3617 *memsw_limit = min_memsw_limit; 3618 return; 3619 } 3620 3621 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3622 { 3623 struct mem_cgroup *mem; 3624 int type, name; 3625 3626 mem = mem_cgroup_from_cont(cont); 3627 type = MEMFILE_TYPE(event); 3628 name = MEMFILE_ATTR(event); 3629 switch (name) { 3630 case RES_MAX_USAGE: 3631 if (type == _MEM) 3632 res_counter_reset_max(&mem->res); 3633 else 3634 res_counter_reset_max(&mem->memsw); 3635 break; 3636 case RES_FAILCNT: 3637 if (type == _MEM) 3638 res_counter_reset_failcnt(&mem->res); 3639 else 3640 res_counter_reset_failcnt(&mem->memsw); 3641 break; 3642 } 3643 3644 return 0; 3645 } 3646 3647 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3648 struct cftype *cft) 3649 { 3650 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3651 } 3652 3653 #ifdef CONFIG_MMU 3654 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3655 struct cftype *cft, u64 val) 3656 { 3657 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3658 3659 if (val >= (1 << NR_MOVE_TYPE)) 3660 return -EINVAL; 3661 /* 3662 * We check this value several times in both in can_attach() and 3663 * attach(), so we need cgroup lock to prevent this value from being 3664 * inconsistent. 
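	 */

/*
 * Illustrative user-space side (compiled out, not kernel code): the value
 * written to memory.move_charge_at_immigrate is what arrives here as "val".
 * It is a bitmask of move types and anything at or above bit NR_MOVE_TYPE is
 * rejected with -EINVAL; which bit selects which page class (bit 0 for
 * anonymous pages is assumed here) is described in the memcg documentation.
 * Paths are assumptions for illustration only.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static int enable_charge_moving_sketch(const char *cgroup_dir)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), "%s/memory.move_charge_at_immigrate",
		 cgroup_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, "1", 1) == 1 ? 0 : -1;	/* assumed: bit 0 = anon pages */
	close(fd);
	return ret;
}
#endif

	/*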
3665 */ 3666 cgroup_lock(); 3667 mem->move_charge_at_immigrate = val; 3668 cgroup_unlock(); 3669 3670 return 0; 3671 } 3672 #else 3673 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3674 struct cftype *cft, u64 val) 3675 { 3676 return -ENOSYS; 3677 } 3678 #endif 3679 3680 3681 /* For read statistics */ 3682 enum { 3683 MCS_CACHE, 3684 MCS_RSS, 3685 MCS_FILE_MAPPED, 3686 MCS_PGPGIN, 3687 MCS_PGPGOUT, 3688 MCS_SWAP, 3689 MCS_INACTIVE_ANON, 3690 MCS_ACTIVE_ANON, 3691 MCS_INACTIVE_FILE, 3692 MCS_ACTIVE_FILE, 3693 MCS_UNEVICTABLE, 3694 NR_MCS_STAT, 3695 }; 3696 3697 struct mcs_total_stat { 3698 s64 stat[NR_MCS_STAT]; 3699 }; 3700 3701 struct { 3702 char *local_name; 3703 char *total_name; 3704 } memcg_stat_strings[NR_MCS_STAT] = { 3705 {"cache", "total_cache"}, 3706 {"rss", "total_rss"}, 3707 {"mapped_file", "total_mapped_file"}, 3708 {"pgpgin", "total_pgpgin"}, 3709 {"pgpgout", "total_pgpgout"}, 3710 {"swap", "total_swap"}, 3711 {"inactive_anon", "total_inactive_anon"}, 3712 {"active_anon", "total_active_anon"}, 3713 {"inactive_file", "total_inactive_file"}, 3714 {"active_file", "total_active_file"}, 3715 {"unevictable", "total_unevictable"} 3716 }; 3717 3718 3719 static void 3720 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3721 { 3722 s64 val; 3723 3724 /* per cpu stat */ 3725 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3726 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3727 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3728 s->stat[MCS_RSS] += val * PAGE_SIZE; 3729 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3730 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3731 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3732 s->stat[MCS_PGPGIN] += val; 3733 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3734 s->stat[MCS_PGPGOUT] += val; 3735 if (do_swap_account) { 3736 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3737 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3738 } 3739 3740 /* per zone stat */ 3741 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3742 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3743 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3744 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3745 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3746 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3747 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3748 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3749 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3750 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3751 } 3752 3753 static void 3754 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3755 { 3756 struct mem_cgroup *iter; 3757 3758 for_each_mem_cgroup_tree(iter, mem) 3759 mem_cgroup_get_local_stat(iter, s); 3760 } 3761 3762 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3763 struct cgroup_map_cb *cb) 3764 { 3765 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3766 struct mcs_total_stat mystat; 3767 int i; 3768 3769 memset(&mystat, 0, sizeof(mystat)); 3770 mem_cgroup_get_local_stat(mem_cont, &mystat); 3771 3772 for (i = 0; i < NR_MCS_STAT; i++) { 3773 if (i == MCS_SWAP && !do_swap_account) 3774 continue; 3775 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3776 } 3777 3778 /* Hierarchical information */ 3779 { 3780 unsigned long long limit, memsw_limit; 3781 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3782 cb->fill(cb, 
"hierarchical_memory_limit", limit); 3783 if (do_swap_account) 3784 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3785 } 3786 3787 memset(&mystat, 0, sizeof(mystat)); 3788 mem_cgroup_get_total_stat(mem_cont, &mystat); 3789 for (i = 0; i < NR_MCS_STAT; i++) { 3790 if (i == MCS_SWAP && !do_swap_account) 3791 continue; 3792 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3793 } 3794 3795 #ifdef CONFIG_DEBUG_VM 3796 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3797 3798 { 3799 int nid, zid; 3800 struct mem_cgroup_per_zone *mz; 3801 unsigned long recent_rotated[2] = {0, 0}; 3802 unsigned long recent_scanned[2] = {0, 0}; 3803 3804 for_each_online_node(nid) 3805 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3806 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3807 3808 recent_rotated[0] += 3809 mz->reclaim_stat.recent_rotated[0]; 3810 recent_rotated[1] += 3811 mz->reclaim_stat.recent_rotated[1]; 3812 recent_scanned[0] += 3813 mz->reclaim_stat.recent_scanned[0]; 3814 recent_scanned[1] += 3815 mz->reclaim_stat.recent_scanned[1]; 3816 } 3817 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3818 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3819 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3820 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3821 } 3822 #endif 3823 3824 return 0; 3825 } 3826 3827 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3828 { 3829 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3830 3831 return get_swappiness(memcg); 3832 } 3833 3834 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3835 u64 val) 3836 { 3837 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3838 struct mem_cgroup *parent; 3839 3840 if (val > 100) 3841 return -EINVAL; 3842 3843 if (cgrp->parent == NULL) 3844 return -EINVAL; 3845 3846 parent = mem_cgroup_from_cont(cgrp->parent); 3847 3848 cgroup_lock(); 3849 3850 /* If under hierarchy, only empty-root can set this value */ 3851 if ((parent->use_hierarchy) || 3852 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3853 cgroup_unlock(); 3854 return -EINVAL; 3855 } 3856 3857 spin_lock(&memcg->reclaim_param_lock); 3858 memcg->swappiness = val; 3859 spin_unlock(&memcg->reclaim_param_lock); 3860 3861 cgroup_unlock(); 3862 3863 return 0; 3864 } 3865 3866 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3867 { 3868 struct mem_cgroup_threshold_ary *t; 3869 u64 usage; 3870 int i; 3871 3872 rcu_read_lock(); 3873 if (!swap) 3874 t = rcu_dereference(memcg->thresholds.primary); 3875 else 3876 t = rcu_dereference(memcg->memsw_thresholds.primary); 3877 3878 if (!t) 3879 goto unlock; 3880 3881 usage = mem_cgroup_usage(memcg, swap); 3882 3883 /* 3884 * current_threshold points to threshold just below usage. 3885 * If it's not true, a threshold was crossed after last 3886 * call of __mem_cgroup_threshold(). 3887 */ 3888 i = t->current_threshold; 3889 3890 /* 3891 * Iterate backward over array of thresholds starting from 3892 * current_threshold and check if a threshold is crossed. 3893 * If none of thresholds below usage is crossed, we read 3894 * only one element of the array here. 3895 */ 3896 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3897 eventfd_signal(t->entries[i].eventfd, 1); 3898 3899 /* i = current_threshold + 1 */ 3900 i++; 3901 3902 /* 3903 * Iterate forward over array of thresholds starting from 3904 * current_threshold+1 and check if a threshold is crossed. 
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_swap_account)
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	/*
	 * Thresholds are u64: comparing by subtraction can overflow the
	 * int return value, so compare explicitly instead.
	 */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
}

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
	struct mem_cgroup_eventfd_list *ev;

	list_for_each_entry(ev, &mem->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		mem_cgroup_oom_notify_cb(iter);
}

static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 threshold, usage;
	int i, size, ret;

	ret = res_counter_memparse_write_strategy(args, &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold was crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for the new array of thresholds */
	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
			GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to the new array */
	if (thresholds->primary) {
		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
				sizeof(struct mem_cgroup_threshold));
	}

	/* Add the new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering a new threshold isn't time-critical. */
	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
			compare_thresholds, NULL);

	/* Find the current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold < usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
	}

	/* Free the old spare buffer and save the old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses the old thresholds array */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 usage;
	int i, j, size;

	mutex_lock(&memcg->thresholds_lock);
	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	/*
	 * Something went wrong if we're trying to unregister a threshold
	 * while no thresholds are registered.
	 */
	BUG_ON(!thresholds);

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold was crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate the new number of thresholds */
	size = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
	}

	new = thresholds->spare;

	/* Set the thresholds array to NULL if no thresholds remain */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find the current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold < usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to
			 * increment it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap the primary and spare arrays */
	thresholds->spare = thresholds->primary;
	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses the old thresholds array */
	synchronize_rcu();

	mutex_unlock(&memcg->thresholds_lock);
}

static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *event;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);
	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	mutex_lock(&memcg_oom_mutex);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM?
*/ 4129 if (atomic_read(&memcg->oom_lock)) 4130 eventfd_signal(eventfd, 1); 4131 mutex_unlock(&memcg_oom_mutex); 4132 4133 return 0; 4134 } 4135 4136 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4137 struct cftype *cft, struct eventfd_ctx *eventfd) 4138 { 4139 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4140 struct mem_cgroup_eventfd_list *ev, *tmp; 4141 int type = MEMFILE_TYPE(cft->private); 4142 4143 BUG_ON(type != _OOM_TYPE); 4144 4145 mutex_lock(&memcg_oom_mutex); 4146 4147 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4148 if (ev->eventfd == eventfd) { 4149 list_del(&ev->list); 4150 kfree(ev); 4151 } 4152 } 4153 4154 mutex_unlock(&memcg_oom_mutex); 4155 } 4156 4157 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4158 struct cftype *cft, struct cgroup_map_cb *cb) 4159 { 4160 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4161 4162 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4163 4164 if (atomic_read(&mem->oom_lock)) 4165 cb->fill(cb, "under_oom", 1); 4166 else 4167 cb->fill(cb, "under_oom", 0); 4168 return 0; 4169 } 4170 4171 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4172 struct cftype *cft, u64 val) 4173 { 4174 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4175 struct mem_cgroup *parent; 4176 4177 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4178 if (!cgrp->parent || !((val == 0) || (val == 1))) 4179 return -EINVAL; 4180 4181 parent = mem_cgroup_from_cont(cgrp->parent); 4182 4183 cgroup_lock(); 4184 /* oom-kill-disable is a flag for subhierarchy. */ 4185 if ((parent->use_hierarchy) || 4186 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4187 cgroup_unlock(); 4188 return -EINVAL; 4189 } 4190 mem->oom_kill_disable = val; 4191 if (!val) 4192 memcg_oom_recover(mem); 4193 cgroup_unlock(); 4194 return 0; 4195 } 4196 4197 static struct cftype mem_cgroup_files[] = { 4198 { 4199 .name = "usage_in_bytes", 4200 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4201 .read_u64 = mem_cgroup_read, 4202 .register_event = mem_cgroup_usage_register_event, 4203 .unregister_event = mem_cgroup_usage_unregister_event, 4204 }, 4205 { 4206 .name = "max_usage_in_bytes", 4207 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4208 .trigger = mem_cgroup_reset, 4209 .read_u64 = mem_cgroup_read, 4210 }, 4211 { 4212 .name = "limit_in_bytes", 4213 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4214 .write_string = mem_cgroup_write, 4215 .read_u64 = mem_cgroup_read, 4216 }, 4217 { 4218 .name = "soft_limit_in_bytes", 4219 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4220 .write_string = mem_cgroup_write, 4221 .read_u64 = mem_cgroup_read, 4222 }, 4223 { 4224 .name = "failcnt", 4225 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4226 .trigger = mem_cgroup_reset, 4227 .read_u64 = mem_cgroup_read, 4228 }, 4229 { 4230 .name = "stat", 4231 .read_map = mem_control_stat_show, 4232 }, 4233 { 4234 .name = "force_empty", 4235 .trigger = mem_cgroup_force_empty_write, 4236 }, 4237 { 4238 .name = "use_hierarchy", 4239 .write_u64 = mem_cgroup_hierarchy_write, 4240 .read_u64 = mem_cgroup_hierarchy_read, 4241 }, 4242 { 4243 .name = "swappiness", 4244 .read_u64 = mem_cgroup_swappiness_read, 4245 .write_u64 = mem_cgroup_swappiness_write, 4246 }, 4247 { 4248 .name = "move_charge_at_immigrate", 4249 .read_u64 = mem_cgroup_move_charge_read, 4250 .write_u64 = mem_cgroup_move_charge_write, 4251 }, 4252 { 4253 .name = "oom_control", 4254 .read_map = mem_cgroup_oom_control_read, 4255 .write_u64 = 
mem_cgroup_oom_control_write, 4256 .register_event = mem_cgroup_oom_register_event, 4257 .unregister_event = mem_cgroup_oom_unregister_event, 4258 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4259 }, 4260 }; 4261 4262 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4263 static struct cftype memsw_cgroup_files[] = { 4264 { 4265 .name = "memsw.usage_in_bytes", 4266 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4267 .read_u64 = mem_cgroup_read, 4268 .register_event = mem_cgroup_usage_register_event, 4269 .unregister_event = mem_cgroup_usage_unregister_event, 4270 }, 4271 { 4272 .name = "memsw.max_usage_in_bytes", 4273 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4274 .trigger = mem_cgroup_reset, 4275 .read_u64 = mem_cgroup_read, 4276 }, 4277 { 4278 .name = "memsw.limit_in_bytes", 4279 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4280 .write_string = mem_cgroup_write, 4281 .read_u64 = mem_cgroup_read, 4282 }, 4283 { 4284 .name = "memsw.failcnt", 4285 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4286 .trigger = mem_cgroup_reset, 4287 .read_u64 = mem_cgroup_read, 4288 }, 4289 }; 4290 4291 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4292 { 4293 if (!do_swap_account) 4294 return 0; 4295 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4296 ARRAY_SIZE(memsw_cgroup_files)); 4297 }; 4298 #else 4299 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4300 { 4301 return 0; 4302 } 4303 #endif 4304 4305 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4306 { 4307 struct mem_cgroup_per_node *pn; 4308 struct mem_cgroup_per_zone *mz; 4309 enum lru_list l; 4310 int zone, tmp = node; 4311 /* 4312 * This routine is called against possible nodes. 4313 * But it's BUG to call kmalloc() against offline node. 4314 * 4315 * TODO: this routine can waste much memory for nodes which will 4316 * never be onlined. It's better to use memory hotplug callback 4317 * function. 4318 */ 4319 if (!node_state(node, N_NORMAL_MEMORY)) 4320 tmp = -1; 4321 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4322 if (!pn) 4323 return 1; 4324 4325 mem->info.nodeinfo[node] = pn; 4326 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4327 mz = &pn->zoneinfo[zone]; 4328 for_each_lru(l) 4329 INIT_LIST_HEAD(&mz->lists[l]); 4330 mz->usage_in_excess = 0; 4331 mz->on_tree = false; 4332 mz->mem = mem; 4333 } 4334 return 0; 4335 } 4336 4337 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4338 { 4339 kfree(mem->info.nodeinfo[node]); 4340 } 4341 4342 static struct mem_cgroup *mem_cgroup_alloc(void) 4343 { 4344 struct mem_cgroup *mem; 4345 int size = sizeof(struct mem_cgroup); 4346 4347 /* Can be very big if MAX_NUMNODES is very big */ 4348 if (size < PAGE_SIZE) 4349 mem = kzalloc(size, GFP_KERNEL); 4350 else 4351 mem = vzalloc(size); 4352 4353 if (!mem) 4354 return NULL; 4355 4356 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4357 if (!mem->stat) 4358 goto out_free; 4359 spin_lock_init(&mem->pcp_counter_lock); 4360 return mem; 4361 4362 out_free: 4363 if (size < PAGE_SIZE) 4364 kfree(mem); 4365 else 4366 vfree(mem); 4367 return NULL; 4368 } 4369 4370 /* 4371 * At destroying mem_cgroup, references from swap_cgroup can remain. 4372 * (scanning all at force_empty is too costly...) 4373 * 4374 * Instead of clearing all references at force_empty, we remember 4375 * the number of reference from swap_cgroup and free mem_cgroup when 4376 * it goes down to 0. 
4377 * 4378 * Removal of cgroup itself succeeds regardless of refs from swap. 4379 */ 4380 4381 static void __mem_cgroup_free(struct mem_cgroup *mem) 4382 { 4383 int node; 4384 4385 mem_cgroup_remove_from_trees(mem); 4386 free_css_id(&mem_cgroup_subsys, &mem->css); 4387 4388 for_each_node_state(node, N_POSSIBLE) 4389 free_mem_cgroup_per_zone_info(mem, node); 4390 4391 free_percpu(mem->stat); 4392 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4393 kfree(mem); 4394 else 4395 vfree(mem); 4396 } 4397 4398 static void mem_cgroup_get(struct mem_cgroup *mem) 4399 { 4400 atomic_inc(&mem->refcnt); 4401 } 4402 4403 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4404 { 4405 if (atomic_sub_and_test(count, &mem->refcnt)) { 4406 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4407 __mem_cgroup_free(mem); 4408 if (parent) 4409 mem_cgroup_put(parent); 4410 } 4411 } 4412 4413 static void mem_cgroup_put(struct mem_cgroup *mem) 4414 { 4415 __mem_cgroup_put(mem, 1); 4416 } 4417 4418 /* 4419 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4420 */ 4421 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4422 { 4423 if (!mem->res.parent) 4424 return NULL; 4425 return mem_cgroup_from_res_counter(mem->res.parent, res); 4426 } 4427 4428 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4429 static void __init enable_swap_cgroup(void) 4430 { 4431 if (!mem_cgroup_disabled() && really_do_swap_account) 4432 do_swap_account = 1; 4433 } 4434 #else 4435 static void __init enable_swap_cgroup(void) 4436 { 4437 } 4438 #endif 4439 4440 static int mem_cgroup_soft_limit_tree_init(void) 4441 { 4442 struct mem_cgroup_tree_per_node *rtpn; 4443 struct mem_cgroup_tree_per_zone *rtpz; 4444 int tmp, node, zone; 4445 4446 for_each_node_state(node, N_POSSIBLE) { 4447 tmp = node; 4448 if (!node_state(node, N_NORMAL_MEMORY)) 4449 tmp = -1; 4450 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4451 if (!rtpn) 4452 return 1; 4453 4454 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4455 4456 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4457 rtpz = &rtpn->rb_tree_per_zone[zone]; 4458 rtpz->rb_root = RB_ROOT; 4459 spin_lock_init(&rtpz->lock); 4460 } 4461 } 4462 return 0; 4463 } 4464 4465 static struct cgroup_subsys_state * __ref 4466 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4467 { 4468 struct mem_cgroup *mem, *parent; 4469 long error = -ENOMEM; 4470 int node; 4471 4472 mem = mem_cgroup_alloc(); 4473 if (!mem) 4474 return ERR_PTR(error); 4475 4476 for_each_node_state(node, N_POSSIBLE) 4477 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4478 goto free_out; 4479 4480 /* root ? */ 4481 if (cont->parent == NULL) { 4482 int cpu; 4483 enable_swap_cgroup(); 4484 parent = NULL; 4485 root_mem_cgroup = mem; 4486 if (mem_cgroup_soft_limit_tree_init()) 4487 goto free_out; 4488 for_each_possible_cpu(cpu) { 4489 struct memcg_stock_pcp *stock = 4490 &per_cpu(memcg_stock, cpu); 4491 INIT_WORK(&stock->work, drain_local_stock); 4492 } 4493 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4494 } else { 4495 parent = mem_cgroup_from_cont(cont->parent); 4496 mem->use_hierarchy = parent->use_hierarchy; 4497 mem->oom_kill_disable = parent->oom_kill_disable; 4498 } 4499 4500 if (parent && parent->use_hierarchy) { 4501 res_counter_init(&mem->res, &parent->res); 4502 res_counter_init(&mem->memsw, &parent->memsw); 4503 /* 4504 * We increment refcnt of the parent to ensure that we can 4505 * safely access it on res_counter_charge/uncharge. 
		 * This refcnt will be decremented when freeing this
		 * mem_cgroup (see mem_cgroup_put()).
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&mem->res, NULL);
		res_counter_init(&mem->memsw, NULL);
	}
	mem->last_scanned_child = 0;
	spin_lock_init(&mem->reclaim_param_lock);
	INIT_LIST_HEAD(&mem->oom_notify);

	if (parent)
		mem->swappiness = get_swappiness(parent);
	atomic_set(&mem->refcnt, 1);
	mem->move_charge_at_immigrate = 0;
	mutex_init(&mem->thresholds_lock);
	return &mem->css;
free_out:
	__mem_cgroup_free(mem);
	root_mem_cgroup = NULL;
	return ERR_PTR(error);
}

static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	return mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	mem_cgroup_put(mem);
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int ret;

	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));

	if (!ret)
		ret = register_memsw_files(cont, ss);
	return ret;
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE	256
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret = 0;
	int batch_count = PRECHARGE_COUNT_AT_ONCE;
	struct mem_cgroup *mem = mc.to;

	if (mem_cgroup_is_root(mem)) {
		mc.precharge += count;
		/* we don't need css_get for root */
		return ret;
	}
	/* try to charge at once */
	if (count > 1) {
		struct res_counter *dummy;
		/*
		 * "mem" cannot be under rmdir() because we've already checked
		 * by cgroup_lock_live_cgroup() that it is not removed and we
		 * are still under the same cgroup_mutex. So we can postpone
		 * css_get().
		 */
		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
			goto one_by_one;
		if (do_swap_account && res_counter_charge(&mem->memsw,
						PAGE_SIZE * count, &dummy)) {
			res_counter_uncharge(&mem->res, PAGE_SIZE * count);
			goto one_by_one;
		}
		mc.precharge += count;
		return ret;
	}
one_by_one:
	/* fall back to charging one page at a time */
	while (count--) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!batch_count--) {
			batch_count = PRECHARGE_COUNT_AT_ONCE;
			cond_resched();
		}
		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
					      PAGE_SIZE);
		if (ret || !mem)
			/* mem_cgroup_clear_mc() will do uncharge later */
			return -ENOMEM;
		mc.precharge++;
	}
	return ret;
}

/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma to which the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge.
if @target is not NULL, the page is stored in target->page 4624 * with extra refcnt got(Callers should handle it). 4625 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4626 * target for charge migration. if @target is not NULL, the entry is stored 4627 * in target->ent. 4628 * 4629 * Called with pte lock held. 4630 */ 4631 union mc_target { 4632 struct page *page; 4633 swp_entry_t ent; 4634 }; 4635 4636 enum mc_target_type { 4637 MC_TARGET_NONE, /* not used */ 4638 MC_TARGET_PAGE, 4639 MC_TARGET_SWAP, 4640 }; 4641 4642 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4643 unsigned long addr, pte_t ptent) 4644 { 4645 struct page *page = vm_normal_page(vma, addr, ptent); 4646 4647 if (!page || !page_mapped(page)) 4648 return NULL; 4649 if (PageAnon(page)) { 4650 /* we don't move shared anon */ 4651 if (!move_anon() || page_mapcount(page) > 2) 4652 return NULL; 4653 } else if (!move_file()) 4654 /* we ignore mapcount for file pages */ 4655 return NULL; 4656 if (!get_page_unless_zero(page)) 4657 return NULL; 4658 4659 return page; 4660 } 4661 4662 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4663 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4664 { 4665 int usage_count; 4666 struct page *page = NULL; 4667 swp_entry_t ent = pte_to_swp_entry(ptent); 4668 4669 if (!move_anon() || non_swap_entry(ent)) 4670 return NULL; 4671 usage_count = mem_cgroup_count_swap_user(ent, &page); 4672 if (usage_count > 1) { /* we don't move shared anon */ 4673 if (page) 4674 put_page(page); 4675 return NULL; 4676 } 4677 if (do_swap_account) 4678 entry->val = ent.val; 4679 4680 return page; 4681 } 4682 4683 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4684 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4685 { 4686 struct page *page = NULL; 4687 struct inode *inode; 4688 struct address_space *mapping; 4689 pgoff_t pgoff; 4690 4691 if (!vma->vm_file) /* anonymous vma */ 4692 return NULL; 4693 if (!move_file()) 4694 return NULL; 4695 4696 inode = vma->vm_file->f_path.dentry->d_inode; 4697 mapping = vma->vm_file->f_mapping; 4698 if (pte_none(ptent)) 4699 pgoff = linear_page_index(vma, addr); 4700 else /* pte_file(ptent) is true */ 4701 pgoff = pte_to_pgoff(ptent); 4702 4703 /* page is moved even if it's not RSS of this task(page-faulted). */ 4704 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 4705 page = find_get_page(mapping, pgoff); 4706 } else { /* shmem/tmpfs file. we should take account of swap too. */ 4707 swp_entry_t ent; 4708 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 4709 if (do_swap_account) 4710 entry->val = ent.val; 4711 } 4712 4713 return page; 4714 } 4715 4716 static int is_target_pte_for_mc(struct vm_area_struct *vma, 4717 unsigned long addr, pte_t ptent, union mc_target *target) 4718 { 4719 struct page *page = NULL; 4720 struct page_cgroup *pc; 4721 int ret = 0; 4722 swp_entry_t ent = { .val = 0 }; 4723 4724 if (pte_present(ptent)) 4725 page = mc_handle_present_pte(vma, addr, ptent); 4726 else if (is_swap_pte(ptent)) 4727 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4728 else if (pte_none(ptent) || pte_file(ptent)) 4729 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4730 4731 if (!page && !ent.val) 4732 return 0; 4733 if (page) { 4734 pc = lookup_page_cgroup(page); 4735 /* 4736 * Do only loose check w/o page_cgroup lock. 4737 * mem_cgroup_move_account() checks the pc is valid or not under 4738 * the lock. 
4739 */ 4740 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 4741 ret = MC_TARGET_PAGE; 4742 if (target) 4743 target->page = page; 4744 } 4745 if (!ret || !target) 4746 put_page(page); 4747 } 4748 /* There is a swap entry and a page doesn't exist or isn't charged */ 4749 if (ent.val && !ret && 4750 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 4751 ret = MC_TARGET_SWAP; 4752 if (target) 4753 target->ent = ent; 4754 } 4755 return ret; 4756 } 4757 4758 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4759 unsigned long addr, unsigned long end, 4760 struct mm_walk *walk) 4761 { 4762 struct vm_area_struct *vma = walk->private; 4763 pte_t *pte; 4764 spinlock_t *ptl; 4765 4766 split_huge_page_pmd(walk->mm, pmd); 4767 4768 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4769 for (; addr != end; pte++, addr += PAGE_SIZE) 4770 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4771 mc.precharge++; /* increment precharge temporarily */ 4772 pte_unmap_unlock(pte - 1, ptl); 4773 cond_resched(); 4774 4775 return 0; 4776 } 4777 4778 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4779 { 4780 unsigned long precharge; 4781 struct vm_area_struct *vma; 4782 4783 down_read(&mm->mmap_sem); 4784 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4785 struct mm_walk mem_cgroup_count_precharge_walk = { 4786 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4787 .mm = mm, 4788 .private = vma, 4789 }; 4790 if (is_vm_hugetlb_page(vma)) 4791 continue; 4792 walk_page_range(vma->vm_start, vma->vm_end, 4793 &mem_cgroup_count_precharge_walk); 4794 } 4795 up_read(&mm->mmap_sem); 4796 4797 precharge = mc.precharge; 4798 mc.precharge = 0; 4799 4800 return precharge; 4801 } 4802 4803 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4804 { 4805 unsigned long precharge = mem_cgroup_count_precharge(mm); 4806 4807 VM_BUG_ON(mc.moving_task); 4808 mc.moving_task = current; 4809 return mem_cgroup_do_precharge(precharge); 4810 } 4811 4812 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 4813 static void __mem_cgroup_clear_mc(void) 4814 { 4815 struct mem_cgroup *from = mc.from; 4816 struct mem_cgroup *to = mc.to; 4817 4818 /* we must uncharge all the leftover precharges from mc.to */ 4819 if (mc.precharge) { 4820 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4821 mc.precharge = 0; 4822 } 4823 /* 4824 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4825 * we must uncharge here. 4826 */ 4827 if (mc.moved_charge) { 4828 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4829 mc.moved_charge = 0; 4830 } 4831 /* we must fixup refcnts and charges */ 4832 if (mc.moved_swap) { 4833 /* uncharge swap account from the old cgroup */ 4834 if (!mem_cgroup_is_root(mc.from)) 4835 res_counter_uncharge(&mc.from->memsw, 4836 PAGE_SIZE * mc.moved_swap); 4837 __mem_cgroup_put(mc.from, mc.moved_swap); 4838 4839 if (!mem_cgroup_is_root(mc.to)) { 4840 /* 4841 * we charged both to->res and to->memsw, so we should 4842 * uncharge to->res. 4843 */ 4844 res_counter_uncharge(&mc.to->res, 4845 PAGE_SIZE * mc.moved_swap); 4846 } 4847 /* we've already done mem_cgroup_get(mc.to) */ 4848 mc.moved_swap = 0; 4849 } 4850 memcg_oom_recover(from); 4851 memcg_oom_recover(to); 4852 wake_up_all(&mc.waitq); 4853 } 4854 4855 static void mem_cgroup_clear_mc(void) 4856 { 4857 struct mem_cgroup *from = mc.from; 4858 4859 /* 4860 * we must clear moving_task before waking up waiters at the end of 4861 * task migration. 
4862 */ 4863 mc.moving_task = NULL; 4864 __mem_cgroup_clear_mc(); 4865 spin_lock(&mc.lock); 4866 mc.from = NULL; 4867 mc.to = NULL; 4868 spin_unlock(&mc.lock); 4869 mem_cgroup_end_move(from); 4870 } 4871 4872 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4873 struct cgroup *cgroup, 4874 struct task_struct *p, 4875 bool threadgroup) 4876 { 4877 int ret = 0; 4878 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 4879 4880 if (mem->move_charge_at_immigrate) { 4881 struct mm_struct *mm; 4882 struct mem_cgroup *from = mem_cgroup_from_task(p); 4883 4884 VM_BUG_ON(from == mem); 4885 4886 mm = get_task_mm(p); 4887 if (!mm) 4888 return 0; 4889 /* We move charges only when we move a owner of the mm */ 4890 if (mm->owner == p) { 4891 VM_BUG_ON(mc.from); 4892 VM_BUG_ON(mc.to); 4893 VM_BUG_ON(mc.precharge); 4894 VM_BUG_ON(mc.moved_charge); 4895 VM_BUG_ON(mc.moved_swap); 4896 mem_cgroup_start_move(from); 4897 spin_lock(&mc.lock); 4898 mc.from = from; 4899 mc.to = mem; 4900 spin_unlock(&mc.lock); 4901 /* We set mc.moving_task later */ 4902 4903 ret = mem_cgroup_precharge_mc(mm); 4904 if (ret) 4905 mem_cgroup_clear_mc(); 4906 } 4907 mmput(mm); 4908 } 4909 return ret; 4910 } 4911 4912 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 4913 struct cgroup *cgroup, 4914 struct task_struct *p, 4915 bool threadgroup) 4916 { 4917 mem_cgroup_clear_mc(); 4918 } 4919 4920 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 4921 unsigned long addr, unsigned long end, 4922 struct mm_walk *walk) 4923 { 4924 int ret = 0; 4925 struct vm_area_struct *vma = walk->private; 4926 pte_t *pte; 4927 spinlock_t *ptl; 4928 4929 split_huge_page_pmd(walk->mm, pmd); 4930 retry: 4931 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4932 for (; addr != end; addr += PAGE_SIZE) { 4933 pte_t ptent = *(pte++); 4934 union mc_target target; 4935 int type; 4936 struct page *page; 4937 struct page_cgroup *pc; 4938 swp_entry_t ent; 4939 4940 if (!mc.precharge) 4941 break; 4942 4943 type = is_target_pte_for_mc(vma, addr, ptent, &target); 4944 switch (type) { 4945 case MC_TARGET_PAGE: 4946 page = target.page; 4947 if (isolate_lru_page(page)) 4948 goto put; 4949 pc = lookup_page_cgroup(page); 4950 if (!mem_cgroup_move_account(pc, 4951 mc.from, mc.to, false, PAGE_SIZE)) { 4952 mc.precharge--; 4953 /* we uncharge from mc.from later. */ 4954 mc.moved_charge++; 4955 } 4956 putback_lru_page(page); 4957 put: /* is_target_pte_for_mc() gets the page */ 4958 put_page(page); 4959 break; 4960 case MC_TARGET_SWAP: 4961 ent = target.ent; 4962 if (!mem_cgroup_move_swap_account(ent, 4963 mc.from, mc.to, false)) { 4964 mc.precharge--; 4965 /* we fixup refcnts and charges later. */ 4966 mc.moved_swap++; 4967 } 4968 break; 4969 default: 4970 break; 4971 } 4972 } 4973 pte_unmap_unlock(pte - 1, ptl); 4974 cond_resched(); 4975 4976 if (addr != end) { 4977 /* 4978 * We have consumed all precharges we got in can_attach(). 4979 * We try charge one by one, but don't do any additional 4980 * charges to mc.to if we have failed in charge once in attach() 4981 * phase. 4982 */ 4983 ret = mem_cgroup_do_precharge(1); 4984 if (!ret) 4985 goto retry; 4986 } 4987 4988 return ret; 4989 } 4990 4991 static void mem_cgroup_move_charge(struct mm_struct *mm) 4992 { 4993 struct vm_area_struct *vma; 4994 4995 lru_add_drain_all(); 4996 retry: 4997 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 4998 /* 4999 * Someone who are holding the mmap_sem might be waiting in 5000 * waitq. 
So we cancel all extra charges, wake up all waiters, 5001 * and retry. Because we cancel precharges, we might not be able 5002 * to move enough charges, but moving charge is a best-effort 5003 * feature anyway, so it wouldn't be a big problem. 5004 */ 5005 __mem_cgroup_clear_mc(); 5006 cond_resched(); 5007 goto retry; 5008 } 5009 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5010 int ret; 5011 struct mm_walk mem_cgroup_move_charge_walk = { 5012 .pmd_entry = mem_cgroup_move_charge_pte_range, 5013 .mm = mm, 5014 .private = vma, 5015 }; 5016 if (is_vm_hugetlb_page(vma)) 5017 continue; 5018 ret = walk_page_range(vma->vm_start, vma->vm_end, 5019 &mem_cgroup_move_charge_walk); 5020 if (ret) 5021 /* 5022 * means we have consumed all precharges and failed in 5023 * doing additional charge. Just abandon here. 5024 */ 5025 break; 5026 } 5027 up_read(&mm->mmap_sem); 5028 } 5029 5030 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5031 struct cgroup *cont, 5032 struct cgroup *old_cont, 5033 struct task_struct *p, 5034 bool threadgroup) 5035 { 5036 struct mm_struct *mm; 5037 5038 if (!mc.to) 5039 /* no need to move charge */ 5040 return; 5041 5042 mm = get_task_mm(p); 5043 if (mm) { 5044 mem_cgroup_move_charge(mm); 5045 mmput(mm); 5046 } 5047 mem_cgroup_clear_mc(); 5048 } 5049 #else /* !CONFIG_MMU */ 5050 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5051 struct cgroup *cgroup, 5052 struct task_struct *p, 5053 bool threadgroup) 5054 { 5055 return 0; 5056 } 5057 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5058 struct cgroup *cgroup, 5059 struct task_struct *p, 5060 bool threadgroup) 5061 { 5062 } 5063 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5064 struct cgroup *cont, 5065 struct cgroup *old_cont, 5066 struct task_struct *p, 5067 bool threadgroup) 5068 { 5069 } 5070 #endif 5071 5072 struct cgroup_subsys mem_cgroup_subsys = { 5073 .name = "memory", 5074 .subsys_id = mem_cgroup_subsys_id, 5075 .create = mem_cgroup_create, 5076 .pre_destroy = mem_cgroup_pre_destroy, 5077 .destroy = mem_cgroup_destroy, 5078 .populate = mem_cgroup_populate, 5079 .can_attach = mem_cgroup_can_attach, 5080 .cancel_attach = mem_cgroup_cancel_attach, 5081 .attach = mem_cgroup_move_task, 5082 .early_init = 0, 5083 .use_id = 1, 5084 }; 5085 5086 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5087 static int __init enable_swap_account(char *s) 5088 { 5089 /* consider enabled if no parameter or 1 is given */ 5090 if (!(*s) || !strcmp(s, "=1")) 5091 really_do_swap_account = 1; 5092 else if (!strcmp(s, "=0")) 5093 really_do_swap_account = 0; 5094 return 1; 5095 } 5096 __setup("swapaccount", enable_swap_account); 5097 5098 static int __init disable_swap_account(char *s) 5099 { 5100 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); 5101 enable_swap_account("=0"); 5102 return 1; 5103 } 5104 __setup("noswapaccount", disable_swap_account); 5105 #endif 5106
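
/*
 * Illustrative usage sketch (not part of the kernel source): a minimal
 * userspace program showing how the threshold notification implemented above
 * (mem_cgroup_usage_register_event(), wired into "usage_in_bytes" through
 * .register_event) is typically driven via cgroup.event_control.  The mount
 * point /cgroup/memory/test and the 4MB threshold are assumptions made for
 * the example; the string written is "<event_fd> <fd of usage file> <threshold>".
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/eventfd.h>
 *
 *	int main(void)
 *	{
 *		char buf[64];
 *		uint64_t ticks;
 *		int efd = eventfd(0, 0);
 *		int ufd = open("/cgroup/memory/test/memory.usage_in_bytes",
 *			       O_RDONLY);
 *		int cfd = open("/cgroup/memory/test/cgroup.event_control",
 *			       O_WRONLY);
 *
 *		if (efd < 0 || ufd < 0 || cfd < 0)
 *			return 1;
 *
 *		snprintf(buf, sizeof(buf), "%d %d %d", efd, ufd, 4 * 1024 * 1024);
 *		if (write(cfd, buf, strlen(buf)) < 0)
 *			return 1;
 *
 *		if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))
 *			return 1;
 *		printf("usage threshold crossed %llu time(s)\n",
 *		       (unsigned long long)ticks);
 *		return 0;
 *	}
 */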