1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24 #include <linux/res_counter.h> 25 #include <linux/memcontrol.h> 26 #include <linux/cgroup.h> 27 #include <linux/mm.h> 28 #include <linux/hugetlb.h> 29 #include <linux/pagemap.h> 30 #include <linux/smp.h> 31 #include <linux/page-flags.h> 32 #include <linux/backing-dev.h> 33 #include <linux/bit_spinlock.h> 34 #include <linux/rcupdate.h> 35 #include <linux/limits.h> 36 #include <linux/export.h> 37 #include <linux/mutex.h> 38 #include <linux/rbtree.h> 39 #include <linux/slab.h> 40 #include <linux/swap.h> 41 #include <linux/swapops.h> 42 #include <linux/spinlock.h> 43 #include <linux/eventfd.h> 44 #include <linux/sort.h> 45 #include <linux/fs.h> 46 #include <linux/seq_file.h> 47 #include <linux/vmalloc.h> 48 #include <linux/mm_inline.h> 49 #include <linux/page_cgroup.h> 50 #include <linux/cpu.h> 51 #include <linux/oom.h> 52 #include "internal.h" 53 54 #include <asm/uaccess.h> 55 56 #include <trace/events/vmscan.h> 57 58 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 59 #define MEM_CGROUP_RECLAIM_RETRIES 5 60 struct mem_cgroup *root_mem_cgroup __read_mostly; 61 62 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 63 /* Turned on only when 
memory cgroup is enabled && really_do_swap_account = 1 */ 64 int do_swap_account __read_mostly; 65 66 /* for remember boot option*/ 67 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 68 static int really_do_swap_account __initdata = 1; 69 #else 70 static int really_do_swap_account __initdata = 0; 71 #endif 72 73 #else 74 #define do_swap_account (0) 75 #endif 76 77 78 /* 79 * Statistics for memory cgroup. 80 */ 81 enum mem_cgroup_stat_index { 82 /* 83 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 84 */ 85 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 86 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 87 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 89 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 90 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 91 MEM_CGROUP_STAT_NSTATS, 92 }; 93 94 enum mem_cgroup_events_index { 95 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 96 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 97 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 98 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 99 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 100 MEM_CGROUP_EVENTS_NSTATS, 101 }; 102 /* 103 * Per memcg event counter is incremented at every pagein/pageout. With THP, 104 * it will be incremated by the number of pages. This counter is used for 105 * for trigger some periodic events. This is straightforward and better 106 * than using jiffies etc. to handle periodic memcg event. 
107 */ 108 enum mem_cgroup_events_target { 109 MEM_CGROUP_TARGET_THRESH, 110 MEM_CGROUP_TARGET_SOFTLIMIT, 111 MEM_CGROUP_TARGET_NUMAINFO, 112 MEM_CGROUP_NTARGETS, 113 }; 114 #define THRESHOLDS_EVENTS_TARGET (128) 115 #define SOFTLIMIT_EVENTS_TARGET (1024) 116 #define NUMAINFO_EVENTS_TARGET (1024) 117 118 struct mem_cgroup_stat_cpu { 119 long count[MEM_CGROUP_STAT_NSTATS]; 120 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 121 unsigned long targets[MEM_CGROUP_NTARGETS]; 122 }; 123 124 /* 125 * per-zone information in memory controller. 126 */ 127 struct mem_cgroup_per_zone { 128 /* 129 * spin_lock to protect the per cgroup LRU 130 */ 131 struct list_head lists[NR_LRU_LISTS]; 132 unsigned long count[NR_LRU_LISTS]; 133 134 struct zone_reclaim_stat reclaim_stat; 135 struct rb_node tree_node; /* RB tree node */ 136 unsigned long long usage_in_excess;/* Set to the value by which */ 137 /* the soft limit is exceeded*/ 138 bool on_tree; 139 struct mem_cgroup *mem; /* Back pointer, we cannot */ 140 /* use container_of */ 141 }; 142 /* Macro for accessing counter */ 143 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 144 145 struct mem_cgroup_per_node { 146 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 147 }; 148 149 struct mem_cgroup_lru_info { 150 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 151 }; 152 153 /* 154 * Cgroups above their limits are maintained in a RB-Tree, independent of 155 * their hierarchy representation 156 */ 157 158 struct mem_cgroup_tree_per_zone { 159 struct rb_root rb_root; 160 spinlock_t lock; 161 }; 162 163 struct mem_cgroup_tree_per_node { 164 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 165 }; 166 167 struct mem_cgroup_tree { 168 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 169 }; 170 171 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 172 173 struct mem_cgroup_threshold { 174 struct eventfd_ctx *eventfd; 175 u64 threshold; 176 }; 177 178 /* For threshold */ 179 struct 
mem_cgroup_threshold_ary { 180 /* An array index points to threshold just below usage. */ 181 int current_threshold; 182 /* Size of entries[] */ 183 unsigned int size; 184 /* Array of thresholds */ 185 struct mem_cgroup_threshold entries[0]; 186 }; 187 188 struct mem_cgroup_thresholds { 189 /* Primary thresholds array */ 190 struct mem_cgroup_threshold_ary *primary; 191 /* 192 * Spare threshold array. 193 * This is needed to make mem_cgroup_unregister_event() "never fail". 194 * It must be able to store at least primary->size - 1 entries. 195 */ 196 struct mem_cgroup_threshold_ary *spare; 197 }; 198 199 /* for OOM */ 200 struct mem_cgroup_eventfd_list { 201 struct list_head list; 202 struct eventfd_ctx *eventfd; 203 }; 204 205 static void mem_cgroup_threshold(struct mem_cgroup *mem); 206 static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 207 208 /* 209 * The memory controller data structure. The memory controller controls both 210 * page cache and RSS per cgroup. We would eventually like to provide 211 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 212 * to help the administrator determine what knobs to tune. 213 * 214 * TODO: Add a water mark for the memory controller. Reclaim will begin when 215 * we hit the water mark. May be even add a low water mark, such that 216 * no reclaim occurs from a cgroup at it's low water mark, this is 217 * a feature that will be implemented much later in the future. 218 */ 219 struct mem_cgroup { 220 struct cgroup_subsys_state css; 221 /* 222 * the counter to account for memory usage 223 */ 224 struct res_counter res; 225 /* 226 * the counter to account for mem+swap usage. 227 */ 228 struct res_counter memsw; 229 /* 230 * Per cgroup active and inactive list, similar to the 231 * per zone LRU lists. 232 */ 233 struct mem_cgroup_lru_info info; 234 /* 235 * While reclaiming in a hierarchy, we cache the last child we 236 * reclaimed from. 
237 */ 238 int last_scanned_child; 239 int last_scanned_node; 240 #if MAX_NUMNODES > 1 241 nodemask_t scan_nodes; 242 atomic_t numainfo_events; 243 atomic_t numainfo_updating; 244 #endif 245 /* 246 * Should the accounting and control be hierarchical, per subtree? 247 */ 248 bool use_hierarchy; 249 250 bool oom_lock; 251 atomic_t under_oom; 252 253 atomic_t refcnt; 254 255 int swappiness; 256 /* OOM-Killer disable */ 257 int oom_kill_disable; 258 259 /* set when res.limit == memsw.limit */ 260 bool memsw_is_minimum; 261 262 /* protect arrays of thresholds */ 263 struct mutex thresholds_lock; 264 265 /* thresholds for memory usage. RCU-protected */ 266 struct mem_cgroup_thresholds thresholds; 267 268 /* thresholds for mem+swap usage. RCU-protected */ 269 struct mem_cgroup_thresholds memsw_thresholds; 270 271 /* For oom notifier event fd */ 272 struct list_head oom_notify; 273 274 /* 275 * Should we move charges of a task when a task is moved into this 276 * mem_cgroup ? And what type of charges should we move ? 277 */ 278 unsigned long move_charge_at_immigrate; 279 /* 280 * percpu counter. 281 */ 282 struct mem_cgroup_stat_cpu *stat; 283 /* 284 * used when a cpu is offlined or other synchronizations 285 * See mem_cgroup_read_stat(). 286 */ 287 struct mem_cgroup_stat_cpu nocpu_base; 288 spinlock_t pcp_counter_lock; 289 }; 290 291 /* Stuffs for move charges at task migration. */ 292 /* 293 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 294 * left-shifted bitmap of these types. 
295 */ 296 enum move_type { 297 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 298 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 299 NR_MOVE_TYPE, 300 }; 301 302 /* "mc" and its members are protected by cgroup_mutex */ 303 static struct move_charge_struct { 304 spinlock_t lock; /* for from, to */ 305 struct mem_cgroup *from; 306 struct mem_cgroup *to; 307 unsigned long precharge; 308 unsigned long moved_charge; 309 unsigned long moved_swap; 310 struct task_struct *moving_task; /* a task moving charges */ 311 wait_queue_head_t waitq; /* a waitq for other context */ 312 } mc = { 313 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 314 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 315 }; 316 317 static bool move_anon(void) 318 { 319 return test_bit(MOVE_CHARGE_TYPE_ANON, 320 &mc.to->move_charge_at_immigrate); 321 } 322 323 static bool move_file(void) 324 { 325 return test_bit(MOVE_CHARGE_TYPE_FILE, 326 &mc.to->move_charge_at_immigrate); 327 } 328 329 /* 330 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 331 * limit reclaim to prevent infinite loops, if they ever occur. 
332 */ 333 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 334 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 335 336 enum charge_type { 337 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 338 MEM_CGROUP_CHARGE_TYPE_MAPPED, 339 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 340 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 341 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 342 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 343 NR_CHARGE_TYPE, 344 }; 345 346 /* for encoding cft->private value on file */ 347 #define _MEM (0) 348 #define _MEMSWAP (1) 349 #define _OOM_TYPE (2) 350 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 351 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 352 #define MEMFILE_ATTR(val) ((val) & 0xffff) 353 /* Used for OOM nofiier */ 354 #define OOM_CONTROL (0) 355 356 /* 357 * Reclaim flags for mem_cgroup_hierarchical_reclaim 358 */ 359 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 360 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 361 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 362 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 363 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 364 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 365 366 static void mem_cgroup_get(struct mem_cgroup *mem); 367 static void mem_cgroup_put(struct mem_cgroup *mem); 368 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 369 static void drain_all_stock_async(struct mem_cgroup *mem); 370 371 static struct mem_cgroup_per_zone * 372 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 373 { 374 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 375 } 376 377 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 378 { 379 return &mem->css; 380 } 381 382 static struct mem_cgroup_per_zone * 383 page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 384 { 385 int nid = page_to_nid(page); 386 int zid = page_zonenum(page); 
387 388 return mem_cgroup_zoneinfo(mem, nid, zid); 389 } 390 391 static struct mem_cgroup_tree_per_zone * 392 soft_limit_tree_node_zone(int nid, int zid) 393 { 394 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 395 } 396 397 static struct mem_cgroup_tree_per_zone * 398 soft_limit_tree_from_page(struct page *page) 399 { 400 int nid = page_to_nid(page); 401 int zid = page_zonenum(page); 402 403 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 404 } 405 406 static void 407 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 408 struct mem_cgroup_per_zone *mz, 409 struct mem_cgroup_tree_per_zone *mctz, 410 unsigned long long new_usage_in_excess) 411 { 412 struct rb_node **p = &mctz->rb_root.rb_node; 413 struct rb_node *parent = NULL; 414 struct mem_cgroup_per_zone *mz_node; 415 416 if (mz->on_tree) 417 return; 418 419 mz->usage_in_excess = new_usage_in_excess; 420 if (!mz->usage_in_excess) 421 return; 422 while (*p) { 423 parent = *p; 424 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 425 tree_node); 426 if (mz->usage_in_excess < mz_node->usage_in_excess) 427 p = &(*p)->rb_left; 428 /* 429 * We can't avoid mem cgroups that are over their soft 430 * limit by the same amount 431 */ 432 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 433 p = &(*p)->rb_right; 434 } 435 rb_link_node(&mz->tree_node, parent, p); 436 rb_insert_color(&mz->tree_node, &mctz->rb_root); 437 mz->on_tree = true; 438 } 439 440 static void 441 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 442 struct mem_cgroup_per_zone *mz, 443 struct mem_cgroup_tree_per_zone *mctz) 444 { 445 if (!mz->on_tree) 446 return; 447 rb_erase(&mz->tree_node, &mctz->rb_root); 448 mz->on_tree = false; 449 } 450 451 static void 452 mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 453 struct mem_cgroup_per_zone *mz, 454 struct mem_cgroup_tree_per_zone *mctz) 455 { 456 spin_lock(&mctz->lock); 457 __mem_cgroup_remove_exceeded(mem, mz, mctz); 458 
spin_unlock(&mctz->lock); 459 } 460 461 462 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 463 { 464 unsigned long long excess; 465 struct mem_cgroup_per_zone *mz; 466 struct mem_cgroup_tree_per_zone *mctz; 467 int nid = page_to_nid(page); 468 int zid = page_zonenum(page); 469 mctz = soft_limit_tree_from_page(page); 470 471 /* 472 * Necessary to update all ancestors when hierarchy is used. 473 * because their event counter is not touched. 474 */ 475 for (; mem; mem = parent_mem_cgroup(mem)) { 476 mz = mem_cgroup_zoneinfo(mem, nid, zid); 477 excess = res_counter_soft_limit_excess(&mem->res); 478 /* 479 * We have to update the tree if mz is on RB-tree or 480 * mem is over its softlimit. 481 */ 482 if (excess || mz->on_tree) { 483 spin_lock(&mctz->lock); 484 /* if on-tree, remove it */ 485 if (mz->on_tree) 486 __mem_cgroup_remove_exceeded(mem, mz, mctz); 487 /* 488 * Insert again. mz->usage_in_excess will be updated. 489 * If excess is 0, no tree ops. 490 */ 491 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 492 spin_unlock(&mctz->lock); 493 } 494 } 495 } 496 497 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 498 { 499 int node, zone; 500 struct mem_cgroup_per_zone *mz; 501 struct mem_cgroup_tree_per_zone *mctz; 502 503 for_each_node_state(node, N_POSSIBLE) { 504 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 505 mz = mem_cgroup_zoneinfo(mem, node, zone); 506 mctz = soft_limit_tree_node_zone(node, zone); 507 mem_cgroup_remove_exceeded(mem, mz, mctz); 508 } 509 } 510 } 511 512 static struct mem_cgroup_per_zone * 513 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 514 { 515 struct rb_node *rightmost = NULL; 516 struct mem_cgroup_per_zone *mz; 517 518 retry: 519 mz = NULL; 520 rightmost = rb_last(&mctz->rb_root); 521 if (!rightmost) 522 goto done; /* Nothing to reclaim from */ 523 524 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 525 /* 526 * Remove the node now but 
someone else can add it back, 527 * we will to add it back at the end of reclaim to its correct 528 * position in the tree. 529 */ 530 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 531 if (!res_counter_soft_limit_excess(&mz->mem->res) || 532 !css_tryget(&mz->mem->css)) 533 goto retry; 534 done: 535 return mz; 536 } 537 538 static struct mem_cgroup_per_zone * 539 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 540 { 541 struct mem_cgroup_per_zone *mz; 542 543 spin_lock(&mctz->lock); 544 mz = __mem_cgroup_largest_soft_limit_node(mctz); 545 spin_unlock(&mctz->lock); 546 return mz; 547 } 548 549 /* 550 * Implementation Note: reading percpu statistics for memcg. 551 * 552 * Both of vmstat[] and percpu_counter has threshold and do periodic 553 * synchronization to implement "quick" read. There are trade-off between 554 * reading cost and precision of value. Then, we may have a chance to implement 555 * a periodic synchronizion of counter in memcg's counter. 556 * 557 * But this _read() function is used for user interface now. The user accounts 558 * memory usage by memory cgroup and he _always_ requires exact value because 559 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 560 * have to visit all online cpus and make sum. So, for now, unnecessary 561 * synchronization is not implemented. (just implemented for cpu hotplug) 562 * 563 * If there are kernel internal actions which can make use of some not-exact 564 * value, and reading all cpu value can be performance bottleneck in some 565 * common workload, threashold and synchonization as vmstat[] should be 566 * implemented. 
567 */ 568 static long mem_cgroup_read_stat(struct mem_cgroup *mem, 569 enum mem_cgroup_stat_index idx) 570 { 571 long val = 0; 572 int cpu; 573 574 get_online_cpus(); 575 for_each_online_cpu(cpu) 576 val += per_cpu(mem->stat->count[idx], cpu); 577 #ifdef CONFIG_HOTPLUG_CPU 578 spin_lock(&mem->pcp_counter_lock); 579 val += mem->nocpu_base.count[idx]; 580 spin_unlock(&mem->pcp_counter_lock); 581 #endif 582 put_online_cpus(); 583 return val; 584 } 585 586 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 587 bool charge) 588 { 589 int val = (charge) ? 1 : -1; 590 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 591 } 592 593 void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) 594 { 595 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); 596 } 597 598 void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) 599 { 600 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); 601 } 602 603 static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 604 enum mem_cgroup_events_index idx) 605 { 606 unsigned long val = 0; 607 int cpu; 608 609 for_each_online_cpu(cpu) 610 val += per_cpu(mem->stat->events[idx], cpu); 611 #ifdef CONFIG_HOTPLUG_CPU 612 spin_lock(&mem->pcp_counter_lock); 613 val += mem->nocpu_base.events[idx]; 614 spin_unlock(&mem->pcp_counter_lock); 615 #endif 616 return val; 617 } 618 619 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 620 bool file, int nr_pages) 621 { 622 preempt_disable(); 623 624 if (file) 625 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 626 else 627 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 628 629 /* pagein of a big page is an event. 
So, ignore page size */ 630 if (nr_pages > 0) 631 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 632 else { 633 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 634 nr_pages = -nr_pages; /* for event */ 635 } 636 637 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 638 639 preempt_enable(); 640 } 641 642 unsigned long 643 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, 644 unsigned int lru_mask) 645 { 646 struct mem_cgroup_per_zone *mz; 647 enum lru_list l; 648 unsigned long ret = 0; 649 650 mz = mem_cgroup_zoneinfo(mem, nid, zid); 651 652 for_each_lru(l) { 653 if (BIT(l) & lru_mask) 654 ret += MEM_CGROUP_ZSTAT(mz, l); 655 } 656 return ret; 657 } 658 659 static unsigned long 660 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, 661 int nid, unsigned int lru_mask) 662 { 663 u64 total = 0; 664 int zid; 665 666 for (zid = 0; zid < MAX_NR_ZONES; zid++) 667 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); 668 669 return total; 670 } 671 672 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, 673 unsigned int lru_mask) 674 { 675 int nid; 676 u64 total = 0; 677 678 for_each_node_state(nid, N_HIGH_MEMORY) 679 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); 680 return total; 681 } 682 683 static bool __memcg_event_check(struct mem_cgroup *mem, int target) 684 { 685 unsigned long val, next; 686 687 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 688 next = this_cpu_read(mem->stat->targets[target]); 689 /* from time_after() in jiffies.h */ 690 return ((long)next - (long)val < 0); 691 } 692 693 static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 694 { 695 unsigned long val, next; 696 697 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 698 699 switch (target) { 700 case MEM_CGROUP_TARGET_THRESH: 701 next = val + THRESHOLDS_EVENTS_TARGET; 702 break; 703 case MEM_CGROUP_TARGET_SOFTLIMIT: 704 next = val + 
SOFTLIMIT_EVENTS_TARGET; 705 break; 706 case MEM_CGROUP_TARGET_NUMAINFO: 707 next = val + NUMAINFO_EVENTS_TARGET; 708 break; 709 default: 710 return; 711 } 712 713 this_cpu_write(mem->stat->targets[target], next); 714 } 715 716 /* 717 * Check events in order. 718 * 719 */ 720 static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 721 { 722 /* threshold event is triggered in finer grain than soft limit */ 723 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 724 mem_cgroup_threshold(mem); 725 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 726 if (unlikely(__memcg_event_check(mem, 727 MEM_CGROUP_TARGET_SOFTLIMIT))) { 728 mem_cgroup_update_tree(mem, page); 729 __mem_cgroup_target_update(mem, 730 MEM_CGROUP_TARGET_SOFTLIMIT); 731 } 732 #if MAX_NUMNODES > 1 733 if (unlikely(__memcg_event_check(mem, 734 MEM_CGROUP_TARGET_NUMAINFO))) { 735 atomic_inc(&mem->numainfo_events); 736 __mem_cgroup_target_update(mem, 737 MEM_CGROUP_TARGET_NUMAINFO); 738 } 739 #endif 740 } 741 } 742 743 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 744 { 745 return container_of(cgroup_subsys_state(cont, 746 mem_cgroup_subsys_id), struct mem_cgroup, 747 css); 748 } 749 750 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 751 { 752 /* 753 * mm_update_next_owner() may clear mm->owner to NULL 754 * if it races with swapoff, page migration, etc. 755 * So this can be called with p == NULL. 756 */ 757 if (unlikely(!p)) 758 return NULL; 759 760 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 761 struct mem_cgroup, css); 762 } 763 764 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 765 { 766 struct mem_cgroup *mem = NULL; 767 768 if (!mm) 769 return NULL; 770 /* 771 * Because we have no locks, mm->owner's may be being moved to other 772 * cgroup. We use css_tryget() here even if this looks 773 * pessimistic (rather than adding locks here). 
774 */ 775 rcu_read_lock(); 776 do { 777 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 778 if (unlikely(!mem)) 779 break; 780 } while (!css_tryget(&mem->css)); 781 rcu_read_unlock(); 782 return mem; 783 } 784 785 /* The caller has to guarantee "mem" exists before calling this */ 786 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 787 { 788 struct cgroup_subsys_state *css; 789 int found; 790 791 if (!mem) /* ROOT cgroup has the smallest ID */ 792 return root_mem_cgroup; /*css_put/get against root is ignored*/ 793 if (!mem->use_hierarchy) { 794 if (css_tryget(&mem->css)) 795 return mem; 796 return NULL; 797 } 798 rcu_read_lock(); 799 /* 800 * searching a memory cgroup which has the smallest ID under given 801 * ROOT cgroup. (ID >= 1) 802 */ 803 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 804 if (css && css_tryget(css)) 805 mem = container_of(css, struct mem_cgroup, css); 806 else 807 mem = NULL; 808 rcu_read_unlock(); 809 return mem; 810 } 811 812 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 813 struct mem_cgroup *root, 814 bool cond) 815 { 816 int nextid = css_id(&iter->css) + 1; 817 int found; 818 int hierarchy_used; 819 struct cgroup_subsys_state *css; 820 821 hierarchy_used = iter->use_hierarchy; 822 823 css_put(&iter->css); 824 /* If no ROOT, walk all, ignore hierarchy */ 825 if (!cond || (root && !hierarchy_used)) 826 return NULL; 827 828 if (!root) 829 root = root_mem_cgroup; 830 831 do { 832 iter = NULL; 833 rcu_read_lock(); 834 835 css = css_get_next(&mem_cgroup_subsys, nextid, 836 &root->css, &found); 837 if (css && css_tryget(css)) 838 iter = container_of(css, struct mem_cgroup, css); 839 rcu_read_unlock(); 840 /* If css is NULL, no more cgroups will be found */ 841 nextid = found + 1; 842 } while (css && !iter); 843 844 return iter; 845 } 846 /* 847 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please 848 * be careful that "break" loop is not allowed. 
We have reference count. 849 * Instead of that modify "cond" to be false and "continue" to exit the loop. 850 */ 851 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ 852 for (iter = mem_cgroup_start_loop(root);\ 853 iter != NULL;\ 854 iter = mem_cgroup_get_next(iter, root, cond)) 855 856 #define for_each_mem_cgroup_tree(iter, root) \ 857 for_each_mem_cgroup_tree_cond(iter, root, true) 858 859 #define for_each_mem_cgroup_all(iter) \ 860 for_each_mem_cgroup_tree_cond(iter, NULL, true) 861 862 863 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 864 { 865 return (mem == root_mem_cgroup); 866 } 867 868 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 869 { 870 struct mem_cgroup *mem; 871 872 if (!mm) 873 return; 874 875 rcu_read_lock(); 876 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 877 if (unlikely(!mem)) 878 goto out; 879 880 switch (idx) { 881 case PGMAJFAULT: 882 mem_cgroup_pgmajfault(mem, 1); 883 break; 884 case PGFAULT: 885 mem_cgroup_pgfault(mem, 1); 886 break; 887 default: 888 BUG(); 889 } 890 out: 891 rcu_read_unlock(); 892 } 893 EXPORT_SYMBOL(mem_cgroup_count_vm_event); 894 895 /* 896 * Following LRU functions are allowed to be used without PCG_LOCK. 897 * Operations are called by routine of global LRU independently from memcg. 898 * What we have to take care of here is validness of pc->mem_cgroup. 899 * 900 * Changes to pc->mem_cgroup happens when 901 * 1. charge 902 * 2. moving account 903 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 904 * It is added to LRU before charge. 905 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 906 * When moving account, the page is not on LRU. It's isolated. 
907 */ 908 909 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 910 { 911 struct page_cgroup *pc; 912 struct mem_cgroup_per_zone *mz; 913 914 if (mem_cgroup_disabled()) 915 return; 916 pc = lookup_page_cgroup(page); 917 /* can happen while we handle swapcache. */ 918 if (!TestClearPageCgroupAcctLRU(pc)) 919 return; 920 VM_BUG_ON(!pc->mem_cgroup); 921 /* 922 * We don't check PCG_USED bit. It's cleared when the "page" is finally 923 * removed from global LRU. 924 */ 925 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 926 /* huge page split is done under lru_lock. so, we have no races. */ 927 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 928 if (mem_cgroup_is_root(pc->mem_cgroup)) 929 return; 930 VM_BUG_ON(list_empty(&pc->lru)); 931 list_del_init(&pc->lru); 932 } 933 934 void mem_cgroup_del_lru(struct page *page) 935 { 936 mem_cgroup_del_lru_list(page, page_lru(page)); 937 } 938 939 /* 940 * Writeback is about to end against a page which has been marked for immediate 941 * reclaim. If it still appears to be reclaimable, move it to the tail of the 942 * inactive list. 943 */ 944 void mem_cgroup_rotate_reclaimable_page(struct page *page) 945 { 946 struct mem_cgroup_per_zone *mz; 947 struct page_cgroup *pc; 948 enum lru_list lru = page_lru(page); 949 950 if (mem_cgroup_disabled()) 951 return; 952 953 pc = lookup_page_cgroup(page); 954 /* unused or root page is not rotated. */ 955 if (!PageCgroupUsed(pc)) 956 return; 957 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 958 smp_rmb(); 959 if (mem_cgroup_is_root(pc->mem_cgroup)) 960 return; 961 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 962 list_move_tail(&pc->lru, &mz->lists[lru]); 963 } 964 965 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 966 { 967 struct mem_cgroup_per_zone *mz; 968 struct page_cgroup *pc; 969 970 if (mem_cgroup_disabled()) 971 return; 972 973 pc = lookup_page_cgroup(page); 974 /* unused or root page is not rotated. 
*/ 975 if (!PageCgroupUsed(pc)) 976 return; 977 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 978 smp_rmb(); 979 if (mem_cgroup_is_root(pc->mem_cgroup)) 980 return; 981 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 982 list_move(&pc->lru, &mz->lists[lru]); 983 } 984 985 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 986 { 987 struct page_cgroup *pc; 988 struct mem_cgroup_per_zone *mz; 989 990 if (mem_cgroup_disabled()) 991 return; 992 pc = lookup_page_cgroup(page); 993 VM_BUG_ON(PageCgroupAcctLRU(pc)); 994 if (!PageCgroupUsed(pc)) 995 return; 996 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 997 smp_rmb(); 998 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 999 /* huge page split is done under lru_lock. so, we have no races. */ 1000 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1001 SetPageCgroupAcctLRU(pc); 1002 if (mem_cgroup_is_root(pc->mem_cgroup)) 1003 return; 1004 list_add(&pc->lru, &mz->lists[lru]); 1005 } 1006 1007 /* 1008 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 1009 * while it's linked to lru because the page may be reused after it's fully 1010 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. 1011 * It's done under lock_page and expected that zone->lru_lock isnever held. 1012 */ 1013 static void mem_cgroup_lru_del_before_commit(struct page *page) 1014 { 1015 unsigned long flags; 1016 struct zone *zone = page_zone(page); 1017 struct page_cgroup *pc = lookup_page_cgroup(page); 1018 1019 /* 1020 * Doing this check without taking ->lru_lock seems wrong but this 1021 * is safe. Because if page_cgroup's USED bit is unset, the page 1022 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 1023 * set, the commit after this will fail, anyway. 1024 * This all charge/uncharge is done under some mutual execustion. 1025 * So, we don't need to taking care of changes in USED bit. 
1026 */ 1027 if (likely(!PageLRU(page))) 1028 return; 1029 1030 spin_lock_irqsave(&zone->lru_lock, flags); 1031 /* 1032 * Forget old LRU when this page_cgroup is *not* used. This Used bit 1033 * is guarded by lock_page() because the page is SwapCache. 1034 */ 1035 if (!PageCgroupUsed(pc)) 1036 mem_cgroup_del_lru_list(page, page_lru(page)); 1037 spin_unlock_irqrestore(&zone->lru_lock, flags); 1038 } 1039 1040 static void mem_cgroup_lru_add_after_commit(struct page *page) 1041 { 1042 unsigned long flags; 1043 struct zone *zone = page_zone(page); 1044 struct page_cgroup *pc = lookup_page_cgroup(page); 1045 1046 /* taking care of that the page is added to LRU while we commit it */ 1047 if (likely(!PageLRU(page))) 1048 return; 1049 spin_lock_irqsave(&zone->lru_lock, flags); 1050 /* link when the page is linked to LRU but page_cgroup isn't */ 1051 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1052 mem_cgroup_add_lru_list(page, page_lru(page)); 1053 spin_unlock_irqrestore(&zone->lru_lock, flags); 1054 } 1055 1056 1057 void mem_cgroup_move_lists(struct page *page, 1058 enum lru_list from, enum lru_list to) 1059 { 1060 if (mem_cgroup_disabled()) 1061 return; 1062 mem_cgroup_del_lru_list(page, from); 1063 mem_cgroup_add_lru_list(page, to); 1064 } 1065 1066 /* 1067 * Checks whether given mem is same or in the root_mem's 1068 * hierarchy subtree 1069 */ 1070 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1071 struct mem_cgroup *mem) 1072 { 1073 if (root_mem != mem) { 1074 return (root_mem->use_hierarchy && 1075 css_is_ancestor(&mem->css, &root_mem->css)); 1076 } 1077 1078 return true; 1079 } 1080 1081 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1082 { 1083 int ret; 1084 struct mem_cgroup *curr = NULL; 1085 struct task_struct *p; 1086 1087 p = find_lock_task_mm(task); 1088 if (!p) 1089 return 0; 1090 curr = try_get_mem_cgroup_from_mm(p->mm); 1091 task_unlock(p); 1092 if (!curr) 1093 return 0; 1094 /* 1095 * We should 
check use_hierarchy of "mem" not "curr". Because checking 1096 * use_hierarchy of "curr" here make this function true if hierarchy is 1097 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1098 * hierarchy(even if use_hierarchy is disabled in "mem"). 1099 */ 1100 ret = mem_cgroup_same_or_subtree(mem, curr); 1101 css_put(&curr->css); 1102 return ret; 1103 } 1104 1105 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1106 { 1107 unsigned long active; 1108 unsigned long inactive; 1109 unsigned long gb; 1110 unsigned long inactive_ratio; 1111 1112 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1113 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1114 1115 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1116 if (gb) 1117 inactive_ratio = int_sqrt(10 * gb); 1118 else 1119 inactive_ratio = 1; 1120 1121 if (present_pages) { 1122 present_pages[0] = inactive; 1123 present_pages[1] = active; 1124 } 1125 1126 return inactive_ratio; 1127 } 1128 1129 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1130 { 1131 unsigned long active; 1132 unsigned long inactive; 1133 unsigned long present_pages[2]; 1134 unsigned long inactive_ratio; 1135 1136 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1137 1138 inactive = present_pages[0]; 1139 active = present_pages[1]; 1140 1141 if (inactive * inactive_ratio < active) 1142 return 1; 1143 1144 return 0; 1145 } 1146 1147 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1148 { 1149 unsigned long active; 1150 unsigned long inactive; 1151 1152 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1153 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1154 1155 return (active > inactive); 1156 } 1157 1158 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1159 struct zone *zone) 1160 { 1161 int nid = zone_to_nid(zone); 1162 int zid = zone_idx(zone); 1163 struct 
mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1164 1165 return &mz->reclaim_stat; 1166 } 1167 1168 struct zone_reclaim_stat * 1169 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1170 { 1171 struct page_cgroup *pc; 1172 struct mem_cgroup_per_zone *mz; 1173 1174 if (mem_cgroup_disabled()) 1175 return NULL; 1176 1177 pc = lookup_page_cgroup(page); 1178 if (!PageCgroupUsed(pc)) 1179 return NULL; 1180 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1181 smp_rmb(); 1182 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1183 return &mz->reclaim_stat; 1184 } 1185 1186 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1187 struct list_head *dst, 1188 unsigned long *scanned, int order, 1189 int mode, struct zone *z, 1190 struct mem_cgroup *mem_cont, 1191 int active, int file) 1192 { 1193 unsigned long nr_taken = 0; 1194 struct page *page; 1195 unsigned long scan; 1196 LIST_HEAD(pc_list); 1197 struct list_head *src; 1198 struct page_cgroup *pc, *tmp; 1199 int nid = zone_to_nid(z); 1200 int zid = zone_idx(z); 1201 struct mem_cgroup_per_zone *mz; 1202 int lru = LRU_FILE * file + active; 1203 int ret; 1204 1205 BUG_ON(!mem_cont); 1206 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1207 src = &mz->lists[lru]; 1208 1209 scan = 0; 1210 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1211 if (scan >= nr_to_scan) 1212 break; 1213 1214 if (unlikely(!PageCgroupUsed(pc))) 1215 continue; 1216 1217 page = lookup_cgroup_page(pc); 1218 1219 if (unlikely(!PageLRU(page))) 1220 continue; 1221 1222 scan++; 1223 ret = __isolate_lru_page(page, mode, file); 1224 switch (ret) { 1225 case 0: 1226 list_move(&page->lru, dst); 1227 mem_cgroup_del_lru(page); 1228 nr_taken += hpage_nr_pages(page); 1229 break; 1230 case -EBUSY: 1231 /* we don't affect global LRU but rotate in our LRU */ 1232 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1233 break; 1234 default: 1235 break; 1236 } 1237 } 1238 1239 *scanned = scan; 1240 1241 
trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1242 0, 0, 0, mode); 1243 1244 return nr_taken; 1245 } 1246 1247 #define mem_cgroup_from_res_counter(counter, member) \ 1248 container_of(counter, struct mem_cgroup, member) 1249 1250 /** 1251 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1252 * @mem: the memory cgroup 1253 * 1254 * Returns the maximum amount of memory @mem can be charged with, in 1255 * pages. 1256 */ 1257 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1258 { 1259 unsigned long long margin; 1260 1261 margin = res_counter_margin(&mem->res); 1262 if (do_swap_account) 1263 margin = min(margin, res_counter_margin(&mem->memsw)); 1264 return margin >> PAGE_SHIFT; 1265 } 1266 1267 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1268 { 1269 struct cgroup *cgrp = memcg->css.cgroup; 1270 1271 /* root ? */ 1272 if (cgrp->parent == NULL) 1273 return vm_swappiness; 1274 1275 return memcg->swappiness; 1276 } 1277 1278 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1279 { 1280 int cpu; 1281 1282 get_online_cpus(); 1283 spin_lock(&mem->pcp_counter_lock); 1284 for_each_online_cpu(cpu) 1285 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1286 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1287 spin_unlock(&mem->pcp_counter_lock); 1288 put_online_cpus(); 1289 1290 synchronize_rcu(); 1291 } 1292 1293 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1294 { 1295 int cpu; 1296 1297 if (!mem) 1298 return; 1299 get_online_cpus(); 1300 spin_lock(&mem->pcp_counter_lock); 1301 for_each_online_cpu(cpu) 1302 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1303 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1304 spin_unlock(&mem->pcp_counter_lock); 1305 put_online_cpus(); 1306 } 1307 /* 1308 * 2 routines for checking "mem" is under move_account() or not. 1309 * 1310 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1311 * for avoiding race in accounting. 
If true, 1312 * pc->mem_cgroup may be overwritten. 1313 * 1314 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1315 * under hierarchy of moving cgroups. This is for 1316 * waiting at hith-memory prressure caused by "move". 1317 */ 1318 1319 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1320 { 1321 VM_BUG_ON(!rcu_read_lock_held()); 1322 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1323 } 1324 1325 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1326 { 1327 struct mem_cgroup *from; 1328 struct mem_cgroup *to; 1329 bool ret = false; 1330 /* 1331 * Unlike task_move routines, we access mc.to, mc.from not under 1332 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1333 */ 1334 spin_lock(&mc.lock); 1335 from = mc.from; 1336 to = mc.to; 1337 if (!from) 1338 goto unlock; 1339 1340 ret = mem_cgroup_same_or_subtree(mem, from) 1341 || mem_cgroup_same_or_subtree(mem, to); 1342 unlock: 1343 spin_unlock(&mc.lock); 1344 return ret; 1345 } 1346 1347 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1348 { 1349 if (mc.moving_task && current != mc.moving_task) { 1350 if (mem_cgroup_under_move(mem)) { 1351 DEFINE_WAIT(wait); 1352 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1353 /* moving charge context might have finished. */ 1354 if (mc.moving_task) 1355 schedule(); 1356 finish_wait(&mc.waitq, &wait); 1357 return true; 1358 } 1359 } 1360 return false; 1361 } 1362 1363 /** 1364 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1365 * @memcg: The memory cgroup that went over limit 1366 * @p: Task that is going to be killed 1367 * 1368 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1369 * enabled 1370 */ 1371 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1372 { 1373 struct cgroup *task_cgrp; 1374 struct cgroup *mem_cgrp; 1375 /* 1376 * Need a buffer in BSS, can't rely on allocations. 
The code relies 1377 * on the assumption that OOM is serialized for memory controller. 1378 * If this assumption is broken, revisit this code. 1379 */ 1380 static char memcg_name[PATH_MAX]; 1381 int ret; 1382 1383 if (!memcg || !p) 1384 return; 1385 1386 1387 rcu_read_lock(); 1388 1389 mem_cgrp = memcg->css.cgroup; 1390 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1391 1392 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1393 if (ret < 0) { 1394 /* 1395 * Unfortunately, we are unable to convert to a useful name 1396 * But we'll still print out the usage information 1397 */ 1398 rcu_read_unlock(); 1399 goto done; 1400 } 1401 rcu_read_unlock(); 1402 1403 printk(KERN_INFO "Task in %s killed", memcg_name); 1404 1405 rcu_read_lock(); 1406 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1407 if (ret < 0) { 1408 rcu_read_unlock(); 1409 goto done; 1410 } 1411 rcu_read_unlock(); 1412 1413 /* 1414 * Continues from above, so we don't need an KERN_ level 1415 */ 1416 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1417 done: 1418 1419 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1420 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1421 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1422 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1423 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1424 "failcnt %llu\n", 1425 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1426 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1427 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1428 } 1429 1430 /* 1431 * This function returns the number of memcg under hierarchy tree. Returns 1432 * 1(self count) if no children. 1433 */ 1434 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1435 { 1436 int num = 0; 1437 struct mem_cgroup *iter; 1438 1439 for_each_mem_cgroup_tree(iter, mem) 1440 num++; 1441 return num; 1442 } 1443 1444 /* 1445 * Return the memory (and swap, if configured) limit for a memcg. 
1446 */ 1447 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1448 { 1449 u64 limit; 1450 u64 memsw; 1451 1452 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1453 limit += total_swap_pages << PAGE_SHIFT; 1454 1455 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1456 /* 1457 * If memsw is finite and limits the amount of swap space available 1458 * to this memcg, return that limit. 1459 */ 1460 return min(limit, memsw); 1461 } 1462 1463 /* 1464 * Visit the first child (need not be the first child as per the ordering 1465 * of the cgroup list, since we track last_scanned_child) of @mem and use 1466 * that to reclaim free pages from. 1467 */ 1468 static struct mem_cgroup * 1469 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1470 { 1471 struct mem_cgroup *ret = NULL; 1472 struct cgroup_subsys_state *css; 1473 int nextid, found; 1474 1475 if (!root_mem->use_hierarchy) { 1476 css_get(&root_mem->css); 1477 ret = root_mem; 1478 } 1479 1480 while (!ret) { 1481 rcu_read_lock(); 1482 nextid = root_mem->last_scanned_child + 1; 1483 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1484 &found); 1485 if (css && css_tryget(css)) 1486 ret = container_of(css, struct mem_cgroup, css); 1487 1488 rcu_read_unlock(); 1489 /* Updates scanning parameter */ 1490 if (!css) { 1491 /* this means start scan from ID:1 */ 1492 root_mem->last_scanned_child = 0; 1493 } else 1494 root_mem->last_scanned_child = found; 1495 } 1496 1497 return ret; 1498 } 1499 1500 /** 1501 * test_mem_cgroup_node_reclaimable 1502 * @mem: the target memcg 1503 * @nid: the node ID to be checked. 1504 * @noswap : specify true here if the user wants flle only information. 1505 * 1506 * This function returns whether the specified memcg contains any 1507 * reclaimable pages on a node. Returns true if there are any reclaimable 1508 * pages in the node. 
1509 */ 1510 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1511 int nid, bool noswap) 1512 { 1513 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1514 return true; 1515 if (noswap || !total_swap_pages) 1516 return false; 1517 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1518 return true; 1519 return false; 1520 1521 } 1522 #if MAX_NUMNODES > 1 1523 1524 /* 1525 * Always updating the nodemask is not very good - even if we have an empty 1526 * list or the wrong list here, we can start from some node and traverse all 1527 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1528 * 1529 */ 1530 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1531 { 1532 int nid; 1533 /* 1534 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1535 * pagein/pageout changes since the last update. 1536 */ 1537 if (!atomic_read(&mem->numainfo_events)) 1538 return; 1539 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1540 return; 1541 1542 /* make a nodemask where this memcg uses memory from */ 1543 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1544 1545 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1546 1547 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1548 node_clear(nid, mem->scan_nodes); 1549 } 1550 1551 atomic_set(&mem->numainfo_events, 0); 1552 atomic_set(&mem->numainfo_updating, 0); 1553 } 1554 1555 /* 1556 * Selecting a node where we start reclaim from. Because what we need is just 1557 * reducing usage counter, start from anywhere is O,K. Considering 1558 * memory reclaim from current node, there are pros. and cons. 1559 * 1560 * Freeing memory from current node means freeing memory from a node which 1561 * we'll use or we've used. So, it may make LRU bad. And if several threads 1562 * hit limits, it will see a contention on a node. But freeing from remote 1563 * node means more costs for memory reclaim because of memory latency. 
1564 * 1565 * Now, we use round-robin. Better algorithm is welcomed. 1566 */ 1567 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1568 { 1569 int node; 1570 1571 mem_cgroup_may_update_nodemask(mem); 1572 node = mem->last_scanned_node; 1573 1574 node = next_node(node, mem->scan_nodes); 1575 if (node == MAX_NUMNODES) 1576 node = first_node(mem->scan_nodes); 1577 /* 1578 * We call this when we hit limit, not when pages are added to LRU. 1579 * No LRU may hold pages because all pages are UNEVICTABLE or 1580 * memcg is too small and all pages are not on LRU. In that case, 1581 * we use curret node. 1582 */ 1583 if (unlikely(node == MAX_NUMNODES)) 1584 node = numa_node_id(); 1585 1586 mem->last_scanned_node = node; 1587 return node; 1588 } 1589 1590 /* 1591 * Check all nodes whether it contains reclaimable pages or not. 1592 * For quick scan, we make use of scan_nodes. This will allow us to skip 1593 * unused nodes. But scan_nodes is lazily updated and may not cotain 1594 * enough new information. We need to do double check. 1595 */ 1596 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1597 { 1598 int nid; 1599 1600 /* 1601 * quick check...making use of scan_node. 1602 * We can skip unused nodes. 1603 */ 1604 if (!nodes_empty(mem->scan_nodes)) { 1605 for (nid = first_node(mem->scan_nodes); 1606 nid < MAX_NUMNODES; 1607 nid = next_node(nid, mem->scan_nodes)) { 1608 1609 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1610 return true; 1611 } 1612 } 1613 /* 1614 * Check rest of nodes. 
1615 */ 1616 for_each_node_state(nid, N_HIGH_MEMORY) { 1617 if (node_isset(nid, mem->scan_nodes)) 1618 continue; 1619 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1620 return true; 1621 } 1622 return false; 1623 } 1624 1625 #else 1626 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1627 { 1628 return 0; 1629 } 1630 1631 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1632 { 1633 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1634 } 1635 #endif 1636 1637 /* 1638 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1639 * we reclaimed from, so that we don't end up penalizing one child extensively 1640 * based on its position in the children list. 1641 * 1642 * root_mem is the original ancestor that we've been reclaim from. 1643 * 1644 * We give up and return to the caller when we visit root_mem twice. 1645 * (other groups can be removed while we're walking....) 1646 * 1647 * If shrink==true, for avoiding to free too much, this returns immedieately. 1648 */ 1649 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1650 struct zone *zone, 1651 gfp_t gfp_mask, 1652 unsigned long reclaim_options, 1653 unsigned long *total_scanned) 1654 { 1655 struct mem_cgroup *victim; 1656 int ret, total = 0; 1657 int loop = 0; 1658 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1659 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1660 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1661 unsigned long excess; 1662 unsigned long nr_scanned; 1663 1664 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1665 1666 /* If memsw_is_minimum==1, swap-out is of-no-use. 
*/ 1667 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1668 noswap = true; 1669 1670 while (1) { 1671 victim = mem_cgroup_select_victim(root_mem); 1672 if (victim == root_mem) { 1673 loop++; 1674 /* 1675 * We are not draining per cpu cached charges during 1676 * soft limit reclaim because global reclaim doesn't 1677 * care about charges. It tries to free some memory and 1678 * charges will not give any. 1679 */ 1680 if (!check_soft && loop >= 1) 1681 drain_all_stock_async(root_mem); 1682 if (loop >= 2) { 1683 /* 1684 * If we have not been able to reclaim 1685 * anything, it might because there are 1686 * no reclaimable pages under this hierarchy 1687 */ 1688 if (!check_soft || !total) { 1689 css_put(&victim->css); 1690 break; 1691 } 1692 /* 1693 * We want to do more targeted reclaim. 1694 * excess >> 2 is not to excessive so as to 1695 * reclaim too much, nor too less that we keep 1696 * coming back to reclaim from this cgroup 1697 */ 1698 if (total >= (excess >> 2) || 1699 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1700 css_put(&victim->css); 1701 break; 1702 } 1703 } 1704 } 1705 if (!mem_cgroup_reclaimable(victim, noswap)) { 1706 /* this cgroup's local usage == 0 */ 1707 css_put(&victim->css); 1708 continue; 1709 } 1710 /* we use swappiness of local cgroup */ 1711 if (check_soft) { 1712 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1713 noswap, zone, &nr_scanned); 1714 *total_scanned += nr_scanned; 1715 } else 1716 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1717 noswap); 1718 css_put(&victim->css); 1719 /* 1720 * At shrinking usage, we can't check we should stop here or 1721 * reclaim more. It's depends on callers. last_scanned_child 1722 * will work enough for keeping fairness under tree. 
1723 */ 1724 if (shrink) 1725 return ret; 1726 total += ret; 1727 if (check_soft) { 1728 if (!res_counter_soft_limit_excess(&root_mem->res)) 1729 return total; 1730 } else if (mem_cgroup_margin(root_mem)) 1731 return total; 1732 } 1733 return total; 1734 } 1735 1736 /* 1737 * Check OOM-Killer is already running under our hierarchy. 1738 * If someone is running, return false. 1739 * Has to be called with memcg_oom_lock 1740 */ 1741 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1742 { 1743 struct mem_cgroup *iter, *failed = NULL; 1744 bool cond = true; 1745 1746 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1747 if (iter->oom_lock) { 1748 /* 1749 * this subtree of our hierarchy is already locked 1750 * so we cannot give a lock. 1751 */ 1752 failed = iter; 1753 cond = false; 1754 } else 1755 iter->oom_lock = true; 1756 } 1757 1758 if (!failed) 1759 return true; 1760 1761 /* 1762 * OK, we failed to lock the whole subtree so we have to clean up 1763 * what we set up to the failing subtree 1764 */ 1765 cond = true; 1766 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1767 if (iter == failed) { 1768 cond = false; 1769 continue; 1770 } 1771 iter->oom_lock = false; 1772 } 1773 return false; 1774 } 1775 1776 /* 1777 * Has to be called with memcg_oom_lock 1778 */ 1779 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1780 { 1781 struct mem_cgroup *iter; 1782 1783 for_each_mem_cgroup_tree(iter, mem) 1784 iter->oom_lock = false; 1785 return 0; 1786 } 1787 1788 static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1789 { 1790 struct mem_cgroup *iter; 1791 1792 for_each_mem_cgroup_tree(iter, mem) 1793 atomic_inc(&iter->under_oom); 1794 } 1795 1796 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1797 { 1798 struct mem_cgroup *iter; 1799 1800 /* 1801 * When a new child is created while the hierarchy is under oom, 1802 * mem_cgroup_oom_lock() may not be called. We have to use 1803 * atomic_add_unless() here. 
1804 */ 1805 for_each_mem_cgroup_tree(iter, mem) 1806 atomic_add_unless(&iter->under_oom, -1, 0); 1807 } 1808 1809 static DEFINE_SPINLOCK(memcg_oom_lock); 1810 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1811 1812 struct oom_wait_info { 1813 struct mem_cgroup *mem; 1814 wait_queue_t wait; 1815 }; 1816 1817 static int memcg_oom_wake_function(wait_queue_t *wait, 1818 unsigned mode, int sync, void *arg) 1819 { 1820 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1821 *oom_wait_mem; 1822 struct oom_wait_info *oom_wait_info; 1823 1824 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1825 oom_wait_mem = oom_wait_info->mem; 1826 1827 /* 1828 * Both of oom_wait_info->mem and wake_mem are stable under us. 1829 * Then we can use css_is_ancestor without taking care of RCU. 1830 */ 1831 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1832 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1833 return 0; 1834 return autoremove_wake_function(wait, mode, sync, arg); 1835 } 1836 1837 static void memcg_wakeup_oom(struct mem_cgroup *mem) 1838 { 1839 /* for filtering, pass "mem" as argument. */ 1840 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1841 } 1842 1843 static void memcg_oom_recover(struct mem_cgroup *mem) 1844 { 1845 if (mem && atomic_read(&mem->under_oom)) 1846 memcg_wakeup_oom(mem); 1847 } 1848 1849 /* 1850 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 
1851 */ 1852 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1853 { 1854 struct oom_wait_info owait; 1855 bool locked, need_to_kill; 1856 1857 owait.mem = mem; 1858 owait.wait.flags = 0; 1859 owait.wait.func = memcg_oom_wake_function; 1860 owait.wait.private = current; 1861 INIT_LIST_HEAD(&owait.wait.task_list); 1862 need_to_kill = true; 1863 mem_cgroup_mark_under_oom(mem); 1864 1865 /* At first, try to OOM lock hierarchy under mem.*/ 1866 spin_lock(&memcg_oom_lock); 1867 locked = mem_cgroup_oom_lock(mem); 1868 /* 1869 * Even if signal_pending(), we can't quit charge() loop without 1870 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1871 * under OOM is always welcomed, use TASK_KILLABLE here. 1872 */ 1873 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1874 if (!locked || mem->oom_kill_disable) 1875 need_to_kill = false; 1876 if (locked) 1877 mem_cgroup_oom_notify(mem); 1878 spin_unlock(&memcg_oom_lock); 1879 1880 if (need_to_kill) { 1881 finish_wait(&memcg_oom_waitq, &owait.wait); 1882 mem_cgroup_out_of_memory(mem, mask); 1883 } else { 1884 schedule(); 1885 finish_wait(&memcg_oom_waitq, &owait.wait); 1886 } 1887 spin_lock(&memcg_oom_lock); 1888 if (locked) 1889 mem_cgroup_oom_unlock(mem); 1890 memcg_wakeup_oom(mem); 1891 spin_unlock(&memcg_oom_lock); 1892 1893 mem_cgroup_unmark_under_oom(mem); 1894 1895 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1896 return false; 1897 /* Give chance to dying process */ 1898 schedule_timeout(1); 1899 return true; 1900 } 1901 1902 /* 1903 * Currently used to update mapped file statistics, but the routine can be 1904 * generalized to update other statistics as well. 1905 * 1906 * Notes: Race condition 1907 * 1908 * We usually use page_cgroup_lock() for accessing page_cgroup member but 1909 * it tends to be costly. But considering some conditions, we doesn't need 1910 * to do so _always_. 
1911 * 1912 * Considering "charge", lock_page_cgroup() is not required because all 1913 * file-stat operations happen after a page is attached to radix-tree. There 1914 * are no race with "charge". 1915 * 1916 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1917 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1918 * if there are race with "uncharge". Statistics itself is properly handled 1919 * by flags. 1920 * 1921 * Considering "move", this is an only case we see a race. To make the race 1922 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1923 * possibility of race condition. If there is, we take a lock. 1924 */ 1925 1926 void mem_cgroup_update_page_stat(struct page *page, 1927 enum mem_cgroup_page_stat_item idx, int val) 1928 { 1929 struct mem_cgroup *mem; 1930 struct page_cgroup *pc = lookup_page_cgroup(page); 1931 bool need_unlock = false; 1932 unsigned long uninitialized_var(flags); 1933 1934 if (unlikely(!pc)) 1935 return; 1936 1937 rcu_read_lock(); 1938 mem = pc->mem_cgroup; 1939 if (unlikely(!mem || !PageCgroupUsed(pc))) 1940 goto out; 1941 /* pc->mem_cgroup is unstable ? 
*/ 1942 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1943 /* take a lock against to access pc->mem_cgroup */ 1944 move_lock_page_cgroup(pc, &flags); 1945 need_unlock = true; 1946 mem = pc->mem_cgroup; 1947 if (!mem || !PageCgroupUsed(pc)) 1948 goto out; 1949 } 1950 1951 switch (idx) { 1952 case MEMCG_NR_FILE_MAPPED: 1953 if (val > 0) 1954 SetPageCgroupFileMapped(pc); 1955 else if (!page_mapped(page)) 1956 ClearPageCgroupFileMapped(pc); 1957 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1958 break; 1959 default: 1960 BUG(); 1961 } 1962 1963 this_cpu_add(mem->stat->count[idx], val); 1964 1965 out: 1966 if (unlikely(need_unlock)) 1967 move_unlock_page_cgroup(pc, &flags); 1968 rcu_read_unlock(); 1969 return; 1970 } 1971 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 1972 1973 /* 1974 * size of first charge trial. "32" comes from vmscan.c's magic value. 1975 * TODO: maybe necessary to use big numbers in big irons. 1976 */ 1977 #define CHARGE_BATCH 32U 1978 struct memcg_stock_pcp { 1979 struct mem_cgroup *cached; /* this never be root cgroup */ 1980 unsigned int nr_pages; 1981 struct work_struct work; 1982 unsigned long flags; 1983 #define FLUSHING_CACHED_CHARGE (0) 1984 }; 1985 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1986 static DEFINE_MUTEX(percpu_charge_mutex); 1987 1988 /* 1989 * Try to consume stocked charge on this cpu. If success, one page is consumed 1990 * from local stock and true is returned. If the stock is 0 or charges from a 1991 * cgroup which is not current target, returns false. This stock will be 1992 * refilled. 
1993 */ 1994 static bool consume_stock(struct mem_cgroup *mem) 1995 { 1996 struct memcg_stock_pcp *stock; 1997 bool ret = true; 1998 1999 stock = &get_cpu_var(memcg_stock); 2000 if (mem == stock->cached && stock->nr_pages) 2001 stock->nr_pages--; 2002 else /* need to call res_counter_charge */ 2003 ret = false; 2004 put_cpu_var(memcg_stock); 2005 return ret; 2006 } 2007 2008 /* 2009 * Returns stocks cached in percpu to res_counter and reset cached information. 2010 */ 2011 static void drain_stock(struct memcg_stock_pcp *stock) 2012 { 2013 struct mem_cgroup *old = stock->cached; 2014 2015 if (stock->nr_pages) { 2016 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2017 2018 res_counter_uncharge(&old->res, bytes); 2019 if (do_swap_account) 2020 res_counter_uncharge(&old->memsw, bytes); 2021 stock->nr_pages = 0; 2022 } 2023 stock->cached = NULL; 2024 } 2025 2026 /* 2027 * This must be called under preempt disabled or must be called by 2028 * a thread which is pinned to local cpu. 2029 */ 2030 static void drain_local_stock(struct work_struct *dummy) 2031 { 2032 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2033 drain_stock(stock); 2034 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2035 } 2036 2037 /* 2038 * Cache charges(val) which is from res_counter, to local per_cpu area. 2039 * This will be consumed by consume_stock() function, later. 2040 */ 2041 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2042 { 2043 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2044 2045 if (stock->cached != mem) { /* reset if necessary */ 2046 drain_stock(stock); 2047 stock->cached = mem; 2048 } 2049 stock->nr_pages += nr_pages; 2050 put_cpu_var(memcg_stock); 2051 } 2052 2053 /* 2054 * Drains all per-CPU charge caches for given root_mem resp. subtree 2055 * of the hierarchy under it. sync flag says whether we should block 2056 * until the work is done. 
2057 */ 2058 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2059 { 2060 int cpu, curcpu; 2061 2062 /* Notify other cpus that system-wide "drain" is running */ 2063 get_online_cpus(); 2064 curcpu = get_cpu(); 2065 for_each_online_cpu(cpu) { 2066 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2067 struct mem_cgroup *mem; 2068 2069 mem = stock->cached; 2070 if (!mem || !stock->nr_pages) 2071 continue; 2072 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2073 continue; 2074 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2075 if (cpu == curcpu) 2076 drain_local_stock(&stock->work); 2077 else 2078 schedule_work_on(cpu, &stock->work); 2079 } 2080 } 2081 put_cpu(); 2082 2083 if (!sync) 2084 goto out; 2085 2086 for_each_online_cpu(cpu) { 2087 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2088 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2089 flush_work(&stock->work); 2090 } 2091 out: 2092 put_online_cpus(); 2093 } 2094 2095 /* 2096 * Tries to drain stocked charges in other cpus. This function is asynchronous 2097 * and just put a work per cpu for draining localy on each cpu. Caller can 2098 * expects some charges will be back to res_counter later but cannot wait for 2099 * it. 2100 */ 2101 static void drain_all_stock_async(struct mem_cgroup *root_mem) 2102 { 2103 /* 2104 * If someone calls draining, avoid adding more kworker runs. 2105 */ 2106 if (!mutex_trylock(&percpu_charge_mutex)) 2107 return; 2108 drain_all_stock(root_mem, false); 2109 mutex_unlock(&percpu_charge_mutex); 2110 } 2111 2112 /* This is a synchronous drain interface. */ 2113 static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2114 { 2115 /* called when force_empty is called */ 2116 mutex_lock(&percpu_charge_mutex); 2117 drain_all_stock(root_mem, true); 2118 mutex_unlock(&percpu_charge_mutex); 2119 } 2120 2121 /* 2122 * This function drains percpu counter value from DEAD cpu and 2123 * move it to local cpu. 
Note that this function can be preempted. 2124 */ 2125 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2126 { 2127 int i; 2128 2129 spin_lock(&mem->pcp_counter_lock); 2130 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2131 long x = per_cpu(mem->stat->count[i], cpu); 2132 2133 per_cpu(mem->stat->count[i], cpu) = 0; 2134 mem->nocpu_base.count[i] += x; 2135 } 2136 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2137 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2138 2139 per_cpu(mem->stat->events[i], cpu) = 0; 2140 mem->nocpu_base.events[i] += x; 2141 } 2142 /* need to clear ON_MOVE value, works as a kind of lock. */ 2143 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2144 spin_unlock(&mem->pcp_counter_lock); 2145 } 2146 2147 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2148 { 2149 int idx = MEM_CGROUP_ON_MOVE; 2150 2151 spin_lock(&mem->pcp_counter_lock); 2152 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2153 spin_unlock(&mem->pcp_counter_lock); 2154 } 2155 2156 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2157 unsigned long action, 2158 void *hcpu) 2159 { 2160 int cpu = (unsigned long)hcpu; 2161 struct memcg_stock_pcp *stock; 2162 struct mem_cgroup *iter; 2163 2164 if ((action == CPU_ONLINE)) { 2165 for_each_mem_cgroup_all(iter) 2166 synchronize_mem_cgroup_on_move(iter, cpu); 2167 return NOTIFY_OK; 2168 } 2169 2170 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2171 return NOTIFY_OK; 2172 2173 for_each_mem_cgroup_all(iter) 2174 mem_cgroup_drain_pcp_counter(iter, cpu); 2175 2176 stock = &per_cpu(memcg_stock, cpu); 2177 drain_stock(stock); 2178 return NOTIFY_OK; 2179 } 2180 2181 2182 /* See __mem_cgroup_try_charge() for details */ 2183 enum { 2184 CHARGE_OK, /* success */ 2185 CHARGE_RETRY, /* need to retry but retry is not bad */ 2186 CHARGE_NOMEM, /* we can't do more. 
return -ENOMEM */ 2187 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2188 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2189 }; 2190 2191 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2192 unsigned int nr_pages, bool oom_check) 2193 { 2194 unsigned long csize = nr_pages * PAGE_SIZE; 2195 struct mem_cgroup *mem_over_limit; 2196 struct res_counter *fail_res; 2197 unsigned long flags = 0; 2198 int ret; 2199 2200 ret = res_counter_charge(&mem->res, csize, &fail_res); 2201 2202 if (likely(!ret)) { 2203 if (!do_swap_account) 2204 return CHARGE_OK; 2205 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2206 if (likely(!ret)) 2207 return CHARGE_OK; 2208 2209 res_counter_uncharge(&mem->res, csize); 2210 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2211 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2212 } else 2213 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2214 /* 2215 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2216 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2217 * 2218 * Never reclaim on behalf of optional batching, retry with a 2219 * single page instead. 2220 */ 2221 if (nr_pages == CHARGE_BATCH) 2222 return CHARGE_RETRY; 2223 2224 if (!(gfp_mask & __GFP_WAIT)) 2225 return CHARGE_WOULDBLOCK; 2226 2227 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2228 gfp_mask, flags, NULL); 2229 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2230 return CHARGE_RETRY; 2231 /* 2232 * Even though the limit is exceeded at this point, reclaim 2233 * may have been able to free some pages. Retry the charge 2234 * before killing the task. 2235 * 2236 * Only for regular pages, though: huge pages are rather 2237 * unlikely to succeed so close to the limit, and we fall back 2238 * to regular pages anyway in case of failure. 
2239 */ 2240 if (nr_pages == 1 && ret) 2241 return CHARGE_RETRY; 2242 2243 /* 2244 * At task move, charge accounts can be doubly counted. So, it's 2245 * better to wait until the end of task_move if something is going on. 2246 */ 2247 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2248 return CHARGE_RETRY; 2249 2250 /* If we don't need to call oom-killer at el, return immediately */ 2251 if (!oom_check) 2252 return CHARGE_NOMEM; 2253 /* check OOM */ 2254 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2255 return CHARGE_OOM_DIE; 2256 2257 return CHARGE_RETRY; 2258 } 2259 2260 /* 2261 * Unlike exported interface, "oom" parameter is added. if oom==true, 2262 * oom-killer can be invoked. 2263 */ 2264 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2265 gfp_t gfp_mask, 2266 unsigned int nr_pages, 2267 struct mem_cgroup **memcg, 2268 bool oom) 2269 { 2270 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2271 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2272 struct mem_cgroup *mem = NULL; 2273 int ret; 2274 2275 /* 2276 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2277 * in system level. So, allow to go ahead dying process in addition to 2278 * MEMDIE process. 2279 */ 2280 if (unlikely(test_thread_flag(TIF_MEMDIE) 2281 || fatal_signal_pending(current))) 2282 goto bypass; 2283 2284 /* 2285 * We always charge the cgroup the mm_struct belongs to. 2286 * The mm_struct's mem_cgroup changes on task migration if the 2287 * thread group leader migrates. It's possible that mm is not 2288 * set, if so charge the init_mm (happens for pagecache usage). 
2289 */ 2290 if (!*memcg && !mm) 2291 goto bypass; 2292 again: 2293 if (*memcg) { /* css should be a valid one */ 2294 mem = *memcg; 2295 VM_BUG_ON(css_is_removed(&mem->css)); 2296 if (mem_cgroup_is_root(mem)) 2297 goto done; 2298 if (nr_pages == 1 && consume_stock(mem)) 2299 goto done; 2300 css_get(&mem->css); 2301 } else { 2302 struct task_struct *p; 2303 2304 rcu_read_lock(); 2305 p = rcu_dereference(mm->owner); 2306 /* 2307 * Because we don't have task_lock(), "p" can exit. 2308 * In that case, "mem" can point to root or p can be NULL with 2309 * race with swapoff. Then, we have small risk of mis-accouning. 2310 * But such kind of mis-account by race always happens because 2311 * we don't have cgroup_mutex(). It's overkill and we allo that 2312 * small race, here. 2313 * (*) swapoff at el will charge against mm-struct not against 2314 * task-struct. So, mm->owner can be NULL. 2315 */ 2316 mem = mem_cgroup_from_task(p); 2317 if (!mem || mem_cgroup_is_root(mem)) { 2318 rcu_read_unlock(); 2319 goto done; 2320 } 2321 if (nr_pages == 1 && consume_stock(mem)) { 2322 /* 2323 * It seems dagerous to access memcg without css_get(). 2324 * But considering how consume_stok works, it's not 2325 * necessary. If consume_stock success, some charges 2326 * from this memcg are cached on this cpu. So, we 2327 * don't need to call css_get()/css_tryget() before 2328 * calling consume_stock(). 2329 */ 2330 rcu_read_unlock(); 2331 goto done; 2332 } 2333 /* after here, we may be blocked. 
we need to get refcnt */ 2334 if (!css_tryget(&mem->css)) { 2335 rcu_read_unlock(); 2336 goto again; 2337 } 2338 rcu_read_unlock(); 2339 } 2340 2341 do { 2342 bool oom_check; 2343 2344 /* If killed, bypass charge */ 2345 if (fatal_signal_pending(current)) { 2346 css_put(&mem->css); 2347 goto bypass; 2348 } 2349 2350 oom_check = false; 2351 if (oom && !nr_oom_retries) { 2352 oom_check = true; 2353 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2354 } 2355 2356 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2357 switch (ret) { 2358 case CHARGE_OK: 2359 break; 2360 case CHARGE_RETRY: /* not in OOM situation but retry */ 2361 batch = nr_pages; 2362 css_put(&mem->css); 2363 mem = NULL; 2364 goto again; 2365 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2366 css_put(&mem->css); 2367 goto nomem; 2368 case CHARGE_NOMEM: /* OOM routine works */ 2369 if (!oom) { 2370 css_put(&mem->css); 2371 goto nomem; 2372 } 2373 /* If oom, we never return -ENOMEM */ 2374 nr_oom_retries--; 2375 break; 2376 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2377 css_put(&mem->css); 2378 goto bypass; 2379 } 2380 } while (ret != CHARGE_OK); 2381 2382 if (batch > nr_pages) 2383 refill_stock(mem, batch - nr_pages); 2384 css_put(&mem->css); 2385 done: 2386 *memcg = mem; 2387 return 0; 2388 nomem: 2389 *memcg = NULL; 2390 return -ENOMEM; 2391 bypass: 2392 *memcg = NULL; 2393 return 0; 2394 } 2395 2396 /* 2397 * Somemtimes we have to undo a charge we got by try_charge(). 2398 * This function is for that and do uncharge, put css's refcnt. 2399 * gotten by try_charge(). 2400 */ 2401 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2402 unsigned int nr_pages) 2403 { 2404 if (!mem_cgroup_is_root(mem)) { 2405 unsigned long bytes = nr_pages * PAGE_SIZE; 2406 2407 res_counter_uncharge(&mem->res, bytes); 2408 if (do_swap_account) 2409 res_counter_uncharge(&mem->memsw, bytes); 2410 } 2411 } 2412 2413 /* 2414 * A helper function to get mem_cgroup from ID. 
must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if
 * that is a concern. (dropping refcnt from swap can be called against removed
 * memcg.)
 *
 * Returns NULL for id 0 or when css_lookup() finds nothing.
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

/*
 * Find the mem_cgroup a locked page is accounted to and take a css reference
 * on it. For a page_cgroup marked Used this is pc->mem_cgroup; otherwise, for
 * a SwapCache page, it is the cgroup recorded in swap_cgroup for the entry.
 *
 * Returns NULL when no cgroup is found or css_tryget() fails.
 */
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

/*
 * Bind an already-charged page to @mem: record the owner in @pc, set the
 * Cache/Used bits according to @ctype and update the statistics. If @pc is
 * already marked Used, the charge of @nr_pages is cancelled instead.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				       struct page *page,
				       unsigned int nr_pages,
				       struct page_cgroup *pc,
				       enum charge_type ctype)
{
	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		__mem_cgroup_cancel_charge(mem, nr_pages);
		return;
	}
	/*
	 * we don't need page_cgroup_lock about tail pages, because they are not
	 * accessed by any other context at this point.
	 */
	pc->mem_cgroup = mem;
	/*
	 * We access a page_cgroup asynchronously without lock_page_cgroup().
	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
	 * before USED bit, we need memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}

	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
	unlock_page_cgroup(pc);
	/*
	 * "charge_statistics" updated event counter. Then, check it.
	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
	 * if they exceeds softlimit.
	 */
	memcg_check_events(mem, page);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/* page_cgroup flag bits that are not copied from head to tail at split. */
#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
/*
 * Because tail pages are not marked as "used", set it. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock.
 */
void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
	unsigned long flags;

	if (mem_cgroup_disabled())
		return;
	/*
	 * We have no races with charge/uncharge but will have races with
	 * page state accounting.
	 */
	move_lock_page_cgroup(head_pc, &flags);

	tail_pc->mem_cgroup = head_pc->mem_cgroup;
	smp_wmb(); /* see __commit_charge() */
	if (PageCgroupAcctLRU(head_pc)) {
		enum lru_list lru;
		struct mem_cgroup_per_zone *mz;

		/*
		 * LRU flags cannot be copied because we need to add tail
		 * page to LRU by generic call and our hook will be called.
		 * We hold lru_lock, then, reduce counter directly.
		 */
		lru = page_lru(head);
		mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	}
	/* Inherit the head's flags except the bits listed above. */
	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	move_unlock_page_cgroup(head_pc, &flags);
}
#endif

/**
 * mem_cgroup_move_account - move account of the page
 * @page:	the page
 * @nr_pages:	number of regular pages (>1 for huge pages)
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 * @uncharge: whether we should call uncharge and css_put against @from.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" nor css_get to new cgroup. It should be
 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
 * true, this function does "uncharge" from old cgroup, but it doesn't if
 * @uncharge is false, so a caller should do "uncharge".
 *
 * Returns 0 on success, -EBUSY if @nr_pages > 1 but the page is no longer a
 * transparent huge page, -EINVAL if @pc is not in use or not owned by @from.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to,
				   bool uncharge)
{
	unsigned long flags;
	int ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON(PageLRU(page));
	/*
	 * The page is isolated from LRU. So, collapse function
	 * will not handle this page. But page splitting can happen.
	 * Do this check under compound_page_lock(). The caller should
	 * hold it.
	 */
	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;

	lock_page_cgroup(pc);

	ret = -EINVAL;
	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
		goto unlock;

	move_lock_page_cgroup(pc, &flags);

	if (PageCgroupFileMapped(pc)) {
		/* Update mapped_file data for mem_cgroup */
		preempt_disable();
		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
	if (uncharge)
		/* This is not "cancel", but cancel_charge does all we need. */
		__mem_cgroup_cancel_charge(from, nr_pages);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
	/*
	 * We charges against "to" which may not have any tasks. Then, "to"
	 * can be under rmdir(). But in current implementation, caller of
	 * this function is just force_empty() and move charge, so it's
	 * guaranteed that "to" is never removed. So, we don't check rmdir
	 * status here.
	 */
	move_unlock_page_cgroup(pc, &flags);
	ret = 0;
unlock:
	unlock_page_cgroup(pc);
	/*
	 * check events
	 */
	memcg_check_events(to, page);
	memcg_check_events(from, page);
out:
	return ret;
}

/*
 * move charges to its parent.
 *
 * Charges the parent of @child for the page, then moves the account of @page
 * from @child to that parent. Returns -EINVAL when @child has no parent
 * cgroup and -EBUSY when the page cannot be pinned or isolated from the LRU;
 * otherwise the result of the charge / move is returned.
 */

static int mem_cgroup_move_parent(struct page *page,
				  struct page_cgroup *pc,
				  struct mem_cgroup *child,
				  gfp_t gfp_mask)
{
	struct cgroup *cg = child->css.cgroup;
	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	unsigned long uninitialized_var(flags);
	int ret;

	/* Is ROOT ? */
	if (!pcg)
		return -EINVAL;

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
		goto out;
	if (isolate_lru_page(page))
		goto put;

	nr_pages = hpage_nr_pages(page);

	parent = mem_cgroup_from_cont(pcg);
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
	if (ret || !parent)
		goto put_back;

	/* mem_cgroup_move_account() wants compound_lock for huge pages. */
	if (nr_pages > 1)
		flags = compound_lock_irqsave(page);

	ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
	if (ret)
		__mem_cgroup_cancel_charge(parent, nr_pages);

	if (nr_pages > 1)
		compound_unlock_irqrestore(page, flags);
put_back:
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	bool oom = true;
	int ret;

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON(!PageTransHuge(page));
		/*
		 * Never OOM-kill a process for a huge page.  The
		 * fault handler will fall back to regular pages.
		 */
		oom = false;
	}

	pc = lookup_page_cgroup(page);
	BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */

	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
	/* !mem with ret == 0 means the charge was bypassed: nothing to commit. */
	if (ret || !mem)
		return ret;

	__mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
	return 0;
}

/*
 * Charge a newly mapped anonymous page as MEM_CGROUP_CHARGE_TYPE_MAPPED.
 * Pages that are already mapped, and page cache pages, are not charged here.
 */
int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has address_space.
	 * But page->mapping may have out-of-use anon_vma pointer,
	 * detect it by PageAnon() check. newly-mapped-anon's page->mapping
	 * is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
					enum charge_type ctype);

/*
 * Commit a single-page charge for a page that may already sit on an LRU:
 * the commit is bracketed by the lru_del_before_commit / lru_add_after_commit
 * hooks.
 */
static void
__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
					enum charge_type ctype)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);
	/*
	 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
	 * is already on LRU. It means the page may on some other page_cgroup's
	 * LRU. Take care of it.
	 */
	mem_cgroup_lru_del_before_commit(page);
	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
	mem_cgroup_lru_add_after_commit(page);
	return;
}

/*
 * Charge a page cache or shmem page. Compound pages are not charged here.
 * File cache is charged as TYPE_CACHE; everything else is treated as shmem
 * and charged as TYPE_SHMEM, through the swap-in path when the page is in
 * SwapCache.
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;
	if (PageCompound(page))
		return 0;

	if (unlikely(!mm))
		mm = &init_mm;

	if (page_is_file_cache(page)) {
		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
		if (ret || !mem)
			return ret;

		/*
		 * FUSE reuses pages without going through the final
		 * put that would remove them from the LRU list, make
		 * sure that they get relinked properly.
		 */
		__mem_cgroup_commit_charge_lrucare(page, mem,
					MEM_CGROUP_CHARGE_TYPE_CACHE);
		return ret;
	}
	/* shmem */
	if (PageSwapCache(page)) {
		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
		if (!ret)
			__mem_cgroup_commit_charge_swapin(page, mem,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);
	} else
		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_SHMEM);

	return ret;
}

/*
 * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
 * struct page_cgroup is acquired.
This refcnt will be consumed by 2805 * "commit()" or removed by "cancel()" 2806 */ 2807 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2808 struct page *page, 2809 gfp_t mask, struct mem_cgroup **ptr) 2810 { 2811 struct mem_cgroup *mem; 2812 int ret; 2813 2814 *ptr = NULL; 2815 2816 if (mem_cgroup_disabled()) 2817 return 0; 2818 2819 if (!do_swap_account) 2820 goto charge_cur_mm; 2821 /* 2822 * A racing thread's fault, or swapoff, may have already updated 2823 * the pte, and even removed page from swap cache: in those cases 2824 * do_swap_page()'s pte_same() test will fail; but there's also a 2825 * KSM case which does need to charge the page. 2826 */ 2827 if (!PageSwapCache(page)) 2828 goto charge_cur_mm; 2829 mem = try_get_mem_cgroup_from_page(page); 2830 if (!mem) 2831 goto charge_cur_mm; 2832 *ptr = mem; 2833 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2834 css_put(&mem->css); 2835 return ret; 2836 charge_cur_mm: 2837 if (unlikely(!mm)) 2838 mm = &init_mm; 2839 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2840 } 2841 2842 static void 2843 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2844 enum charge_type ctype) 2845 { 2846 if (mem_cgroup_disabled()) 2847 return; 2848 if (!ptr) 2849 return; 2850 cgroup_exclude_rmdir(&ptr->css); 2851 2852 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2853 /* 2854 * Now swap is on-memory. This means this page may be 2855 * counted both as mem and swap....double count. 2856 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2857 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2858 * may call delete_from_swap_cache() before reach here. 
2859 */ 2860 if (do_swap_account && PageSwapCache(page)) { 2861 swp_entry_t ent = {.val = page_private(page)}; 2862 unsigned short id; 2863 struct mem_cgroup *memcg; 2864 2865 id = swap_cgroup_record(ent, 0); 2866 rcu_read_lock(); 2867 memcg = mem_cgroup_lookup(id); 2868 if (memcg) { 2869 /* 2870 * This recorded memcg can be obsolete one. So, avoid 2871 * calling css_tryget 2872 */ 2873 if (!mem_cgroup_is_root(memcg)) 2874 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2875 mem_cgroup_swap_statistics(memcg, false); 2876 mem_cgroup_put(memcg); 2877 } 2878 rcu_read_unlock(); 2879 } 2880 /* 2881 * At swapin, we may charge account against cgroup which has no tasks. 2882 * So, rmdir()->pre_destroy() can be called while we do this charge. 2883 * In that case, we need to call pre_destroy() again. check it here. 2884 */ 2885 cgroup_release_and_wakeup_rmdir(&ptr->css); 2886 } 2887 2888 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2889 { 2890 __mem_cgroup_commit_charge_swapin(page, ptr, 2891 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2892 } 2893 2894 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2895 { 2896 if (mem_cgroup_disabled()) 2897 return; 2898 if (!mem) 2899 return; 2900 __mem_cgroup_cancel_charge(mem, 1); 2901 } 2902 2903 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2904 unsigned int nr_pages, 2905 const enum charge_type ctype) 2906 { 2907 struct memcg_batch_info *batch = NULL; 2908 bool uncharge_memsw = true; 2909 2910 /* If swapout, usage of swap doesn't decrease */ 2911 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2912 uncharge_memsw = false; 2913 2914 batch = ¤t->memcg_batch; 2915 /* 2916 * In usual, we do css_get() when we remember memcg pointer. 2917 * But in this case, we keep res->usage until end of a series of 2918 * uncharges. Then, it's ok to ignore memcg's refcnt. 
2919 */ 2920 if (!batch->memcg) 2921 batch->memcg = mem; 2922 /* 2923 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2924 * In those cases, all pages freed continuously can be expected to be in 2925 * the same cgroup and we have chance to coalesce uncharges. 2926 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2927 * because we want to do uncharge as soon as possible. 2928 */ 2929 2930 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2931 goto direct_uncharge; 2932 2933 if (nr_pages > 1) 2934 goto direct_uncharge; 2935 2936 /* 2937 * In typical case, batch->memcg == mem. This means we can 2938 * merge a series of uncharges to an uncharge of res_counter. 2939 * If not, we uncharge res_counter ony by one. 2940 */ 2941 if (batch->memcg != mem) 2942 goto direct_uncharge; 2943 /* remember freed charge and uncharge it later */ 2944 batch->nr_pages++; 2945 if (uncharge_memsw) 2946 batch->memsw_nr_pages++; 2947 return; 2948 direct_uncharge: 2949 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2950 if (uncharge_memsw) 2951 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2952 if (unlikely(batch->memcg != mem)) 2953 memcg_oom_recover(mem); 2954 return; 2955 } 2956 2957 /* 2958 * uncharge if !page_mapped(page) 2959 */ 2960 static struct mem_cgroup * 2961 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2962 { 2963 struct mem_cgroup *mem = NULL; 2964 unsigned int nr_pages = 1; 2965 struct page_cgroup *pc; 2966 2967 if (mem_cgroup_disabled()) 2968 return NULL; 2969 2970 if (PageSwapCache(page)) 2971 return NULL; 2972 2973 if (PageTransHuge(page)) { 2974 nr_pages <<= compound_order(page); 2975 VM_BUG_ON(!PageTransHuge(page)); 2976 } 2977 /* 2978 * Check if our page_cgroup is valid 2979 */ 2980 pc = lookup_page_cgroup(page); 2981 if (unlikely(!pc || !PageCgroupUsed(pc))) 2982 return NULL; 2983 2984 lock_page_cgroup(pc); 2985 2986 mem = pc->mem_cgroup; 2987 2988 if (!PageCgroupUsed(pc)) 2989 goto 
unlock_out; 2990 2991 switch (ctype) { 2992 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2993 case MEM_CGROUP_CHARGE_TYPE_DROP: 2994 /* See mem_cgroup_prepare_migration() */ 2995 if (page_mapped(page) || PageCgroupMigration(pc)) 2996 goto unlock_out; 2997 break; 2998 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2999 if (!PageAnon(page)) { /* Shared memory */ 3000 if (page->mapping && !page_is_file_cache(page)) 3001 goto unlock_out; 3002 } else if (page_mapped(page)) /* Anon */ 3003 goto unlock_out; 3004 break; 3005 default: 3006 break; 3007 } 3008 3009 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3010 3011 ClearPageCgroupUsed(pc); 3012 /* 3013 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3014 * freed from LRU. This is safe because uncharged page is expected not 3015 * to be reused (freed soon). Exception is SwapCache, it's handled by 3016 * special functions. 3017 */ 3018 3019 unlock_page_cgroup(pc); 3020 /* 3021 * even after unlock, we have mem->res.usage here and this memcg 3022 * will never be freed. 3023 */ 3024 memcg_check_events(mem, page); 3025 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3026 mem_cgroup_swap_statistics(mem, true); 3027 mem_cgroup_get(mem); 3028 } 3029 if (!mem_cgroup_is_root(mem)) 3030 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3031 3032 return mem; 3033 3034 unlock_out: 3035 unlock_page_cgroup(pc); 3036 return NULL; 3037 } 3038 3039 void mem_cgroup_uncharge_page(struct page *page) 3040 { 3041 /* early check. */ 3042 if (page_mapped(page)) 3043 return; 3044 if (page->mapping && !PageAnon(page)) 3045 return; 3046 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3047 } 3048 3049 void mem_cgroup_uncharge_cache_page(struct page *page) 3050 { 3051 VM_BUG_ON(page_mapped(page)); 3052 VM_BUG_ON(page->mapping); 3053 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3054 } 3055 3056 /* 3057 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 
3058 * In that cases, pages are freed continuously and we can expect pages 3059 * are in the same memcg. All these calls itself limits the number of 3060 * pages freed at once, then uncharge_start/end() is called properly. 3061 * This may be called prural(2) times in a context, 3062 */ 3063 3064 void mem_cgroup_uncharge_start(void) 3065 { 3066 current->memcg_batch.do_batch++; 3067 /* We can do nest. */ 3068 if (current->memcg_batch.do_batch == 1) { 3069 current->memcg_batch.memcg = NULL; 3070 current->memcg_batch.nr_pages = 0; 3071 current->memcg_batch.memsw_nr_pages = 0; 3072 } 3073 } 3074 3075 void mem_cgroup_uncharge_end(void) 3076 { 3077 struct memcg_batch_info *batch = ¤t->memcg_batch; 3078 3079 if (!batch->do_batch) 3080 return; 3081 3082 batch->do_batch--; 3083 if (batch->do_batch) /* If stacked, do nothing. */ 3084 return; 3085 3086 if (!batch->memcg) 3087 return; 3088 /* 3089 * This "batch->memcg" is valid without any css_get/put etc... 3090 * bacause we hide charges behind us. 3091 */ 3092 if (batch->nr_pages) 3093 res_counter_uncharge(&batch->memcg->res, 3094 batch->nr_pages * PAGE_SIZE); 3095 if (batch->memsw_nr_pages) 3096 res_counter_uncharge(&batch->memcg->memsw, 3097 batch->memsw_nr_pages * PAGE_SIZE); 3098 memcg_oom_recover(batch->memcg); 3099 /* forget this pointer (for sanity check) */ 3100 batch->memcg = NULL; 3101 } 3102 3103 #ifdef CONFIG_SWAP 3104 /* 3105 * called after __delete_from_swap_cache() and drop "page" account. 3106 * memcg information is recorded to swap_cgroup of "ent" 3107 */ 3108 void 3109 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3110 { 3111 struct mem_cgroup *memcg; 3112 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3113 3114 if (!swapout) /* this was a swap cache but the swap is unused ! 
*/ 3115 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3116 3117 memcg = __mem_cgroup_uncharge_common(page, ctype); 3118 3119 /* 3120 * record memcg information, if swapout && memcg != NULL, 3121 * mem_cgroup_get() was called in uncharge(). 3122 */ 3123 if (do_swap_account && swapout && memcg) 3124 swap_cgroup_record(ent, css_id(&memcg->css)); 3125 } 3126 #endif 3127 3128 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3129 /* 3130 * called from swap_entry_free(). remove record in swap_cgroup and 3131 * uncharge "memsw" account. 3132 */ 3133 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3134 { 3135 struct mem_cgroup *memcg; 3136 unsigned short id; 3137 3138 if (!do_swap_account) 3139 return; 3140 3141 id = swap_cgroup_record(ent, 0); 3142 rcu_read_lock(); 3143 memcg = mem_cgroup_lookup(id); 3144 if (memcg) { 3145 /* 3146 * We uncharge this because swap is freed. 3147 * This memcg can be obsolete one. We avoid calling css_tryget 3148 */ 3149 if (!mem_cgroup_is_root(memcg)) 3150 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3151 mem_cgroup_swap_statistics(memcg, false); 3152 mem_cgroup_put(memcg); 3153 } 3154 rcu_read_unlock(); 3155 } 3156 3157 /** 3158 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3159 * @entry: swap entry to be moved 3160 * @from: mem_cgroup which the entry is moved from 3161 * @to: mem_cgroup which the entry is moved to 3162 * @need_fixup: whether we should fixup res_counters and refcounts. 3163 * 3164 * It succeeds only when the swap_cgroup's record for this entry is the same 3165 * as the mem_cgroup's id of @from. 3166 * 3167 * Returns 0 on success, -EINVAL on failure. 3168 * 3169 * The caller must have charged to @to, IOW, called res_counter_charge() about 3170 * both res and memsw, and called css_get(). 
3171 */ 3172 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3173 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3174 { 3175 unsigned short old_id, new_id; 3176 3177 old_id = css_id(&from->css); 3178 new_id = css_id(&to->css); 3179 3180 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3181 mem_cgroup_swap_statistics(from, false); 3182 mem_cgroup_swap_statistics(to, true); 3183 /* 3184 * This function is only called from task migration context now. 3185 * It postpones res_counter and refcount handling till the end 3186 * of task migration(mem_cgroup_clear_mc()) for performance 3187 * improvement. But we cannot postpone mem_cgroup_get(to) 3188 * because if the process that has been moved to @to does 3189 * swap-in, the refcount of @to might be decreased to 0. 3190 */ 3191 mem_cgroup_get(to); 3192 if (need_fixup) { 3193 if (!mem_cgroup_is_root(from)) 3194 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3195 mem_cgroup_put(from); 3196 /* 3197 * we charged both to->res and to->memsw, so we should 3198 * uncharge to->res. 3199 */ 3200 if (!mem_cgroup_is_root(to)) 3201 res_counter_uncharge(&to->res, PAGE_SIZE); 3202 } 3203 return 0; 3204 } 3205 return -EINVAL; 3206 } 3207 #else 3208 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3209 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3210 { 3211 return -EINVAL; 3212 } 3213 #endif 3214 3215 /* 3216 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3217 * page belongs to. 
 */
/*
 * @page:     the page being migrated (the "old" page)
 * @newpage:  the migration target
 * @ptr:      out parameter; reset to NULL on entry and pointed at the
 *            charged mem_cgroup once the try-charge succeeds
 * @gfp_mask: handed to __mem_cgroup_try_charge()
 *
 * Returns 0 with *ptr == NULL when memcg is disabled or @page carries no
 * charge, 0 with *ptr set after the extra charge has been committed to
 * @newpage's page_cgroup, and -ENOMEM when the charge attempt fails.
 */
int mem_cgroup_prepare_migration(struct page *page,
	struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	enum charge_type ctype;
	int ret = 0;

	*ptr = NULL;

	VM_BUG_ON(PageTransHuge(page));
	if (mem_cgroup_disabled())
		return 0;

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		/* Extra css reference; dropped right after the try-charge. */
		css_get(&mem->css);
		/*
		 * At migrating an anonymous page, its mapcount goes down
		 * to 0 and uncharge() will be called. But, even if it's fully
		 * unmapped, migration may fail and this page has to be
		 * charged again. We set MIGRATION flag here and delay uncharge
		 * until end_migration() is called
		 *
		 * Corner Case Thinking
		 * A)
		 * When the old page was mapped as Anon and it's unmap-and-freed
		 * while migration was ongoing.
		 * If unmap finds the old page, uncharge() of it will be delayed
		 * until end_migration(). If unmap finds a new page, it's
		 * uncharged when it make mapcount to be 1->0. If unmap code
		 * finds swap_migration_entry, the new page will not be mapped
		 * and end_migration() will find it(mapcount==0).
		 *
		 * B)
		 * When the old page was mapped but migration fails, the kernel
		 * remaps it. A charge for it is kept by MIGRATION flag even
		 * if mapcount goes down to 0. We can do remap successfully
		 * without charging it again.
		 *
		 * C)
		 * The "old" page is under lock_page() until the end of
		 * migration, so, the old page itself will not be swapped-out.
		 * If the new page is swapped out before end_migration, our
		 * hook to usual swap-out path will catch the event.
		 */
		if (PageAnon(page))
			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);
	/*
	 * If the page is not charged at this point,
	 * we return here.
	 */
	if (!mem)
		return 0;

	*ptr = mem;
	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
	css_put(&mem->css);/* drop extra refcnt */
	if (ret || *ptr == NULL) {
		/* Undo the MIGRATION flag set above (anon pages only). */
		if (PageAnon(page)) {
			lock_page_cgroup(pc);
			ClearPageCgroupMigration(pc);
			unlock_page_cgroup(pc);
			/*
			 * The old page may be fully unmapped while we kept it.
			 */
			mem_cgroup_uncharge_page(page);
		}
		return -ENOMEM;
	}
	/*
	 * We charge new page before it's used/mapped. So, even if unlock_page()
	 * is called before end_migration, we can catch all events on this new
	 * page. In the case new page is migrated but not remapped, new page's
	 * mapcount will be finally 0 and we call uncharge in end_migration().
	 */
	pc = lookup_page_cgroup(newpage);
	/* The charge type is derived from the old page, not from newpage. */
	if (PageAnon(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	else if (page_is_file_cache(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
	/*
	 * NOTE(review): pc belongs to @newpage, yet the page argument passed
	 * here is the old @page - confirm that this is intended.
	 */
	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
	return ret;
}

/*
 * Finish a migration started by mem_cgroup_prepare_migration(): remove
 * the redundant charge, whichever way the migration went.
 *
 * @mem:          cgroup that prepare_migration() charged; NULL means
 *                there is nothing to undo
 * @oldpage:      the page that was being migrated
 * @newpage:      the migration target
 * @migration_ok: true if @newpage is the page that stays in use,
 *                false if @oldpage is
 */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
	struct page *oldpage, struct page *newpage, bool migration_ok)
{
	struct page *used, *unused;
	struct page_cgroup *pc;

	if (!mem)
		return;
	/* blocks rmdir() */
	cgroup_exclude_rmdir(&mem->css);
	if (!migration_ok) {
		used = oldpage;
		unused = newpage;
	} else {
		used = newpage;
		unused = oldpage;
	}
	/*
	 * We disallowed uncharge of pages under migration because mapcount
	 * of the page goes down to zero, temporarily.
	 * Clear the flag and check the page should be charged.
	 */
	pc = lookup_page_cgroup(oldpage);
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);

	/* The page that lost the migration gives its charge back. */
	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);

	/*
	 * If a page is a file cache, radix-tree replacement is very atomic
	 * and we can skip this check. When it was an Anon page, its mapcount
	 * goes down to 0. But because we added MIGRATION flag, it's not
	 * uncharged yet. There are several case but page->mapcount check
	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
	 * check. (see prepare_charge() also)
	 */
	if (PageAnon(used))
		mem_cgroup_uncharge_page(used);
	/*
	 * At migration, we may charge account against cgroup which has no
	 * tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&mem->css);
}

#ifdef CONFIG_DEBUG_VM
/*
 * Return @page's page_cgroup if one exists and has the USED bit set,
 * NULL otherwise.
 */
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
	if (likely(pc) && PageCgroupUsed(pc))
		return pc;
	return NULL;
}

/*
 * True if @page still has a used page_cgroup attached. Always false
 * when memcg is disabled.
 */
bool mem_cgroup_bad_page_check(struct page *page)
{
	if (mem_cgroup_disabled())
		return false;

	return lookup_page_cgroup_used(page) != NULL;
}

/*
 * Log the page_cgroup pointer, its flags, its mem_cgroup pointer and the
 * cgroup path for @page. Prints nothing if the page has no used
 * page_cgroup.
 */
void mem_cgroup_print_bad_page(struct page *page)
{
	struct page_cgroup *pc;

	pc = lookup_page_cgroup_used(page);
	if (pc) {
		int ret = -1;
		char *path;

		printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
		       pc, pc->flags, pc->mem_cgroup);

		/*
		 * NOTE(review): GFP_KERNEL may sleep - confirm that every
		 * caller of this function runs in a context that allows it.
		 */
		path = kmalloc(PATH_MAX, GFP_KERNEL);
		if (path) {
			rcu_read_lock();
			ret = cgroup_path(pc->mem_cgroup->css.cgroup,
							path, PATH_MAX);
			rcu_read_unlock();
		}

		/* ret stays -1 if the allocation failed, so path is not read. */
		printk(KERN_CONT "(%s)\n",
				(ret < 0) ? "cannot get the path" : path);
		kfree(path);
	}
}
#endif

/* Held around every read-check-update of the res/memsw limit pair below. */
static DEFINE_MUTEX(set_limit_mutex);

/*
 * Set the memory (res) limit of @memcg to @val bytes.
 *
 * While res_counter_set_limit() keeps failing, reclaim from the
 * hierarchy and retry. The retry budget is only consumed by rounds that
 * did not lower usage.
 *
 * Returns 0 on success, -EINVAL if @val is above the current memsw
 * limit, -EINTR if a signal is pending, otherwise the last error from
 * res_counter_set_limit().
 */
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count;
	u64 memswlimit, memlimit;
	int ret = 0;
	int children = mem_cgroup_count_children(memcg);
	u64 curusage, oldusage;
	int enlarge;

	/*
	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * depends on callers. We set our retry-count to be function
	 * of # of children which we should visit in this loop.
	 */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

	enlarge = 0;
	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}

		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit < val)
			enlarge = 1;

		ret = res_counter_set_limit(&memcg->res, val);
		if (!ret) {
			/* Record whether the two limits now coincide. */
			if (memswlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						MEM_CGROUP_RECLAIM_SHRINK,
						NULL);
		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}
	/* A successfully raised limit may unblock OOM waiters. */
	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

/*
 * Set the memory+swap (memsw) limit of @memcg to @val bytes.
 *
 * Same retry scheme as mem_cgroup_resize_limit(), but the reclaim pass
 * is run with MEM_CGROUP_RECLAIM_NOSWAP.
 *
 * Returns 0 on success, -EINVAL if @val is below the current memory
 * limit, -EINTR if a signal is pending, otherwise the last error from
 * res_counter_set_limit(). ret starts as -EBUSY, so that is the result
 * if the loop body never runs.
 */
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
					unsigned long long val)
{
	int retry_count;
	u64 memlimit, memswlimit, oldusage, curusage;
	int children = mem_cgroup_count_children(memcg);
	int ret = -EBUSY;
	int enlarge = 0;

	/* see mem_cgroup_resize_limit */
	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all in some function, I do this in
		 * open coded manner. You see what this really does.
		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val)
			enlarge = 1;
		ret = res_counter_set_limit(&memcg->memsw, val);
		if (!ret) {
			/* Record whether the two limits now coincide. */
			if (memlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						MEM_CGROUP_RECLAIM_NOSWAP |
						MEM_CGROUP_RECLAIM_SHRINK,
						NULL);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}
	if (!ret && enlarge)
		memcg_oom_recover(memcg);
	return ret;
}

/*
 * Reclaim from the cgroups that exceed their soft limit in @zone.
 *
 * @zone:          zone whose soft-limit tree is consulted
 * @order:         any order above 0 makes this a no-op returning 0
 * @gfp_mask:      handed to the hierarchical reclaim
 * @total_scanned: incremented by the number of pages scanned
 *
 * Returns the number of pages reclaimed.
 */
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long long excess;
	unsigned long nr_scanned;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * This loop can run a while, specially if mem_cgroup's continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
						gfp_mask,
						MEM_CGROUP_RECLAIM_SOFT,
						&nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed) {
			do {
				/*
				 * Loop until we find yet another one.
				 *
				 * By the time we get the soft_limit lock
				 * again, someone might have added the
				 * group back on the RB tree. Iterate to
				 * make sure we get a different mem.
				 * mem_cgroup_largest_soft_limit_node returns
				 * NULL if no other cgroup is present on
				 * the tree
				 */
				next_mz =
				__mem_cgroup_largest_soft_limit_node(mctz);
				if (next_mz == mz)
					css_put(&next_mz->mem->css);
				else /* next_mz == NULL or other memcg */
					break;
			} while (1);
		}
		/* Re-queue mz at a position matching its current excess. */
		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
		excess = res_counter_soft_limit_excess(&mz->mem->res);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
		spin_unlock(&mctz->lock);
		/*
		 * Presumably pairs with a css reference taken by the
		 * largest_soft_limit_node lookup - TODO confirm.
		 */
		css_put(&mz->mem->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A candidate that was picked but never visited is released here. */
	if (next_mz)
		css_put(&next_mz->mem->css);
	return nr_reclaimed;
}

/*
 * This routine traverse page_cgroup in given list and drop them all.
 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
 */
/*
 * @mem:  cgroup whose list is emptied
 * @node: NUMA node id
 * @zid:  zone index within @node
 * @lru:  which LRU list of that zone to walk
 *
 * Each entry is handed to mem_cgroup_move_parent(). Returns 0 once the
 * list is empty, -ENOMEM as soon as a move reports it, -EBUSY if the
 * iteration budget ran out with entries left, otherwise the last error.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
				int node, int zid, enum lru_list lru)
{
	struct zone *zone;
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc, *busy;
	unsigned long flags, loop;
	struct list_head *list;
	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
	mz = mem_cgroup_zoneinfo(mem, node, zid);
	list = &mz->lists[lru];

	loop = MEM_CGROUP_ZSTAT(mz, lru);
	/* give some margin against EBUSY etc...*/
	loop += 256;
	busy = NULL;
	while (loop--) {
		struct page *page;

		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		/* Always work on the tail entry of the list. */
		pc = list_entry(list->prev, struct page_cgroup, lru);
		if (busy == pc) {
			/*
			 * This entry just failed: rotate it to the head so
			 * the next pass picks a different tail.
			 */
			list_move(&pc->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		page = lookup_cgroup_page(pc);

		ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
		if (ret == -ENOMEM)
			break;

		if (ret == -EBUSY || ret == -EINVAL) {
			/* found lock contention or "pc" is obsolete. */
			busy = pc;
			cond_resched();
		} else
			busy = NULL;
	}

	if (!ret && !list_empty(list))
		return -EBUSY;
	return ret;
}

/*
 * make mem_cgroup's charge to be 0 if there is no task.
 * This enables deleting this mem_cgroup.
 *
 * @free_all: when true, start with the reclaim pass (try_to_free) before
 *            moving the remaining charges.
 *
 * Returns 0 on success, -EBUSY if the cgroup has tasks or children (or
 * the reclaim pass is reached a second time), -EINTR if a signal is
 * pending.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
	int ret;
	int node, zid, shrink;
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct cgroup *cgrp = mem->css.cgroup;

	/* Hold a css reference for the whole operation; dropped at out:. */
	css_get(&mem->css);

	shrink = 0;
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	do {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		drain_all_stock_sync(mem);
		ret = 0;
		mem_cgroup_start_move(mem);
		/* Walk every LRU list of every zone; stop at the first error. */
		for_each_node_state(node, N_HIGH_MEMORY) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		mem_cgroup_end_move(mem);
		memcg_oom_recover(mem);
		/* it seems parent cgroup doesn't have enough mem */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	/* "ret" should also be checked to ensure all lists are empty. */
	} while (mem->res.usage > 0 || ret);
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
						false);
		if (!progress) {
			/* Only rounds without progress use up the budget. */
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}
	lru_add_drain();
	/* try move_account...there may be some *locked* pages. */
	goto move_account;
}

/* Entry point taking a cgroup: force-empty it with free_all = true. */
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


/* Return the use_hierarchy value of the cgroup. */
static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
	return mem_cgroup_from_cont(cont)->use_hierarchy;
}

/*
 * Set use_hierarchy to @val.
 *
 * Returns -EINVAL if @val is neither 0 nor 1 or the parent already uses
 * hierarchy, -EBUSY if this cgroup has children, 0 on success.
 */
static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
					u64 val)
{
	int retval = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	struct cgroup *parent = cont->parent;
	struct mem_cgroup *parent_mem = NULL;

	if (parent)
		parent_mem = mem_cgroup_from_cont(parent);

	cgroup_lock();
	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be
	 * set if there are no children.
	 */
	if ((!parent_mem || !parent_mem->use_hierarchy) &&
				(val == 1 || val == 0)) {
		if (list_empty(&cont->children))
			mem->use_hierarchy = val;
		else
			retval = -EBUSY;
	} else
		retval = -EINVAL;
	cgroup_unlock();

	return retval;
}


/*
 * Sum statistic @idx over @mem and every cgroup visited by
 * for_each_mem_cgroup_tree(). A negative total is reported as 0.
 */
static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
					       enum mem_cgroup_stat_index idx)
{
	struct mem_cgroup *iter;
	long val = 0;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_mem_cgroup_tree(iter, mem)
		val += mem_cgroup_read_stat(iter, idx);

	if (val < 0) /* race ? */
		val = 0;
	return val;
}

/*
 * Usage of @mem in bytes; @swap selects the memory+swap figure.
 *
 * For any cgroup but the root this is the res_counter usage. For the
 * root it is computed from the recursive CACHE and RSS (plus SWAPOUT
 * when @swap) page counts, shifted by PAGE_SHIFT.
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
{
	u64 val;

	if (!mem_cgroup_is_root(mem)) {
		if (!swap)
			return res_counter_read_u64(&mem->res, RES_USAGE);
		else
			return res_counter_read_u64(&mem->memsw, RES_USAGE);
	}

	val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
	val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);

	if (swap)
		val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);

	return val << PAGE_SHIFT;
}

/*
 * Read handler for the res_counter-backed files. cft->private encodes
 * the counter (_MEM or _MEMSWAP) and the attribute. RES_USAGE goes
 * through mem_cgroup_usage(); every other attribute is read straight
 * from the res_counter.
 */
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		if (name == RES_USAGE)
			val = mem_cgroup_usage(mem, false);
		else
			val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (name == RES_USAGE)
			val = mem_cgroup_usage(mem, true);
		else
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		/* Any other type is treated as a programming error. */
		BUG();
		break;
	}
	return val;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 */
/*
 * Write handler: parses @buffer as a size and applies it to the
 * attribute encoded in cft->private.
 *
 * RES_LIMIT is refused with -EINVAL on the root cgroup. RES_SOFT_LIMIT
 * is only accepted for the _MEM counter. Any other attribute yields
 * -EINVAL.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	int type, name;
	unsigned long long val;
	int ret;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		if (type == _MEM)
			ret = mem_cgroup_resize_limit(memcg, val);
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		break;
	case RES_SOFT_LIMIT:
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		/*
		 * For memsw, soft limits are hard to implement in terms
		 * of semantics, for now, we support soft limits for
		 * control without swap
		 */
		if (type == _MEM)
			ret = res_counter_set_soft_limit(&memcg->res, val);
		else
			ret = -EINVAL;
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

/*
 * Compute the effective limits of @memcg: its own res and memsw limits,
 * lowered to the smallest limit found among its ancestors for as long
 * as the chain keeps use_hierarchy set. Results are stored through
 * @mem_limit and @memsw_limit.
 */
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
		unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
	struct cgroup *cgroup;
	unsigned long long min_limit, min_memsw_limit, tmp;

	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	cgroup = memcg->css.cgroup;
	if (!memcg->use_hierarchy)
		goto out;

	/* Walk towards the root; stop at the first non-hierarchical one. */
	while (cgroup->parent) {
		cgroup = cgroup->parent;
		memcg = mem_cgroup_from_cont(cgroup);
		if (!memcg->use_hierarchy)
			break;
		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
		min_limit = min(min_limit, tmp);
		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		min_memsw_limit = min(min_memsw_limit, tmp);
	}
out:
	*mem_limit = min_limit;
	*memsw_limit = min_memsw_limit;
	return;
}

/*
 * Reset handler: @event encodes the counter and the attribute. Only
 * RES_MAX_USAGE and RES_FAILCNT are acted upon. Always returns 0.
 */
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;
	int type, name;

	mem = mem_cgroup_from_cont(cont);
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
	case RES_MAX_USAGE:
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
		break;
	case RES_FAILCNT:
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
		break;
	}

	return 0;
}

/* Return the move_charge_at_immigrate value of the cgroup. */
static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
					struct cftype *cft)
{
	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
}

#ifdef CONFIG_MMU
/*
 * Set move_charge_at_immigrate to @val. Values that do not fit in
 * NR_MOVE_TYPE bits are rejected with -EINVAL.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);

	if (val >= (1 << NR_MOVE_TYPE))
		return -EINVAL;
	/*
	 * We check this value several times in both in can_attach() and
	 * attach(), so we need cgroup lock to prevent this value from being
	 * inconsistent.
	 */
	cgroup_lock();
	mem->move_charge_at_immigrate = val;
	cgroup_unlock();

	return 0;
}
#else
/* Without CONFIG_MMU the setting cannot be changed. */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif


/*
 * For read statistics.
 * These values index both mcs_total_stat.stat[] and memcg_stat_strings[],
 * so the order here must match the order of the name table below.
 */
enum {
	MCS_CACHE,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_PGFAULT,
	MCS_PGMAJFAULT,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,
};

/* One accumulator per MCS_* statistic. */
struct mcs_total_stat {
	s64 stat[NR_MCS_STAT];
};

/* Reported names per MCS_* index: this cgroup only / whole subtree. */
struct {
	char *local_name;
	char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
	{"cache", "total_cache"},
	{"rss", "total_rss"},
	{"mapped_file", "total_mapped_file"},
	{"pgpgin", "total_pgpgin"},
	{"pgpgout", "total_pgpgout"},
	{"swap", "total_swap"},
	{"pgfault", "total_pgfault"},
	{"pgmajfault", "total_pgmajfault"},
	{"inactive_anon", "total_inactive_anon"},
	{"active_anon", "total_active_anon"},
	{"inactive_file", "total_inactive_file"},
	{"active_file", "total_active_file"},
	{"unevictable", "total_unevictable"}
};


/*
 * Add the statistics of @mem alone to @s. Values are accumulated with
 * +=, so the caller has to zero @s first. Page counts are multiplied by
 * PAGE_SIZE; event counters are added unscaled.
 */
static void
mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
	s64 val;

	/* per cpu stat */
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	s->stat[MCS_CACHE] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	s->stat[MCS_RSS] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
	s->stat[MCS_PGPGIN] += val;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
	s->stat[MCS_PGPGOUT] += val;
	if (do_swap_account) {
		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
		s->stat[MCS_SWAP] += val * PAGE_SIZE;
	}
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
	s->stat[MCS_PGFAULT] += val;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
	s->stat[MCS_PGMAJFAULT] += val;

	/* per zone stat */
	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}

/*
 * Add to @s the local statistics of every cgroup visited by
 * for_each_mem_cgroup_tree() starting at @mem.
 */
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		mem_cgroup_get_local_stat(iter, s);
}

#ifdef CONFIG_NUMA
/*
 * Emit four lines (total, file, anon, unevictable). Each line carries
 * the overall LRU page count followed by one " N<nid>=<count>" field
 * per node in the N_HIGH_MEMORY state. Always returns 0.
 */
static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
{
	int nid;
	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
	unsigned long node_nr;
	struct cgroup *cont = m->private;
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);

	total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
	seq_printf(m, "total=%lu", total_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
	seq_printf(m, "file=%lu", file_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
				LRU_ALL_FILE);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
	seq_printf(m, "anon=%lu", anon_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
				LRU_ALL_ANON);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
	seq_printf(m, "unevictable=%lu", unevictable_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
				BIT(LRU_UNEVICTABLE));
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');
	return 0;
}
#endif /* CONFIG_NUMA */

/*
 * Fill @cb with, in order: the local statistics, the hierarchical
 * limits, the subtree ("total_") statistics and, under CONFIG_DEBUG_VM,
 * the reclaim debugging values. The swap entries are skipped while
 * do_swap_account is off. Always returns 0.
 */
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mcs_total_stat mystat;
	int i;

	/* The stat helpers accumulate, so start from zero. */
	memset(&mystat, 0, sizeof(mystat));
	mem_cgroup_get_local_stat(mem_cont, &mystat);


	for (i = 0; i < NR_MCS_STAT; i++) {
		if (i == MCS_SWAP && !do_swap_account)
			continue;
		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
	}

	/* Hierarchical information */
	{
		unsigned long long limit, memsw_limit;
		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
		cb->fill(cb, "hierarchical_memory_limit", limit);
		if (do_swap_account)
			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
	}

	/* Reuse the buffer for the subtree totals. */
	memset(&mystat, 0, sizeof(mystat));
	mem_cgroup_get_total_stat(mem_cont, &mystat);
	for (i = 0; i < NR_MCS_STAT; i++) {
		if (i == MCS_SWAP && !do_swap_account)
			continue;
		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
	}

#ifdef CONFIG_DEBUG_VM
	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));

	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		/* Index 0 is reported as "anon", index 1 as "file". */
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);

				recent_rotated[0] +=
					mz->reclaim_stat.recent_rotated[0];
				recent_rotated[1] +=
					mz->reclaim_stat.recent_rotated[1];
				recent_scanned[0] +=
					mz->reclaim_stat.recent_scanned[0];
				recent_scanned[1] +=
					mz->reclaim_stat.recent_scanned[1];
			}
		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
	}
#endif

	return 0;
}

/* Return the value of mem_cgroup_swappiness() for the cgroup. */
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return mem_cgroup_swappiness(memcg);
}

/*
 * Set the swappiness of the cgroup to @val.
 *
 * Returns -EINVAL if @val is above 100, if the cgroup has no parent, if
 * the parent uses hierarchy, or if this cgroup uses hierarchy and has
 * children. Returns 0 on success.
 */
static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
				       u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;

	if (val > 100)
		return -EINVAL;

	if (cgrp->parent == NULL)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();

	/* If under hierarchy, only empty-root can set this value */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
		cgroup_unlock();
		return -EINVAL;
	}

	memcg->swappiness = val;

	cgroup_unlock();

	return 0;
}

static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4261 { 4262 struct mem_cgroup_threshold_ary *t; 4263 u64 usage; 4264 int i; 4265 4266 rcu_read_lock(); 4267 if (!swap) 4268 t = rcu_dereference(memcg->thresholds.primary); 4269 else 4270 t = rcu_dereference(memcg->memsw_thresholds.primary); 4271 4272 if (!t) 4273 goto unlock; 4274 4275 usage = mem_cgroup_usage(memcg, swap); 4276 4277 /* 4278 * current_threshold points to threshold just below usage. 4279 * If it's not true, a threshold was crossed after last 4280 * call of __mem_cgroup_threshold(). 4281 */ 4282 i = t->current_threshold; 4283 4284 /* 4285 * Iterate backward over array of thresholds starting from 4286 * current_threshold and check if a threshold is crossed. 4287 * If none of thresholds below usage is crossed, we read 4288 * only one element of the array here. 4289 */ 4290 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4291 eventfd_signal(t->entries[i].eventfd, 1); 4292 4293 /* i = current_threshold + 1 */ 4294 i++; 4295 4296 /* 4297 * Iterate forward over array of thresholds starting from 4298 * current_threshold+1 and check if a threshold is crossed. 4299 * If none of thresholds above usage is crossed, we read 4300 * only one element of the array here. 
4301 */ 4302 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4303 eventfd_signal(t->entries[i].eventfd, 1); 4304 4305 /* Update current_threshold */ 4306 t->current_threshold = i - 1; 4307 unlock: 4308 rcu_read_unlock(); 4309 } 4310 4311 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4312 { 4313 while (memcg) { 4314 __mem_cgroup_threshold(memcg, false); 4315 if (do_swap_account) 4316 __mem_cgroup_threshold(memcg, true); 4317 4318 memcg = parent_mem_cgroup(memcg); 4319 } 4320 } 4321 4322 static int compare_thresholds(const void *a, const void *b) 4323 { 4324 const struct mem_cgroup_threshold *_a = a; 4325 const struct mem_cgroup_threshold *_b = b; 4326 4327 return _a->threshold - _b->threshold; 4328 } 4329 4330 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4331 { 4332 struct mem_cgroup_eventfd_list *ev; 4333 4334 list_for_each_entry(ev, &mem->oom_notify, list) 4335 eventfd_signal(ev->eventfd, 1); 4336 return 0; 4337 } 4338 4339 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4340 { 4341 struct mem_cgroup *iter; 4342 4343 for_each_mem_cgroup_tree(iter, mem) 4344 mem_cgroup_oom_notify_cb(iter); 4345 } 4346 4347 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4348 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4349 { 4350 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4351 struct mem_cgroup_thresholds *thresholds; 4352 struct mem_cgroup_threshold_ary *new; 4353 int type = MEMFILE_TYPE(cft->private); 4354 u64 threshold, usage; 4355 int i, size, ret; 4356 4357 ret = res_counter_memparse_write_strategy(args, &threshold); 4358 if (ret) 4359 return ret; 4360 4361 mutex_lock(&memcg->thresholds_lock); 4362 4363 if (type == _MEM) 4364 thresholds = &memcg->thresholds; 4365 else if (type == _MEMSWAP) 4366 thresholds = &memcg->memsw_thresholds; 4367 else 4368 BUG(); 4369 4370 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4371 4372 /* Check if a threshold crossed before 
adding a new one */ 4373 if (thresholds->primary) 4374 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4375 4376 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4377 4378 /* Allocate memory for new array of thresholds */ 4379 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4380 GFP_KERNEL); 4381 if (!new) { 4382 ret = -ENOMEM; 4383 goto unlock; 4384 } 4385 new->size = size; 4386 4387 /* Copy thresholds (if any) to new array */ 4388 if (thresholds->primary) { 4389 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4390 sizeof(struct mem_cgroup_threshold)); 4391 } 4392 4393 /* Add new threshold */ 4394 new->entries[size - 1].eventfd = eventfd; 4395 new->entries[size - 1].threshold = threshold; 4396 4397 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4398 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4399 compare_thresholds, NULL); 4400 4401 /* Find current threshold */ 4402 new->current_threshold = -1; 4403 for (i = 0; i < size; i++) { 4404 if (new->entries[i].threshold < usage) { 4405 /* 4406 * new->current_threshold will not be used until 4407 * rcu_assign_pointer(), so it's safe to increment 4408 * it here. 
4409 */ 4410 ++new->current_threshold; 4411 } 4412 } 4413 4414 /* Free old spare buffer and save old primary buffer as spare */ 4415 kfree(thresholds->spare); 4416 thresholds->spare = thresholds->primary; 4417 4418 rcu_assign_pointer(thresholds->primary, new); 4419 4420 /* To be sure that nobody uses thresholds */ 4421 synchronize_rcu(); 4422 4423 unlock: 4424 mutex_unlock(&memcg->thresholds_lock); 4425 4426 return ret; 4427 } 4428 4429 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4430 struct cftype *cft, struct eventfd_ctx *eventfd) 4431 { 4432 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4433 struct mem_cgroup_thresholds *thresholds; 4434 struct mem_cgroup_threshold_ary *new; 4435 int type = MEMFILE_TYPE(cft->private); 4436 u64 usage; 4437 int i, j, size; 4438 4439 mutex_lock(&memcg->thresholds_lock); 4440 if (type == _MEM) 4441 thresholds = &memcg->thresholds; 4442 else if (type == _MEMSWAP) 4443 thresholds = &memcg->memsw_thresholds; 4444 else 4445 BUG(); 4446 4447 /* 4448 * Something went wrong if we trying to unregister a threshold 4449 * if we don't have thresholds 4450 */ 4451 BUG_ON(!thresholds); 4452 4453 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4454 4455 /* Check if a threshold crossed before removing */ 4456 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4457 4458 /* Calculate new number of threshold */ 4459 size = 0; 4460 for (i = 0; i < thresholds->primary->size; i++) { 4461 if (thresholds->primary->entries[i].eventfd != eventfd) 4462 size++; 4463 } 4464 4465 new = thresholds->spare; 4466 4467 /* Set thresholds array to NULL if we don't have thresholds */ 4468 if (!size) { 4469 kfree(new); 4470 new = NULL; 4471 goto swap_buffers; 4472 } 4473 4474 new->size = size; 4475 4476 /* Copy thresholds and find current threshold */ 4477 new->current_threshold = -1; 4478 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4479 if (thresholds->primary->entries[i].eventfd == eventfd) 4480 continue; 4481 4482 
new->entries[j] = thresholds->primary->entries[i]; 4483 if (new->entries[j].threshold < usage) { 4484 /* 4485 * new->current_threshold will not be used 4486 * until rcu_assign_pointer(), so it's safe to increment 4487 * it here. 4488 */ 4489 ++new->current_threshold; 4490 } 4491 j++; 4492 } 4493 4494 swap_buffers: 4495 /* Swap primary and spare array */ 4496 thresholds->spare = thresholds->primary; 4497 rcu_assign_pointer(thresholds->primary, new); 4498 4499 /* To be sure that nobody uses thresholds */ 4500 synchronize_rcu(); 4501 4502 mutex_unlock(&memcg->thresholds_lock); 4503 } 4504 4505 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4506 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4507 { 4508 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4509 struct mem_cgroup_eventfd_list *event; 4510 int type = MEMFILE_TYPE(cft->private); 4511 4512 BUG_ON(type != _OOM_TYPE); 4513 event = kmalloc(sizeof(*event), GFP_KERNEL); 4514 if (!event) 4515 return -ENOMEM; 4516 4517 spin_lock(&memcg_oom_lock); 4518 4519 event->eventfd = eventfd; 4520 list_add(&event->list, &memcg->oom_notify); 4521 4522 /* already in OOM ? 
*/ 4523 if (atomic_read(&memcg->under_oom)) 4524 eventfd_signal(eventfd, 1); 4525 spin_unlock(&memcg_oom_lock); 4526 4527 return 0; 4528 } 4529 4530 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4531 struct cftype *cft, struct eventfd_ctx *eventfd) 4532 { 4533 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4534 struct mem_cgroup_eventfd_list *ev, *tmp; 4535 int type = MEMFILE_TYPE(cft->private); 4536 4537 BUG_ON(type != _OOM_TYPE); 4538 4539 spin_lock(&memcg_oom_lock); 4540 4541 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4542 if (ev->eventfd == eventfd) { 4543 list_del(&ev->list); 4544 kfree(ev); 4545 } 4546 } 4547 4548 spin_unlock(&memcg_oom_lock); 4549 } 4550 4551 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4552 struct cftype *cft, struct cgroup_map_cb *cb) 4553 { 4554 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4555 4556 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4557 4558 if (atomic_read(&mem->under_oom)) 4559 cb->fill(cb, "under_oom", 1); 4560 else 4561 cb->fill(cb, "under_oom", 0); 4562 return 0; 4563 } 4564 4565 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4566 struct cftype *cft, u64 val) 4567 { 4568 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4569 struct mem_cgroup *parent; 4570 4571 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4572 if (!cgrp->parent || !((val == 0) || (val == 1))) 4573 return -EINVAL; 4574 4575 parent = mem_cgroup_from_cont(cgrp->parent); 4576 4577 cgroup_lock(); 4578 /* oom-kill-disable is a flag for subhierarchy. 
*/ 4579 if ((parent->use_hierarchy) || 4580 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4581 cgroup_unlock(); 4582 return -EINVAL; 4583 } 4584 mem->oom_kill_disable = val; 4585 if (!val) 4586 memcg_oom_recover(mem); 4587 cgroup_unlock(); 4588 return 0; 4589 } 4590 4591 #ifdef CONFIG_NUMA 4592 static const struct file_operations mem_control_numa_stat_file_operations = { 4593 .read = seq_read, 4594 .llseek = seq_lseek, 4595 .release = single_release, 4596 }; 4597 4598 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4599 { 4600 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4601 4602 file->f_op = &mem_control_numa_stat_file_operations; 4603 return single_open(file, mem_control_numa_stat_show, cont); 4604 } 4605 #endif /* CONFIG_NUMA */ 4606 4607 static struct cftype mem_cgroup_files[] = { 4608 { 4609 .name = "usage_in_bytes", 4610 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4611 .read_u64 = mem_cgroup_read, 4612 .register_event = mem_cgroup_usage_register_event, 4613 .unregister_event = mem_cgroup_usage_unregister_event, 4614 }, 4615 { 4616 .name = "max_usage_in_bytes", 4617 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4618 .trigger = mem_cgroup_reset, 4619 .read_u64 = mem_cgroup_read, 4620 }, 4621 { 4622 .name = "limit_in_bytes", 4623 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4624 .write_string = mem_cgroup_write, 4625 .read_u64 = mem_cgroup_read, 4626 }, 4627 { 4628 .name = "soft_limit_in_bytes", 4629 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4630 .write_string = mem_cgroup_write, 4631 .read_u64 = mem_cgroup_read, 4632 }, 4633 { 4634 .name = "failcnt", 4635 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4636 .trigger = mem_cgroup_reset, 4637 .read_u64 = mem_cgroup_read, 4638 }, 4639 { 4640 .name = "stat", 4641 .read_map = mem_control_stat_show, 4642 }, 4643 { 4644 .name = "force_empty", 4645 .trigger = mem_cgroup_force_empty_write, 4646 }, 4647 { 4648 .name = "use_hierarchy", 4649 .write_u64 = 
mem_cgroup_hierarchy_write, 4650 .read_u64 = mem_cgroup_hierarchy_read, 4651 }, 4652 { 4653 .name = "swappiness", 4654 .read_u64 = mem_cgroup_swappiness_read, 4655 .write_u64 = mem_cgroup_swappiness_write, 4656 }, 4657 { 4658 .name = "move_charge_at_immigrate", 4659 .read_u64 = mem_cgroup_move_charge_read, 4660 .write_u64 = mem_cgroup_move_charge_write, 4661 }, 4662 { 4663 .name = "oom_control", 4664 .read_map = mem_cgroup_oom_control_read, 4665 .write_u64 = mem_cgroup_oom_control_write, 4666 .register_event = mem_cgroup_oom_register_event, 4667 .unregister_event = mem_cgroup_oom_unregister_event, 4668 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4669 }, 4670 #ifdef CONFIG_NUMA 4671 { 4672 .name = "numa_stat", 4673 .open = mem_control_numa_stat_open, 4674 .mode = S_IRUGO, 4675 }, 4676 #endif 4677 }; 4678 4679 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4680 static struct cftype memsw_cgroup_files[] = { 4681 { 4682 .name = "memsw.usage_in_bytes", 4683 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4684 .read_u64 = mem_cgroup_read, 4685 .register_event = mem_cgroup_usage_register_event, 4686 .unregister_event = mem_cgroup_usage_unregister_event, 4687 }, 4688 { 4689 .name = "memsw.max_usage_in_bytes", 4690 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4691 .trigger = mem_cgroup_reset, 4692 .read_u64 = mem_cgroup_read, 4693 }, 4694 { 4695 .name = "memsw.limit_in_bytes", 4696 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4697 .write_string = mem_cgroup_write, 4698 .read_u64 = mem_cgroup_read, 4699 }, 4700 { 4701 .name = "memsw.failcnt", 4702 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4703 .trigger = mem_cgroup_reset, 4704 .read_u64 = mem_cgroup_read, 4705 }, 4706 }; 4707 4708 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4709 { 4710 if (!do_swap_account) 4711 return 0; 4712 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4713 ARRAY_SIZE(memsw_cgroup_files)); 4714 }; 4715 #else 4716 static int 
register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4717 { 4718 return 0; 4719 } 4720 #endif 4721 4722 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4723 { 4724 struct mem_cgroup_per_node *pn; 4725 struct mem_cgroup_per_zone *mz; 4726 enum lru_list l; 4727 int zone, tmp = node; 4728 /* 4729 * This routine is called against possible nodes. 4730 * But it's BUG to call kmalloc() against offline node. 4731 * 4732 * TODO: this routine can waste much memory for nodes which will 4733 * never be onlined. It's better to use memory hotplug callback 4734 * function. 4735 */ 4736 if (!node_state(node, N_NORMAL_MEMORY)) 4737 tmp = -1; 4738 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4739 if (!pn) 4740 return 1; 4741 4742 mem->info.nodeinfo[node] = pn; 4743 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4744 mz = &pn->zoneinfo[zone]; 4745 for_each_lru(l) 4746 INIT_LIST_HEAD(&mz->lists[l]); 4747 mz->usage_in_excess = 0; 4748 mz->on_tree = false; 4749 mz->mem = mem; 4750 } 4751 return 0; 4752 } 4753 4754 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4755 { 4756 kfree(mem->info.nodeinfo[node]); 4757 } 4758 4759 static struct mem_cgroup *mem_cgroup_alloc(void) 4760 { 4761 struct mem_cgroup *mem; 4762 int size = sizeof(struct mem_cgroup); 4763 4764 /* Can be very big if MAX_NUMNODES is very big */ 4765 if (size < PAGE_SIZE) 4766 mem = kzalloc(size, GFP_KERNEL); 4767 else 4768 mem = vzalloc(size); 4769 4770 if (!mem) 4771 return NULL; 4772 4773 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4774 if (!mem->stat) 4775 goto out_free; 4776 spin_lock_init(&mem->pcp_counter_lock); 4777 return mem; 4778 4779 out_free: 4780 if (size < PAGE_SIZE) 4781 kfree(mem); 4782 else 4783 vfree(mem); 4784 return NULL; 4785 } 4786 4787 /* 4788 * At destroying mem_cgroup, references from swap_cgroup can remain. 4789 * (scanning all at force_empty is too costly...) 
4790 * 4791 * Instead of clearing all references at force_empty, we remember 4792 * the number of reference from swap_cgroup and free mem_cgroup when 4793 * it goes down to 0. 4794 * 4795 * Removal of cgroup itself succeeds regardless of refs from swap. 4796 */ 4797 4798 static void __mem_cgroup_free(struct mem_cgroup *mem) 4799 { 4800 int node; 4801 4802 mem_cgroup_remove_from_trees(mem); 4803 free_css_id(&mem_cgroup_subsys, &mem->css); 4804 4805 for_each_node_state(node, N_POSSIBLE) 4806 free_mem_cgroup_per_zone_info(mem, node); 4807 4808 free_percpu(mem->stat); 4809 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4810 kfree(mem); 4811 else 4812 vfree(mem); 4813 } 4814 4815 static void mem_cgroup_get(struct mem_cgroup *mem) 4816 { 4817 atomic_inc(&mem->refcnt); 4818 } 4819 4820 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4821 { 4822 if (atomic_sub_and_test(count, &mem->refcnt)) { 4823 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4824 __mem_cgroup_free(mem); 4825 if (parent) 4826 mem_cgroup_put(parent); 4827 } 4828 } 4829 4830 static void mem_cgroup_put(struct mem_cgroup *mem) 4831 { 4832 __mem_cgroup_put(mem, 1); 4833 } 4834 4835 /* 4836 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
4837 */ 4838 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4839 { 4840 if (!mem->res.parent) 4841 return NULL; 4842 return mem_cgroup_from_res_counter(mem->res.parent, res); 4843 } 4844 4845 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4846 static void __init enable_swap_cgroup(void) 4847 { 4848 if (!mem_cgroup_disabled() && really_do_swap_account) 4849 do_swap_account = 1; 4850 } 4851 #else 4852 static void __init enable_swap_cgroup(void) 4853 { 4854 } 4855 #endif 4856 4857 static int mem_cgroup_soft_limit_tree_init(void) 4858 { 4859 struct mem_cgroup_tree_per_node *rtpn; 4860 struct mem_cgroup_tree_per_zone *rtpz; 4861 int tmp, node, zone; 4862 4863 for_each_node_state(node, N_POSSIBLE) { 4864 tmp = node; 4865 if (!node_state(node, N_NORMAL_MEMORY)) 4866 tmp = -1; 4867 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4868 if (!rtpn) 4869 return 1; 4870 4871 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4872 4873 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4874 rtpz = &rtpn->rb_tree_per_zone[zone]; 4875 rtpz->rb_root = RB_ROOT; 4876 spin_lock_init(&rtpz->lock); 4877 } 4878 } 4879 return 0; 4880 } 4881 4882 static struct cgroup_subsys_state * __ref 4883 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4884 { 4885 struct mem_cgroup *mem, *parent; 4886 long error = -ENOMEM; 4887 int node; 4888 4889 mem = mem_cgroup_alloc(); 4890 if (!mem) 4891 return ERR_PTR(error); 4892 4893 for_each_node_state(node, N_POSSIBLE) 4894 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4895 goto free_out; 4896 4897 /* root ? 
*/ 4898 if (cont->parent == NULL) { 4899 int cpu; 4900 enable_swap_cgroup(); 4901 parent = NULL; 4902 root_mem_cgroup = mem; 4903 if (mem_cgroup_soft_limit_tree_init()) 4904 goto free_out; 4905 for_each_possible_cpu(cpu) { 4906 struct memcg_stock_pcp *stock = 4907 &per_cpu(memcg_stock, cpu); 4908 INIT_WORK(&stock->work, drain_local_stock); 4909 } 4910 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4911 } else { 4912 parent = mem_cgroup_from_cont(cont->parent); 4913 mem->use_hierarchy = parent->use_hierarchy; 4914 mem->oom_kill_disable = parent->oom_kill_disable; 4915 } 4916 4917 if (parent && parent->use_hierarchy) { 4918 res_counter_init(&mem->res, &parent->res); 4919 res_counter_init(&mem->memsw, &parent->memsw); 4920 /* 4921 * We increment refcnt of the parent to ensure that we can 4922 * safely access it on res_counter_charge/uncharge. 4923 * This refcnt will be decremented when freeing this 4924 * mem_cgroup(see mem_cgroup_put). 4925 */ 4926 mem_cgroup_get(parent); 4927 } else { 4928 res_counter_init(&mem->res, NULL); 4929 res_counter_init(&mem->memsw, NULL); 4930 } 4931 mem->last_scanned_child = 0; 4932 mem->last_scanned_node = MAX_NUMNODES; 4933 INIT_LIST_HEAD(&mem->oom_notify); 4934 4935 if (parent) 4936 mem->swappiness = mem_cgroup_swappiness(parent); 4937 atomic_set(&mem->refcnt, 1); 4938 mem->move_charge_at_immigrate = 0; 4939 mutex_init(&mem->thresholds_lock); 4940 return &mem->css; 4941 free_out: 4942 __mem_cgroup_free(mem); 4943 root_mem_cgroup = NULL; 4944 return ERR_PTR(error); 4945 } 4946 4947 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4948 struct cgroup *cont) 4949 { 4950 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4951 4952 return mem_cgroup_force_empty(mem, false); 4953 } 4954 4955 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4956 struct cgroup *cont) 4957 { 4958 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4959 4960 mem_cgroup_put(mem); 4961 } 4962 4963 static int mem_cgroup_populate(struct 
cgroup_subsys *ss, 4964 struct cgroup *cont) 4965 { 4966 int ret; 4967 4968 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4969 ARRAY_SIZE(mem_cgroup_files)); 4970 4971 if (!ret) 4972 ret = register_memsw_files(cont, ss); 4973 return ret; 4974 } 4975 4976 #ifdef CONFIG_MMU 4977 /* Handlers for move charge at task migration. */ 4978 #define PRECHARGE_COUNT_AT_ONCE 256 4979 static int mem_cgroup_do_precharge(unsigned long count) 4980 { 4981 int ret = 0; 4982 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4983 struct mem_cgroup *mem = mc.to; 4984 4985 if (mem_cgroup_is_root(mem)) { 4986 mc.precharge += count; 4987 /* we don't need css_get for root */ 4988 return ret; 4989 } 4990 /* try to charge at once */ 4991 if (count > 1) { 4992 struct res_counter *dummy; 4993 /* 4994 * "mem" cannot be under rmdir() because we've already checked 4995 * by cgroup_lock_live_cgroup() that it is not removed and we 4996 * are still under the same cgroup_mutex. So we can postpone 4997 * css_get(). 4998 */ 4999 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5000 goto one_by_one; 5001 if (do_swap_account && res_counter_charge(&mem->memsw, 5002 PAGE_SIZE * count, &dummy)) { 5003 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5004 goto one_by_one; 5005 } 5006 mc.precharge += count; 5007 return ret; 5008 } 5009 one_by_one: 5010 /* fall back to one by one charge */ 5011 while (count--) { 5012 if (signal_pending(current)) { 5013 ret = -EINTR; 5014 break; 5015 } 5016 if (!batch_count--) { 5017 batch_count = PRECHARGE_COUNT_AT_ONCE; 5018 cond_resched(); 5019 } 5020 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5021 if (ret || !mem) 5022 /* mem_cgroup_clear_mc() will do uncharge later */ 5023 return -ENOMEM; 5024 mc.precharge++; 5025 } 5026 return ret; 5027 } 5028 5029 /** 5030 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5031 * @vma: the vma the pte to be checked belongs 5032 * @addr: the address corresponding to the pte to be 
checked 5033 * @ptent: the pte to be checked 5034 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5035 * 5036 * Returns 5037 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5038 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5039 * move charge. if @target is not NULL, the page is stored in target->page 5040 * with extra refcnt got(Callers should handle it). 5041 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5042 * target for charge migration. if @target is not NULL, the entry is stored 5043 * in target->ent. 5044 * 5045 * Called with pte lock held. 5046 */ 5047 union mc_target { 5048 struct page *page; 5049 swp_entry_t ent; 5050 }; 5051 5052 enum mc_target_type { 5053 MC_TARGET_NONE, /* not used */ 5054 MC_TARGET_PAGE, 5055 MC_TARGET_SWAP, 5056 }; 5057 5058 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5059 unsigned long addr, pte_t ptent) 5060 { 5061 struct page *page = vm_normal_page(vma, addr, ptent); 5062 5063 if (!page || !page_mapped(page)) 5064 return NULL; 5065 if (PageAnon(page)) { 5066 /* we don't move shared anon */ 5067 if (!move_anon() || page_mapcount(page) > 2) 5068 return NULL; 5069 } else if (!move_file()) 5070 /* we ignore mapcount for file pages */ 5071 return NULL; 5072 if (!get_page_unless_zero(page)) 5073 return NULL; 5074 5075 return page; 5076 } 5077 5078 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5079 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5080 { 5081 int usage_count; 5082 struct page *page = NULL; 5083 swp_entry_t ent = pte_to_swp_entry(ptent); 5084 5085 if (!move_anon() || non_swap_entry(ent)) 5086 return NULL; 5087 usage_count = mem_cgroup_count_swap_user(ent, &page); 5088 if (usage_count > 1) { /* we don't move shared anon */ 5089 if (page) 5090 put_page(page); 5091 return NULL; 5092 } 5093 if (do_swap_account) 5094 entry->val = ent.val; 5095 5096 return page; 5097 } 5098 5099 
static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5100 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5101 { 5102 struct page *page = NULL; 5103 struct inode *inode; 5104 struct address_space *mapping; 5105 pgoff_t pgoff; 5106 5107 if (!vma->vm_file) /* anonymous vma */ 5108 return NULL; 5109 if (!move_file()) 5110 return NULL; 5111 5112 inode = vma->vm_file->f_path.dentry->d_inode; 5113 mapping = vma->vm_file->f_mapping; 5114 if (pte_none(ptent)) 5115 pgoff = linear_page_index(vma, addr); 5116 else /* pte_file(ptent) is true */ 5117 pgoff = pte_to_pgoff(ptent); 5118 5119 /* page is moved even if it's not RSS of this task(page-faulted). */ 5120 page = find_get_page(mapping, pgoff); 5121 5122 #ifdef CONFIG_SWAP 5123 /* shmem/tmpfs may report page out on swap: account for that too. */ 5124 if (radix_tree_exceptional_entry(page)) { 5125 swp_entry_t swap = radix_to_swp_entry(page); 5126 if (do_swap_account) 5127 *entry = swap; 5128 page = find_get_page(&swapper_space, swap.val); 5129 } 5130 #endif 5131 return page; 5132 } 5133 5134 static int is_target_pte_for_mc(struct vm_area_struct *vma, 5135 unsigned long addr, pte_t ptent, union mc_target *target) 5136 { 5137 struct page *page = NULL; 5138 struct page_cgroup *pc; 5139 int ret = 0; 5140 swp_entry_t ent = { .val = 0 }; 5141 5142 if (pte_present(ptent)) 5143 page = mc_handle_present_pte(vma, addr, ptent); 5144 else if (is_swap_pte(ptent)) 5145 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5146 else if (pte_none(ptent) || pte_file(ptent)) 5147 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5148 5149 if (!page && !ent.val) 5150 return 0; 5151 if (page) { 5152 pc = lookup_page_cgroup(page); 5153 /* 5154 * Do only loose check w/o page_cgroup lock. 5155 * mem_cgroup_move_account() checks the pc is valid or not under 5156 * the lock. 
5157 */ 5158 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5159 ret = MC_TARGET_PAGE; 5160 if (target) 5161 target->page = page; 5162 } 5163 if (!ret || !target) 5164 put_page(page); 5165 } 5166 /* There is a swap entry and a page doesn't exist or isn't charged */ 5167 if (ent.val && !ret && 5168 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5169 ret = MC_TARGET_SWAP; 5170 if (target) 5171 target->ent = ent; 5172 } 5173 return ret; 5174 } 5175 5176 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5177 unsigned long addr, unsigned long end, 5178 struct mm_walk *walk) 5179 { 5180 struct vm_area_struct *vma = walk->private; 5181 pte_t *pte; 5182 spinlock_t *ptl; 5183 5184 split_huge_page_pmd(walk->mm, pmd); 5185 5186 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5187 for (; addr != end; pte++, addr += PAGE_SIZE) 5188 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5189 mc.precharge++; /* increment precharge temporarily */ 5190 pte_unmap_unlock(pte - 1, ptl); 5191 cond_resched(); 5192 5193 return 0; 5194 } 5195 5196 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5197 { 5198 unsigned long precharge; 5199 struct vm_area_struct *vma; 5200 5201 down_read(&mm->mmap_sem); 5202 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5203 struct mm_walk mem_cgroup_count_precharge_walk = { 5204 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5205 .mm = mm, 5206 .private = vma, 5207 }; 5208 if (is_vm_hugetlb_page(vma)) 5209 continue; 5210 walk_page_range(vma->vm_start, vma->vm_end, 5211 &mem_cgroup_count_precharge_walk); 5212 } 5213 up_read(&mm->mmap_sem); 5214 5215 precharge = mc.precharge; 5216 mc.precharge = 0; 5217 5218 return precharge; 5219 } 5220 5221 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5222 { 5223 unsigned long precharge = mem_cgroup_count_precharge(mm); 5224 5225 VM_BUG_ON(mc.moving_task); 5226 mc.moving_task = current; 5227 return mem_cgroup_do_precharge(precharge); 5228 } 5229 5230 /* cancels 
all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5231 static void __mem_cgroup_clear_mc(void) 5232 { 5233 struct mem_cgroup *from = mc.from; 5234 struct mem_cgroup *to = mc.to; 5235 5236 /* we must uncharge all the leftover precharges from mc.to */ 5237 if (mc.precharge) { 5238 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5239 mc.precharge = 0; 5240 } 5241 /* 5242 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5243 * we must uncharge here. 5244 */ 5245 if (mc.moved_charge) { 5246 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5247 mc.moved_charge = 0; 5248 } 5249 /* we must fixup refcnts and charges */ 5250 if (mc.moved_swap) { 5251 /* uncharge swap account from the old cgroup */ 5252 if (!mem_cgroup_is_root(mc.from)) 5253 res_counter_uncharge(&mc.from->memsw, 5254 PAGE_SIZE * mc.moved_swap); 5255 __mem_cgroup_put(mc.from, mc.moved_swap); 5256 5257 if (!mem_cgroup_is_root(mc.to)) { 5258 /* 5259 * we charged both to->res and to->memsw, so we should 5260 * uncharge to->res. 5261 */ 5262 res_counter_uncharge(&mc.to->res, 5263 PAGE_SIZE * mc.moved_swap); 5264 } 5265 /* we've already done mem_cgroup_get(mc.to) */ 5266 mc.moved_swap = 0; 5267 } 5268 memcg_oom_recover(from); 5269 memcg_oom_recover(to); 5270 wake_up_all(&mc.waitq); 5271 } 5272 5273 static void mem_cgroup_clear_mc(void) 5274 { 5275 struct mem_cgroup *from = mc.from; 5276 5277 /* 5278 * we must clear moving_task before waking up waiters at the end of 5279 * task migration. 
5280 */ 5281 mc.moving_task = NULL; 5282 __mem_cgroup_clear_mc(); 5283 spin_lock(&mc.lock); 5284 mc.from = NULL; 5285 mc.to = NULL; 5286 spin_unlock(&mc.lock); 5287 mem_cgroup_end_move(from); 5288 } 5289 5290 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5291 struct cgroup *cgroup, 5292 struct task_struct *p) 5293 { 5294 int ret = 0; 5295 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5296 5297 if (mem->move_charge_at_immigrate) { 5298 struct mm_struct *mm; 5299 struct mem_cgroup *from = mem_cgroup_from_task(p); 5300 5301 VM_BUG_ON(from == mem); 5302 5303 mm = get_task_mm(p); 5304 if (!mm) 5305 return 0; 5306 /* We move charges only when we move a owner of the mm */ 5307 if (mm->owner == p) { 5308 VM_BUG_ON(mc.from); 5309 VM_BUG_ON(mc.to); 5310 VM_BUG_ON(mc.precharge); 5311 VM_BUG_ON(mc.moved_charge); 5312 VM_BUG_ON(mc.moved_swap); 5313 mem_cgroup_start_move(from); 5314 spin_lock(&mc.lock); 5315 mc.from = from; 5316 mc.to = mem; 5317 spin_unlock(&mc.lock); 5318 /* We set mc.moving_task later */ 5319 5320 ret = mem_cgroup_precharge_mc(mm); 5321 if (ret) 5322 mem_cgroup_clear_mc(); 5323 } 5324 mmput(mm); 5325 } 5326 return ret; 5327 } 5328 5329 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5330 struct cgroup *cgroup, 5331 struct task_struct *p) 5332 { 5333 mem_cgroup_clear_mc(); 5334 } 5335 5336 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5337 unsigned long addr, unsigned long end, 5338 struct mm_walk *walk) 5339 { 5340 int ret = 0; 5341 struct vm_area_struct *vma = walk->private; 5342 pte_t *pte; 5343 spinlock_t *ptl; 5344 5345 split_huge_page_pmd(walk->mm, pmd); 5346 retry: 5347 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5348 for (; addr != end; addr += PAGE_SIZE) { 5349 pte_t ptent = *(pte++); 5350 union mc_target target; 5351 int type; 5352 struct page *page; 5353 struct page_cgroup *pc; 5354 swp_entry_t ent; 5355 5356 if (!mc.precharge) 5357 break; 5358 5359 type = is_target_pte_for_mc(vma, addr, 
ptent, &target); 5360 switch (type) { 5361 case MC_TARGET_PAGE: 5362 page = target.page; 5363 if (isolate_lru_page(page)) 5364 goto put; 5365 pc = lookup_page_cgroup(page); 5366 if (!mem_cgroup_move_account(page, 1, pc, 5367 mc.from, mc.to, false)) { 5368 mc.precharge--; 5369 /* we uncharge from mc.from later. */ 5370 mc.moved_charge++; 5371 } 5372 putback_lru_page(page); 5373 put: /* is_target_pte_for_mc() gets the page */ 5374 put_page(page); 5375 break; 5376 case MC_TARGET_SWAP: 5377 ent = target.ent; 5378 if (!mem_cgroup_move_swap_account(ent, 5379 mc.from, mc.to, false)) { 5380 mc.precharge--; 5381 /* we fixup refcnts and charges later. */ 5382 mc.moved_swap++; 5383 } 5384 break; 5385 default: 5386 break; 5387 } 5388 } 5389 pte_unmap_unlock(pte - 1, ptl); 5390 cond_resched(); 5391 5392 if (addr != end) { 5393 /* 5394 * We have consumed all precharges we got in can_attach(). 5395 * We try charge one by one, but don't do any additional 5396 * charges to mc.to if we have failed in charge once in attach() 5397 * phase. 5398 */ 5399 ret = mem_cgroup_do_precharge(1); 5400 if (!ret) 5401 goto retry; 5402 } 5403 5404 return ret; 5405 } 5406 5407 static void mem_cgroup_move_charge(struct mm_struct *mm) 5408 { 5409 struct vm_area_struct *vma; 5410 5411 lru_add_drain_all(); 5412 retry: 5413 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5414 /* 5415 * Someone who are holding the mmap_sem might be waiting in 5416 * waitq. So we cancel all extra charges, wake up all waiters, 5417 * and retry. Because we cancel precharges, we might not be able 5418 * to move enough charges, but moving charge is a best-effort 5419 * feature anyway, so it wouldn't be a big problem. 
5420 */ 5421 __mem_cgroup_clear_mc(); 5422 cond_resched(); 5423 goto retry; 5424 } 5425 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5426 int ret; 5427 struct mm_walk mem_cgroup_move_charge_walk = { 5428 .pmd_entry = mem_cgroup_move_charge_pte_range, 5429 .mm = mm, 5430 .private = vma, 5431 }; 5432 if (is_vm_hugetlb_page(vma)) 5433 continue; 5434 ret = walk_page_range(vma->vm_start, vma->vm_end, 5435 &mem_cgroup_move_charge_walk); 5436 if (ret) 5437 /* 5438 * means we have consumed all precharges and failed in 5439 * doing additional charge. Just abandon here. 5440 */ 5441 break; 5442 } 5443 up_read(&mm->mmap_sem); 5444 } 5445 5446 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5447 struct cgroup *cont, 5448 struct cgroup *old_cont, 5449 struct task_struct *p) 5450 { 5451 struct mm_struct *mm = get_task_mm(p); 5452 5453 if (mm) { 5454 if (mc.to) 5455 mem_cgroup_move_charge(mm); 5456 put_swap_token(mm); 5457 mmput(mm); 5458 } 5459 if (mc.to) 5460 mem_cgroup_clear_mc(); 5461 } 5462 #else /* !CONFIG_MMU */ 5463 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5464 struct cgroup *cgroup, 5465 struct task_struct *p) 5466 { 5467 return 0; 5468 } 5469 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5470 struct cgroup *cgroup, 5471 struct task_struct *p) 5472 { 5473 } 5474 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5475 struct cgroup *cont, 5476 struct cgroup *old_cont, 5477 struct task_struct *p) 5478 { 5479 } 5480 #endif 5481 5482 struct cgroup_subsys mem_cgroup_subsys = { 5483 .name = "memory", 5484 .subsys_id = mem_cgroup_subsys_id, 5485 .create = mem_cgroup_create, 5486 .pre_destroy = mem_cgroup_pre_destroy, 5487 .destroy = mem_cgroup_destroy, 5488 .populate = mem_cgroup_populate, 5489 .can_attach = mem_cgroup_can_attach, 5490 .cancel_attach = mem_cgroup_cancel_attach, 5491 .attach = mem_cgroup_move_task, 5492 .early_init = 0, 5493 .use_id = 1, 5494 }; 5495 5496 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5497 static 
int __init enable_swap_account(char *s) 5498 { 5499 /* consider enabled if no parameter or 1 is given */ 5500 if (!strcmp(s, "1")) 5501 really_do_swap_account = 1; 5502 else if (!strcmp(s, "0")) 5503 really_do_swap_account = 0; 5504 return 1; 5505 } 5506 __setup("swapaccount=", enable_swap_account); 5507 5508 #endif 5509