1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * This program is free software; you can redistribute it and/or modify 14 * it under the terms of the GNU General Public License as published by 15 * the Free Software Foundation; either version 2 of the License, or 16 * (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 * GNU General Public License for more details. 22 */ 23 24 #include <linux/res_counter.h> 25 #include <linux/memcontrol.h> 26 #include <linux/cgroup.h> 27 #include <linux/mm.h> 28 #include <linux/hugetlb.h> 29 #include <linux/pagemap.h> 30 #include <linux/smp.h> 31 #include <linux/page-flags.h> 32 #include <linux/backing-dev.h> 33 #include <linux/bit_spinlock.h> 34 #include <linux/rcupdate.h> 35 #include <linux/limits.h> 36 #include <linux/export.h> 37 #include <linux/mutex.h> 38 #include <linux/rbtree.h> 39 #include <linux/slab.h> 40 #include <linux/swap.h> 41 #include <linux/swapops.h> 42 #include <linux/spinlock.h> 43 #include <linux/eventfd.h> 44 #include <linux/sort.h> 45 #include <linux/fs.h> 46 #include <linux/seq_file.h> 47 #include <linux/vmalloc.h> 48 #include <linux/mm_inline.h> 49 #include <linux/page_cgroup.h> 50 #include <linux/cpu.h> 51 #include <linux/oom.h> 52 #include "internal.h" 53 #include <net/sock.h> 54 #include <net/tcp_memcontrol.h> 55 56 #include <asm/uaccess.h> 57 58 #include <trace/events/vmscan.h> 59 60 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 61 #define MEM_CGROUP_RECLAIM_RETRIES 5 62 struct mem_cgroup *root_mem_cgroup __read_mostly; 63 64 
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* Remembers the boot option. */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		(0)
#endif


/*
 * Statistics for memory cgroup.
 * Indexes into mem_cgroup_stat_cpu::count[].
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAPOUT,	/* # of pages, swapped out */
	MEM_CGROUP_STAT_DATA,		/* end of data requires synchronization */
	MEM_CGROUP_STAT_NSTATS,
};

/* Indexes into mem_cgroup_stat_cpu::events[]. */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
108 */ 109 enum mem_cgroup_events_target { 110 MEM_CGROUP_TARGET_THRESH, 111 MEM_CGROUP_TARGET_SOFTLIMIT, 112 MEM_CGROUP_TARGET_NUMAINFO, 113 MEM_CGROUP_NTARGETS, 114 }; 115 #define THRESHOLDS_EVENTS_TARGET (128) 116 #define SOFTLIMIT_EVENTS_TARGET (1024) 117 #define NUMAINFO_EVENTS_TARGET (1024) 118 119 struct mem_cgroup_stat_cpu { 120 long count[MEM_CGROUP_STAT_NSTATS]; 121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 122 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 }; 124 125 struct mem_cgroup_reclaim_iter { 126 /* css_id of the last scanned hierarchy member */ 127 int position; 128 /* scan generation, increased every round-trip */ 129 unsigned int generation; 130 }; 131 132 /* 133 * per-zone information in memory controller. 134 */ 135 struct mem_cgroup_per_zone { 136 struct lruvec lruvec; 137 unsigned long lru_size[NR_LRU_LISTS]; 138 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 140 141 struct zone_reclaim_stat reclaim_stat; 142 struct rb_node tree_node; /* RB tree node */ 143 unsigned long long usage_in_excess;/* Set to the value by which */ 144 /* the soft limit is exceeded*/ 145 bool on_tree; 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 147 /* use container_of */ 148 }; 149 150 struct mem_cgroup_per_node { 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 152 }; 153 154 struct mem_cgroup_lru_info { 155 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 156 }; 157 158 /* 159 * Cgroups above their limits are maintained in a RB-Tree, independent of 160 * their hierarchy representation 161 */ 162 163 struct mem_cgroup_tree_per_zone { 164 struct rb_root rb_root; 165 spinlock_t lock; 166 }; 167 168 struct mem_cgroup_tree_per_node { 169 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 170 }; 171 172 struct mem_cgroup_tree { 173 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 174 }; 175 176 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 177 178 struct mem_cgroup_threshold 
{ 179 struct eventfd_ctx *eventfd; 180 u64 threshold; 181 }; 182 183 /* For threshold */ 184 struct mem_cgroup_threshold_ary { 185 /* An array index points to threshold just below usage. */ 186 int current_threshold; 187 /* Size of entries[] */ 188 unsigned int size; 189 /* Array of thresholds */ 190 struct mem_cgroup_threshold entries[0]; 191 }; 192 193 struct mem_cgroup_thresholds { 194 /* Primary thresholds array */ 195 struct mem_cgroup_threshold_ary *primary; 196 /* 197 * Spare threshold array. 198 * This is needed to make mem_cgroup_unregister_event() "never fail". 199 * It must be able to store at least primary->size - 1 entries. 200 */ 201 struct mem_cgroup_threshold_ary *spare; 202 }; 203 204 /* for OOM */ 205 struct mem_cgroup_eventfd_list { 206 struct list_head list; 207 struct eventfd_ctx *eventfd; 208 }; 209 210 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 211 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 212 213 /* 214 * The memory controller data structure. The memory controller controls both 215 * page cache and RSS per cgroup. We would eventually like to provide 216 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 217 * to help the administrator determine what knobs to tune. 218 * 219 * TODO: Add a water mark for the memory controller. Reclaim will begin when 220 * we hit the water mark. May be even add a low water mark, such that 221 * no reclaim occurs from a cgroup at it's low water mark, this is 222 * a feature that will be implemented much later in the future. 223 */ 224 struct mem_cgroup { 225 struct cgroup_subsys_state css; 226 /* 227 * the counter to account for memory usage 228 */ 229 struct res_counter res; 230 231 union { 232 /* 233 * the counter to account for mem+swap usage. 234 */ 235 struct res_counter memsw; 236 237 /* 238 * rcu_freeing is used only when freeing struct mem_cgroup, 239 * so put it into a union to avoid wasting more memory. 
240 * It must be disjoint from the css field. It could be 241 * in a union with the res field, but res plays a much 242 * larger part in mem_cgroup life than memsw, and might 243 * be of interest, even at time of free, when debugging. 244 * So share rcu_head with the less interesting memsw. 245 */ 246 struct rcu_head rcu_freeing; 247 /* 248 * But when using vfree(), that cannot be done at 249 * interrupt time, so we must then queue the work. 250 */ 251 struct work_struct work_freeing; 252 }; 253 254 /* 255 * Per cgroup active and inactive list, similar to the 256 * per zone LRU lists. 257 */ 258 struct mem_cgroup_lru_info info; 259 int last_scanned_node; 260 #if MAX_NUMNODES > 1 261 nodemask_t scan_nodes; 262 atomic_t numainfo_events; 263 atomic_t numainfo_updating; 264 #endif 265 /* 266 * Should the accounting and control be hierarchical, per subtree? 267 */ 268 bool use_hierarchy; 269 270 bool oom_lock; 271 atomic_t under_oom; 272 273 atomic_t refcnt; 274 275 int swappiness; 276 /* OOM-Killer disable */ 277 int oom_kill_disable; 278 279 /* set when res.limit == memsw.limit */ 280 bool memsw_is_minimum; 281 282 /* protect arrays of thresholds */ 283 struct mutex thresholds_lock; 284 285 /* thresholds for memory usage. RCU-protected */ 286 struct mem_cgroup_thresholds thresholds; 287 288 /* thresholds for mem+swap usage. RCU-protected */ 289 struct mem_cgroup_thresholds memsw_thresholds; 290 291 /* For oom notifier event fd */ 292 struct list_head oom_notify; 293 294 /* 295 * Should we move charges of a task when a task is moved into this 296 * mem_cgroup ? And what type of charges should we move ? 297 */ 298 unsigned long move_charge_at_immigrate; 299 /* 300 * set > 0 if pages under this cgroup are moving to other cgroup. 301 */ 302 atomic_t moving_account; 303 /* taken only while moving_account > 0 */ 304 spinlock_t move_lock; 305 /* 306 * percpu counter. 
307 */ 308 struct mem_cgroup_stat_cpu *stat; 309 /* 310 * used when a cpu is offlined or other synchronizations 311 * See mem_cgroup_read_stat(). 312 */ 313 struct mem_cgroup_stat_cpu nocpu_base; 314 spinlock_t pcp_counter_lock; 315 316 #ifdef CONFIG_INET 317 struct tcp_memcontrol tcp_mem; 318 #endif 319 }; 320 321 /* Stuffs for move charges at task migration. */ 322 /* 323 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 324 * left-shifted bitmap of these types. 325 */ 326 enum move_type { 327 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 328 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 329 NR_MOVE_TYPE, 330 }; 331 332 /* "mc" and its members are protected by cgroup_mutex */ 333 static struct move_charge_struct { 334 spinlock_t lock; /* for from, to */ 335 struct mem_cgroup *from; 336 struct mem_cgroup *to; 337 unsigned long precharge; 338 unsigned long moved_charge; 339 unsigned long moved_swap; 340 struct task_struct *moving_task; /* a task moving charges */ 341 wait_queue_head_t waitq; /* a waitq for other context */ 342 } mc = { 343 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 344 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 345 }; 346 347 static bool move_anon(void) 348 { 349 return test_bit(MOVE_CHARGE_TYPE_ANON, 350 &mc.to->move_charge_at_immigrate); 351 } 352 353 static bool move_file(void) 354 { 355 return test_bit(MOVE_CHARGE_TYPE_FILE, 356 &mc.to->move_charge_at_immigrate); 357 } 358 359 /* 360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 361 * limit reclaim to prevent infinite loops, if they ever occur. 
362 */ 363 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 364 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 365 366 enum charge_type { 367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 368 MEM_CGROUP_CHARGE_TYPE_MAPPED, 369 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 370 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 371 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 372 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 373 NR_CHARGE_TYPE, 374 }; 375 376 /* for encoding cft->private value on file */ 377 #define _MEM (0) 378 #define _MEMSWAP (1) 379 #define _OOM_TYPE (2) 380 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 381 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 382 #define MEMFILE_ATTR(val) ((val) & 0xffff) 383 /* Used for OOM nofiier */ 384 #define OOM_CONTROL (0) 385 386 /* 387 * Reclaim flags for mem_cgroup_hierarchical_reclaim 388 */ 389 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 390 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 391 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 392 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 393 394 static void mem_cgroup_get(struct mem_cgroup *memcg); 395 static void mem_cgroup_put(struct mem_cgroup *memcg); 396 397 /* Writing them here to avoid exposing memcg's inner layout */ 398 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 399 #include <net/sock.h> 400 #include <net/ip.h> 401 402 static bool mem_cgroup_is_root(struct mem_cgroup *memcg); 403 void sock_update_memcg(struct sock *sk) 404 { 405 if (mem_cgroup_sockets_enabled) { 406 struct mem_cgroup *memcg; 407 408 BUG_ON(!sk->sk_prot->proto_cgroup); 409 410 /* Socket cloning can throw us here with sk_cgrp already 411 * filled. It won't however, necessarily happen from 412 * process context. So the test for root memcg given 413 * the current task's memcg won't help us in this case. 
414 * 415 * Respecting the original socket's memcg is a better 416 * decision in this case. 417 */ 418 if (sk->sk_cgrp) { 419 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 420 mem_cgroup_get(sk->sk_cgrp->memcg); 421 return; 422 } 423 424 rcu_read_lock(); 425 memcg = mem_cgroup_from_task(current); 426 if (!mem_cgroup_is_root(memcg)) { 427 mem_cgroup_get(memcg); 428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 429 } 430 rcu_read_unlock(); 431 } 432 } 433 EXPORT_SYMBOL(sock_update_memcg); 434 435 void sock_release_memcg(struct sock *sk) 436 { 437 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 438 struct mem_cgroup *memcg; 439 WARN_ON(!sk->sk_cgrp->memcg); 440 memcg = sk->sk_cgrp->memcg; 441 mem_cgroup_put(memcg); 442 } 443 } 444 445 #ifdef CONFIG_INET 446 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 447 { 448 if (!memcg || mem_cgroup_is_root(memcg)) 449 return NULL; 450 451 return &memcg->tcp_mem.cg_proto; 452 } 453 EXPORT_SYMBOL(tcp_proto_cgroup); 454 #endif /* CONFIG_INET */ 455 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 456 457 static void drain_all_stock_async(struct mem_cgroup *memcg); 458 459 static struct mem_cgroup_per_zone * 460 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 461 { 462 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 463 } 464 465 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 466 { 467 return &memcg->css; 468 } 469 470 static struct mem_cgroup_per_zone * 471 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 472 { 473 int nid = page_to_nid(page); 474 int zid = page_zonenum(page); 475 476 return mem_cgroup_zoneinfo(memcg, nid, zid); 477 } 478 479 static struct mem_cgroup_tree_per_zone * 480 soft_limit_tree_node_zone(int nid, int zid) 481 { 482 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 483 } 484 485 static struct mem_cgroup_tree_per_zone * 486 soft_limit_tree_from_page(struct page *page) 487 { 488 int nid = page_to_nid(page); 489 int zid = 
page_zonenum(page); 490 491 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 492 } 493 494 static void 495 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 496 struct mem_cgroup_per_zone *mz, 497 struct mem_cgroup_tree_per_zone *mctz, 498 unsigned long long new_usage_in_excess) 499 { 500 struct rb_node **p = &mctz->rb_root.rb_node; 501 struct rb_node *parent = NULL; 502 struct mem_cgroup_per_zone *mz_node; 503 504 if (mz->on_tree) 505 return; 506 507 mz->usage_in_excess = new_usage_in_excess; 508 if (!mz->usage_in_excess) 509 return; 510 while (*p) { 511 parent = *p; 512 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 513 tree_node); 514 if (mz->usage_in_excess < mz_node->usage_in_excess) 515 p = &(*p)->rb_left; 516 /* 517 * We can't avoid mem cgroups that are over their soft 518 * limit by the same amount 519 */ 520 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 521 p = &(*p)->rb_right; 522 } 523 rb_link_node(&mz->tree_node, parent, p); 524 rb_insert_color(&mz->tree_node, &mctz->rb_root); 525 mz->on_tree = true; 526 } 527 528 static void 529 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 530 struct mem_cgroup_per_zone *mz, 531 struct mem_cgroup_tree_per_zone *mctz) 532 { 533 if (!mz->on_tree) 534 return; 535 rb_erase(&mz->tree_node, &mctz->rb_root); 536 mz->on_tree = false; 537 } 538 539 static void 540 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 541 struct mem_cgroup_per_zone *mz, 542 struct mem_cgroup_tree_per_zone *mctz) 543 { 544 spin_lock(&mctz->lock); 545 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 546 spin_unlock(&mctz->lock); 547 } 548 549 550 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 551 { 552 unsigned long long excess; 553 struct mem_cgroup_per_zone *mz; 554 struct mem_cgroup_tree_per_zone *mctz; 555 int nid = page_to_nid(page); 556 int zid = page_zonenum(page); 557 mctz = soft_limit_tree_from_page(page); 558 559 /* 560 * Necessary to update all 
ancestors when hierarchy is used. 561 * because their event counter is not touched. 562 */ 563 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 564 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 565 excess = res_counter_soft_limit_excess(&memcg->res); 566 /* 567 * We have to update the tree if mz is on RB-tree or 568 * mem is over its softlimit. 569 */ 570 if (excess || mz->on_tree) { 571 spin_lock(&mctz->lock); 572 /* if on-tree, remove it */ 573 if (mz->on_tree) 574 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 575 /* 576 * Insert again. mz->usage_in_excess will be updated. 577 * If excess is 0, no tree ops. 578 */ 579 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 580 spin_unlock(&mctz->lock); 581 } 582 } 583 } 584 585 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 586 { 587 int node, zone; 588 struct mem_cgroup_per_zone *mz; 589 struct mem_cgroup_tree_per_zone *mctz; 590 591 for_each_node(node) { 592 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 593 mz = mem_cgroup_zoneinfo(memcg, node, zone); 594 mctz = soft_limit_tree_node_zone(node, zone); 595 mem_cgroup_remove_exceeded(memcg, mz, mctz); 596 } 597 } 598 } 599 600 static struct mem_cgroup_per_zone * 601 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 602 { 603 struct rb_node *rightmost = NULL; 604 struct mem_cgroup_per_zone *mz; 605 606 retry: 607 mz = NULL; 608 rightmost = rb_last(&mctz->rb_root); 609 if (!rightmost) 610 goto done; /* Nothing to reclaim from */ 611 612 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 613 /* 614 * Remove the node now but someone else can add it back, 615 * we will to add it back at the end of reclaim to its correct 616 * position in the tree. 
617 */ 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 620 !css_tryget(&mz->memcg->css)) 621 goto retry; 622 done: 623 return mz; 624 } 625 626 static struct mem_cgroup_per_zone * 627 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 628 { 629 struct mem_cgroup_per_zone *mz; 630 631 spin_lock(&mctz->lock); 632 mz = __mem_cgroup_largest_soft_limit_node(mctz); 633 spin_unlock(&mctz->lock); 634 return mz; 635 } 636 637 /* 638 * Implementation Note: reading percpu statistics for memcg. 639 * 640 * Both of vmstat[] and percpu_counter has threshold and do periodic 641 * synchronization to implement "quick" read. There are trade-off between 642 * reading cost and precision of value. Then, we may have a chance to implement 643 * a periodic synchronizion of counter in memcg's counter. 644 * 645 * But this _read() function is used for user interface now. The user accounts 646 * memory usage by memory cgroup and he _always_ requires exact value because 647 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 648 * have to visit all online cpus and make sum. So, for now, unnecessary 649 * synchronization is not implemented. (just implemented for cpu hotplug) 650 * 651 * If there are kernel internal actions which can make use of some not-exact 652 * value, and reading all cpu value can be performance bottleneck in some 653 * common workload, threashold and synchonization as vmstat[] should be 654 * implemented. 
655 */ 656 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 657 enum mem_cgroup_stat_index idx) 658 { 659 long val = 0; 660 int cpu; 661 662 get_online_cpus(); 663 for_each_online_cpu(cpu) 664 val += per_cpu(memcg->stat->count[idx], cpu); 665 #ifdef CONFIG_HOTPLUG_CPU 666 spin_lock(&memcg->pcp_counter_lock); 667 val += memcg->nocpu_base.count[idx]; 668 spin_unlock(&memcg->pcp_counter_lock); 669 #endif 670 put_online_cpus(); 671 return val; 672 } 673 674 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 675 bool charge) 676 { 677 int val = (charge) ? 1 : -1; 678 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 679 } 680 681 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 682 enum mem_cgroup_events_index idx) 683 { 684 unsigned long val = 0; 685 int cpu; 686 687 for_each_online_cpu(cpu) 688 val += per_cpu(memcg->stat->events[idx], cpu); 689 #ifdef CONFIG_HOTPLUG_CPU 690 spin_lock(&memcg->pcp_counter_lock); 691 val += memcg->nocpu_base.events[idx]; 692 spin_unlock(&memcg->pcp_counter_lock); 693 #endif 694 return val; 695 } 696 697 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 698 bool anon, int nr_pages) 699 { 700 preempt_disable(); 701 702 /* 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 704 * counted as CACHE even if it's on ANON LRU. 705 */ 706 if (anon) 707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 708 nr_pages); 709 else 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 711 nr_pages); 712 713 /* pagein of a big page is an event. 
So, ignore page size */ 714 if (nr_pages > 0) 715 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 716 else { 717 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 718 nr_pages = -nr_pages; /* for event */ 719 } 720 721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 722 723 preempt_enable(); 724 } 725 726 unsigned long 727 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 728 unsigned int lru_mask) 729 { 730 struct mem_cgroup_per_zone *mz; 731 enum lru_list lru; 732 unsigned long ret = 0; 733 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 735 736 for_each_lru(lru) { 737 if (BIT(lru) & lru_mask) 738 ret += mz->lru_size[lru]; 739 } 740 return ret; 741 } 742 743 static unsigned long 744 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 745 int nid, unsigned int lru_mask) 746 { 747 u64 total = 0; 748 int zid; 749 750 for (zid = 0; zid < MAX_NR_ZONES; zid++) 751 total += mem_cgroup_zone_nr_lru_pages(memcg, 752 nid, zid, lru_mask); 753 754 return total; 755 } 756 757 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 758 unsigned int lru_mask) 759 { 760 int nid; 761 u64 total = 0; 762 763 for_each_node_state(nid, N_HIGH_MEMORY) 764 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 765 return total; 766 } 767 768 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 769 enum mem_cgroup_events_target target) 770 { 771 unsigned long val, next; 772 773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 774 next = __this_cpu_read(memcg->stat->targets[target]); 775 /* from time_after() in jiffies.h */ 776 if ((long)next - (long)val < 0) { 777 switch (target) { 778 case MEM_CGROUP_TARGET_THRESH: 779 next = val + THRESHOLDS_EVENTS_TARGET; 780 break; 781 case MEM_CGROUP_TARGET_SOFTLIMIT: 782 next = val + SOFTLIMIT_EVENTS_TARGET; 783 break; 784 case MEM_CGROUP_TARGET_NUMAINFO: 785 next = val + NUMAINFO_EVENTS_TARGET; 786 break; 787 default: 
788 break; 789 } 790 __this_cpu_write(memcg->stat->targets[target], next); 791 return true; 792 } 793 return false; 794 } 795 796 /* 797 * Check events in order. 798 * 799 */ 800 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 801 { 802 preempt_disable(); 803 /* threshold event is triggered in finer grain than soft limit */ 804 if (unlikely(mem_cgroup_event_ratelimit(memcg, 805 MEM_CGROUP_TARGET_THRESH))) { 806 bool do_softlimit; 807 bool do_numainfo __maybe_unused; 808 809 do_softlimit = mem_cgroup_event_ratelimit(memcg, 810 MEM_CGROUP_TARGET_SOFTLIMIT); 811 #if MAX_NUMNODES > 1 812 do_numainfo = mem_cgroup_event_ratelimit(memcg, 813 MEM_CGROUP_TARGET_NUMAINFO); 814 #endif 815 preempt_enable(); 816 817 mem_cgroup_threshold(memcg); 818 if (unlikely(do_softlimit)) 819 mem_cgroup_update_tree(memcg, page); 820 #if MAX_NUMNODES > 1 821 if (unlikely(do_numainfo)) 822 atomic_inc(&memcg->numainfo_events); 823 #endif 824 } else 825 preempt_enable(); 826 } 827 828 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 829 { 830 return container_of(cgroup_subsys_state(cont, 831 mem_cgroup_subsys_id), struct mem_cgroup, 832 css); 833 } 834 835 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 836 { 837 /* 838 * mm_update_next_owner() may clear mm->owner to NULL 839 * if it races with swapoff, page migration, etc. 840 * So this can be called with p == NULL. 841 */ 842 if (unlikely(!p)) 843 return NULL; 844 845 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 846 struct mem_cgroup, css); 847 } 848 849 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 850 { 851 struct mem_cgroup *memcg = NULL; 852 853 if (!mm) 854 return NULL; 855 /* 856 * Because we have no locks, mm->owner's may be being moved to other 857 * cgroup. We use css_tryget() here even if this looks 858 * pessimistic (rather than adding locks here). 
859 */ 860 rcu_read_lock(); 861 do { 862 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 863 if (unlikely(!memcg)) 864 break; 865 } while (!css_tryget(&memcg->css)); 866 rcu_read_unlock(); 867 return memcg; 868 } 869 870 /** 871 * mem_cgroup_iter - iterate over memory cgroup hierarchy 872 * @root: hierarchy root 873 * @prev: previously returned memcg, NULL on first invocation 874 * @reclaim: cookie for shared reclaim walks, NULL for full walks 875 * 876 * Returns references to children of the hierarchy below @root, or 877 * @root itself, or %NULL after a full round-trip. 878 * 879 * Caller must pass the return value in @prev on subsequent 880 * invocations for reference counting, or use mem_cgroup_iter_break() 881 * to cancel a hierarchy walk before the round-trip is complete. 882 * 883 * Reclaimers can specify a zone and a priority level in @reclaim to 884 * divide up the memcgs in the hierarchy among all concurrent 885 * reclaimers operating on the same zone and priority. 886 */ 887 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 888 struct mem_cgroup *prev, 889 struct mem_cgroup_reclaim_cookie *reclaim) 890 { 891 struct mem_cgroup *memcg = NULL; 892 int id = 0; 893 894 if (mem_cgroup_disabled()) 895 return NULL; 896 897 if (!root) 898 root = root_mem_cgroup; 899 900 if (prev && !reclaim) 901 id = css_id(&prev->css); 902 903 if (prev && prev != root) 904 css_put(&prev->css); 905 906 if (!root->use_hierarchy && root != root_mem_cgroup) { 907 if (prev) 908 return NULL; 909 return root; 910 } 911 912 while (!memcg) { 913 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 914 struct cgroup_subsys_state *css; 915 916 if (reclaim) { 917 int nid = zone_to_nid(reclaim->zone); 918 int zid = zone_idx(reclaim->zone); 919 struct mem_cgroup_per_zone *mz; 920 921 mz = mem_cgroup_zoneinfo(root, nid, zid); 922 iter = &mz->reclaim_iter[reclaim->priority]; 923 if (prev && reclaim->generation != iter->generation) 924 return NULL; 925 id = iter->position; 
926 } 927 928 rcu_read_lock(); 929 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 930 if (css) { 931 if (css == &root->css || css_tryget(css)) 932 memcg = container_of(css, 933 struct mem_cgroup, css); 934 } else 935 id = 0; 936 rcu_read_unlock(); 937 938 if (reclaim) { 939 iter->position = id; 940 if (!css) 941 iter->generation++; 942 else if (!prev && memcg) 943 reclaim->generation = iter->generation; 944 } 945 946 if (prev && !css) 947 return NULL; 948 } 949 return memcg; 950 } 951 952 /** 953 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 954 * @root: hierarchy root 955 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 956 */ 957 void mem_cgroup_iter_break(struct mem_cgroup *root, 958 struct mem_cgroup *prev) 959 { 960 if (!root) 961 root = root_mem_cgroup; 962 if (prev && prev != root) 963 css_put(&prev->css); 964 } 965 966 /* 967 * Iteration constructs for visiting all cgroups (under a tree). If 968 * loops are exited prematurely (break), mem_cgroup_iter_break() must 969 * be used for reference counting. 
970 */ 971 #define for_each_mem_cgroup_tree(iter, root) \ 972 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 973 iter != NULL; \ 974 iter = mem_cgroup_iter(root, iter, NULL)) 975 976 #define for_each_mem_cgroup(iter) \ 977 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 978 iter != NULL; \ 979 iter = mem_cgroup_iter(NULL, iter, NULL)) 980 981 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 982 { 983 return (memcg == root_mem_cgroup); 984 } 985 986 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 987 { 988 struct mem_cgroup *memcg; 989 990 if (!mm) 991 return; 992 993 rcu_read_lock(); 994 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 995 if (unlikely(!memcg)) 996 goto out; 997 998 switch (idx) { 999 case PGFAULT: 1000 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1001 break; 1002 case PGMAJFAULT: 1003 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1004 break; 1005 default: 1006 BUG(); 1007 } 1008 out: 1009 rcu_read_unlock(); 1010 } 1011 EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1012 1013 /** 1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1015 * @zone: zone of the wanted lruvec 1016 * @mem: memcg of the wanted lruvec 1017 * 1018 * Returns the lru list vector holding pages for the given @zone and 1019 * @mem. This can be the global zone lruvec, if the memory controller 1020 * is disabled. 1021 */ 1022 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1023 struct mem_cgroup *memcg) 1024 { 1025 struct mem_cgroup_per_zone *mz; 1026 1027 if (mem_cgroup_disabled()) 1028 return &zone->lruvec; 1029 1030 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1031 return &mz->lruvec; 1032 } 1033 1034 /* 1035 * Following LRU functions are allowed to be used without PCG_LOCK. 1036 * Operations are called by routine of global LRU independently from memcg. 1037 * What we have to take care of here is validness of pc->mem_cgroup. 
1038 * 1039 * Changes to pc->mem_cgroup happens when 1040 * 1. charge 1041 * 2. moving account 1042 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 1043 * It is added to LRU before charge. 1044 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1045 * When moving account, the page is not on LRU. It's isolated. 1046 */ 1047 1048 /** 1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1050 * @zone: zone of the page 1051 * @page: the page 1052 * @lru: current lru 1053 * 1054 * This function accounts for @page being added to @lru, and returns 1055 * the lruvec for the given @zone and the memcg @page is charged to. 1056 * 1057 * The callsite is then responsible for physically linking the page to 1058 * the returned lruvec->lists[@lru]. 1059 */ 1060 struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1061 enum lru_list lru) 1062 { 1063 struct mem_cgroup_per_zone *mz; 1064 struct mem_cgroup *memcg; 1065 struct page_cgroup *pc; 1066 1067 if (mem_cgroup_disabled()) 1068 return &zone->lruvec; 1069 1070 pc = lookup_page_cgroup(page); 1071 memcg = pc->mem_cgroup; 1072 1073 /* 1074 * Surreptitiously switch any uncharged page to root: 1075 * an uncharged page off lru does nothing to secure 1076 * its former mem_cgroup from sudden removal. 1077 * 1078 * Our caller holds lru_lock, and PageCgroupUsed is updated 1079 * under page_cgroup lock: between them, they make all uses 1080 * of pc->mem_cgroup safe. 
1081 */ 1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1084 1085 mz = page_cgroup_zoneinfo(memcg, page); 1086 /* compound_order() is stabilized through lru_lock */ 1087 mz->lru_size[lru] += 1 << compound_order(page); 1088 return &mz->lruvec; 1089 } 1090 1091 /** 1092 * mem_cgroup_lru_del_list - account for removing an lru page 1093 * @page: the page 1094 * @lru: target lru 1095 * 1096 * This function accounts for @page being removed from @lru. 1097 * 1098 * The callsite is then responsible for physically unlinking 1099 * @page->lru. 1100 */ 1101 void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1102 { 1103 struct mem_cgroup_per_zone *mz; 1104 struct mem_cgroup *memcg; 1105 struct page_cgroup *pc; 1106 1107 if (mem_cgroup_disabled()) 1108 return; 1109 1110 pc = lookup_page_cgroup(page); 1111 memcg = pc->mem_cgroup; 1112 VM_BUG_ON(!memcg); 1113 mz = page_cgroup_zoneinfo(memcg, page); 1114 /* huge page split is done under lru_lock. so, we have no races. */ 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); 1116 mz->lru_size[lru] -= 1 << compound_order(page); 1117 } 1118 1119 void mem_cgroup_lru_del(struct page *page) 1120 { 1121 mem_cgroup_lru_del_list(page, page_lru(page)); 1122 } 1123 1124 /** 1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus 1126 * @zone: zone of the page 1127 * @page: the page 1128 * @from: current lru 1129 * @to: target lru 1130 * 1131 * This function accounts for @page being moved between the lrus @from 1132 * and @to, and returns the lruvec for the given @zone and the memcg 1133 * @page is charged to. 1134 * 1135 * The callsite is then responsible for physically relinking 1136 * @page->lru to the returned lruvec->lists[@to]. 
1137 */ 1138 struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, 1139 struct page *page, 1140 enum lru_list from, 1141 enum lru_list to) 1142 { 1143 /* XXX: Optimize this, especially for @from == @to */ 1144 mem_cgroup_lru_del_list(page, from); 1145 return mem_cgroup_lru_add_list(zone, page, to); 1146 } 1147 1148 /* 1149 * Checks whether given mem is same or in the root_mem_cgroup's 1150 * hierarchy subtree 1151 */ 1152 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1153 struct mem_cgroup *memcg) 1154 { 1155 if (root_memcg != memcg) { 1156 return (root_memcg->use_hierarchy && 1157 css_is_ancestor(&memcg->css, &root_memcg->css)); 1158 } 1159 1160 return true; 1161 } 1162 1163 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1164 { 1165 int ret; 1166 struct mem_cgroup *curr = NULL; 1167 struct task_struct *p; 1168 1169 p = find_lock_task_mm(task); 1170 if (p) { 1171 curr = try_get_mem_cgroup_from_mm(p->mm); 1172 task_unlock(p); 1173 } else { 1174 /* 1175 * All threads may have already detached their mm's, but the oom 1176 * killer still needs to detect if they have already been oom 1177 * killed to prevent needlessly killing additional tasks. 1178 */ 1179 task_lock(task); 1180 curr = mem_cgroup_from_task(task); 1181 if (curr) 1182 css_get(&curr->css); 1183 task_unlock(task); 1184 } 1185 if (!curr) 1186 return 0; 1187 /* 1188 * We should check use_hierarchy of "memcg" not "curr". Because checking 1189 * use_hierarchy of "curr" here make this function true if hierarchy is 1190 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1191 * hierarchy(even if use_hierarchy is disabled in "memcg"). 
1192 */ 1193 ret = mem_cgroup_same_or_subtree(memcg, curr); 1194 css_put(&curr->css); 1195 return ret; 1196 } 1197 1198 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1199 { 1200 unsigned long inactive_ratio; 1201 int nid = zone_to_nid(zone); 1202 int zid = zone_idx(zone); 1203 unsigned long inactive; 1204 unsigned long active; 1205 unsigned long gb; 1206 1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1208 BIT(LRU_INACTIVE_ANON)); 1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1210 BIT(LRU_ACTIVE_ANON)); 1211 1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1213 if (gb) 1214 inactive_ratio = int_sqrt(10 * gb); 1215 else 1216 inactive_ratio = 1; 1217 1218 return inactive * inactive_ratio < active; 1219 } 1220 1221 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1222 { 1223 unsigned long active; 1224 unsigned long inactive; 1225 int zid = zone_idx(zone); 1226 int nid = zone_to_nid(zone); 1227 1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1229 BIT(LRU_INACTIVE_FILE)); 1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1231 BIT(LRU_ACTIVE_FILE)); 1232 1233 return (active > inactive); 1234 } 1235 1236 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1237 struct zone *zone) 1238 { 1239 int nid = zone_to_nid(zone); 1240 int zid = zone_idx(zone); 1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1242 1243 return &mz->reclaim_stat; 1244 } 1245 1246 struct zone_reclaim_stat * 1247 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1248 { 1249 struct page_cgroup *pc; 1250 struct mem_cgroup_per_zone *mz; 1251 1252 if (mem_cgroup_disabled()) 1253 return NULL; 1254 1255 pc = lookup_page_cgroup(page); 1256 if (!PageCgroupUsed(pc)) 1257 return NULL; 1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. 
*/ 1259 smp_rmb(); 1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1261 return &mz->reclaim_stat; 1262 } 1263 1264 #define mem_cgroup_from_res_counter(counter, member) \ 1265 container_of(counter, struct mem_cgroup, member) 1266 1267 /** 1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1269 * @mem: the memory cgroup 1270 * 1271 * Returns the maximum amount of memory @mem can be charged with, in 1272 * pages. 1273 */ 1274 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1275 { 1276 unsigned long long margin; 1277 1278 margin = res_counter_margin(&memcg->res); 1279 if (do_swap_account) 1280 margin = min(margin, res_counter_margin(&memcg->memsw)); 1281 return margin >> PAGE_SHIFT; 1282 } 1283 1284 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1285 { 1286 struct cgroup *cgrp = memcg->css.cgroup; 1287 1288 /* root ? */ 1289 if (cgrp->parent == NULL) 1290 return vm_swappiness; 1291 1292 return memcg->swappiness; 1293 } 1294 1295 /* 1296 * memcg->moving_account is used for checking possibility that some thread is 1297 * calling move_account(). When a thread on CPU-A starts moving pages under 1298 * a memcg, other threads should check memcg->moving_account under 1299 * rcu_read_lock(), like this: 1300 * 1301 * CPU-A CPU-B 1302 * rcu_read_lock() 1303 * memcg->moving_account+1 if (memcg->mocing_account) 1304 * take heavy locks. 1305 * synchronize_rcu() update something. 1306 * rcu_read_unlock() 1307 * start move here. 1308 */ 1309 1310 /* for quick checking without looking up memcg */ 1311 atomic_t memcg_moving __read_mostly; 1312 1313 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1314 { 1315 atomic_inc(&memcg_moving); 1316 atomic_inc(&memcg->moving_account); 1317 synchronize_rcu(); 1318 } 1319 1320 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1321 { 1322 /* 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1324 * We check NULL in callee rather than caller. 
1325 */ 1326 if (memcg) { 1327 atomic_dec(&memcg_moving); 1328 atomic_dec(&memcg->moving_account); 1329 } 1330 } 1331 1332 /* 1333 * 2 routines for checking "mem" is under move_account() or not. 1334 * 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1336 * is used for avoiding races in accounting. If true, 1337 * pc->mem_cgroup may be overwritten. 1338 * 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1340 * under hierarchy of moving cgroups. This is for 1341 * waiting at hith-memory prressure caused by "move". 1342 */ 1343 1344 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1345 { 1346 VM_BUG_ON(!rcu_read_lock_held()); 1347 return atomic_read(&memcg->moving_account) > 0; 1348 } 1349 1350 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1351 { 1352 struct mem_cgroup *from; 1353 struct mem_cgroup *to; 1354 bool ret = false; 1355 /* 1356 * Unlike task_move routines, we access mc.to, mc.from not under 1357 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1358 */ 1359 spin_lock(&mc.lock); 1360 from = mc.from; 1361 to = mc.to; 1362 if (!from) 1363 goto unlock; 1364 1365 ret = mem_cgroup_same_or_subtree(memcg, from) 1366 || mem_cgroup_same_or_subtree(memcg, to); 1367 unlock: 1368 spin_unlock(&mc.lock); 1369 return ret; 1370 } 1371 1372 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1373 { 1374 if (mc.moving_task && current != mc.moving_task) { 1375 if (mem_cgroup_under_move(memcg)) { 1376 DEFINE_WAIT(wait); 1377 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1378 /* moving charge context might have finished. */ 1379 if (mc.moving_task) 1380 schedule(); 1381 finish_wait(&mc.waitq, &wait); 1382 return true; 1383 } 1384 } 1385 return false; 1386 } 1387 1388 /* 1389 * Take this lock when 1390 * - a code tries to modify page's memcg while it's USED. 1391 * - a code tries to modify page state accounting in a memcg. 1392 * see mem_cgroup_stolen(), too. 
1393 */ 1394 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1395 unsigned long *flags) 1396 { 1397 spin_lock_irqsave(&memcg->move_lock, *flags); 1398 } 1399 1400 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1401 unsigned long *flags) 1402 { 1403 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1404 } 1405 1406 /** 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1408 * @memcg: The memory cgroup that went over limit 1409 * @p: Task that is going to be killed 1410 * 1411 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1412 * enabled 1413 */ 1414 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1415 { 1416 struct cgroup *task_cgrp; 1417 struct cgroup *mem_cgrp; 1418 /* 1419 * Need a buffer in BSS, can't rely on allocations. The code relies 1420 * on the assumption that OOM is serialized for memory controller. 1421 * If this assumption is broken, revisit this code. 1422 */ 1423 static char memcg_name[PATH_MAX]; 1424 int ret; 1425 1426 if (!memcg || !p) 1427 return; 1428 1429 rcu_read_lock(); 1430 1431 mem_cgrp = memcg->css.cgroup; 1432 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1433 1434 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1435 if (ret < 0) { 1436 /* 1437 * Unfortunately, we are unable to convert to a useful name 1438 * But we'll still print out the usage information 1439 */ 1440 rcu_read_unlock(); 1441 goto done; 1442 } 1443 rcu_read_unlock(); 1444 1445 printk(KERN_INFO "Task in %s killed", memcg_name); 1446 1447 rcu_read_lock(); 1448 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1449 if (ret < 0) { 1450 rcu_read_unlock(); 1451 goto done; 1452 } 1453 rcu_read_unlock(); 1454 1455 /* 1456 * Continues from above, so we don't need an KERN_ level 1457 */ 1458 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1459 done: 1460 1461 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1462 
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1463 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1464 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1465 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1466 "failcnt %llu\n", 1467 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1468 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1469 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1470 } 1471 1472 /* 1473 * This function returns the number of memcg under hierarchy tree. Returns 1474 * 1(self count) if no children. 1475 */ 1476 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1477 { 1478 int num = 0; 1479 struct mem_cgroup *iter; 1480 1481 for_each_mem_cgroup_tree(iter, memcg) 1482 num++; 1483 return num; 1484 } 1485 1486 /* 1487 * Return the memory (and swap, if configured) limit for a memcg. 1488 */ 1489 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1490 { 1491 u64 limit; 1492 u64 memsw; 1493 1494 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1495 limit += total_swap_pages << PAGE_SHIFT; 1496 1497 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1498 /* 1499 * If memsw is finite and limits the amount of swap space available 1500 * to this memcg, return that limit. 
1501 */ 1502 return min(limit, memsw); 1503 } 1504 1505 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1506 gfp_t gfp_mask, 1507 unsigned long flags) 1508 { 1509 unsigned long total = 0; 1510 bool noswap = false; 1511 int loop; 1512 1513 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1514 noswap = true; 1515 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1516 noswap = true; 1517 1518 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1519 if (loop) 1520 drain_all_stock_async(memcg); 1521 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1522 /* 1523 * Allow limit shrinkers, which are triggered directly 1524 * by userspace, to catch signals and stop reclaim 1525 * after minimal progress, regardless of the margin. 1526 */ 1527 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1528 break; 1529 if (mem_cgroup_margin(memcg)) 1530 break; 1531 /* 1532 * If nothing was reclaimed after two attempts, there 1533 * may be no reclaimable pages in this hierarchy. 1534 */ 1535 if (loop && !total) 1536 break; 1537 } 1538 return total; 1539 } 1540 1541 /** 1542 * test_mem_cgroup_node_reclaimable 1543 * @mem: the target memcg 1544 * @nid: the node ID to be checked. 1545 * @noswap : specify true here if the user wants flle only information. 1546 * 1547 * This function returns whether the specified memcg contains any 1548 * reclaimable pages on a node. Returns true if there are any reclaimable 1549 * pages in the node. 
1550 */ 1551 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1552 int nid, bool noswap) 1553 { 1554 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1555 return true; 1556 if (noswap || !total_swap_pages) 1557 return false; 1558 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1559 return true; 1560 return false; 1561 1562 } 1563 #if MAX_NUMNODES > 1 1564 1565 /* 1566 * Always updating the nodemask is not very good - even if we have an empty 1567 * list or the wrong list here, we can start from some node and traverse all 1568 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1569 * 1570 */ 1571 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1572 { 1573 int nid; 1574 /* 1575 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1576 * pagein/pageout changes since the last update. 1577 */ 1578 if (!atomic_read(&memcg->numainfo_events)) 1579 return; 1580 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1581 return; 1582 1583 /* make a nodemask where this memcg uses memory from */ 1584 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1585 1586 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1587 1588 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1589 node_clear(nid, memcg->scan_nodes); 1590 } 1591 1592 atomic_set(&memcg->numainfo_events, 0); 1593 atomic_set(&memcg->numainfo_updating, 0); 1594 } 1595 1596 /* 1597 * Selecting a node where we start reclaim from. Because what we need is just 1598 * reducing usage counter, start from anywhere is O,K. Considering 1599 * memory reclaim from current node, there are pros. and cons. 1600 * 1601 * Freeing memory from current node means freeing memory from a node which 1602 * we'll use or we've used. So, it may make LRU bad. And if several threads 1603 * hit limits, it will see a contention on a node. But freeing from remote 1604 * node means more costs for memory reclaim because of memory latency. 
1605 * 1606 * Now, we use round-robin. Better algorithm is welcomed. 1607 */ 1608 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1609 { 1610 int node; 1611 1612 mem_cgroup_may_update_nodemask(memcg); 1613 node = memcg->last_scanned_node; 1614 1615 node = next_node(node, memcg->scan_nodes); 1616 if (node == MAX_NUMNODES) 1617 node = first_node(memcg->scan_nodes); 1618 /* 1619 * We call this when we hit limit, not when pages are added to LRU. 1620 * No LRU may hold pages because all pages are UNEVICTABLE or 1621 * memcg is too small and all pages are not on LRU. In that case, 1622 * we use curret node. 1623 */ 1624 if (unlikely(node == MAX_NUMNODES)) 1625 node = numa_node_id(); 1626 1627 memcg->last_scanned_node = node; 1628 return node; 1629 } 1630 1631 /* 1632 * Check all nodes whether it contains reclaimable pages or not. 1633 * For quick scan, we make use of scan_nodes. This will allow us to skip 1634 * unused nodes. But scan_nodes is lazily updated and may not cotain 1635 * enough new information. We need to do double check. 1636 */ 1637 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1638 { 1639 int nid; 1640 1641 /* 1642 * quick check...making use of scan_node. 1643 * We can skip unused nodes. 1644 */ 1645 if (!nodes_empty(memcg->scan_nodes)) { 1646 for (nid = first_node(memcg->scan_nodes); 1647 nid < MAX_NUMNODES; 1648 nid = next_node(nid, memcg->scan_nodes)) { 1649 1650 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1651 return true; 1652 } 1653 } 1654 /* 1655 * Check rest of nodes. 
1656 */ 1657 for_each_node_state(nid, N_HIGH_MEMORY) { 1658 if (node_isset(nid, memcg->scan_nodes)) 1659 continue; 1660 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1661 return true; 1662 } 1663 return false; 1664 } 1665 1666 #else 1667 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1668 { 1669 return 0; 1670 } 1671 1672 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1673 { 1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1675 } 1676 #endif 1677 1678 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1679 struct zone *zone, 1680 gfp_t gfp_mask, 1681 unsigned long *total_scanned) 1682 { 1683 struct mem_cgroup *victim = NULL; 1684 int total = 0; 1685 int loop = 0; 1686 unsigned long excess; 1687 unsigned long nr_scanned; 1688 struct mem_cgroup_reclaim_cookie reclaim = { 1689 .zone = zone, 1690 .priority = 0, 1691 }; 1692 1693 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1694 1695 while (1) { 1696 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1697 if (!victim) { 1698 loop++; 1699 if (loop >= 2) { 1700 /* 1701 * If we have not been able to reclaim 1702 * anything, it might because there are 1703 * no reclaimable pages under this hierarchy 1704 */ 1705 if (!total) 1706 break; 1707 /* 1708 * We want to do more targeted reclaim. 
1709 * excess >> 2 is not to excessive so as to 1710 * reclaim too much, nor too less that we keep 1711 * coming back to reclaim from this cgroup 1712 */ 1713 if (total >= (excess >> 2) || 1714 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1715 break; 1716 } 1717 continue; 1718 } 1719 if (!mem_cgroup_reclaimable(victim, false)) 1720 continue; 1721 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1722 zone, &nr_scanned); 1723 *total_scanned += nr_scanned; 1724 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1725 break; 1726 } 1727 mem_cgroup_iter_break(root_memcg, victim); 1728 return total; 1729 } 1730 1731 /* 1732 * Check OOM-Killer is already running under our hierarchy. 1733 * If someone is running, return false. 1734 * Has to be called with memcg_oom_lock 1735 */ 1736 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1737 { 1738 struct mem_cgroup *iter, *failed = NULL; 1739 1740 for_each_mem_cgroup_tree(iter, memcg) { 1741 if (iter->oom_lock) { 1742 /* 1743 * this subtree of our hierarchy is already locked 1744 * so we cannot give a lock. 
1745 */ 1746 failed = iter; 1747 mem_cgroup_iter_break(memcg, iter); 1748 break; 1749 } else 1750 iter->oom_lock = true; 1751 } 1752 1753 if (!failed) 1754 return true; 1755 1756 /* 1757 * OK, we failed to lock the whole subtree so we have to clean up 1758 * what we set up to the failing subtree 1759 */ 1760 for_each_mem_cgroup_tree(iter, memcg) { 1761 if (iter == failed) { 1762 mem_cgroup_iter_break(memcg, iter); 1763 break; 1764 } 1765 iter->oom_lock = false; 1766 } 1767 return false; 1768 } 1769 1770 /* 1771 * Has to be called with memcg_oom_lock 1772 */ 1773 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1774 { 1775 struct mem_cgroup *iter; 1776 1777 for_each_mem_cgroup_tree(iter, memcg) 1778 iter->oom_lock = false; 1779 return 0; 1780 } 1781 1782 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1783 { 1784 struct mem_cgroup *iter; 1785 1786 for_each_mem_cgroup_tree(iter, memcg) 1787 atomic_inc(&iter->under_oom); 1788 } 1789 1790 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1791 { 1792 struct mem_cgroup *iter; 1793 1794 /* 1795 * When a new child is created while the hierarchy is under oom, 1796 * mem_cgroup_oom_lock() may not be called. We have to use 1797 * atomic_add_unless() here. 
1798 */ 1799 for_each_mem_cgroup_tree(iter, memcg) 1800 atomic_add_unless(&iter->under_oom, -1, 0); 1801 } 1802 1803 static DEFINE_SPINLOCK(memcg_oom_lock); 1804 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1805 1806 struct oom_wait_info { 1807 struct mem_cgroup *memcg; 1808 wait_queue_t wait; 1809 }; 1810 1811 static int memcg_oom_wake_function(wait_queue_t *wait, 1812 unsigned mode, int sync, void *arg) 1813 { 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1815 struct mem_cgroup *oom_wait_memcg; 1816 struct oom_wait_info *oom_wait_info; 1817 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1819 oom_wait_memcg = oom_wait_info->memcg; 1820 1821 /* 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1823 * Then we can use css_is_ancestor without taking care of RCU. 1824 */ 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1826 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 1827 return 0; 1828 return autoremove_wake_function(wait, mode, sync, arg); 1829 } 1830 1831 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 1832 { 1833 /* for filtering, pass "memcg" as argument. */ 1834 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1835 } 1836 1837 static void memcg_oom_recover(struct mem_cgroup *memcg) 1838 { 1839 if (memcg && atomic_read(&memcg->under_oom)) 1840 memcg_wakeup_oom(memcg); 1841 } 1842 1843 /* 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 
1845 */ 1846 bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1847 { 1848 struct oom_wait_info owait; 1849 bool locked, need_to_kill; 1850 1851 owait.memcg = memcg; 1852 owait.wait.flags = 0; 1853 owait.wait.func = memcg_oom_wake_function; 1854 owait.wait.private = current; 1855 INIT_LIST_HEAD(&owait.wait.task_list); 1856 need_to_kill = true; 1857 mem_cgroup_mark_under_oom(memcg); 1858 1859 /* At first, try to OOM lock hierarchy under memcg.*/ 1860 spin_lock(&memcg_oom_lock); 1861 locked = mem_cgroup_oom_lock(memcg); 1862 /* 1863 * Even if signal_pending(), we can't quit charge() loop without 1864 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1865 * under OOM is always welcomed, use TASK_KILLABLE here. 1866 */ 1867 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1868 if (!locked || memcg->oom_kill_disable) 1869 need_to_kill = false; 1870 if (locked) 1871 mem_cgroup_oom_notify(memcg); 1872 spin_unlock(&memcg_oom_lock); 1873 1874 if (need_to_kill) { 1875 finish_wait(&memcg_oom_waitq, &owait.wait); 1876 mem_cgroup_out_of_memory(memcg, mask, order); 1877 } else { 1878 schedule(); 1879 finish_wait(&memcg_oom_waitq, &owait.wait); 1880 } 1881 spin_lock(&memcg_oom_lock); 1882 if (locked) 1883 mem_cgroup_oom_unlock(memcg); 1884 memcg_wakeup_oom(memcg); 1885 spin_unlock(&memcg_oom_lock); 1886 1887 mem_cgroup_unmark_under_oom(memcg); 1888 1889 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1890 return false; 1891 /* Give chance to dying process */ 1892 schedule_timeout_uninterruptible(1); 1893 return true; 1894 } 1895 1896 /* 1897 * Currently used to update mapped file statistics, but the routine can be 1898 * generalized to update other statistics as well. 1899 * 1900 * Notes: Race condition 1901 * 1902 * We usually use page_cgroup_lock() for accessing page_cgroup member but 1903 * it tends to be costly. But considering some conditions, we doesn't need 1904 * to do so _always_. 
1905 * 1906 * Considering "charge", lock_page_cgroup() is not required because all 1907 * file-stat operations happen after a page is attached to radix-tree. There 1908 * are no race with "charge". 1909 * 1910 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1911 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1912 * if there are race with "uncharge". Statistics itself is properly handled 1913 * by flags. 1914 * 1915 * Considering "move", this is an only case we see a race. To make the race 1916 * small, we check mm->moving_account and detect there are possibility of race 1917 * If there is, we take a lock. 1918 */ 1919 1920 void __mem_cgroup_begin_update_page_stat(struct page *page, 1921 bool *locked, unsigned long *flags) 1922 { 1923 struct mem_cgroup *memcg; 1924 struct page_cgroup *pc; 1925 1926 pc = lookup_page_cgroup(page); 1927 again: 1928 memcg = pc->mem_cgroup; 1929 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1930 return; 1931 /* 1932 * If this memory cgroup is not under account moving, we don't 1933 * need to take move_lock_page_cgroup(). Because we already hold 1934 * rcu_read_lock(), any calls to move_account will be delayed until 1935 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1936 */ 1937 if (!mem_cgroup_stolen(memcg)) 1938 return; 1939 1940 move_lock_mem_cgroup(memcg, flags); 1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 1942 move_unlock_mem_cgroup(memcg, flags); 1943 goto again; 1944 } 1945 *locked = true; 1946 } 1947 1948 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 1949 { 1950 struct page_cgroup *pc = lookup_page_cgroup(page); 1951 1952 /* 1953 * It's guaranteed that pc->mem_cgroup never changes while 1954 * lock is held because a routine modifies pc->mem_cgroup 1955 * should take move_lock_page_cgroup(). 
1956 */ 1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1958 } 1959 1960 void mem_cgroup_update_page_stat(struct page *page, 1961 enum mem_cgroup_page_stat_item idx, int val) 1962 { 1963 struct mem_cgroup *memcg; 1964 struct page_cgroup *pc = lookup_page_cgroup(page); 1965 unsigned long uninitialized_var(flags); 1966 1967 if (mem_cgroup_disabled()) 1968 return; 1969 1970 memcg = pc->mem_cgroup; 1971 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1972 return; 1973 1974 switch (idx) { 1975 case MEMCG_NR_FILE_MAPPED: 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1977 break; 1978 default: 1979 BUG(); 1980 } 1981 1982 this_cpu_add(memcg->stat->count[idx], val); 1983 } 1984 1985 /* 1986 * size of first charge trial. "32" comes from vmscan.c's magic value. 1987 * TODO: maybe necessary to use big numbers in big irons. 1988 */ 1989 #define CHARGE_BATCH 32U 1990 struct memcg_stock_pcp { 1991 struct mem_cgroup *cached; /* this never be root cgroup */ 1992 unsigned int nr_pages; 1993 struct work_struct work; 1994 unsigned long flags; 1995 #define FLUSHING_CACHED_CHARGE (0) 1996 }; 1997 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1998 static DEFINE_MUTEX(percpu_charge_mutex); 1999 2000 /* 2001 * Try to consume stocked charge on this cpu. If success, one page is consumed 2002 * from local stock and true is returned. If the stock is 0 or charges from a 2003 * cgroup which is not current target, returns false. This stock will be 2004 * refilled. 2005 */ 2006 static bool consume_stock(struct mem_cgroup *memcg) 2007 { 2008 struct memcg_stock_pcp *stock; 2009 bool ret = true; 2010 2011 stock = &get_cpu_var(memcg_stock); 2012 if (memcg == stock->cached && stock->nr_pages) 2013 stock->nr_pages--; 2014 else /* need to call res_counter_charge */ 2015 ret = false; 2016 put_cpu_var(memcg_stock); 2017 return ret; 2018 } 2019 2020 /* 2021 * Returns stocks cached in percpu to res_counter and reset cached information. 
2022 */ 2023 static void drain_stock(struct memcg_stock_pcp *stock) 2024 { 2025 struct mem_cgroup *old = stock->cached; 2026 2027 if (stock->nr_pages) { 2028 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2029 2030 res_counter_uncharge(&old->res, bytes); 2031 if (do_swap_account) 2032 res_counter_uncharge(&old->memsw, bytes); 2033 stock->nr_pages = 0; 2034 } 2035 stock->cached = NULL; 2036 } 2037 2038 /* 2039 * This must be called under preempt disabled or must be called by 2040 * a thread which is pinned to local cpu. 2041 */ 2042 static void drain_local_stock(struct work_struct *dummy) 2043 { 2044 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2045 drain_stock(stock); 2046 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2047 } 2048 2049 /* 2050 * Cache charges(val) which is from res_counter, to local per_cpu area. 2051 * This will be consumed by consume_stock() function, later. 2052 */ 2053 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2054 { 2055 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2056 2057 if (stock->cached != memcg) { /* reset if necessary */ 2058 drain_stock(stock); 2059 stock->cached = memcg; 2060 } 2061 stock->nr_pages += nr_pages; 2062 put_cpu_var(memcg_stock); 2063 } 2064 2065 /* 2066 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2067 * of the hierarchy under it. sync flag says whether we should block 2068 * until the work is done. 
2069 */ 2070 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2071 { 2072 int cpu, curcpu; 2073 2074 /* Notify other cpus that system-wide "drain" is running */ 2075 get_online_cpus(); 2076 curcpu = get_cpu(); 2077 for_each_online_cpu(cpu) { 2078 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2079 struct mem_cgroup *memcg; 2080 2081 memcg = stock->cached; 2082 if (!memcg || !stock->nr_pages) 2083 continue; 2084 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2085 continue; 2086 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2087 if (cpu == curcpu) 2088 drain_local_stock(&stock->work); 2089 else 2090 schedule_work_on(cpu, &stock->work); 2091 } 2092 } 2093 put_cpu(); 2094 2095 if (!sync) 2096 goto out; 2097 2098 for_each_online_cpu(cpu) { 2099 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2100 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2101 flush_work(&stock->work); 2102 } 2103 out: 2104 put_online_cpus(); 2105 } 2106 2107 /* 2108 * Tries to drain stocked charges in other cpus. This function is asynchronous 2109 * and just put a work per cpu for draining localy on each cpu. Caller can 2110 * expects some charges will be back to res_counter later but cannot wait for 2111 * it. 2112 */ 2113 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2114 { 2115 /* 2116 * If someone calls draining, avoid adding more kworker runs. 2117 */ 2118 if (!mutex_trylock(&percpu_charge_mutex)) 2119 return; 2120 drain_all_stock(root_memcg, false); 2121 mutex_unlock(&percpu_charge_mutex); 2122 } 2123 2124 /* This is a synchronous drain interface. */ 2125 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2126 { 2127 /* called when force_empty is called */ 2128 mutex_lock(&percpu_charge_mutex); 2129 drain_all_stock(root_memcg, true); 2130 mutex_unlock(&percpu_charge_mutex); 2131 } 2132 2133 /* 2134 * This function drains percpu counter value from DEAD cpu and 2135 * move it to local cpu. 
Note that this function can be preempted. 2136 */ 2137 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2138 { 2139 int i; 2140 2141 spin_lock(&memcg->pcp_counter_lock); 2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2143 long x = per_cpu(memcg->stat->count[i], cpu); 2144 2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2146 memcg->nocpu_base.count[i] += x; 2147 } 2148 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2149 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2150 2151 per_cpu(memcg->stat->events[i], cpu) = 0; 2152 memcg->nocpu_base.events[i] += x; 2153 } 2154 spin_unlock(&memcg->pcp_counter_lock); 2155 } 2156 2157 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2158 unsigned long action, 2159 void *hcpu) 2160 { 2161 int cpu = (unsigned long)hcpu; 2162 struct memcg_stock_pcp *stock; 2163 struct mem_cgroup *iter; 2164 2165 if (action == CPU_ONLINE) 2166 return NOTIFY_OK; 2167 2168 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2169 return NOTIFY_OK; 2170 2171 for_each_mem_cgroup(iter) 2172 mem_cgroup_drain_pcp_counter(iter, cpu); 2173 2174 stock = &per_cpu(memcg_stock, cpu); 2175 drain_stock(stock); 2176 return NOTIFY_OK; 2177 } 2178 2179 2180 /* See __mem_cgroup_try_charge() for details */ 2181 enum { 2182 CHARGE_OK, /* success */ 2183 CHARGE_RETRY, /* need to retry but retry is not bad */ 2184 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2185 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. 
*/ 2186 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2187 }; 2188 2189 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2190 unsigned int nr_pages, bool oom_check) 2191 { 2192 unsigned long csize = nr_pages * PAGE_SIZE; 2193 struct mem_cgroup *mem_over_limit; 2194 struct res_counter *fail_res; 2195 unsigned long flags = 0; 2196 int ret; 2197 2198 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2199 2200 if (likely(!ret)) { 2201 if (!do_swap_account) 2202 return CHARGE_OK; 2203 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2204 if (likely(!ret)) 2205 return CHARGE_OK; 2206 2207 res_counter_uncharge(&memcg->res, csize); 2208 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2209 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2210 } else 2211 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2212 /* 2213 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2214 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2215 * 2216 * Never reclaim on behalf of optional batching, retry with a 2217 * single page instead. 2218 */ 2219 if (nr_pages == CHARGE_BATCH) 2220 return CHARGE_RETRY; 2221 2222 if (!(gfp_mask & __GFP_WAIT)) 2223 return CHARGE_WOULDBLOCK; 2224 2225 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2226 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2227 return CHARGE_RETRY; 2228 /* 2229 * Even though the limit is exceeded at this point, reclaim 2230 * may have been able to free some pages. Retry the charge 2231 * before killing the task. 2232 * 2233 * Only for regular pages, though: huge pages are rather 2234 * unlikely to succeed so close to the limit, and we fall back 2235 * to regular pages anyway in case of failure. 2236 */ 2237 if (nr_pages == 1 && ret) 2238 return CHARGE_RETRY; 2239 2240 /* 2241 * At task move, charge accounts can be doubly counted. So, it's 2242 * better to wait until the end of task_move if something is going on. 
2243 */ 2244 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2245 return CHARGE_RETRY; 2246 2247 /* If we don't need to call oom-killer at el, return immediately */ 2248 if (!oom_check) 2249 return CHARGE_NOMEM; 2250 /* check OOM */ 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2252 return CHARGE_OOM_DIE; 2253 2254 return CHARGE_RETRY; 2255 } 2256 2257 /* 2258 * __mem_cgroup_try_charge() does 2259 * 1. detect memcg to be charged against from passed *mm and *ptr, 2260 * 2. update res_counter 2261 * 3. call memory reclaim if necessary. 2262 * 2263 * In some special case, if the task is fatal, fatal_signal_pending() or 2264 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup 2265 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon 2266 * as possible without any hazards. 2: all pages should have a valid 2267 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2268 * pointer, that is treated as a charge to root_mem_cgroup. 2269 * 2270 * So __mem_cgroup_try_charge() will return 2271 * 0 ... on success, filling *ptr with a valid memcg pointer. 2272 * -ENOMEM ... charge failure because of resource limits. 2273 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. 2274 * 2275 * Unlike the exported interface, an "oom" parameter is added. if oom==true, 2276 * the oom-killer can be invoked. 2277 */ 2278 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2279 gfp_t gfp_mask, 2280 unsigned int nr_pages, 2281 struct mem_cgroup **ptr, 2282 bool oom) 2283 { 2284 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2285 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2286 struct mem_cgroup *memcg = NULL; 2287 int ret; 2288 2289 /* 2290 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2291 * in system level. So, allow to go ahead dying process in addition to 2292 * MEMDIE process. 
2293 */ 2294 if (unlikely(test_thread_flag(TIF_MEMDIE) 2295 || fatal_signal_pending(current))) 2296 goto bypass; 2297 2298 /* 2299 * We always charge the cgroup the mm_struct belongs to. 2300 * The mm_struct's mem_cgroup changes on task migration if the 2301 * thread group leader migrates. It's possible that mm is not 2302 * set, if so charge the init_mm (happens for pagecache usage). 2303 */ 2304 if (!*ptr && !mm) 2305 *ptr = root_mem_cgroup; 2306 again: 2307 if (*ptr) { /* css should be a valid one */ 2308 memcg = *ptr; 2309 VM_BUG_ON(css_is_removed(&memcg->css)); 2310 if (mem_cgroup_is_root(memcg)) 2311 goto done; 2312 if (nr_pages == 1 && consume_stock(memcg)) 2313 goto done; 2314 css_get(&memcg->css); 2315 } else { 2316 struct task_struct *p; 2317 2318 rcu_read_lock(); 2319 p = rcu_dereference(mm->owner); 2320 /* 2321 * Because we don't have task_lock(), "p" can exit. 2322 * In that case, "memcg" can point to root or p can be NULL with 2323 * race with swapoff. Then, we have small risk of mis-accouning. 2324 * But such kind of mis-account by race always happens because 2325 * we don't have cgroup_mutex(). It's overkill and we allo that 2326 * small race, here. 2327 * (*) swapoff at el will charge against mm-struct not against 2328 * task-struct. So, mm->owner can be NULL. 2329 */ 2330 memcg = mem_cgroup_from_task(p); 2331 if (!memcg) 2332 memcg = root_mem_cgroup; 2333 if (mem_cgroup_is_root(memcg)) { 2334 rcu_read_unlock(); 2335 goto done; 2336 } 2337 if (nr_pages == 1 && consume_stock(memcg)) { 2338 /* 2339 * It seems dagerous to access memcg without css_get(). 2340 * But considering how consume_stok works, it's not 2341 * necessary. If consume_stock success, some charges 2342 * from this memcg are cached on this cpu. So, we 2343 * don't need to call css_get()/css_tryget() before 2344 * calling consume_stock(). 2345 */ 2346 rcu_read_unlock(); 2347 goto done; 2348 } 2349 /* after here, we may be blocked. 
we need to get refcnt */ 2350 if (!css_tryget(&memcg->css)) { 2351 rcu_read_unlock(); 2352 goto again; 2353 } 2354 rcu_read_unlock(); 2355 } 2356 2357 do { 2358 bool oom_check; 2359 2360 /* If killed, bypass charge */ 2361 if (fatal_signal_pending(current)) { 2362 css_put(&memcg->css); 2363 goto bypass; 2364 } 2365 2366 oom_check = false; 2367 if (oom && !nr_oom_retries) { 2368 oom_check = true; 2369 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2370 } 2371 2372 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2373 switch (ret) { 2374 case CHARGE_OK: 2375 break; 2376 case CHARGE_RETRY: /* not in OOM situation but retry */ 2377 batch = nr_pages; 2378 css_put(&memcg->css); 2379 memcg = NULL; 2380 goto again; 2381 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2382 css_put(&memcg->css); 2383 goto nomem; 2384 case CHARGE_NOMEM: /* OOM routine works */ 2385 if (!oom) { 2386 css_put(&memcg->css); 2387 goto nomem; 2388 } 2389 /* If oom, we never return -ENOMEM */ 2390 nr_oom_retries--; 2391 break; 2392 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2393 css_put(&memcg->css); 2394 goto bypass; 2395 } 2396 } while (ret != CHARGE_OK); 2397 2398 if (batch > nr_pages) 2399 refill_stock(memcg, batch - nr_pages); 2400 css_put(&memcg->css); 2401 done: 2402 *ptr = memcg; 2403 return 0; 2404 nomem: 2405 *ptr = NULL; 2406 return -ENOMEM; 2407 bypass: 2408 *ptr = root_mem_cgroup; 2409 return -EINTR; 2410 } 2411 2412 /* 2413 * Somemtimes we have to undo a charge we got by try_charge(). 2414 * This function is for that and do uncharge, put css's refcnt. 2415 * gotten by try_charge(). 
2416 */ 2417 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2418 unsigned int nr_pages) 2419 { 2420 if (!mem_cgroup_is_root(memcg)) { 2421 unsigned long bytes = nr_pages * PAGE_SIZE; 2422 2423 res_counter_uncharge(&memcg->res, bytes); 2424 if (do_swap_account) 2425 res_counter_uncharge(&memcg->memsw, bytes); 2426 } 2427 } 2428 2429 /* 2430 * A helper function to get mem_cgroup from ID. must be called under 2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2432 * it's concern. (dropping refcnt from swap can be called against removed 2433 * memcg.) 2434 */ 2435 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2436 { 2437 struct cgroup_subsys_state *css; 2438 2439 /* ID 0 is unused ID */ 2440 if (!id) 2441 return NULL; 2442 css = css_lookup(&mem_cgroup_subsys, id); 2443 if (!css) 2444 return NULL; 2445 return container_of(css, struct mem_cgroup, css); 2446 } 2447 2448 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2449 { 2450 struct mem_cgroup *memcg = NULL; 2451 struct page_cgroup *pc; 2452 unsigned short id; 2453 swp_entry_t ent; 2454 2455 VM_BUG_ON(!PageLocked(page)); 2456 2457 pc = lookup_page_cgroup(page); 2458 lock_page_cgroup(pc); 2459 if (PageCgroupUsed(pc)) { 2460 memcg = pc->mem_cgroup; 2461 if (memcg && !css_tryget(&memcg->css)) 2462 memcg = NULL; 2463 } else if (PageSwapCache(page)) { 2464 ent.val = page_private(page); 2465 id = lookup_swap_cgroup_id(ent); 2466 rcu_read_lock(); 2467 memcg = mem_cgroup_lookup(id); 2468 if (memcg && !css_tryget(&memcg->css)) 2469 memcg = NULL; 2470 rcu_read_unlock(); 2471 } 2472 unlock_page_cgroup(pc); 2473 return memcg; 2474 } 2475 2476 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2477 struct page *page, 2478 unsigned int nr_pages, 2479 enum charge_type ctype, 2480 bool lrucare) 2481 { 2482 struct page_cgroup *pc = lookup_page_cgroup(page); 2483 struct zone *uninitialized_var(zone); 2484 bool was_on_lru = false; 2485 bool anon; 
2486 2487 lock_page_cgroup(pc); 2488 if (unlikely(PageCgroupUsed(pc))) { 2489 unlock_page_cgroup(pc); 2490 __mem_cgroup_cancel_charge(memcg, nr_pages); 2491 return; 2492 } 2493 /* 2494 * we don't need page_cgroup_lock about tail pages, becase they are not 2495 * accessed by any other context at this point. 2496 */ 2497 2498 /* 2499 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2500 * may already be on some other mem_cgroup's LRU. Take care of it. 2501 */ 2502 if (lrucare) { 2503 zone = page_zone(page); 2504 spin_lock_irq(&zone->lru_lock); 2505 if (PageLRU(page)) { 2506 ClearPageLRU(page); 2507 del_page_from_lru_list(zone, page, page_lru(page)); 2508 was_on_lru = true; 2509 } 2510 } 2511 2512 pc->mem_cgroup = memcg; 2513 /* 2514 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2515 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2516 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2517 * before USED bit, we need memory barrier here. 2518 * See mem_cgroup_add_lru_list(), etc. 2519 */ 2520 smp_wmb(); 2521 SetPageCgroupUsed(pc); 2522 2523 if (lrucare) { 2524 if (was_on_lru) { 2525 VM_BUG_ON(PageLRU(page)); 2526 SetPageLRU(page); 2527 add_page_to_lru_list(zone, page, page_lru(page)); 2528 } 2529 spin_unlock_irq(&zone->lru_lock); 2530 } 2531 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2533 anon = true; 2534 else 2535 anon = false; 2536 2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages); 2538 unlock_page_cgroup(pc); 2539 2540 /* 2541 * "charge_statistics" updated event counter. Then, check it. 2542 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2543 * if they exceeds softlimit. 2544 */ 2545 memcg_check_events(memcg, page); 2546 } 2547 2548 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2549 2550 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2551 /* 2552 * Because tail pages are not marked as "used", set it. 
We're under 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2554 * charge/uncharge will be never happen and move_account() is done under 2555 * compound_lock(), so we don't have to take care of races. 2556 */ 2557 void mem_cgroup_split_huge_fixup(struct page *head) 2558 { 2559 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2560 struct page_cgroup *pc; 2561 int i; 2562 2563 if (mem_cgroup_disabled()) 2564 return; 2565 for (i = 1; i < HPAGE_PMD_NR; i++) { 2566 pc = head_pc + i; 2567 pc->mem_cgroup = head_pc->mem_cgroup; 2568 smp_wmb();/* see __commit_charge() */ 2569 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2570 } 2571 } 2572 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2573 2574 /** 2575 * mem_cgroup_move_account - move account of the page 2576 * @page: the page 2577 * @nr_pages: number of regular pages (>1 for huge pages) 2578 * @pc: page_cgroup of the page. 2579 * @from: mem_cgroup which the page is moved from. 2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2581 * @uncharge: whether we should call uncharge and css_put against @from. 2582 * 2583 * The caller must confirm following. 2584 * - page is not on LRU (isolate_page() is useful.) 2585 * - compound_lock is held when nr_pages > 1 2586 * 2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2589 * true, this function does "uncharge" from old cgroup, but it doesn't if 2590 * @uncharge is false, so a caller should do "uncharge". 2591 */ 2592 static int mem_cgroup_move_account(struct page *page, 2593 unsigned int nr_pages, 2594 struct page_cgroup *pc, 2595 struct mem_cgroup *from, 2596 struct mem_cgroup *to, 2597 bool uncharge) 2598 { 2599 unsigned long flags; 2600 int ret; 2601 bool anon = PageAnon(page); 2602 2603 VM_BUG_ON(from == to); 2604 VM_BUG_ON(PageLRU(page)); 2605 /* 2606 * The page is isolated from LRU. 
So, collapse function 2607 * will not handle this page. But page splitting can happen. 2608 * Do this check under compound_page_lock(). The caller should 2609 * hold it. 2610 */ 2611 ret = -EBUSY; 2612 if (nr_pages > 1 && !PageTransHuge(page)) 2613 goto out; 2614 2615 lock_page_cgroup(pc); 2616 2617 ret = -EINVAL; 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2619 goto unlock; 2620 2621 move_lock_mem_cgroup(from, &flags); 2622 2623 if (!anon && page_mapped(page)) { 2624 /* Update mapped_file data for mem_cgroup */ 2625 preempt_disable(); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2628 preempt_enable(); 2629 } 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2631 if (uncharge) 2632 /* This is not "cancel", but cancel_charge does all we need. */ 2633 __mem_cgroup_cancel_charge(from, nr_pages); 2634 2635 /* caller should have done css_get */ 2636 pc->mem_cgroup = to; 2637 mem_cgroup_charge_statistics(to, anon, nr_pages); 2638 /* 2639 * We charges against "to" which may not have any tasks. Then, "to" 2640 * can be under rmdir(). But in current implementation, caller of 2641 * this function is just force_empty() and move charge, so it's 2642 * guaranteed that "to" is never removed. So, we don't check rmdir 2643 * status here. 2644 */ 2645 move_unlock_mem_cgroup(from, &flags); 2646 ret = 0; 2647 unlock: 2648 unlock_page_cgroup(pc); 2649 /* 2650 * check events 2651 */ 2652 memcg_check_events(to, page); 2653 memcg_check_events(from, page); 2654 out: 2655 return ret; 2656 } 2657 2658 /* 2659 * move charges to its parent. 
2660 */ 2661 2662 static int mem_cgroup_move_parent(struct page *page, 2663 struct page_cgroup *pc, 2664 struct mem_cgroup *child, 2665 gfp_t gfp_mask) 2666 { 2667 struct cgroup *cg = child->css.cgroup; 2668 struct cgroup *pcg = cg->parent; 2669 struct mem_cgroup *parent; 2670 unsigned int nr_pages; 2671 unsigned long uninitialized_var(flags); 2672 int ret; 2673 2674 /* Is ROOT ? */ 2675 if (!pcg) 2676 return -EINVAL; 2677 2678 ret = -EBUSY; 2679 if (!get_page_unless_zero(page)) 2680 goto out; 2681 if (isolate_lru_page(page)) 2682 goto put; 2683 2684 nr_pages = hpage_nr_pages(page); 2685 2686 parent = mem_cgroup_from_cont(pcg); 2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2688 if (ret) 2689 goto put_back; 2690 2691 if (nr_pages > 1) 2692 flags = compound_lock_irqsave(page); 2693 2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2695 if (ret) 2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2697 2698 if (nr_pages > 1) 2699 compound_unlock_irqrestore(page, flags); 2700 put_back: 2701 putback_lru_page(page); 2702 put: 2703 put_page(page); 2704 out: 2705 return ret; 2706 } 2707 2708 /* 2709 * Charge the memory controller for page usage. 2710 * Return 2711 * 0 if the charge was successful 2712 * < 0 if the cgroup is over its limit 2713 */ 2714 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2715 gfp_t gfp_mask, enum charge_type ctype) 2716 { 2717 struct mem_cgroup *memcg = NULL; 2718 unsigned int nr_pages = 1; 2719 bool oom = true; 2720 int ret; 2721 2722 if (PageTransHuge(page)) { 2723 nr_pages <<= compound_order(page); 2724 VM_BUG_ON(!PageTransHuge(page)); 2725 /* 2726 * Never OOM-kill a process for a huge page. The 2727 * fault handler will fall back to regular pages. 
2728 */ 2729 oom = false; 2730 } 2731 2732 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2733 if (ret == -ENOMEM) 2734 return ret; 2735 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 2736 return 0; 2737 } 2738 2739 int mem_cgroup_newpage_charge(struct page *page, 2740 struct mm_struct *mm, gfp_t gfp_mask) 2741 { 2742 if (mem_cgroup_disabled()) 2743 return 0; 2744 VM_BUG_ON(page_mapped(page)); 2745 VM_BUG_ON(page->mapping && !PageAnon(page)); 2746 VM_BUG_ON(!mm); 2747 return mem_cgroup_charge_common(page, mm, gfp_mask, 2748 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2749 } 2750 2751 static void 2752 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2753 enum charge_type ctype); 2754 2755 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2756 gfp_t gfp_mask) 2757 { 2758 struct mem_cgroup *memcg = NULL; 2759 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 2760 int ret; 2761 2762 if (mem_cgroup_disabled()) 2763 return 0; 2764 if (PageCompound(page)) 2765 return 0; 2766 2767 if (unlikely(!mm)) 2768 mm = &init_mm; 2769 if (!page_is_file_cache(page)) 2770 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2771 2772 if (!PageSwapCache(page)) 2773 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 2774 else { /* page is swapcache/shmem */ 2775 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2776 if (!ret) 2777 __mem_cgroup_commit_charge_swapin(page, memcg, type); 2778 } 2779 return ret; 2780 } 2781 2782 /* 2783 * While swap-in, try_charge -> commit or cancel, the page is locked. 2784 * And when try_charge() successfully returns, one refcnt to memcg without 2785 * struct page_cgroup is acquired. 
This refcnt will be consumed by 2786 * "commit()" or removed by "cancel()" 2787 */ 2788 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2789 struct page *page, 2790 gfp_t mask, struct mem_cgroup **memcgp) 2791 { 2792 struct mem_cgroup *memcg; 2793 int ret; 2794 2795 *memcgp = NULL; 2796 2797 if (mem_cgroup_disabled()) 2798 return 0; 2799 2800 if (!do_swap_account) 2801 goto charge_cur_mm; 2802 /* 2803 * A racing thread's fault, or swapoff, may have already updated 2804 * the pte, and even removed page from swap cache: in those cases 2805 * do_swap_page()'s pte_same() test will fail; but there's also a 2806 * KSM case which does need to charge the page. 2807 */ 2808 if (!PageSwapCache(page)) 2809 goto charge_cur_mm; 2810 memcg = try_get_mem_cgroup_from_page(page); 2811 if (!memcg) 2812 goto charge_cur_mm; 2813 *memcgp = memcg; 2814 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 2815 css_put(&memcg->css); 2816 if (ret == -EINTR) 2817 ret = 0; 2818 return ret; 2819 charge_cur_mm: 2820 if (unlikely(!mm)) 2821 mm = &init_mm; 2822 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2823 if (ret == -EINTR) 2824 ret = 0; 2825 return ret; 2826 } 2827 2828 static void 2829 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2830 enum charge_type ctype) 2831 { 2832 if (mem_cgroup_disabled()) 2833 return; 2834 if (!memcg) 2835 return; 2836 cgroup_exclude_rmdir(&memcg->css); 2837 2838 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 2839 /* 2840 * Now swap is on-memory. This means this page may be 2841 * counted both as mem and swap....double count. 2842 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2843 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2844 * may call delete_from_swap_cache() before reach here. 
2845 */ 2846 if (do_swap_account && PageSwapCache(page)) { 2847 swp_entry_t ent = {.val = page_private(page)}; 2848 struct mem_cgroup *swap_memcg; 2849 unsigned short id; 2850 2851 id = swap_cgroup_record(ent, 0); 2852 rcu_read_lock(); 2853 swap_memcg = mem_cgroup_lookup(id); 2854 if (swap_memcg) { 2855 /* 2856 * This recorded memcg can be obsolete one. So, avoid 2857 * calling css_tryget 2858 */ 2859 if (!mem_cgroup_is_root(swap_memcg)) 2860 res_counter_uncharge(&swap_memcg->memsw, 2861 PAGE_SIZE); 2862 mem_cgroup_swap_statistics(swap_memcg, false); 2863 mem_cgroup_put(swap_memcg); 2864 } 2865 rcu_read_unlock(); 2866 } 2867 /* 2868 * At swapin, we may charge account against cgroup which has no tasks. 2869 * So, rmdir()->pre_destroy() can be called while we do this charge. 2870 * In that case, we need to call pre_destroy() again. check it here. 2871 */ 2872 cgroup_release_and_wakeup_rmdir(&memcg->css); 2873 } 2874 2875 void mem_cgroup_commit_charge_swapin(struct page *page, 2876 struct mem_cgroup *memcg) 2877 { 2878 __mem_cgroup_commit_charge_swapin(page, memcg, 2879 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2880 } 2881 2882 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2883 { 2884 if (mem_cgroup_disabled()) 2885 return; 2886 if (!memcg) 2887 return; 2888 __mem_cgroup_cancel_charge(memcg, 1); 2889 } 2890 2891 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2892 unsigned int nr_pages, 2893 const enum charge_type ctype) 2894 { 2895 struct memcg_batch_info *batch = NULL; 2896 bool uncharge_memsw = true; 2897 2898 /* If swapout, usage of swap doesn't decrease */ 2899 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2900 uncharge_memsw = false; 2901 2902 batch = ¤t->memcg_batch; 2903 /* 2904 * In usual, we do css_get() when we remember memcg pointer. 2905 * But in this case, we keep res->usage until end of a series of 2906 * uncharges. Then, it's ok to ignore memcg's refcnt. 
2907 */ 2908 if (!batch->memcg) 2909 batch->memcg = memcg; 2910 /* 2911 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2912 * In those cases, all pages freed continuously can be expected to be in 2913 * the same cgroup and we have chance to coalesce uncharges. 2914 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2915 * because we want to do uncharge as soon as possible. 2916 */ 2917 2918 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2919 goto direct_uncharge; 2920 2921 if (nr_pages > 1) 2922 goto direct_uncharge; 2923 2924 /* 2925 * In typical case, batch->memcg == mem. This means we can 2926 * merge a series of uncharges to an uncharge of res_counter. 2927 * If not, we uncharge res_counter ony by one. 2928 */ 2929 if (batch->memcg != memcg) 2930 goto direct_uncharge; 2931 /* remember freed charge and uncharge it later */ 2932 batch->nr_pages++; 2933 if (uncharge_memsw) 2934 batch->memsw_nr_pages++; 2935 return; 2936 direct_uncharge: 2937 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 2938 if (uncharge_memsw) 2939 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2940 if (unlikely(batch->memcg != memcg)) 2941 memcg_oom_recover(memcg); 2942 } 2943 2944 /* 2945 * uncharge if !page_mapped(page) 2946 */ 2947 static struct mem_cgroup * 2948 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2949 { 2950 struct mem_cgroup *memcg = NULL; 2951 unsigned int nr_pages = 1; 2952 struct page_cgroup *pc; 2953 bool anon; 2954 2955 if (mem_cgroup_disabled()) 2956 return NULL; 2957 2958 if (PageSwapCache(page)) 2959 return NULL; 2960 2961 if (PageTransHuge(page)) { 2962 nr_pages <<= compound_order(page); 2963 VM_BUG_ON(!PageTransHuge(page)); 2964 } 2965 /* 2966 * Check if our page_cgroup is valid 2967 */ 2968 pc = lookup_page_cgroup(page); 2969 if (unlikely(!PageCgroupUsed(pc))) 2970 return NULL; 2971 2972 lock_page_cgroup(pc); 2973 2974 memcg = pc->mem_cgroup; 2975 2976 if (!PageCgroupUsed(pc)) 
2977 goto unlock_out; 2978 2979 anon = PageAnon(page); 2980 2981 switch (ctype) { 2982 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2983 /* 2984 * Generally PageAnon tells if it's the anon statistics to be 2985 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 2986 * used before page reached the stage of being marked PageAnon. 2987 */ 2988 anon = true; 2989 /* fallthrough */ 2990 case MEM_CGROUP_CHARGE_TYPE_DROP: 2991 /* See mem_cgroup_prepare_migration() */ 2992 if (page_mapped(page) || PageCgroupMigration(pc)) 2993 goto unlock_out; 2994 break; 2995 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2996 if (!PageAnon(page)) { /* Shared memory */ 2997 if (page->mapping && !page_is_file_cache(page)) 2998 goto unlock_out; 2999 } else if (page_mapped(page)) /* Anon */ 3000 goto unlock_out; 3001 break; 3002 default: 3003 break; 3004 } 3005 3006 mem_cgroup_charge_statistics(memcg, anon, -nr_pages); 3007 3008 ClearPageCgroupUsed(pc); 3009 /* 3010 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3011 * freed from LRU. This is safe because uncharged page is expected not 3012 * to be reused (freed soon). Exception is SwapCache, it's handled by 3013 * special functions. 3014 */ 3015 3016 unlock_page_cgroup(pc); 3017 /* 3018 * even after unlock, we have memcg->res.usage here and this memcg 3019 * will never be freed. 3020 */ 3021 memcg_check_events(memcg, page); 3022 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3023 mem_cgroup_swap_statistics(memcg, true); 3024 mem_cgroup_get(memcg); 3025 } 3026 if (!mem_cgroup_is_root(memcg)) 3027 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3028 3029 return memcg; 3030 3031 unlock_out: 3032 unlock_page_cgroup(pc); 3033 return NULL; 3034 } 3035 3036 void mem_cgroup_uncharge_page(struct page *page) 3037 { 3038 /* early check. 
*/ 3039 if (page_mapped(page)) 3040 return; 3041 VM_BUG_ON(page->mapping && !PageAnon(page)); 3042 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3043 } 3044 3045 void mem_cgroup_uncharge_cache_page(struct page *page) 3046 { 3047 VM_BUG_ON(page_mapped(page)); 3048 VM_BUG_ON(page->mapping); 3049 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3050 } 3051 3052 /* 3053 * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. 3054 * In that cases, pages are freed continuously and we can expect pages 3055 * are in the same memcg. All these calls itself limits the number of 3056 * pages freed at once, then uncharge_start/end() is called properly. 3057 * This may be called prural(2) times in a context, 3058 */ 3059 3060 void mem_cgroup_uncharge_start(void) 3061 { 3062 current->memcg_batch.do_batch++; 3063 /* We can do nest. */ 3064 if (current->memcg_batch.do_batch == 1) { 3065 current->memcg_batch.memcg = NULL; 3066 current->memcg_batch.nr_pages = 0; 3067 current->memcg_batch.memsw_nr_pages = 0; 3068 } 3069 } 3070 3071 void mem_cgroup_uncharge_end(void) 3072 { 3073 struct memcg_batch_info *batch = ¤t->memcg_batch; 3074 3075 if (!batch->do_batch) 3076 return; 3077 3078 batch->do_batch--; 3079 if (batch->do_batch) /* If stacked, do nothing. */ 3080 return; 3081 3082 if (!batch->memcg) 3083 return; 3084 /* 3085 * This "batch->memcg" is valid without any css_get/put etc... 3086 * bacause we hide charges behind us. 3087 */ 3088 if (batch->nr_pages) 3089 res_counter_uncharge(&batch->memcg->res, 3090 batch->nr_pages * PAGE_SIZE); 3091 if (batch->memsw_nr_pages) 3092 res_counter_uncharge(&batch->memcg->memsw, 3093 batch->memsw_nr_pages * PAGE_SIZE); 3094 memcg_oom_recover(batch->memcg); 3095 /* forget this pointer (for sanity check) */ 3096 batch->memcg = NULL; 3097 } 3098 3099 #ifdef CONFIG_SWAP 3100 /* 3101 * called after __delete_from_swap_cache() and drop "page" account. 
3102 * memcg information is recorded to swap_cgroup of "ent" 3103 */ 3104 void 3105 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3106 { 3107 struct mem_cgroup *memcg; 3108 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3109 3110 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3111 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3112 3113 memcg = __mem_cgroup_uncharge_common(page, ctype); 3114 3115 /* 3116 * record memcg information, if swapout && memcg != NULL, 3117 * mem_cgroup_get() was called in uncharge(). 3118 */ 3119 if (do_swap_account && swapout && memcg) 3120 swap_cgroup_record(ent, css_id(&memcg->css)); 3121 } 3122 #endif 3123 3124 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3125 /* 3126 * called from swap_entry_free(). remove record in swap_cgroup and 3127 * uncharge "memsw" account. 3128 */ 3129 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3130 { 3131 struct mem_cgroup *memcg; 3132 unsigned short id; 3133 3134 if (!do_swap_account) 3135 return; 3136 3137 id = swap_cgroup_record(ent, 0); 3138 rcu_read_lock(); 3139 memcg = mem_cgroup_lookup(id); 3140 if (memcg) { 3141 /* 3142 * We uncharge this because swap is freed. 3143 * This memcg can be obsolete one. We avoid calling css_tryget 3144 */ 3145 if (!mem_cgroup_is_root(memcg)) 3146 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3147 mem_cgroup_swap_statistics(memcg, false); 3148 mem_cgroup_put(memcg); 3149 } 3150 rcu_read_unlock(); 3151 } 3152 3153 /** 3154 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3155 * @entry: swap entry to be moved 3156 * @from: mem_cgroup which the entry is moved from 3157 * @to: mem_cgroup which the entry is moved to 3158 * @need_fixup: whether we should fixup res_counters and refcounts. 3159 * 3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3161 * as the mem_cgroup's id of @from. 3162 * 3163 * Returns 0 on success, -EINVAL on failure. 
3164 * 3165 * The caller must have charged to @to, IOW, called res_counter_charge() about 3166 * both res and memsw, and called css_get(). 3167 */ 3168 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3170 { 3171 unsigned short old_id, new_id; 3172 3173 old_id = css_id(&from->css); 3174 new_id = css_id(&to->css); 3175 3176 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3177 mem_cgroup_swap_statistics(from, false); 3178 mem_cgroup_swap_statistics(to, true); 3179 /* 3180 * This function is only called from task migration context now. 3181 * It postpones res_counter and refcount handling till the end 3182 * of task migration(mem_cgroup_clear_mc()) for performance 3183 * improvement. But we cannot postpone mem_cgroup_get(to) 3184 * because if the process that has been moved to @to does 3185 * swap-in, the refcount of @to might be decreased to 0. 3186 */ 3187 mem_cgroup_get(to); 3188 if (need_fixup) { 3189 if (!mem_cgroup_is_root(from)) 3190 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3191 mem_cgroup_put(from); 3192 /* 3193 * we charged both to->res and to->memsw, so we should 3194 * uncharge to->res. 3195 */ 3196 if (!mem_cgroup_is_root(to)) 3197 res_counter_uncharge(&to->res, PAGE_SIZE); 3198 } 3199 return 0; 3200 } 3201 return -EINVAL; 3202 } 3203 #else 3204 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3206 { 3207 return -EINVAL; 3208 } 3209 #endif 3210 3211 /* 3212 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3213 * page belongs to. 
3214 */ 3215 int mem_cgroup_prepare_migration(struct page *page, 3216 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3217 { 3218 struct mem_cgroup *memcg = NULL; 3219 struct page_cgroup *pc; 3220 enum charge_type ctype; 3221 int ret = 0; 3222 3223 *memcgp = NULL; 3224 3225 VM_BUG_ON(PageTransHuge(page)); 3226 if (mem_cgroup_disabled()) 3227 return 0; 3228 3229 pc = lookup_page_cgroup(page); 3230 lock_page_cgroup(pc); 3231 if (PageCgroupUsed(pc)) { 3232 memcg = pc->mem_cgroup; 3233 css_get(&memcg->css); 3234 /* 3235 * At migrating an anonymous page, its mapcount goes down 3236 * to 0 and uncharge() will be called. But, even if it's fully 3237 * unmapped, migration may fail and this page has to be 3238 * charged again. We set MIGRATION flag here and delay uncharge 3239 * until end_migration() is called 3240 * 3241 * Corner Case Thinking 3242 * A) 3243 * When the old page was mapped as Anon and it's unmap-and-freed 3244 * while migration was ongoing. 3245 * If unmap finds the old page, uncharge() of it will be delayed 3246 * until end_migration(). If unmap finds a new page, it's 3247 * uncharged when it make mapcount to be 1->0. If unmap code 3248 * finds swap_migration_entry, the new page will not be mapped 3249 * and end_migration() will find it(mapcount==0). 3250 * 3251 * B) 3252 * When the old page was mapped but migraion fails, the kernel 3253 * remaps it. A charge for it is kept by MIGRATION flag even 3254 * if mapcount goes down to 0. We can do remap successfully 3255 * without charging it again. 3256 * 3257 * C) 3258 * The "old" page is under lock_page() until the end of 3259 * migration, so, the old page itself will not be swapped-out. 3260 * If the new page is swapped out before end_migraton, our 3261 * hook to usual swap-out path will catch the event. 3262 */ 3263 if (PageAnon(page)) 3264 SetPageCgroupMigration(pc); 3265 } 3266 unlock_page_cgroup(pc); 3267 /* 3268 * If the page is not charged at this point, 3269 * we return here. 
3270 */ 3271 if (!memcg) 3272 return 0; 3273 3274 *memcgp = memcg; 3275 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); 3276 css_put(&memcg->css);/* drop extra refcnt */ 3277 if (ret) { 3278 if (PageAnon(page)) { 3279 lock_page_cgroup(pc); 3280 ClearPageCgroupMigration(pc); 3281 unlock_page_cgroup(pc); 3282 /* 3283 * The old page may be fully unmapped while we kept it. 3284 */ 3285 mem_cgroup_uncharge_page(page); 3286 } 3287 /* we'll need to revisit this error code (we have -EINTR) */ 3288 return -ENOMEM; 3289 } 3290 /* 3291 * We charge new page before it's used/mapped. So, even if unlock_page() 3292 * is called before end_migration, we can catch all events on this new 3293 * page. In the case new page is migrated but not remapped, new page's 3294 * mapcount will be finally 0 and we call uncharge in end_migration(). 3295 */ 3296 if (PageAnon(page)) 3297 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3298 else if (page_is_file_cache(page)) 3299 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3300 else 3301 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3302 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3303 return ret; 3304 } 3305 3306 /* remove redundant charge if migration failed*/ 3307 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 3308 struct page *oldpage, struct page *newpage, bool migration_ok) 3309 { 3310 struct page *used, *unused; 3311 struct page_cgroup *pc; 3312 bool anon; 3313 3314 if (!memcg) 3315 return; 3316 /* blocks rmdir() */ 3317 cgroup_exclude_rmdir(&memcg->css); 3318 if (!migration_ok) { 3319 used = oldpage; 3320 unused = newpage; 3321 } else { 3322 used = newpage; 3323 unused = oldpage; 3324 } 3325 /* 3326 * We disallowed uncharge of pages under migration because mapcount 3327 * of the page goes down to zero, temporarly. 3328 * Clear the flag and check the page should be charged. 
3329 */ 3330 pc = lookup_page_cgroup(oldpage); 3331 lock_page_cgroup(pc); 3332 ClearPageCgroupMigration(pc); 3333 unlock_page_cgroup(pc); 3334 anon = PageAnon(used); 3335 __mem_cgroup_uncharge_common(unused, 3336 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED 3337 : MEM_CGROUP_CHARGE_TYPE_CACHE); 3338 3339 /* 3340 * If a page is a file cache, radix-tree replacement is very atomic 3341 * and we can skip this check. When it was an Anon page, its mapcount 3342 * goes down to 0. But because we added MIGRATION flage, it's not 3343 * uncharged yet. There are several case but page->mapcount check 3344 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3345 * check. (see prepare_charge() also) 3346 */ 3347 if (anon) 3348 mem_cgroup_uncharge_page(used); 3349 /* 3350 * At migration, we may charge account against cgroup which has no 3351 * tasks. 3352 * So, rmdir()->pre_destroy() can be called while we do this charge. 3353 * In that case, we need to call pre_destroy() again. check it here. 3354 */ 3355 cgroup_release_and_wakeup_rmdir(&memcg->css); 3356 } 3357 3358 /* 3359 * At replace page cache, newpage is not under any memcg but it's on 3360 * LRU. So, this function doesn't touch res_counter but handles LRU 3361 * in correct way. Both pages are locked so we cannot race with uncharge. 
3362 */ 3363 void mem_cgroup_replace_page_cache(struct page *oldpage, 3364 struct page *newpage) 3365 { 3366 struct mem_cgroup *memcg; 3367 struct page_cgroup *pc; 3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3369 3370 if (mem_cgroup_disabled()) 3371 return; 3372 3373 pc = lookup_page_cgroup(oldpage); 3374 /* fix accounting on old pages */ 3375 lock_page_cgroup(pc); 3376 memcg = pc->mem_cgroup; 3377 mem_cgroup_charge_statistics(memcg, false, -1); 3378 ClearPageCgroupUsed(pc); 3379 unlock_page_cgroup(pc); 3380 3381 if (PageSwapBacked(oldpage)) 3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3383 3384 /* 3385 * Even if newpage->mapping was NULL before starting replacement, 3386 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3387 * LRU while we overwrite pc->mem_cgroup. 3388 */ 3389 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 3390 } 3391 3392 #ifdef CONFIG_DEBUG_VM 3393 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3394 { 3395 struct page_cgroup *pc; 3396 3397 pc = lookup_page_cgroup(page); 3398 /* 3399 * Can be NULL while feeding pages into the page allocator for 3400 * the first time, i.e. during boot or memory hotplug; 3401 * or when mem_cgroup_disabled(). 
3402 */ 3403 if (likely(pc) && PageCgroupUsed(pc)) 3404 return pc; 3405 return NULL; 3406 } 3407 3408 bool mem_cgroup_bad_page_check(struct page *page) 3409 { 3410 if (mem_cgroup_disabled()) 3411 return false; 3412 3413 return lookup_page_cgroup_used(page) != NULL; 3414 } 3415 3416 void mem_cgroup_print_bad_page(struct page *page) 3417 { 3418 struct page_cgroup *pc; 3419 3420 pc = lookup_page_cgroup_used(page); 3421 if (pc) { 3422 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3423 pc, pc->flags, pc->mem_cgroup); 3424 } 3425 } 3426 #endif 3427 3428 static DEFINE_MUTEX(set_limit_mutex); 3429 3430 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3431 unsigned long long val) 3432 { 3433 int retry_count; 3434 u64 memswlimit, memlimit; 3435 int ret = 0; 3436 int children = mem_cgroup_count_children(memcg); 3437 u64 curusage, oldusage; 3438 int enlarge; 3439 3440 /* 3441 * For keeping hierarchical_reclaim simple, how long we should retry 3442 * is depends on callers. We set our retry-count to be function 3443 * of # of children which we should visit in this loop. 3444 */ 3445 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3446 3447 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3448 3449 enlarge = 0; 3450 while (retry_count) { 3451 if (signal_pending(current)) { 3452 ret = -EINTR; 3453 break; 3454 } 3455 /* 3456 * Rather than hide all in some function, I do this in 3457 * open coded manner. You see what this really does. 3458 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 
3459 */ 3460 mutex_lock(&set_limit_mutex); 3461 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3462 if (memswlimit < val) { 3463 ret = -EINVAL; 3464 mutex_unlock(&set_limit_mutex); 3465 break; 3466 } 3467 3468 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3469 if (memlimit < val) 3470 enlarge = 1; 3471 3472 ret = res_counter_set_limit(&memcg->res, val); 3473 if (!ret) { 3474 if (memswlimit == val) 3475 memcg->memsw_is_minimum = true; 3476 else 3477 memcg->memsw_is_minimum = false; 3478 } 3479 mutex_unlock(&set_limit_mutex); 3480 3481 if (!ret) 3482 break; 3483 3484 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3485 MEM_CGROUP_RECLAIM_SHRINK); 3486 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3487 /* Usage is reduced ? */ 3488 if (curusage >= oldusage) 3489 retry_count--; 3490 else 3491 oldusage = curusage; 3492 } 3493 if (!ret && enlarge) 3494 memcg_oom_recover(memcg); 3495 3496 return ret; 3497 } 3498 3499 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3500 unsigned long long val) 3501 { 3502 int retry_count; 3503 u64 memlimit, memswlimit, oldusage, curusage; 3504 int children = mem_cgroup_count_children(memcg); 3505 int ret = -EBUSY; 3506 int enlarge = 0; 3507 3508 /* see mem_cgroup_resize_res_limit */ 3509 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3510 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3511 while (retry_count) { 3512 if (signal_pending(current)) { 3513 ret = -EINTR; 3514 break; 3515 } 3516 /* 3517 * Rather than hide all in some function, I do this in 3518 * open coded manner. You see what this really does. 3519 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 
3520 */ 3521 mutex_lock(&set_limit_mutex); 3522 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3523 if (memlimit > val) { 3524 ret = -EINVAL; 3525 mutex_unlock(&set_limit_mutex); 3526 break; 3527 } 3528 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3529 if (memswlimit < val) 3530 enlarge = 1; 3531 ret = res_counter_set_limit(&memcg->memsw, val); 3532 if (!ret) { 3533 if (memlimit == val) 3534 memcg->memsw_is_minimum = true; 3535 else 3536 memcg->memsw_is_minimum = false; 3537 } 3538 mutex_unlock(&set_limit_mutex); 3539 3540 if (!ret) 3541 break; 3542 3543 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3544 MEM_CGROUP_RECLAIM_NOSWAP | 3545 MEM_CGROUP_RECLAIM_SHRINK); 3546 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3547 /* Usage is reduced ? */ 3548 if (curusage >= oldusage) 3549 retry_count--; 3550 else 3551 oldusage = curusage; 3552 } 3553 if (!ret && enlarge) 3554 memcg_oom_recover(memcg); 3555 return ret; 3556 } 3557 3558 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3559 gfp_t gfp_mask, 3560 unsigned long *total_scanned) 3561 { 3562 unsigned long nr_reclaimed = 0; 3563 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3564 unsigned long reclaimed; 3565 int loop = 0; 3566 struct mem_cgroup_tree_per_zone *mctz; 3567 unsigned long long excess; 3568 unsigned long nr_scanned; 3569 3570 if (order > 0) 3571 return 0; 3572 3573 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3574 /* 3575 * This loop can run a while, specially if mem_cgroup's continuously 3576 * keep exceeding their soft limit and putting the system under 3577 * pressure 3578 */ 3579 do { 3580 if (next_mz) 3581 mz = next_mz; 3582 else 3583 mz = mem_cgroup_largest_soft_limit_node(mctz); 3584 if (!mz) 3585 break; 3586 3587 nr_scanned = 0; 3588 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3589 gfp_mask, &nr_scanned); 3590 nr_reclaimed += reclaimed; 3591 *total_scanned += nr_scanned; 3592 spin_lock(&mctz->lock); 3593 3594 
/* 3595 * If we failed to reclaim anything from this memory cgroup 3596 * it is time to move on to the next cgroup 3597 */ 3598 next_mz = NULL; 3599 if (!reclaimed) { 3600 do { 3601 /* 3602 * Loop until we find yet another one. 3603 * 3604 * By the time we get the soft_limit lock 3605 * again, someone might have aded the 3606 * group back on the RB tree. Iterate to 3607 * make sure we get a different mem. 3608 * mem_cgroup_largest_soft_limit_node returns 3609 * NULL if no other cgroup is present on 3610 * the tree 3611 */ 3612 next_mz = 3613 __mem_cgroup_largest_soft_limit_node(mctz); 3614 if (next_mz == mz) 3615 css_put(&next_mz->memcg->css); 3616 else /* next_mz == NULL or other memcg */ 3617 break; 3618 } while (1); 3619 } 3620 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 3621 excess = res_counter_soft_limit_excess(&mz->memcg->res); 3622 /* 3623 * One school of thought says that we should not add 3624 * back the node to the tree if reclaim returns 0. 3625 * But our reclaim could return 0, simply because due 3626 * to priority we are exposing a smaller subset of 3627 * memory to reclaim from. Consider this as a longer 3628 * term TODO. 3629 */ 3630 /* If excess == 0, no tree ops */ 3631 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 3632 spin_unlock(&mctz->lock); 3633 css_put(&mz->memcg->css); 3634 loop++; 3635 /* 3636 * Could not reclaim anything and there are no more 3637 * mem cgroups to try or we seem to be looping without 3638 * reclaiming anything. 3639 */ 3640 if (!nr_reclaimed && 3641 (next_mz == NULL || 3642 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3643 break; 3644 } while (!nr_reclaimed); 3645 if (next_mz) 3646 css_put(&next_mz->memcg->css); 3647 return nr_reclaimed; 3648 } 3649 3650 /* 3651 * This routine traverse page_cgroup in given list and drop them all. 3652 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
3653 */ 3654 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3655 int node, int zid, enum lru_list lru) 3656 { 3657 struct mem_cgroup_per_zone *mz; 3658 unsigned long flags, loop; 3659 struct list_head *list; 3660 struct page *busy; 3661 struct zone *zone; 3662 int ret = 0; 3663 3664 zone = &NODE_DATA(node)->node_zones[zid]; 3665 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3666 list = &mz->lruvec.lists[lru]; 3667 3668 loop = mz->lru_size[lru]; 3669 /* give some margin against EBUSY etc...*/ 3670 loop += 256; 3671 busy = NULL; 3672 while (loop--) { 3673 struct page_cgroup *pc; 3674 struct page *page; 3675 3676 ret = 0; 3677 spin_lock_irqsave(&zone->lru_lock, flags); 3678 if (list_empty(list)) { 3679 spin_unlock_irqrestore(&zone->lru_lock, flags); 3680 break; 3681 } 3682 page = list_entry(list->prev, struct page, lru); 3683 if (busy == page) { 3684 list_move(&page->lru, list); 3685 busy = NULL; 3686 spin_unlock_irqrestore(&zone->lru_lock, flags); 3687 continue; 3688 } 3689 spin_unlock_irqrestore(&zone->lru_lock, flags); 3690 3691 pc = lookup_page_cgroup(page); 3692 3693 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3694 if (ret == -ENOMEM || ret == -EINTR) 3695 break; 3696 3697 if (ret == -EBUSY || ret == -EINVAL) { 3698 /* found lock contention or "pc" is obsolete. */ 3699 busy = page; 3700 cond_resched(); 3701 } else 3702 busy = NULL; 3703 } 3704 3705 if (!ret && !list_empty(list)) 3706 return -EBUSY; 3707 return ret; 3708 } 3709 3710 /* 3711 * make mem_cgroup's charge to be 0 if there is no task. 3712 * This enables deleting this mem_cgroup. 3713 */ 3714 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 3715 { 3716 int ret; 3717 int node, zid, shrink; 3718 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3719 struct cgroup *cgrp = memcg->css.cgroup; 3720 3721 css_get(&memcg->css); 3722 3723 shrink = 0; 3724 /* should free all ? 
*/ 3725 if (free_all) 3726 goto try_to_free; 3727 move_account: 3728 do { 3729 ret = -EBUSY; 3730 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3731 goto out; 3732 ret = -EINTR; 3733 if (signal_pending(current)) 3734 goto out; 3735 /* This is for making all *used* pages to be on LRU. */ 3736 lru_add_drain_all(); 3737 drain_all_stock_sync(memcg); 3738 ret = 0; 3739 mem_cgroup_start_move(memcg); 3740 for_each_node_state(node, N_HIGH_MEMORY) { 3741 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3742 enum lru_list lru; 3743 for_each_lru(lru) { 3744 ret = mem_cgroup_force_empty_list(memcg, 3745 node, zid, lru); 3746 if (ret) 3747 break; 3748 } 3749 } 3750 if (ret) 3751 break; 3752 } 3753 mem_cgroup_end_move(memcg); 3754 memcg_oom_recover(memcg); 3755 /* it seems parent cgroup doesn't have enough mem */ 3756 if (ret == -ENOMEM) 3757 goto try_to_free; 3758 cond_resched(); 3759 /* "ret" should also be checked to ensure all lists are empty. */ 3760 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3761 out: 3762 css_put(&memcg->css); 3763 return ret; 3764 3765 try_to_free: 3766 /* returns EBUSY if there is a task or if we come here twice. */ 3767 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3768 ret = -EBUSY; 3769 goto out; 3770 } 3771 /* we call try-to-free pages for make this cgroup empty */ 3772 lru_add_drain_all(); 3773 /* try to free all pages in this cgroup */ 3774 shrink = 1; 3775 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3776 int progress; 3777 3778 if (signal_pending(current)) { 3779 ret = -EINTR; 3780 goto out; 3781 } 3782 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3783 false); 3784 if (!progress) { 3785 nr_retries--; 3786 /* maybe some writeback is necessary */ 3787 congestion_wait(BLK_RW_ASYNC, HZ/10); 3788 } 3789 3790 } 3791 lru_add_drain(); 3792 /* try move_account...there may be some *locked* pages. 
*/ 3793 goto move_account; 3794 } 3795 3796 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3797 { 3798 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3799 } 3800 3801 3802 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3803 { 3804 return mem_cgroup_from_cont(cont)->use_hierarchy; 3805 } 3806 3807 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3808 u64 val) 3809 { 3810 int retval = 0; 3811 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3812 struct cgroup *parent = cont->parent; 3813 struct mem_cgroup *parent_memcg = NULL; 3814 3815 if (parent) 3816 parent_memcg = mem_cgroup_from_cont(parent); 3817 3818 cgroup_lock(); 3819 /* 3820 * If parent's use_hierarchy is set, we can't make any modifications 3821 * in the child subtrees. If it is unset, then the change can 3822 * occur, provided the current cgroup has no children. 3823 * 3824 * For the root cgroup, parent_mem is NULL, we allow value to be 3825 * set if there are no children. 3826 */ 3827 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3828 (val == 1 || val == 0)) { 3829 if (list_empty(&cont->children)) 3830 memcg->use_hierarchy = val; 3831 else 3832 retval = -EBUSY; 3833 } else 3834 retval = -EINVAL; 3835 cgroup_unlock(); 3836 3837 return retval; 3838 } 3839 3840 3841 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3842 enum mem_cgroup_stat_index idx) 3843 { 3844 struct mem_cgroup *iter; 3845 long val = 0; 3846 3847 /* Per-cpu values can be negative, use a signed accumulator */ 3848 for_each_mem_cgroup_tree(iter, memcg) 3849 val += mem_cgroup_read_stat(iter, idx); 3850 3851 if (val < 0) /* race ? 
*/ 3852 val = 0; 3853 return val; 3854 } 3855 3856 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3857 { 3858 u64 val; 3859 3860 if (!mem_cgroup_is_root(memcg)) { 3861 if (!swap) 3862 return res_counter_read_u64(&memcg->res, RES_USAGE); 3863 else 3864 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3865 } 3866 3867 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 3868 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3869 3870 if (swap) 3871 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3872 3873 return val << PAGE_SHIFT; 3874 } 3875 3876 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, 3877 struct file *file, char __user *buf, 3878 size_t nbytes, loff_t *ppos) 3879 { 3880 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3881 char str[64]; 3882 u64 val; 3883 int type, name, len; 3884 3885 type = MEMFILE_TYPE(cft->private); 3886 name = MEMFILE_ATTR(cft->private); 3887 3888 if (!do_swap_account && type == _MEMSWAP) 3889 return -EOPNOTSUPP; 3890 3891 switch (type) { 3892 case _MEM: 3893 if (name == RES_USAGE) 3894 val = mem_cgroup_usage(memcg, false); 3895 else 3896 val = res_counter_read_u64(&memcg->res, name); 3897 break; 3898 case _MEMSWAP: 3899 if (name == RES_USAGE) 3900 val = mem_cgroup_usage(memcg, true); 3901 else 3902 val = res_counter_read_u64(&memcg->memsw, name); 3903 break; 3904 default: 3905 BUG(); 3906 } 3907 3908 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 3909 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 3910 } 3911 /* 3912 * The user of this function is... 3913 * RES_LIMIT. 
3914 */ 3915 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3916 const char *buffer) 3917 { 3918 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3919 int type, name; 3920 unsigned long long val; 3921 int ret; 3922 3923 type = MEMFILE_TYPE(cft->private); 3924 name = MEMFILE_ATTR(cft->private); 3925 3926 if (!do_swap_account && type == _MEMSWAP) 3927 return -EOPNOTSUPP; 3928 3929 switch (name) { 3930 case RES_LIMIT: 3931 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3932 ret = -EINVAL; 3933 break; 3934 } 3935 /* This function does all necessary parse...reuse it */ 3936 ret = res_counter_memparse_write_strategy(buffer, &val); 3937 if (ret) 3938 break; 3939 if (type == _MEM) 3940 ret = mem_cgroup_resize_limit(memcg, val); 3941 else 3942 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3943 break; 3944 case RES_SOFT_LIMIT: 3945 ret = res_counter_memparse_write_strategy(buffer, &val); 3946 if (ret) 3947 break; 3948 /* 3949 * For memsw, soft limits are hard to implement in terms 3950 * of semantics, for now, we support soft limits for 3951 * control without swap 3952 */ 3953 if (type == _MEM) 3954 ret = res_counter_set_soft_limit(&memcg->res, val); 3955 else 3956 ret = -EINVAL; 3957 break; 3958 default: 3959 ret = -EINVAL; /* should be BUG() ? 
*/ 3960 break; 3961 } 3962 return ret; 3963 } 3964 3965 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3966 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3967 { 3968 struct cgroup *cgroup; 3969 unsigned long long min_limit, min_memsw_limit, tmp; 3970 3971 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3972 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3973 cgroup = memcg->css.cgroup; 3974 if (!memcg->use_hierarchy) 3975 goto out; 3976 3977 while (cgroup->parent) { 3978 cgroup = cgroup->parent; 3979 memcg = mem_cgroup_from_cont(cgroup); 3980 if (!memcg->use_hierarchy) 3981 break; 3982 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3983 min_limit = min(min_limit, tmp); 3984 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3985 min_memsw_limit = min(min_memsw_limit, tmp); 3986 } 3987 out: 3988 *mem_limit = min_limit; 3989 *memsw_limit = min_memsw_limit; 3990 } 3991 3992 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3993 { 3994 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3995 int type, name; 3996 3997 type = MEMFILE_TYPE(event); 3998 name = MEMFILE_ATTR(event); 3999 4000 if (!do_swap_account && type == _MEMSWAP) 4001 return -EOPNOTSUPP; 4002 4003 switch (name) { 4004 case RES_MAX_USAGE: 4005 if (type == _MEM) 4006 res_counter_reset_max(&memcg->res); 4007 else 4008 res_counter_reset_max(&memcg->memsw); 4009 break; 4010 case RES_FAILCNT: 4011 if (type == _MEM) 4012 res_counter_reset_failcnt(&memcg->res); 4013 else 4014 res_counter_reset_failcnt(&memcg->memsw); 4015 break; 4016 } 4017 4018 return 0; 4019 } 4020 4021 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4022 struct cftype *cft) 4023 { 4024 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4025 } 4026 4027 #ifdef CONFIG_MMU 4028 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4029 struct cftype *cft, u64 val) 4030 { 4031 struct mem_cgroup *memcg = 
mem_cgroup_from_cont(cgrp); 4032 4033 if (val >= (1 << NR_MOVE_TYPE)) 4034 return -EINVAL; 4035 /* 4036 * We check this value several times in both in can_attach() and 4037 * attach(), so we need cgroup lock to prevent this value from being 4038 * inconsistent. 4039 */ 4040 cgroup_lock(); 4041 memcg->move_charge_at_immigrate = val; 4042 cgroup_unlock(); 4043 4044 return 0; 4045 } 4046 #else 4047 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4048 struct cftype *cft, u64 val) 4049 { 4050 return -ENOSYS; 4051 } 4052 #endif 4053 4054 4055 /* For read statistics */ 4056 enum { 4057 MCS_CACHE, 4058 MCS_RSS, 4059 MCS_FILE_MAPPED, 4060 MCS_PGPGIN, 4061 MCS_PGPGOUT, 4062 MCS_SWAP, 4063 MCS_PGFAULT, 4064 MCS_PGMAJFAULT, 4065 MCS_INACTIVE_ANON, 4066 MCS_ACTIVE_ANON, 4067 MCS_INACTIVE_FILE, 4068 MCS_ACTIVE_FILE, 4069 MCS_UNEVICTABLE, 4070 NR_MCS_STAT, 4071 }; 4072 4073 struct mcs_total_stat { 4074 s64 stat[NR_MCS_STAT]; 4075 }; 4076 4077 struct { 4078 char *local_name; 4079 char *total_name; 4080 } memcg_stat_strings[NR_MCS_STAT] = { 4081 {"cache", "total_cache"}, 4082 {"rss", "total_rss"}, 4083 {"mapped_file", "total_mapped_file"}, 4084 {"pgpgin", "total_pgpgin"}, 4085 {"pgpgout", "total_pgpgout"}, 4086 {"swap", "total_swap"}, 4087 {"pgfault", "total_pgfault"}, 4088 {"pgmajfault", "total_pgmajfault"}, 4089 {"inactive_anon", "total_inactive_anon"}, 4090 {"active_anon", "total_active_anon"}, 4091 {"inactive_file", "total_inactive_file"}, 4092 {"active_file", "total_active_file"}, 4093 {"unevictable", "total_unevictable"} 4094 }; 4095 4096 4097 static void 4098 mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4099 { 4100 s64 val; 4101 4102 /* per cpu stat */ 4103 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); 4104 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4105 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); 4106 s->stat[MCS_RSS] += val * PAGE_SIZE; 4107 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 
4108 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4109 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); 4110 s->stat[MCS_PGPGIN] += val; 4111 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); 4112 s->stat[MCS_PGPGOUT] += val; 4113 if (do_swap_account) { 4114 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 4115 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4116 } 4117 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); 4118 s->stat[MCS_PGFAULT] += val; 4119 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); 4120 s->stat[MCS_PGMAJFAULT] += val; 4121 4122 /* per zone stat */ 4123 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 4124 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4125 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 4126 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4127 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 4128 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4129 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 4130 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4131 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4132 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4133 } 4134 4135 static void 4136 mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) 4137 { 4138 struct mem_cgroup *iter; 4139 4140 for_each_mem_cgroup_tree(iter, memcg) 4141 mem_cgroup_get_local_stat(iter, s); 4142 } 4143 4144 #ifdef CONFIG_NUMA 4145 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4146 { 4147 int nid; 4148 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4149 unsigned long node_nr; 4150 struct cgroup *cont = m->private; 4151 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4152 4153 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4154 seq_printf(m, "total=%lu", total_nr); 4155 for_each_node_state(nid, N_HIGH_MEMORY) { 4156 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 
4157 seq_printf(m, " N%d=%lu", nid, node_nr); 4158 } 4159 seq_putc(m, '\n'); 4160 4161 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 4162 seq_printf(m, "file=%lu", file_nr); 4163 for_each_node_state(nid, N_HIGH_MEMORY) { 4164 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4165 LRU_ALL_FILE); 4166 seq_printf(m, " N%d=%lu", nid, node_nr); 4167 } 4168 seq_putc(m, '\n'); 4169 4170 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 4171 seq_printf(m, "anon=%lu", anon_nr); 4172 for_each_node_state(nid, N_HIGH_MEMORY) { 4173 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4174 LRU_ALL_ANON); 4175 seq_printf(m, " N%d=%lu", nid, node_nr); 4176 } 4177 seq_putc(m, '\n'); 4178 4179 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 4180 seq_printf(m, "unevictable=%lu", unevictable_nr); 4181 for_each_node_state(nid, N_HIGH_MEMORY) { 4182 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4183 BIT(LRU_UNEVICTABLE)); 4184 seq_printf(m, " N%d=%lu", nid, node_nr); 4185 } 4186 seq_putc(m, '\n'); 4187 return 0; 4188 } 4189 #endif /* CONFIG_NUMA */ 4190 4191 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4192 struct cgroup_map_cb *cb) 4193 { 4194 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4195 struct mcs_total_stat mystat; 4196 int i; 4197 4198 memset(&mystat, 0, sizeof(mystat)); 4199 mem_cgroup_get_local_stat(memcg, &mystat); 4200 4201 4202 for (i = 0; i < NR_MCS_STAT; i++) { 4203 if (i == MCS_SWAP && !do_swap_account) 4204 continue; 4205 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4206 } 4207 4208 /* Hierarchical information */ 4209 { 4210 unsigned long long limit, memsw_limit; 4211 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4212 cb->fill(cb, "hierarchical_memory_limit", limit); 4213 if (do_swap_account) 4214 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4215 } 4216 4217 memset(&mystat, 0, sizeof(mystat)); 4218 mem_cgroup_get_total_stat(memcg, &mystat); 4219 
for (i = 0; i < NR_MCS_STAT; i++) { 4220 if (i == MCS_SWAP && !do_swap_account) 4221 continue; 4222 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4223 } 4224 4225 #ifdef CONFIG_DEBUG_VM 4226 { 4227 int nid, zid; 4228 struct mem_cgroup_per_zone *mz; 4229 unsigned long recent_rotated[2] = {0, 0}; 4230 unsigned long recent_scanned[2] = {0, 0}; 4231 4232 for_each_online_node(nid) 4233 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4234 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4235 4236 recent_rotated[0] += 4237 mz->reclaim_stat.recent_rotated[0]; 4238 recent_rotated[1] += 4239 mz->reclaim_stat.recent_rotated[1]; 4240 recent_scanned[0] += 4241 mz->reclaim_stat.recent_scanned[0]; 4242 recent_scanned[1] += 4243 mz->reclaim_stat.recent_scanned[1]; 4244 } 4245 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4246 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4247 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4248 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4249 } 4250 #endif 4251 4252 return 0; 4253 } 4254 4255 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4256 { 4257 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4258 4259 return mem_cgroup_swappiness(memcg); 4260 } 4261 4262 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4263 u64 val) 4264 { 4265 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4266 struct mem_cgroup *parent; 4267 4268 if (val > 100) 4269 return -EINVAL; 4270 4271 if (cgrp->parent == NULL) 4272 return -EINVAL; 4273 4274 parent = mem_cgroup_from_cont(cgrp->parent); 4275 4276 cgroup_lock(); 4277 4278 /* If under hierarchy, only empty-root can set this value */ 4279 if ((parent->use_hierarchy) || 4280 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4281 cgroup_unlock(); 4282 return -EINVAL; 4283 } 4284 4285 memcg->swappiness = val; 4286 4287 cgroup_unlock(); 4288 4289 return 0; 4290 } 4291 4292 static void 
__mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4293 { 4294 struct mem_cgroup_threshold_ary *t; 4295 u64 usage; 4296 int i; 4297 4298 rcu_read_lock(); 4299 if (!swap) 4300 t = rcu_dereference(memcg->thresholds.primary); 4301 else 4302 t = rcu_dereference(memcg->memsw_thresholds.primary); 4303 4304 if (!t) 4305 goto unlock; 4306 4307 usage = mem_cgroup_usage(memcg, swap); 4308 4309 /* 4310 * current_threshold points to threshold just below usage. 4311 * If it's not true, a threshold was crossed after last 4312 * call of __mem_cgroup_threshold(). 4313 */ 4314 i = t->current_threshold; 4315 4316 /* 4317 * Iterate backward over array of thresholds starting from 4318 * current_threshold and check if a threshold is crossed. 4319 * If none of thresholds below usage is crossed, we read 4320 * only one element of the array here. 4321 */ 4322 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4323 eventfd_signal(t->entries[i].eventfd, 1); 4324 4325 /* i = current_threshold + 1 */ 4326 i++; 4327 4328 /* 4329 * Iterate forward over array of thresholds starting from 4330 * current_threshold+1 and check if a threshold is crossed. 4331 * If none of thresholds above usage is crossed, we read 4332 * only one element of the array here. 
4333 */ 4334 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4335 eventfd_signal(t->entries[i].eventfd, 1); 4336 4337 /* Update current_threshold */ 4338 t->current_threshold = i - 1; 4339 unlock: 4340 rcu_read_unlock(); 4341 } 4342 4343 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4344 { 4345 while (memcg) { 4346 __mem_cgroup_threshold(memcg, false); 4347 if (do_swap_account) 4348 __mem_cgroup_threshold(memcg, true); 4349 4350 memcg = parent_mem_cgroup(memcg); 4351 } 4352 } 4353 4354 static int compare_thresholds(const void *a, const void *b) 4355 { 4356 const struct mem_cgroup_threshold *_a = a; 4357 const struct mem_cgroup_threshold *_b = b; 4358 4359 return _a->threshold - _b->threshold; 4360 } 4361 4362 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4363 { 4364 struct mem_cgroup_eventfd_list *ev; 4365 4366 list_for_each_entry(ev, &memcg->oom_notify, list) 4367 eventfd_signal(ev->eventfd, 1); 4368 return 0; 4369 } 4370 4371 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4372 { 4373 struct mem_cgroup *iter; 4374 4375 for_each_mem_cgroup_tree(iter, memcg) 4376 mem_cgroup_oom_notify_cb(iter); 4377 } 4378 4379 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4380 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4381 { 4382 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4383 struct mem_cgroup_thresholds *thresholds; 4384 struct mem_cgroup_threshold_ary *new; 4385 int type = MEMFILE_TYPE(cft->private); 4386 u64 threshold, usage; 4387 int i, size, ret; 4388 4389 ret = res_counter_memparse_write_strategy(args, &threshold); 4390 if (ret) 4391 return ret; 4392 4393 mutex_lock(&memcg->thresholds_lock); 4394 4395 if (type == _MEM) 4396 thresholds = &memcg->thresholds; 4397 else if (type == _MEMSWAP) 4398 thresholds = &memcg->memsw_thresholds; 4399 else 4400 BUG(); 4401 4402 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4403 4404 /* Check if a threshold crossed 
before adding a new one */ 4405 if (thresholds->primary) 4406 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4407 4408 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4409 4410 /* Allocate memory for new array of thresholds */ 4411 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4412 GFP_KERNEL); 4413 if (!new) { 4414 ret = -ENOMEM; 4415 goto unlock; 4416 } 4417 new->size = size; 4418 4419 /* Copy thresholds (if any) to new array */ 4420 if (thresholds->primary) { 4421 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4422 sizeof(struct mem_cgroup_threshold)); 4423 } 4424 4425 /* Add new threshold */ 4426 new->entries[size - 1].eventfd = eventfd; 4427 new->entries[size - 1].threshold = threshold; 4428 4429 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4430 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4431 compare_thresholds, NULL); 4432 4433 /* Find current threshold */ 4434 new->current_threshold = -1; 4435 for (i = 0; i < size; i++) { 4436 if (new->entries[i].threshold < usage) { 4437 /* 4438 * new->current_threshold will not be used until 4439 * rcu_assign_pointer(), so it's safe to increment 4440 * it here. 
4441 */ 4442 ++new->current_threshold; 4443 } 4444 } 4445 4446 /* Free old spare buffer and save old primary buffer as spare */ 4447 kfree(thresholds->spare); 4448 thresholds->spare = thresholds->primary; 4449 4450 rcu_assign_pointer(thresholds->primary, new); 4451 4452 /* To be sure that nobody uses thresholds */ 4453 synchronize_rcu(); 4454 4455 unlock: 4456 mutex_unlock(&memcg->thresholds_lock); 4457 4458 return ret; 4459 } 4460 4461 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4462 struct cftype *cft, struct eventfd_ctx *eventfd) 4463 { 4464 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4465 struct mem_cgroup_thresholds *thresholds; 4466 struct mem_cgroup_threshold_ary *new; 4467 int type = MEMFILE_TYPE(cft->private); 4468 u64 usage; 4469 int i, j, size; 4470 4471 mutex_lock(&memcg->thresholds_lock); 4472 if (type == _MEM) 4473 thresholds = &memcg->thresholds; 4474 else if (type == _MEMSWAP) 4475 thresholds = &memcg->memsw_thresholds; 4476 else 4477 BUG(); 4478 4479 if (!thresholds->primary) 4480 goto unlock; 4481 4482 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4483 4484 /* Check if a threshold crossed before removing */ 4485 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4486 4487 /* Calculate new number of threshold */ 4488 size = 0; 4489 for (i = 0; i < thresholds->primary->size; i++) { 4490 if (thresholds->primary->entries[i].eventfd != eventfd) 4491 size++; 4492 } 4493 4494 new = thresholds->spare; 4495 4496 /* Set thresholds array to NULL if we don't have thresholds */ 4497 if (!size) { 4498 kfree(new); 4499 new = NULL; 4500 goto swap_buffers; 4501 } 4502 4503 new->size = size; 4504 4505 /* Copy thresholds and find current threshold */ 4506 new->current_threshold = -1; 4507 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4508 if (thresholds->primary->entries[i].eventfd == eventfd) 4509 continue; 4510 4511 new->entries[j] = thresholds->primary->entries[i]; 4512 if (new->entries[j].threshold < usage) { 4513 
/* 4514 * new->current_threshold will not be used 4515 * until rcu_assign_pointer(), so it's safe to increment 4516 * it here. 4517 */ 4518 ++new->current_threshold; 4519 } 4520 j++; 4521 } 4522 4523 swap_buffers: 4524 /* Swap primary and spare array */ 4525 thresholds->spare = thresholds->primary; 4526 /* If all events are unregistered, free the spare array */ 4527 if (!new) { 4528 kfree(thresholds->spare); 4529 thresholds->spare = NULL; 4530 } 4531 4532 rcu_assign_pointer(thresholds->primary, new); 4533 4534 /* To be sure that nobody uses thresholds */ 4535 synchronize_rcu(); 4536 unlock: 4537 mutex_unlock(&memcg->thresholds_lock); 4538 } 4539 4540 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4541 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4542 { 4543 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4544 struct mem_cgroup_eventfd_list *event; 4545 int type = MEMFILE_TYPE(cft->private); 4546 4547 BUG_ON(type != _OOM_TYPE); 4548 event = kmalloc(sizeof(*event), GFP_KERNEL); 4549 if (!event) 4550 return -ENOMEM; 4551 4552 spin_lock(&memcg_oom_lock); 4553 4554 event->eventfd = eventfd; 4555 list_add(&event->list, &memcg->oom_notify); 4556 4557 /* already in OOM ? 
*/ 4558 if (atomic_read(&memcg->under_oom)) 4559 eventfd_signal(eventfd, 1); 4560 spin_unlock(&memcg_oom_lock); 4561 4562 return 0; 4563 } 4564 4565 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4566 struct cftype *cft, struct eventfd_ctx *eventfd) 4567 { 4568 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4569 struct mem_cgroup_eventfd_list *ev, *tmp; 4570 int type = MEMFILE_TYPE(cft->private); 4571 4572 BUG_ON(type != _OOM_TYPE); 4573 4574 spin_lock(&memcg_oom_lock); 4575 4576 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4577 if (ev->eventfd == eventfd) { 4578 list_del(&ev->list); 4579 kfree(ev); 4580 } 4581 } 4582 4583 spin_unlock(&memcg_oom_lock); 4584 } 4585 4586 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4587 struct cftype *cft, struct cgroup_map_cb *cb) 4588 { 4589 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4590 4591 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 4592 4593 if (atomic_read(&memcg->under_oom)) 4594 cb->fill(cb, "under_oom", 1); 4595 else 4596 cb->fill(cb, "under_oom", 0); 4597 return 0; 4598 } 4599 4600 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4601 struct cftype *cft, u64 val) 4602 { 4603 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4604 struct mem_cgroup *parent; 4605 4606 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4607 if (!cgrp->parent || !((val == 0) || (val == 1))) 4608 return -EINVAL; 4609 4610 parent = mem_cgroup_from_cont(cgrp->parent); 4611 4612 cgroup_lock(); 4613 /* oom-kill-disable is a flag for subhierarchy. 
*/ 4614 if ((parent->use_hierarchy) || 4615 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4616 cgroup_unlock(); 4617 return -EINVAL; 4618 } 4619 memcg->oom_kill_disable = val; 4620 if (!val) 4621 memcg_oom_recover(memcg); 4622 cgroup_unlock(); 4623 return 0; 4624 } 4625 4626 #ifdef CONFIG_NUMA 4627 static const struct file_operations mem_control_numa_stat_file_operations = { 4628 .read = seq_read, 4629 .llseek = seq_lseek, 4630 .release = single_release, 4631 }; 4632 4633 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4634 { 4635 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4636 4637 file->f_op = &mem_control_numa_stat_file_operations; 4638 return single_open(file, mem_control_numa_stat_show, cont); 4639 } 4640 #endif /* CONFIG_NUMA */ 4641 4642 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4643 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4644 { 4645 return mem_cgroup_sockets_init(memcg, ss); 4646 }; 4647 4648 static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 4649 { 4650 mem_cgroup_sockets_destroy(memcg); 4651 } 4652 #else 4653 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4654 { 4655 return 0; 4656 } 4657 4658 static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 4659 { 4660 } 4661 #endif 4662 4663 static struct cftype mem_cgroup_files[] = { 4664 { 4665 .name = "usage_in_bytes", 4666 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4667 .read = mem_cgroup_read, 4668 .register_event = mem_cgroup_usage_register_event, 4669 .unregister_event = mem_cgroup_usage_unregister_event, 4670 }, 4671 { 4672 .name = "max_usage_in_bytes", 4673 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4674 .trigger = mem_cgroup_reset, 4675 .read = mem_cgroup_read, 4676 }, 4677 { 4678 .name = "limit_in_bytes", 4679 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4680 .write_string = mem_cgroup_write, 4681 .read = mem_cgroup_read, 4682 }, 4683 { 4684 .name = 
"soft_limit_in_bytes", 4685 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4686 .write_string = mem_cgroup_write, 4687 .read = mem_cgroup_read, 4688 }, 4689 { 4690 .name = "failcnt", 4691 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4692 .trigger = mem_cgroup_reset, 4693 .read = mem_cgroup_read, 4694 }, 4695 { 4696 .name = "stat", 4697 .read_map = mem_control_stat_show, 4698 }, 4699 { 4700 .name = "force_empty", 4701 .trigger = mem_cgroup_force_empty_write, 4702 }, 4703 { 4704 .name = "use_hierarchy", 4705 .write_u64 = mem_cgroup_hierarchy_write, 4706 .read_u64 = mem_cgroup_hierarchy_read, 4707 }, 4708 { 4709 .name = "swappiness", 4710 .read_u64 = mem_cgroup_swappiness_read, 4711 .write_u64 = mem_cgroup_swappiness_write, 4712 }, 4713 { 4714 .name = "move_charge_at_immigrate", 4715 .read_u64 = mem_cgroup_move_charge_read, 4716 .write_u64 = mem_cgroup_move_charge_write, 4717 }, 4718 { 4719 .name = "oom_control", 4720 .read_map = mem_cgroup_oom_control_read, 4721 .write_u64 = mem_cgroup_oom_control_write, 4722 .register_event = mem_cgroup_oom_register_event, 4723 .unregister_event = mem_cgroup_oom_unregister_event, 4724 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4725 }, 4726 #ifdef CONFIG_NUMA 4727 { 4728 .name = "numa_stat", 4729 .open = mem_control_numa_stat_open, 4730 .mode = S_IRUGO, 4731 }, 4732 #endif 4733 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4734 { 4735 .name = "memsw.usage_in_bytes", 4736 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4737 .read = mem_cgroup_read, 4738 .register_event = mem_cgroup_usage_register_event, 4739 .unregister_event = mem_cgroup_usage_unregister_event, 4740 }, 4741 { 4742 .name = "memsw.max_usage_in_bytes", 4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4744 .trigger = mem_cgroup_reset, 4745 .read = mem_cgroup_read, 4746 }, 4747 { 4748 .name = "memsw.limit_in_bytes", 4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4750 .write_string = mem_cgroup_write, 4751 .read = mem_cgroup_read, 4752 }, 4753 { 
4754 .name = "memsw.failcnt", 4755 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4756 .trigger = mem_cgroup_reset, 4757 .read = mem_cgroup_read, 4758 }, 4759 #endif 4760 { }, /* terminate */ 4761 }; 4762 4763 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4764 { 4765 struct mem_cgroup_per_node *pn; 4766 struct mem_cgroup_per_zone *mz; 4767 enum lru_list lru; 4768 int zone, tmp = node; 4769 /* 4770 * This routine is called against possible nodes. 4771 * But it's BUG to call kmalloc() against offline node. 4772 * 4773 * TODO: this routine can waste much memory for nodes which will 4774 * never be onlined. It's better to use memory hotplug callback 4775 * function. 4776 */ 4777 if (!node_state(node, N_NORMAL_MEMORY)) 4778 tmp = -1; 4779 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4780 if (!pn) 4781 return 1; 4782 4783 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4784 mz = &pn->zoneinfo[zone]; 4785 for_each_lru(lru) 4786 INIT_LIST_HEAD(&mz->lruvec.lists[lru]); 4787 mz->usage_in_excess = 0; 4788 mz->on_tree = false; 4789 mz->memcg = memcg; 4790 } 4791 memcg->info.nodeinfo[node] = pn; 4792 return 0; 4793 } 4794 4795 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4796 { 4797 kfree(memcg->info.nodeinfo[node]); 4798 } 4799 4800 static struct mem_cgroup *mem_cgroup_alloc(void) 4801 { 4802 struct mem_cgroup *memcg; 4803 int size = sizeof(struct mem_cgroup); 4804 4805 /* Can be very big if MAX_NUMNODES is very big */ 4806 if (size < PAGE_SIZE) 4807 memcg = kzalloc(size, GFP_KERNEL); 4808 else 4809 memcg = vzalloc(size); 4810 4811 if (!memcg) 4812 return NULL; 4813 4814 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4815 if (!memcg->stat) 4816 goto out_free; 4817 spin_lock_init(&memcg->pcp_counter_lock); 4818 return memcg; 4819 4820 out_free: 4821 if (size < PAGE_SIZE) 4822 kfree(memcg); 4823 else 4824 vfree(memcg); 4825 return NULL; 4826 } 4827 4828 /* 4829 * Helpers for freeing a vzalloc()ed 
mem_cgroup by RCU, 4830 * but in process context. The work_freeing structure is overlaid 4831 * on the rcu_freeing structure, which itself is overlaid on memsw. 4832 */ 4833 static void vfree_work(struct work_struct *work) 4834 { 4835 struct mem_cgroup *memcg; 4836 4837 memcg = container_of(work, struct mem_cgroup, work_freeing); 4838 vfree(memcg); 4839 } 4840 static void vfree_rcu(struct rcu_head *rcu_head) 4841 { 4842 struct mem_cgroup *memcg; 4843 4844 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4845 INIT_WORK(&memcg->work_freeing, vfree_work); 4846 schedule_work(&memcg->work_freeing); 4847 } 4848 4849 /* 4850 * At destroying mem_cgroup, references from swap_cgroup can remain. 4851 * (scanning all at force_empty is too costly...) 4852 * 4853 * Instead of clearing all references at force_empty, we remember 4854 * the number of reference from swap_cgroup and free mem_cgroup when 4855 * it goes down to 0. 4856 * 4857 * Removal of cgroup itself succeeds regardless of refs from swap. 
4858 */ 4859 4860 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4861 { 4862 int node; 4863 4864 mem_cgroup_remove_from_trees(memcg); 4865 free_css_id(&mem_cgroup_subsys, &memcg->css); 4866 4867 for_each_node(node) 4868 free_mem_cgroup_per_zone_info(memcg, node); 4869 4870 free_percpu(memcg->stat); 4871 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4872 kfree_rcu(memcg, rcu_freeing); 4873 else 4874 call_rcu(&memcg->rcu_freeing, vfree_rcu); 4875 } 4876 4877 static void mem_cgroup_get(struct mem_cgroup *memcg) 4878 { 4879 atomic_inc(&memcg->refcnt); 4880 } 4881 4882 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) 4883 { 4884 if (atomic_sub_and_test(count, &memcg->refcnt)) { 4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4886 __mem_cgroup_free(memcg); 4887 if (parent) 4888 mem_cgroup_put(parent); 4889 } 4890 } 4891 4892 static void mem_cgroup_put(struct mem_cgroup *memcg) 4893 { 4894 __mem_cgroup_put(memcg, 1); 4895 } 4896 4897 /* 4898 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
4899 */ 4900 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4901 { 4902 if (!memcg->res.parent) 4903 return NULL; 4904 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4905 } 4906 EXPORT_SYMBOL(parent_mem_cgroup); 4907 4908 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4909 static void __init enable_swap_cgroup(void) 4910 { 4911 if (!mem_cgroup_disabled() && really_do_swap_account) 4912 do_swap_account = 1; 4913 } 4914 #else 4915 static void __init enable_swap_cgroup(void) 4916 { 4917 } 4918 #endif 4919 4920 static int mem_cgroup_soft_limit_tree_init(void) 4921 { 4922 struct mem_cgroup_tree_per_node *rtpn; 4923 struct mem_cgroup_tree_per_zone *rtpz; 4924 int tmp, node, zone; 4925 4926 for_each_node(node) { 4927 tmp = node; 4928 if (!node_state(node, N_NORMAL_MEMORY)) 4929 tmp = -1; 4930 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4931 if (!rtpn) 4932 goto err_cleanup; 4933 4934 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4935 4936 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4937 rtpz = &rtpn->rb_tree_per_zone[zone]; 4938 rtpz->rb_root = RB_ROOT; 4939 spin_lock_init(&rtpz->lock); 4940 } 4941 } 4942 return 0; 4943 4944 err_cleanup: 4945 for_each_node(node) { 4946 if (!soft_limit_tree.rb_tree_per_node[node]) 4947 break; 4948 kfree(soft_limit_tree.rb_tree_per_node[node]); 4949 soft_limit_tree.rb_tree_per_node[node] = NULL; 4950 } 4951 return 1; 4952 4953 } 4954 4955 static struct cgroup_subsys_state * __ref 4956 mem_cgroup_create(struct cgroup *cont) 4957 { 4958 struct mem_cgroup *memcg, *parent; 4959 long error = -ENOMEM; 4960 int node; 4961 4962 memcg = mem_cgroup_alloc(); 4963 if (!memcg) 4964 return ERR_PTR(error); 4965 4966 for_each_node(node) 4967 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4968 goto free_out; 4969 4970 /* root ? 
*/ 4971 if (cont->parent == NULL) { 4972 int cpu; 4973 enable_swap_cgroup(); 4974 parent = NULL; 4975 if (mem_cgroup_soft_limit_tree_init()) 4976 goto free_out; 4977 root_mem_cgroup = memcg; 4978 for_each_possible_cpu(cpu) { 4979 struct memcg_stock_pcp *stock = 4980 &per_cpu(memcg_stock, cpu); 4981 INIT_WORK(&stock->work, drain_local_stock); 4982 } 4983 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4984 } else { 4985 parent = mem_cgroup_from_cont(cont->parent); 4986 memcg->use_hierarchy = parent->use_hierarchy; 4987 memcg->oom_kill_disable = parent->oom_kill_disable; 4988 } 4989 4990 if (parent && parent->use_hierarchy) { 4991 res_counter_init(&memcg->res, &parent->res); 4992 res_counter_init(&memcg->memsw, &parent->memsw); 4993 /* 4994 * We increment refcnt of the parent to ensure that we can 4995 * safely access it on res_counter_charge/uncharge. 4996 * This refcnt will be decremented when freeing this 4997 * mem_cgroup(see mem_cgroup_put). 4998 */ 4999 mem_cgroup_get(parent); 5000 } else { 5001 res_counter_init(&memcg->res, NULL); 5002 res_counter_init(&memcg->memsw, NULL); 5003 } 5004 memcg->last_scanned_node = MAX_NUMNODES; 5005 INIT_LIST_HEAD(&memcg->oom_notify); 5006 5007 if (parent) 5008 memcg->swappiness = mem_cgroup_swappiness(parent); 5009 atomic_set(&memcg->refcnt, 1); 5010 memcg->move_charge_at_immigrate = 0; 5011 mutex_init(&memcg->thresholds_lock); 5012 spin_lock_init(&memcg->move_lock); 5013 5014 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 5015 if (error) { 5016 /* 5017 * We call put now because our (and parent's) refcnts 5018 * are already in place. 
mem_cgroup_put() will internally 5019 * call __mem_cgroup_free, so return directly 5020 */ 5021 mem_cgroup_put(memcg); 5022 return ERR_PTR(error); 5023 } 5024 return &memcg->css; 5025 free_out: 5026 __mem_cgroup_free(memcg); 5027 return ERR_PTR(error); 5028 } 5029 5030 static int mem_cgroup_pre_destroy(struct cgroup *cont) 5031 { 5032 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5033 5034 return mem_cgroup_force_empty(memcg, false); 5035 } 5036 5037 static void mem_cgroup_destroy(struct cgroup *cont) 5038 { 5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5040 5041 kmem_cgroup_destroy(memcg); 5042 5043 mem_cgroup_put(memcg); 5044 } 5045 5046 #ifdef CONFIG_MMU 5047 /* Handlers for move charge at task migration. */ 5048 #define PRECHARGE_COUNT_AT_ONCE 256 5049 static int mem_cgroup_do_precharge(unsigned long count) 5050 { 5051 int ret = 0; 5052 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5053 struct mem_cgroup *memcg = mc.to; 5054 5055 if (mem_cgroup_is_root(memcg)) { 5056 mc.precharge += count; 5057 /* we don't need css_get for root */ 5058 return ret; 5059 } 5060 /* try to charge at once */ 5061 if (count > 1) { 5062 struct res_counter *dummy; 5063 /* 5064 * "memcg" cannot be under rmdir() because we've already checked 5065 * by cgroup_lock_live_cgroup() that it is not removed and we 5066 * are still under the same cgroup_mutex. So we can postpone 5067 * css_get(). 
5068 */ 5069 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 5070 goto one_by_one; 5071 if (do_swap_account && res_counter_charge(&memcg->memsw, 5072 PAGE_SIZE * count, &dummy)) { 5073 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 5074 goto one_by_one; 5075 } 5076 mc.precharge += count; 5077 return ret; 5078 } 5079 one_by_one: 5080 /* fall back to one by one charge */ 5081 while (count--) { 5082 if (signal_pending(current)) { 5083 ret = -EINTR; 5084 break; 5085 } 5086 if (!batch_count--) { 5087 batch_count = PRECHARGE_COUNT_AT_ONCE; 5088 cond_resched(); 5089 } 5090 ret = __mem_cgroup_try_charge(NULL, 5091 GFP_KERNEL, 1, &memcg, false); 5092 if (ret) 5093 /* mem_cgroup_clear_mc() will do uncharge later */ 5094 return ret; 5095 mc.precharge++; 5096 } 5097 return ret; 5098 } 5099 5100 /** 5101 * get_mctgt_type - get target type of moving charge 5102 * @vma: the vma the pte to be checked belongs 5103 * @addr: the address corresponding to the pte to be checked 5104 * @ptent: the pte to be checked 5105 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5106 * 5107 * Returns 5108 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5109 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5110 * move charge. if @target is not NULL, the page is stored in target->page 5111 * with extra refcnt got(Callers should handle it). 5112 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5113 * target for charge migration. if @target is not NULL, the entry is stored 5114 * in target->ent. 5115 * 5116 * Called with pte lock held. 
5117 */ 5118 union mc_target { 5119 struct page *page; 5120 swp_entry_t ent; 5121 }; 5122 5123 enum mc_target_type { 5124 MC_TARGET_NONE = 0, 5125 MC_TARGET_PAGE, 5126 MC_TARGET_SWAP, 5127 }; 5128 5129 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5130 unsigned long addr, pte_t ptent) 5131 { 5132 struct page *page = vm_normal_page(vma, addr, ptent); 5133 5134 if (!page || !page_mapped(page)) 5135 return NULL; 5136 if (PageAnon(page)) { 5137 /* we don't move shared anon */ 5138 if (!move_anon() || page_mapcount(page) > 2) 5139 return NULL; 5140 } else if (!move_file()) 5141 /* we ignore mapcount for file pages */ 5142 return NULL; 5143 if (!get_page_unless_zero(page)) 5144 return NULL; 5145 5146 return page; 5147 } 5148 5149 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5150 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5151 { 5152 int usage_count; 5153 struct page *page = NULL; 5154 swp_entry_t ent = pte_to_swp_entry(ptent); 5155 5156 if (!move_anon() || non_swap_entry(ent)) 5157 return NULL; 5158 usage_count = mem_cgroup_count_swap_user(ent, &page); 5159 if (usage_count > 1) { /* we don't move shared anon */ 5160 if (page) 5161 put_page(page); 5162 return NULL; 5163 } 5164 if (do_swap_account) 5165 entry->val = ent.val; 5166 5167 return page; 5168 } 5169 5170 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5171 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5172 { 5173 struct page *page = NULL; 5174 struct inode *inode; 5175 struct address_space *mapping; 5176 pgoff_t pgoff; 5177 5178 if (!vma->vm_file) /* anonymous vma */ 5179 return NULL; 5180 if (!move_file()) 5181 return NULL; 5182 5183 inode = vma->vm_file->f_path.dentry->d_inode; 5184 mapping = vma->vm_file->f_mapping; 5185 if (pte_none(ptent)) 5186 pgoff = linear_page_index(vma, addr); 5187 else /* pte_file(ptent) is true */ 5188 pgoff = pte_to_pgoff(ptent); 5189 5190 /* page is moved even if it's not RSS of this 
task(page-faulted). */ 5191 page = find_get_page(mapping, pgoff); 5192 5193 #ifdef CONFIG_SWAP 5194 /* shmem/tmpfs may report page out on swap: account for that too. */ 5195 if (radix_tree_exceptional_entry(page)) { 5196 swp_entry_t swap = radix_to_swp_entry(page); 5197 if (do_swap_account) 5198 *entry = swap; 5199 page = find_get_page(&swapper_space, swap.val); 5200 } 5201 #endif 5202 return page; 5203 } 5204 5205 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5206 unsigned long addr, pte_t ptent, union mc_target *target) 5207 { 5208 struct page *page = NULL; 5209 struct page_cgroup *pc; 5210 enum mc_target_type ret = MC_TARGET_NONE; 5211 swp_entry_t ent = { .val = 0 }; 5212 5213 if (pte_present(ptent)) 5214 page = mc_handle_present_pte(vma, addr, ptent); 5215 else if (is_swap_pte(ptent)) 5216 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5217 else if (pte_none(ptent) || pte_file(ptent)) 5218 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5219 5220 if (!page && !ent.val) 5221 return ret; 5222 if (page) { 5223 pc = lookup_page_cgroup(page); 5224 /* 5225 * Do only loose check w/o page_cgroup lock. 5226 * mem_cgroup_move_account() checks the pc is valid or not under 5227 * the lock. 5228 */ 5229 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5230 ret = MC_TARGET_PAGE; 5231 if (target) 5232 target->page = page; 5233 } 5234 if (!ret || !target) 5235 put_page(page); 5236 } 5237 /* There is a swap entry and a page doesn't exist or isn't charged */ 5238 if (ent.val && !ret && 5239 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 5240 ret = MC_TARGET_SWAP; 5241 if (target) 5242 target->ent = ent; 5243 } 5244 return ret; 5245 } 5246 5247 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5248 /* 5249 * We don't consider swapping or file mapped pages because THP does not 5250 * support them for now. 5251 * Caller should make sure that pmd_trans_huge(pmd) is true. 
5252 */ 5253 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5254 unsigned long addr, pmd_t pmd, union mc_target *target) 5255 { 5256 struct page *page = NULL; 5257 struct page_cgroup *pc; 5258 enum mc_target_type ret = MC_TARGET_NONE; 5259 5260 page = pmd_page(pmd); 5261 VM_BUG_ON(!page || !PageHead(page)); 5262 if (!move_anon()) 5263 return ret; 5264 pc = lookup_page_cgroup(page); 5265 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5266 ret = MC_TARGET_PAGE; 5267 if (target) { 5268 get_page(page); 5269 target->page = page; 5270 } 5271 } 5272 return ret; 5273 } 5274 #else 5275 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5276 unsigned long addr, pmd_t pmd, union mc_target *target) 5277 { 5278 return MC_TARGET_NONE; 5279 } 5280 #endif 5281 5282 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5283 unsigned long addr, unsigned long end, 5284 struct mm_walk *walk) 5285 { 5286 struct vm_area_struct *vma = walk->private; 5287 pte_t *pte; 5288 spinlock_t *ptl; 5289 5290 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5291 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5292 mc.precharge += HPAGE_PMD_NR; 5293 spin_unlock(&vma->vm_mm->page_table_lock); 5294 return 0; 5295 } 5296 5297 if (pmd_trans_unstable(pmd)) 5298 return 0; 5299 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5300 for (; addr != end; pte++, addr += PAGE_SIZE) 5301 if (get_mctgt_type(vma, addr, *pte, NULL)) 5302 mc.precharge++; /* increment precharge temporarily */ 5303 pte_unmap_unlock(pte - 1, ptl); 5304 cond_resched(); 5305 5306 return 0; 5307 } 5308 5309 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5310 { 5311 unsigned long precharge; 5312 struct vm_area_struct *vma; 5313 5314 down_read(&mm->mmap_sem); 5315 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5316 struct mm_walk mem_cgroup_count_precharge_walk = { 5317 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5318 .mm = 
mm, 5319 .private = vma, 5320 }; 5321 if (is_vm_hugetlb_page(vma)) 5322 continue; 5323 walk_page_range(vma->vm_start, vma->vm_end, 5324 &mem_cgroup_count_precharge_walk); 5325 } 5326 up_read(&mm->mmap_sem); 5327 5328 precharge = mc.precharge; 5329 mc.precharge = 0; 5330 5331 return precharge; 5332 } 5333 5334 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5335 { 5336 unsigned long precharge = mem_cgroup_count_precharge(mm); 5337 5338 VM_BUG_ON(mc.moving_task); 5339 mc.moving_task = current; 5340 return mem_cgroup_do_precharge(precharge); 5341 } 5342 5343 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5344 static void __mem_cgroup_clear_mc(void) 5345 { 5346 struct mem_cgroup *from = mc.from; 5347 struct mem_cgroup *to = mc.to; 5348 5349 /* we must uncharge all the leftover precharges from mc.to */ 5350 if (mc.precharge) { 5351 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5352 mc.precharge = 0; 5353 } 5354 /* 5355 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5356 * we must uncharge here. 5357 */ 5358 if (mc.moved_charge) { 5359 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5360 mc.moved_charge = 0; 5361 } 5362 /* we must fixup refcnts and charges */ 5363 if (mc.moved_swap) { 5364 /* uncharge swap account from the old cgroup */ 5365 if (!mem_cgroup_is_root(mc.from)) 5366 res_counter_uncharge(&mc.from->memsw, 5367 PAGE_SIZE * mc.moved_swap); 5368 __mem_cgroup_put(mc.from, mc.moved_swap); 5369 5370 if (!mem_cgroup_is_root(mc.to)) { 5371 /* 5372 * we charged both to->res and to->memsw, so we should 5373 * uncharge to->res. 
5374 */ 5375 res_counter_uncharge(&mc.to->res, 5376 PAGE_SIZE * mc.moved_swap); 5377 } 5378 /* we've already done mem_cgroup_get(mc.to) */ 5379 mc.moved_swap = 0; 5380 } 5381 memcg_oom_recover(from); 5382 memcg_oom_recover(to); 5383 wake_up_all(&mc.waitq); 5384 } 5385 5386 static void mem_cgroup_clear_mc(void) 5387 { 5388 struct mem_cgroup *from = mc.from; 5389 5390 /* 5391 * we must clear moving_task before waking up waiters at the end of 5392 * task migration. 5393 */ 5394 mc.moving_task = NULL; 5395 __mem_cgroup_clear_mc(); 5396 spin_lock(&mc.lock); 5397 mc.from = NULL; 5398 mc.to = NULL; 5399 spin_unlock(&mc.lock); 5400 mem_cgroup_end_move(from); 5401 } 5402 5403 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5404 struct cgroup_taskset *tset) 5405 { 5406 struct task_struct *p = cgroup_taskset_first(tset); 5407 int ret = 0; 5408 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 5409 5410 if (memcg->move_charge_at_immigrate) { 5411 struct mm_struct *mm; 5412 struct mem_cgroup *from = mem_cgroup_from_task(p); 5413 5414 VM_BUG_ON(from == memcg); 5415 5416 mm = get_task_mm(p); 5417 if (!mm) 5418 return 0; 5419 /* We move charges only when we move a owner of the mm */ 5420 if (mm->owner == p) { 5421 VM_BUG_ON(mc.from); 5422 VM_BUG_ON(mc.to); 5423 VM_BUG_ON(mc.precharge); 5424 VM_BUG_ON(mc.moved_charge); 5425 VM_BUG_ON(mc.moved_swap); 5426 mem_cgroup_start_move(from); 5427 spin_lock(&mc.lock); 5428 mc.from = from; 5429 mc.to = memcg; 5430 spin_unlock(&mc.lock); 5431 /* We set mc.moving_task later */ 5432 5433 ret = mem_cgroup_precharge_mc(mm); 5434 if (ret) 5435 mem_cgroup_clear_mc(); 5436 } 5437 mmput(mm); 5438 } 5439 return ret; 5440 } 5441 5442 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5443 struct cgroup_taskset *tset) 5444 { 5445 mem_cgroup_clear_mc(); 5446 } 5447 5448 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5449 unsigned long addr, unsigned long end, 5450 struct mm_walk *walk) 5451 { 5452 int ret = 0; 5453 struct 
vm_area_struct *vma = walk->private; 5454 pte_t *pte; 5455 spinlock_t *ptl; 5456 enum mc_target_type target_type; 5457 union mc_target target; 5458 struct page *page; 5459 struct page_cgroup *pc; 5460 5461 /* 5462 * We don't take compound_lock() here but no race with splitting thp 5463 * happens because: 5464 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5465 * under splitting, which means there's no concurrent thp split, 5466 * - if another thread runs into split_huge_page() just after we 5467 * entered this if-block, the thread must wait for page table lock 5468 * to be unlocked in __split_huge_page_splitting(), where the main 5469 * part of thp split is not executed yet. 5470 */ 5471 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5472 if (mc.precharge < HPAGE_PMD_NR) { 5473 spin_unlock(&vma->vm_mm->page_table_lock); 5474 return 0; 5475 } 5476 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5477 if (target_type == MC_TARGET_PAGE) { 5478 page = target.page; 5479 if (!isolate_lru_page(page)) { 5480 pc = lookup_page_cgroup(page); 5481 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5482 pc, mc.from, mc.to, 5483 false)) { 5484 mc.precharge -= HPAGE_PMD_NR; 5485 mc.moved_charge += HPAGE_PMD_NR; 5486 } 5487 putback_lru_page(page); 5488 } 5489 put_page(page); 5490 } 5491 spin_unlock(&vma->vm_mm->page_table_lock); 5492 return 0; 5493 } 5494 5495 if (pmd_trans_unstable(pmd)) 5496 return 0; 5497 retry: 5498 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5499 for (; addr != end; addr += PAGE_SIZE) { 5500 pte_t ptent = *(pte++); 5501 swp_entry_t ent; 5502 5503 if (!mc.precharge) 5504 break; 5505 5506 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5507 case MC_TARGET_PAGE: 5508 page = target.page; 5509 if (isolate_lru_page(page)) 5510 goto put; 5511 pc = lookup_page_cgroup(page); 5512 if (!mem_cgroup_move_account(page, 1, pc, 5513 mc.from, mc.to, false)) { 5514 mc.precharge--; 5515 /* we uncharge from mc.from later. 
*/ 5516 mc.moved_charge++; 5517 } 5518 putback_lru_page(page); 5519 put: /* get_mctgt_type() gets the page */ 5520 put_page(page); 5521 break; 5522 case MC_TARGET_SWAP: 5523 ent = target.ent; 5524 if (!mem_cgroup_move_swap_account(ent, 5525 mc.from, mc.to, false)) { 5526 mc.precharge--; 5527 /* we fixup refcnts and charges later. */ 5528 mc.moved_swap++; 5529 } 5530 break; 5531 default: 5532 break; 5533 } 5534 } 5535 pte_unmap_unlock(pte - 1, ptl); 5536 cond_resched(); 5537 5538 if (addr != end) { 5539 /* 5540 * We have consumed all precharges we got in can_attach(). 5541 * We try charge one by one, but don't do any additional 5542 * charges to mc.to if we have failed in charge once in attach() 5543 * phase. 5544 */ 5545 ret = mem_cgroup_do_precharge(1); 5546 if (!ret) 5547 goto retry; 5548 } 5549 5550 return ret; 5551 } 5552 5553 static void mem_cgroup_move_charge(struct mm_struct *mm) 5554 { 5555 struct vm_area_struct *vma; 5556 5557 lru_add_drain_all(); 5558 retry: 5559 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5560 /* 5561 * Someone who are holding the mmap_sem might be waiting in 5562 * waitq. So we cancel all extra charges, wake up all waiters, 5563 * and retry. Because we cancel precharges, we might not be able 5564 * to move enough charges, but moving charge is a best-effort 5565 * feature anyway, so it wouldn't be a big problem. 5566 */ 5567 __mem_cgroup_clear_mc(); 5568 cond_resched(); 5569 goto retry; 5570 } 5571 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5572 int ret; 5573 struct mm_walk mem_cgroup_move_charge_walk = { 5574 .pmd_entry = mem_cgroup_move_charge_pte_range, 5575 .mm = mm, 5576 .private = vma, 5577 }; 5578 if (is_vm_hugetlb_page(vma)) 5579 continue; 5580 ret = walk_page_range(vma->vm_start, vma->vm_end, 5581 &mem_cgroup_move_charge_walk); 5582 if (ret) 5583 /* 5584 * means we have consumed all precharges and failed in 5585 * doing additional charge. Just abandon here. 
5586 */ 5587 break; 5588 } 5589 up_read(&mm->mmap_sem); 5590 } 5591 5592 static void mem_cgroup_move_task(struct cgroup *cont, 5593 struct cgroup_taskset *tset) 5594 { 5595 struct task_struct *p = cgroup_taskset_first(tset); 5596 struct mm_struct *mm = get_task_mm(p); 5597 5598 if (mm) { 5599 if (mc.to) 5600 mem_cgroup_move_charge(mm); 5601 put_swap_token(mm); 5602 mmput(mm); 5603 } 5604 if (mc.to) 5605 mem_cgroup_clear_mc(); 5606 } 5607 #else /* !CONFIG_MMU */ 5608 static int mem_cgroup_can_attach(struct cgroup *cgroup, 5609 struct cgroup_taskset *tset) 5610 { 5611 return 0; 5612 } 5613 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 5614 struct cgroup_taskset *tset) 5615 { 5616 } 5617 static void mem_cgroup_move_task(struct cgroup *cont, 5618 struct cgroup_taskset *tset) 5619 { 5620 } 5621 #endif 5622 5623 struct cgroup_subsys mem_cgroup_subsys = { 5624 .name = "memory", 5625 .subsys_id = mem_cgroup_subsys_id, 5626 .create = mem_cgroup_create, 5627 .pre_destroy = mem_cgroup_pre_destroy, 5628 .destroy = mem_cgroup_destroy, 5629 .can_attach = mem_cgroup_can_attach, 5630 .cancel_attach = mem_cgroup_cancel_attach, 5631 .attach = mem_cgroup_move_task, 5632 .base_cftypes = mem_cgroup_files, 5633 .early_init = 0, 5634 .use_id = 1, 5635 .__DEPRECATED_clear_css_refs = true, 5636 }; 5637 5638 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5639 static int __init enable_swap_account(char *s) 5640 { 5641 /* consider enabled if no parameter or 1 is given */ 5642 if (!strcmp(s, "1")) 5643 really_do_swap_account = 1; 5644 else if (!strcmp(s, "0")) 5645 really_do_swap_account = 0; 5646 return 1; 5647 } 5648 __setup("swapaccount=", enable_swap_account); 5649 5650 #endif 5651