1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * This program is free software; you can redistribute it and/or modify 18 * it under the terms of the GNU General Public License as published by 19 * the Free Software Foundation; either version 2 of the License, or 20 * (at your option) any later version. 21 * 22 * This program is distributed in the hope that it will be useful, 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 * GNU General Public License for more details. 26 */ 27 28 #include <linux/res_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/mm.h> 32 #include <linux/hugetlb.h> 33 #include <linux/pagemap.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmalloc.h> 53 #include <linux/vmpressure.h> 54 #include <linux/mm_inline.h> 55 #include <linux/page_cgroup.h> 56 #include <linux/cpu.h> 57 #include <linux/oom.h> 58 #include <linux/lockdep.h> 59 #include <linux/file.h> 60 #include "internal.h" 61 #include <net/sock.h> 62 #include <net/ip.h> 63 #include <net/tcp_memcontrol.h> 64 65 #include <asm/uaccess.h> 66 67 #include <trace/events/vmscan.h> 68 69 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 70 EXPORT_SYMBOL(mem_cgroup_subsys); 71 72 #define MEM_CGROUP_RECLAIM_RETRIES 5 73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 74 75 #ifdef CONFIG_MEMCG_SWAP 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 77 int do_swap_account __read_mostly; 78 79 /* for remember boot option*/ 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 81 static int really_do_swap_account __initdata = 1; 82 #else 83 static int really_do_swap_account __initdata = 0; 84 #endif 85 86 #else 87 #define do_swap_account 0 88 #endif 89 90 91 static const char * const mem_cgroup_stat_names[] = { 92 "cache", 93 "rss", 94 "rss_huge", 95 "mapped_file", 96 "writeback", 97 "swap", 98 }; 99 100 enum mem_cgroup_events_index { 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_NSTATS, 106 }; 107 108 static const char * const mem_cgroup_events_names[] = { 109 "pgpgin", 110 "pgpgout", 111 "pgfault", 112 "pgmajfault", 113 }; 114 115 static const char * const mem_cgroup_lru_names[] = { 116 "inactive_anon", 117 "active_anon", 118 "inactive_file", 119 "active_file", 120 "unevictable", 121 }; 122 123 /* 124 * Per memcg event counter is incremented at every pagein/pageout. 
With THP, 125 * it will be incremated by the number of pages. This counter is used for 126 * for trigger some periodic events. This is straightforward and better 127 * than using jiffies etc. to handle periodic memcg event. 128 */ 129 enum mem_cgroup_events_target { 130 MEM_CGROUP_TARGET_THRESH, 131 MEM_CGROUP_TARGET_SOFTLIMIT, 132 MEM_CGROUP_TARGET_NUMAINFO, 133 MEM_CGROUP_NTARGETS, 134 }; 135 #define THRESHOLDS_EVENTS_TARGET 128 136 #define SOFTLIMIT_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024 138 139 struct mem_cgroup_stat_cpu { 140 long count[MEM_CGROUP_STAT_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 142 unsigned long nr_page_events; 143 unsigned long targets[MEM_CGROUP_NTARGETS]; 144 }; 145 146 struct mem_cgroup_reclaim_iter { 147 /* 148 * last scanned hierarchy member. Valid only if last_dead_count 149 * matches memcg->dead_count of the hierarchy root group. 150 */ 151 struct mem_cgroup *last_visited; 152 unsigned long last_dead_count; 153 154 /* scan generation, increased every round-trip */ 155 unsigned int generation; 156 }; 157 158 /* 159 * per-zone information in memory controller. 160 */ 161 struct mem_cgroup_per_zone { 162 struct lruvec lruvec; 163 unsigned long lru_size[NR_LRU_LISTS]; 164 165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 166 167 struct rb_node tree_node; /* RB tree node */ 168 unsigned long long usage_in_excess;/* Set to the value by which */ 169 /* the soft limit is exceeded*/ 170 bool on_tree; 171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 172 /* use container_of */ 173 }; 174 175 struct mem_cgroup_per_node { 176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 177 }; 178 179 /* 180 * Cgroups above their limits are maintained in a RB-Tree, independent of 181 * their hierarchy representation 182 */ 183 184 struct mem_cgroup_tree_per_zone { 185 struct rb_root rb_root; 186 spinlock_t lock; 187 }; 188 189 struct mem_cgroup_tree_per_node { 190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 191 }; 192 193 struct mem_cgroup_tree { 194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 195 }; 196 197 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 198 199 struct mem_cgroup_threshold { 200 struct eventfd_ctx *eventfd; 201 u64 threshold; 202 }; 203 204 /* For threshold */ 205 struct mem_cgroup_threshold_ary { 206 /* An array index points to threshold just below or equal to usage. */ 207 int current_threshold; 208 /* Size of entries[] */ 209 unsigned int size; 210 /* Array of thresholds */ 211 struct mem_cgroup_threshold entries[0]; 212 }; 213 214 struct mem_cgroup_thresholds { 215 /* Primary thresholds array */ 216 struct mem_cgroup_threshold_ary *primary; 217 /* 218 * Spare threshold array. 219 * This is needed to make mem_cgroup_unregister_event() "never fail". 220 * It must be able to store at least primary->size - 1 entries. 221 */ 222 struct mem_cgroup_threshold_ary *spare; 223 }; 224 225 /* for OOM */ 226 struct mem_cgroup_eventfd_list { 227 struct list_head list; 228 struct eventfd_ctx *eventfd; 229 }; 230 231 /* 232 * cgroup_event represents events which userspace want to receive. 233 */ 234 struct cgroup_event { 235 /* 236 * css which the event belongs to. 237 */ 238 struct cgroup_subsys_state *css; 239 /* 240 * Control file which the event associated. 241 */ 242 struct cftype *cft; 243 /* 244 * eventfd to signal userspace about the event. 245 */ 246 struct eventfd_ctx *eventfd; 247 /* 248 * Each of these stored in a list by the cgroup. 
249 */ 250 struct list_head list; 251 /* 252 * All fields below needed to unregister event when 253 * userspace closes eventfd. 254 */ 255 poll_table pt; 256 wait_queue_head_t *wqh; 257 wait_queue_t wait; 258 struct work_struct remove; 259 }; 260 261 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 262 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 263 264 /* 265 * The memory controller data structure. The memory controller controls both 266 * page cache and RSS per cgroup. We would eventually like to provide 267 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 268 * to help the administrator determine what knobs to tune. 269 * 270 * TODO: Add a water mark for the memory controller. Reclaim will begin when 271 * we hit the water mark. May be even add a low water mark, such that 272 * no reclaim occurs from a cgroup at it's low water mark, this is 273 * a feature that will be implemented much later in the future. 274 */ 275 struct mem_cgroup { 276 struct cgroup_subsys_state css; 277 /* 278 * the counter to account for memory usage 279 */ 280 struct res_counter res; 281 282 /* vmpressure notifications */ 283 struct vmpressure vmpressure; 284 285 /* 286 * the counter to account for mem+swap usage. 287 */ 288 struct res_counter memsw; 289 290 /* 291 * the counter to account for kernel memory usage. 292 */ 293 struct res_counter kmem; 294 /* 295 * Should the accounting and control be hierarchical, per subtree? 296 */ 297 bool use_hierarchy; 298 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 299 300 bool oom_lock; 301 atomic_t under_oom; 302 atomic_t oom_wakeups; 303 304 int swappiness; 305 /* OOM-Killer disable */ 306 int oom_kill_disable; 307 308 /* set when res.limit == memsw.limit */ 309 bool memsw_is_minimum; 310 311 /* protect arrays of thresholds */ 312 struct mutex thresholds_lock; 313 314 /* thresholds for memory usage. RCU-protected */ 315 struct mem_cgroup_thresholds thresholds; 316 317 /* thresholds for mem+swap usage. RCU-protected */ 318 struct mem_cgroup_thresholds memsw_thresholds; 319 320 /* For oom notifier event fd */ 321 struct list_head oom_notify; 322 323 /* 324 * Should we move charges of a task when a task is moved into this 325 * mem_cgroup ? And what type of charges should we move ? 326 */ 327 unsigned long move_charge_at_immigrate; 328 /* 329 * set > 0 if pages under this cgroup are moving to other cgroup. 330 */ 331 atomic_t moving_account; 332 /* taken only while moving_account > 0 */ 333 spinlock_t move_lock; 334 /* 335 * percpu counter. 336 */ 337 struct mem_cgroup_stat_cpu __percpu *stat; 338 /* 339 * used when a cpu is offlined or other synchronizations 340 * See mem_cgroup_read_stat(). 341 */ 342 struct mem_cgroup_stat_cpu nocpu_base; 343 spinlock_t pcp_counter_lock; 344 345 atomic_t dead_count; 346 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 347 struct tcp_memcontrol tcp_mem; 348 #endif 349 #if defined(CONFIG_MEMCG_KMEM) 350 /* analogous to slab_common's slab_caches list. 
per-memcg */ 351 struct list_head memcg_slab_caches; 352 /* Not a spinlock, we can take a lot of time walking the list */ 353 struct mutex slab_caches_mutex; 354 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 355 int kmemcg_id; 356 #endif 357 358 int last_scanned_node; 359 #if MAX_NUMNODES > 1 360 nodemask_t scan_nodes; 361 atomic_t numainfo_events; 362 atomic_t numainfo_updating; 363 #endif 364 365 struct mem_cgroup_per_node *nodeinfo[0]; 366 /* WARNING: nodeinfo must be the last member here */ 367 }; 368 369 static size_t memcg_size(void) 370 { 371 return sizeof(struct mem_cgroup) + 372 nr_node_ids * sizeof(struct mem_cgroup_per_node); 373 } 374 375 /* internal only representation about the status of kmem accounting. */ 376 enum { 377 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 378 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ 379 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 380 }; 381 382 /* We account when limit is on, but only after call sites are patched */ 383 #define KMEM_ACCOUNTED_MASK \ 384 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) 385 386 #ifdef CONFIG_MEMCG_KMEM 387 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 388 { 389 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 390 } 391 392 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 393 { 394 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 395 } 396 397 static void memcg_kmem_set_activated(struct mem_cgroup *memcg) 398 { 399 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); 400 } 401 402 static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) 403 { 404 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); 405 } 406 407 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 408 { 409 /* 410 * Our caller must use css_get() first, because memcg_uncharge_kmem() 411 * will call css_put() if it sees the memcg is dead. 412 */ 413 smp_wmb(); 414 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 415 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 416 } 417 418 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 419 { 420 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 421 &memcg->kmem_account_flags); 422 } 423 #endif 424 425 /* Stuffs for move charges at task migration. */ 426 /* 427 * Types of charges to be moved. "move_charge_at_immitgrate" and 428 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 
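 *
 * For example, given the bit positions in 'enum move_type' below, userspace
 * selects what gets moved by writing a bitmask to the cgroup's
 * memory.move_charge_at_immigrate file:
 *
 *   echo 1 > memory.move_charge_at_immigrate   (bit 0: anon pages + their swap)
 *   echo 2 > memory.move_charge_at_immigrate   (bit 1: file/tmpfs pages)
 *   echo 3 > memory.move_charge_at_immigrate   (both)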
429 */ 430 enum move_type { 431 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 432 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 433 NR_MOVE_TYPE, 434 }; 435 436 /* "mc" and its members are protected by cgroup_mutex */ 437 static struct move_charge_struct { 438 spinlock_t lock; /* for from, to */ 439 struct mem_cgroup *from; 440 struct mem_cgroup *to; 441 unsigned long immigrate_flags; 442 unsigned long precharge; 443 unsigned long moved_charge; 444 unsigned long moved_swap; 445 struct task_struct *moving_task; /* a task moving charges */ 446 wait_queue_head_t waitq; /* a waitq for other context */ 447 } mc = { 448 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 449 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 450 }; 451 452 static bool move_anon(void) 453 { 454 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 455 } 456 457 static bool move_file(void) 458 { 459 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 460 } 461 462 /* 463 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 464 * limit reclaim to prevent infinite loops, if they ever occur. 465 */ 466 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 467 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 468 469 enum charge_type { 470 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 471 MEM_CGROUP_CHARGE_TYPE_ANON, 472 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 473 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 474 NR_CHARGE_TYPE, 475 }; 476 477 /* for encoding cft->private value on file */ 478 enum res_type { 479 _MEM, 480 _MEMSWAP, 481 _OOM_TYPE, 482 _KMEM, 483 }; 484 485 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 486 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 487 #define MEMFILE_ATTR(val) ((val) & 0xffff) 488 /* Used for OOM nofiier */ 489 #define OOM_CONTROL (0) 490 491 /* 492 * Reclaim flags for mem_cgroup_hierarchical_reclaim 493 */ 494 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 495 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 496 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 497 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 498 499 /* 500 * The memcg_create_mutex will be held whenever a new cgroup is created. 501 * As a consequence, any change that needs to protect against new child cgroups 502 * appearing has to hold it as well. 503 */ 504 static DEFINE_MUTEX(memcg_create_mutex); 505 506 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 507 { 508 return s ? container_of(s, struct mem_cgroup, css) : NULL; 509 } 510 511 /* Some nice accessors for the vmpressure. 
*/ 512 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 513 { 514 if (!memcg) 515 memcg = root_mem_cgroup; 516 return &memcg->vmpressure; 517 } 518 519 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 520 { 521 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 522 } 523 524 struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) 525 { 526 return &mem_cgroup_from_css(css)->vmpressure; 527 } 528 529 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 530 { 531 return (memcg == root_mem_cgroup); 532 } 533 534 /* Writing them here to avoid exposing memcg's inner layout */ 535 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 536 537 void sock_update_memcg(struct sock *sk) 538 { 539 if (mem_cgroup_sockets_enabled) { 540 struct mem_cgroup *memcg; 541 struct cg_proto *cg_proto; 542 543 BUG_ON(!sk->sk_prot->proto_cgroup); 544 545 /* Socket cloning can throw us here with sk_cgrp already 546 * filled. It won't however, necessarily happen from 547 * process context. So the test for root memcg given 548 * the current task's memcg won't help us in this case. 549 * 550 * Respecting the original socket's memcg is a better 551 * decision in this case. 552 */ 553 if (sk->sk_cgrp) { 554 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 555 css_get(&sk->sk_cgrp->memcg->css); 556 return; 557 } 558 559 rcu_read_lock(); 560 memcg = mem_cgroup_from_task(current); 561 cg_proto = sk->sk_prot->proto_cgroup(memcg); 562 if (!mem_cgroup_is_root(memcg) && 563 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { 564 sk->sk_cgrp = cg_proto; 565 } 566 rcu_read_unlock(); 567 } 568 } 569 EXPORT_SYMBOL(sock_update_memcg); 570 571 void sock_release_memcg(struct sock *sk) 572 { 573 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 574 struct mem_cgroup *memcg; 575 WARN_ON(!sk->sk_cgrp->memcg); 576 memcg = sk->sk_cgrp->memcg; 577 css_put(&sk->sk_cgrp->memcg->css); 578 } 579 } 580 581 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 582 { 583 if (!memcg || mem_cgroup_is_root(memcg)) 584 return NULL; 585 586 return &memcg->tcp_mem.cg_proto; 587 } 588 EXPORT_SYMBOL(tcp_proto_cgroup); 589 590 static void disarm_sock_keys(struct mem_cgroup *memcg) 591 { 592 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 593 return; 594 static_key_slow_dec(&memcg_socket_limit_enabled); 595 } 596 #else 597 static void disarm_sock_keys(struct mem_cgroup *memcg) 598 { 599 } 600 #endif 601 602 #ifdef CONFIG_MEMCG_KMEM 603 /* 604 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 605 * There are two main reasons for not using the css_id for this: 606 * 1) this works better in sparse environments, where we have a lot of memcgs, 607 * but only a few kmem-limited. Or also, if we have, for instance, 200 608 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 609 * 200 entry array for that. 610 * 611 * 2) In order not to violate the cgroup API, we would like to do all memory 612 * allocation in ->create(). At that point, we haven't yet allocated the 613 * css_id. Having a separate index prevents us from messing with the cgroup 614 * core for this 615 * 616 * The current size of the caches array is stored in 617 * memcg_limited_groups_array_size. It will double each time we have to 618 * increase it. 
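 *
 * As a rough illustration: starting from MEMCG_CACHES_MIN_SIZE (4) entries,
 * making a fifth group kmem-limited would grow the array to 8 entries, then
 * 16, and so on, up to MEMCG_CACHES_MAX_SIZE.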
619 */ 620 static DEFINE_IDA(kmem_limited_groups); 621 int memcg_limited_groups_array_size; 622 623 /* 624 * MIN_SIZE is different than 1, because we would like to avoid going through 625 * the alloc/free process all the time. In a small machine, 4 kmem-limited 626 * cgroups is a reasonable guess. In the future, it could be a parameter or 627 * tunable, but that is strictly not necessary. 628 * 629 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get 630 * this constant directly from cgroup, but it is understandable that this is 631 * better kept as an internal representation in cgroup.c. In any case, the 632 * css_id space is not getting any smaller, and we don't have to necessarily 633 * increase ours as well if it increases. 634 */ 635 #define MEMCG_CACHES_MIN_SIZE 4 636 #define MEMCG_CACHES_MAX_SIZE 65535 637 638 /* 639 * A lot of the calls to the cache allocation functions are expected to be 640 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 641 * conditional to this static branch, we'll have to allow modules that does 642 * kmem_cache_alloc and the such to see this symbol as well 643 */ 644 struct static_key memcg_kmem_enabled_key; 645 EXPORT_SYMBOL(memcg_kmem_enabled_key); 646 647 static void disarm_kmem_keys(struct mem_cgroup *memcg) 648 { 649 if (memcg_kmem_is_active(memcg)) { 650 static_key_slow_dec(&memcg_kmem_enabled_key); 651 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 652 } 653 /* 654 * This check can't live in kmem destruction function, 655 * since the charges will outlive the cgroup 656 */ 657 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 658 } 659 #else 660 static void disarm_kmem_keys(struct mem_cgroup *memcg) 661 { 662 } 663 #endif /* CONFIG_MEMCG_KMEM */ 664 665 static void disarm_static_keys(struct mem_cgroup *memcg) 666 { 667 disarm_sock_keys(memcg); 668 disarm_kmem_keys(memcg); 669 } 670 671 static void drain_all_stock_async(struct mem_cgroup *memcg); 672 673 static struct mem_cgroup_per_zone * 674 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 675 { 676 VM_BUG_ON((unsigned)nid >= nr_node_ids); 677 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 678 } 679 680 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 681 { 682 return &memcg->css; 683 } 684 685 static struct mem_cgroup_per_zone * 686 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 687 { 688 int nid = page_to_nid(page); 689 int zid = page_zonenum(page); 690 691 return mem_cgroup_zoneinfo(memcg, nid, zid); 692 } 693 694 static struct mem_cgroup_tree_per_zone * 695 soft_limit_tree_node_zone(int nid, int zid) 696 { 697 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 698 } 699 700 static struct mem_cgroup_tree_per_zone * 701 soft_limit_tree_from_page(struct page *page) 702 { 703 int nid = page_to_nid(page); 704 int zid = page_zonenum(page); 705 706 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 707 } 708 709 static void 710 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 711 struct mem_cgroup_per_zone *mz, 712 struct mem_cgroup_tree_per_zone *mctz, 713 unsigned long long new_usage_in_excess) 714 { 715 struct rb_node **p = &mctz->rb_root.rb_node; 716 struct rb_node *parent = NULL; 717 struct mem_cgroup_per_zone *mz_node; 718 719 if (mz->on_tree) 720 return; 721 722 mz->usage_in_excess = new_usage_in_excess; 723 if (!mz->usage_in_excess) 724 return; 725 while (*p) { 726 parent = *p; 727 mz_node = rb_entry(parent, struct 
mem_cgroup_per_zone, 728 tree_node); 729 if (mz->usage_in_excess < mz_node->usage_in_excess) 730 p = &(*p)->rb_left; 731 /* 732 * We can't avoid mem cgroups that are over their soft 733 * limit by the same amount 734 */ 735 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 736 p = &(*p)->rb_right; 737 } 738 rb_link_node(&mz->tree_node, parent, p); 739 rb_insert_color(&mz->tree_node, &mctz->rb_root); 740 mz->on_tree = true; 741 } 742 743 static void 744 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 745 struct mem_cgroup_per_zone *mz, 746 struct mem_cgroup_tree_per_zone *mctz) 747 { 748 if (!mz->on_tree) 749 return; 750 rb_erase(&mz->tree_node, &mctz->rb_root); 751 mz->on_tree = false; 752 } 753 754 static void 755 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 756 struct mem_cgroup_per_zone *mz, 757 struct mem_cgroup_tree_per_zone *mctz) 758 { 759 spin_lock(&mctz->lock); 760 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 761 spin_unlock(&mctz->lock); 762 } 763 764 765 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 766 { 767 unsigned long long excess; 768 struct mem_cgroup_per_zone *mz; 769 struct mem_cgroup_tree_per_zone *mctz; 770 int nid = page_to_nid(page); 771 int zid = page_zonenum(page); 772 mctz = soft_limit_tree_from_page(page); 773 774 /* 775 * Necessary to update all ancestors when hierarchy is used. 776 * because their event counter is not touched. 777 */ 778 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 779 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 780 excess = res_counter_soft_limit_excess(&memcg->res); 781 /* 782 * We have to update the tree if mz is on RB-tree or 783 * mem is over its softlimit. 784 */ 785 if (excess || mz->on_tree) { 786 spin_lock(&mctz->lock); 787 /* if on-tree, remove it */ 788 if (mz->on_tree) 789 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 790 /* 791 * Insert again. mz->usage_in_excess will be updated. 792 * If excess is 0, no tree ops. 793 */ 794 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 795 spin_unlock(&mctz->lock); 796 } 797 } 798 } 799 800 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 801 { 802 int node, zone; 803 struct mem_cgroup_per_zone *mz; 804 struct mem_cgroup_tree_per_zone *mctz; 805 806 for_each_node(node) { 807 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 808 mz = mem_cgroup_zoneinfo(memcg, node, zone); 809 mctz = soft_limit_tree_node_zone(node, zone); 810 mem_cgroup_remove_exceeded(memcg, mz, mctz); 811 } 812 } 813 } 814 815 static struct mem_cgroup_per_zone * 816 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 817 { 818 struct rb_node *rightmost = NULL; 819 struct mem_cgroup_per_zone *mz; 820 821 retry: 822 mz = NULL; 823 rightmost = rb_last(&mctz->rb_root); 824 if (!rightmost) 825 goto done; /* Nothing to reclaim from */ 826 827 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 828 /* 829 * Remove the node now but someone else can add it back, 830 * we will to add it back at the end of reclaim to its correct 831 * position in the tree. 
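	 * The tree is ordered by usage_in_excess, so rb_last() above returned
	 * the group that currently exceeds its soft limit by the largest
	 * amount, i.e. the most attractive reclaim target.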
832 */ 833 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 834 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 835 !css_tryget(&mz->memcg->css)) 836 goto retry; 837 done: 838 return mz; 839 } 840 841 static struct mem_cgroup_per_zone * 842 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 843 { 844 struct mem_cgroup_per_zone *mz; 845 846 spin_lock(&mctz->lock); 847 mz = __mem_cgroup_largest_soft_limit_node(mctz); 848 spin_unlock(&mctz->lock); 849 return mz; 850 } 851 852 /* 853 * Implementation Note: reading percpu statistics for memcg. 854 * 855 * Both of vmstat[] and percpu_counter has threshold and do periodic 856 * synchronization to implement "quick" read. There are trade-off between 857 * reading cost and precision of value. Then, we may have a chance to implement 858 * a periodic synchronizion of counter in memcg's counter. 859 * 860 * But this _read() function is used for user interface now. The user accounts 861 * memory usage by memory cgroup and he _always_ requires exact value because 862 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 863 * have to visit all online cpus and make sum. So, for now, unnecessary 864 * synchronization is not implemented. (just implemented for cpu hotplug) 865 * 866 * If there are kernel internal actions which can make use of some not-exact 867 * value, and reading all cpu value can be performance bottleneck in some 868 * common workload, threashold and synchonization as vmstat[] should be 869 * implemented. 870 */ 871 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 872 enum mem_cgroup_stat_index idx) 873 { 874 long val = 0; 875 int cpu; 876 877 get_online_cpus(); 878 for_each_online_cpu(cpu) 879 val += per_cpu(memcg->stat->count[idx], cpu); 880 #ifdef CONFIG_HOTPLUG_CPU 881 spin_lock(&memcg->pcp_counter_lock); 882 val += memcg->nocpu_base.count[idx]; 883 spin_unlock(&memcg->pcp_counter_lock); 884 #endif 885 put_online_cpus(); 886 return val; 887 } 888 889 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 890 bool charge) 891 { 892 int val = (charge) ? 1 : -1; 893 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 894 } 895 896 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 897 enum mem_cgroup_events_index idx) 898 { 899 unsigned long val = 0; 900 int cpu; 901 902 get_online_cpus(); 903 for_each_online_cpu(cpu) 904 val += per_cpu(memcg->stat->events[idx], cpu); 905 #ifdef CONFIG_HOTPLUG_CPU 906 spin_lock(&memcg->pcp_counter_lock); 907 val += memcg->nocpu_base.events[idx]; 908 spin_unlock(&memcg->pcp_counter_lock); 909 #endif 910 put_online_cpus(); 911 return val; 912 } 913 914 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 915 struct page *page, 916 bool anon, int nr_pages) 917 { 918 preempt_disable(); 919 920 /* 921 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 922 * counted as CACHE even if it's on ANON LRU. 923 */ 924 if (anon) 925 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 926 nr_pages); 927 else 928 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 929 nr_pages); 930 931 if (PageTransHuge(page)) 932 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 933 nr_pages); 934 935 /* pagein of a big page is an event. 
So, ignore page size */ 936 if (nr_pages > 0) 937 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 938 else { 939 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 940 nr_pages = -nr_pages; /* for event */ 941 } 942 943 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 944 945 preempt_enable(); 946 } 947 948 unsigned long 949 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 950 { 951 struct mem_cgroup_per_zone *mz; 952 953 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 954 return mz->lru_size[lru]; 955 } 956 957 static unsigned long 958 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 959 unsigned int lru_mask) 960 { 961 struct mem_cgroup_per_zone *mz; 962 enum lru_list lru; 963 unsigned long ret = 0; 964 965 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 966 967 for_each_lru(lru) { 968 if (BIT(lru) & lru_mask) 969 ret += mz->lru_size[lru]; 970 } 971 return ret; 972 } 973 974 static unsigned long 975 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 976 int nid, unsigned int lru_mask) 977 { 978 u64 total = 0; 979 int zid; 980 981 for (zid = 0; zid < MAX_NR_ZONES; zid++) 982 total += mem_cgroup_zone_nr_lru_pages(memcg, 983 nid, zid, lru_mask); 984 985 return total; 986 } 987 988 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 989 unsigned int lru_mask) 990 { 991 int nid; 992 u64 total = 0; 993 994 for_each_node_state(nid, N_MEMORY) 995 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 996 return total; 997 } 998 999 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 1000 enum mem_cgroup_events_target target) 1001 { 1002 unsigned long val, next; 1003 1004 val = __this_cpu_read(memcg->stat->nr_page_events); 1005 next = __this_cpu_read(memcg->stat->targets[target]); 1006 /* from time_after() in jiffies.h */ 1007 if ((long)next - (long)val < 0) { 1008 switch (target) { 1009 case MEM_CGROUP_TARGET_THRESH: 1010 next = val + THRESHOLDS_EVENTS_TARGET; 1011 break; 1012 case MEM_CGROUP_TARGET_SOFTLIMIT: 1013 next = val + SOFTLIMIT_EVENTS_TARGET; 1014 break; 1015 case MEM_CGROUP_TARGET_NUMAINFO: 1016 next = val + NUMAINFO_EVENTS_TARGET; 1017 break; 1018 default: 1019 break; 1020 } 1021 __this_cpu_write(memcg->stat->targets[target], next); 1022 return true; 1023 } 1024 return false; 1025 } 1026 1027 /* 1028 * Check events in order. 1029 * 1030 */ 1031 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1032 { 1033 preempt_disable(); 1034 /* threshold event is triggered in finer grain than soft limit */ 1035 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1036 MEM_CGROUP_TARGET_THRESH))) { 1037 bool do_softlimit; 1038 bool do_numainfo __maybe_unused; 1039 1040 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1041 MEM_CGROUP_TARGET_SOFTLIMIT); 1042 #if MAX_NUMNODES > 1 1043 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1044 MEM_CGROUP_TARGET_NUMAINFO); 1045 #endif 1046 preempt_enable(); 1047 1048 mem_cgroup_threshold(memcg); 1049 if (unlikely(do_softlimit)) 1050 mem_cgroup_update_tree(memcg, page); 1051 #if MAX_NUMNODES > 1 1052 if (unlikely(do_numainfo)) 1053 atomic_inc(&memcg->numainfo_events); 1054 #endif 1055 } else 1056 preempt_enable(); 1057 } 1058 1059 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1060 { 1061 /* 1062 * mm_update_next_owner() may clear mm->owner to NULL 1063 * if it races with swapoff, page migration, etc. 1064 * So this can be called with p == NULL. 
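	 * Callers are also expected to keep the task's css stable, typically
	 * by holding rcu_read_lock() around the task_css() lookup below.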
1065 */ 1066 if (unlikely(!p)) 1067 return NULL; 1068 1069 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); 1070 } 1071 1072 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1073 { 1074 struct mem_cgroup *memcg = NULL; 1075 1076 if (!mm) 1077 return NULL; 1078 /* 1079 * Because we have no locks, mm->owner's may be being moved to other 1080 * cgroup. We use css_tryget() here even if this looks 1081 * pessimistic (rather than adding locks here). 1082 */ 1083 rcu_read_lock(); 1084 do { 1085 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1086 if (unlikely(!memcg)) 1087 break; 1088 } while (!css_tryget(&memcg->css)); 1089 rcu_read_unlock(); 1090 return memcg; 1091 } 1092 1093 /* 1094 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1095 * ref. count) or NULL if the whole root's subtree has been visited. 1096 * 1097 * helper function to be used by mem_cgroup_iter 1098 */ 1099 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1100 struct mem_cgroup *last_visited) 1101 { 1102 struct cgroup_subsys_state *prev_css, *next_css; 1103 1104 prev_css = last_visited ? &last_visited->css : NULL; 1105 skip_node: 1106 next_css = css_next_descendant_pre(prev_css, &root->css); 1107 1108 /* 1109 * Even if we found a group we have to make sure it is 1110 * alive. css && !memcg means that the groups should be 1111 * skipped and we should continue the tree walk. 1112 * last_visited css is safe to use because it is 1113 * protected by css_get and the tree walk is rcu safe. 1114 */ 1115 if (next_css) { 1116 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1117 1118 if (css_tryget(&mem->css)) 1119 return mem; 1120 else { 1121 prev_css = next_css; 1122 goto skip_node; 1123 } 1124 } 1125 1126 return NULL; 1127 } 1128 1129 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) 1130 { 1131 /* 1132 * When a group in the hierarchy below root is destroyed, the 1133 * hierarchy iterator can no longer be trusted since it might 1134 * have pointed to the destroyed group. Invalidate it. 1135 */ 1136 atomic_inc(&root->dead_count); 1137 } 1138 1139 static struct mem_cgroup * 1140 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, 1141 struct mem_cgroup *root, 1142 int *sequence) 1143 { 1144 struct mem_cgroup *position = NULL; 1145 /* 1146 * A cgroup destruction happens in two stages: offlining and 1147 * release. They are separated by a RCU grace period. 1148 * 1149 * If the iterator is valid, we may still race with an 1150 * offlining. The RCU lock ensures the object won't be 1151 * released, tryget will fail if we lost the race. 1152 */ 1153 *sequence = atomic_read(&root->dead_count); 1154 if (iter->last_dead_count == *sequence) { 1155 smp_rmb(); 1156 position = iter->last_visited; 1157 if (position && !css_tryget(&position->css)) 1158 position = NULL; 1159 } 1160 return position; 1161 } 1162 1163 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1164 struct mem_cgroup *last_visited, 1165 struct mem_cgroup *new_position, 1166 int sequence) 1167 { 1168 if (last_visited) 1169 css_put(&last_visited->css); 1170 /* 1171 * We store the sequence count from the time @last_visited was 1172 * loaded successfully instead of rereading it here so that we 1173 * don't lose destruction events in between. We could have 1174 * raced with the destruction of @new_position after all. 
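	 * The smp_wmb() below pairs with the smp_rmb() in
	 * mem_cgroup_iter_load(): last_visited is published before
	 * last_dead_count, so a reader that sees a matching dead count also
	 * sees the pointer that count was recorded for.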
1175 */ 1176 iter->last_visited = new_position; 1177 smp_wmb(); 1178 iter->last_dead_count = sequence; 1179 } 1180 1181 /** 1182 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1183 * @root: hierarchy root 1184 * @prev: previously returned memcg, NULL on first invocation 1185 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1186 * 1187 * Returns references to children of the hierarchy below @root, or 1188 * @root itself, or %NULL after a full round-trip. 1189 * 1190 * Caller must pass the return value in @prev on subsequent 1191 * invocations for reference counting, or use mem_cgroup_iter_break() 1192 * to cancel a hierarchy walk before the round-trip is complete. 1193 * 1194 * Reclaimers can specify a zone and a priority level in @reclaim to 1195 * divide up the memcgs in the hierarchy among all concurrent 1196 * reclaimers operating on the same zone and priority. 1197 */ 1198 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1199 struct mem_cgroup *prev, 1200 struct mem_cgroup_reclaim_cookie *reclaim) 1201 { 1202 struct mem_cgroup *memcg = NULL; 1203 struct mem_cgroup *last_visited = NULL; 1204 1205 if (mem_cgroup_disabled()) 1206 return NULL; 1207 1208 if (!root) 1209 root = root_mem_cgroup; 1210 1211 if (prev && !reclaim) 1212 last_visited = prev; 1213 1214 if (!root->use_hierarchy && root != root_mem_cgroup) { 1215 if (prev) 1216 goto out_css_put; 1217 return root; 1218 } 1219 1220 rcu_read_lock(); 1221 while (!memcg) { 1222 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1223 int uninitialized_var(seq); 1224 1225 if (reclaim) { 1226 int nid = zone_to_nid(reclaim->zone); 1227 int zid = zone_idx(reclaim->zone); 1228 struct mem_cgroup_per_zone *mz; 1229 1230 mz = mem_cgroup_zoneinfo(root, nid, zid); 1231 iter = &mz->reclaim_iter[reclaim->priority]; 1232 if (prev && reclaim->generation != iter->generation) { 1233 iter->last_visited = NULL; 1234 goto out_unlock; 1235 } 1236 1237 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1238 } 1239 1240 memcg = __mem_cgroup_iter_next(root, last_visited); 1241 1242 if (reclaim) { 1243 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1244 1245 if (!memcg) 1246 iter->generation++; 1247 else if (!prev && memcg) 1248 reclaim->generation = iter->generation; 1249 } 1250 1251 if (prev && !memcg) 1252 goto out_unlock; 1253 } 1254 out_unlock: 1255 rcu_read_unlock(); 1256 out_css_put: 1257 if (prev && prev != root) 1258 css_put(&prev->css); 1259 1260 return memcg; 1261 } 1262 1263 /** 1264 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1265 * @root: hierarchy root 1266 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1267 */ 1268 void mem_cgroup_iter_break(struct mem_cgroup *root, 1269 struct mem_cgroup *prev) 1270 { 1271 if (!root) 1272 root = root_mem_cgroup; 1273 if (prev && prev != root) 1274 css_put(&prev->css); 1275 } 1276 1277 /* 1278 * Iteration constructs for visiting all cgroups (under a tree). If 1279 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1280 * be used for reference counting. 
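 *
 * A typical walk with an early exit looks like this (should_stop() stands
 * in for whatever condition the caller is checking):
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}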
1281 */ 1282 #define for_each_mem_cgroup_tree(iter, root) \ 1283 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1284 iter != NULL; \ 1285 iter = mem_cgroup_iter(root, iter, NULL)) 1286 1287 #define for_each_mem_cgroup(iter) \ 1288 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1289 iter != NULL; \ 1290 iter = mem_cgroup_iter(NULL, iter, NULL)) 1291 1292 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1293 { 1294 struct mem_cgroup *memcg; 1295 1296 rcu_read_lock(); 1297 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1298 if (unlikely(!memcg)) 1299 goto out; 1300 1301 switch (idx) { 1302 case PGFAULT: 1303 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1304 break; 1305 case PGMAJFAULT: 1306 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1307 break; 1308 default: 1309 BUG(); 1310 } 1311 out: 1312 rcu_read_unlock(); 1313 } 1314 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1315 1316 /** 1317 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1318 * @zone: zone of the wanted lruvec 1319 * @memcg: memcg of the wanted lruvec 1320 * 1321 * Returns the lru list vector holding pages for the given @zone and 1322 * @mem. This can be the global zone lruvec, if the memory controller 1323 * is disabled. 1324 */ 1325 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1326 struct mem_cgroup *memcg) 1327 { 1328 struct mem_cgroup_per_zone *mz; 1329 struct lruvec *lruvec; 1330 1331 if (mem_cgroup_disabled()) { 1332 lruvec = &zone->lruvec; 1333 goto out; 1334 } 1335 1336 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1337 lruvec = &mz->lruvec; 1338 out: 1339 /* 1340 * Since a node can be onlined after the mem_cgroup was created, 1341 * we have to be prepared to initialize lruvec->zone here; 1342 * and if offlined then reonlined, we need to reinitialize it. 1343 */ 1344 if (unlikely(lruvec->zone != zone)) 1345 lruvec->zone = zone; 1346 return lruvec; 1347 } 1348 1349 /* 1350 * Following LRU functions are allowed to be used without PCG_LOCK. 1351 * Operations are called by routine of global LRU independently from memcg. 1352 * What we have to take care of here is validness of pc->mem_cgroup. 1353 * 1354 * Changes to pc->mem_cgroup happens when 1355 * 1. charge 1356 * 2. moving account 1357 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 1358 * It is added to LRU before charge. 1359 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1360 * When moving account, the page is not on LRU. It's isolated. 1361 */ 1362 1363 /** 1364 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1365 * @page: the page 1366 * @zone: zone of the page 1367 */ 1368 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1369 { 1370 struct mem_cgroup_per_zone *mz; 1371 struct mem_cgroup *memcg; 1372 struct page_cgroup *pc; 1373 struct lruvec *lruvec; 1374 1375 if (mem_cgroup_disabled()) { 1376 lruvec = &zone->lruvec; 1377 goto out; 1378 } 1379 1380 pc = lookup_page_cgroup(page); 1381 memcg = pc->mem_cgroup; 1382 1383 /* 1384 * Surreptitiously switch any uncharged offlist page to root: 1385 * an uncharged page off lru does nothing to secure 1386 * its former mem_cgroup from sudden removal. 1387 * 1388 * Our caller holds lru_lock, and PageCgroupUsed is updated 1389 * under page_cgroup lock: between them, they make all uses 1390 * of pc->mem_cgroup safe. 
1391 */ 1392 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1393 pc->mem_cgroup = memcg = root_mem_cgroup; 1394 1395 mz = page_cgroup_zoneinfo(memcg, page); 1396 lruvec = &mz->lruvec; 1397 out: 1398 /* 1399 * Since a node can be onlined after the mem_cgroup was created, 1400 * we have to be prepared to initialize lruvec->zone here; 1401 * and if offlined then reonlined, we need to reinitialize it. 1402 */ 1403 if (unlikely(lruvec->zone != zone)) 1404 lruvec->zone = zone; 1405 return lruvec; 1406 } 1407 1408 /** 1409 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1410 * @lruvec: mem_cgroup per zone lru vector 1411 * @lru: index of lru list the page is sitting on 1412 * @nr_pages: positive when adding or negative when removing 1413 * 1414 * This function must be called when a page is added to or removed from an 1415 * lru list. 1416 */ 1417 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1418 int nr_pages) 1419 { 1420 struct mem_cgroup_per_zone *mz; 1421 unsigned long *lru_size; 1422 1423 if (mem_cgroup_disabled()) 1424 return; 1425 1426 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1427 lru_size = mz->lru_size + lru; 1428 *lru_size += nr_pages; 1429 VM_BUG_ON((long)(*lru_size) < 0); 1430 } 1431 1432 /* 1433 * Checks whether given mem is same or in the root_mem_cgroup's 1434 * hierarchy subtree 1435 */ 1436 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1437 struct mem_cgroup *memcg) 1438 { 1439 if (root_memcg == memcg) 1440 return true; 1441 if (!root_memcg->use_hierarchy || !memcg) 1442 return false; 1443 return css_is_ancestor(&memcg->css, &root_memcg->css); 1444 } 1445 1446 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1447 struct mem_cgroup *memcg) 1448 { 1449 bool ret; 1450 1451 rcu_read_lock(); 1452 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1453 rcu_read_unlock(); 1454 return ret; 1455 } 1456 1457 bool task_in_mem_cgroup(struct task_struct *task, 1458 const struct mem_cgroup *memcg) 1459 { 1460 struct mem_cgroup *curr = NULL; 1461 struct task_struct *p; 1462 bool ret; 1463 1464 p = find_lock_task_mm(task); 1465 if (p) { 1466 curr = try_get_mem_cgroup_from_mm(p->mm); 1467 task_unlock(p); 1468 } else { 1469 /* 1470 * All threads may have already detached their mm's, but the oom 1471 * killer still needs to detect if they have already been oom 1472 * killed to prevent needlessly killing additional tasks. 1473 */ 1474 rcu_read_lock(); 1475 curr = mem_cgroup_from_task(task); 1476 if (curr) 1477 css_get(&curr->css); 1478 rcu_read_unlock(); 1479 } 1480 if (!curr) 1481 return false; 1482 /* 1483 * We should check use_hierarchy of "memcg" not "curr". Because checking 1484 * use_hierarchy of "curr" here make this function true if hierarchy is 1485 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1486 * hierarchy(even if use_hierarchy is disabled in "memcg"). 
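	 * For example, with "memcg" = /A (use_hierarchy == 0) and "curr" =
	 * /A/B (use_hierarchy == 1), a task in /A/B must not be reported as
	 * being in /A's hierarchy.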
1487 */ 1488 ret = mem_cgroup_same_or_subtree(memcg, curr); 1489 css_put(&curr->css); 1490 return ret; 1491 } 1492 1493 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1494 { 1495 unsigned long inactive_ratio; 1496 unsigned long inactive; 1497 unsigned long active; 1498 unsigned long gb; 1499 1500 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1501 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1502 1503 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1504 if (gb) 1505 inactive_ratio = int_sqrt(10 * gb); 1506 else 1507 inactive_ratio = 1; 1508 1509 return inactive * inactive_ratio < active; 1510 } 1511 1512 #define mem_cgroup_from_res_counter(counter, member) \ 1513 container_of(counter, struct mem_cgroup, member) 1514 1515 /** 1516 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1517 * @memcg: the memory cgroup 1518 * 1519 * Returns the maximum amount of memory @mem can be charged with, in 1520 * pages. 1521 */ 1522 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1523 { 1524 unsigned long long margin; 1525 1526 margin = res_counter_margin(&memcg->res); 1527 if (do_swap_account) 1528 margin = min(margin, res_counter_margin(&memcg->memsw)); 1529 return margin >> PAGE_SHIFT; 1530 } 1531 1532 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1533 { 1534 /* root ? */ 1535 if (!css_parent(&memcg->css)) 1536 return vm_swappiness; 1537 1538 return memcg->swappiness; 1539 } 1540 1541 /* 1542 * memcg->moving_account is used for checking possibility that some thread is 1543 * calling move_account(). When a thread on CPU-A starts moving pages under 1544 * a memcg, other threads should check memcg->moving_account under 1545 * rcu_read_lock(), like this: 1546 * 1547 * CPU-A CPU-B 1548 * rcu_read_lock() 1549 * memcg->moving_account+1 if (memcg->mocing_account) 1550 * take heavy locks. 1551 * synchronize_rcu() update something. 1552 * rcu_read_unlock() 1553 * start move here. 1554 */ 1555 1556 /* for quick checking without looking up memcg */ 1557 atomic_t memcg_moving __read_mostly; 1558 1559 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1560 { 1561 atomic_inc(&memcg_moving); 1562 atomic_inc(&memcg->moving_account); 1563 synchronize_rcu(); 1564 } 1565 1566 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1567 { 1568 /* 1569 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1570 * We check NULL in callee rather than caller. 1571 */ 1572 if (memcg) { 1573 atomic_dec(&memcg_moving); 1574 atomic_dec(&memcg->moving_account); 1575 } 1576 } 1577 1578 /* 1579 * 2 routines for checking "mem" is under move_account() or not. 1580 * 1581 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1582 * is used for avoiding races in accounting. If true, 1583 * pc->mem_cgroup may be overwritten. 1584 * 1585 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1586 * under hierarchy of moving cgroups. This is for 1587 * waiting at hith-memory prressure caused by "move". 1588 */ 1589 1590 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1591 { 1592 VM_BUG_ON(!rcu_read_lock_held()); 1593 return atomic_read(&memcg->moving_account) > 0; 1594 } 1595 1596 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1597 { 1598 struct mem_cgroup *from; 1599 struct mem_cgroup *to; 1600 bool ret = false; 1601 /* 1602 * Unlike task_move routines, we access mc.to, mc.from not under 1603 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 
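	 * mc.from and mc.to are sampled together under mc.lock below, so we
	 * see a consistent pair even while a move is being set up or torn
	 * down.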
 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_same_or_subtree(memcg, from)
		|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/*
 * Take this lock when
 * - code tries to modify a page's memcg while the page is USED.
 * - code tries to modify page state accounting in a memcg.
 * see mem_cgroup_stolen(), too.
 */
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
				 unsigned long *flags)
{
	spin_lock_irqsave(&memcg->move_lock, *flags);
}

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
				   unsigned long *flags)
{
	spin_unlock_irqrestore(&memcg->move_lock, *flags);
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for the memory controller.
	 * If this assumption is broken, revisit this code.
1669 */ 1670 static char memcg_name[PATH_MAX]; 1671 int ret; 1672 struct mem_cgroup *iter; 1673 unsigned int i; 1674 1675 if (!p) 1676 return; 1677 1678 rcu_read_lock(); 1679 1680 mem_cgrp = memcg->css.cgroup; 1681 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1682 1683 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1684 if (ret < 0) { 1685 /* 1686 * Unfortunately, we are unable to convert to a useful name 1687 * But we'll still print out the usage information 1688 */ 1689 rcu_read_unlock(); 1690 goto done; 1691 } 1692 rcu_read_unlock(); 1693 1694 pr_info("Task in %s killed", memcg_name); 1695 1696 rcu_read_lock(); 1697 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1698 if (ret < 0) { 1699 rcu_read_unlock(); 1700 goto done; 1701 } 1702 rcu_read_unlock(); 1703 1704 /* 1705 * Continues from above, so we don't need an KERN_ level 1706 */ 1707 pr_cont(" as a result of limit of %s\n", memcg_name); 1708 done: 1709 1710 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1711 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1712 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1713 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1714 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1715 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1716 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1717 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1718 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1719 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1720 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1721 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1722 1723 for_each_mem_cgroup_tree(iter, memcg) { 1724 pr_info("Memory cgroup stats"); 1725 1726 rcu_read_lock(); 1727 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); 1728 if (!ret) 1729 pr_cont(" for %s", memcg_name); 1730 rcu_read_unlock(); 1731 pr_cont(":"); 1732 1733 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1734 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1735 continue; 1736 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1737 K(mem_cgroup_read_stat(iter, i))); 1738 } 1739 1740 for (i = 0; i < NR_LRU_LISTS; i++) 1741 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1742 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1743 1744 pr_cont("\n"); 1745 } 1746 } 1747 1748 /* 1749 * This function returns the number of memcg under hierarchy tree. Returns 1750 * 1(self count) if no children. 1751 */ 1752 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1753 { 1754 int num = 0; 1755 struct mem_cgroup *iter; 1756 1757 for_each_mem_cgroup_tree(iter, memcg) 1758 num++; 1759 return num; 1760 } 1761 1762 /* 1763 * Return the memory (and swap, if configured) limit for a memcg. 1764 */ 1765 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1766 { 1767 u64 limit; 1768 1769 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1770 1771 /* 1772 * Do not consider swap space if we cannot swap due to swappiness 1773 */ 1774 if (mem_cgroup_swappiness(memcg)) { 1775 u64 memsw; 1776 1777 limit += total_swap_pages << PAGE_SHIFT; 1778 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1779 1780 /* 1781 * If memsw is finite and limits the amount of swap space 1782 * available to this memcg, return that limit. 
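		 * For example (illustrative numbers only): with a memory
		 * limit of 1G, 4G of swap in the system and a memsw limit of
		 * 2G, the value returned below is min(1G + 4G, 2G) = 2G.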
1783 */ 1784 limit = min(limit, memsw); 1785 } 1786 1787 return limit; 1788 } 1789 1790 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1791 int order) 1792 { 1793 struct mem_cgroup *iter; 1794 unsigned long chosen_points = 0; 1795 unsigned long totalpages; 1796 unsigned int points = 0; 1797 struct task_struct *chosen = NULL; 1798 1799 /* 1800 * If current has a pending SIGKILL or is exiting, then automatically 1801 * select it. The goal is to allow it to allocate so that it may 1802 * quickly exit and free its memory. 1803 */ 1804 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1805 set_thread_flag(TIF_MEMDIE); 1806 return; 1807 } 1808 1809 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1810 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1811 for_each_mem_cgroup_tree(iter, memcg) { 1812 struct css_task_iter it; 1813 struct task_struct *task; 1814 1815 css_task_iter_start(&iter->css, &it); 1816 while ((task = css_task_iter_next(&it))) { 1817 switch (oom_scan_process_thread(task, totalpages, NULL, 1818 false)) { 1819 case OOM_SCAN_SELECT: 1820 if (chosen) 1821 put_task_struct(chosen); 1822 chosen = task; 1823 chosen_points = ULONG_MAX; 1824 get_task_struct(chosen); 1825 /* fall through */ 1826 case OOM_SCAN_CONTINUE: 1827 continue; 1828 case OOM_SCAN_ABORT: 1829 css_task_iter_end(&it); 1830 mem_cgroup_iter_break(memcg, iter); 1831 if (chosen) 1832 put_task_struct(chosen); 1833 return; 1834 case OOM_SCAN_OK: 1835 break; 1836 }; 1837 points = oom_badness(task, memcg, NULL, totalpages); 1838 if (points > chosen_points) { 1839 if (chosen) 1840 put_task_struct(chosen); 1841 chosen = task; 1842 chosen_points = points; 1843 get_task_struct(chosen); 1844 } 1845 } 1846 css_task_iter_end(&it); 1847 } 1848 1849 if (!chosen) 1850 return; 1851 points = chosen_points * 1000 / totalpages; 1852 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1853 NULL, "Memory cgroup out of memory"); 1854 } 1855 1856 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1857 gfp_t gfp_mask, 1858 unsigned long flags) 1859 { 1860 unsigned long total = 0; 1861 bool noswap = false; 1862 int loop; 1863 1864 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1865 noswap = true; 1866 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1867 noswap = true; 1868 1869 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1870 if (loop) 1871 drain_all_stock_async(memcg); 1872 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1873 /* 1874 * Allow limit shrinkers, which are triggered directly 1875 * by userspace, to catch signals and stop reclaim 1876 * after minimal progress, regardless of the margin. 1877 */ 1878 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1879 break; 1880 if (mem_cgroup_margin(memcg)) 1881 break; 1882 /* 1883 * If nothing was reclaimed after two attempts, there 1884 * may be no reclaimable pages in this hierarchy. 1885 */ 1886 if (loop && !total) 1887 break; 1888 } 1889 return total; 1890 } 1891 1892 /** 1893 * test_mem_cgroup_node_reclaimable 1894 * @memcg: the target memcg 1895 * @nid: the node ID to be checked. 1896 * @noswap : specify true here if the user wants flle only information. 1897 * 1898 * This function returns whether the specified memcg contains any 1899 * reclaimable pages on a node. Returns true if there are any reclaimable 1900 * pages in the node. 
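 *
 * With @noswap set, or when no swap is configured at all, only the file LRU
 * lists are consulted; otherwise anonymous pages count as reclaimable too.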
1901 */ 1902 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1903 int nid, bool noswap) 1904 { 1905 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1906 return true; 1907 if (noswap || !total_swap_pages) 1908 return false; 1909 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1910 return true; 1911 return false; 1912 1913 } 1914 #if MAX_NUMNODES > 1 1915 1916 /* 1917 * Always updating the nodemask is not very good - even if we have an empty 1918 * list or the wrong list here, we can start from some node and traverse all 1919 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1920 * 1921 */ 1922 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1923 { 1924 int nid; 1925 /* 1926 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1927 * pagein/pageout changes since the last update. 1928 */ 1929 if (!atomic_read(&memcg->numainfo_events)) 1930 return; 1931 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1932 return; 1933 1934 /* make a nodemask where this memcg uses memory from */ 1935 memcg->scan_nodes = node_states[N_MEMORY]; 1936 1937 for_each_node_mask(nid, node_states[N_MEMORY]) { 1938 1939 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1940 node_clear(nid, memcg->scan_nodes); 1941 } 1942 1943 atomic_set(&memcg->numainfo_events, 0); 1944 atomic_set(&memcg->numainfo_updating, 0); 1945 } 1946 1947 /* 1948 * Select a node to start reclaim from. Because all we need is to reduce the 1949 * usage counter, starting from anywhere is OK. Considering 1950 * memory reclaim from the current node, there are pros and cons. 1951 * 1952 * Freeing memory from the current node means freeing memory from a node which 1953 * we'll use or we've used. So, it may hurt the LRU. And if several threads 1954 * hit their limits, they will contend on one node. But freeing from a remote 1955 * node means more cost for memory reclaim because of memory latency. 1956 * 1957 * For now, we use round-robin. A better algorithm is welcome. 1958 */ 1959 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1960 { 1961 int node; 1962 1963 mem_cgroup_may_update_nodemask(memcg); 1964 node = memcg->last_scanned_node; 1965 1966 node = next_node(node, memcg->scan_nodes); 1967 if (node == MAX_NUMNODES) 1968 node = first_node(memcg->scan_nodes); 1969 /* 1970 * We call this when we hit the limit, not when pages are added to the LRU. 1971 * No LRU may hold pages because all pages are UNEVICTABLE, or the 1972 * memcg is too small and all its pages are off the LRU. In that case, 1973 * we use the current node. 1974 */ 1975 if (unlikely(node == MAX_NUMNODES)) 1976 node = numa_node_id(); 1977 1978 memcg->last_scanned_node = node; 1979 return node; 1980 } 1981 1982 /* 1983 * Check all nodes whether they contain reclaimable pages or not. 1984 * For a quick scan, we make use of scan_nodes. This allows us to skip 1985 * unused nodes. But scan_nodes is lazily updated and may not contain 1986 * enough up-to-date information. We need to double check. 1987 */ 1988 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1989 { 1990 int nid; 1991 1992 /* 1993 * Quick check...making use of scan_nodes. 1994 * We can skip unused nodes. 1995 */ 1996 if (!nodes_empty(memcg->scan_nodes)) { 1997 for (nid = first_node(memcg->scan_nodes); 1998 nid < MAX_NUMNODES; 1999 nid = next_node(nid, memcg->scan_nodes)) { 2000 2001 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2002 return true; 2003 } 2004 } 2005 /* 2006 * Check the rest of the nodes.
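 * A node cleared from scan_nodes may have gained reclaimable pages since the nodemask was last rebuilt.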
2007 */ 2008 for_each_node_state(nid, N_MEMORY) { 2009 if (node_isset(nid, memcg->scan_nodes)) 2010 continue; 2011 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2012 return true; 2013 } 2014 return false; 2015 } 2016 2017 #else 2018 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2019 { 2020 return 0; 2021 } 2022 2023 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2024 { 2025 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2026 } 2027 #endif 2028 2029 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2030 struct zone *zone, 2031 gfp_t gfp_mask, 2032 unsigned long *total_scanned) 2033 { 2034 struct mem_cgroup *victim = NULL; 2035 int total = 0; 2036 int loop = 0; 2037 unsigned long excess; 2038 unsigned long nr_scanned; 2039 struct mem_cgroup_reclaim_cookie reclaim = { 2040 .zone = zone, 2041 .priority = 0, 2042 }; 2043 2044 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2045 2046 while (1) { 2047 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2048 if (!victim) { 2049 loop++; 2050 if (loop >= 2) { 2051 /* 2052 * If we have not been able to reclaim 2053 * anything, it might because there are 2054 * no reclaimable pages under this hierarchy 2055 */ 2056 if (!total) 2057 break; 2058 /* 2059 * We want to do more targeted reclaim. 2060 * excess >> 2 is not to excessive so as to 2061 * reclaim too much, nor too less that we keep 2062 * coming back to reclaim from this cgroup 2063 */ 2064 if (total >= (excess >> 2) || 2065 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2066 break; 2067 } 2068 continue; 2069 } 2070 if (!mem_cgroup_reclaimable(victim, false)) 2071 continue; 2072 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2073 zone, &nr_scanned); 2074 *total_scanned += nr_scanned; 2075 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2076 break; 2077 } 2078 mem_cgroup_iter_break(root_memcg, victim); 2079 return total; 2080 } 2081 2082 #ifdef CONFIG_LOCKDEP 2083 static struct lockdep_map memcg_oom_lock_dep_map = { 2084 .name = "memcg_oom_lock", 2085 }; 2086 #endif 2087 2088 static DEFINE_SPINLOCK(memcg_oom_lock); 2089 2090 /* 2091 * Check OOM-Killer is already running under our hierarchy. 2092 * If someone is running, return false. 2093 */ 2094 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2095 { 2096 struct mem_cgroup *iter, *failed = NULL; 2097 2098 spin_lock(&memcg_oom_lock); 2099 2100 for_each_mem_cgroup_tree(iter, memcg) { 2101 if (iter->oom_lock) { 2102 /* 2103 * this subtree of our hierarchy is already locked 2104 * so we cannot give a lock. 
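 * Remember the blocking memcg so that the locks taken so far can be rolled back below.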
2105 */ 2106 failed = iter; 2107 mem_cgroup_iter_break(memcg, iter); 2108 break; 2109 } else 2110 iter->oom_lock = true; 2111 } 2112 2113 if (failed) { 2114 /* 2115 * OK, we failed to lock the whole subtree so we have 2116 * to clean up what we set up to the failing subtree 2117 */ 2118 for_each_mem_cgroup_tree(iter, memcg) { 2119 if (iter == failed) { 2120 mem_cgroup_iter_break(memcg, iter); 2121 break; 2122 } 2123 iter->oom_lock = false; 2124 } 2125 } else 2126 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2127 2128 spin_unlock(&memcg_oom_lock); 2129 2130 return !failed; 2131 } 2132 2133 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2134 { 2135 struct mem_cgroup *iter; 2136 2137 spin_lock(&memcg_oom_lock); 2138 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2139 for_each_mem_cgroup_tree(iter, memcg) 2140 iter->oom_lock = false; 2141 spin_unlock(&memcg_oom_lock); 2142 } 2143 2144 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2145 { 2146 struct mem_cgroup *iter; 2147 2148 for_each_mem_cgroup_tree(iter, memcg) 2149 atomic_inc(&iter->under_oom); 2150 } 2151 2152 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2153 { 2154 struct mem_cgroup *iter; 2155 2156 /* 2157 * When a new child is created while the hierarchy is under oom, 2158 * mem_cgroup_oom_lock() may not be called. We have to use 2159 * atomic_add_unless() here. 2160 */ 2161 for_each_mem_cgroup_tree(iter, memcg) 2162 atomic_add_unless(&iter->under_oom, -1, 0); 2163 } 2164 2165 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2166 2167 struct oom_wait_info { 2168 struct mem_cgroup *memcg; 2169 wait_queue_t wait; 2170 }; 2171 2172 static int memcg_oom_wake_function(wait_queue_t *wait, 2173 unsigned mode, int sync, void *arg) 2174 { 2175 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2176 struct mem_cgroup *oom_wait_memcg; 2177 struct oom_wait_info *oom_wait_info; 2178 2179 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2180 oom_wait_memcg = oom_wait_info->memcg; 2181 2182 /* 2183 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2184 * Then we can use css_is_ancestor without taking care of RCU. 2185 */ 2186 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2187 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2188 return 0; 2189 return autoremove_wake_function(wait, mode, sync, arg); 2190 } 2191 2192 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2193 { 2194 atomic_inc(&memcg->oom_wakeups); 2195 /* for filtering, pass "memcg" as argument. */ 2196 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2197 } 2198 2199 static void memcg_oom_recover(struct mem_cgroup *memcg) 2200 { 2201 if (memcg && atomic_read(&memcg->under_oom)) 2202 memcg_wakeup_oom(memcg); 2203 } 2204 2205 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2206 { 2207 if (!current->memcg_oom.may_oom) 2208 return; 2209 /* 2210 * We are in the middle of the charge context here, so we 2211 * don't want to block when potentially sitting on a callstack 2212 * that holds all kinds of filesystem and mm locks. 2213 * 2214 * Also, the caller may handle a failed allocation gracefully 2215 * (like optional page cache readahead) and so an OOM killer 2216 * invocation might not even be necessary. 
2217 * 2218 * That's why we don't do anything here except remember the 2219 * OOM context and then deal with it at the end of the page 2220 * fault when the stack is unwound, the locks are released, 2221 * and when we know whether the fault was overall successful. 2222 */ 2223 css_get(&memcg->css); 2224 current->memcg_oom.memcg = memcg; 2225 current->memcg_oom.gfp_mask = mask; 2226 current->memcg_oom.order = order; 2227 } 2228 2229 /** 2230 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2231 * @handle: actually kill/wait or just clean up the OOM state 2232 * 2233 * This has to be called at the end of a page fault if the memcg OOM 2234 * handler was enabled. 2235 * 2236 * Memcg supports userspace OOM handling where failed allocations must 2237 * sleep on a waitqueue until the userspace task resolves the 2238 * situation. Sleeping directly in the charge context with all kinds 2239 * of locks held is not a good idea, instead we remember an OOM state 2240 * in the task and mem_cgroup_oom_synchronize() has to be called at 2241 * the end of the page fault to complete the OOM handling. 2242 * 2243 * Returns %true if an ongoing memcg OOM situation was detected and 2244 * completed, %false otherwise. 2245 */ 2246 bool mem_cgroup_oom_synchronize(bool handle) 2247 { 2248 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2249 struct oom_wait_info owait; 2250 bool locked; 2251 2252 /* OOM is global, do not handle */ 2253 if (!memcg) 2254 return false; 2255 2256 if (!handle) 2257 goto cleanup; 2258 2259 owait.memcg = memcg; 2260 owait.wait.flags = 0; 2261 owait.wait.func = memcg_oom_wake_function; 2262 owait.wait.private = current; 2263 INIT_LIST_HEAD(&owait.wait.task_list); 2264 2265 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2266 mem_cgroup_mark_under_oom(memcg); 2267 2268 locked = mem_cgroup_oom_trylock(memcg); 2269 2270 if (locked) 2271 mem_cgroup_oom_notify(memcg); 2272 2273 if (locked && !memcg->oom_kill_disable) { 2274 mem_cgroup_unmark_under_oom(memcg); 2275 finish_wait(&memcg_oom_waitq, &owait.wait); 2276 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2277 current->memcg_oom.order); 2278 } else { 2279 schedule(); 2280 mem_cgroup_unmark_under_oom(memcg); 2281 finish_wait(&memcg_oom_waitq, &owait.wait); 2282 } 2283 2284 if (locked) { 2285 mem_cgroup_oom_unlock(memcg); 2286 /* 2287 * There is no guarantee that an OOM-lock contender 2288 * sees the wakeups triggered by the OOM kill 2289 * uncharges. Wake any sleepers explicitely. 2290 */ 2291 memcg_oom_recover(memcg); 2292 } 2293 cleanup: 2294 current->memcg_oom.memcg = NULL; 2295 css_put(&memcg->css); 2296 return true; 2297 } 2298 2299 /* 2300 * Currently used to update mapped file statistics, but the routine can be 2301 * generalized to update other statistics as well. 2302 * 2303 * Notes: Race condition 2304 * 2305 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2306 * it tends to be costly. But considering some conditions, we doesn't need 2307 * to do so _always_. 2308 * 2309 * Considering "charge", lock_page_cgroup() is not required because all 2310 * file-stat operations happen after a page is attached to radix-tree. There 2311 * are no race with "charge". 2312 * 2313 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2314 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 2315 * if there are race with "uncharge". Statistics itself is properly handled 2316 * by flags. 
2317 * 2318 * Considering "move", this is an only case we see a race. To make the race 2319 * small, we check mm->moving_account and detect there are possibility of race 2320 * If there is, we take a lock. 2321 */ 2322 2323 void __mem_cgroup_begin_update_page_stat(struct page *page, 2324 bool *locked, unsigned long *flags) 2325 { 2326 struct mem_cgroup *memcg; 2327 struct page_cgroup *pc; 2328 2329 pc = lookup_page_cgroup(page); 2330 again: 2331 memcg = pc->mem_cgroup; 2332 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2333 return; 2334 /* 2335 * If this memory cgroup is not under account moving, we don't 2336 * need to take move_lock_mem_cgroup(). Because we already hold 2337 * rcu_read_lock(), any calls to move_account will be delayed until 2338 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2339 */ 2340 if (!mem_cgroup_stolen(memcg)) 2341 return; 2342 2343 move_lock_mem_cgroup(memcg, flags); 2344 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2345 move_unlock_mem_cgroup(memcg, flags); 2346 goto again; 2347 } 2348 *locked = true; 2349 } 2350 2351 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2352 { 2353 struct page_cgroup *pc = lookup_page_cgroup(page); 2354 2355 /* 2356 * It's guaranteed that pc->mem_cgroup never changes while 2357 * lock is held because a routine modifies pc->mem_cgroup 2358 * should take move_lock_mem_cgroup(). 2359 */ 2360 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2361 } 2362 2363 void mem_cgroup_update_page_stat(struct page *page, 2364 enum mem_cgroup_stat_index idx, int val) 2365 { 2366 struct mem_cgroup *memcg; 2367 struct page_cgroup *pc = lookup_page_cgroup(page); 2368 unsigned long uninitialized_var(flags); 2369 2370 if (mem_cgroup_disabled()) 2371 return; 2372 2373 VM_BUG_ON(!rcu_read_lock_held()); 2374 memcg = pc->mem_cgroup; 2375 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2376 return; 2377 2378 this_cpu_add(memcg->stat->count[idx], val); 2379 } 2380 2381 /* 2382 * size of first charge trial. "32" comes from vmscan.c's magic value. 2383 * TODO: maybe necessary to use big numbers in big irons. 2384 */ 2385 #define CHARGE_BATCH 32U 2386 struct memcg_stock_pcp { 2387 struct mem_cgroup *cached; /* this never be root cgroup */ 2388 unsigned int nr_pages; 2389 struct work_struct work; 2390 unsigned long flags; 2391 #define FLUSHING_CACHED_CHARGE 0 2392 }; 2393 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2394 static DEFINE_MUTEX(percpu_charge_mutex); 2395 2396 /** 2397 * consume_stock: Try to consume stocked charge on this cpu. 2398 * @memcg: memcg to consume from. 2399 * @nr_pages: how many pages to charge. 2400 * 2401 * The charges will only happen if @memcg matches the current cpu's memcg 2402 * stock, and at least @nr_pages are available in that stock. Failure to 2403 * service an allocation will refill the stock. 2404 * 2405 * returns true if successful, false otherwise. 2406 */ 2407 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2408 { 2409 struct memcg_stock_pcp *stock; 2410 bool ret = true; 2411 2412 if (nr_pages > CHARGE_BATCH) 2413 return false; 2414 2415 stock = &get_cpu_var(memcg_stock); 2416 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2417 stock->nr_pages -= nr_pages; 2418 else /* need to call res_counter_charge */ 2419 ret = false; 2420 put_cpu_var(memcg_stock); 2421 return ret; 2422 } 2423 2424 /* 2425 * Returns stocks cached in percpu to res_counter and reset cached information. 
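 * After draining, stock->cached is reset to NULL so the slot can be reused for another memcg.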
2426 */ 2427 static void drain_stock(struct memcg_stock_pcp *stock) 2428 { 2429 struct mem_cgroup *old = stock->cached; 2430 2431 if (stock->nr_pages) { 2432 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2433 2434 res_counter_uncharge(&old->res, bytes); 2435 if (do_swap_account) 2436 res_counter_uncharge(&old->memsw, bytes); 2437 stock->nr_pages = 0; 2438 } 2439 stock->cached = NULL; 2440 } 2441 2442 /* 2443 * This must be called with preemption disabled, or by a thread which is 2444 * pinned to the local cpu. 2445 */ 2446 static void drain_local_stock(struct work_struct *dummy) 2447 { 2448 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2449 drain_stock(stock); 2450 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2451 } 2452 2453 static void __init memcg_stock_init(void) 2454 { 2455 int cpu; 2456 2457 for_each_possible_cpu(cpu) { 2458 struct memcg_stock_pcp *stock = 2459 &per_cpu(memcg_stock, cpu); 2460 INIT_WORK(&stock->work, drain_local_stock); 2461 } 2462 } 2463 2464 /* 2465 * Cache charges, taken from the res_counter, in the local per-cpu area. 2466 * They will be consumed later by consume_stock(). 2467 */ 2468 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2469 { 2470 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2471 2472 if (stock->cached != memcg) { /* reset if necessary */ 2473 drain_stock(stock); 2474 stock->cached = memcg; 2475 } 2476 stock->nr_pages += nr_pages; 2477 put_cpu_var(memcg_stock); 2478 } 2479 2480 /* 2481 * Drains all per-CPU charge caches for the given root_memcg and the subtree 2482 * of the hierarchy under it. The sync flag says whether we should block 2483 * until the work is done. 2484 */ 2485 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2486 { 2487 int cpu, curcpu; 2488 2489 /* Notify other cpus that system-wide "drain" is running */ 2490 get_online_cpus(); 2491 curcpu = get_cpu(); 2492 for_each_online_cpu(cpu) { 2493 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2494 struct mem_cgroup *memcg; 2495 2496 memcg = stock->cached; 2497 if (!memcg || !stock->nr_pages) 2498 continue; 2499 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2500 continue; 2501 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2502 if (cpu == curcpu) 2503 drain_local_stock(&stock->work); 2504 else 2505 schedule_work_on(cpu, &stock->work); 2506 } 2507 } 2508 put_cpu(); 2509 2510 if (!sync) 2511 goto out; 2512 2513 for_each_online_cpu(cpu) { 2514 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2515 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2516 flush_work(&stock->work); 2517 } 2518 out: 2519 put_online_cpus(); 2520 } 2521 2522 /* 2523 * Tries to drain stocked charges on other cpus. This function is asynchronous 2524 * and just puts a work item per cpu for draining locally on each cpu. The 2525 * caller can expect some charges to come back to the res_counter later, but 2526 * cannot wait for that to happen. 2527 */ 2528 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2529 { 2530 /* 2531 * If someone is already draining, avoid adding more kworker runs. 2532 */ 2533 if (!mutex_trylock(&percpu_charge_mutex)) 2534 return; 2535 drain_all_stock(root_memcg, false); 2536 mutex_unlock(&percpu_charge_mutex); 2537 } 2538 2539 /* This is a synchronous drain interface.
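 * Unlike the async variant, it waits for percpu_charge_mutex and for the per-cpu work items to finish.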
*/ 2540 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2541 { 2542 /* called when force_empty is called */ 2543 mutex_lock(&percpu_charge_mutex); 2544 drain_all_stock(root_memcg, true); 2545 mutex_unlock(&percpu_charge_mutex); 2546 } 2547 2548 /* 2549 * This function drains percpu counter value from DEAD cpu and 2550 * move it to local cpu. Note that this function can be preempted. 2551 */ 2552 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2553 { 2554 int i; 2555 2556 spin_lock(&memcg->pcp_counter_lock); 2557 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2558 long x = per_cpu(memcg->stat->count[i], cpu); 2559 2560 per_cpu(memcg->stat->count[i], cpu) = 0; 2561 memcg->nocpu_base.count[i] += x; 2562 } 2563 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2564 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2565 2566 per_cpu(memcg->stat->events[i], cpu) = 0; 2567 memcg->nocpu_base.events[i] += x; 2568 } 2569 spin_unlock(&memcg->pcp_counter_lock); 2570 } 2571 2572 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2573 unsigned long action, 2574 void *hcpu) 2575 { 2576 int cpu = (unsigned long)hcpu; 2577 struct memcg_stock_pcp *stock; 2578 struct mem_cgroup *iter; 2579 2580 if (action == CPU_ONLINE) 2581 return NOTIFY_OK; 2582 2583 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2584 return NOTIFY_OK; 2585 2586 for_each_mem_cgroup(iter) 2587 mem_cgroup_drain_pcp_counter(iter, cpu); 2588 2589 stock = &per_cpu(memcg_stock, cpu); 2590 drain_stock(stock); 2591 return NOTIFY_OK; 2592 } 2593 2594 2595 /* See __mem_cgroup_try_charge() for details */ 2596 enum { 2597 CHARGE_OK, /* success */ 2598 CHARGE_RETRY, /* need to retry but retry is not bad */ 2599 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2600 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2601 }; 2602 2603 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2604 unsigned int nr_pages, unsigned int min_pages, 2605 bool invoke_oom) 2606 { 2607 unsigned long csize = nr_pages * PAGE_SIZE; 2608 struct mem_cgroup *mem_over_limit; 2609 struct res_counter *fail_res; 2610 unsigned long flags = 0; 2611 int ret; 2612 2613 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2614 2615 if (likely(!ret)) { 2616 if (!do_swap_account) 2617 return CHARGE_OK; 2618 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2619 if (likely(!ret)) 2620 return CHARGE_OK; 2621 2622 res_counter_uncharge(&memcg->res, csize); 2623 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2624 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2625 } else 2626 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2627 /* 2628 * Never reclaim on behalf of optional batching, retry with a 2629 * single page instead. 2630 */ 2631 if (nr_pages > min_pages) 2632 return CHARGE_RETRY; 2633 2634 if (!(gfp_mask & __GFP_WAIT)) 2635 return CHARGE_WOULDBLOCK; 2636 2637 if (gfp_mask & __GFP_NORETRY) 2638 return CHARGE_NOMEM; 2639 2640 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2641 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2642 return CHARGE_RETRY; 2643 /* 2644 * Even though the limit is exceeded at this point, reclaim 2645 * may have been able to free some pages. Retry the charge 2646 * before killing the task. 2647 * 2648 * Only for regular pages, though: huge pages are rather 2649 * unlikely to succeed so close to the limit, and we fall back 2650 * to regular pages anyway in case of failure. 
2651 */ 2652 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2653 return CHARGE_RETRY; 2654 2655 /* 2656 * At task move, charge accounts can be doubly counted. So, it's 2657 * better to wait until the end of task_move if something is going on. 2658 */ 2659 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2660 return CHARGE_RETRY; 2661 2662 if (invoke_oom) 2663 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); 2664 2665 return CHARGE_NOMEM; 2666 } 2667 2668 /* 2669 * __mem_cgroup_try_charge() does 2670 * 1. detect memcg to be charged against from passed *mm and *ptr, 2671 * 2. update res_counter 2672 * 3. call memory reclaim if necessary. 2673 * 2674 * In some special cases, if the task is dying (fatal_signal_pending() or 2675 * TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup 2676 * to *ptr. There are two reasons for this. 1: dying threads should quit as soon 2677 * as possible without any hazards. 2: all pages should have a valid 2678 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2679 * pointer, that is treated as a charge to root_mem_cgroup. 2680 * 2681 * So __mem_cgroup_try_charge() will return 2682 * 0 ... on success, filling *ptr with a valid memcg pointer. 2683 * -ENOMEM ... charge failure because of resource limits. 2684 * -EINTR ... if the thread is dying. *ptr is filled with root_mem_cgroup. 2685 * 2686 * Unlike the exported interface, an "oom" parameter is added. If oom==true, 2687 * the oom-killer can be invoked. 2688 */ 2689 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2690 gfp_t gfp_mask, 2691 unsigned int nr_pages, 2692 struct mem_cgroup **ptr, 2693 bool oom) 2694 { 2695 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2697 struct mem_cgroup *memcg = NULL; 2698 int ret; 2699 2700 /* 2701 * Unlike the global VM's OOM kill, we are not in a system-wide memory 2702 * shortage here. So, allow dying processes to go ahead in addition to 2703 * MEMDIE processes. 2704 */ 2705 if (unlikely(test_thread_flag(TIF_MEMDIE) 2706 || fatal_signal_pending(current))) 2707 goto bypass; 2708 2709 if (unlikely(task_in_memcg_oom(current))) 2710 goto bypass; 2711 2712 /* 2713 * We always charge the cgroup the mm_struct belongs to. 2714 * The mm_struct's mem_cgroup changes on task migration if the 2715 * thread group leader migrates. It's possible that mm is not 2716 * set, if so charge the root memcg (happens for pagecache usage). 2717 */ 2718 if (!*ptr && !mm) 2719 *ptr = root_mem_cgroup; 2720 again: 2721 if (*ptr) { /* css should be a valid one */ 2722 memcg = *ptr; 2723 if (mem_cgroup_is_root(memcg)) 2724 goto done; 2725 if (consume_stock(memcg, nr_pages)) 2726 goto done; 2727 css_get(&memcg->css); 2728 } else { 2729 struct task_struct *p; 2730 2731 rcu_read_lock(); 2732 p = rcu_dereference(mm->owner); 2733 /* 2734 * Because we don't have task_lock(), "p" can exit. 2735 * In that case, "memcg" can point to root, or p can be NULL due to a 2736 * race with swapoff. Then, we have a small risk of mis-accounting. 2737 * But that kind of mis-accounting due to races always happens because 2738 * we don't have cgroup_mutex(). Preventing it would be overkill, so we 2739 * allow that small race here. 2740 * (*) swapoff et al. will charge against the mm_struct, not against 2741 * the task_struct. So, mm->owner can be NULL.
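 * If that happens we simply fall back to charging root_mem_cgroup below.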
2742 */ 2743 memcg = mem_cgroup_from_task(p); 2744 if (!memcg) 2745 memcg = root_mem_cgroup; 2746 if (mem_cgroup_is_root(memcg)) { 2747 rcu_read_unlock(); 2748 goto done; 2749 } 2750 if (consume_stock(memcg, nr_pages)) { 2751 /* 2752 * It seems dangerous to access memcg without css_get(). 2753 * But considering how consume_stock works, it's not 2754 * necessary. If consume_stock succeeds, some charges 2755 * from this memcg are cached on this cpu. So, we 2756 * don't need to call css_get()/css_tryget() before 2757 * calling consume_stock(). 2758 */ 2759 rcu_read_unlock(); 2760 goto done; 2761 } 2762 /* after here, we may be blocked; we need to get a refcnt */ 2763 if (!css_tryget(&memcg->css)) { 2764 rcu_read_unlock(); 2765 goto again; 2766 } 2767 rcu_read_unlock(); 2768 } 2769 2770 do { 2771 bool invoke_oom = oom && !nr_oom_retries; 2772 2773 /* If killed, bypass charge */ 2774 if (fatal_signal_pending(current)) { 2775 css_put(&memcg->css); 2776 goto bypass; 2777 } 2778 2779 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2780 nr_pages, invoke_oom); 2781 switch (ret) { 2782 case CHARGE_OK: 2783 break; 2784 case CHARGE_RETRY: /* not in OOM situation but retry */ 2785 batch = nr_pages; 2786 css_put(&memcg->css); 2787 memcg = NULL; 2788 goto again; 2789 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2790 css_put(&memcg->css); 2791 goto nomem; 2792 case CHARGE_NOMEM: /* OOM routine works */ 2793 if (!oom || invoke_oom) { 2794 css_put(&memcg->css); 2795 goto nomem; 2796 } 2797 nr_oom_retries--; 2798 break; 2799 } 2800 } while (ret != CHARGE_OK); 2801 2802 if (batch > nr_pages) 2803 refill_stock(memcg, batch - nr_pages); 2804 css_put(&memcg->css); 2805 done: 2806 *ptr = memcg; 2807 return 0; 2808 nomem: 2809 if (!(gfp_mask & __GFP_NOFAIL)) { 2810 *ptr = NULL; 2811 return -ENOMEM; 2812 } 2813 bypass: 2814 *ptr = root_mem_cgroup; 2815 return -EINTR; 2816 } 2817 2818 /* 2819 * Sometimes we have to undo a charge we got by try_charge(). 2820 * This function is for that: it does the uncharge and puts the css refcnt 2821 * taken by try_charge(). 2822 */ 2823 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2824 unsigned int nr_pages) 2825 { 2826 if (!mem_cgroup_is_root(memcg)) { 2827 unsigned long bytes = nr_pages * PAGE_SIZE; 2828 2829 res_counter_uncharge(&memcg->res, bytes); 2830 if (do_swap_account) 2831 res_counter_uncharge(&memcg->memsw, bytes); 2832 } 2833 } 2834 2835 /* 2836 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup. 2837 * This is useful when moving usage to the parent cgroup. 2838 */ 2839 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2840 unsigned int nr_pages) 2841 { 2842 unsigned long bytes = nr_pages * PAGE_SIZE; 2843 2844 if (mem_cgroup_is_root(memcg)) 2845 return; 2846 2847 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2848 if (do_swap_account) 2849 res_counter_uncharge_until(&memcg->memsw, 2850 memcg->memsw.parent, bytes); 2851 } 2852 2853 /* 2854 * A helper function to get a mem_cgroup from an ID. Must be called under 2855 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2856 * the mem_cgroup is used for charging. (dropping the refcnt from swap can be 2857 * called against a removed memcg.)
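 * Returns NULL for ID 0 and for IDs that have no registered css.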
2858 */ 2859 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2860 { 2861 struct cgroup_subsys_state *css; 2862 2863 /* ID 0 is unused ID */ 2864 if (!id) 2865 return NULL; 2866 css = css_lookup(&mem_cgroup_subsys, id); 2867 if (!css) 2868 return NULL; 2869 return mem_cgroup_from_css(css); 2870 } 2871 2872 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2873 { 2874 struct mem_cgroup *memcg = NULL; 2875 struct page_cgroup *pc; 2876 unsigned short id; 2877 swp_entry_t ent; 2878 2879 VM_BUG_ON(!PageLocked(page)); 2880 2881 pc = lookup_page_cgroup(page); 2882 lock_page_cgroup(pc); 2883 if (PageCgroupUsed(pc)) { 2884 memcg = pc->mem_cgroup; 2885 if (memcg && !css_tryget(&memcg->css)) 2886 memcg = NULL; 2887 } else if (PageSwapCache(page)) { 2888 ent.val = page_private(page); 2889 id = lookup_swap_cgroup_id(ent); 2890 rcu_read_lock(); 2891 memcg = mem_cgroup_lookup(id); 2892 if (memcg && !css_tryget(&memcg->css)) 2893 memcg = NULL; 2894 rcu_read_unlock(); 2895 } 2896 unlock_page_cgroup(pc); 2897 return memcg; 2898 } 2899 2900 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2901 struct page *page, 2902 unsigned int nr_pages, 2903 enum charge_type ctype, 2904 bool lrucare) 2905 { 2906 struct page_cgroup *pc = lookup_page_cgroup(page); 2907 struct zone *uninitialized_var(zone); 2908 struct lruvec *lruvec; 2909 bool was_on_lru = false; 2910 bool anon; 2911 2912 lock_page_cgroup(pc); 2913 VM_BUG_ON(PageCgroupUsed(pc)); 2914 /* 2915 * we don't need page_cgroup_lock about tail pages, becase they are not 2916 * accessed by any other context at this point. 2917 */ 2918 2919 /* 2920 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2921 * may already be on some other mem_cgroup's LRU. Take care of it. 2922 */ 2923 if (lrucare) { 2924 zone = page_zone(page); 2925 spin_lock_irq(&zone->lru_lock); 2926 if (PageLRU(page)) { 2927 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2928 ClearPageLRU(page); 2929 del_page_from_lru_list(page, lruvec, page_lru(page)); 2930 was_on_lru = true; 2931 } 2932 } 2933 2934 pc->mem_cgroup = memcg; 2935 /* 2936 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2937 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2938 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2939 * before USED bit, we need memory barrier here. 2940 * See mem_cgroup_add_lru_list(), etc. 2941 */ 2942 smp_wmb(); 2943 SetPageCgroupUsed(pc); 2944 2945 if (lrucare) { 2946 if (was_on_lru) { 2947 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2948 VM_BUG_ON(PageLRU(page)); 2949 SetPageLRU(page); 2950 add_page_to_lru_list(page, lruvec, page_lru(page)); 2951 } 2952 spin_unlock_irq(&zone->lru_lock); 2953 } 2954 2955 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2956 anon = true; 2957 else 2958 anon = false; 2959 2960 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2961 unlock_page_cgroup(pc); 2962 2963 /* 2964 * "charge_statistics" updated event counter. Then, check it. 2965 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2966 * if they exceeds softlimit. 
2967 */ 2968 memcg_check_events(memcg, page); 2969 } 2970 2971 static DEFINE_MUTEX(set_limit_mutex); 2972 2973 #ifdef CONFIG_MEMCG_KMEM 2974 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2975 { 2976 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2977 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 2978 } 2979 2980 /* 2981 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2982 * in the memcg_cache_params struct. 2983 */ 2984 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2985 { 2986 struct kmem_cache *cachep; 2987 2988 VM_BUG_ON(p->is_root_cache); 2989 cachep = p->root_cache; 2990 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; 2991 } 2992 2993 #ifdef CONFIG_SLABINFO 2994 static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 2995 struct cftype *cft, struct seq_file *m) 2996 { 2997 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2998 struct memcg_cache_params *params; 2999 3000 if (!memcg_can_account_kmem(memcg)) 3001 return -EIO; 3002 3003 print_slabinfo_header(m); 3004 3005 mutex_lock(&memcg->slab_caches_mutex); 3006 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 3007 cache_show(memcg_params_to_cache(params), m); 3008 mutex_unlock(&memcg->slab_caches_mutex); 3009 3010 return 0; 3011 } 3012 #endif 3013 3014 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 3015 { 3016 struct res_counter *fail_res; 3017 struct mem_cgroup *_memcg; 3018 int ret = 0; 3019 bool may_oom; 3020 3021 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 3022 if (ret) 3023 return ret; 3024 3025 /* 3026 * Conditions under which we can wait for the oom_killer. Those are 3027 * the same conditions tested by the core page allocator 3028 */ 3029 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); 3030 3031 _memcg = memcg; 3032 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 3033 &_memcg, may_oom); 3034 3035 if (ret == -EINTR) { 3036 /* 3037 * __mem_cgroup_try_charge() chosed to bypass to root due to 3038 * OOM kill or fatal signal. Since our only options are to 3039 * either fail the allocation or charge it to this cgroup, do 3040 * it as a temporary condition. But we can't fail. From a 3041 * kmem/slab perspective, the cache has already been selected, 3042 * by mem_cgroup_kmem_get_cache(), so it is too late to change 3043 * our minds. 3044 * 3045 * This condition will only trigger if the task entered 3046 * memcg_charge_kmem in a sane state, but was OOM-killed during 3047 * __mem_cgroup_try_charge() above. Tasks that were already 3048 * dying when the allocation triggers should have been already 3049 * directed to the root cgroup in memcontrol.h 3050 */ 3051 res_counter_charge_nofail(&memcg->res, size, &fail_res); 3052 if (do_swap_account) 3053 res_counter_charge_nofail(&memcg->memsw, size, 3054 &fail_res); 3055 ret = 0; 3056 } else if (ret) 3057 res_counter_uncharge(&memcg->kmem, size); 3058 3059 return ret; 3060 } 3061 3062 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 3063 { 3064 res_counter_uncharge(&memcg->res, size); 3065 if (do_swap_account) 3066 res_counter_uncharge(&memcg->memsw, size); 3067 3068 /* Not down to 0 */ 3069 if (res_counter_uncharge(&memcg->kmem, size)) 3070 return; 3071 3072 /* 3073 * Releases a reference taken in kmem_cgroup_css_offline in case 3074 * this last uncharge is racing with the offlining code or it is 3075 * outliving the memcg existence. 
3076 * 3077 * The memory barrier imposed by test&clear is paired with the 3078 * explicit one in memcg_kmem_mark_dead(). 3079 */ 3080 if (memcg_kmem_test_and_clear_dead(memcg)) 3081 css_put(&memcg->css); 3082 } 3083 3084 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) 3085 { 3086 if (!memcg) 3087 return; 3088 3089 mutex_lock(&memcg->slab_caches_mutex); 3090 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 3091 mutex_unlock(&memcg->slab_caches_mutex); 3092 } 3093 3094 /* 3095 * helper for acessing a memcg's index. It will be used as an index in the 3096 * child cache array in kmem_cache, and also to derive its name. This function 3097 * will return -1 when this is not a kmem-limited memcg. 3098 */ 3099 int memcg_cache_id(struct mem_cgroup *memcg) 3100 { 3101 return memcg ? memcg->kmemcg_id : -1; 3102 } 3103 3104 /* 3105 * This ends up being protected by the set_limit mutex, during normal 3106 * operation, because that is its main call site. 3107 * 3108 * But when we create a new cache, we can call this as well if its parent 3109 * is kmem-limited. That will have to hold set_limit_mutex as well. 3110 */ 3111 int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3112 { 3113 int num, ret; 3114 3115 num = ida_simple_get(&kmem_limited_groups, 3116 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 3117 if (num < 0) 3118 return num; 3119 /* 3120 * After this point, kmem_accounted (that we test atomically in 3121 * the beginning of this conditional), is no longer 0. This 3122 * guarantees only one process will set the following boolean 3123 * to true. We don't need test_and_set because we're protected 3124 * by the set_limit_mutex anyway. 3125 */ 3126 memcg_kmem_set_activated(memcg); 3127 3128 ret = memcg_update_all_caches(num+1); 3129 if (ret) { 3130 ida_simple_remove(&kmem_limited_groups, num); 3131 memcg_kmem_clear_activated(memcg); 3132 return ret; 3133 } 3134 3135 memcg->kmemcg_id = num; 3136 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 3137 mutex_init(&memcg->slab_caches_mutex); 3138 return 0; 3139 } 3140 3141 static size_t memcg_caches_array_size(int num_groups) 3142 { 3143 ssize_t size; 3144 if (num_groups <= 0) 3145 return 0; 3146 3147 size = 2 * num_groups; 3148 if (size < MEMCG_CACHES_MIN_SIZE) 3149 size = MEMCG_CACHES_MIN_SIZE; 3150 else if (size > MEMCG_CACHES_MAX_SIZE) 3151 size = MEMCG_CACHES_MAX_SIZE; 3152 3153 return size; 3154 } 3155 3156 /* 3157 * We should update the current array size iff all caches updates succeed. This 3158 * can only be done from the slab side. The slab mutex needs to be held when 3159 * calling this. 
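 * Note that the array size is only ever grown here, never shrunk.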
3160 */ 3161 void memcg_update_array_size(int num) 3162 { 3163 if (num > memcg_limited_groups_array_size) 3164 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3165 } 3166 3167 static void kmem_cache_destroy_work_func(struct work_struct *w); 3168 3169 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3170 { 3171 struct memcg_cache_params *cur_params = s->memcg_params; 3172 3173 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); 3174 3175 if (num_groups > memcg_limited_groups_array_size) { 3176 int i; 3177 ssize_t size = memcg_caches_array_size(num_groups); 3178 3179 size *= sizeof(void *); 3180 size += offsetof(struct memcg_cache_params, memcg_caches); 3181 3182 s->memcg_params = kzalloc(size, GFP_KERNEL); 3183 if (!s->memcg_params) { 3184 s->memcg_params = cur_params; 3185 return -ENOMEM; 3186 } 3187 3188 s->memcg_params->is_root_cache = true; 3189 3190 /* 3191 * There is the chance it will be bigger than 3192 * memcg_limited_groups_array_size, if we failed an allocation 3193 * in a cache, in which case all caches updated before it, will 3194 * have a bigger array. 3195 * 3196 * But if that is the case, the data after 3197 * memcg_limited_groups_array_size is certainly unused 3198 */ 3199 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3200 if (!cur_params->memcg_caches[i]) 3201 continue; 3202 s->memcg_params->memcg_caches[i] = 3203 cur_params->memcg_caches[i]; 3204 } 3205 3206 /* 3207 * Ideally, we would wait until all caches succeed, and only 3208 * then free the old one. But this is not worth the extra 3209 * pointer per-cache we'd have to have for this. 3210 * 3211 * It is not a big deal if some caches are left with a size 3212 * bigger than the others. And all updates will reset this 3213 * anyway. 3214 */ 3215 kfree(cur_params); 3216 } 3217 return 0; 3218 } 3219 3220 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3221 struct kmem_cache *root_cache) 3222 { 3223 size_t size; 3224 3225 if (!memcg_kmem_enabled()) 3226 return 0; 3227 3228 if (!memcg) { 3229 size = offsetof(struct memcg_cache_params, memcg_caches); 3230 size += memcg_limited_groups_array_size * sizeof(void *); 3231 } else 3232 size = sizeof(struct memcg_cache_params); 3233 3234 s->memcg_params = kzalloc(size, GFP_KERNEL); 3235 if (!s->memcg_params) 3236 return -ENOMEM; 3237 3238 if (memcg) { 3239 s->memcg_params->memcg = memcg; 3240 s->memcg_params->root_cache = root_cache; 3241 INIT_WORK(&s->memcg_params->destroy, 3242 kmem_cache_destroy_work_func); 3243 } else 3244 s->memcg_params->is_root_cache = true; 3245 3246 return 0; 3247 } 3248 3249 void memcg_release_cache(struct kmem_cache *s) 3250 { 3251 struct kmem_cache *root; 3252 struct mem_cgroup *memcg; 3253 int id; 3254 3255 /* 3256 * This happens, for instance, when a root cache goes away before we 3257 * add any memcg. 3258 */ 3259 if (!s->memcg_params) 3260 return; 3261 3262 if (s->memcg_params->is_root_cache) 3263 goto out; 3264 3265 memcg = s->memcg_params->memcg; 3266 id = memcg_cache_id(memcg); 3267 3268 root = s->memcg_params->root_cache; 3269 root->memcg_params->memcg_caches[id] = NULL; 3270 3271 mutex_lock(&memcg->slab_caches_mutex); 3272 list_del(&s->memcg_params->list); 3273 mutex_unlock(&memcg->slab_caches_mutex); 3274 3275 css_put(&memcg->css); 3276 out: 3277 kfree(s->memcg_params); 3278 } 3279 3280 /* 3281 * During the creation a new cache, we need to disable our accounting mechanism 3282 * altogether. 
This is true even if we are not creating, but rather just 3283 * enqueing new caches to be created. 3284 * 3285 * This is because that process will trigger allocations; some visible, like 3286 * explicit kmallocs to auxiliary data structures, name strings and internal 3287 * cache structures; some well concealed, like INIT_WORK() that can allocate 3288 * objects during debug. 3289 * 3290 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3291 * to it. This may not be a bounded recursion: since the first cache creation 3292 * failed to complete (waiting on the allocation), we'll just try to create the 3293 * cache again, failing at the same point. 3294 * 3295 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3296 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3297 * inside the following two functions. 3298 */ 3299 static inline void memcg_stop_kmem_account(void) 3300 { 3301 VM_BUG_ON(!current->mm); 3302 current->memcg_kmem_skip_account++; 3303 } 3304 3305 static inline void memcg_resume_kmem_account(void) 3306 { 3307 VM_BUG_ON(!current->mm); 3308 current->memcg_kmem_skip_account--; 3309 } 3310 3311 static void kmem_cache_destroy_work_func(struct work_struct *w) 3312 { 3313 struct kmem_cache *cachep; 3314 struct memcg_cache_params *p; 3315 3316 p = container_of(w, struct memcg_cache_params, destroy); 3317 3318 cachep = memcg_params_to_cache(p); 3319 3320 /* 3321 * If we get down to 0 after shrink, we could delete right away. 3322 * However, memcg_release_pages() already puts us back in the workqueue 3323 * in that case. If we proceed deleting, we'll get a dangling 3324 * reference, and removing the object from the workqueue in that case 3325 * is unnecessary complication. We are not a fast path. 3326 * 3327 * Note that this case is fundamentally different from racing with 3328 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in 3329 * kmem_cache_shrink, not only we would be reinserting a dead cache 3330 * into the queue, but doing so from inside the worker racing to 3331 * destroy it. 3332 * 3333 * So if we aren't down to zero, we'll just schedule a worker and try 3334 * again 3335 */ 3336 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3337 kmem_cache_shrink(cachep); 3338 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3339 return; 3340 } else 3341 kmem_cache_destroy(cachep); 3342 } 3343 3344 void mem_cgroup_destroy_cache(struct kmem_cache *cachep) 3345 { 3346 if (!cachep->memcg_params->dead) 3347 return; 3348 3349 /* 3350 * There are many ways in which we can get here. 3351 * 3352 * We can get to a memory-pressure situation while the delayed work is 3353 * still pending to run. The vmscan shrinkers can then release all 3354 * cache memory and get us to destruction. If this is the case, we'll 3355 * be executed twice, which is a bug (the second time will execute over 3356 * bogus data). In this case, cancelling the work should be fine. 3357 * 3358 * But we can also get here from the worker itself, if 3359 * kmem_cache_shrink is enough to shake all the remaining objects and 3360 * get the page count to 0. In this case, we'll deadlock if we try to 3361 * cancel the work (the worker runs with an internal lock held, which 3362 * is the same lock we would hold for cancel_work_sync().) 
3363 * 3364 * Since we can't possibly know who got us here, just refrain from 3365 * running if there is already work pending 3366 */ 3367 if (work_pending(&cachep->memcg_params->destroy)) 3368 return; 3369 /* 3370 * We have to defer the actual destroying to a workqueue, because 3371 * we might currently be in a context that cannot sleep. 3372 */ 3373 schedule_work(&cachep->memcg_params->destroy); 3374 } 3375 3376 /* 3377 * This lock protects updaters, not readers. We want readers to be as fast as 3378 * they can, and they will either see NULL or a valid cache value. Our model 3379 * allow them to see NULL, in which case the root memcg will be selected. 3380 * 3381 * We need this lock because multiple allocations to the same cache from a non 3382 * will span more than one worker. Only one of them can create the cache. 3383 */ 3384 static DEFINE_MUTEX(memcg_cache_mutex); 3385 3386 /* 3387 * Called with memcg_cache_mutex held 3388 */ 3389 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3390 struct kmem_cache *s) 3391 { 3392 struct kmem_cache *new; 3393 static char *tmp_name = NULL; 3394 3395 lockdep_assert_held(&memcg_cache_mutex); 3396 3397 /* 3398 * kmem_cache_create_memcg duplicates the given name and 3399 * cgroup_name for this name requires RCU context. 3400 * This static temporary buffer is used to prevent from 3401 * pointless shortliving allocation. 3402 */ 3403 if (!tmp_name) { 3404 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3405 if (!tmp_name) 3406 return NULL; 3407 } 3408 3409 rcu_read_lock(); 3410 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3411 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3412 rcu_read_unlock(); 3413 3414 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3415 (s->flags & ~SLAB_PANIC), s->ctor, s); 3416 3417 if (new) 3418 new->allocflags |= __GFP_KMEMCG; 3419 3420 return new; 3421 } 3422 3423 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3424 struct kmem_cache *cachep) 3425 { 3426 struct kmem_cache *new_cachep; 3427 int idx; 3428 3429 BUG_ON(!memcg_can_account_kmem(memcg)); 3430 3431 idx = memcg_cache_id(memcg); 3432 3433 mutex_lock(&memcg_cache_mutex); 3434 new_cachep = cachep->memcg_params->memcg_caches[idx]; 3435 if (new_cachep) { 3436 css_put(&memcg->css); 3437 goto out; 3438 } 3439 3440 new_cachep = kmem_cache_dup(memcg, cachep); 3441 if (new_cachep == NULL) { 3442 new_cachep = cachep; 3443 css_put(&memcg->css); 3444 goto out; 3445 } 3446 3447 atomic_set(&new_cachep->memcg_params->nr_pages , 0); 3448 3449 cachep->memcg_params->memcg_caches[idx] = new_cachep; 3450 /* 3451 * the readers won't lock, make sure everybody sees the updated value, 3452 * so they won't put stuff in the queue again for no reason 3453 */ 3454 wmb(); 3455 out: 3456 mutex_unlock(&memcg_cache_mutex); 3457 return new_cachep; 3458 } 3459 3460 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3461 { 3462 struct kmem_cache *c; 3463 int i; 3464 3465 if (!s->memcg_params) 3466 return; 3467 if (!s->memcg_params->is_root_cache) 3468 return; 3469 3470 /* 3471 * If the cache is being destroyed, we trust that there is no one else 3472 * requesting objects from it. Even if there are, the sanity checks in 3473 * kmem_cache_destroy should caught this ill-case. 3474 * 3475 * Still, we don't want anyone else freeing memcg_caches under our 3476 * noses, which can happen if a new memcg comes to life. As usual, 3477 * we'll take the set_limit_mutex to protect ourselves against this. 
3478 */ 3479 mutex_lock(&set_limit_mutex); 3480 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3481 c = s->memcg_params->memcg_caches[i]; 3482 if (!c) 3483 continue; 3484 3485 /* 3486 * We will now manually delete the caches, so to avoid races 3487 * we need to cancel all pending destruction workers and 3488 * proceed with destruction ourselves. 3489 * 3490 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3491 * and that could spawn the workers again: it is likely that 3492 * the cache still have active pages until this very moment. 3493 * This would lead us back to mem_cgroup_destroy_cache. 3494 * 3495 * But that will not execute at all if the "dead" flag is not 3496 * set, so flip it down to guarantee we are in control. 3497 */ 3498 c->memcg_params->dead = false; 3499 cancel_work_sync(&c->memcg_params->destroy); 3500 kmem_cache_destroy(c); 3501 } 3502 mutex_unlock(&set_limit_mutex); 3503 } 3504 3505 struct create_work { 3506 struct mem_cgroup *memcg; 3507 struct kmem_cache *cachep; 3508 struct work_struct work; 3509 }; 3510 3511 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3512 { 3513 struct kmem_cache *cachep; 3514 struct memcg_cache_params *params; 3515 3516 if (!memcg_kmem_is_active(memcg)) 3517 return; 3518 3519 mutex_lock(&memcg->slab_caches_mutex); 3520 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3521 cachep = memcg_params_to_cache(params); 3522 cachep->memcg_params->dead = true; 3523 schedule_work(&cachep->memcg_params->destroy); 3524 } 3525 mutex_unlock(&memcg->slab_caches_mutex); 3526 } 3527 3528 static void memcg_create_cache_work_func(struct work_struct *w) 3529 { 3530 struct create_work *cw; 3531 3532 cw = container_of(w, struct create_work, work); 3533 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3534 kfree(cw); 3535 } 3536 3537 /* 3538 * Enqueue the creation of a per-memcg kmem_cache. 3539 */ 3540 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3541 struct kmem_cache *cachep) 3542 { 3543 struct create_work *cw; 3544 3545 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3546 if (cw == NULL) { 3547 css_put(&memcg->css); 3548 return; 3549 } 3550 3551 cw->memcg = memcg; 3552 cw->cachep = cachep; 3553 3554 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3555 schedule_work(&cw->work); 3556 } 3557 3558 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3559 struct kmem_cache *cachep) 3560 { 3561 /* 3562 * We need to stop accounting when we kmalloc, because if the 3563 * corresponding kmalloc cache is not yet created, the first allocation 3564 * in __memcg_create_cache_enqueue will recurse. 3565 * 3566 * However, it is better to enclose the whole function. Depending on 3567 * the debugging options enabled, INIT_WORK(), for instance, can 3568 * trigger an allocation. This too, will make us recurse. Because at 3569 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3570 * the safest choice is to do it like this, wrapping the whole function. 3571 */ 3572 memcg_stop_kmem_account(); 3573 __memcg_create_cache_enqueue(memcg, cachep); 3574 memcg_resume_kmem_account(); 3575 } 3576 /* 3577 * Return the kmem_cache we're supposed to use for a slab allocation. 3578 * We try to use the current memcg's version of the cache. 3579 * 3580 * If the cache does not exist yet, if we are the first user of it, 3581 * we either create it immediately, if possible, or create it asynchronously 3582 * in a workqueue. 
3583 * In the latter case, we will let the current allocation go through with 3584 * the original cache. 3585 * 3586 * Can't be called in interrupt context or from kernel threads. 3587 * This function needs to be called with rcu_read_lock() held. 3588 */ 3589 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3590 gfp_t gfp) 3591 { 3592 struct mem_cgroup *memcg; 3593 int idx; 3594 3595 VM_BUG_ON(!cachep->memcg_params); 3596 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3597 3598 if (!current->mm || current->memcg_kmem_skip_account) 3599 return cachep; 3600 3601 rcu_read_lock(); 3602 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3603 3604 if (!memcg_can_account_kmem(memcg)) 3605 goto out; 3606 3607 idx = memcg_cache_id(memcg); 3608 3609 /* 3610 * barrier to make sure we're always seeing the up-to-date value. The 3611 * code updating memcg_caches will issue a write barrier to match this. 3612 */ 3613 read_barrier_depends(); 3614 if (likely(cachep->memcg_params->memcg_caches[idx])) { 3615 cachep = cachep->memcg_params->memcg_caches[idx]; 3616 goto out; 3617 } 3618 3619 /* The corresponding put will be done in the workqueue. */ 3620 if (!css_tryget(&memcg->css)) 3621 goto out; 3622 rcu_read_unlock(); 3623 3624 /* 3625 * If we are in a safe context (can wait, and not in interrupt 3626 * context), we could be predictable and return right away. 3627 * This would guarantee that the allocation being performed 3628 * already belongs in the new cache. 3629 * 3630 * However, there are some clashes that can arise from locking. 3631 * For instance, because we acquire the slab_mutex while doing 3632 * kmem_cache_dup, this means no further allocation could happen 3633 * with the slab_mutex held. 3634 * 3635 * Also, because cache creation issues get_online_cpus(), this 3636 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3637 * that ends up reversed during cpu hotplug. (cpuset allocates 3638 * a bunch of GFP_KERNEL memory during cpu bringup). Due to all that, 3639 * it is better to defer everything. 3640 */ 3641 memcg_create_cache_enqueue(memcg, cachep); 3642 return cachep; 3643 out: 3644 rcu_read_unlock(); 3645 return cachep; 3646 } 3647 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3648 3649 /* 3650 * We need to verify if the allocation against current->mm->owner's memcg is 3651 * possible for the given order. But the page is not allocated yet, so we'll 3652 * need a further commit step to do the final arrangements. 3653 * 3654 * It is possible for the task to switch cgroups in the meantime, so at 3655 * commit time, we can't rely on task conversion any longer. We'll then use 3656 * the handle argument to return to the caller which cgroup we should commit 3657 * against. We could also return the memcg directly and avoid the pointer 3658 * passing, but a boolean return value gives better semantics considering 3659 * the compiled-out case as well. 3660 * 3661 * Returning true means the allocation is possible. 3662 */ 3663 bool 3664 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3665 { 3666 struct mem_cgroup *memcg; 3667 int ret; 3668 3669 *_memcg = NULL; 3670 3671 /* 3672 * Disabling accounting is only relevant for some specific memcg 3673 * internal allocations. Therefore we would initially not have such 3674 * a check here, since direct calls to the page allocator that are marked 3675 * with GFP_KMEMCG only happen outside the memcg core.
We are mostly 3676 * concerned with cache allocations, and by having this test at 3677 * memcg_kmem_get_cache, we are already able to relay the allocation to 3678 * the root cache and bypass the memcg cache altogether. 3679 * 3680 * There is one exception, though: the SLUB allocator does not create 3681 * large order caches, but rather service large kmallocs directly from 3682 * the page allocator. Therefore, the following sequence when backed by 3683 * the SLUB allocator: 3684 * 3685 * memcg_stop_kmem_account(); 3686 * kmalloc(<large_number>) 3687 * memcg_resume_kmem_account(); 3688 * 3689 * would effectively ignore the fact that we should skip accounting, 3690 * since it will drive us directly to this function without passing 3691 * through the cache selector memcg_kmem_get_cache. Such large 3692 * allocations are extremely rare but can happen, for instance, for the 3693 * cache arrays. We bring this test here. 3694 */ 3695 if (!current->mm || current->memcg_kmem_skip_account) 3696 return true; 3697 3698 memcg = try_get_mem_cgroup_from_mm(current->mm); 3699 3700 /* 3701 * very rare case described in mem_cgroup_from_task. Unfortunately there 3702 * isn't much we can do without complicating this too much, and it would 3703 * be gfp-dependent anyway. Just let it go 3704 */ 3705 if (unlikely(!memcg)) 3706 return true; 3707 3708 if (!memcg_can_account_kmem(memcg)) { 3709 css_put(&memcg->css); 3710 return true; 3711 } 3712 3713 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3714 if (!ret) 3715 *_memcg = memcg; 3716 3717 css_put(&memcg->css); 3718 return (ret == 0); 3719 } 3720 3721 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3722 int order) 3723 { 3724 struct page_cgroup *pc; 3725 3726 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3727 3728 /* The page allocation failed. Revert */ 3729 if (!page) { 3730 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3731 return; 3732 } 3733 3734 pc = lookup_page_cgroup(page); 3735 lock_page_cgroup(pc); 3736 pc->mem_cgroup = memcg; 3737 SetPageCgroupUsed(pc); 3738 unlock_page_cgroup(pc); 3739 } 3740 3741 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3742 { 3743 struct mem_cgroup *memcg = NULL; 3744 struct page_cgroup *pc; 3745 3746 3747 pc = lookup_page_cgroup(page); 3748 /* 3749 * Fast unlocked return. Theoretically might have changed, have to 3750 * check again after locking. 3751 */ 3752 if (!PageCgroupUsed(pc)) 3753 return; 3754 3755 lock_page_cgroup(pc); 3756 if (PageCgroupUsed(pc)) { 3757 memcg = pc->mem_cgroup; 3758 ClearPageCgroupUsed(pc); 3759 } 3760 unlock_page_cgroup(pc); 3761 3762 /* 3763 * We trust that only if there is a memcg associated with the page, it 3764 * is a valid allocation 3765 */ 3766 if (!memcg) 3767 return; 3768 3769 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3770 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3771 } 3772 #else 3773 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3774 { 3775 } 3776 #endif /* CONFIG_MEMCG_KMEM */ 3777 3778 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3779 3780 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3781 /* 3782 * Because tail pages are not marked as "used", set it. We're under 3783 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3784 * charge/uncharge will be never happen and move_account() is done under 3785 * compound_lock(), so we don't have to take care of races. 
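 * Each tail page inherits the head page's mem_cgroup and its flags, except for PCG_LOCK and PCG_MIGRATION.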
3786 */ 3787 void mem_cgroup_split_huge_fixup(struct page *head) 3788 { 3789 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3790 struct page_cgroup *pc; 3791 struct mem_cgroup *memcg; 3792 int i; 3793 3794 if (mem_cgroup_disabled()) 3795 return; 3796 3797 memcg = head_pc->mem_cgroup; 3798 for (i = 1; i < HPAGE_PMD_NR; i++) { 3799 pc = head_pc + i; 3800 pc->mem_cgroup = memcg; 3801 smp_wmb();/* see __commit_charge() */ 3802 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3803 } 3804 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3805 HPAGE_PMD_NR); 3806 } 3807 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3808 3809 static inline 3810 void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, 3811 struct mem_cgroup *to, 3812 unsigned int nr_pages, 3813 enum mem_cgroup_stat_index idx) 3814 { 3815 /* Update stat data for mem_cgroup */ 3816 preempt_disable(); 3817 __this_cpu_sub(from->stat->count[idx], nr_pages); 3818 __this_cpu_add(to->stat->count[idx], nr_pages); 3819 preempt_enable(); 3820 } 3821 3822 /** 3823 * mem_cgroup_move_account - move account of the page 3824 * @page: the page 3825 * @nr_pages: number of regular pages (>1 for huge pages) 3826 * @pc: page_cgroup of the page. 3827 * @from: mem_cgroup which the page is moved from. 3828 * @to: mem_cgroup which the page is moved to. @from != @to. 3829 * 3830 * The caller must confirm following. 3831 * - page is not on LRU (isolate_page() is useful.) 3832 * - compound_lock is held when nr_pages > 1 3833 * 3834 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3835 * from old cgroup. 3836 */ 3837 static int mem_cgroup_move_account(struct page *page, 3838 unsigned int nr_pages, 3839 struct page_cgroup *pc, 3840 struct mem_cgroup *from, 3841 struct mem_cgroup *to) 3842 { 3843 unsigned long flags; 3844 int ret; 3845 bool anon = PageAnon(page); 3846 3847 VM_BUG_ON(from == to); 3848 VM_BUG_ON(PageLRU(page)); 3849 /* 3850 * The page is isolated from LRU. So, collapse function 3851 * will not handle this page. But page splitting can happen. 3852 * Do this check under compound_page_lock(). The caller should 3853 * hold it. 3854 */ 3855 ret = -EBUSY; 3856 if (nr_pages > 1 && !PageTransHuge(page)) 3857 goto out; 3858 3859 lock_page_cgroup(pc); 3860 3861 ret = -EINVAL; 3862 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3863 goto unlock; 3864 3865 move_lock_mem_cgroup(from, &flags); 3866 3867 if (!anon && page_mapped(page)) 3868 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3869 MEM_CGROUP_STAT_FILE_MAPPED); 3870 3871 if (PageWriteback(page)) 3872 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3873 MEM_CGROUP_STAT_WRITEBACK); 3874 3875 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3876 3877 /* caller should have done css_get */ 3878 pc->mem_cgroup = to; 3879 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3880 move_unlock_mem_cgroup(from, &flags); 3881 ret = 0; 3882 unlock: 3883 unlock_page_cgroup(pc); 3884 /* 3885 * check events 3886 */ 3887 memcg_check_events(to, page); 3888 memcg_check_events(from, page); 3889 out: 3890 return ret; 3891 } 3892 3893 /** 3894 * mem_cgroup_move_parent - moves page to the parent group 3895 * @page: the page to move 3896 * @pc: page_cgroup of the page 3897 * @child: page's cgroup 3898 * 3899 * move charges to its parent or the root cgroup if the group has no 3900 * parent (aka use_hierarchy==0). 
3901 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3902 * mem_cgroup_move_account fails) the failure is always temporary and 3903 * it signals a race with a page removal/uncharge or migration. In the 3904 * first case the page is on the way out and it will vanish from the LRU 3905 * on the next attempt and the call should be retried later. 3906 * Isolation from the LRU fails only if page has been isolated from 3907 * the LRU since we looked at it and that usually means either global 3908 * reclaim or migration going on. The page will either get back to the 3909 * LRU or vanish. 3910 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3911 * (!PageCgroupUsed) or moved to a different group. The page will 3912 * disappear in the next attempt. 3913 */ 3914 static int mem_cgroup_move_parent(struct page *page, 3915 struct page_cgroup *pc, 3916 struct mem_cgroup *child) 3917 { 3918 struct mem_cgroup *parent; 3919 unsigned int nr_pages; 3920 unsigned long uninitialized_var(flags); 3921 int ret; 3922 3923 VM_BUG_ON(mem_cgroup_is_root(child)); 3924 3925 ret = -EBUSY; 3926 if (!get_page_unless_zero(page)) 3927 goto out; 3928 if (isolate_lru_page(page)) 3929 goto put; 3930 3931 nr_pages = hpage_nr_pages(page); 3932 3933 parent = parent_mem_cgroup(child); 3934 /* 3935 * If no parent, move charges to root cgroup. 3936 */ 3937 if (!parent) 3938 parent = root_mem_cgroup; 3939 3940 if (nr_pages > 1) { 3941 VM_BUG_ON(!PageTransHuge(page)); 3942 flags = compound_lock_irqsave(page); 3943 } 3944 3945 ret = mem_cgroup_move_account(page, nr_pages, 3946 pc, child, parent); 3947 if (!ret) 3948 __mem_cgroup_cancel_local_charge(child, nr_pages); 3949 3950 if (nr_pages > 1) 3951 compound_unlock_irqrestore(page, flags); 3952 putback_lru_page(page); 3953 put: 3954 put_page(page); 3955 out: 3956 return ret; 3957 } 3958 3959 /* 3960 * Charge the memory controller for page usage. 3961 * Return 3962 * 0 if the charge was successful 3963 * < 0 if the cgroup is over its limit 3964 */ 3965 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 3966 gfp_t gfp_mask, enum charge_type ctype) 3967 { 3968 struct mem_cgroup *memcg = NULL; 3969 unsigned int nr_pages = 1; 3970 bool oom = true; 3971 int ret; 3972 3973 if (PageTransHuge(page)) { 3974 nr_pages <<= compound_order(page); 3975 VM_BUG_ON(!PageTransHuge(page)); 3976 /* 3977 * Never OOM-kill a process for a huge page. The 3978 * fault handler will fall back to regular pages. 3979 */ 3980 oom = false; 3981 } 3982 3983 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 3984 if (ret == -ENOMEM) 3985 return ret; 3986 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 3987 return 0; 3988 } 3989 3990 int mem_cgroup_newpage_charge(struct page *page, 3991 struct mm_struct *mm, gfp_t gfp_mask) 3992 { 3993 if (mem_cgroup_disabled()) 3994 return 0; 3995 VM_BUG_ON(page_mapped(page)); 3996 VM_BUG_ON(page->mapping && !PageAnon(page)); 3997 VM_BUG_ON(!mm); 3998 return mem_cgroup_charge_common(page, mm, gfp_mask, 3999 MEM_CGROUP_CHARGE_TYPE_ANON); 4000 } 4001 4002 /* 4003 * While swap-in, try_charge -> commit or cancel, the page is locked. 4004 * And when try_charge() successfully returns, one refcnt to memcg without 4005 * struct page_cgroup is acquired. 
This refcnt will be consumed by 4006 * "commit()" or removed by "cancel()" 4007 */ 4008 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 4009 struct page *page, 4010 gfp_t mask, 4011 struct mem_cgroup **memcgp) 4012 { 4013 struct mem_cgroup *memcg; 4014 struct page_cgroup *pc; 4015 int ret; 4016 4017 pc = lookup_page_cgroup(page); 4018 /* 4019 * Every swap fault against a single page tries to charge the 4020 * page, bail as early as possible. shmem_unuse() encounters 4021 * already charged pages, too. The USED bit is protected by 4022 * the page lock, which serializes swap cache removal, which 4023 * in turn serializes uncharging. 4024 */ 4025 if (PageCgroupUsed(pc)) 4026 return 0; 4027 if (!do_swap_account) 4028 goto charge_cur_mm; 4029 memcg = try_get_mem_cgroup_from_page(page); 4030 if (!memcg) 4031 goto charge_cur_mm; 4032 *memcgp = memcg; 4033 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 4034 css_put(&memcg->css); 4035 if (ret == -EINTR) 4036 ret = 0; 4037 return ret; 4038 charge_cur_mm: 4039 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 4040 if (ret == -EINTR) 4041 ret = 0; 4042 return ret; 4043 } 4044 4045 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 4046 gfp_t gfp_mask, struct mem_cgroup **memcgp) 4047 { 4048 *memcgp = NULL; 4049 if (mem_cgroup_disabled()) 4050 return 0; 4051 /* 4052 * A racing thread's fault, or swapoff, may have already 4053 * updated the pte, and even removed page from swap cache: in 4054 * those cases unuse_pte()'s pte_same() test will fail; but 4055 * there's also a KSM case which does need to charge the page. 4056 */ 4057 if (!PageSwapCache(page)) { 4058 int ret; 4059 4060 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); 4061 if (ret == -EINTR) 4062 ret = 0; 4063 return ret; 4064 } 4065 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 4066 } 4067 4068 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 4069 { 4070 if (mem_cgroup_disabled()) 4071 return; 4072 if (!memcg) 4073 return; 4074 __mem_cgroup_cancel_charge(memcg, 1); 4075 } 4076 4077 static void 4078 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 4079 enum charge_type ctype) 4080 { 4081 if (mem_cgroup_disabled()) 4082 return; 4083 if (!memcg) 4084 return; 4085 4086 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 4087 /* 4088 * Now swap is on-memory. This means this page may be 4089 * counted both as mem and swap....double count. 4090 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 4091 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 4092 * may call delete_from_swap_cache() before reach here. 
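	 *
	 * For illustration only, a swap-in caller (the do_swap_page() path is
	 * assumed here) pairs the calls roughly like this:
	 *
	 *	mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg);
	 *	... map the page ...
	 *	mem_cgroup_commit_charge_swapin(page, memcg);
	 *
	 * and calls mem_cgroup_cancel_charge_swapin(memcg) instead of the
	 * commit when the fault is aborted.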
4093 	 */
4094 	if (do_swap_account && PageSwapCache(page)) {
4095 		swp_entry_t ent = {.val = page_private(page)};
4096 		mem_cgroup_uncharge_swap(ent);
4097 	}
4098 }
4099 
4100 void mem_cgroup_commit_charge_swapin(struct page *page,
4101 				     struct mem_cgroup *memcg)
4102 {
4103 	__mem_cgroup_commit_charge_swapin(page, memcg,
4104 					  MEM_CGROUP_CHARGE_TYPE_ANON);
4105 }
4106 
4107 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4108 			    gfp_t gfp_mask)
4109 {
4110 	struct mem_cgroup *memcg = NULL;
4111 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4112 	int ret;
4113 
4114 	if (mem_cgroup_disabled())
4115 		return 0;
4116 	if (PageCompound(page))
4117 		return 0;
4118 
4119 	if (!PageSwapCache(page))
4120 		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4121 	else { /* page is swapcache/shmem */
4122 		ret = __mem_cgroup_try_charge_swapin(mm, page,
4123 						     gfp_mask, &memcg);
4124 		if (!ret)
4125 			__mem_cgroup_commit_charge_swapin(page, memcg, type);
4126 	}
4127 	return ret;
4128 }
4129 
4130 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
4131 				   unsigned int nr_pages,
4132 				   const enum charge_type ctype)
4133 {
4134 	struct memcg_batch_info *batch = NULL;
4135 	bool uncharge_memsw = true;
4136 
4137 	/* If swapout, usage of swap doesn't decrease */
4138 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4139 		uncharge_memsw = false;
4140 
4141 	batch = &current->memcg_batch;
4142 	/*
4143 	 * Usually, we do css_get() when we remember the memcg pointer.
4144 	 * But in this case, we keep res->usage until the end of a series of
4145 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
4146 	 */
4147 	if (!batch->memcg)
4148 		batch->memcg = memcg;
4149 	/*
4150 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4151 	 * In those cases, all pages freed continuously can be expected to be in
4152 	 * the same cgroup and we have a chance to coalesce uncharges.
4153 	 * But we do uncharge one by one if this task is killed by OOM (TIF_MEMDIE)
4154 	 * because we want to do the uncharge as soon as possible.
4155 	 */
4156 
4157 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4158 		goto direct_uncharge;
4159 
4160 	if (nr_pages > 1)
4161 		goto direct_uncharge;
4162 
4163 	/*
4164 	 * In the typical case, batch->memcg == memcg. This means we can
4165 	 * merge a series of uncharges into one uncharge of the res_counter.
4166 	 * If not, we uncharge the res_counter one by one.
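	 *
	 * A minimal sketch of the bracket a batching caller is assumed to use
	 * (the munmap/truncate paths being the real users):
	 *
	 *	mem_cgroup_uncharge_start();
	 *	for each page being freed:
	 *		mem_cgroup_uncharge_page(page);	(or the cache variant)
	 *	mem_cgroup_uncharge_end();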
4167 */ 4168 if (batch->memcg != memcg) 4169 goto direct_uncharge; 4170 /* remember freed charge and uncharge it later */ 4171 batch->nr_pages++; 4172 if (uncharge_memsw) 4173 batch->memsw_nr_pages++; 4174 return; 4175 direct_uncharge: 4176 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4177 if (uncharge_memsw) 4178 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4179 if (unlikely(batch->memcg != memcg)) 4180 memcg_oom_recover(memcg); 4181 } 4182 4183 /* 4184 * uncharge if !page_mapped(page) 4185 */ 4186 static struct mem_cgroup * 4187 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4188 bool end_migration) 4189 { 4190 struct mem_cgroup *memcg = NULL; 4191 unsigned int nr_pages = 1; 4192 struct page_cgroup *pc; 4193 bool anon; 4194 4195 if (mem_cgroup_disabled()) 4196 return NULL; 4197 4198 if (PageTransHuge(page)) { 4199 nr_pages <<= compound_order(page); 4200 VM_BUG_ON(!PageTransHuge(page)); 4201 } 4202 /* 4203 * Check if our page_cgroup is valid 4204 */ 4205 pc = lookup_page_cgroup(page); 4206 if (unlikely(!PageCgroupUsed(pc))) 4207 return NULL; 4208 4209 lock_page_cgroup(pc); 4210 4211 memcg = pc->mem_cgroup; 4212 4213 if (!PageCgroupUsed(pc)) 4214 goto unlock_out; 4215 4216 anon = PageAnon(page); 4217 4218 switch (ctype) { 4219 case MEM_CGROUP_CHARGE_TYPE_ANON: 4220 /* 4221 * Generally PageAnon tells if it's the anon statistics to be 4222 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4223 * used before page reached the stage of being marked PageAnon. 4224 */ 4225 anon = true; 4226 /* fallthrough */ 4227 case MEM_CGROUP_CHARGE_TYPE_DROP: 4228 /* See mem_cgroup_prepare_migration() */ 4229 if (page_mapped(page)) 4230 goto unlock_out; 4231 /* 4232 * Pages under migration may not be uncharged. But 4233 * end_migration() /must/ be the one uncharging the 4234 * unused post-migration page and so it has to call 4235 * here with the migration bit still set. See the 4236 * res_counter handling below. 4237 */ 4238 if (!end_migration && PageCgroupMigration(pc)) 4239 goto unlock_out; 4240 break; 4241 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4242 if (!PageAnon(page)) { /* Shared memory */ 4243 if (page->mapping && !page_is_file_cache(page)) 4244 goto unlock_out; 4245 } else if (page_mapped(page)) /* Anon */ 4246 goto unlock_out; 4247 break; 4248 default: 4249 break; 4250 } 4251 4252 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4253 4254 ClearPageCgroupUsed(pc); 4255 /* 4256 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4257 * freed from LRU. This is safe because uncharged page is expected not 4258 * to be reused (freed soon). Exception is SwapCache, it's handled by 4259 * special functions. 4260 */ 4261 4262 unlock_page_cgroup(pc); 4263 /* 4264 * even after unlock, we have memcg->res.usage here and this memcg 4265 * will never be freed, so it's safe to call css_get(). 4266 */ 4267 memcg_check_events(memcg, page); 4268 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4269 mem_cgroup_swap_statistics(memcg, true); 4270 css_get(&memcg->css); 4271 } 4272 /* 4273 * Migration does not charge the res_counter for the 4274 * replacement page, so leave it alone when phasing out the 4275 * page that is unused after the migration. 
4276 	 */
4277 	if (!end_migration && !mem_cgroup_is_root(memcg))
4278 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4279 
4280 	return memcg;
4281 
4282 unlock_out:
4283 	unlock_page_cgroup(pc);
4284 	return NULL;
4285 }
4286 
4287 void mem_cgroup_uncharge_page(struct page *page)
4288 {
4289 	/* early check. */
4290 	if (page_mapped(page))
4291 		return;
4292 	VM_BUG_ON(page->mapping && !PageAnon(page));
4293 	/*
4294 	 * If the page is in swap cache, uncharge should be deferred
4295 	 * to the swap path, which also properly accounts swap usage
4296 	 * and handles memcg lifetime.
4297 	 *
4298 	 * Note that this check is not stable and reclaim may add the
4299 	 * page to swap cache at any time after this. However, if the
4300 	 * page is not in swap cache by the time page->mapcount hits
4301 	 * 0, there won't be any page table references to the swap
4302 	 * slot, and reclaim will free it and not actually write the
4303 	 * page to disk.
4304 	 */
4305 	if (PageSwapCache(page))
4306 		return;
4307 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4308 }
4309 
4310 void mem_cgroup_uncharge_cache_page(struct page *page)
4311 {
4312 	VM_BUG_ON(page_mapped(page));
4313 	VM_BUG_ON(page->mapping);
4314 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4315 }
4316 
4317 /*
4318  * Batch start/end is called from unmap_page_range/invalidate/truncate.
4319  * In those cases, pages are freed continuously and we can expect them to
4320  * be in the same memcg. All these callers themselves limit the number of
4321  * pages freed at once, so uncharge_start/end() is called properly.
4322  * This may be called multiple (nested) times in a context.
4323  */
4324 
4325 void mem_cgroup_uncharge_start(void)
4326 {
4327 	current->memcg_batch.do_batch++;
4328 	/* We can do nest. */
4329 	if (current->memcg_batch.do_batch == 1) {
4330 		current->memcg_batch.memcg = NULL;
4331 		current->memcg_batch.nr_pages = 0;
4332 		current->memcg_batch.memsw_nr_pages = 0;
4333 	}
4334 }
4335 
4336 void mem_cgroup_uncharge_end(void)
4337 {
4338 	struct memcg_batch_info *batch = &current->memcg_batch;
4339 
4340 	if (!batch->do_batch)
4341 		return;
4342 
4343 	batch->do_batch--;
4344 	if (batch->do_batch) /* If stacked, do nothing. */
4345 		return;
4346 
4347 	if (!batch->memcg)
4348 		return;
4349 	/*
4350 	 * This "batch->memcg" is valid without any css_get/put etc...
4351 	 * because we hide charges behind us.
4352 	 */
4353 	if (batch->nr_pages)
4354 		res_counter_uncharge(&batch->memcg->res,
4355 				     batch->nr_pages * PAGE_SIZE);
4356 	if (batch->memsw_nr_pages)
4357 		res_counter_uncharge(&batch->memcg->memsw,
4358 				     batch->memsw_nr_pages * PAGE_SIZE);
4359 	memcg_oom_recover(batch->memcg);
4360 	/* forget this pointer (for sanity check) */
4361 	batch->memcg = NULL;
4362 }
4363 
4364 #ifdef CONFIG_SWAP
4365 /*
4366  * Called after __delete_from_swap_cache() to drop the "page" account.
4367  * The memcg information is recorded in the swap_cgroup of "ent".
4368  */
4369 void
4370 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4371 {
4372 	struct mem_cgroup *memcg;
4373 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4374 
4375 	if (!swapout) /* this was a swap cache but the swap is unused ! */
4376 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4377 
4378 	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4379 
4380 	/*
4381 	 * Record the memcg information; if swapout && memcg != NULL,
4382 	 * css_get() was called in uncharge().
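	 *
	 * For example (illustrative values): when a page charged to memcg M is
	 * swapped out to entry E, swap_cgroup_record(E, css_id(&M->css)) below
	 * remembers M, so that a later mem_cgroup_uncharge_swap(E) can drop M's
	 * memsw charge once the swap entry is freed.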
4383 */ 4384 if (do_swap_account && swapout && memcg) 4385 swap_cgroup_record(ent, css_id(&memcg->css)); 4386 } 4387 #endif 4388 4389 #ifdef CONFIG_MEMCG_SWAP 4390 /* 4391 * called from swap_entry_free(). remove record in swap_cgroup and 4392 * uncharge "memsw" account. 4393 */ 4394 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4395 { 4396 struct mem_cgroup *memcg; 4397 unsigned short id; 4398 4399 if (!do_swap_account) 4400 return; 4401 4402 id = swap_cgroup_record(ent, 0); 4403 rcu_read_lock(); 4404 memcg = mem_cgroup_lookup(id); 4405 if (memcg) { 4406 /* 4407 * We uncharge this because swap is freed. 4408 * This memcg can be obsolete one. We avoid calling css_tryget 4409 */ 4410 if (!mem_cgroup_is_root(memcg)) 4411 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4412 mem_cgroup_swap_statistics(memcg, false); 4413 css_put(&memcg->css); 4414 } 4415 rcu_read_unlock(); 4416 } 4417 4418 /** 4419 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4420 * @entry: swap entry to be moved 4421 * @from: mem_cgroup which the entry is moved from 4422 * @to: mem_cgroup which the entry is moved to 4423 * 4424 * It succeeds only when the swap_cgroup's record for this entry is the same 4425 * as the mem_cgroup's id of @from. 4426 * 4427 * Returns 0 on success, -EINVAL on failure. 4428 * 4429 * The caller must have charged to @to, IOW, called res_counter_charge() about 4430 * both res and memsw, and called css_get(). 4431 */ 4432 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4433 struct mem_cgroup *from, struct mem_cgroup *to) 4434 { 4435 unsigned short old_id, new_id; 4436 4437 old_id = css_id(&from->css); 4438 new_id = css_id(&to->css); 4439 4440 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4441 mem_cgroup_swap_statistics(from, false); 4442 mem_cgroup_swap_statistics(to, true); 4443 /* 4444 * This function is only called from task migration context now. 4445 * It postpones res_counter and refcount handling till the end 4446 * of task migration(mem_cgroup_clear_mc()) for performance 4447 * improvement. But we cannot postpone css_get(to) because if 4448 * the process that has been moved to @to does swap-in, the 4449 * refcount of @to might be decreased to 0. 4450 * 4451 * We are in attach() phase, so the cgroup is guaranteed to be 4452 * alive, so we can just call css_get(). 4453 */ 4454 css_get(&to->css); 4455 return 0; 4456 } 4457 return -EINVAL; 4458 } 4459 #else 4460 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4461 struct mem_cgroup *from, struct mem_cgroup *to) 4462 { 4463 return -EINVAL; 4464 } 4465 #endif 4466 4467 /* 4468 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4469 * page belongs to. 4470 */ 4471 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4472 struct mem_cgroup **memcgp) 4473 { 4474 struct mem_cgroup *memcg = NULL; 4475 unsigned int nr_pages = 1; 4476 struct page_cgroup *pc; 4477 enum charge_type ctype; 4478 4479 *memcgp = NULL; 4480 4481 if (mem_cgroup_disabled()) 4482 return; 4483 4484 if (PageTransHuge(page)) 4485 nr_pages <<= compound_order(page); 4486 4487 pc = lookup_page_cgroup(page); 4488 lock_page_cgroup(pc); 4489 if (PageCgroupUsed(pc)) { 4490 memcg = pc->mem_cgroup; 4491 css_get(&memcg->css); 4492 /* 4493 * At migrating an anonymous page, its mapcount goes down 4494 * to 0 and uncharge() will be called. But, even if it's fully 4495 * unmapped, migration may fail and this page has to be 4496 * charged again. 
We set the MIGRATION flag here and delay the uncharge
4497 		 * until end_migration() is called.
4498 		 *
4499 		 * Corner case thinking:
4500 		 * A)
4501 		 * The old page was mapped as Anon and it is unmapped and
4502 		 * freed while migration is ongoing.
4503 		 * If unmap finds the old page, uncharge() of it will be delayed
4504 		 * until end_migration(). If unmap finds the new page, it's
4505 		 * uncharged when its mapcount goes from 1 to 0. If the unmap code
4506 		 * finds a swap_migration_entry, the new page will not be mapped
4507 		 * and end_migration() will find it (mapcount == 0).
4508 		 *
4509 		 * B)
4510 		 * The old page was mapped but migration fails, so the kernel
4511 		 * remaps it. A charge for it is kept by the MIGRATION flag even
4512 		 * if its mapcount goes down to 0. We can remap successfully
4513 		 * without charging it again.
4514 		 *
4515 		 * C)
4516 		 * The "old" page is under lock_page() until the end of
4517 		 * migration, so the old page itself will not be swapped out.
4518 		 * If the new page is swapped out before end_migration, our
4519 		 * hook into the usual swap-out path will catch the event.
4520 		 */
4521 		if (PageAnon(page))
4522 			SetPageCgroupMigration(pc);
4523 	}
4524 	unlock_page_cgroup(pc);
4525 	/*
4526 	 * If the page is not charged at this point,
4527 	 * we return here.
4528 	 */
4529 	if (!memcg)
4530 		return;
4531 
4532 	*memcgp = memcg;
4533 	/*
4534 	 * We charge the new page before it's used/mapped. So, even if unlock_page()
4535 	 * is called before end_migration, we can catch all events on this new
4536 	 * page. In case the new page is migrated but not remapped, its
4537 	 * mapcount will finally be 0 and we call uncharge in end_migration().
4538 	 */
4539 	if (PageAnon(page))
4540 		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4541 	else
4542 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4543 	/*
4544 	 * The page is committed to the memcg, but it's not actually
4545 	 * charged to the res_counter since we plan on replacing the
4546 	 * old one and only one page is going to be left afterwards.
4547 	 */
4548 	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4549 }
4550 
4551 /* remove the redundant charge if migration failed */
4552 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4553 	struct page *oldpage, struct page *newpage, bool migration_ok)
4554 {
4555 	struct page *used, *unused;
4556 	struct page_cgroup *pc;
4557 	bool anon;
4558 
4559 	if (!memcg)
4560 		return;
4561 
4562 	if (!migration_ok) {
4563 		used = oldpage;
4564 		unused = newpage;
4565 	} else {
4566 		used = newpage;
4567 		unused = oldpage;
4568 	}
4569 	anon = PageAnon(used);
4570 	__mem_cgroup_uncharge_common(unused,
4571 				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4572 					  : MEM_CGROUP_CHARGE_TYPE_CACHE,
4573 				     true);
4574 	css_put(&memcg->css);
4575 	/*
4576 	 * We disallowed uncharge of pages under migration because the mapcount
4577 	 * of the page goes down to zero, temporarily.
4578 	 * Clear the flag and check whether the page should still be charged.
4579 	 */
4580 	pc = lookup_page_cgroup(oldpage);
4581 	lock_page_cgroup(pc);
4582 	ClearPageCgroupMigration(pc);
4583 	unlock_page_cgroup(pc);
4584 
4585 	/*
4586 	 * If the page is a file cache, the radix-tree replacement is done
4587 	 * atomically and we can skip this check. When it was an Anon page, its
4588 	 * mapcount goes down to 0. But because we added the MIGRATION flag, it's
4589 	 * not uncharged yet. There are several cases, but the page->mapcount check
4590 	 * and the USED bit check in mem_cgroup_uncharge_page() will do enough
4591 	 * check.
(see prepare_charge() also) 4592 */ 4593 if (anon) 4594 mem_cgroup_uncharge_page(used); 4595 } 4596 4597 /* 4598 * At replace page cache, newpage is not under any memcg but it's on 4599 * LRU. So, this function doesn't touch res_counter but handles LRU 4600 * in correct way. Both pages are locked so we cannot race with uncharge. 4601 */ 4602 void mem_cgroup_replace_page_cache(struct page *oldpage, 4603 struct page *newpage) 4604 { 4605 struct mem_cgroup *memcg = NULL; 4606 struct page_cgroup *pc; 4607 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4608 4609 if (mem_cgroup_disabled()) 4610 return; 4611 4612 pc = lookup_page_cgroup(oldpage); 4613 /* fix accounting on old pages */ 4614 lock_page_cgroup(pc); 4615 if (PageCgroupUsed(pc)) { 4616 memcg = pc->mem_cgroup; 4617 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4618 ClearPageCgroupUsed(pc); 4619 } 4620 unlock_page_cgroup(pc); 4621 4622 /* 4623 * When called from shmem_replace_page(), in some cases the 4624 * oldpage has already been charged, and in some cases not. 4625 */ 4626 if (!memcg) 4627 return; 4628 /* 4629 * Even if newpage->mapping was NULL before starting replacement, 4630 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4631 * LRU while we overwrite pc->mem_cgroup. 4632 */ 4633 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4634 } 4635 4636 #ifdef CONFIG_DEBUG_VM 4637 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4638 { 4639 struct page_cgroup *pc; 4640 4641 pc = lookup_page_cgroup(page); 4642 /* 4643 * Can be NULL while feeding pages into the page allocator for 4644 * the first time, i.e. during boot or memory hotplug; 4645 * or when mem_cgroup_disabled(). 4646 */ 4647 if (likely(pc) && PageCgroupUsed(pc)) 4648 return pc; 4649 return NULL; 4650 } 4651 4652 bool mem_cgroup_bad_page_check(struct page *page) 4653 { 4654 if (mem_cgroup_disabled()) 4655 return false; 4656 4657 return lookup_page_cgroup_used(page) != NULL; 4658 } 4659 4660 void mem_cgroup_print_bad_page(struct page *page) 4661 { 4662 struct page_cgroup *pc; 4663 4664 pc = lookup_page_cgroup_used(page); 4665 if (pc) { 4666 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4667 pc, pc->flags, pc->mem_cgroup); 4668 } 4669 } 4670 #endif 4671 4672 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4673 unsigned long long val) 4674 { 4675 int retry_count; 4676 u64 memswlimit, memlimit; 4677 int ret = 0; 4678 int children = mem_cgroup_count_children(memcg); 4679 u64 curusage, oldusage; 4680 int enlarge; 4681 4682 /* 4683 * For keeping hierarchical_reclaim simple, how long we should retry 4684 * is depends on callers. We set our retry-count to be function 4685 * of # of children which we should visit in this loop. 4686 */ 4687 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4688 4689 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4690 4691 enlarge = 0; 4692 while (retry_count) { 4693 if (signal_pending(current)) { 4694 ret = -EINTR; 4695 break; 4696 } 4697 /* 4698 * Rather than hide all in some function, I do this in 4699 * open coded manner. You see what this really does. 4700 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
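		 *
		 * Worked example (illustrative numbers): with memsw.limit_in_bytes
		 * at 2G, a write of 3G to limit_in_bytes is rejected with -EINVAL
		 * below, while a write of 1G is accepted and may loop through
		 * reclaim until usage fits under the new limit.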
4701 */ 4702 mutex_lock(&set_limit_mutex); 4703 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4704 if (memswlimit < val) { 4705 ret = -EINVAL; 4706 mutex_unlock(&set_limit_mutex); 4707 break; 4708 } 4709 4710 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4711 if (memlimit < val) 4712 enlarge = 1; 4713 4714 ret = res_counter_set_limit(&memcg->res, val); 4715 if (!ret) { 4716 if (memswlimit == val) 4717 memcg->memsw_is_minimum = true; 4718 else 4719 memcg->memsw_is_minimum = false; 4720 } 4721 mutex_unlock(&set_limit_mutex); 4722 4723 if (!ret) 4724 break; 4725 4726 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4727 MEM_CGROUP_RECLAIM_SHRINK); 4728 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4729 /* Usage is reduced ? */ 4730 if (curusage >= oldusage) 4731 retry_count--; 4732 else 4733 oldusage = curusage; 4734 } 4735 if (!ret && enlarge) 4736 memcg_oom_recover(memcg); 4737 4738 return ret; 4739 } 4740 4741 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4742 unsigned long long val) 4743 { 4744 int retry_count; 4745 u64 memlimit, memswlimit, oldusage, curusage; 4746 int children = mem_cgroup_count_children(memcg); 4747 int ret = -EBUSY; 4748 int enlarge = 0; 4749 4750 /* see mem_cgroup_resize_res_limit */ 4751 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4752 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4753 while (retry_count) { 4754 if (signal_pending(current)) { 4755 ret = -EINTR; 4756 break; 4757 } 4758 /* 4759 * Rather than hide all in some function, I do this in 4760 * open coded manner. You see what this really does. 4761 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4762 */ 4763 mutex_lock(&set_limit_mutex); 4764 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4765 if (memlimit > val) { 4766 ret = -EINVAL; 4767 mutex_unlock(&set_limit_mutex); 4768 break; 4769 } 4770 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4771 if (memswlimit < val) 4772 enlarge = 1; 4773 ret = res_counter_set_limit(&memcg->memsw, val); 4774 if (!ret) { 4775 if (memlimit == val) 4776 memcg->memsw_is_minimum = true; 4777 else 4778 memcg->memsw_is_minimum = false; 4779 } 4780 mutex_unlock(&set_limit_mutex); 4781 4782 if (!ret) 4783 break; 4784 4785 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4786 MEM_CGROUP_RECLAIM_NOSWAP | 4787 MEM_CGROUP_RECLAIM_SHRINK); 4788 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4789 /* Usage is reduced ? 
 */
4790 		if (curusage >= oldusage)
4791 			retry_count--;
4792 		else
4793 			oldusage = curusage;
4794 	}
4795 	if (!ret && enlarge)
4796 		memcg_oom_recover(memcg);
4797 	return ret;
4798 }
4799 
4800 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4801 					    gfp_t gfp_mask,
4802 					    unsigned long *total_scanned)
4803 {
4804 	unsigned long nr_reclaimed = 0;
4805 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4806 	unsigned long reclaimed;
4807 	int loop = 0;
4808 	struct mem_cgroup_tree_per_zone *mctz;
4809 	unsigned long long excess;
4810 	unsigned long nr_scanned;
4811 
4812 	if (order > 0)
4813 		return 0;
4814 
4815 	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4816 	/*
4817 	 * This loop can run for a while, especially if memcgs continuously
4818 	 * keep exceeding their soft limit and putting the system under
4819 	 * pressure.
4820 	 */
4821 	do {
4822 		if (next_mz)
4823 			mz = next_mz;
4824 		else
4825 			mz = mem_cgroup_largest_soft_limit_node(mctz);
4826 		if (!mz)
4827 			break;
4828 
4829 		nr_scanned = 0;
4830 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4831 						    gfp_mask, &nr_scanned);
4832 		nr_reclaimed += reclaimed;
4833 		*total_scanned += nr_scanned;
4834 		spin_lock(&mctz->lock);
4835 
4836 		/*
4837 		 * If we failed to reclaim anything from this memory cgroup
4838 		 * it is time to move on to the next cgroup.
4839 		 */
4840 		next_mz = NULL;
4841 		if (!reclaimed) {
4842 			do {
4843 				/*
4844 				 * Loop until we find yet another one.
4845 				 *
4846 				 * By the time we get the soft_limit lock
4847 				 * again, someone might have added the
4848 				 * group back on the RB tree. Iterate to
4849 				 * make sure we get a different memcg.
4850 				 * mem_cgroup_largest_soft_limit_node returns
4851 				 * NULL if no other cgroup is present on
4852 				 * the tree.
4853 				 */
4854 				next_mz =
4855 				__mem_cgroup_largest_soft_limit_node(mctz);
4856 				if (next_mz == mz)
4857 					css_put(&next_mz->memcg->css);
4858 				else /* next_mz == NULL or other memcg */
4859 					break;
4860 			} while (1);
4861 		}
4862 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4863 		excess = res_counter_soft_limit_excess(&mz->memcg->res);
4864 		/*
4865 		 * One school of thought says that we should not add
4866 		 * back the node to the tree if reclaim returns 0.
4867 		 * But reclaim could return 0 simply because, due to
4868 		 * priority, we are exposing a smaller subset of
4869 		 * memory to reclaim from. Consider this as a longer
4870 		 * term TODO.
4871 		 */
4872 		/* If excess == 0, no tree ops */
4873 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4874 		spin_unlock(&mctz->lock);
4875 		css_put(&mz->memcg->css);
4876 		loop++;
4877 		/*
4878 		 * Could not reclaim anything and there are no more
4879 		 * mem cgroups to try or we seem to be looping without
4880 		 * reclaiming anything.
4881 		 */
4882 		if (!nr_reclaimed &&
4883 			(next_mz == NULL ||
4884 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4885 			break;
4886 	} while (!nr_reclaimed);
4887 	if (next_mz)
4888 		css_put(&next_mz->memcg->css);
4889 	return nr_reclaimed;
4890 }
4891 
4892 /**
4893  * mem_cgroup_force_empty_list - clears LRU of a group
4894  * @memcg: group to clear
4895  * @node: NUMA node
4896  * @zid: zone id
4897  * @lru: lru to clear
4898  *
4899  * Traverse a specified page_cgroup list and try to drop them all. This doesn't
4900  * reclaim the pages themselves - pages are moved to the parent (or root)
4901  * group.
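 *
 * For example (illustrative hierarchy): emptying /A/B moves B's pages to A;
 * when B has no hierarchical parent (use_hierarchy == 0), they are moved to
 * the root group instead.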
4902 */ 4903 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4904 int node, int zid, enum lru_list lru) 4905 { 4906 struct lruvec *lruvec; 4907 unsigned long flags; 4908 struct list_head *list; 4909 struct page *busy; 4910 struct zone *zone; 4911 4912 zone = &NODE_DATA(node)->node_zones[zid]; 4913 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4914 list = &lruvec->lists[lru]; 4915 4916 busy = NULL; 4917 do { 4918 struct page_cgroup *pc; 4919 struct page *page; 4920 4921 spin_lock_irqsave(&zone->lru_lock, flags); 4922 if (list_empty(list)) { 4923 spin_unlock_irqrestore(&zone->lru_lock, flags); 4924 break; 4925 } 4926 page = list_entry(list->prev, struct page, lru); 4927 if (busy == page) { 4928 list_move(&page->lru, list); 4929 busy = NULL; 4930 spin_unlock_irqrestore(&zone->lru_lock, flags); 4931 continue; 4932 } 4933 spin_unlock_irqrestore(&zone->lru_lock, flags); 4934 4935 pc = lookup_page_cgroup(page); 4936 4937 if (mem_cgroup_move_parent(page, pc, memcg)) { 4938 /* found lock contention or "pc" is obsolete. */ 4939 busy = page; 4940 cond_resched(); 4941 } else 4942 busy = NULL; 4943 } while (!list_empty(list)); 4944 } 4945 4946 /* 4947 * make mem_cgroup's charge to be 0 if there is no task by moving 4948 * all the charges and pages to the parent. 4949 * This enables deleting this mem_cgroup. 4950 * 4951 * Caller is responsible for holding css reference on the memcg. 4952 */ 4953 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4954 { 4955 int node, zid; 4956 u64 usage; 4957 4958 do { 4959 /* This is for making all *used* pages to be on LRU. */ 4960 lru_add_drain_all(); 4961 drain_all_stock_sync(memcg); 4962 mem_cgroup_start_move(memcg); 4963 for_each_node_state(node, N_MEMORY) { 4964 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4965 enum lru_list lru; 4966 for_each_lru(lru) { 4967 mem_cgroup_force_empty_list(memcg, 4968 node, zid, lru); 4969 } 4970 } 4971 } 4972 mem_cgroup_end_move(memcg); 4973 memcg_oom_recover(memcg); 4974 cond_resched(); 4975 4976 /* 4977 * Kernel memory may not necessarily be trackable to a specific 4978 * process. So they are not migrated, and therefore we can't 4979 * expect their value to drop to 0 here. 4980 * Having res filled up with kmem only is enough. 4981 * 4982 * This is a safety check because mem_cgroup_force_empty_list 4983 * could have raced with mem_cgroup_replace_page_cache callers 4984 * so the lru seemed empty but the page could have been added 4985 * right after the check. RES_USAGE should be safe as we always 4986 * charge before adding to the LRU. 4987 */ 4988 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4989 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4990 } while (usage > 0); 4991 } 4992 4993 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4994 { 4995 lockdep_assert_held(&memcg_create_mutex); 4996 /* 4997 * The lock does not prevent addition or deletion to the list 4998 * of children, but it prevents a new child from being 4999 * initialized based on this parent in css_online(), so it's 5000 * enough to decide whether hierarchically inherited 5001 * attributes can still be changed or not. 5002 */ 5003 return memcg->use_hierarchy && 5004 !list_empty(&memcg->css.cgroup->children); 5005 } 5006 5007 /* 5008 * Reclaims as many pages from the given memcg as possible and moves 5009 * the rest to the parent. 5010 * 5011 * Caller is responsible for holding css reference for memcg. 
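 *
 * From userspace this is reached through the memory.force_empty cgroup file
 * by writing anything to it, e.g. (assuming the usual cgroup v1 mount point):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty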
5012 */ 5013 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 5014 { 5015 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 5016 struct cgroup *cgrp = memcg->css.cgroup; 5017 5018 /* returns EBUSY if there is a task or if we come here twice. */ 5019 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 5020 return -EBUSY; 5021 5022 /* we call try-to-free pages for make this cgroup empty */ 5023 lru_add_drain_all(); 5024 /* try to free all pages in this cgroup */ 5025 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 5026 int progress; 5027 5028 if (signal_pending(current)) 5029 return -EINTR; 5030 5031 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 5032 false); 5033 if (!progress) { 5034 nr_retries--; 5035 /* maybe some writeback is necessary */ 5036 congestion_wait(BLK_RW_ASYNC, HZ/10); 5037 } 5038 5039 } 5040 lru_add_drain(); 5041 mem_cgroup_reparent_charges(memcg); 5042 5043 return 0; 5044 } 5045 5046 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, 5047 unsigned int event) 5048 { 5049 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5050 5051 if (mem_cgroup_is_root(memcg)) 5052 return -EINVAL; 5053 return mem_cgroup_force_empty(memcg); 5054 } 5055 5056 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 5057 struct cftype *cft) 5058 { 5059 return mem_cgroup_from_css(css)->use_hierarchy; 5060 } 5061 5062 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 5063 struct cftype *cft, u64 val) 5064 { 5065 int retval = 0; 5066 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5067 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5068 5069 mutex_lock(&memcg_create_mutex); 5070 5071 if (memcg->use_hierarchy == val) 5072 goto out; 5073 5074 /* 5075 * If parent's use_hierarchy is set, we can't make any modifications 5076 * in the child subtrees. If it is unset, then the change can 5077 * occur, provided the current cgroup has no children. 5078 * 5079 * For the root cgroup, parent_mem is NULL, we allow value to be 5080 * set if there are no children. 5081 */ 5082 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 5083 (val == 1 || val == 0)) { 5084 if (list_empty(&memcg->css.cgroup->children)) 5085 memcg->use_hierarchy = val; 5086 else 5087 retval = -EBUSY; 5088 } else 5089 retval = -EINVAL; 5090 5091 out: 5092 mutex_unlock(&memcg_create_mutex); 5093 5094 return retval; 5095 } 5096 5097 5098 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 5099 enum mem_cgroup_stat_index idx) 5100 { 5101 struct mem_cgroup *iter; 5102 long val = 0; 5103 5104 /* Per-cpu values can be negative, use a signed accumulator */ 5105 for_each_mem_cgroup_tree(iter, memcg) 5106 val += mem_cgroup_read_stat(iter, idx); 5107 5108 if (val < 0) /* race ? */ 5109 val = 0; 5110 return val; 5111 } 5112 5113 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 5114 { 5115 u64 val; 5116 5117 if (!mem_cgroup_is_root(memcg)) { 5118 if (!swap) 5119 return res_counter_read_u64(&memcg->res, RES_USAGE); 5120 else 5121 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 5122 } 5123 5124 /* 5125 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 5126 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
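	 *
	 * Worked example (illustrative numbers, 4K pages): 300 cache pages plus
	 * 700 rss pages sum to 1000 pages, and the shift below returns
	 * 1000 << PAGE_SHIFT = 4096000 bytes.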
5127 */ 5128 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 5129 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 5130 5131 if (swap) 5132 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 5133 5134 return val << PAGE_SHIFT; 5135 } 5136 5137 static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5138 struct cftype *cft, struct file *file, 5139 char __user *buf, size_t nbytes, loff_t *ppos) 5140 { 5141 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5142 char str[64]; 5143 u64 val; 5144 int name, len; 5145 enum res_type type; 5146 5147 type = MEMFILE_TYPE(cft->private); 5148 name = MEMFILE_ATTR(cft->private); 5149 5150 switch (type) { 5151 case _MEM: 5152 if (name == RES_USAGE) 5153 val = mem_cgroup_usage(memcg, false); 5154 else 5155 val = res_counter_read_u64(&memcg->res, name); 5156 break; 5157 case _MEMSWAP: 5158 if (name == RES_USAGE) 5159 val = mem_cgroup_usage(memcg, true); 5160 else 5161 val = res_counter_read_u64(&memcg->memsw, name); 5162 break; 5163 case _KMEM: 5164 val = res_counter_read_u64(&memcg->kmem, name); 5165 break; 5166 default: 5167 BUG(); 5168 } 5169 5170 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5171 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5172 } 5173 5174 static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) 5175 { 5176 int ret = -EINVAL; 5177 #ifdef CONFIG_MEMCG_KMEM 5178 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5179 /* 5180 * For simplicity, we won't allow this to be disabled. It also can't 5181 * be changed if the cgroup has children already, or if tasks had 5182 * already joined. 5183 * 5184 * If tasks join before we set the limit, a person looking at 5185 * kmem.usage_in_bytes will have no way to determine when it took 5186 * place, which makes the value quite meaningless. 5187 * 5188 * After it first became limited, changes in the value of the limit are 5189 * of course permitted. 5190 */ 5191 mutex_lock(&memcg_create_mutex); 5192 mutex_lock(&set_limit_mutex); 5193 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5194 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5195 ret = -EBUSY; 5196 goto out; 5197 } 5198 ret = res_counter_set_limit(&memcg->kmem, val); 5199 VM_BUG_ON(ret); 5200 5201 ret = memcg_update_cache_sizes(memcg); 5202 if (ret) { 5203 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5204 goto out; 5205 } 5206 static_key_slow_inc(&memcg_kmem_enabled_key); 5207 /* 5208 * setting the active bit after the inc will guarantee no one 5209 * starts accounting before all call sites are patched 5210 */ 5211 memcg_kmem_set_active(memcg); 5212 } else 5213 ret = res_counter_set_limit(&memcg->kmem, val); 5214 out: 5215 mutex_unlock(&set_limit_mutex); 5216 mutex_unlock(&memcg_create_mutex); 5217 #endif 5218 return ret; 5219 } 5220 5221 #ifdef CONFIG_MEMCG_KMEM 5222 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5223 { 5224 int ret = 0; 5225 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5226 if (!parent) 5227 goto out; 5228 5229 memcg->kmem_account_flags = parent->kmem_account_flags; 5230 /* 5231 * When that happen, we need to disable the static branch only on those 5232 * memcgs that enabled it. To achieve this, we would be forced to 5233 * complicate the code by keeping track of which memcgs were the ones 5234 * that actually enabled limits, and which ones got it from its 5235 * parents. 
5236 * 5237 * It is a lot simpler just to do static_key_slow_inc() on every child 5238 * that is accounted. 5239 */ 5240 if (!memcg_kmem_is_active(memcg)) 5241 goto out; 5242 5243 /* 5244 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5245 * memcg is active already. If the later initialization fails then the 5246 * cgroup core triggers the cleanup so we do not have to do it here. 5247 */ 5248 static_key_slow_inc(&memcg_kmem_enabled_key); 5249 5250 mutex_lock(&set_limit_mutex); 5251 memcg_stop_kmem_account(); 5252 ret = memcg_update_cache_sizes(memcg); 5253 memcg_resume_kmem_account(); 5254 mutex_unlock(&set_limit_mutex); 5255 out: 5256 return ret; 5257 } 5258 #endif /* CONFIG_MEMCG_KMEM */ 5259 5260 /* 5261 * The user of this function is... 5262 * RES_LIMIT. 5263 */ 5264 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5265 const char *buffer) 5266 { 5267 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5268 enum res_type type; 5269 int name; 5270 unsigned long long val; 5271 int ret; 5272 5273 type = MEMFILE_TYPE(cft->private); 5274 name = MEMFILE_ATTR(cft->private); 5275 5276 switch (name) { 5277 case RES_LIMIT: 5278 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5279 ret = -EINVAL; 5280 break; 5281 } 5282 /* This function does all necessary parse...reuse it */ 5283 ret = res_counter_memparse_write_strategy(buffer, &val); 5284 if (ret) 5285 break; 5286 if (type == _MEM) 5287 ret = mem_cgroup_resize_limit(memcg, val); 5288 else if (type == _MEMSWAP) 5289 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5290 else if (type == _KMEM) 5291 ret = memcg_update_kmem_limit(css, val); 5292 else 5293 return -EINVAL; 5294 break; 5295 case RES_SOFT_LIMIT: 5296 ret = res_counter_memparse_write_strategy(buffer, &val); 5297 if (ret) 5298 break; 5299 /* 5300 * For memsw, soft limits are hard to implement in terms 5301 * of semantics, for now, we support soft limits for 5302 * control without swap 5303 */ 5304 if (type == _MEM) 5305 ret = res_counter_set_soft_limit(&memcg->res, val); 5306 else 5307 ret = -EINVAL; 5308 break; 5309 default: 5310 ret = -EINVAL; /* should be BUG() ? 
*/ 5311 break; 5312 } 5313 return ret; 5314 } 5315 5316 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5317 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5318 { 5319 unsigned long long min_limit, min_memsw_limit, tmp; 5320 5321 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5322 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5323 if (!memcg->use_hierarchy) 5324 goto out; 5325 5326 while (css_parent(&memcg->css)) { 5327 memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5328 if (!memcg->use_hierarchy) 5329 break; 5330 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5331 min_limit = min(min_limit, tmp); 5332 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5333 min_memsw_limit = min(min_memsw_limit, tmp); 5334 } 5335 out: 5336 *mem_limit = min_limit; 5337 *memsw_limit = min_memsw_limit; 5338 } 5339 5340 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 5341 { 5342 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5343 int name; 5344 enum res_type type; 5345 5346 type = MEMFILE_TYPE(event); 5347 name = MEMFILE_ATTR(event); 5348 5349 switch (name) { 5350 case RES_MAX_USAGE: 5351 if (type == _MEM) 5352 res_counter_reset_max(&memcg->res); 5353 else if (type == _MEMSWAP) 5354 res_counter_reset_max(&memcg->memsw); 5355 else if (type == _KMEM) 5356 res_counter_reset_max(&memcg->kmem); 5357 else 5358 return -EINVAL; 5359 break; 5360 case RES_FAILCNT: 5361 if (type == _MEM) 5362 res_counter_reset_failcnt(&memcg->res); 5363 else if (type == _MEMSWAP) 5364 res_counter_reset_failcnt(&memcg->memsw); 5365 else if (type == _KMEM) 5366 res_counter_reset_failcnt(&memcg->kmem); 5367 else 5368 return -EINVAL; 5369 break; 5370 } 5371 5372 return 0; 5373 } 5374 5375 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 5376 struct cftype *cft) 5377 { 5378 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 5379 } 5380 5381 #ifdef CONFIG_MMU 5382 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5383 struct cftype *cft, u64 val) 5384 { 5385 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5386 5387 if (val >= (1 << NR_MOVE_TYPE)) 5388 return -EINVAL; 5389 5390 /* 5391 * No kind of locking is needed in here, because ->can_attach() will 5392 * check this value once in the beginning of the process, and then carry 5393 * on with stale data. This means that changes to this value will only 5394 * affect task migrations starting after the change. 
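	 *
	 * Usage example (illustrative; bit 0 = move anonymous pages and
	 * bit 1 = move file pages is the assumed bit layout):
	 *
	 *	echo 3 > memory.move_charge_at_immigrate
	 *
	 * makes subsequently migrated tasks bring both kinds of charges along.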
5395 */ 5396 memcg->move_charge_at_immigrate = val; 5397 return 0; 5398 } 5399 #else 5400 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5401 struct cftype *cft, u64 val) 5402 { 5403 return -ENOSYS; 5404 } 5405 #endif 5406 5407 #ifdef CONFIG_NUMA 5408 static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5409 struct cftype *cft, struct seq_file *m) 5410 { 5411 int nid; 5412 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5413 unsigned long node_nr; 5414 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5415 5416 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5417 seq_printf(m, "total=%lu", total_nr); 5418 for_each_node_state(nid, N_MEMORY) { 5419 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5420 seq_printf(m, " N%d=%lu", nid, node_nr); 5421 } 5422 seq_putc(m, '\n'); 5423 5424 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5425 seq_printf(m, "file=%lu", file_nr); 5426 for_each_node_state(nid, N_MEMORY) { 5427 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5428 LRU_ALL_FILE); 5429 seq_printf(m, " N%d=%lu", nid, node_nr); 5430 } 5431 seq_putc(m, '\n'); 5432 5433 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5434 seq_printf(m, "anon=%lu", anon_nr); 5435 for_each_node_state(nid, N_MEMORY) { 5436 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5437 LRU_ALL_ANON); 5438 seq_printf(m, " N%d=%lu", nid, node_nr); 5439 } 5440 seq_putc(m, '\n'); 5441 5442 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5443 seq_printf(m, "unevictable=%lu", unevictable_nr); 5444 for_each_node_state(nid, N_MEMORY) { 5445 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5446 BIT(LRU_UNEVICTABLE)); 5447 seq_printf(m, " N%d=%lu", nid, node_nr); 5448 } 5449 seq_putc(m, '\n'); 5450 return 0; 5451 } 5452 #endif /* CONFIG_NUMA */ 5453 5454 static inline void mem_cgroup_lru_names_not_uptodate(void) 5455 { 5456 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5457 } 5458 5459 static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5460 struct seq_file *m) 5461 { 5462 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5463 struct mem_cgroup *mi; 5464 unsigned int i; 5465 5466 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5467 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5468 continue; 5469 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5470 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5471 } 5472 5473 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5474 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5475 mem_cgroup_read_events(memcg, i)); 5476 5477 for (i = 0; i < NR_LRU_LISTS; i++) 5478 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5479 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5480 5481 /* Hierarchical information */ 5482 { 5483 unsigned long long limit, memsw_limit; 5484 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5485 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5486 if (do_swap_account) 5487 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5488 memsw_limit); 5489 } 5490 5491 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5492 long long val = 0; 5493 5494 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5495 continue; 5496 for_each_mem_cgroup_tree(mi, memcg) 5497 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5498 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5499 } 5500 5501 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5502 unsigned long long val = 0; 5503 5504 
for_each_mem_cgroup_tree(mi, memcg) 5505 val += mem_cgroup_read_events(mi, i); 5506 seq_printf(m, "total_%s %llu\n", 5507 mem_cgroup_events_names[i], val); 5508 } 5509 5510 for (i = 0; i < NR_LRU_LISTS; i++) { 5511 unsigned long long val = 0; 5512 5513 for_each_mem_cgroup_tree(mi, memcg) 5514 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5515 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5516 } 5517 5518 #ifdef CONFIG_DEBUG_VM 5519 { 5520 int nid, zid; 5521 struct mem_cgroup_per_zone *mz; 5522 struct zone_reclaim_stat *rstat; 5523 unsigned long recent_rotated[2] = {0, 0}; 5524 unsigned long recent_scanned[2] = {0, 0}; 5525 5526 for_each_online_node(nid) 5527 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5528 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5529 rstat = &mz->lruvec.reclaim_stat; 5530 5531 recent_rotated[0] += rstat->recent_rotated[0]; 5532 recent_rotated[1] += rstat->recent_rotated[1]; 5533 recent_scanned[0] += rstat->recent_scanned[0]; 5534 recent_scanned[1] += rstat->recent_scanned[1]; 5535 } 5536 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5537 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5538 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5539 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5540 } 5541 #endif 5542 5543 return 0; 5544 } 5545 5546 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 5547 struct cftype *cft) 5548 { 5549 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5550 5551 return mem_cgroup_swappiness(memcg); 5552 } 5553 5554 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 5555 struct cftype *cft, u64 val) 5556 { 5557 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5558 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5559 5560 if (val > 100 || !parent) 5561 return -EINVAL; 5562 5563 mutex_lock(&memcg_create_mutex); 5564 5565 /* If under hierarchy, only empty-root can set this value */ 5566 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5567 mutex_unlock(&memcg_create_mutex); 5568 return -EINVAL; 5569 } 5570 5571 memcg->swappiness = val; 5572 5573 mutex_unlock(&memcg_create_mutex); 5574 5575 return 0; 5576 } 5577 5578 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5579 { 5580 struct mem_cgroup_threshold_ary *t; 5581 u64 usage; 5582 int i; 5583 5584 rcu_read_lock(); 5585 if (!swap) 5586 t = rcu_dereference(memcg->thresholds.primary); 5587 else 5588 t = rcu_dereference(memcg->memsw_thresholds.primary); 5589 5590 if (!t) 5591 goto unlock; 5592 5593 usage = mem_cgroup_usage(memcg, swap); 5594 5595 /* 5596 * current_threshold points to threshold just below or equal to usage. 5597 * If it's not true, a threshold was crossed after last 5598 * call of __mem_cgroup_threshold(). 5599 */ 5600 i = t->current_threshold; 5601 5602 /* 5603 * Iterate backward over array of thresholds starting from 5604 * current_threshold and check if a threshold is crossed. 5605 * If none of thresholds below usage is crossed, we read 5606 * only one element of the array here. 5607 */ 5608 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5609 eventfd_signal(t->entries[i].eventfd, 1); 5610 5611 /* i = current_threshold + 1 */ 5612 i++; 5613 5614 /* 5615 * Iterate forward over array of thresholds starting from 5616 * current_threshold+1 and check if a threshold is crossed. 
5617 * If none of thresholds above usage is crossed, we read 5618 * only one element of the array here. 5619 */ 5620 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5621 eventfd_signal(t->entries[i].eventfd, 1); 5622 5623 /* Update current_threshold */ 5624 t->current_threshold = i - 1; 5625 unlock: 5626 rcu_read_unlock(); 5627 } 5628 5629 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5630 { 5631 while (memcg) { 5632 __mem_cgroup_threshold(memcg, false); 5633 if (do_swap_account) 5634 __mem_cgroup_threshold(memcg, true); 5635 5636 memcg = parent_mem_cgroup(memcg); 5637 } 5638 } 5639 5640 static int compare_thresholds(const void *a, const void *b) 5641 { 5642 const struct mem_cgroup_threshold *_a = a; 5643 const struct mem_cgroup_threshold *_b = b; 5644 5645 if (_a->threshold > _b->threshold) 5646 return 1; 5647 5648 if (_a->threshold < _b->threshold) 5649 return -1; 5650 5651 return 0; 5652 } 5653 5654 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5655 { 5656 struct mem_cgroup_eventfd_list *ev; 5657 5658 list_for_each_entry(ev, &memcg->oom_notify, list) 5659 eventfd_signal(ev->eventfd, 1); 5660 return 0; 5661 } 5662 5663 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5664 { 5665 struct mem_cgroup *iter; 5666 5667 for_each_mem_cgroup_tree(iter, memcg) 5668 mem_cgroup_oom_notify_cb(iter); 5669 } 5670 5671 static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5672 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5673 { 5674 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5675 struct mem_cgroup_thresholds *thresholds; 5676 struct mem_cgroup_threshold_ary *new; 5677 enum res_type type = MEMFILE_TYPE(cft->private); 5678 u64 threshold, usage; 5679 int i, size, ret; 5680 5681 ret = res_counter_memparse_write_strategy(args, &threshold); 5682 if (ret) 5683 return ret; 5684 5685 mutex_lock(&memcg->thresholds_lock); 5686 5687 if (type == _MEM) 5688 thresholds = &memcg->thresholds; 5689 else if (type == _MEMSWAP) 5690 thresholds = &memcg->memsw_thresholds; 5691 else 5692 BUG(); 5693 5694 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5695 5696 /* Check if a threshold crossed before adding a new one */ 5697 if (thresholds->primary) 5698 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5699 5700 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5701 5702 /* Allocate memory for new array of thresholds */ 5703 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5704 GFP_KERNEL); 5705 if (!new) { 5706 ret = -ENOMEM; 5707 goto unlock; 5708 } 5709 new->size = size; 5710 5711 /* Copy thresholds (if any) to new array */ 5712 if (thresholds->primary) { 5713 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5714 sizeof(struct mem_cgroup_threshold)); 5715 } 5716 5717 /* Add new threshold */ 5718 new->entries[size - 1].eventfd = eventfd; 5719 new->entries[size - 1].threshold = threshold; 5720 5721 /* Sort thresholds. Registering of new threshold isn't time-critical */ 5722 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5723 compare_thresholds, NULL); 5724 5725 /* Find current threshold */ 5726 new->current_threshold = -1; 5727 for (i = 0; i < size; i++) { 5728 if (new->entries[i].threshold <= usage) { 5729 /* 5730 * new->current_threshold will not be used until 5731 * rcu_assign_pointer(), so it's safe to increment 5732 * it here. 
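			 *
			 * For example (illustrative thresholds): with entries
			 * {4M, 8M, 16M} and a usage of 10M, entries 0 and 1
			 * compare <= usage, so current_threshold ends up at
			 * index 1 when this loop finishes.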
5733 */ 5734 ++new->current_threshold; 5735 } else 5736 break; 5737 } 5738 5739 /* Free old spare buffer and save old primary buffer as spare */ 5740 kfree(thresholds->spare); 5741 thresholds->spare = thresholds->primary; 5742 5743 rcu_assign_pointer(thresholds->primary, new); 5744 5745 /* To be sure that nobody uses thresholds */ 5746 synchronize_rcu(); 5747 5748 unlock: 5749 mutex_unlock(&memcg->thresholds_lock); 5750 5751 return ret; 5752 } 5753 5754 static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5755 struct cftype *cft, struct eventfd_ctx *eventfd) 5756 { 5757 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5758 struct mem_cgroup_thresholds *thresholds; 5759 struct mem_cgroup_threshold_ary *new; 5760 enum res_type type = MEMFILE_TYPE(cft->private); 5761 u64 usage; 5762 int i, j, size; 5763 5764 mutex_lock(&memcg->thresholds_lock); 5765 if (type == _MEM) 5766 thresholds = &memcg->thresholds; 5767 else if (type == _MEMSWAP) 5768 thresholds = &memcg->memsw_thresholds; 5769 else 5770 BUG(); 5771 5772 if (!thresholds->primary) 5773 goto unlock; 5774 5775 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5776 5777 /* Check if a threshold crossed before removing */ 5778 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5779 5780 /* Calculate new number of threshold */ 5781 size = 0; 5782 for (i = 0; i < thresholds->primary->size; i++) { 5783 if (thresholds->primary->entries[i].eventfd != eventfd) 5784 size++; 5785 } 5786 5787 new = thresholds->spare; 5788 5789 /* Set thresholds array to NULL if we don't have thresholds */ 5790 if (!size) { 5791 kfree(new); 5792 new = NULL; 5793 goto swap_buffers; 5794 } 5795 5796 new->size = size; 5797 5798 /* Copy thresholds and find current threshold */ 5799 new->current_threshold = -1; 5800 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5801 if (thresholds->primary->entries[i].eventfd == eventfd) 5802 continue; 5803 5804 new->entries[j] = thresholds->primary->entries[i]; 5805 if (new->entries[j].threshold <= usage) { 5806 /* 5807 * new->current_threshold will not be used 5808 * until rcu_assign_pointer(), so it's safe to increment 5809 * it here. 5810 */ 5811 ++new->current_threshold; 5812 } 5813 j++; 5814 } 5815 5816 swap_buffers: 5817 /* Swap primary and spare array */ 5818 thresholds->spare = thresholds->primary; 5819 /* If all events are unregistered, free the spare array */ 5820 if (!new) { 5821 kfree(thresholds->spare); 5822 thresholds->spare = NULL; 5823 } 5824 5825 rcu_assign_pointer(thresholds->primary, new); 5826 5827 /* To be sure that nobody uses thresholds */ 5828 synchronize_rcu(); 5829 unlock: 5830 mutex_unlock(&memcg->thresholds_lock); 5831 } 5832 5833 static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5834 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5835 { 5836 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5837 struct mem_cgroup_eventfd_list *event; 5838 enum res_type type = MEMFILE_TYPE(cft->private); 5839 5840 BUG_ON(type != _OOM_TYPE); 5841 event = kmalloc(sizeof(*event), GFP_KERNEL); 5842 if (!event) 5843 return -ENOMEM; 5844 5845 spin_lock(&memcg_oom_lock); 5846 5847 event->eventfd = eventfd; 5848 list_add(&event->list, &memcg->oom_notify); 5849 5850 /* already in OOM ? 
*/
5851 if (atomic_read(&memcg->under_oom))
5852 eventfd_signal(eventfd, 1);
5853 spin_unlock(&memcg_oom_lock);
5854
5855 return 0;
5856 }
5857
5858 static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 struct cftype *cft, struct eventfd_ctx *eventfd)
5860 {
5861 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5862 struct mem_cgroup_eventfd_list *ev, *tmp;
5863 enum res_type type = MEMFILE_TYPE(cft->private);
5864
5865 BUG_ON(type != _OOM_TYPE);
5866
5867 spin_lock(&memcg_oom_lock);
5868
5869 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5870 if (ev->eventfd == eventfd) {
5871 list_del(&ev->list);
5872 kfree(ev);
5873 }
5874 }
5875
5876 spin_unlock(&memcg_oom_lock);
5877 }
5878
5879 static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5880 struct cftype *cft, struct cgroup_map_cb *cb)
5881 {
5882 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5883
5884 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5885
5886 if (atomic_read(&memcg->under_oom))
5887 cb->fill(cb, "under_oom", 1);
5888 else
5889 cb->fill(cb, "under_oom", 0);
5890 return 0;
5891 }
5892
5893 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5894 struct cftype *cft, u64 val)
5895 {
5896 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5897 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5898
5899 /* cannot set to root cgroup and only 0 and 1 are allowed */
5900 if (!parent || !((val == 0) || (val == 1)))
5901 return -EINVAL;
5902
5903 mutex_lock(&memcg_create_mutex);
5904 /* oom-kill-disable is a flag for subhierarchy. */
5905 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5906 mutex_unlock(&memcg_create_mutex);
5907 return -EINVAL;
5908 }
5909 memcg->oom_kill_disable = val;
5910 if (!val)
5911 memcg_oom_recover(memcg);
5912 mutex_unlock(&memcg_create_mutex);
5913 return 0;
5914 }
5915
5916 #ifdef CONFIG_MEMCG_KMEM
5917 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5918 {
5919 int ret;
5920
5921 memcg->kmemcg_id = -1;
5922 ret = memcg_propagate_kmem(memcg);
5923 if (ret)
5924 return ret;
5925
5926 return mem_cgroup_sockets_init(memcg, ss);
5927 }
5928
5929 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5930 {
5931 mem_cgroup_sockets_destroy(memcg);
5932 }
5933
5934 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5935 {
5936 if (!memcg_kmem_is_active(memcg))
5937 return;
5938
5939 /*
5940 * kmem charges can outlive the cgroup. In the case of slab
5941 * pages, for instance, a page may contain objects from various
5942 * processes. As we do not take a reference for every such
5943 * allocation, we have to be careful when doing uncharge
5944 * (see memcg_uncharge_kmem) and here during offlining.
5945 *
5946 * The idea is that only the _last_ uncharge which sees
5947 * the dead memcg will drop the last reference. An additional
5948 * reference is taken here before the group is marked dead,
5949 * which is then paired with a css_put during uncharge or here.
5950 *
5951 * Although this might sound strange, as this path is called from
5952 * css_offline() when the reference might have dropped down to 0
5953 * and shouldn't be incremented anymore (css_tryget would fail),
5954 * we do not have other options because of the kmem allocations'
5955 * lifetime.
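 *
 * Concretely (restating the scheme above): the css_get() below keeps
 * the memcg alive past css_offline() while kmem pages are still
 * charged to it; once the kmem usage finally drops to zero,
 * memcg_uncharge_kmem() sees the group marked dead, clears the flag
 * and does the matching css_put(), letting the memcg be freed.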
5956 */ 5957 css_get(&memcg->css); 5958 5959 memcg_kmem_mark_dead(memcg); 5960 5961 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5962 return; 5963 5964 if (memcg_kmem_test_and_clear_dead(memcg)) 5965 css_put(&memcg->css); 5966 } 5967 #else 5968 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5969 { 5970 return 0; 5971 } 5972 5973 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5974 { 5975 } 5976 5977 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5978 { 5979 } 5980 #endif 5981 5982 /* 5983 * Unregister event and free resources. 5984 * 5985 * Gets called from workqueue. 5986 */ 5987 static void cgroup_event_remove(struct work_struct *work) 5988 { 5989 struct cgroup_event *event = container_of(work, struct cgroup_event, 5990 remove); 5991 struct cgroup_subsys_state *css = event->css; 5992 5993 remove_wait_queue(event->wqh, &event->wait); 5994 5995 event->cft->unregister_event(css, event->cft, event->eventfd); 5996 5997 /* Notify userspace the event is going away. */ 5998 eventfd_signal(event->eventfd, 1); 5999 6000 eventfd_ctx_put(event->eventfd); 6001 kfree(event); 6002 css_put(css); 6003 } 6004 6005 /* 6006 * Gets called on POLLHUP on eventfd when user closes it. 6007 * 6008 * Called with wqh->lock held and interrupts disabled. 6009 */ 6010 static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 6011 int sync, void *key) 6012 { 6013 struct cgroup_event *event = container_of(wait, 6014 struct cgroup_event, wait); 6015 struct cgroup *cgrp = event->css->cgroup; 6016 unsigned long flags = (unsigned long)key; 6017 6018 if (flags & POLLHUP) { 6019 /* 6020 * If the event has been detached at cgroup removal, we 6021 * can simply return knowing the other side will cleanup 6022 * for us. 6023 * 6024 * We can't race against event freeing since the other 6025 * side will require wqh->lock via remove_wait_queue(), 6026 * which we hold. 6027 */ 6028 spin_lock(&cgrp->event_list_lock); 6029 if (!list_empty(&event->list)) { 6030 list_del_init(&event->list); 6031 /* 6032 * We are in atomic context, but cgroup_event_remove() 6033 * may sleep, so we have to call it in workqueue. 6034 */ 6035 schedule_work(&event->remove); 6036 } 6037 spin_unlock(&cgrp->event_list_lock); 6038 } 6039 6040 return 0; 6041 } 6042 6043 static void cgroup_event_ptable_queue_proc(struct file *file, 6044 wait_queue_head_t *wqh, poll_table *pt) 6045 { 6046 struct cgroup_event *event = container_of(pt, 6047 struct cgroup_event, pt); 6048 6049 event->wqh = wqh; 6050 add_wait_queue(wqh, &event->wait); 6051 } 6052 6053 /* 6054 * Parse input and register new cgroup event handler. 6055 * 6056 * Input must be in format '<event_fd> <control_fd> <args>'. 6057 * Interpretation of args is defined by control file implementation. 
6058 */ 6059 static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, 6060 struct cftype *cft, const char *buffer) 6061 { 6062 struct cgroup *cgrp = dummy_css->cgroup; 6063 struct cgroup_event *event; 6064 struct cgroup_subsys_state *cfile_css; 6065 unsigned int efd, cfd; 6066 struct fd efile; 6067 struct fd cfile; 6068 char *endp; 6069 int ret; 6070 6071 efd = simple_strtoul(buffer, &endp, 10); 6072 if (*endp != ' ') 6073 return -EINVAL; 6074 buffer = endp + 1; 6075 6076 cfd = simple_strtoul(buffer, &endp, 10); 6077 if ((*endp != ' ') && (*endp != '\0')) 6078 return -EINVAL; 6079 buffer = endp + 1; 6080 6081 event = kzalloc(sizeof(*event), GFP_KERNEL); 6082 if (!event) 6083 return -ENOMEM; 6084 6085 INIT_LIST_HEAD(&event->list); 6086 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 6087 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 6088 INIT_WORK(&event->remove, cgroup_event_remove); 6089 6090 efile = fdget(efd); 6091 if (!efile.file) { 6092 ret = -EBADF; 6093 goto out_kfree; 6094 } 6095 6096 event->eventfd = eventfd_ctx_fileget(efile.file); 6097 if (IS_ERR(event->eventfd)) { 6098 ret = PTR_ERR(event->eventfd); 6099 goto out_put_efile; 6100 } 6101 6102 cfile = fdget(cfd); 6103 if (!cfile.file) { 6104 ret = -EBADF; 6105 goto out_put_eventfd; 6106 } 6107 6108 /* the process need read permission on control file */ 6109 /* AV: shouldn't we check that it's been opened for read instead? */ 6110 ret = inode_permission(file_inode(cfile.file), MAY_READ); 6111 if (ret < 0) 6112 goto out_put_cfile; 6113 6114 event->cft = __file_cft(cfile.file); 6115 if (IS_ERR(event->cft)) { 6116 ret = PTR_ERR(event->cft); 6117 goto out_put_cfile; 6118 } 6119 6120 if (!event->cft->ss) { 6121 ret = -EBADF; 6122 goto out_put_cfile; 6123 } 6124 6125 /* 6126 * Determine the css of @cfile, verify it belongs to the same 6127 * cgroup as cgroup.event_control, and associate @event with it. 6128 * Remaining events are automatically removed on cgroup destruction 6129 * but the removal is asynchronous, so take an extra ref. 
6130 */ 6131 rcu_read_lock(); 6132 6133 ret = -EINVAL; 6134 event->css = cgroup_css(cgrp, event->cft->ss); 6135 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); 6136 if (event->css && event->css == cfile_css && css_tryget(event->css)) 6137 ret = 0; 6138 6139 rcu_read_unlock(); 6140 if (ret) 6141 goto out_put_cfile; 6142 6143 if (!event->cft->register_event || !event->cft->unregister_event) { 6144 ret = -EINVAL; 6145 goto out_put_css; 6146 } 6147 6148 ret = event->cft->register_event(event->css, event->cft, 6149 event->eventfd, buffer); 6150 if (ret) 6151 goto out_put_css; 6152 6153 efile.file->f_op->poll(efile.file, &event->pt); 6154 6155 spin_lock(&cgrp->event_list_lock); 6156 list_add(&event->list, &cgrp->event_list); 6157 spin_unlock(&cgrp->event_list_lock); 6158 6159 fdput(cfile); 6160 fdput(efile); 6161 6162 return 0; 6163 6164 out_put_css: 6165 css_put(event->css); 6166 out_put_cfile: 6167 fdput(cfile); 6168 out_put_eventfd: 6169 eventfd_ctx_put(event->eventfd); 6170 out_put_efile: 6171 fdput(efile); 6172 out_kfree: 6173 kfree(event); 6174 6175 return ret; 6176 } 6177 6178 static struct cftype mem_cgroup_files[] = { 6179 { 6180 .name = "usage_in_bytes", 6181 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6182 .read = mem_cgroup_read, 6183 .register_event = mem_cgroup_usage_register_event, 6184 .unregister_event = mem_cgroup_usage_unregister_event, 6185 }, 6186 { 6187 .name = "max_usage_in_bytes", 6188 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6189 .trigger = mem_cgroup_reset, 6190 .read = mem_cgroup_read, 6191 }, 6192 { 6193 .name = "limit_in_bytes", 6194 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6195 .write_string = mem_cgroup_write, 6196 .read = mem_cgroup_read, 6197 }, 6198 { 6199 .name = "soft_limit_in_bytes", 6200 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6201 .write_string = mem_cgroup_write, 6202 .read = mem_cgroup_read, 6203 }, 6204 { 6205 .name = "failcnt", 6206 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6207 .trigger = mem_cgroup_reset, 6208 .read = mem_cgroup_read, 6209 }, 6210 { 6211 .name = "stat", 6212 .read_seq_string = memcg_stat_show, 6213 }, 6214 { 6215 .name = "force_empty", 6216 .trigger = mem_cgroup_force_empty_write, 6217 }, 6218 { 6219 .name = "use_hierarchy", 6220 .flags = CFTYPE_INSANE, 6221 .write_u64 = mem_cgroup_hierarchy_write, 6222 .read_u64 = mem_cgroup_hierarchy_read, 6223 }, 6224 { 6225 .name = "cgroup.event_control", 6226 .write_string = cgroup_write_event_control, 6227 .flags = CFTYPE_NO_PREFIX, 6228 .mode = S_IWUGO, 6229 }, 6230 { 6231 .name = "swappiness", 6232 .read_u64 = mem_cgroup_swappiness_read, 6233 .write_u64 = mem_cgroup_swappiness_write, 6234 }, 6235 { 6236 .name = "move_charge_at_immigrate", 6237 .read_u64 = mem_cgroup_move_charge_read, 6238 .write_u64 = mem_cgroup_move_charge_write, 6239 }, 6240 { 6241 .name = "oom_control", 6242 .read_map = mem_cgroup_oom_control_read, 6243 .write_u64 = mem_cgroup_oom_control_write, 6244 .register_event = mem_cgroup_oom_register_event, 6245 .unregister_event = mem_cgroup_oom_unregister_event, 6246 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6247 }, 6248 { 6249 .name = "pressure_level", 6250 .register_event = vmpressure_register_event, 6251 .unregister_event = vmpressure_unregister_event, 6252 }, 6253 #ifdef CONFIG_NUMA 6254 { 6255 .name = "numa_stat", 6256 .read_seq_string = memcg_numa_stat_show, 6257 }, 6258 #endif 6259 #ifdef CONFIG_MEMCG_KMEM 6260 { 6261 .name = "kmem.limit_in_bytes", 6262 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6263 
.write_string = mem_cgroup_write, 6264 .read = mem_cgroup_read, 6265 }, 6266 { 6267 .name = "kmem.usage_in_bytes", 6268 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6269 .read = mem_cgroup_read, 6270 }, 6271 { 6272 .name = "kmem.failcnt", 6273 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6274 .trigger = mem_cgroup_reset, 6275 .read = mem_cgroup_read, 6276 }, 6277 { 6278 .name = "kmem.max_usage_in_bytes", 6279 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6280 .trigger = mem_cgroup_reset, 6281 .read = mem_cgroup_read, 6282 }, 6283 #ifdef CONFIG_SLABINFO 6284 { 6285 .name = "kmem.slabinfo", 6286 .read_seq_string = mem_cgroup_slabinfo_read, 6287 }, 6288 #endif 6289 #endif 6290 { }, /* terminate */ 6291 }; 6292 6293 #ifdef CONFIG_MEMCG_SWAP 6294 static struct cftype memsw_cgroup_files[] = { 6295 { 6296 .name = "memsw.usage_in_bytes", 6297 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6298 .read = mem_cgroup_read, 6299 .register_event = mem_cgroup_usage_register_event, 6300 .unregister_event = mem_cgroup_usage_unregister_event, 6301 }, 6302 { 6303 .name = "memsw.max_usage_in_bytes", 6304 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6305 .trigger = mem_cgroup_reset, 6306 .read = mem_cgroup_read, 6307 }, 6308 { 6309 .name = "memsw.limit_in_bytes", 6310 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6311 .write_string = mem_cgroup_write, 6312 .read = mem_cgroup_read, 6313 }, 6314 { 6315 .name = "memsw.failcnt", 6316 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6317 .trigger = mem_cgroup_reset, 6318 .read = mem_cgroup_read, 6319 }, 6320 { }, /* terminate */ 6321 }; 6322 #endif 6323 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6324 { 6325 struct mem_cgroup_per_node *pn; 6326 struct mem_cgroup_per_zone *mz; 6327 int zone, tmp = node; 6328 /* 6329 * This routine is called against possible nodes. 6330 * But it's BUG to call kmalloc() against offline node. 6331 * 6332 * TODO: this routine can waste much memory for nodes which will 6333 * never be onlined. It's better to use memory hotplug callback 6334 * function. 6335 */ 6336 if (!node_state(node, N_NORMAL_MEMORY)) 6337 tmp = -1; 6338 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6339 if (!pn) 6340 return 1; 6341 6342 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6343 mz = &pn->zoneinfo[zone]; 6344 lruvec_init(&mz->lruvec); 6345 mz->usage_in_excess = 0; 6346 mz->on_tree = false; 6347 mz->memcg = memcg; 6348 } 6349 memcg->nodeinfo[node] = pn; 6350 return 0; 6351 } 6352 6353 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6354 { 6355 kfree(memcg->nodeinfo[node]); 6356 } 6357 6358 static struct mem_cgroup *mem_cgroup_alloc(void) 6359 { 6360 struct mem_cgroup *memcg; 6361 size_t size = memcg_size(); 6362 6363 /* Can be very big if nr_node_ids is very big */ 6364 if (size < PAGE_SIZE) 6365 memcg = kzalloc(size, GFP_KERNEL); 6366 else 6367 memcg = vzalloc(size); 6368 6369 if (!memcg) 6370 return NULL; 6371 6372 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6373 if (!memcg->stat) 6374 goto out_free; 6375 spin_lock_init(&memcg->pcp_counter_lock); 6376 return memcg; 6377 6378 out_free: 6379 if (size < PAGE_SIZE) 6380 kfree(memcg); 6381 else 6382 vfree(memcg); 6383 return NULL; 6384 } 6385 6386 /* 6387 * At destroying mem_cgroup, references from swap_cgroup can remain. 6388 * (scanning all at force_empty is too costly...) 
6389 * 6390 * Instead of clearing all references at force_empty, we remember 6391 * the number of reference from swap_cgroup and free mem_cgroup when 6392 * it goes down to 0. 6393 * 6394 * Removal of cgroup itself succeeds regardless of refs from swap. 6395 */ 6396 6397 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6398 { 6399 int node; 6400 size_t size = memcg_size(); 6401 6402 mem_cgroup_remove_from_trees(memcg); 6403 free_css_id(&mem_cgroup_subsys, &memcg->css); 6404 6405 for_each_node(node) 6406 free_mem_cgroup_per_zone_info(memcg, node); 6407 6408 free_percpu(memcg->stat); 6409 6410 /* 6411 * We need to make sure that (at least for now), the jump label 6412 * destruction code runs outside of the cgroup lock. This is because 6413 * get_online_cpus(), which is called from the static_branch update, 6414 * can't be called inside the cgroup_lock. cpusets are the ones 6415 * enforcing this dependency, so if they ever change, we might as well. 6416 * 6417 * schedule_work() will guarantee this happens. Be careful if you need 6418 * to move this code around, and make sure it is outside 6419 * the cgroup_lock. 6420 */ 6421 disarm_static_keys(memcg); 6422 if (size < PAGE_SIZE) 6423 kfree(memcg); 6424 else 6425 vfree(memcg); 6426 } 6427 6428 /* 6429 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 6430 */ 6431 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 6432 { 6433 if (!memcg->res.parent) 6434 return NULL; 6435 return mem_cgroup_from_res_counter(memcg->res.parent, res); 6436 } 6437 EXPORT_SYMBOL(parent_mem_cgroup); 6438 6439 static void __init mem_cgroup_soft_limit_tree_init(void) 6440 { 6441 struct mem_cgroup_tree_per_node *rtpn; 6442 struct mem_cgroup_tree_per_zone *rtpz; 6443 int tmp, node, zone; 6444 6445 for_each_node(node) { 6446 tmp = node; 6447 if (!node_state(node, N_NORMAL_MEMORY)) 6448 tmp = -1; 6449 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6450 BUG_ON(!rtpn); 6451 6452 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6453 6454 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6455 rtpz = &rtpn->rb_tree_per_zone[zone]; 6456 rtpz->rb_root = RB_ROOT; 6457 spin_lock_init(&rtpz->lock); 6458 } 6459 } 6460 } 6461 6462 static struct cgroup_subsys_state * __ref 6463 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6464 { 6465 struct mem_cgroup *memcg; 6466 long error = -ENOMEM; 6467 int node; 6468 6469 memcg = mem_cgroup_alloc(); 6470 if (!memcg) 6471 return ERR_PTR(error); 6472 6473 for_each_node(node) 6474 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 6475 goto free_out; 6476 6477 /* root ? 
*/
6478 if (parent_css == NULL) {
6479 root_mem_cgroup = memcg;
6480 res_counter_init(&memcg->res, NULL);
6481 res_counter_init(&memcg->memsw, NULL);
6482 res_counter_init(&memcg->kmem, NULL);
6483 }
6484
6485 memcg->last_scanned_node = MAX_NUMNODES;
6486 INIT_LIST_HEAD(&memcg->oom_notify);
6487 memcg->move_charge_at_immigrate = 0;
6488 mutex_init(&memcg->thresholds_lock);
6489 spin_lock_init(&memcg->move_lock);
6490 vmpressure_init(&memcg->vmpressure);
6491
6492 return &memcg->css;
6493
6494 free_out:
6495 __mem_cgroup_free(memcg);
6496 return ERR_PTR(error);
6497 }
6498
6499 static int
6500 mem_cgroup_css_online(struct cgroup_subsys_state *css)
6501 {
6502 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6503 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6504 int error = 0;
6505
6506 if (!parent)
6507 return 0;
6508
6509 mutex_lock(&memcg_create_mutex);
6510
6511 memcg->use_hierarchy = parent->use_hierarchy;
6512 memcg->oom_kill_disable = parent->oom_kill_disable;
6513 memcg->swappiness = mem_cgroup_swappiness(parent);
6514
6515 if (parent->use_hierarchy) {
6516 res_counter_init(&memcg->res, &parent->res);
6517 res_counter_init(&memcg->memsw, &parent->memsw);
6518 res_counter_init(&memcg->kmem, &parent->kmem);
6519
6520 /*
6521 * No need to take a reference to the parent because cgroup
6522 * core guarantees its existence.
6523 */
6524 } else {
6525 res_counter_init(&memcg->res, NULL);
6526 res_counter_init(&memcg->memsw, NULL);
6527 res_counter_init(&memcg->kmem, NULL);
6528 /*
6529 * A deeper hierarchy with use_hierarchy == false doesn't make
6530 * much sense, so let the cgroup subsystem know about this
6531 * unfortunate state in our controller.
6532 */
6533 if (parent != root_mem_cgroup)
6534 mem_cgroup_subsys.broken_hierarchy = true;
6535 }
6536
6537 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6538 mutex_unlock(&memcg_create_mutex);
6539 return error;
6540 }
6541
6542 /*
6543 * Announce to all parents that a group from their hierarchy is gone.
6544 */
6545 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6546 {
6547 struct mem_cgroup *parent = memcg;
6548
6549 while ((parent = parent_mem_cgroup(parent)))
6550 mem_cgroup_iter_invalidate(parent);
6551
6552 /*
6553 * If the root memcg is not hierarchical we have to check it
6554 * explicitly.
6555 */
6556 if (!root_mem_cgroup->use_hierarchy)
6557 mem_cgroup_iter_invalidate(root_mem_cgroup);
6558 }
6559
6560 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6561 {
6562 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6563 struct cgroup *cgrp = css->cgroup;
6564 struct cgroup_event *event, *tmp;
6565
6566 /*
6567 * Unregister events and notify userspace.
6568 * Notify userspace about cgroup removal only after rmdir of the cgroup
6569 * directory, to avoid a race between userspace and kernelspace.
6570 */ 6571 spin_lock(&cgrp->event_list_lock); 6572 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 6573 list_del_init(&event->list); 6574 schedule_work(&event->remove); 6575 } 6576 spin_unlock(&cgrp->event_list_lock); 6577 6578 kmem_cgroup_css_offline(memcg); 6579 6580 mem_cgroup_invalidate_reclaim_iterators(memcg); 6581 mem_cgroup_reparent_charges(memcg); 6582 mem_cgroup_destroy_all_caches(memcg); 6583 vmpressure_cleanup(&memcg->vmpressure); 6584 } 6585 6586 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 6587 { 6588 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6589 6590 memcg_destroy_kmem(memcg); 6591 __mem_cgroup_free(memcg); 6592 } 6593 6594 #ifdef CONFIG_MMU 6595 /* Handlers for move charge at task migration. */ 6596 #define PRECHARGE_COUNT_AT_ONCE 256 6597 static int mem_cgroup_do_precharge(unsigned long count) 6598 { 6599 int ret = 0; 6600 int batch_count = PRECHARGE_COUNT_AT_ONCE; 6601 struct mem_cgroup *memcg = mc.to; 6602 6603 if (mem_cgroup_is_root(memcg)) { 6604 mc.precharge += count; 6605 /* we don't need css_get for root */ 6606 return ret; 6607 } 6608 /* try to charge at once */ 6609 if (count > 1) { 6610 struct res_counter *dummy; 6611 /* 6612 * "memcg" cannot be under rmdir() because we've already checked 6613 * by cgroup_lock_live_cgroup() that it is not removed and we 6614 * are still under the same cgroup_mutex. So we can postpone 6615 * css_get(). 6616 */ 6617 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 6618 goto one_by_one; 6619 if (do_swap_account && res_counter_charge(&memcg->memsw, 6620 PAGE_SIZE * count, &dummy)) { 6621 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 6622 goto one_by_one; 6623 } 6624 mc.precharge += count; 6625 return ret; 6626 } 6627 one_by_one: 6628 /* fall back to one by one charge */ 6629 while (count--) { 6630 if (signal_pending(current)) { 6631 ret = -EINTR; 6632 break; 6633 } 6634 if (!batch_count--) { 6635 batch_count = PRECHARGE_COUNT_AT_ONCE; 6636 cond_resched(); 6637 } 6638 ret = __mem_cgroup_try_charge(NULL, 6639 GFP_KERNEL, 1, &memcg, false); 6640 if (ret) 6641 /* mem_cgroup_clear_mc() will do uncharge later */ 6642 return ret; 6643 mc.precharge++; 6644 } 6645 return ret; 6646 } 6647 6648 /** 6649 * get_mctgt_type - get target type of moving charge 6650 * @vma: the vma the pte to be checked belongs 6651 * @addr: the address corresponding to the pte to be checked 6652 * @ptent: the pte to be checked 6653 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6654 * 6655 * Returns 6656 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 6657 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 6658 * move charge. if @target is not NULL, the page is stored in target->page 6659 * with extra refcnt got(Callers should handle it). 6660 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 6661 * target for charge migration. if @target is not NULL, the entry is stored 6662 * in target->ent. 6663 * 6664 * Called with pte lock held. 
6665 */ 6666 union mc_target { 6667 struct page *page; 6668 swp_entry_t ent; 6669 }; 6670 6671 enum mc_target_type { 6672 MC_TARGET_NONE = 0, 6673 MC_TARGET_PAGE, 6674 MC_TARGET_SWAP, 6675 }; 6676 6677 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 6678 unsigned long addr, pte_t ptent) 6679 { 6680 struct page *page = vm_normal_page(vma, addr, ptent); 6681 6682 if (!page || !page_mapped(page)) 6683 return NULL; 6684 if (PageAnon(page)) { 6685 /* we don't move shared anon */ 6686 if (!move_anon()) 6687 return NULL; 6688 } else if (!move_file()) 6689 /* we ignore mapcount for file pages */ 6690 return NULL; 6691 if (!get_page_unless_zero(page)) 6692 return NULL; 6693 6694 return page; 6695 } 6696 6697 #ifdef CONFIG_SWAP 6698 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6699 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6700 { 6701 struct page *page = NULL; 6702 swp_entry_t ent = pte_to_swp_entry(ptent); 6703 6704 if (!move_anon() || non_swap_entry(ent)) 6705 return NULL; 6706 /* 6707 * Because lookup_swap_cache() updates some statistics counter, 6708 * we call find_get_page() with swapper_space directly. 6709 */ 6710 page = find_get_page(swap_address_space(ent), ent.val); 6711 if (do_swap_account) 6712 entry->val = ent.val; 6713 6714 return page; 6715 } 6716 #else 6717 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6718 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6719 { 6720 return NULL; 6721 } 6722 #endif 6723 6724 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 6725 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6726 { 6727 struct page *page = NULL; 6728 struct address_space *mapping; 6729 pgoff_t pgoff; 6730 6731 if (!vma->vm_file) /* anonymous vma */ 6732 return NULL; 6733 if (!move_file()) 6734 return NULL; 6735 6736 mapping = vma->vm_file->f_mapping; 6737 if (pte_none(ptent)) 6738 pgoff = linear_page_index(vma, addr); 6739 else /* pte_file(ptent) is true */ 6740 pgoff = pte_to_pgoff(ptent); 6741 6742 /* page is moved even if it's not RSS of this task(page-faulted). */ 6743 page = find_get_page(mapping, pgoff); 6744 6745 #ifdef CONFIG_SWAP 6746 /* shmem/tmpfs may report page out on swap: account for that too. */ 6747 if (radix_tree_exceptional_entry(page)) { 6748 swp_entry_t swap = radix_to_swp_entry(page); 6749 if (do_swap_account) 6750 *entry = swap; 6751 page = find_get_page(swap_address_space(swap), swap.val); 6752 } 6753 #endif 6754 return page; 6755 } 6756 6757 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6758 unsigned long addr, pte_t ptent, union mc_target *target) 6759 { 6760 struct page *page = NULL; 6761 struct page_cgroup *pc; 6762 enum mc_target_type ret = MC_TARGET_NONE; 6763 swp_entry_t ent = { .val = 0 }; 6764 6765 if (pte_present(ptent)) 6766 page = mc_handle_present_pte(vma, addr, ptent); 6767 else if (is_swap_pte(ptent)) 6768 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 6769 else if (pte_none(ptent) || pte_file(ptent)) 6770 page = mc_handle_file_pte(vma, addr, ptent, &ent); 6771 6772 if (!page && !ent.val) 6773 return ret; 6774 if (page) { 6775 pc = lookup_page_cgroup(page); 6776 /* 6777 * Do only loose check w/o page_cgroup lock. 6778 * mem_cgroup_move_account() checks the pc is valid or not under 6779 * the lock. 
6780 */ 6781 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6782 ret = MC_TARGET_PAGE; 6783 if (target) 6784 target->page = page; 6785 } 6786 if (!ret || !target) 6787 put_page(page); 6788 } 6789 /* There is a swap entry and a page doesn't exist or isn't charged */ 6790 if (ent.val && !ret && 6791 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 6792 ret = MC_TARGET_SWAP; 6793 if (target) 6794 target->ent = ent; 6795 } 6796 return ret; 6797 } 6798 6799 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6800 /* 6801 * We don't consider swapping or file mapped pages because THP does not 6802 * support them for now. 6803 * Caller should make sure that pmd_trans_huge(pmd) is true. 6804 */ 6805 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6806 unsigned long addr, pmd_t pmd, union mc_target *target) 6807 { 6808 struct page *page = NULL; 6809 struct page_cgroup *pc; 6810 enum mc_target_type ret = MC_TARGET_NONE; 6811 6812 page = pmd_page(pmd); 6813 VM_BUG_ON(!page || !PageHead(page)); 6814 if (!move_anon()) 6815 return ret; 6816 pc = lookup_page_cgroup(page); 6817 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6818 ret = MC_TARGET_PAGE; 6819 if (target) { 6820 get_page(page); 6821 target->page = page; 6822 } 6823 } 6824 return ret; 6825 } 6826 #else 6827 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6828 unsigned long addr, pmd_t pmd, union mc_target *target) 6829 { 6830 return MC_TARGET_NONE; 6831 } 6832 #endif 6833 6834 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6835 unsigned long addr, unsigned long end, 6836 struct mm_walk *walk) 6837 { 6838 struct vm_area_struct *vma = walk->private; 6839 pte_t *pte; 6840 spinlock_t *ptl; 6841 6842 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6843 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6844 mc.precharge += HPAGE_PMD_NR; 6845 spin_unlock(&vma->vm_mm->page_table_lock); 6846 return 0; 6847 } 6848 6849 if (pmd_trans_unstable(pmd)) 6850 return 0; 6851 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6852 for (; addr != end; pte++, addr += PAGE_SIZE) 6853 if (get_mctgt_type(vma, addr, *pte, NULL)) 6854 mc.precharge++; /* increment precharge temporarily */ 6855 pte_unmap_unlock(pte - 1, ptl); 6856 cond_resched(); 6857 6858 return 0; 6859 } 6860 6861 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6862 { 6863 unsigned long precharge; 6864 struct vm_area_struct *vma; 6865 6866 down_read(&mm->mmap_sem); 6867 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6868 struct mm_walk mem_cgroup_count_precharge_walk = { 6869 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6870 .mm = mm, 6871 .private = vma, 6872 }; 6873 if (is_vm_hugetlb_page(vma)) 6874 continue; 6875 walk_page_range(vma->vm_start, vma->vm_end, 6876 &mem_cgroup_count_precharge_walk); 6877 } 6878 up_read(&mm->mmap_sem); 6879 6880 precharge = mc.precharge; 6881 mc.precharge = 0; 6882 6883 return precharge; 6884 } 6885 6886 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6887 { 6888 unsigned long precharge = mem_cgroup_count_precharge(mm); 6889 6890 VM_BUG_ON(mc.moving_task); 6891 mc.moving_task = current; 6892 return mem_cgroup_do_precharge(precharge); 6893 } 6894 6895 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/
6896 static void __mem_cgroup_clear_mc(void)
6897 {
6898 struct mem_cgroup *from = mc.from;
6899 struct mem_cgroup *to = mc.to;
6900 int i;
6901
6902 /* we must uncharge all the leftover precharges from mc.to */
6903 if (mc.precharge) {
6904 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6905 mc.precharge = 0;
6906 }
6907 /*
6908 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6909 * we must uncharge here.
6910 */
6911 if (mc.moved_charge) {
6912 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6913 mc.moved_charge = 0;
6914 }
6915 /* we must fixup refcnts and charges */
6916 if (mc.moved_swap) {
6917 /* uncharge swap account from the old cgroup */
6918 if (!mem_cgroup_is_root(mc.from))
6919 res_counter_uncharge(&mc.from->memsw,
6920 PAGE_SIZE * mc.moved_swap);
6921
6922 for (i = 0; i < mc.moved_swap; i++)
6923 css_put(&mc.from->css);
6924
6925 if (!mem_cgroup_is_root(mc.to)) {
6926 /*
6927 * we charged both to->res and to->memsw, so we should
6928 * uncharge to->res.
6929 */
6930 res_counter_uncharge(&mc.to->res,
6931 PAGE_SIZE * mc.moved_swap);
6932 }
6933 /* we've already done css_get(mc.to) */
6934 mc.moved_swap = 0;
6935 }
6936 memcg_oom_recover(from);
6937 memcg_oom_recover(to);
6938 wake_up_all(&mc.waitq);
6939 }
6940
6941 static void mem_cgroup_clear_mc(void)
6942 {
6943 struct mem_cgroup *from = mc.from;
6944
6945 /*
6946 * we must clear moving_task before waking up waiters at the end of
6947 * task migration.
6948 */
6949 mc.moving_task = NULL;
6950 __mem_cgroup_clear_mc();
6951 spin_lock(&mc.lock);
6952 mc.from = NULL;
6953 mc.to = NULL;
6954 spin_unlock(&mc.lock);
6955 mem_cgroup_end_move(from);
6956 }
6957
6958 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6959 struct cgroup_taskset *tset)
6960 {
6961 struct task_struct *p = cgroup_taskset_first(tset);
6962 int ret = 0;
6963 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6964 unsigned long move_charge_at_immigrate;
6965
6966 /*
6967 * We are now committed to this value whatever it is. Changes in this
6968 * tunable will only affect upcoming migrations, not the current one.
6969 * So we need to save it, and keep it going.
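 *
 * For example (illustrative path):
 *
 *	echo 3 > /sys/fs/cgroup/memory/<group>/memory.move_charge_at_immigrate
 *
 * sets bit 0 (anonymous pages and their swap) and bit 1 (file pages) in
 * the destination group, so both kinds of charge will follow a task
 * that is subsequently moved into <group>.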
6970 */ 6971 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 6972 if (move_charge_at_immigrate) { 6973 struct mm_struct *mm; 6974 struct mem_cgroup *from = mem_cgroup_from_task(p); 6975 6976 VM_BUG_ON(from == memcg); 6977 6978 mm = get_task_mm(p); 6979 if (!mm) 6980 return 0; 6981 /* We move charges only when we move a owner of the mm */ 6982 if (mm->owner == p) { 6983 VM_BUG_ON(mc.from); 6984 VM_BUG_ON(mc.to); 6985 VM_BUG_ON(mc.precharge); 6986 VM_BUG_ON(mc.moved_charge); 6987 VM_BUG_ON(mc.moved_swap); 6988 mem_cgroup_start_move(from); 6989 spin_lock(&mc.lock); 6990 mc.from = from; 6991 mc.to = memcg; 6992 mc.immigrate_flags = move_charge_at_immigrate; 6993 spin_unlock(&mc.lock); 6994 /* We set mc.moving_task later */ 6995 6996 ret = mem_cgroup_precharge_mc(mm); 6997 if (ret) 6998 mem_cgroup_clear_mc(); 6999 } 7000 mmput(mm); 7001 } 7002 return ret; 7003 } 7004 7005 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 7006 struct cgroup_taskset *tset) 7007 { 7008 mem_cgroup_clear_mc(); 7009 } 7010 7011 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 7012 unsigned long addr, unsigned long end, 7013 struct mm_walk *walk) 7014 { 7015 int ret = 0; 7016 struct vm_area_struct *vma = walk->private; 7017 pte_t *pte; 7018 spinlock_t *ptl; 7019 enum mc_target_type target_type; 7020 union mc_target target; 7021 struct page *page; 7022 struct page_cgroup *pc; 7023 7024 /* 7025 * We don't take compound_lock() here but no race with splitting thp 7026 * happens because: 7027 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 7028 * under splitting, which means there's no concurrent thp split, 7029 * - if another thread runs into split_huge_page() just after we 7030 * entered this if-block, the thread must wait for page table lock 7031 * to be unlocked in __split_huge_page_splitting(), where the main 7032 * part of thp split is not executed yet. 7033 */ 7034 if (pmd_trans_huge_lock(pmd, vma) == 1) { 7035 if (mc.precharge < HPAGE_PMD_NR) { 7036 spin_unlock(&vma->vm_mm->page_table_lock); 7037 return 0; 7038 } 7039 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 7040 if (target_type == MC_TARGET_PAGE) { 7041 page = target.page; 7042 if (!isolate_lru_page(page)) { 7043 pc = lookup_page_cgroup(page); 7044 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 7045 pc, mc.from, mc.to)) { 7046 mc.precharge -= HPAGE_PMD_NR; 7047 mc.moved_charge += HPAGE_PMD_NR; 7048 } 7049 putback_lru_page(page); 7050 } 7051 put_page(page); 7052 } 7053 spin_unlock(&vma->vm_mm->page_table_lock); 7054 return 0; 7055 } 7056 7057 if (pmd_trans_unstable(pmd)) 7058 return 0; 7059 retry: 7060 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 7061 for (; addr != end; addr += PAGE_SIZE) { 7062 pte_t ptent = *(pte++); 7063 swp_entry_t ent; 7064 7065 if (!mc.precharge) 7066 break; 7067 7068 switch (get_mctgt_type(vma, addr, ptent, &target)) { 7069 case MC_TARGET_PAGE: 7070 page = target.page; 7071 if (isolate_lru_page(page)) 7072 goto put; 7073 pc = lookup_page_cgroup(page); 7074 if (!mem_cgroup_move_account(page, 1, pc, 7075 mc.from, mc.to)) { 7076 mc.precharge--; 7077 /* we uncharge from mc.from later. */ 7078 mc.moved_charge++; 7079 } 7080 putback_lru_page(page); 7081 put: /* get_mctgt_type() gets the page */ 7082 put_page(page); 7083 break; 7084 case MC_TARGET_SWAP: 7085 ent = target.ent; 7086 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 7087 mc.precharge--; 7088 /* we fixup refcnts and charges later. 
*/ 7089 mc.moved_swap++; 7090 } 7091 break; 7092 default: 7093 break; 7094 } 7095 } 7096 pte_unmap_unlock(pte - 1, ptl); 7097 cond_resched(); 7098 7099 if (addr != end) { 7100 /* 7101 * We have consumed all precharges we got in can_attach(). 7102 * We try charge one by one, but don't do any additional 7103 * charges to mc.to if we have failed in charge once in attach() 7104 * phase. 7105 */ 7106 ret = mem_cgroup_do_precharge(1); 7107 if (!ret) 7108 goto retry; 7109 } 7110 7111 return ret; 7112 } 7113 7114 static void mem_cgroup_move_charge(struct mm_struct *mm) 7115 { 7116 struct vm_area_struct *vma; 7117 7118 lru_add_drain_all(); 7119 retry: 7120 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 7121 /* 7122 * Someone who are holding the mmap_sem might be waiting in 7123 * waitq. So we cancel all extra charges, wake up all waiters, 7124 * and retry. Because we cancel precharges, we might not be able 7125 * to move enough charges, but moving charge is a best-effort 7126 * feature anyway, so it wouldn't be a big problem. 7127 */ 7128 __mem_cgroup_clear_mc(); 7129 cond_resched(); 7130 goto retry; 7131 } 7132 for (vma = mm->mmap; vma; vma = vma->vm_next) { 7133 int ret; 7134 struct mm_walk mem_cgroup_move_charge_walk = { 7135 .pmd_entry = mem_cgroup_move_charge_pte_range, 7136 .mm = mm, 7137 .private = vma, 7138 }; 7139 if (is_vm_hugetlb_page(vma)) 7140 continue; 7141 ret = walk_page_range(vma->vm_start, vma->vm_end, 7142 &mem_cgroup_move_charge_walk); 7143 if (ret) 7144 /* 7145 * means we have consumed all precharges and failed in 7146 * doing additional charge. Just abandon here. 7147 */ 7148 break; 7149 } 7150 up_read(&mm->mmap_sem); 7151 } 7152 7153 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7154 struct cgroup_taskset *tset) 7155 { 7156 struct task_struct *p = cgroup_taskset_first(tset); 7157 struct mm_struct *mm = get_task_mm(p); 7158 7159 if (mm) { 7160 if (mc.to) 7161 mem_cgroup_move_charge(mm); 7162 mmput(mm); 7163 } 7164 if (mc.to) 7165 mem_cgroup_clear_mc(); 7166 } 7167 #else /* !CONFIG_MMU */ 7168 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 7169 struct cgroup_taskset *tset) 7170 { 7171 return 0; 7172 } 7173 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 7174 struct cgroup_taskset *tset) 7175 { 7176 } 7177 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7178 struct cgroup_taskset *tset) 7179 { 7180 } 7181 #endif 7182 7183 /* 7184 * Cgroup retains root cgroups across [un]mount cycles making it necessary 7185 * to verify sane_behavior flag on each mount attempt. 7186 */ 7187 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 7188 { 7189 /* 7190 * use_hierarchy is forced with sane_behavior. cgroup core 7191 * guarantees that @root doesn't have any children, so turning it 7192 * on for the root memcg is enough. 
7193 */ 7194 if (cgroup_sane_behavior(root_css->cgroup)) 7195 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7196 } 7197 7198 struct cgroup_subsys mem_cgroup_subsys = { 7199 .name = "memory", 7200 .subsys_id = mem_cgroup_subsys_id, 7201 .css_alloc = mem_cgroup_css_alloc, 7202 .css_online = mem_cgroup_css_online, 7203 .css_offline = mem_cgroup_css_offline, 7204 .css_free = mem_cgroup_css_free, 7205 .can_attach = mem_cgroup_can_attach, 7206 .cancel_attach = mem_cgroup_cancel_attach, 7207 .attach = mem_cgroup_move_task, 7208 .bind = mem_cgroup_bind, 7209 .base_cftypes = mem_cgroup_files, 7210 .early_init = 0, 7211 .use_id = 1, 7212 }; 7213 7214 #ifdef CONFIG_MEMCG_SWAP 7215 static int __init enable_swap_account(char *s) 7216 { 7217 if (!strcmp(s, "1")) 7218 really_do_swap_account = 1; 7219 else if (!strcmp(s, "0")) 7220 really_do_swap_account = 0; 7221 return 1; 7222 } 7223 __setup("swapaccount=", enable_swap_account); 7224 7225 static void __init memsw_file_init(void) 7226 { 7227 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); 7228 } 7229 7230 static void __init enable_swap_cgroup(void) 7231 { 7232 if (!mem_cgroup_disabled() && really_do_swap_account) { 7233 do_swap_account = 1; 7234 memsw_file_init(); 7235 } 7236 } 7237 7238 #else 7239 static void __init enable_swap_cgroup(void) 7240 { 7241 } 7242 #endif 7243 7244 /* 7245 * subsys_initcall() for memory controller. 7246 * 7247 * Some parts like hotcpu_notifier() have to be initialized from this context 7248 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 7249 * everything that doesn't depend on a specific mem_cgroup structure should 7250 * be initialized from here. 7251 */ 7252 static int __init mem_cgroup_init(void) 7253 { 7254 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7255 enable_swap_cgroup(); 7256 mem_cgroup_soft_limit_tree_init(); 7257 memcg_stock_init(); 7258 return 0; 7259 } 7260 subsys_initcall(mem_cgroup_init); 7261