1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * This program is free software; you can redistribute it and/or modify 18 * it under the terms of the GNU General Public License as published by 19 * the Free Software Foundation; either version 2 of the License, or 20 * (at your option) any later version. 21 * 22 * This program is distributed in the hope that it will be useful, 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 * GNU General Public License for more details. 26 */ 27 28 #include <linux/page_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/mm.h> 32 #include <linux/hugetlb.h> 33 #include <linux/pagemap.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmpressure.h> 53 #include <linux/mm_inline.h> 54 #include <linux/swap_cgroup.h> 55 #include <linux/cpu.h> 56 #include <linux/oom.h> 57 #include <linux/lockdep.h> 58 #include <linux/file.h> 59 #include "internal.h" 60 #include <net/sock.h> 61 #include <net/ip.h> 62 #include <net/tcp_memcontrol.h> 63 #include "slab.h" 64 65 #include <asm/uaccess.h> 66 67 #include <trace/events/vmscan.h> 68 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 70 EXPORT_SYMBOL(memory_cgrp_subsys); 71 72 #define MEM_CGROUP_RECLAIM_RETRIES 5 73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 74 75 #ifdef CONFIG_MEMCG_SWAP 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 77 int do_swap_account __read_mostly; 78 79 /* for remember boot option*/ 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 81 static int really_do_swap_account __initdata = 1; 82 #else 83 static int really_do_swap_account __initdata; 84 #endif 85 86 #else 87 #define do_swap_account 0 88 #endif 89 90 91 static const char * const mem_cgroup_stat_names[] = { 92 "cache", 93 "rss", 94 "rss_huge", 95 "mapped_file", 96 "writeback", 97 "swap", 98 }; 99 100 enum mem_cgroup_events_index { 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_NSTATS, 106 }; 107 108 static const char * const mem_cgroup_events_names[] = { 109 "pgpgin", 110 "pgpgout", 111 "pgfault", 112 "pgmajfault", 113 }; 114 115 static const char * const mem_cgroup_lru_names[] = { 116 "inactive_anon", 117 "active_anon", 118 "inactive_file", 119 "active_file", 120 "unevictable", 121 }; 122 123 /* 124 * Per memcg event counter is incremented at every pagein/pageout. 
With THP, 125 * it will be incremated by the number of pages. This counter is used for 126 * for trigger some periodic events. This is straightforward and better 127 * than using jiffies etc. to handle periodic memcg event. 128 */ 129 enum mem_cgroup_events_target { 130 MEM_CGROUP_TARGET_THRESH, 131 MEM_CGROUP_TARGET_SOFTLIMIT, 132 MEM_CGROUP_TARGET_NUMAINFO, 133 MEM_CGROUP_NTARGETS, 134 }; 135 #define THRESHOLDS_EVENTS_TARGET 128 136 #define SOFTLIMIT_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024 138 139 struct mem_cgroup_stat_cpu { 140 long count[MEM_CGROUP_STAT_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 142 unsigned long nr_page_events; 143 unsigned long targets[MEM_CGROUP_NTARGETS]; 144 }; 145 146 struct reclaim_iter { 147 struct mem_cgroup *position; 148 /* scan generation, increased every round-trip */ 149 unsigned int generation; 150 }; 151 152 /* 153 * per-zone information in memory controller. 154 */ 155 struct mem_cgroup_per_zone { 156 struct lruvec lruvec; 157 unsigned long lru_size[NR_LRU_LISTS]; 158 159 struct reclaim_iter iter[DEF_PRIORITY + 1]; 160 161 struct rb_node tree_node; /* RB tree node */ 162 unsigned long usage_in_excess;/* Set to the value by which */ 163 /* the soft limit is exceeded*/ 164 bool on_tree; 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 166 /* use container_of */ 167 }; 168 169 struct mem_cgroup_per_node { 170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 171 }; 172 173 /* 174 * Cgroups above their limits are maintained in a RB-Tree, independent of 175 * their hierarchy representation 176 */ 177 178 struct mem_cgroup_tree_per_zone { 179 struct rb_root rb_root; 180 spinlock_t lock; 181 }; 182 183 struct mem_cgroup_tree_per_node { 184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 185 }; 186 187 struct mem_cgroup_tree { 188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 189 }; 190 191 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 192 193 struct mem_cgroup_threshold { 194 struct eventfd_ctx *eventfd; 195 unsigned long threshold; 196 }; 197 198 /* For threshold */ 199 struct mem_cgroup_threshold_ary { 200 /* An array index points to threshold just below or equal to usage. */ 201 int current_threshold; 202 /* Size of entries[] */ 203 unsigned int size; 204 /* Array of thresholds */ 205 struct mem_cgroup_threshold entries[0]; 206 }; 207 208 struct mem_cgroup_thresholds { 209 /* Primary thresholds array */ 210 struct mem_cgroup_threshold_ary *primary; 211 /* 212 * Spare threshold array. 213 * This is needed to make mem_cgroup_unregister_event() "never fail". 214 * It must be able to store at least primary->size - 1 entries. 215 */ 216 struct mem_cgroup_threshold_ary *spare; 217 }; 218 219 /* for OOM */ 220 struct mem_cgroup_eventfd_list { 221 struct list_head list; 222 struct eventfd_ctx *eventfd; 223 }; 224 225 /* 226 * cgroup_event represents events which userspace want to receive. 227 */ 228 struct mem_cgroup_event { 229 /* 230 * memcg which the event belongs to. 231 */ 232 struct mem_cgroup *memcg; 233 /* 234 * eventfd to signal userspace about the event. 235 */ 236 struct eventfd_ctx *eventfd; 237 /* 238 * Each of these stored in a list by the cgroup. 239 */ 240 struct list_head list; 241 /* 242 * register_event() callback will be used to add new userspace 243 * waiter for changes related to this event. Use eventfd_signal() 244 * on eventfd to send notification to userspace. 
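	 *
	 * For orientation, a sketch of how userspace is expected to arm such
	 * an event through the cgroup v1 interface (illustrative only; the
	 * paths and threshold value are examples):
	 *
	 *   efd = eventfd(0, 0);
	 *   ufd = open("<cgroup>/memory.usage_in_bytes", O_RDONLY);
	 *   cfd = open("<cgroup>/cgroup.event_control", O_WRONLY);
	 *   dprintf(cfd, "%d %d %lu", efd, ufd, threshold_in_bytes);
	 *
	 * A read() on efd then blocks until the threshold is crossed and
	 * eventfd_signal() fires on the kernel side.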
245 */ 246 int (*register_event)(struct mem_cgroup *memcg, 247 struct eventfd_ctx *eventfd, const char *args); 248 /* 249 * unregister_event() callback will be called when userspace closes 250 * the eventfd or on cgroup removing. This callback must be set, 251 * if you want provide notification functionality. 252 */ 253 void (*unregister_event)(struct mem_cgroup *memcg, 254 struct eventfd_ctx *eventfd); 255 /* 256 * All fields below needed to unregister event when 257 * userspace closes eventfd. 258 */ 259 poll_table pt; 260 wait_queue_head_t *wqh; 261 wait_queue_t wait; 262 struct work_struct remove; 263 }; 264 265 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 267 268 /* 269 * The memory controller data structure. The memory controller controls both 270 * page cache and RSS per cgroup. We would eventually like to provide 271 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 272 * to help the administrator determine what knobs to tune. 273 * 274 * TODO: Add a water mark for the memory controller. Reclaim will begin when 275 * we hit the water mark. May be even add a low water mark, such that 276 * no reclaim occurs from a cgroup at it's low water mark, this is 277 * a feature that will be implemented much later in the future. 278 */ 279 struct mem_cgroup { 280 struct cgroup_subsys_state css; 281 282 /* Accounted resources */ 283 struct page_counter memory; 284 struct page_counter memsw; 285 struct page_counter kmem; 286 287 unsigned long soft_limit; 288 289 /* vmpressure notifications */ 290 struct vmpressure vmpressure; 291 292 /* css_online() has been completed */ 293 int initialized; 294 295 /* 296 * Should the accounting and control be hierarchical, per subtree? 297 */ 298 bool use_hierarchy; 299 300 bool oom_lock; 301 atomic_t under_oom; 302 atomic_t oom_wakeups; 303 304 int swappiness; 305 /* OOM-Killer disable */ 306 int oom_kill_disable; 307 308 /* protect arrays of thresholds */ 309 struct mutex thresholds_lock; 310 311 /* thresholds for memory usage. RCU-protected */ 312 struct mem_cgroup_thresholds thresholds; 313 314 /* thresholds for mem+swap usage. RCU-protected */ 315 struct mem_cgroup_thresholds memsw_thresholds; 316 317 /* For oom notifier event fd */ 318 struct list_head oom_notify; 319 320 /* 321 * Should we move charges of a task when a task is moved into this 322 * mem_cgroup ? And what type of charges should we move ? 323 */ 324 unsigned long move_charge_at_immigrate; 325 /* 326 * set > 0 if pages under this cgroup are moving to other cgroup. 327 */ 328 atomic_t moving_account; 329 /* taken only while moving_account > 0 */ 330 spinlock_t move_lock; 331 /* 332 * percpu counter. 333 */ 334 struct mem_cgroup_stat_cpu __percpu *stat; 335 /* 336 * used when a cpu is offlined or other synchronizations 337 * See mem_cgroup_read_stat(). 
338 */ 339 struct mem_cgroup_stat_cpu nocpu_base; 340 spinlock_t pcp_counter_lock; 341 342 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343 struct cg_proto tcp_mem; 344 #endif 345 #if defined(CONFIG_MEMCG_KMEM) 346 /* analogous to slab_common's slab_caches list, but per-memcg; 347 * protected by memcg_slab_mutex */ 348 struct list_head memcg_slab_caches; 349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 350 int kmemcg_id; 351 #endif 352 353 int last_scanned_node; 354 #if MAX_NUMNODES > 1 355 nodemask_t scan_nodes; 356 atomic_t numainfo_events; 357 atomic_t numainfo_updating; 358 #endif 359 360 /* List of events which userspace want to receive */ 361 struct list_head event_list; 362 spinlock_t event_list_lock; 363 364 struct mem_cgroup_per_node *nodeinfo[0]; 365 /* WARNING: nodeinfo must be the last member here */ 366 }; 367 368 #ifdef CONFIG_MEMCG_KMEM 369 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 370 { 371 return memcg->kmemcg_id >= 0; 372 } 373 #endif 374 375 /* Stuffs for move charges at task migration. */ 376 /* 377 * Types of charges to be moved. "move_charge_at_immitgrate" and 378 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 379 */ 380 enum move_type { 381 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 382 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 383 NR_MOVE_TYPE, 384 }; 385 386 /* "mc" and its members are protected by cgroup_mutex */ 387 static struct move_charge_struct { 388 spinlock_t lock; /* for from, to */ 389 struct mem_cgroup *from; 390 struct mem_cgroup *to; 391 unsigned long immigrate_flags; 392 unsigned long precharge; 393 unsigned long moved_charge; 394 unsigned long moved_swap; 395 struct task_struct *moving_task; /* a task moving charges */ 396 wait_queue_head_t waitq; /* a waitq for other context */ 397 } mc = { 398 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 399 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 400 }; 401 402 static bool move_anon(void) 403 { 404 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 405 } 406 407 static bool move_file(void) 408 { 409 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 410 } 411 412 /* 413 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 414 * limit reclaim to prevent infinite loops, if they ever occur. 415 */ 416 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 417 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 418 419 enum charge_type { 420 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 421 MEM_CGROUP_CHARGE_TYPE_ANON, 422 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 423 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 424 NR_CHARGE_TYPE, 425 }; 426 427 /* for encoding cft->private value on file */ 428 enum res_type { 429 _MEM, 430 _MEMSWAP, 431 _OOM_TYPE, 432 _KMEM, 433 }; 434 435 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 436 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 437 #define MEMFILE_ATTR(val) ((val) & 0xffff) 438 /* Used for OOM nofiier */ 439 #define OOM_CONTROL (0) 440 441 /* 442 * The memcg_create_mutex will be held whenever a new cgroup is created. 443 * As a consequence, any change that needs to protect against new child cgroups 444 * appearing has to hold it as well. 445 */ 446 static DEFINE_MUTEX(memcg_create_mutex); 447 448 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 449 { 450 return s ? container_of(s, struct mem_cgroup, css) : NULL; 451 } 452 453 /* Some nice accessors for the vmpressure. 
*/ 454 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 455 { 456 if (!memcg) 457 memcg = root_mem_cgroup; 458 return &memcg->vmpressure; 459 } 460 461 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 462 { 463 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 464 } 465 466 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 467 { 468 return (memcg == root_mem_cgroup); 469 } 470 471 /* 472 * We restrict the id in the range of [1, 65535], so it can fit into 473 * an unsigned short. 474 */ 475 #define MEM_CGROUP_ID_MAX USHRT_MAX 476 477 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 478 { 479 return memcg->css.id; 480 } 481 482 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 483 { 484 struct cgroup_subsys_state *css; 485 486 css = css_from_id(id, &memory_cgrp_subsys); 487 return mem_cgroup_from_css(css); 488 } 489 490 /* Writing them here to avoid exposing memcg's inner layout */ 491 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 492 493 void sock_update_memcg(struct sock *sk) 494 { 495 if (mem_cgroup_sockets_enabled) { 496 struct mem_cgroup *memcg; 497 struct cg_proto *cg_proto; 498 499 BUG_ON(!sk->sk_prot->proto_cgroup); 500 501 /* Socket cloning can throw us here with sk_cgrp already 502 * filled. It won't however, necessarily happen from 503 * process context. So the test for root memcg given 504 * the current task's memcg won't help us in this case. 505 * 506 * Respecting the original socket's memcg is a better 507 * decision in this case. 508 */ 509 if (sk->sk_cgrp) { 510 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 511 css_get(&sk->sk_cgrp->memcg->css); 512 return; 513 } 514 515 rcu_read_lock(); 516 memcg = mem_cgroup_from_task(current); 517 cg_proto = sk->sk_prot->proto_cgroup(memcg); 518 if (!mem_cgroup_is_root(memcg) && 519 memcg_proto_active(cg_proto) && 520 css_tryget_online(&memcg->css)) { 521 sk->sk_cgrp = cg_proto; 522 } 523 rcu_read_unlock(); 524 } 525 } 526 EXPORT_SYMBOL(sock_update_memcg); 527 528 void sock_release_memcg(struct sock *sk) 529 { 530 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 531 struct mem_cgroup *memcg; 532 WARN_ON(!sk->sk_cgrp->memcg); 533 memcg = sk->sk_cgrp->memcg; 534 css_put(&sk->sk_cgrp->memcg->css); 535 } 536 } 537 538 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 539 { 540 if (!memcg || mem_cgroup_is_root(memcg)) 541 return NULL; 542 543 return &memcg->tcp_mem; 544 } 545 EXPORT_SYMBOL(tcp_proto_cgroup); 546 547 static void disarm_sock_keys(struct mem_cgroup *memcg) 548 { 549 if (!memcg_proto_activated(&memcg->tcp_mem)) 550 return; 551 static_key_slow_dec(&memcg_socket_limit_enabled); 552 } 553 #else 554 static void disarm_sock_keys(struct mem_cgroup *memcg) 555 { 556 } 557 #endif 558 559 #ifdef CONFIG_MEMCG_KMEM 560 /* 561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 562 * The main reason for not using cgroup id for this: 563 * this works better in sparse environments, where we have a lot of memcgs, 564 * but only a few kmem-limited. Or also, if we have, for instance, 200 565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 566 * 200 entry array for that. 567 * 568 * The current size of the caches array is stored in 569 * memcg_limited_groups_array_size. It will double each time we have to 570 * increase it. 
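 *
 * Conceptually (an illustrative sketch of the lookup, not new code in this
 * file), the per-memcg clone of a cache is reached as
 *
 *   cachep->memcg_params->memcg_caches[memcg->kmemcg_id]
 *
 * which is why the array only needs one slot per kmem-active memcg rather
 * than one per cgroup id.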
571 */ 572 static DEFINE_IDA(kmem_limited_groups); 573 int memcg_limited_groups_array_size; 574 575 /* 576 * MIN_SIZE is different than 1, because we would like to avoid going through 577 * the alloc/free process all the time. In a small machine, 4 kmem-limited 578 * cgroups is a reasonable guess. In the future, it could be a parameter or 579 * tunable, but that is strictly not necessary. 580 * 581 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 582 * this constant directly from cgroup, but it is understandable that this is 583 * better kept as an internal representation in cgroup.c. In any case, the 584 * cgrp_id space is not getting any smaller, and we don't have to necessarily 585 * increase ours as well if it increases. 586 */ 587 #define MEMCG_CACHES_MIN_SIZE 4 588 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 589 590 /* 591 * A lot of the calls to the cache allocation functions are expected to be 592 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 593 * conditional to this static branch, we'll have to allow modules that does 594 * kmem_cache_alloc and the such to see this symbol as well 595 */ 596 struct static_key memcg_kmem_enabled_key; 597 EXPORT_SYMBOL(memcg_kmem_enabled_key); 598 599 static void memcg_free_cache_id(int id); 600 601 static void disarm_kmem_keys(struct mem_cgroup *memcg) 602 { 603 if (memcg_kmem_is_active(memcg)) { 604 static_key_slow_dec(&memcg_kmem_enabled_key); 605 memcg_free_cache_id(memcg->kmemcg_id); 606 } 607 /* 608 * This check can't live in kmem destruction function, 609 * since the charges will outlive the cgroup 610 */ 611 WARN_ON(page_counter_read(&memcg->kmem)); 612 } 613 #else 614 static void disarm_kmem_keys(struct mem_cgroup *memcg) 615 { 616 } 617 #endif /* CONFIG_MEMCG_KMEM */ 618 619 static void disarm_static_keys(struct mem_cgroup *memcg) 620 { 621 disarm_sock_keys(memcg); 622 disarm_kmem_keys(memcg); 623 } 624 625 static struct mem_cgroup_per_zone * 626 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 627 { 628 int nid = zone_to_nid(zone); 629 int zid = zone_idx(zone); 630 631 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 632 } 633 634 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 635 { 636 return &memcg->css; 637 } 638 639 static struct mem_cgroup_per_zone * 640 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 641 { 642 int nid = page_to_nid(page); 643 int zid = page_zonenum(page); 644 645 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 646 } 647 648 static struct mem_cgroup_tree_per_zone * 649 soft_limit_tree_node_zone(int nid, int zid) 650 { 651 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 652 } 653 654 static struct mem_cgroup_tree_per_zone * 655 soft_limit_tree_from_page(struct page *page) 656 { 657 int nid = page_to_nid(page); 658 int zid = page_zonenum(page); 659 660 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 661 } 662 663 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 664 struct mem_cgroup_tree_per_zone *mctz, 665 unsigned long new_usage_in_excess) 666 { 667 struct rb_node **p = &mctz->rb_root.rb_node; 668 struct rb_node *parent = NULL; 669 struct mem_cgroup_per_zone *mz_node; 670 671 if (mz->on_tree) 672 return; 673 674 mz->usage_in_excess = new_usage_in_excess; 675 if (!mz->usage_in_excess) 676 return; 677 while (*p) { 678 parent = *p; 679 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 680 tree_node); 681 if (mz->usage_in_excess < 
mz_node->usage_in_excess) 682 p = &(*p)->rb_left; 683 /* 684 * We can't avoid mem cgroups that are over their soft 685 * limit by the same amount 686 */ 687 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 688 p = &(*p)->rb_right; 689 } 690 rb_link_node(&mz->tree_node, parent, p); 691 rb_insert_color(&mz->tree_node, &mctz->rb_root); 692 mz->on_tree = true; 693 } 694 695 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 696 struct mem_cgroup_tree_per_zone *mctz) 697 { 698 if (!mz->on_tree) 699 return; 700 rb_erase(&mz->tree_node, &mctz->rb_root); 701 mz->on_tree = false; 702 } 703 704 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 705 struct mem_cgroup_tree_per_zone *mctz) 706 { 707 unsigned long flags; 708 709 spin_lock_irqsave(&mctz->lock, flags); 710 __mem_cgroup_remove_exceeded(mz, mctz); 711 spin_unlock_irqrestore(&mctz->lock, flags); 712 } 713 714 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 715 { 716 unsigned long nr_pages = page_counter_read(&memcg->memory); 717 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 718 unsigned long excess = 0; 719 720 if (nr_pages > soft_limit) 721 excess = nr_pages - soft_limit; 722 723 return excess; 724 } 725 726 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 727 { 728 unsigned long excess; 729 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_tree_per_zone *mctz; 731 732 mctz = soft_limit_tree_from_page(page); 733 /* 734 * Necessary to update all ancestors when hierarchy is used. 735 * because their event counter is not touched. 736 */ 737 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 738 mz = mem_cgroup_page_zoneinfo(memcg, page); 739 excess = soft_limit_excess(memcg); 740 /* 741 * We have to update the tree if mz is on RB-tree or 742 * mem is over its softlimit. 743 */ 744 if (excess || mz->on_tree) { 745 unsigned long flags; 746 747 spin_lock_irqsave(&mctz->lock, flags); 748 /* if on-tree, remove it */ 749 if (mz->on_tree) 750 __mem_cgroup_remove_exceeded(mz, mctz); 751 /* 752 * Insert again. mz->usage_in_excess will be updated. 753 * If excess is 0, no tree ops. 754 */ 755 __mem_cgroup_insert_exceeded(mz, mctz, excess); 756 spin_unlock_irqrestore(&mctz->lock, flags); 757 } 758 } 759 } 760 761 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 762 { 763 struct mem_cgroup_tree_per_zone *mctz; 764 struct mem_cgroup_per_zone *mz; 765 int nid, zid; 766 767 for_each_node(nid) { 768 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 769 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 770 mctz = soft_limit_tree_node_zone(nid, zid); 771 mem_cgroup_remove_exceeded(mz, mctz); 772 } 773 } 774 } 775 776 static struct mem_cgroup_per_zone * 777 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 778 { 779 struct rb_node *rightmost = NULL; 780 struct mem_cgroup_per_zone *mz; 781 782 retry: 783 mz = NULL; 784 rightmost = rb_last(&mctz->rb_root); 785 if (!rightmost) 786 goto done; /* Nothing to reclaim from */ 787 788 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 789 /* 790 * Remove the node now but someone else can add it back, 791 * we will to add it back at the end of reclaim to its correct 792 * position in the tree. 
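	 *
	 * For context, the soft limit reclaim path (expected to be
	 * mem_cgroup_soft_limit_reclaim() later in this file) uses this
	 * roughly as follows; a hedged sketch, not a verbatim copy:
	 *
	 *   mz = mem_cgroup_largest_soft_limit_node(mctz);
	 *   reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, ...);
	 *   ... re-insert mz with its recomputed excess, then css_put() ...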
 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use a threshold and periodic
 * synchronization to implement a "quick" read. That is a trade-off between
 * reading cost and precision of the value, and we could implement the same
 * kind of periodic synchronization for memcg's counters.
 *
 * But this _read() function is currently used for the user interface. The
 * user accounts memory usage per memory cgroup and always requires an exact
 * value, so even with a quick-and-fuzzy read we would still have to visit
 * all online cpus and sum the counters. For now, the extra synchronization
 * is therefore not implemented (except for cpu hotplug).
 *
 * If kernel-internal users that can live with an inexact value show up, and
 * reading all cpu values becomes a performance bottleneck in some common
 * workload, a threshold-and-synchronization scheme like vmstat[]'s should
 * be implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
			       nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
			       nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
			       nr_pages);

	/* pagein of a big page is an event.
So, ignore page size */ 888 if (nr_pages > 0) 889 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 890 else { 891 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 892 nr_pages = -nr_pages; /* for event */ 893 } 894 895 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 896 } 897 898 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 899 { 900 struct mem_cgroup_per_zone *mz; 901 902 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 903 return mz->lru_size[lru]; 904 } 905 906 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 907 int nid, 908 unsigned int lru_mask) 909 { 910 unsigned long nr = 0; 911 int zid; 912 913 VM_BUG_ON((unsigned)nid >= nr_node_ids); 914 915 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 916 struct mem_cgroup_per_zone *mz; 917 enum lru_list lru; 918 919 for_each_lru(lru) { 920 if (!(BIT(lru) & lru_mask)) 921 continue; 922 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 923 nr += mz->lru_size[lru]; 924 } 925 } 926 return nr; 927 } 928 929 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 930 unsigned int lru_mask) 931 { 932 unsigned long nr = 0; 933 int nid; 934 935 for_each_node_state(nid, N_MEMORY) 936 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 937 return nr; 938 } 939 940 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 941 enum mem_cgroup_events_target target) 942 { 943 unsigned long val, next; 944 945 val = __this_cpu_read(memcg->stat->nr_page_events); 946 next = __this_cpu_read(memcg->stat->targets[target]); 947 /* from time_after() in jiffies.h */ 948 if ((long)next - (long)val < 0) { 949 switch (target) { 950 case MEM_CGROUP_TARGET_THRESH: 951 next = val + THRESHOLDS_EVENTS_TARGET; 952 break; 953 case MEM_CGROUP_TARGET_SOFTLIMIT: 954 next = val + SOFTLIMIT_EVENTS_TARGET; 955 break; 956 case MEM_CGROUP_TARGET_NUMAINFO: 957 next = val + NUMAINFO_EVENTS_TARGET; 958 break; 959 default: 960 break; 961 } 962 __this_cpu_write(memcg->stat->targets[target], next); 963 return true; 964 } 965 return false; 966 } 967 968 /* 969 * Check events in order. 970 * 971 */ 972 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 973 { 974 /* threshold event is triggered in finer grain than soft limit */ 975 if (unlikely(mem_cgroup_event_ratelimit(memcg, 976 MEM_CGROUP_TARGET_THRESH))) { 977 bool do_softlimit; 978 bool do_numainfo __maybe_unused; 979 980 do_softlimit = mem_cgroup_event_ratelimit(memcg, 981 MEM_CGROUP_TARGET_SOFTLIMIT); 982 #if MAX_NUMNODES > 1 983 do_numainfo = mem_cgroup_event_ratelimit(memcg, 984 MEM_CGROUP_TARGET_NUMAINFO); 985 #endif 986 mem_cgroup_threshold(memcg); 987 if (unlikely(do_softlimit)) 988 mem_cgroup_update_tree(memcg, page); 989 #if MAX_NUMNODES > 1 990 if (unlikely(do_numainfo)) 991 atomic_inc(&memcg->numainfo_events); 992 #endif 993 } 994 } 995 996 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 997 { 998 /* 999 * mm_update_next_owner() may clear mm->owner to NULL 1000 * if it races with swapoff, page migration, etc. 1001 * So this can be called with p == NULL. 1002 */ 1003 if (unlikely(!p)) 1004 return NULL; 1005 1006 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1007 } 1008 1009 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1010 { 1011 struct mem_cgroup *memcg = NULL; 1012 1013 rcu_read_lock(); 1014 do { 1015 /* 1016 * Page cache insertions can happen withou an 1017 * actual mm context, e.g. 
during disk probing 1018 * on boot, loopback IO, acct() writes etc. 1019 */ 1020 if (unlikely(!mm)) 1021 memcg = root_mem_cgroup; 1022 else { 1023 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1024 if (unlikely(!memcg)) 1025 memcg = root_mem_cgroup; 1026 } 1027 } while (!css_tryget_online(&memcg->css)); 1028 rcu_read_unlock(); 1029 return memcg; 1030 } 1031 1032 /** 1033 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1034 * @root: hierarchy root 1035 * @prev: previously returned memcg, NULL on first invocation 1036 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1037 * 1038 * Returns references to children of the hierarchy below @root, or 1039 * @root itself, or %NULL after a full round-trip. 1040 * 1041 * Caller must pass the return value in @prev on subsequent 1042 * invocations for reference counting, or use mem_cgroup_iter_break() 1043 * to cancel a hierarchy walk before the round-trip is complete. 1044 * 1045 * Reclaimers can specify a zone and a priority level in @reclaim to 1046 * divide up the memcgs in the hierarchy among all concurrent 1047 * reclaimers operating on the same zone and priority. 1048 */ 1049 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1050 struct mem_cgroup *prev, 1051 struct mem_cgroup_reclaim_cookie *reclaim) 1052 { 1053 struct reclaim_iter *uninitialized_var(iter); 1054 struct cgroup_subsys_state *css = NULL; 1055 struct mem_cgroup *memcg = NULL; 1056 struct mem_cgroup *pos = NULL; 1057 1058 if (mem_cgroup_disabled()) 1059 return NULL; 1060 1061 if (!root) 1062 root = root_mem_cgroup; 1063 1064 if (prev && !reclaim) 1065 pos = prev; 1066 1067 if (!root->use_hierarchy && root != root_mem_cgroup) { 1068 if (prev) 1069 goto out; 1070 return root; 1071 } 1072 1073 rcu_read_lock(); 1074 1075 if (reclaim) { 1076 struct mem_cgroup_per_zone *mz; 1077 1078 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1079 iter = &mz->iter[reclaim->priority]; 1080 1081 if (prev && reclaim->generation != iter->generation) 1082 goto out_unlock; 1083 1084 do { 1085 pos = ACCESS_ONCE(iter->position); 1086 /* 1087 * A racing update may change the position and 1088 * put the last reference, hence css_tryget(), 1089 * or retry to see the updated position. 1090 */ 1091 } while (pos && !css_tryget(&pos->css)); 1092 } 1093 1094 if (pos) 1095 css = &pos->css; 1096 1097 for (;;) { 1098 css = css_next_descendant_pre(css, &root->css); 1099 if (!css) { 1100 /* 1101 * Reclaimers share the hierarchy walk, and a 1102 * new one might jump in right at the end of 1103 * the hierarchy - make sure they see at least 1104 * one group and restart from the beginning. 1105 */ 1106 if (!prev) 1107 continue; 1108 break; 1109 } 1110 1111 /* 1112 * Verify the css and acquire a reference. The root 1113 * is provided by the caller, so we know it's alive 1114 * and kicking, and don't take an extra reference. 1115 */ 1116 memcg = mem_cgroup_from_css(css); 1117 1118 if (css == &root->css) 1119 break; 1120 1121 if (css_tryget(css)) { 1122 /* 1123 * Make sure the memcg is initialized: 1124 * mem_cgroup_css_online() orders the the 1125 * initialization against setting the flag. 1126 */ 1127 if (smp_load_acquire(&memcg->initialized)) 1128 break; 1129 1130 css_put(css); 1131 } 1132 1133 memcg = NULL; 1134 } 1135 1136 if (reclaim) { 1137 if (cmpxchg(&iter->position, pos, memcg) == pos) { 1138 if (memcg) 1139 css_get(&memcg->css); 1140 if (pos) 1141 css_put(&pos->css); 1142 } 1143 1144 /* 1145 * pairs with css_tryget when dereferencing iter->position 1146 * above. 
1147 */ 1148 if (pos) 1149 css_put(&pos->css); 1150 1151 if (!memcg) 1152 iter->generation++; 1153 else if (!prev) 1154 reclaim->generation = iter->generation; 1155 } 1156 1157 out_unlock: 1158 rcu_read_unlock(); 1159 out: 1160 if (prev && prev != root) 1161 css_put(&prev->css); 1162 1163 return memcg; 1164 } 1165 1166 /** 1167 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1168 * @root: hierarchy root 1169 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1170 */ 1171 void mem_cgroup_iter_break(struct mem_cgroup *root, 1172 struct mem_cgroup *prev) 1173 { 1174 if (!root) 1175 root = root_mem_cgroup; 1176 if (prev && prev != root) 1177 css_put(&prev->css); 1178 } 1179 1180 /* 1181 * Iteration constructs for visiting all cgroups (under a tree). If 1182 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1183 * be used for reference counting. 1184 */ 1185 #define for_each_mem_cgroup_tree(iter, root) \ 1186 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1187 iter != NULL; \ 1188 iter = mem_cgroup_iter(root, iter, NULL)) 1189 1190 #define for_each_mem_cgroup(iter) \ 1191 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1192 iter != NULL; \ 1193 iter = mem_cgroup_iter(NULL, iter, NULL)) 1194 1195 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1196 { 1197 struct mem_cgroup *memcg; 1198 1199 rcu_read_lock(); 1200 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1201 if (unlikely(!memcg)) 1202 goto out; 1203 1204 switch (idx) { 1205 case PGFAULT: 1206 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1207 break; 1208 case PGMAJFAULT: 1209 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1210 break; 1211 default: 1212 BUG(); 1213 } 1214 out: 1215 rcu_read_unlock(); 1216 } 1217 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1218 1219 /** 1220 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1221 * @zone: zone of the wanted lruvec 1222 * @memcg: memcg of the wanted lruvec 1223 * 1224 * Returns the lru list vector holding pages for the given @zone and 1225 * @mem. This can be the global zone lruvec, if the memory controller 1226 * is disabled. 1227 */ 1228 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1229 struct mem_cgroup *memcg) 1230 { 1231 struct mem_cgroup_per_zone *mz; 1232 struct lruvec *lruvec; 1233 1234 if (mem_cgroup_disabled()) { 1235 lruvec = &zone->lruvec; 1236 goto out; 1237 } 1238 1239 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1240 lruvec = &mz->lruvec; 1241 out: 1242 /* 1243 * Since a node can be onlined after the mem_cgroup was created, 1244 * we have to be prepared to initialize lruvec->zone here; 1245 * and if offlined then reonlined, we need to reinitialize it. 1246 */ 1247 if (unlikely(lruvec->zone != zone)) 1248 lruvec->zone = zone; 1249 return lruvec; 1250 } 1251 1252 /** 1253 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1254 * @page: the page 1255 * @zone: zone of the page 1256 * 1257 * This function is only safe when following the LRU page isolation 1258 * and putback protocol: the LRU lock must be held, and the page must 1259 * either be PageLRU() or the caller must have isolated/allocated it. 
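 *
 * A minimal usage sketch, assuming the caller follows that protocol (the
 * helpers shown are the standard LRU ones, nothing added here):
 *
 *   spin_lock_irq(&zone->lru_lock);
 *   lruvec = mem_cgroup_page_lruvec(page, zone);
 *   del_page_from_lru_list(page, lruvec, page_lru(page));
 *   spin_unlock_irq(&zone->lru_lock);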
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
{
	if (root == memcg)
		return true;
	if (!root->use_hierarchy)
		return false;
	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
1341 */ 1342 rcu_read_lock(); 1343 task_memcg = mem_cgroup_from_task(task); 1344 css_get(&task_memcg->css); 1345 rcu_read_unlock(); 1346 } 1347 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1348 css_put(&task_memcg->css); 1349 return ret; 1350 } 1351 1352 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1353 { 1354 unsigned long inactive_ratio; 1355 unsigned long inactive; 1356 unsigned long active; 1357 unsigned long gb; 1358 1359 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1360 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1361 1362 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1363 if (gb) 1364 inactive_ratio = int_sqrt(10 * gb); 1365 else 1366 inactive_ratio = 1; 1367 1368 return inactive * inactive_ratio < active; 1369 } 1370 1371 #define mem_cgroup_from_counter(counter, member) \ 1372 container_of(counter, struct mem_cgroup, member) 1373 1374 /** 1375 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1376 * @memcg: the memory cgroup 1377 * 1378 * Returns the maximum amount of memory @mem can be charged with, in 1379 * pages. 1380 */ 1381 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1382 { 1383 unsigned long margin = 0; 1384 unsigned long count; 1385 unsigned long limit; 1386 1387 count = page_counter_read(&memcg->memory); 1388 limit = ACCESS_ONCE(memcg->memory.limit); 1389 if (count < limit) 1390 margin = limit - count; 1391 1392 if (do_swap_account) { 1393 count = page_counter_read(&memcg->memsw); 1394 limit = ACCESS_ONCE(memcg->memsw.limit); 1395 if (count <= limit) 1396 margin = min(margin, limit - count); 1397 } 1398 1399 return margin; 1400 } 1401 1402 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1403 { 1404 /* root ? */ 1405 if (mem_cgroup_disabled() || !memcg->css.parent) 1406 return vm_swappiness; 1407 1408 return memcg->swappiness; 1409 } 1410 1411 /* 1412 * A routine for checking "mem" is under move_account() or not. 1413 * 1414 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1415 * moving cgroups. This is for waiting at high-memory pressure 1416 * caused by "move". 1417 */ 1418 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1419 { 1420 struct mem_cgroup *from; 1421 struct mem_cgroup *to; 1422 bool ret = false; 1423 /* 1424 * Unlike task_move routines, we access mc.to, mc.from not under 1425 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1426 */ 1427 spin_lock(&mc.lock); 1428 from = mc.from; 1429 to = mc.to; 1430 if (!from) 1431 goto unlock; 1432 1433 ret = mem_cgroup_is_descendant(from, memcg) || 1434 mem_cgroup_is_descendant(to, memcg); 1435 unlock: 1436 spin_unlock(&mc.lock); 1437 return ret; 1438 } 1439 1440 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1441 { 1442 if (mc.moving_task && current != mc.moving_task) { 1443 if (mem_cgroup_under_move(memcg)) { 1444 DEFINE_WAIT(wait); 1445 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1446 /* moving charge context might have finished. */ 1447 if (mc.moving_task) 1448 schedule(); 1449 finish_wait(&mc.waitq, &wait); 1450 return true; 1451 } 1452 } 1453 return false; 1454 } 1455 1456 #define K(x) ((x) << (PAGE_SHIFT-10)) 1457 /** 1458 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
1459 * @memcg: The memory cgroup that went over limit 1460 * @p: Task that is going to be killed 1461 * 1462 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1463 * enabled 1464 */ 1465 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1466 { 1467 /* oom_info_lock ensures that parallel ooms do not interleave */ 1468 static DEFINE_MUTEX(oom_info_lock); 1469 struct mem_cgroup *iter; 1470 unsigned int i; 1471 1472 if (!p) 1473 return; 1474 1475 mutex_lock(&oom_info_lock); 1476 rcu_read_lock(); 1477 1478 pr_info("Task in "); 1479 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1480 pr_info(" killed as a result of limit of "); 1481 pr_cont_cgroup_path(memcg->css.cgroup); 1482 pr_info("\n"); 1483 1484 rcu_read_unlock(); 1485 1486 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1487 K((u64)page_counter_read(&memcg->memory)), 1488 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1489 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1490 K((u64)page_counter_read(&memcg->memsw)), 1491 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1492 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1493 K((u64)page_counter_read(&memcg->kmem)), 1494 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1495 1496 for_each_mem_cgroup_tree(iter, memcg) { 1497 pr_info("Memory cgroup stats for "); 1498 pr_cont_cgroup_path(iter->css.cgroup); 1499 pr_cont(":"); 1500 1501 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1502 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1503 continue; 1504 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1505 K(mem_cgroup_read_stat(iter, i))); 1506 } 1507 1508 for (i = 0; i < NR_LRU_LISTS; i++) 1509 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1510 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1511 1512 pr_cont("\n"); 1513 } 1514 mutex_unlock(&oom_info_lock); 1515 } 1516 1517 /* 1518 * This function returns the number of memcg under hierarchy tree. Returns 1519 * 1(self count) if no children. 1520 */ 1521 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1522 { 1523 int num = 0; 1524 struct mem_cgroup *iter; 1525 1526 for_each_mem_cgroup_tree(iter, memcg) 1527 num++; 1528 return num; 1529 } 1530 1531 /* 1532 * Return the memory (and swap, if configured) limit for a memcg. 1533 */ 1534 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1535 { 1536 unsigned long limit; 1537 1538 limit = memcg->memory.limit; 1539 if (mem_cgroup_swappiness(memcg)) { 1540 unsigned long memsw_limit; 1541 1542 memsw_limit = memcg->memsw.limit; 1543 limit = min(limit + total_swap_pages, memsw_limit); 1544 } 1545 return limit; 1546 } 1547 1548 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1549 int order) 1550 { 1551 struct mem_cgroup *iter; 1552 unsigned long chosen_points = 0; 1553 unsigned long totalpages; 1554 unsigned int points = 0; 1555 struct task_struct *chosen = NULL; 1556 1557 /* 1558 * If current has a pending SIGKILL or is exiting, then automatically 1559 * select it. The goal is to allow it to allocate so that it may 1560 * quickly exit and free its memory. 1561 */ 1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1563 set_thread_flag(TIF_MEMDIE); 1564 return; 1565 } 1566 1567 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1568 totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; 1569 for_each_mem_cgroup_tree(iter, memcg) { 1570 struct css_task_iter it; 1571 struct task_struct *task; 1572 1573 css_task_iter_start(&iter->css, &it); 1574 while ((task = css_task_iter_next(&it))) { 1575 switch (oom_scan_process_thread(task, totalpages, NULL, 1576 false)) { 1577 case OOM_SCAN_SELECT: 1578 if (chosen) 1579 put_task_struct(chosen); 1580 chosen = task; 1581 chosen_points = ULONG_MAX; 1582 get_task_struct(chosen); 1583 /* fall through */ 1584 case OOM_SCAN_CONTINUE: 1585 continue; 1586 case OOM_SCAN_ABORT: 1587 css_task_iter_end(&it); 1588 mem_cgroup_iter_break(memcg, iter); 1589 if (chosen) 1590 put_task_struct(chosen); 1591 return; 1592 case OOM_SCAN_OK: 1593 break; 1594 }; 1595 points = oom_badness(task, memcg, NULL, totalpages); 1596 if (!points || points < chosen_points) 1597 continue; 1598 /* Prefer thread group leaders for display purposes */ 1599 if (points == chosen_points && 1600 thread_group_leader(chosen)) 1601 continue; 1602 1603 if (chosen) 1604 put_task_struct(chosen); 1605 chosen = task; 1606 chosen_points = points; 1607 get_task_struct(chosen); 1608 } 1609 css_task_iter_end(&it); 1610 } 1611 1612 if (!chosen) 1613 return; 1614 points = chosen_points * 1000 / totalpages; 1615 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1616 NULL, "Memory cgroup out of memory"); 1617 } 1618 1619 #if MAX_NUMNODES > 1 1620 1621 /** 1622 * test_mem_cgroup_node_reclaimable 1623 * @memcg: the target memcg 1624 * @nid: the node ID to be checked. 1625 * @noswap : specify true here if the user wants flle only information. 1626 * 1627 * This function returns whether the specified memcg contains any 1628 * reclaimable pages on a node. Returns true if there are any reclaimable 1629 * pages in the node. 1630 */ 1631 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1632 int nid, bool noswap) 1633 { 1634 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1635 return true; 1636 if (noswap || !total_swap_pages) 1637 return false; 1638 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1639 return true; 1640 return false; 1641 1642 } 1643 1644 /* 1645 * Always updating the nodemask is not very good - even if we have an empty 1646 * list or the wrong list here, we can start from some node and traverse all 1647 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1648 * 1649 */ 1650 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1651 { 1652 int nid; 1653 /* 1654 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1655 * pagein/pageout changes since the last update. 1656 */ 1657 if (!atomic_read(&memcg->numainfo_events)) 1658 return; 1659 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1660 return; 1661 1662 /* make a nodemask where this memcg uses memory from */ 1663 memcg->scan_nodes = node_states[N_MEMORY]; 1664 1665 for_each_node_mask(nid, node_states[N_MEMORY]) { 1666 1667 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1668 node_clear(nid, memcg->scan_nodes); 1669 } 1670 1671 atomic_set(&memcg->numainfo_events, 0); 1672 atomic_set(&memcg->numainfo_updating, 0); 1673 } 1674 1675 /* 1676 * Selecting a node where we start reclaim from. Because what we need is just 1677 * reducing usage counter, start from anywhere is O,K. Considering 1678 * memory reclaim from current node, there are pros. and cons. 1679 * 1680 * Freeing memory from current node means freeing memory from a node which 1681 * we'll use or we've used. So, it may make LRU bad. 
And if several threads 1682 * hit limits, it will see a contention on a node. But freeing from remote 1683 * node means more costs for memory reclaim because of memory latency. 1684 * 1685 * Now, we use round-robin. Better algorithm is welcomed. 1686 */ 1687 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1688 { 1689 int node; 1690 1691 mem_cgroup_may_update_nodemask(memcg); 1692 node = memcg->last_scanned_node; 1693 1694 node = next_node(node, memcg->scan_nodes); 1695 if (node == MAX_NUMNODES) 1696 node = first_node(memcg->scan_nodes); 1697 /* 1698 * We call this when we hit limit, not when pages are added to LRU. 1699 * No LRU may hold pages because all pages are UNEVICTABLE or 1700 * memcg is too small and all pages are not on LRU. In that case, 1701 * we use curret node. 1702 */ 1703 if (unlikely(node == MAX_NUMNODES)) 1704 node = numa_node_id(); 1705 1706 memcg->last_scanned_node = node; 1707 return node; 1708 } 1709 #else 1710 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1711 { 1712 return 0; 1713 } 1714 #endif 1715 1716 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1717 struct zone *zone, 1718 gfp_t gfp_mask, 1719 unsigned long *total_scanned) 1720 { 1721 struct mem_cgroup *victim = NULL; 1722 int total = 0; 1723 int loop = 0; 1724 unsigned long excess; 1725 unsigned long nr_scanned; 1726 struct mem_cgroup_reclaim_cookie reclaim = { 1727 .zone = zone, 1728 .priority = 0, 1729 }; 1730 1731 excess = soft_limit_excess(root_memcg); 1732 1733 while (1) { 1734 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1735 if (!victim) { 1736 loop++; 1737 if (loop >= 2) { 1738 /* 1739 * If we have not been able to reclaim 1740 * anything, it might because there are 1741 * no reclaimable pages under this hierarchy 1742 */ 1743 if (!total) 1744 break; 1745 /* 1746 * We want to do more targeted reclaim. 1747 * excess >> 2 is not to excessive so as to 1748 * reclaim too much, nor too less that we keep 1749 * coming back to reclaim from this cgroup 1750 */ 1751 if (total >= (excess >> 2) || 1752 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1753 break; 1754 } 1755 continue; 1756 } 1757 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1758 zone, &nr_scanned); 1759 *total_scanned += nr_scanned; 1760 if (!soft_limit_excess(root_memcg)) 1761 break; 1762 } 1763 mem_cgroup_iter_break(root_memcg, victim); 1764 return total; 1765 } 1766 1767 #ifdef CONFIG_LOCKDEP 1768 static struct lockdep_map memcg_oom_lock_dep_map = { 1769 .name = "memcg_oom_lock", 1770 }; 1771 #endif 1772 1773 static DEFINE_SPINLOCK(memcg_oom_lock); 1774 1775 /* 1776 * Check OOM-Killer is already running under our hierarchy. 1777 * If someone is running, return false. 1778 */ 1779 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1780 { 1781 struct mem_cgroup *iter, *failed = NULL; 1782 1783 spin_lock(&memcg_oom_lock); 1784 1785 for_each_mem_cgroup_tree(iter, memcg) { 1786 if (iter->oom_lock) { 1787 /* 1788 * this subtree of our hierarchy is already locked 1789 * so we cannot give a lock. 
1790 */ 1791 failed = iter; 1792 mem_cgroup_iter_break(memcg, iter); 1793 break; 1794 } else 1795 iter->oom_lock = true; 1796 } 1797 1798 if (failed) { 1799 /* 1800 * OK, we failed to lock the whole subtree so we have 1801 * to clean up what we set up to the failing subtree 1802 */ 1803 for_each_mem_cgroup_tree(iter, memcg) { 1804 if (iter == failed) { 1805 mem_cgroup_iter_break(memcg, iter); 1806 break; 1807 } 1808 iter->oom_lock = false; 1809 } 1810 } else 1811 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1812 1813 spin_unlock(&memcg_oom_lock); 1814 1815 return !failed; 1816 } 1817 1818 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1819 { 1820 struct mem_cgroup *iter; 1821 1822 spin_lock(&memcg_oom_lock); 1823 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1824 for_each_mem_cgroup_tree(iter, memcg) 1825 iter->oom_lock = false; 1826 spin_unlock(&memcg_oom_lock); 1827 } 1828 1829 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1830 { 1831 struct mem_cgroup *iter; 1832 1833 for_each_mem_cgroup_tree(iter, memcg) 1834 atomic_inc(&iter->under_oom); 1835 } 1836 1837 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1838 { 1839 struct mem_cgroup *iter; 1840 1841 /* 1842 * When a new child is created while the hierarchy is under oom, 1843 * mem_cgroup_oom_lock() may not be called. We have to use 1844 * atomic_add_unless() here. 1845 */ 1846 for_each_mem_cgroup_tree(iter, memcg) 1847 atomic_add_unless(&iter->under_oom, -1, 0); 1848 } 1849 1850 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1851 1852 struct oom_wait_info { 1853 struct mem_cgroup *memcg; 1854 wait_queue_t wait; 1855 }; 1856 1857 static int memcg_oom_wake_function(wait_queue_t *wait, 1858 unsigned mode, int sync, void *arg) 1859 { 1860 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1861 struct mem_cgroup *oom_wait_memcg; 1862 struct oom_wait_info *oom_wait_info; 1863 1864 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1865 oom_wait_memcg = oom_wait_info->memcg; 1866 1867 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1868 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1869 return 0; 1870 return autoremove_wake_function(wait, mode, sync, arg); 1871 } 1872 1873 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 1874 { 1875 atomic_inc(&memcg->oom_wakeups); 1876 /* for filtering, pass "memcg" as argument. */ 1877 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1878 } 1879 1880 static void memcg_oom_recover(struct mem_cgroup *memcg) 1881 { 1882 if (memcg && atomic_read(&memcg->under_oom)) 1883 memcg_wakeup_oom(memcg); 1884 } 1885 1886 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1887 { 1888 if (!current->memcg_oom.may_oom) 1889 return; 1890 /* 1891 * We are in the middle of the charge context here, so we 1892 * don't want to block when potentially sitting on a callstack 1893 * that holds all kinds of filesystem and mm locks. 1894 * 1895 * Also, the caller may handle a failed allocation gracefully 1896 * (like optional page cache readahead) and so an OOM killer 1897 * invocation might not even be necessary. 1898 * 1899 * That's why we don't do anything here except remember the 1900 * OOM context and then deal with it at the end of the page 1901 * fault when the stack is unwound, the locks are released, 1902 * and when we know whether the fault was overall successful. 
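	 *
	 * A hedged sketch of the intended call pattern (the exact hooks live
	 * in the generic and arch page fault paths, not in this function):
	 *
	 *   handle_mm_fault()
	 *     mem_cgroup_oom_enable()            allows mem_cgroup_oom()
	 *     charge fails                       OOM state recorded below
	 *     mem_cgroup_oom_disable()
	 *   fault returns VM_FAULT_OOM
	 *     pagefault_out_of_memory()
	 *       mem_cgroup_oom_synchronize(true)  kill or wait happens here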
1903 */ 1904 css_get(&memcg->css); 1905 current->memcg_oom.memcg = memcg; 1906 current->memcg_oom.gfp_mask = mask; 1907 current->memcg_oom.order = order; 1908 } 1909 1910 /** 1911 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1912 * @handle: actually kill/wait or just clean up the OOM state 1913 * 1914 * This has to be called at the end of a page fault if the memcg OOM 1915 * handler was enabled. 1916 * 1917 * Memcg supports userspace OOM handling where failed allocations must 1918 * sleep on a waitqueue until the userspace task resolves the 1919 * situation. Sleeping directly in the charge context with all kinds 1920 * of locks held is not a good idea, instead we remember an OOM state 1921 * in the task and mem_cgroup_oom_synchronize() has to be called at 1922 * the end of the page fault to complete the OOM handling. 1923 * 1924 * Returns %true if an ongoing memcg OOM situation was detected and 1925 * completed, %false otherwise. 1926 */ 1927 bool mem_cgroup_oom_synchronize(bool handle) 1928 { 1929 struct mem_cgroup *memcg = current->memcg_oom.memcg; 1930 struct oom_wait_info owait; 1931 bool locked; 1932 1933 /* OOM is global, do not handle */ 1934 if (!memcg) 1935 return false; 1936 1937 if (!handle) 1938 goto cleanup; 1939 1940 owait.memcg = memcg; 1941 owait.wait.flags = 0; 1942 owait.wait.func = memcg_oom_wake_function; 1943 owait.wait.private = current; 1944 INIT_LIST_HEAD(&owait.wait.task_list); 1945 1946 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1947 mem_cgroup_mark_under_oom(memcg); 1948 1949 locked = mem_cgroup_oom_trylock(memcg); 1950 1951 if (locked) 1952 mem_cgroup_oom_notify(memcg); 1953 1954 if (locked && !memcg->oom_kill_disable) { 1955 mem_cgroup_unmark_under_oom(memcg); 1956 finish_wait(&memcg_oom_waitq, &owait.wait); 1957 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 1958 current->memcg_oom.order); 1959 } else { 1960 schedule(); 1961 mem_cgroup_unmark_under_oom(memcg); 1962 finish_wait(&memcg_oom_waitq, &owait.wait); 1963 } 1964 1965 if (locked) { 1966 mem_cgroup_oom_unlock(memcg); 1967 /* 1968 * There is no guarantee that an OOM-lock contender 1969 * sees the wakeups triggered by the OOM kill 1970 * uncharges. Wake any sleepers explicitely. 1971 */ 1972 memcg_oom_recover(memcg); 1973 } 1974 cleanup: 1975 current->memcg_oom.memcg = NULL; 1976 css_put(&memcg->css); 1977 return true; 1978 } 1979 1980 /** 1981 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1982 * @page: page that is going to change accounted state 1983 * @locked: &memcg->move_lock slowpath was taken 1984 * @flags: IRQ-state flags for &memcg->move_lock 1985 * 1986 * This function must mark the beginning of an accounted page state 1987 * change to prevent double accounting when the page is concurrently 1988 * being moved to another memcg: 1989 * 1990 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1991 * if (TestClearPageState(page)) 1992 * mem_cgroup_update_page_stat(memcg, state, -1); 1993 * mem_cgroup_end_page_stat(memcg, locked, flags); 1994 * 1995 * The RCU lock is held throughout the transaction. The fast path can 1996 * get away without acquiring the memcg->move_lock (@locked is false) 1997 * because page moving starts with an RCU grace period. 1998 * 1999 * The RCU lock also protects the memcg from being freed when the page 2000 * state that is going to change is the only thing preventing the page 2001 * from being uncharged. E.g. 
end-writeback clearing PageWriteback(), 2002 * which allows migration to go ahead and uncharge the page before the 2003 * account transaction might be complete. 2004 */ 2005 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 2006 bool *locked, 2007 unsigned long *flags) 2008 { 2009 struct mem_cgroup *memcg; 2010 2011 rcu_read_lock(); 2012 2013 if (mem_cgroup_disabled()) 2014 return NULL; 2015 again: 2016 memcg = page->mem_cgroup; 2017 if (unlikely(!memcg)) 2018 return NULL; 2019 2020 *locked = false; 2021 if (atomic_read(&memcg->moving_account) <= 0) 2022 return memcg; 2023 2024 spin_lock_irqsave(&memcg->move_lock, *flags); 2025 if (memcg != page->mem_cgroup) { 2026 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2027 goto again; 2028 } 2029 *locked = true; 2030 2031 return memcg; 2032 } 2033 2034 /** 2035 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2036 * @memcg: the memcg that was accounted against 2037 * @locked: value received from mem_cgroup_begin_page_stat() 2038 * @flags: value received from mem_cgroup_begin_page_stat() 2039 */ 2040 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2041 unsigned long *flags) 2042 { 2043 if (memcg && *locked) 2044 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2045 2046 rcu_read_unlock(); 2047 } 2048 2049 /** 2050 * mem_cgroup_update_page_stat - update page state statistics 2051 * @memcg: memcg to account against 2052 * @idx: page state item to account 2053 * @val: number of pages (positive or negative) 2054 * 2055 * See mem_cgroup_begin_page_stat() for locking requirements. 2056 */ 2057 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 2058 enum mem_cgroup_stat_index idx, int val) 2059 { 2060 VM_BUG_ON(!rcu_read_lock_held()); 2061 2062 if (memcg) 2063 this_cpu_add(memcg->stat->count[idx], val); 2064 } 2065 2066 /* 2067 * size of first charge trial. "32" comes from vmscan.c's magic value. 2068 * TODO: maybe necessary to use big numbers in big irons. 2069 */ 2070 #define CHARGE_BATCH 32U 2071 struct memcg_stock_pcp { 2072 struct mem_cgroup *cached; /* this never be root cgroup */ 2073 unsigned int nr_pages; 2074 struct work_struct work; 2075 unsigned long flags; 2076 #define FLUSHING_CACHED_CHARGE 0 2077 }; 2078 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2079 static DEFINE_MUTEX(percpu_charge_mutex); 2080 2081 /** 2082 * consume_stock: Try to consume stocked charge on this cpu. 2083 * @memcg: memcg to consume from. 2084 * @nr_pages: how many pages to charge. 2085 * 2086 * The charges will only happen if @memcg matches the current cpu's memcg 2087 * stock, and at least @nr_pages are available in that stock. Failure to 2088 * service an allocation will refill the stock. 2089 * 2090 * returns true if successful, false otherwise. 2091 */ 2092 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2093 { 2094 struct memcg_stock_pcp *stock; 2095 bool ret = false; 2096 2097 if (nr_pages > CHARGE_BATCH) 2098 return ret; 2099 2100 stock = &get_cpu_var(memcg_stock); 2101 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2102 stock->nr_pages -= nr_pages; 2103 ret = true; 2104 } 2105 put_cpu_var(memcg_stock); 2106 return ret; 2107 } 2108 2109 /* 2110 * Returns stocks cached in percpu and reset cached information. 
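 *
 * A hedged sketch of the stock life cycle (all helpers named here are
 * defined in this file): try_charge() charges in batches of
 * CHARGE_BATCH and parks the surplus on this CPU, later charges of the
 * same memcg are served from the stock, and drain_stock() returns
 * whatever is left to the page counters:
 *
 *	refill_stock(memcg, batch - nr_pages);
 *	...
 *	if (consume_stock(memcg, nr_pages))	-- no page_counter access
 *		goto done;
 *	...
 *	drain_stock(this_cpu_ptr(&memcg_stock));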
2111 */
2112 static void drain_stock(struct memcg_stock_pcp *stock)
2113 {
2114 	struct mem_cgroup *old = stock->cached;
2115 
2116 	if (stock->nr_pages) {
2117 		page_counter_uncharge(&old->memory, stock->nr_pages);
2118 		if (do_swap_account)
2119 			page_counter_uncharge(&old->memsw, stock->nr_pages);
2120 		css_put_many(&old->css, stock->nr_pages);
2121 		stock->nr_pages = 0;
2122 	}
2123 	stock->cached = NULL;
2124 }
2125 
2126 /*
2127  * This must be called with preemption disabled, or from a thread that
2128  * is pinned to the local cpu.
2129  */
2130 static void drain_local_stock(struct work_struct *dummy)
2131 {
2132 	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2133 	drain_stock(stock);
2134 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2135 }
2136 
2137 static void __init memcg_stock_init(void)
2138 {
2139 	int cpu;
2140 
2141 	for_each_possible_cpu(cpu) {
2142 		struct memcg_stock_pcp *stock =
2143 					&per_cpu(memcg_stock, cpu);
2144 		INIT_WORK(&stock->work, drain_local_stock);
2145 	}
2146 }
2147 
2148 /*
2149  * Cache charges (@nr_pages) in the local per-cpu area.
2150  * They will be consumed by consume_stock() later.
2151  */
2152 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2153 {
2154 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2155 
2156 	if (stock->cached != memcg) { /* reset if necessary */
2157 		drain_stock(stock);
2158 		stock->cached = memcg;
2159 	}
2160 	stock->nr_pages += nr_pages;
2161 	put_cpu_var(memcg_stock);
2162 }
2163 
2164 /*
2165  * Drain all per-CPU charge caches for the given root_memcg, i.e. the
2166  * whole hierarchy subtree under it.
2167  */
2168 static void drain_all_stock(struct mem_cgroup *root_memcg)
2169 {
2170 	int cpu, curcpu;
2171 
2172 	/* If someone's already draining, avoid running more workers. */
2173 	if (!mutex_trylock(&percpu_charge_mutex))
2174 		return;
2175 	/* Notify other cpus that system-wide "drain" is running */
2176 	get_online_cpus();
2177 	curcpu = get_cpu();
2178 	for_each_online_cpu(cpu) {
2179 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2180 		struct mem_cgroup *memcg;
2181 
2182 		memcg = stock->cached;
2183 		if (!memcg || !stock->nr_pages)
2184 			continue;
2185 		if (!mem_cgroup_is_descendant(memcg, root_memcg))
2186 			continue;
2187 		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2188 			if (cpu == curcpu)
2189 				drain_local_stock(&stock->work);
2190 			else
2191 				schedule_work_on(cpu, &stock->work);
2192 		}
2193 	}
2194 	put_cpu();
2195 	put_online_cpus();
2196 	mutex_unlock(&percpu_charge_mutex);
2197 }
2198 
2199 /*
2200  * This function drains the percpu counter values from a DEAD cpu and
2201  * moves them to the local cpu. Note that this function can be preempted.
2202 */ 2203 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2204 { 2205 int i; 2206 2207 spin_lock(&memcg->pcp_counter_lock); 2208 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2209 long x = per_cpu(memcg->stat->count[i], cpu); 2210 2211 per_cpu(memcg->stat->count[i], cpu) = 0; 2212 memcg->nocpu_base.count[i] += x; 2213 } 2214 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2215 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2216 2217 per_cpu(memcg->stat->events[i], cpu) = 0; 2218 memcg->nocpu_base.events[i] += x; 2219 } 2220 spin_unlock(&memcg->pcp_counter_lock); 2221 } 2222 2223 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2224 unsigned long action, 2225 void *hcpu) 2226 { 2227 int cpu = (unsigned long)hcpu; 2228 struct memcg_stock_pcp *stock; 2229 struct mem_cgroup *iter; 2230 2231 if (action == CPU_ONLINE) 2232 return NOTIFY_OK; 2233 2234 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2235 return NOTIFY_OK; 2236 2237 for_each_mem_cgroup(iter) 2238 mem_cgroup_drain_pcp_counter(iter, cpu); 2239 2240 stock = &per_cpu(memcg_stock, cpu); 2241 drain_stock(stock); 2242 return NOTIFY_OK; 2243 } 2244 2245 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2246 unsigned int nr_pages) 2247 { 2248 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2249 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2250 struct mem_cgroup *mem_over_limit; 2251 struct page_counter *counter; 2252 unsigned long nr_reclaimed; 2253 bool may_swap = true; 2254 bool drained = false; 2255 int ret = 0; 2256 2257 if (mem_cgroup_is_root(memcg)) 2258 goto done; 2259 retry: 2260 if (consume_stock(memcg, nr_pages)) 2261 goto done; 2262 2263 if (!do_swap_account || 2264 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2265 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2266 goto done_restock; 2267 if (do_swap_account) 2268 page_counter_uncharge(&memcg->memsw, batch); 2269 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2270 } else { 2271 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2272 may_swap = false; 2273 } 2274 2275 if (batch > nr_pages) { 2276 batch = nr_pages; 2277 goto retry; 2278 } 2279 2280 /* 2281 * Unlike in global OOM situations, memcg is not in a physical 2282 * memory shortage. Allow dying and OOM-killed tasks to 2283 * bypass the last charges so that they can exit quickly and 2284 * free their memory. 2285 */ 2286 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2287 fatal_signal_pending(current) || 2288 current->flags & PF_EXITING)) 2289 goto bypass; 2290 2291 if (unlikely(task_in_memcg_oom(current))) 2292 goto nomem; 2293 2294 if (!(gfp_mask & __GFP_WAIT)) 2295 goto nomem; 2296 2297 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2298 gfp_mask, may_swap); 2299 2300 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2301 goto retry; 2302 2303 if (!drained) { 2304 drain_all_stock(mem_over_limit); 2305 drained = true; 2306 goto retry; 2307 } 2308 2309 if (gfp_mask & __GFP_NORETRY) 2310 goto nomem; 2311 /* 2312 * Even though the limit is exceeded at this point, reclaim 2313 * may have been able to free some pages. Retry the charge 2314 * before killing the task. 2315 * 2316 * Only for regular pages, though: huge pages are rather 2317 * unlikely to succeed so close to the limit, and we fall back 2318 * to regular pages anyway in case of failure. 
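 *
 * A worked example (hedged): PAGE_ALLOC_COSTLY_ORDER is 3, so a charge
 * of up to 8 pages keeps being retried here as long as the last
 * reclaim pass freed anything at all, while a THP-sized charge
 * (e.g. 512 pages with 2MB huge pages) is not retried on this basis
 * and, if it keeps failing, is eventually failed so that the caller
 * can fall back to regular pages.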
2319 */ 2320 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2321 goto retry; 2322 /* 2323 * At task move, charge accounts can be doubly counted. So, it's 2324 * better to wait until the end of task_move if something is going on. 2325 */ 2326 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2327 goto retry; 2328 2329 if (nr_retries--) 2330 goto retry; 2331 2332 if (gfp_mask & __GFP_NOFAIL) 2333 goto bypass; 2334 2335 if (fatal_signal_pending(current)) 2336 goto bypass; 2337 2338 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2339 nomem: 2340 if (!(gfp_mask & __GFP_NOFAIL)) 2341 return -ENOMEM; 2342 bypass: 2343 return -EINTR; 2344 2345 done_restock: 2346 css_get_many(&memcg->css, batch); 2347 if (batch > nr_pages) 2348 refill_stock(memcg, batch - nr_pages); 2349 done: 2350 return ret; 2351 } 2352 2353 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2354 { 2355 if (mem_cgroup_is_root(memcg)) 2356 return; 2357 2358 page_counter_uncharge(&memcg->memory, nr_pages); 2359 if (do_swap_account) 2360 page_counter_uncharge(&memcg->memsw, nr_pages); 2361 2362 css_put_many(&memcg->css, nr_pages); 2363 } 2364 2365 /* 2366 * A helper function to get mem_cgroup from ID. must be called under 2367 * rcu_read_lock(). The caller is responsible for calling 2368 * css_tryget_online() if the mem_cgroup is used for charging. (dropping 2369 * refcnt from swap can be called against removed memcg.) 2370 */ 2371 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2372 { 2373 /* ID 0 is unused ID */ 2374 if (!id) 2375 return NULL; 2376 return mem_cgroup_from_id(id); 2377 } 2378 2379 /* 2380 * try_get_mem_cgroup_from_page - look up page's memcg association 2381 * @page: the page 2382 * 2383 * Look up, get a css reference, and return the memcg that owns @page. 2384 * 2385 * The page must be locked to prevent racing with swap-in and page 2386 * cache charges. If coming from an unlocked page table, the caller 2387 * must ensure the page is on the LRU or this can race with charging. 
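 *
 * A hedged usage sketch -- the caller provides the page lock and later
 * drops the css reference this function takes:
 *
 *	lock_page(page);
 *	memcg = try_get_mem_cgroup_from_page(page);
 *	unlock_page(page);
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}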
2388 */ 2389 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2390 { 2391 struct mem_cgroup *memcg; 2392 unsigned short id; 2393 swp_entry_t ent; 2394 2395 VM_BUG_ON_PAGE(!PageLocked(page), page); 2396 2397 memcg = page->mem_cgroup; 2398 if (memcg) { 2399 if (!css_tryget_online(&memcg->css)) 2400 memcg = NULL; 2401 } else if (PageSwapCache(page)) { 2402 ent.val = page_private(page); 2403 id = lookup_swap_cgroup_id(ent); 2404 rcu_read_lock(); 2405 memcg = mem_cgroup_lookup(id); 2406 if (memcg && !css_tryget_online(&memcg->css)) 2407 memcg = NULL; 2408 rcu_read_unlock(); 2409 } 2410 return memcg; 2411 } 2412 2413 static void lock_page_lru(struct page *page, int *isolated) 2414 { 2415 struct zone *zone = page_zone(page); 2416 2417 spin_lock_irq(&zone->lru_lock); 2418 if (PageLRU(page)) { 2419 struct lruvec *lruvec; 2420 2421 lruvec = mem_cgroup_page_lruvec(page, zone); 2422 ClearPageLRU(page); 2423 del_page_from_lru_list(page, lruvec, page_lru(page)); 2424 *isolated = 1; 2425 } else 2426 *isolated = 0; 2427 } 2428 2429 static void unlock_page_lru(struct page *page, int isolated) 2430 { 2431 struct zone *zone = page_zone(page); 2432 2433 if (isolated) { 2434 struct lruvec *lruvec; 2435 2436 lruvec = mem_cgroup_page_lruvec(page, zone); 2437 VM_BUG_ON_PAGE(PageLRU(page), page); 2438 SetPageLRU(page); 2439 add_page_to_lru_list(page, lruvec, page_lru(page)); 2440 } 2441 spin_unlock_irq(&zone->lru_lock); 2442 } 2443 2444 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2445 bool lrucare) 2446 { 2447 int isolated; 2448 2449 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2450 2451 /* 2452 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2453 * may already be on some other mem_cgroup's LRU. Take care of it. 2454 */ 2455 if (lrucare) 2456 lock_page_lru(page, &isolated); 2457 2458 /* 2459 * Nobody should be changing or seriously looking at 2460 * page->mem_cgroup at this point: 2461 * 2462 * - the page is uncharged 2463 * 2464 * - the page is off-LRU 2465 * 2466 * - an anonymous fault has exclusive page access, except for 2467 * a locked page table 2468 * 2469 * - a page cache insertion, a swapin fault, or a migration 2470 * have the page locked 2471 */ 2472 page->mem_cgroup = memcg; 2473 2474 if (lrucare) 2475 unlock_page_lru(page, isolated); 2476 } 2477 2478 #ifdef CONFIG_MEMCG_KMEM 2479 /* 2480 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2481 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2482 */ 2483 static DEFINE_MUTEX(memcg_slab_mutex); 2484 2485 /* 2486 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2487 * in the memcg_cache_params struct. 2488 */ 2489 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2490 { 2491 struct kmem_cache *cachep; 2492 2493 VM_BUG_ON(p->is_root_cache); 2494 cachep = p->root_cache; 2495 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2496 } 2497 2498 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2499 unsigned long nr_pages) 2500 { 2501 struct page_counter *counter; 2502 int ret = 0; 2503 2504 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2505 if (ret < 0) 2506 return ret; 2507 2508 ret = try_charge(memcg, gfp, nr_pages); 2509 if (ret == -EINTR) { 2510 /* 2511 * try_charge() chose to bypass to root due to OOM kill or 2512 * fatal signal. 
Since our only options are to either fail 2513 * the allocation or charge it to this cgroup, do it as a 2514 * temporary condition. But we can't fail. From a kmem/slab 2515 * perspective, the cache has already been selected, by 2516 * mem_cgroup_kmem_get_cache(), so it is too late to change 2517 * our minds. 2518 * 2519 * This condition will only trigger if the task entered 2520 * memcg_charge_kmem in a sane state, but was OOM-killed 2521 * during try_charge() above. Tasks that were already dying 2522 * when the allocation triggers should have been already 2523 * directed to the root cgroup in memcontrol.h 2524 */ 2525 page_counter_charge(&memcg->memory, nr_pages); 2526 if (do_swap_account) 2527 page_counter_charge(&memcg->memsw, nr_pages); 2528 css_get_many(&memcg->css, nr_pages); 2529 ret = 0; 2530 } else if (ret) 2531 page_counter_uncharge(&memcg->kmem, nr_pages); 2532 2533 return ret; 2534 } 2535 2536 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2537 unsigned long nr_pages) 2538 { 2539 page_counter_uncharge(&memcg->memory, nr_pages); 2540 if (do_swap_account) 2541 page_counter_uncharge(&memcg->memsw, nr_pages); 2542 2543 page_counter_uncharge(&memcg->kmem, nr_pages); 2544 2545 css_put_many(&memcg->css, nr_pages); 2546 } 2547 2548 /* 2549 * helper for acessing a memcg's index. It will be used as an index in the 2550 * child cache array in kmem_cache, and also to derive its name. This function 2551 * will return -1 when this is not a kmem-limited memcg. 2552 */ 2553 int memcg_cache_id(struct mem_cgroup *memcg) 2554 { 2555 return memcg ? memcg->kmemcg_id : -1; 2556 } 2557 2558 static int memcg_alloc_cache_id(void) 2559 { 2560 int id, size; 2561 int err; 2562 2563 id = ida_simple_get(&kmem_limited_groups, 2564 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2565 if (id < 0) 2566 return id; 2567 2568 if (id < memcg_limited_groups_array_size) 2569 return id; 2570 2571 /* 2572 * There's no space for the new id in memcg_caches arrays, 2573 * so we have to grow them. 2574 */ 2575 2576 size = 2 * (id + 1); 2577 if (size < MEMCG_CACHES_MIN_SIZE) 2578 size = MEMCG_CACHES_MIN_SIZE; 2579 else if (size > MEMCG_CACHES_MAX_SIZE) 2580 size = MEMCG_CACHES_MAX_SIZE; 2581 2582 mutex_lock(&memcg_slab_mutex); 2583 err = memcg_update_all_caches(size); 2584 mutex_unlock(&memcg_slab_mutex); 2585 2586 if (err) { 2587 ida_simple_remove(&kmem_limited_groups, id); 2588 return err; 2589 } 2590 return id; 2591 } 2592 2593 static void memcg_free_cache_id(int id) 2594 { 2595 ida_simple_remove(&kmem_limited_groups, id); 2596 } 2597 2598 /* 2599 * We should update the current array size iff all caches updates succeed. This 2600 * can only be done from the slab side. The slab mutex needs to be held when 2601 * calling this. 2602 */ 2603 void memcg_update_array_size(int num) 2604 { 2605 memcg_limited_groups_array_size = num; 2606 } 2607 2608 static void memcg_register_cache(struct mem_cgroup *memcg, 2609 struct kmem_cache *root_cache) 2610 { 2611 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 2612 memcg_slab_mutex */ 2613 struct kmem_cache *cachep; 2614 int id; 2615 2616 lockdep_assert_held(&memcg_slab_mutex); 2617 2618 id = memcg_cache_id(memcg); 2619 2620 /* 2621 * Since per-memcg caches are created asynchronously on first 2622 * allocation (see memcg_kmem_get_cache()), several threads can try to 2623 * create the same cache, but only one of them may succeed. 
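 *
 * For reference, a hedged sketch of the allocation-side lookup that
 * this creation path feeds (see __memcg_kmem_get_cache() below); a
 * NULL result there is what schedules the asynchronous creation:
 *
 *	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
 *	if (memcg_cachep)
 *		return memcg_cachep;	-- per-memcg copy already exists
 *	memcg_schedule_register_cache(memcg, cachep);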
2624 */ 2625 if (cache_from_memcg_idx(root_cache, id)) 2626 return; 2627 2628 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 2629 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 2630 /* 2631 * If we could not create a memcg cache, do not complain, because 2632 * that's not critical at all as we can always proceed with the root 2633 * cache. 2634 */ 2635 if (!cachep) 2636 return; 2637 2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2639 2640 /* 2641 * Since readers won't lock (see cache_from_memcg_idx()), we need a 2642 * barrier here to ensure nobody will see the kmem_cache partially 2643 * initialized. 2644 */ 2645 smp_wmb(); 2646 2647 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 2648 root_cache->memcg_params->memcg_caches[id] = cachep; 2649 } 2650 2651 static void memcg_unregister_cache(struct kmem_cache *cachep) 2652 { 2653 struct kmem_cache *root_cache; 2654 struct mem_cgroup *memcg; 2655 int id; 2656 2657 lockdep_assert_held(&memcg_slab_mutex); 2658 2659 BUG_ON(is_root_cache(cachep)); 2660 2661 root_cache = cachep->memcg_params->root_cache; 2662 memcg = cachep->memcg_params->memcg; 2663 id = memcg_cache_id(memcg); 2664 2665 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 2666 root_cache->memcg_params->memcg_caches[id] = NULL; 2667 2668 list_del(&cachep->memcg_params->list); 2669 2670 kmem_cache_destroy(cachep); 2671 } 2672 2673 int __memcg_cleanup_cache_params(struct kmem_cache *s) 2674 { 2675 struct kmem_cache *c; 2676 int i, failed = 0; 2677 2678 mutex_lock(&memcg_slab_mutex); 2679 for_each_memcg_cache_index(i) { 2680 c = cache_from_memcg_idx(s, i); 2681 if (!c) 2682 continue; 2683 2684 memcg_unregister_cache(c); 2685 2686 if (cache_from_memcg_idx(s, i)) 2687 failed++; 2688 } 2689 mutex_unlock(&memcg_slab_mutex); 2690 return failed; 2691 } 2692 2693 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2694 { 2695 struct kmem_cache *cachep; 2696 struct memcg_cache_params *params, *tmp; 2697 2698 if (!memcg_kmem_is_active(memcg)) 2699 return; 2700 2701 mutex_lock(&memcg_slab_mutex); 2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 2703 cachep = memcg_params_to_cache(params); 2704 memcg_unregister_cache(cachep); 2705 } 2706 mutex_unlock(&memcg_slab_mutex); 2707 } 2708 2709 struct memcg_register_cache_work { 2710 struct mem_cgroup *memcg; 2711 struct kmem_cache *cachep; 2712 struct work_struct work; 2713 }; 2714 2715 static void memcg_register_cache_func(struct work_struct *w) 2716 { 2717 struct memcg_register_cache_work *cw = 2718 container_of(w, struct memcg_register_cache_work, work); 2719 struct mem_cgroup *memcg = cw->memcg; 2720 struct kmem_cache *cachep = cw->cachep; 2721 2722 mutex_lock(&memcg_slab_mutex); 2723 memcg_register_cache(memcg, cachep); 2724 mutex_unlock(&memcg_slab_mutex); 2725 2726 css_put(&memcg->css); 2727 kfree(cw); 2728 } 2729 2730 /* 2731 * Enqueue the creation of a per-memcg kmem_cache. 
2732 */ 2733 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 2734 struct kmem_cache *cachep) 2735 { 2736 struct memcg_register_cache_work *cw; 2737 2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2739 if (!cw) 2740 return; 2741 2742 css_get(&memcg->css); 2743 2744 cw->memcg = memcg; 2745 cw->cachep = cachep; 2746 2747 INIT_WORK(&cw->work, memcg_register_cache_func); 2748 schedule_work(&cw->work); 2749 } 2750 2751 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 2752 struct kmem_cache *cachep) 2753 { 2754 /* 2755 * We need to stop accounting when we kmalloc, because if the 2756 * corresponding kmalloc cache is not yet created, the first allocation 2757 * in __memcg_schedule_register_cache will recurse. 2758 * 2759 * However, it is better to enclose the whole function. Depending on 2760 * the debugging options enabled, INIT_WORK(), for instance, can 2761 * trigger an allocation. This too, will make us recurse. Because at 2762 * this point we can't allow ourselves back into memcg_kmem_get_cache, 2763 * the safest choice is to do it like this, wrapping the whole function. 2764 */ 2765 current->memcg_kmem_skip_account = 1; 2766 __memcg_schedule_register_cache(memcg, cachep); 2767 current->memcg_kmem_skip_account = 0; 2768 } 2769 2770 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2771 { 2772 unsigned int nr_pages = 1 << order; 2773 2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 2775 } 2776 2777 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2778 { 2779 unsigned int nr_pages = 1 << order; 2780 2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 2782 } 2783 2784 /* 2785 * Return the kmem_cache we're supposed to use for a slab allocation. 2786 * We try to use the current memcg's version of the cache. 2787 * 2788 * If the cache does not exist yet, if we are the first user of it, 2789 * we either create it immediately, if possible, or create it asynchronously 2790 * in a workqueue. 2791 * In the latter case, we will let the current allocation go through with 2792 * the original cache. 2793 * 2794 * Can't be called in interrupt context or from kernel threads. 2795 * This function needs to be called with rcu_read_lock() held. 2796 */ 2797 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) 2798 { 2799 struct mem_cgroup *memcg; 2800 struct kmem_cache *memcg_cachep; 2801 2802 VM_BUG_ON(!cachep->memcg_params); 2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 2804 2805 if (current->memcg_kmem_skip_account) 2806 return cachep; 2807 2808 memcg = get_mem_cgroup_from_mm(current->mm); 2809 if (!memcg_kmem_is_active(memcg)) 2810 goto out; 2811 2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2813 if (likely(memcg_cachep)) 2814 return memcg_cachep; 2815 2816 /* 2817 * If we are in a safe context (can wait, and not in interrupt 2818 * context), we could be be predictable and return right away. 2819 * This would guarantee that the allocation being performed 2820 * already belongs in the new cache. 2821 * 2822 * However, there are some clashes that can arrive from locking. 2823 * For instance, because we acquire the slab_mutex while doing 2824 * memcg_create_kmem_cache, this means no further allocation 2825 * could happen with the slab_mutex held. So it's better to 2826 * defer everything. 
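 *
 * A hedged sketch of the slab-side usage; the memcg_kmem_get_cache()
 * and memcg_kmem_put_cache() wrapper names are assumed to live in
 * memcontrol.h:
 *
 *	cachep = memcg_kmem_get_cache(cachep, gfp);
 *	... allocate the object from cachep as usual ...
 *	memcg_kmem_put_cache(cachep);	-- drops the css reference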
2827 */ 2828 memcg_schedule_register_cache(memcg, cachep); 2829 out: 2830 css_put(&memcg->css); 2831 return cachep; 2832 } 2833 2834 void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2835 { 2836 if (!is_root_cache(cachep)) 2837 css_put(&cachep->memcg_params->memcg->css); 2838 } 2839 2840 /* 2841 * We need to verify if the allocation against current->mm->owner's memcg is 2842 * possible for the given order. But the page is not allocated yet, so we'll 2843 * need a further commit step to do the final arrangements. 2844 * 2845 * It is possible for the task to switch cgroups in this mean time, so at 2846 * commit time, we can't rely on task conversion any longer. We'll then use 2847 * the handle argument to return to the caller which cgroup we should commit 2848 * against. We could also return the memcg directly and avoid the pointer 2849 * passing, but a boolean return value gives better semantics considering 2850 * the compiled-out case as well. 2851 * 2852 * Returning true means the allocation is possible. 2853 */ 2854 bool 2855 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 2856 { 2857 struct mem_cgroup *memcg; 2858 int ret; 2859 2860 *_memcg = NULL; 2861 2862 memcg = get_mem_cgroup_from_mm(current->mm); 2863 2864 if (!memcg_kmem_is_active(memcg)) { 2865 css_put(&memcg->css); 2866 return true; 2867 } 2868 2869 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 2870 if (!ret) 2871 *_memcg = memcg; 2872 2873 css_put(&memcg->css); 2874 return (ret == 0); 2875 } 2876 2877 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2878 int order) 2879 { 2880 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2881 2882 /* The page allocation failed. Revert */ 2883 if (!page) { 2884 memcg_uncharge_kmem(memcg, 1 << order); 2885 return; 2886 } 2887 page->mem_cgroup = memcg; 2888 } 2889 2890 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2891 { 2892 struct mem_cgroup *memcg = page->mem_cgroup; 2893 2894 if (!memcg) 2895 return; 2896 2897 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2898 2899 memcg_uncharge_kmem(memcg, 1 << order); 2900 page->mem_cgroup = NULL; 2901 } 2902 #endif /* CONFIG_MEMCG_KMEM */ 2903 2904 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2905 2906 /* 2907 * Because tail pages are not marked as "used", set it. We're under 2908 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2909 * charge/uncharge will be never happen and move_account() is done under 2910 * compound_lock(), so we don't have to take care of races. 2911 */ 2912 void mem_cgroup_split_huge_fixup(struct page *head) 2913 { 2914 int i; 2915 2916 if (mem_cgroup_disabled()) 2917 return; 2918 2919 for (i = 1; i < HPAGE_PMD_NR; i++) 2920 head[i].mem_cgroup = head->mem_cgroup; 2921 2922 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 2923 HPAGE_PMD_NR); 2924 } 2925 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2926 2927 /** 2928 * mem_cgroup_move_account - move account of the page 2929 * @page: the page 2930 * @nr_pages: number of regular pages (>1 for huge pages) 2931 * @from: mem_cgroup which the page is moved from. 2932 * @to: mem_cgroup which the page is moved to. @from != @to. 2933 * 2934 * The caller must confirm following. 2935 * - page is not on LRU (isolate_page() is useful.) 2936 * - compound_lock is held when nr_pages > 1 2937 * 2938 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 2939 * from old cgroup. 
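 *
 * A hedged caller sketch -- isolation from the LRU and the
 * charge/uncharge of the two cgroups remain the caller's job:
 *
 *	if (!isolate_lru_page(page)) {
 *		ret = mem_cgroup_move_account(page, 1, from, to);
 *		putback_lru_page(page);
 *	}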
2940 */ 2941 static int mem_cgroup_move_account(struct page *page, 2942 unsigned int nr_pages, 2943 struct mem_cgroup *from, 2944 struct mem_cgroup *to) 2945 { 2946 unsigned long flags; 2947 int ret; 2948 2949 VM_BUG_ON(from == to); 2950 VM_BUG_ON_PAGE(PageLRU(page), page); 2951 /* 2952 * The page is isolated from LRU. So, collapse function 2953 * will not handle this page. But page splitting can happen. 2954 * Do this check under compound_page_lock(). The caller should 2955 * hold it. 2956 */ 2957 ret = -EBUSY; 2958 if (nr_pages > 1 && !PageTransHuge(page)) 2959 goto out; 2960 2961 /* 2962 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 2963 * of its source page while we change it: page migration takes 2964 * both pages off the LRU, but page cache replacement doesn't. 2965 */ 2966 if (!trylock_page(page)) 2967 goto out; 2968 2969 ret = -EINVAL; 2970 if (page->mem_cgroup != from) 2971 goto out_unlock; 2972 2973 spin_lock_irqsave(&from->move_lock, flags); 2974 2975 if (!PageAnon(page) && page_mapped(page)) { 2976 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 2977 nr_pages); 2978 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 2979 nr_pages); 2980 } 2981 2982 if (PageWriteback(page)) { 2983 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 2984 nr_pages); 2985 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 2986 nr_pages); 2987 } 2988 2989 /* 2990 * It is safe to change page->mem_cgroup here because the page 2991 * is referenced, charged, and isolated - we can't race with 2992 * uncharging, charging, migration, or LRU putback. 2993 */ 2994 2995 /* caller should have done css_get */ 2996 page->mem_cgroup = to; 2997 spin_unlock_irqrestore(&from->move_lock, flags); 2998 2999 ret = 0; 3000 3001 local_irq_disable(); 3002 mem_cgroup_charge_statistics(to, page, nr_pages); 3003 memcg_check_events(to, page); 3004 mem_cgroup_charge_statistics(from, page, -nr_pages); 3005 memcg_check_events(from, page); 3006 local_irq_enable(); 3007 out_unlock: 3008 unlock_page(page); 3009 out: 3010 return ret; 3011 } 3012 3013 #ifdef CONFIG_MEMCG_SWAP 3014 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3015 bool charge) 3016 { 3017 int val = (charge) ? 1 : -1; 3018 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3019 } 3020 3021 /** 3022 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3023 * @entry: swap entry to be moved 3024 * @from: mem_cgroup which the entry is moved from 3025 * @to: mem_cgroup which the entry is moved to 3026 * 3027 * It succeeds only when the swap_cgroup's record for this entry is the same 3028 * as the mem_cgroup's id of @from. 3029 * 3030 * Returns 0 on success, -EINVAL on failure. 3031 * 3032 * The caller must have charged to @to, IOW, called page_counter_charge() about 3033 * both res and memsw, and called css_get(). 
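 *
 * A hedged sketch of that calling convention:
 *
 *	page_counter_charge(&to->memory, 1);
 *	if (do_swap_account)
 *		page_counter_charge(&to->memsw, 1);
 *	css_get(&to->css);
 *	if (mem_cgroup_move_swap_account(entry, from, to))
 *		-EINVAL: the record was not @from's, nothing was moved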
3034 */ 3035 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3036 struct mem_cgroup *from, struct mem_cgroup *to) 3037 { 3038 unsigned short old_id, new_id; 3039 3040 old_id = mem_cgroup_id(from); 3041 new_id = mem_cgroup_id(to); 3042 3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3044 mem_cgroup_swap_statistics(from, false); 3045 mem_cgroup_swap_statistics(to, true); 3046 return 0; 3047 } 3048 return -EINVAL; 3049 } 3050 #else 3051 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3052 struct mem_cgroup *from, struct mem_cgroup *to) 3053 { 3054 return -EINVAL; 3055 } 3056 #endif 3057 3058 static DEFINE_MUTEX(memcg_limit_mutex); 3059 3060 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3061 unsigned long limit) 3062 { 3063 unsigned long curusage; 3064 unsigned long oldusage; 3065 bool enlarge = false; 3066 int retry_count; 3067 int ret; 3068 3069 /* 3070 * For keeping hierarchical_reclaim simple, how long we should retry 3071 * is depends on callers. We set our retry-count to be function 3072 * of # of children which we should visit in this loop. 3073 */ 3074 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3075 mem_cgroup_count_children(memcg); 3076 3077 oldusage = page_counter_read(&memcg->memory); 3078 3079 do { 3080 if (signal_pending(current)) { 3081 ret = -EINTR; 3082 break; 3083 } 3084 3085 mutex_lock(&memcg_limit_mutex); 3086 if (limit > memcg->memsw.limit) { 3087 mutex_unlock(&memcg_limit_mutex); 3088 ret = -EINVAL; 3089 break; 3090 } 3091 if (limit > memcg->memory.limit) 3092 enlarge = true; 3093 ret = page_counter_limit(&memcg->memory, limit); 3094 mutex_unlock(&memcg_limit_mutex); 3095 3096 if (!ret) 3097 break; 3098 3099 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3100 3101 curusage = page_counter_read(&memcg->memory); 3102 /* Usage is reduced ? */ 3103 if (curusage >= oldusage) 3104 retry_count--; 3105 else 3106 oldusage = curusage; 3107 } while (retry_count); 3108 3109 if (!ret && enlarge) 3110 memcg_oom_recover(memcg); 3111 3112 return ret; 3113 } 3114 3115 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3116 unsigned long limit) 3117 { 3118 unsigned long curusage; 3119 unsigned long oldusage; 3120 bool enlarge = false; 3121 int retry_count; 3122 int ret; 3123 3124 /* see mem_cgroup_resize_res_limit */ 3125 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3126 mem_cgroup_count_children(memcg); 3127 3128 oldusage = page_counter_read(&memcg->memsw); 3129 3130 do { 3131 if (signal_pending(current)) { 3132 ret = -EINTR; 3133 break; 3134 } 3135 3136 mutex_lock(&memcg_limit_mutex); 3137 if (limit < memcg->memory.limit) { 3138 mutex_unlock(&memcg_limit_mutex); 3139 ret = -EINVAL; 3140 break; 3141 } 3142 if (limit > memcg->memsw.limit) 3143 enlarge = true; 3144 ret = page_counter_limit(&memcg->memsw, limit); 3145 mutex_unlock(&memcg_limit_mutex); 3146 3147 if (!ret) 3148 break; 3149 3150 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3151 3152 curusage = page_counter_read(&memcg->memsw); 3153 /* Usage is reduced ? 
*/ 3154 if (curusage >= oldusage) 3155 retry_count--; 3156 else 3157 oldusage = curusage; 3158 } while (retry_count); 3159 3160 if (!ret && enlarge) 3161 memcg_oom_recover(memcg); 3162 3163 return ret; 3164 } 3165 3166 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3167 gfp_t gfp_mask, 3168 unsigned long *total_scanned) 3169 { 3170 unsigned long nr_reclaimed = 0; 3171 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3172 unsigned long reclaimed; 3173 int loop = 0; 3174 struct mem_cgroup_tree_per_zone *mctz; 3175 unsigned long excess; 3176 unsigned long nr_scanned; 3177 3178 if (order > 0) 3179 return 0; 3180 3181 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3182 /* 3183 * This loop can run a while, specially if mem_cgroup's continuously 3184 * keep exceeding their soft limit and putting the system under 3185 * pressure 3186 */ 3187 do { 3188 if (next_mz) 3189 mz = next_mz; 3190 else 3191 mz = mem_cgroup_largest_soft_limit_node(mctz); 3192 if (!mz) 3193 break; 3194 3195 nr_scanned = 0; 3196 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3197 gfp_mask, &nr_scanned); 3198 nr_reclaimed += reclaimed; 3199 *total_scanned += nr_scanned; 3200 spin_lock_irq(&mctz->lock); 3201 __mem_cgroup_remove_exceeded(mz, mctz); 3202 3203 /* 3204 * If we failed to reclaim anything from this memory cgroup 3205 * it is time to move on to the next cgroup 3206 */ 3207 next_mz = NULL; 3208 if (!reclaimed) 3209 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3210 3211 excess = soft_limit_excess(mz->memcg); 3212 /* 3213 * One school of thought says that we should not add 3214 * back the node to the tree if reclaim returns 0. 3215 * But our reclaim could return 0, simply because due 3216 * to priority we are exposing a smaller subset of 3217 * memory to reclaim from. Consider this as a longer 3218 * term TODO. 3219 */ 3220 /* If excess == 0, no tree ops */ 3221 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3222 spin_unlock_irq(&mctz->lock); 3223 css_put(&mz->memcg->css); 3224 loop++; 3225 /* 3226 * Could not reclaim anything and there are no more 3227 * mem cgroups to try or we seem to be looping without 3228 * reclaiming anything. 3229 */ 3230 if (!nr_reclaimed && 3231 (next_mz == NULL || 3232 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3233 break; 3234 } while (!nr_reclaimed); 3235 if (next_mz) 3236 css_put(&next_mz->memcg->css); 3237 return nr_reclaimed; 3238 } 3239 3240 /* 3241 * Test whether @memcg has children, dead or alive. Note that this 3242 * function doesn't care whether @memcg has use_hierarchy enabled and 3243 * returns %true if there are child csses according to the cgroup 3244 * hierarchy. Testing use_hierarchy is the caller's responsiblity. 3245 */ 3246 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3247 { 3248 bool ret; 3249 3250 /* 3251 * The lock does not prevent addition or deletion of children, but 3252 * it prevents a new child from being initialized based on this 3253 * parent in css_online(), so it's enough to decide whether 3254 * hierarchically inherited attributes can still be changed or not. 3255 */ 3256 lockdep_assert_held(&memcg_create_mutex); 3257 3258 rcu_read_lock(); 3259 ret = css_next_child(NULL, &memcg->css); 3260 rcu_read_unlock(); 3261 return ret; 3262 } 3263 3264 /* 3265 * Reclaims as many pages from the given memcg as possible and moves 3266 * the rest to the parent. 3267 * 3268 * Caller is responsible for holding css reference for memcg. 
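 *
 * From userspace this is reached through the legacy memory.force_empty
 * control file, e.g. (hedged, conventional mount point assumed):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty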
3269 */
3270 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3271 {
3272 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3273 
3274 	/* we call try_to_free_mem_cgroup_pages() to make this cgroup empty */
3275 	lru_add_drain_all();
3276 	/* try to free all pages in this cgroup */
3277 	while (nr_retries && page_counter_read(&memcg->memory)) {
3278 		int progress;
3279 
3280 		if (signal_pending(current))
3281 			return -EINTR;
3282 
3283 		progress = try_to_free_mem_cgroup_pages(memcg, 1,
3284 							GFP_KERNEL, true);
3285 		if (!progress) {
3286 			nr_retries--;
3287 			/* maybe some writeback is necessary */
3288 			congestion_wait(BLK_RW_ASYNC, HZ/10);
3289 		}
3290 
3291 	}
3292 
3293 	return 0;
3294 }
3295 
3296 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3297 					    char *buf, size_t nbytes,
3298 					    loff_t off)
3299 {
3300 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3301 
3302 	if (mem_cgroup_is_root(memcg))
3303 		return -EINVAL;
3304 	return mem_cgroup_force_empty(memcg) ?: nbytes;
3305 }
3306 
3307 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3308 				     struct cftype *cft)
3309 {
3310 	return mem_cgroup_from_css(css)->use_hierarchy;
3311 }
3312 
3313 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3314 				      struct cftype *cft, u64 val)
3315 {
3316 	int retval = 0;
3317 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3318 	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3319 
3320 	mutex_lock(&memcg_create_mutex);
3321 
3322 	if (memcg->use_hierarchy == val)
3323 		goto out;
3324 
3325 	/*
3326 	 * If parent's use_hierarchy is set, we can't make any modifications
3327 	 * in the child subtrees. If it is unset, then the change can
3328 	 * occur, provided the current cgroup has no children.
3329 	 *
3330 	 * For the root cgroup, parent_memcg is NULL; we allow the value to
3331 	 * be set if there are no children.
3332 	 */
3333 	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3334 	    (val == 1 || val == 0)) {
3335 		if (!memcg_has_children(memcg))
3336 			memcg->use_hierarchy = val;
3337 		else
3338 			retval = -EBUSY;
3339 	} else
3340 		retval = -EINVAL;
3341 
3342 out:
3343 	mutex_unlock(&memcg_create_mutex);
3344 
3345 	return retval;
3346 }
3347 
3348 static unsigned long tree_stat(struct mem_cgroup *memcg,
3349 			       enum mem_cgroup_stat_index idx)
3350 {
3351 	struct mem_cgroup *iter;
3352 	long val = 0;
3353 
3354 	/* Per-cpu values can be negative, use a signed accumulator */
3355 	for_each_mem_cgroup_tree(iter, memcg)
3356 		val += mem_cgroup_read_stat(iter, idx);
3357 
3358 	if (val < 0) /* race ?
*/ 3359 val = 0; 3360 return val; 3361 } 3362 3363 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3364 { 3365 u64 val; 3366 3367 if (mem_cgroup_is_root(memcg)) { 3368 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 3369 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 3370 if (swap) 3371 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 3372 } else { 3373 if (!swap) 3374 val = page_counter_read(&memcg->memory); 3375 else 3376 val = page_counter_read(&memcg->memsw); 3377 } 3378 return val << PAGE_SHIFT; 3379 } 3380 3381 enum { 3382 RES_USAGE, 3383 RES_LIMIT, 3384 RES_MAX_USAGE, 3385 RES_FAILCNT, 3386 RES_SOFT_LIMIT, 3387 }; 3388 3389 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3390 struct cftype *cft) 3391 { 3392 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3393 struct page_counter *counter; 3394 3395 switch (MEMFILE_TYPE(cft->private)) { 3396 case _MEM: 3397 counter = &memcg->memory; 3398 break; 3399 case _MEMSWAP: 3400 counter = &memcg->memsw; 3401 break; 3402 case _KMEM: 3403 counter = &memcg->kmem; 3404 break; 3405 default: 3406 BUG(); 3407 } 3408 3409 switch (MEMFILE_ATTR(cft->private)) { 3410 case RES_USAGE: 3411 if (counter == &memcg->memory) 3412 return mem_cgroup_usage(memcg, false); 3413 if (counter == &memcg->memsw) 3414 return mem_cgroup_usage(memcg, true); 3415 return (u64)page_counter_read(counter) * PAGE_SIZE; 3416 case RES_LIMIT: 3417 return (u64)counter->limit * PAGE_SIZE; 3418 case RES_MAX_USAGE: 3419 return (u64)counter->watermark * PAGE_SIZE; 3420 case RES_FAILCNT: 3421 return counter->failcnt; 3422 case RES_SOFT_LIMIT: 3423 return (u64)memcg->soft_limit * PAGE_SIZE; 3424 default: 3425 BUG(); 3426 } 3427 } 3428 3429 #ifdef CONFIG_MEMCG_KMEM 3430 static int memcg_activate_kmem(struct mem_cgroup *memcg, 3431 unsigned long nr_pages) 3432 { 3433 int err = 0; 3434 int memcg_id; 3435 3436 if (memcg_kmem_is_active(memcg)) 3437 return 0; 3438 3439 /* 3440 * For simplicity, we won't allow this to be disabled. It also can't 3441 * be changed if the cgroup has children already, or if tasks had 3442 * already joined. 3443 * 3444 * If tasks join before we set the limit, a person looking at 3445 * kmem.usage_in_bytes will have no way to determine when it took 3446 * place, which makes the value quite meaningless. 3447 * 3448 * After it first became limited, changes in the value of the limit are 3449 * of course permitted. 3450 */ 3451 mutex_lock(&memcg_create_mutex); 3452 if (cgroup_has_tasks(memcg->css.cgroup) || 3453 (memcg->use_hierarchy && memcg_has_children(memcg))) 3454 err = -EBUSY; 3455 mutex_unlock(&memcg_create_mutex); 3456 if (err) 3457 goto out; 3458 3459 memcg_id = memcg_alloc_cache_id(); 3460 if (memcg_id < 0) { 3461 err = memcg_id; 3462 goto out; 3463 } 3464 3465 /* 3466 * We couldn't have accounted to this cgroup, because it hasn't got 3467 * activated yet, so this should succeed. 3468 */ 3469 err = page_counter_limit(&memcg->kmem, nr_pages); 3470 VM_BUG_ON(err); 3471 3472 static_key_slow_inc(&memcg_kmem_enabled_key); 3473 /* 3474 * A memory cgroup is considered kmem-active as soon as it gets 3475 * kmemcg_id. Setting the id after enabling static branching will 3476 * guarantee no one starts accounting before all call sites are 3477 * patched. 
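 *
 * For illustration (hedged, conventional mount point assumed):
 * activation happens the first time a kmem limit is set on a group,
 * e.g.
 *
 *	echo 512M > /sys/fs/cgroup/memory/<group>/memory.kmem.limit_in_bytes
 *
 * after which the memcg_kmem_enabled() test in memcontrol.h starts
 * returning true.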
3478 */ 3479 memcg->kmemcg_id = memcg_id; 3480 out: 3481 return err; 3482 } 3483 3484 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3485 unsigned long limit) 3486 { 3487 int ret; 3488 3489 mutex_lock(&memcg_limit_mutex); 3490 if (!memcg_kmem_is_active(memcg)) 3491 ret = memcg_activate_kmem(memcg, limit); 3492 else 3493 ret = page_counter_limit(&memcg->kmem, limit); 3494 mutex_unlock(&memcg_limit_mutex); 3495 return ret; 3496 } 3497 3498 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 3499 { 3500 int ret = 0; 3501 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 3502 3503 if (!parent) 3504 return 0; 3505 3506 mutex_lock(&memcg_limit_mutex); 3507 /* 3508 * If the parent cgroup is not kmem-active now, it cannot be activated 3509 * after this point, because it has at least one child already. 3510 */ 3511 if (memcg_kmem_is_active(parent)) 3512 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 3513 mutex_unlock(&memcg_limit_mutex); 3514 return ret; 3515 } 3516 #else 3517 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3518 unsigned long limit) 3519 { 3520 return -EINVAL; 3521 } 3522 #endif /* CONFIG_MEMCG_KMEM */ 3523 3524 /* 3525 * The user of this function is... 3526 * RES_LIMIT. 3527 */ 3528 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3529 char *buf, size_t nbytes, loff_t off) 3530 { 3531 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3532 unsigned long nr_pages; 3533 int ret; 3534 3535 buf = strstrip(buf); 3536 ret = page_counter_memparse(buf, &nr_pages); 3537 if (ret) 3538 return ret; 3539 3540 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3541 case RES_LIMIT: 3542 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3543 ret = -EINVAL; 3544 break; 3545 } 3546 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3547 case _MEM: 3548 ret = mem_cgroup_resize_limit(memcg, nr_pages); 3549 break; 3550 case _MEMSWAP: 3551 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 3552 break; 3553 case _KMEM: 3554 ret = memcg_update_kmem_limit(memcg, nr_pages); 3555 break; 3556 } 3557 break; 3558 case RES_SOFT_LIMIT: 3559 memcg->soft_limit = nr_pages; 3560 ret = 0; 3561 break; 3562 } 3563 return ret ?: nbytes; 3564 } 3565 3566 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3567 size_t nbytes, loff_t off) 3568 { 3569 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3570 struct page_counter *counter; 3571 3572 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3573 case _MEM: 3574 counter = &memcg->memory; 3575 break; 3576 case _MEMSWAP: 3577 counter = &memcg->memsw; 3578 break; 3579 case _KMEM: 3580 counter = &memcg->kmem; 3581 break; 3582 default: 3583 BUG(); 3584 } 3585 3586 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3587 case RES_MAX_USAGE: 3588 page_counter_reset_watermark(counter); 3589 break; 3590 case RES_FAILCNT: 3591 counter->failcnt = 0; 3592 break; 3593 default: 3594 BUG(); 3595 } 3596 3597 return nbytes; 3598 } 3599 3600 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3601 struct cftype *cft) 3602 { 3603 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3604 } 3605 3606 #ifdef CONFIG_MMU 3607 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3608 struct cftype *cft, u64 val) 3609 { 3610 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3611 3612 if (val >= (1 << NR_MOVE_TYPE)) 3613 return -EINVAL; 3614 3615 /* 3616 * No kind of locking is needed in here, because ->can_attach() will 3617 * check this value once in the beginning 
of the process, and then carry 3618 * on with stale data. This means that changes to this value will only 3619 * affect task migrations starting after the change. 3620 */ 3621 memcg->move_charge_at_immigrate = val; 3622 return 0; 3623 } 3624 #else 3625 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3626 struct cftype *cft, u64 val) 3627 { 3628 return -ENOSYS; 3629 } 3630 #endif 3631 3632 #ifdef CONFIG_NUMA 3633 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3634 { 3635 struct numa_stat { 3636 const char *name; 3637 unsigned int lru_mask; 3638 }; 3639 3640 static const struct numa_stat stats[] = { 3641 { "total", LRU_ALL }, 3642 { "file", LRU_ALL_FILE }, 3643 { "anon", LRU_ALL_ANON }, 3644 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3645 }; 3646 const struct numa_stat *stat; 3647 int nid; 3648 unsigned long nr; 3649 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3650 3651 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3652 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3653 seq_printf(m, "%s=%lu", stat->name, nr); 3654 for_each_node_state(nid, N_MEMORY) { 3655 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3656 stat->lru_mask); 3657 seq_printf(m, " N%d=%lu", nid, nr); 3658 } 3659 seq_putc(m, '\n'); 3660 } 3661 3662 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3663 struct mem_cgroup *iter; 3664 3665 nr = 0; 3666 for_each_mem_cgroup_tree(iter, memcg) 3667 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3668 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3669 for_each_node_state(nid, N_MEMORY) { 3670 nr = 0; 3671 for_each_mem_cgroup_tree(iter, memcg) 3672 nr += mem_cgroup_node_nr_lru_pages( 3673 iter, nid, stat->lru_mask); 3674 seq_printf(m, " N%d=%lu", nid, nr); 3675 } 3676 seq_putc(m, '\n'); 3677 } 3678 3679 return 0; 3680 } 3681 #endif /* CONFIG_NUMA */ 3682 3683 static int memcg_stat_show(struct seq_file *m, void *v) 3684 { 3685 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3686 unsigned long memory, memsw; 3687 struct mem_cgroup *mi; 3688 unsigned int i; 3689 3690 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3691 3692 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3693 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3694 continue; 3695 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 3696 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 3697 } 3698 3699 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 3700 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 3701 mem_cgroup_read_events(memcg, i)); 3702 3703 for (i = 0; i < NR_LRU_LISTS; i++) 3704 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3705 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3706 3707 /* Hierarchical information */ 3708 memory = memsw = PAGE_COUNTER_MAX; 3709 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3710 memory = min(memory, mi->memory.limit); 3711 memsw = min(memsw, mi->memsw.limit); 3712 } 3713 seq_printf(m, "hierarchical_memory_limit %llu\n", 3714 (u64)memory * PAGE_SIZE); 3715 if (do_swap_account) 3716 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3717 (u64)memsw * PAGE_SIZE); 3718 3719 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3720 long long val = 0; 3721 3722 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 3723 continue; 3724 for_each_mem_cgroup_tree(mi, memcg) 3725 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 3726 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 3727 } 3728 3729 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 3730 
unsigned long long val = 0; 3731 3732 for_each_mem_cgroup_tree(mi, memcg) 3733 val += mem_cgroup_read_events(mi, i); 3734 seq_printf(m, "total_%s %llu\n", 3735 mem_cgroup_events_names[i], val); 3736 } 3737 3738 for (i = 0; i < NR_LRU_LISTS; i++) { 3739 unsigned long long val = 0; 3740 3741 for_each_mem_cgroup_tree(mi, memcg) 3742 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 3743 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 3744 } 3745 3746 #ifdef CONFIG_DEBUG_VM 3747 { 3748 int nid, zid; 3749 struct mem_cgroup_per_zone *mz; 3750 struct zone_reclaim_stat *rstat; 3751 unsigned long recent_rotated[2] = {0, 0}; 3752 unsigned long recent_scanned[2] = {0, 0}; 3753 3754 for_each_online_node(nid) 3755 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3756 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 3757 rstat = &mz->lruvec.reclaim_stat; 3758 3759 recent_rotated[0] += rstat->recent_rotated[0]; 3760 recent_rotated[1] += rstat->recent_rotated[1]; 3761 recent_scanned[0] += rstat->recent_scanned[0]; 3762 recent_scanned[1] += rstat->recent_scanned[1]; 3763 } 3764 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3765 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3766 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3767 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3768 } 3769 #endif 3770 3771 return 0; 3772 } 3773 3774 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3775 struct cftype *cft) 3776 { 3777 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3778 3779 return mem_cgroup_swappiness(memcg); 3780 } 3781 3782 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3783 struct cftype *cft, u64 val) 3784 { 3785 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3786 3787 if (val > 100) 3788 return -EINVAL; 3789 3790 if (css->parent) 3791 memcg->swappiness = val; 3792 else 3793 vm_swappiness = val; 3794 3795 return 0; 3796 } 3797 3798 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3799 { 3800 struct mem_cgroup_threshold_ary *t; 3801 unsigned long usage; 3802 int i; 3803 3804 rcu_read_lock(); 3805 if (!swap) 3806 t = rcu_dereference(memcg->thresholds.primary); 3807 else 3808 t = rcu_dereference(memcg->memsw_thresholds.primary); 3809 3810 if (!t) 3811 goto unlock; 3812 3813 usage = mem_cgroup_usage(memcg, swap); 3814 3815 /* 3816 * current_threshold points to threshold just below or equal to usage. 3817 * If it's not true, a threshold was crossed after last 3818 * call of __mem_cgroup_threshold(). 3819 */ 3820 i = t->current_threshold; 3821 3822 /* 3823 * Iterate backward over array of thresholds starting from 3824 * current_threshold and check if a threshold is crossed. 3825 * If none of thresholds below usage is crossed, we read 3826 * only one element of the array here. 3827 */ 3828 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3829 eventfd_signal(t->entries[i].eventfd, 1); 3830 3831 /* i = current_threshold + 1 */ 3832 i++; 3833 3834 /* 3835 * Iterate forward over array of thresholds starting from 3836 * current_threshold+1 and check if a threshold is crossed. 3837 * If none of thresholds above usage is crossed, we read 3838 * only one element of the array here. 
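 *
 * A worked example (hedged): with thresholds registered at 4M, 8M and
 * 16M and usage previously at 6M, current_threshold points at 4M.  If
 * usage is now 20M, the backward scan above signals nothing, the loop
 * below signals the 8M and 16M eventfds, and current_threshold is left
 * pointing at 16M.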
3839 */ 3840 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3841 eventfd_signal(t->entries[i].eventfd, 1); 3842 3843 /* Update current_threshold */ 3844 t->current_threshold = i - 1; 3845 unlock: 3846 rcu_read_unlock(); 3847 } 3848 3849 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3850 { 3851 while (memcg) { 3852 __mem_cgroup_threshold(memcg, false); 3853 if (do_swap_account) 3854 __mem_cgroup_threshold(memcg, true); 3855 3856 memcg = parent_mem_cgroup(memcg); 3857 } 3858 } 3859 3860 static int compare_thresholds(const void *a, const void *b) 3861 { 3862 const struct mem_cgroup_threshold *_a = a; 3863 const struct mem_cgroup_threshold *_b = b; 3864 3865 if (_a->threshold > _b->threshold) 3866 return 1; 3867 3868 if (_a->threshold < _b->threshold) 3869 return -1; 3870 3871 return 0; 3872 } 3873 3874 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3875 { 3876 struct mem_cgroup_eventfd_list *ev; 3877 3878 spin_lock(&memcg_oom_lock); 3879 3880 list_for_each_entry(ev, &memcg->oom_notify, list) 3881 eventfd_signal(ev->eventfd, 1); 3882 3883 spin_unlock(&memcg_oom_lock); 3884 return 0; 3885 } 3886 3887 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3888 { 3889 struct mem_cgroup *iter; 3890 3891 for_each_mem_cgroup_tree(iter, memcg) 3892 mem_cgroup_oom_notify_cb(iter); 3893 } 3894 3895 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3896 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3897 { 3898 struct mem_cgroup_thresholds *thresholds; 3899 struct mem_cgroup_threshold_ary *new; 3900 unsigned long threshold; 3901 unsigned long usage; 3902 int i, size, ret; 3903 3904 ret = page_counter_memparse(args, &threshold); 3905 if (ret) 3906 return ret; 3907 3908 mutex_lock(&memcg->thresholds_lock); 3909 3910 if (type == _MEM) { 3911 thresholds = &memcg->thresholds; 3912 usage = mem_cgroup_usage(memcg, false); 3913 } else if (type == _MEMSWAP) { 3914 thresholds = &memcg->memsw_thresholds; 3915 usage = mem_cgroup_usage(memcg, true); 3916 } else 3917 BUG(); 3918 3919 /* Check if a threshold crossed before adding a new one */ 3920 if (thresholds->primary) 3921 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3922 3923 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3924 3925 /* Allocate memory for new array of thresholds */ 3926 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3927 GFP_KERNEL); 3928 if (!new) { 3929 ret = -ENOMEM; 3930 goto unlock; 3931 } 3932 new->size = size; 3933 3934 /* Copy thresholds (if any) to new array */ 3935 if (thresholds->primary) { 3936 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3937 sizeof(struct mem_cgroup_threshold)); 3938 } 3939 3940 /* Add new threshold */ 3941 new->entries[size - 1].eventfd = eventfd; 3942 new->entries[size - 1].threshold = threshold; 3943 3944 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3945 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3946 compare_thresholds, NULL); 3947 3948 /* Find current threshold */ 3949 new->current_threshold = -1; 3950 for (i = 0; i < size; i++) { 3951 if (new->entries[i].threshold <= usage) { 3952 /* 3953 * new->current_threshold will not be used until 3954 * rcu_assign_pointer(), so it's safe to increment 3955 * it here. 
3956 */ 3957 ++new->current_threshold; 3958 } else 3959 break; 3960 } 3961 3962 /* Free old spare buffer and save old primary buffer as spare */ 3963 kfree(thresholds->spare); 3964 thresholds->spare = thresholds->primary; 3965 3966 rcu_assign_pointer(thresholds->primary, new); 3967 3968 /* To be sure that nobody uses thresholds */ 3969 synchronize_rcu(); 3970 3971 unlock: 3972 mutex_unlock(&memcg->thresholds_lock); 3973 3974 return ret; 3975 } 3976 3977 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3978 struct eventfd_ctx *eventfd, const char *args) 3979 { 3980 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3981 } 3982 3983 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3984 struct eventfd_ctx *eventfd, const char *args) 3985 { 3986 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3987 } 3988 3989 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3990 struct eventfd_ctx *eventfd, enum res_type type) 3991 { 3992 struct mem_cgroup_thresholds *thresholds; 3993 struct mem_cgroup_threshold_ary *new; 3994 unsigned long usage; 3995 int i, j, size; 3996 3997 mutex_lock(&memcg->thresholds_lock); 3998 3999 if (type == _MEM) { 4000 thresholds = &memcg->thresholds; 4001 usage = mem_cgroup_usage(memcg, false); 4002 } else if (type == _MEMSWAP) { 4003 thresholds = &memcg->memsw_thresholds; 4004 usage = mem_cgroup_usage(memcg, true); 4005 } else 4006 BUG(); 4007 4008 if (!thresholds->primary) 4009 goto unlock; 4010 4011 /* Check if a threshold crossed before removing */ 4012 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4013 4014 /* Calculate new number of threshold */ 4015 size = 0; 4016 for (i = 0; i < thresholds->primary->size; i++) { 4017 if (thresholds->primary->entries[i].eventfd != eventfd) 4018 size++; 4019 } 4020 4021 new = thresholds->spare; 4022 4023 /* Set thresholds array to NULL if we don't have thresholds */ 4024 if (!size) { 4025 kfree(new); 4026 new = NULL; 4027 goto swap_buffers; 4028 } 4029 4030 new->size = size; 4031 4032 /* Copy thresholds and find current threshold */ 4033 new->current_threshold = -1; 4034 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4035 if (thresholds->primary->entries[i].eventfd == eventfd) 4036 continue; 4037 4038 new->entries[j] = thresholds->primary->entries[i]; 4039 if (new->entries[j].threshold <= usage) { 4040 /* 4041 * new->current_threshold will not be used 4042 * until rcu_assign_pointer(), so it's safe to increment 4043 * it here. 
4044 */ 4045 ++new->current_threshold; 4046 } 4047 j++; 4048 } 4049 4050 swap_buffers: 4051 /* Swap primary and spare array */ 4052 thresholds->spare = thresholds->primary; 4053 /* If all events are unregistered, free the spare array */ 4054 if (!new) { 4055 kfree(thresholds->spare); 4056 thresholds->spare = NULL; 4057 } 4058 4059 rcu_assign_pointer(thresholds->primary, new); 4060 4061 /* To be sure that nobody uses thresholds */ 4062 synchronize_rcu(); 4063 unlock: 4064 mutex_unlock(&memcg->thresholds_lock); 4065 } 4066 4067 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4068 struct eventfd_ctx *eventfd) 4069 { 4070 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4071 } 4072 4073 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4074 struct eventfd_ctx *eventfd) 4075 { 4076 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4077 } 4078 4079 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4080 struct eventfd_ctx *eventfd, const char *args) 4081 { 4082 struct mem_cgroup_eventfd_list *event; 4083 4084 event = kmalloc(sizeof(*event), GFP_KERNEL); 4085 if (!event) 4086 return -ENOMEM; 4087 4088 spin_lock(&memcg_oom_lock); 4089 4090 event->eventfd = eventfd; 4091 list_add(&event->list, &memcg->oom_notify); 4092 4093 /* already in OOM ? */ 4094 if (atomic_read(&memcg->under_oom)) 4095 eventfd_signal(eventfd, 1); 4096 spin_unlock(&memcg_oom_lock); 4097 4098 return 0; 4099 } 4100 4101 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4102 struct eventfd_ctx *eventfd) 4103 { 4104 struct mem_cgroup_eventfd_list *ev, *tmp; 4105 4106 spin_lock(&memcg_oom_lock); 4107 4108 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4109 if (ev->eventfd == eventfd) { 4110 list_del(&ev->list); 4111 kfree(ev); 4112 } 4113 } 4114 4115 spin_unlock(&memcg_oom_lock); 4116 } 4117 4118 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4119 { 4120 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4121 4122 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4123 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4124 return 0; 4125 } 4126 4127 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4128 struct cftype *cft, u64 val) 4129 { 4130 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4131 4132 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4133 if (!css->parent || !((val == 0) || (val == 1))) 4134 return -EINVAL; 4135 4136 memcg->oom_kill_disable = val; 4137 if (!val) 4138 memcg_oom_recover(memcg); 4139 4140 return 0; 4141 } 4142 4143 #ifdef CONFIG_MEMCG_KMEM 4144 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4145 { 4146 int ret; 4147 4148 ret = memcg_propagate_kmem(memcg); 4149 if (ret) 4150 return ret; 4151 4152 return mem_cgroup_sockets_init(memcg, ss); 4153 } 4154 4155 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4156 { 4157 memcg_unregister_all_caches(memcg); 4158 mem_cgroup_sockets_destroy(memcg); 4159 } 4160 #else 4161 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4162 { 4163 return 0; 4164 } 4165 4166 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4167 { 4168 } 4169 #endif 4170 4171 /* 4172 * DO NOT USE IN NEW FILES. 4173 * 4174 * "cgroup.event_control" implementation. 4175 * 4176 * This is way over-engineered. It tries to support fully configurable 4177 * events for each user. 
Such level of flexibility is completely 4178 * unnecessary especially in the light of the planned unified hierarchy. 4179 * 4180 * Please deprecate this and replace with something simpler if at all 4181 * possible. 4182 */ 4183 4184 /* 4185 * Unregister event and free resources. 4186 * 4187 * Gets called from workqueue. 4188 */ 4189 static void memcg_event_remove(struct work_struct *work) 4190 { 4191 struct mem_cgroup_event *event = 4192 container_of(work, struct mem_cgroup_event, remove); 4193 struct mem_cgroup *memcg = event->memcg; 4194 4195 remove_wait_queue(event->wqh, &event->wait); 4196 4197 event->unregister_event(memcg, event->eventfd); 4198 4199 /* Notify userspace the event is going away. */ 4200 eventfd_signal(event->eventfd, 1); 4201 4202 eventfd_ctx_put(event->eventfd); 4203 kfree(event); 4204 css_put(&memcg->css); 4205 } 4206 4207 /* 4208 * Gets called on POLLHUP on eventfd when user closes it. 4209 * 4210 * Called with wqh->lock held and interrupts disabled. 4211 */ 4212 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4213 int sync, void *key) 4214 { 4215 struct mem_cgroup_event *event = 4216 container_of(wait, struct mem_cgroup_event, wait); 4217 struct mem_cgroup *memcg = event->memcg; 4218 unsigned long flags = (unsigned long)key; 4219 4220 if (flags & POLLHUP) { 4221 /* 4222 * If the event has been detached at cgroup removal, we 4223 * can simply return knowing the other side will cleanup 4224 * for us. 4225 * 4226 * We can't race against event freeing since the other 4227 * side will require wqh->lock via remove_wait_queue(), 4228 * which we hold. 4229 */ 4230 spin_lock(&memcg->event_list_lock); 4231 if (!list_empty(&event->list)) { 4232 list_del_init(&event->list); 4233 /* 4234 * We are in atomic context, but cgroup_event_remove() 4235 * may sleep, so we have to call it in workqueue. 4236 */ 4237 schedule_work(&event->remove); 4238 } 4239 spin_unlock(&memcg->event_list_lock); 4240 } 4241 4242 return 0; 4243 } 4244 4245 static void memcg_event_ptable_queue_proc(struct file *file, 4246 wait_queue_head_t *wqh, poll_table *pt) 4247 { 4248 struct mem_cgroup_event *event = 4249 container_of(pt, struct mem_cgroup_event, pt); 4250 4251 event->wqh = wqh; 4252 add_wait_queue(wqh, &event->wait); 4253 } 4254 4255 /* 4256 * DO NOT USE IN NEW FILES. 4257 * 4258 * Parse input and register new cgroup event handler. 4259 * 4260 * Input must be in format '<event_fd> <control_fd> <args>'. 4261 * Interpretation of args is defined by control file implementation. 
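 *
 * Illustrative example (the path and the 8M value are assumptions, not
 * part of this file): to arm a usage threshold, userspace creates an
 * eventfd, opens memory.usage_in_bytes in the same cgroup directory, and
 * writes
 *
 *	"<eventfd-fd> <usage_in_bytes-fd> 8388608"
 *
 * to cgroup.event_control; the eventfd is then signalled each time usage
 * crosses 8M in either direction.  A fuller userspace sketch is appended
 * as a comment at the end of this file.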
4262 */ 4263 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4264 char *buf, size_t nbytes, loff_t off) 4265 { 4266 struct cgroup_subsys_state *css = of_css(of); 4267 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4268 struct mem_cgroup_event *event; 4269 struct cgroup_subsys_state *cfile_css; 4270 unsigned int efd, cfd; 4271 struct fd efile; 4272 struct fd cfile; 4273 const char *name; 4274 char *endp; 4275 int ret; 4276 4277 buf = strstrip(buf); 4278 4279 efd = simple_strtoul(buf, &endp, 10); 4280 if (*endp != ' ') 4281 return -EINVAL; 4282 buf = endp + 1; 4283 4284 cfd = simple_strtoul(buf, &endp, 10); 4285 if ((*endp != ' ') && (*endp != '\0')) 4286 return -EINVAL; 4287 buf = endp + 1; 4288 4289 event = kzalloc(sizeof(*event), GFP_KERNEL); 4290 if (!event) 4291 return -ENOMEM; 4292 4293 event->memcg = memcg; 4294 INIT_LIST_HEAD(&event->list); 4295 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4296 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4297 INIT_WORK(&event->remove, memcg_event_remove); 4298 4299 efile = fdget(efd); 4300 if (!efile.file) { 4301 ret = -EBADF; 4302 goto out_kfree; 4303 } 4304 4305 event->eventfd = eventfd_ctx_fileget(efile.file); 4306 if (IS_ERR(event->eventfd)) { 4307 ret = PTR_ERR(event->eventfd); 4308 goto out_put_efile; 4309 } 4310 4311 cfile = fdget(cfd); 4312 if (!cfile.file) { 4313 ret = -EBADF; 4314 goto out_put_eventfd; 4315 } 4316 4317 /* the process need read permission on control file */ 4318 /* AV: shouldn't we check that it's been opened for read instead? */ 4319 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4320 if (ret < 0) 4321 goto out_put_cfile; 4322 4323 /* 4324 * Determine the event callbacks and set them in @event. This used 4325 * to be done via struct cftype but cgroup core no longer knows 4326 * about these events. The following is crude but the whole thing 4327 * is for compatibility anyway. 4328 * 4329 * DO NOT ADD NEW FILES. 4330 */ 4331 name = cfile.file->f_path.dentry->d_name.name; 4332 4333 if (!strcmp(name, "memory.usage_in_bytes")) { 4334 event->register_event = mem_cgroup_usage_register_event; 4335 event->unregister_event = mem_cgroup_usage_unregister_event; 4336 } else if (!strcmp(name, "memory.oom_control")) { 4337 event->register_event = mem_cgroup_oom_register_event; 4338 event->unregister_event = mem_cgroup_oom_unregister_event; 4339 } else if (!strcmp(name, "memory.pressure_level")) { 4340 event->register_event = vmpressure_register_event; 4341 event->unregister_event = vmpressure_unregister_event; 4342 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4343 event->register_event = memsw_cgroup_usage_register_event; 4344 event->unregister_event = memsw_cgroup_usage_unregister_event; 4345 } else { 4346 ret = -EINVAL; 4347 goto out_put_cfile; 4348 } 4349 4350 /* 4351 * Verify @cfile should belong to @css. Also, remaining events are 4352 * automatically removed on cgroup destruction but the removal is 4353 * asynchronous, so take an extra ref on @css. 
4354 */ 4355 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4356 &memory_cgrp_subsys); 4357 ret = -EINVAL; 4358 if (IS_ERR(cfile_css)) 4359 goto out_put_cfile; 4360 if (cfile_css != css) { 4361 css_put(cfile_css); 4362 goto out_put_cfile; 4363 } 4364 4365 ret = event->register_event(memcg, event->eventfd, buf); 4366 if (ret) 4367 goto out_put_css; 4368 4369 efile.file->f_op->poll(efile.file, &event->pt); 4370 4371 spin_lock(&memcg->event_list_lock); 4372 list_add(&event->list, &memcg->event_list); 4373 spin_unlock(&memcg->event_list_lock); 4374 4375 fdput(cfile); 4376 fdput(efile); 4377 4378 return nbytes; 4379 4380 out_put_css: 4381 css_put(css); 4382 out_put_cfile: 4383 fdput(cfile); 4384 out_put_eventfd: 4385 eventfd_ctx_put(event->eventfd); 4386 out_put_efile: 4387 fdput(efile); 4388 out_kfree: 4389 kfree(event); 4390 4391 return ret; 4392 } 4393 4394 static struct cftype mem_cgroup_files[] = { 4395 { 4396 .name = "usage_in_bytes", 4397 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4398 .read_u64 = mem_cgroup_read_u64, 4399 }, 4400 { 4401 .name = "max_usage_in_bytes", 4402 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4403 .write = mem_cgroup_reset, 4404 .read_u64 = mem_cgroup_read_u64, 4405 }, 4406 { 4407 .name = "limit_in_bytes", 4408 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4409 .write = mem_cgroup_write, 4410 .read_u64 = mem_cgroup_read_u64, 4411 }, 4412 { 4413 .name = "soft_limit_in_bytes", 4414 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4415 .write = mem_cgroup_write, 4416 .read_u64 = mem_cgroup_read_u64, 4417 }, 4418 { 4419 .name = "failcnt", 4420 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4421 .write = mem_cgroup_reset, 4422 .read_u64 = mem_cgroup_read_u64, 4423 }, 4424 { 4425 .name = "stat", 4426 .seq_show = memcg_stat_show, 4427 }, 4428 { 4429 .name = "force_empty", 4430 .write = mem_cgroup_force_empty_write, 4431 }, 4432 { 4433 .name = "use_hierarchy", 4434 .write_u64 = mem_cgroup_hierarchy_write, 4435 .read_u64 = mem_cgroup_hierarchy_read, 4436 }, 4437 { 4438 .name = "cgroup.event_control", /* XXX: for compat */ 4439 .write = memcg_write_event_control, 4440 .flags = CFTYPE_NO_PREFIX, 4441 .mode = S_IWUGO, 4442 }, 4443 { 4444 .name = "swappiness", 4445 .read_u64 = mem_cgroup_swappiness_read, 4446 .write_u64 = mem_cgroup_swappiness_write, 4447 }, 4448 { 4449 .name = "move_charge_at_immigrate", 4450 .read_u64 = mem_cgroup_move_charge_read, 4451 .write_u64 = mem_cgroup_move_charge_write, 4452 }, 4453 { 4454 .name = "oom_control", 4455 .seq_show = mem_cgroup_oom_control_read, 4456 .write_u64 = mem_cgroup_oom_control_write, 4457 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4458 }, 4459 { 4460 .name = "pressure_level", 4461 }, 4462 #ifdef CONFIG_NUMA 4463 { 4464 .name = "numa_stat", 4465 .seq_show = memcg_numa_stat_show, 4466 }, 4467 #endif 4468 #ifdef CONFIG_MEMCG_KMEM 4469 { 4470 .name = "kmem.limit_in_bytes", 4471 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4472 .write = mem_cgroup_write, 4473 .read_u64 = mem_cgroup_read_u64, 4474 }, 4475 { 4476 .name = "kmem.usage_in_bytes", 4477 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4478 .read_u64 = mem_cgroup_read_u64, 4479 }, 4480 { 4481 .name = "kmem.failcnt", 4482 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4483 .write = mem_cgroup_reset, 4484 .read_u64 = mem_cgroup_read_u64, 4485 }, 4486 { 4487 .name = "kmem.max_usage_in_bytes", 4488 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4489 .write = mem_cgroup_reset, 4490 .read_u64 = mem_cgroup_read_u64, 4491 }, 4492 #ifdef 
CONFIG_SLABINFO 4493 { 4494 .name = "kmem.slabinfo", 4495 .seq_start = slab_start, 4496 .seq_next = slab_next, 4497 .seq_stop = slab_stop, 4498 .seq_show = memcg_slab_show, 4499 }, 4500 #endif 4501 #endif 4502 { }, /* terminate */ 4503 }; 4504 4505 #ifdef CONFIG_MEMCG_SWAP 4506 static struct cftype memsw_cgroup_files[] = { 4507 { 4508 .name = "memsw.usage_in_bytes", 4509 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4510 .read_u64 = mem_cgroup_read_u64, 4511 }, 4512 { 4513 .name = "memsw.max_usage_in_bytes", 4514 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4515 .write = mem_cgroup_reset, 4516 .read_u64 = mem_cgroup_read_u64, 4517 }, 4518 { 4519 .name = "memsw.limit_in_bytes", 4520 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4521 .write = mem_cgroup_write, 4522 .read_u64 = mem_cgroup_read_u64, 4523 }, 4524 { 4525 .name = "memsw.failcnt", 4526 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4527 .write = mem_cgroup_reset, 4528 .read_u64 = mem_cgroup_read_u64, 4529 }, 4530 { }, /* terminate */ 4531 }; 4532 #endif 4533 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4534 { 4535 struct mem_cgroup_per_node *pn; 4536 struct mem_cgroup_per_zone *mz; 4537 int zone, tmp = node; 4538 /* 4539 * This routine is called against possible nodes. 4540 * But it's BUG to call kmalloc() against offline node. 4541 * 4542 * TODO: this routine can waste much memory for nodes which will 4543 * never be onlined. It's better to use memory hotplug callback 4544 * function. 4545 */ 4546 if (!node_state(node, N_NORMAL_MEMORY)) 4547 tmp = -1; 4548 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4549 if (!pn) 4550 return 1; 4551 4552 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4553 mz = &pn->zoneinfo[zone]; 4554 lruvec_init(&mz->lruvec); 4555 mz->usage_in_excess = 0; 4556 mz->on_tree = false; 4557 mz->memcg = memcg; 4558 } 4559 memcg->nodeinfo[node] = pn; 4560 return 0; 4561 } 4562 4563 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4564 { 4565 kfree(memcg->nodeinfo[node]); 4566 } 4567 4568 static struct mem_cgroup *mem_cgroup_alloc(void) 4569 { 4570 struct mem_cgroup *memcg; 4571 size_t size; 4572 4573 size = sizeof(struct mem_cgroup); 4574 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4575 4576 memcg = kzalloc(size, GFP_KERNEL); 4577 if (!memcg) 4578 return NULL; 4579 4580 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4581 if (!memcg->stat) 4582 goto out_free; 4583 spin_lock_init(&memcg->pcp_counter_lock); 4584 return memcg; 4585 4586 out_free: 4587 kfree(memcg); 4588 return NULL; 4589 } 4590 4591 /* 4592 * At destroying mem_cgroup, references from swap_cgroup can remain. 4593 * (scanning all at force_empty is too costly...) 4594 * 4595 * Instead of clearing all references at force_empty, we remember 4596 * the number of reference from swap_cgroup and free mem_cgroup when 4597 * it goes down to 0. 4598 * 4599 * Removal of cgroup itself succeeds regardless of refs from swap. 4600 */ 4601 4602 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4603 { 4604 int node; 4605 4606 mem_cgroup_remove_from_trees(memcg); 4607 4608 for_each_node(node) 4609 free_mem_cgroup_per_zone_info(memcg, node); 4610 4611 free_percpu(memcg->stat); 4612 4613 disarm_static_keys(memcg); 4614 kfree(memcg); 4615 } 4616 4617 /* 4618 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
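 * Returns NULL when the memory counter has no parent, i.e. for the root
 * memcg and for children created with use_hierarchy disabled.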
4619 */ 4620 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4621 { 4622 if (!memcg->memory.parent) 4623 return NULL; 4624 return mem_cgroup_from_counter(memcg->memory.parent, memory); 4625 } 4626 EXPORT_SYMBOL(parent_mem_cgroup); 4627 4628 static void __init mem_cgroup_soft_limit_tree_init(void) 4629 { 4630 struct mem_cgroup_tree_per_node *rtpn; 4631 struct mem_cgroup_tree_per_zone *rtpz; 4632 int tmp, node, zone; 4633 4634 for_each_node(node) { 4635 tmp = node; 4636 if (!node_state(node, N_NORMAL_MEMORY)) 4637 tmp = -1; 4638 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4639 BUG_ON(!rtpn); 4640 4641 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4642 4643 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4644 rtpz = &rtpn->rb_tree_per_zone[zone]; 4645 rtpz->rb_root = RB_ROOT; 4646 spin_lock_init(&rtpz->lock); 4647 } 4648 } 4649 } 4650 4651 static struct cgroup_subsys_state * __ref 4652 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4653 { 4654 struct mem_cgroup *memcg; 4655 long error = -ENOMEM; 4656 int node; 4657 4658 memcg = mem_cgroup_alloc(); 4659 if (!memcg) 4660 return ERR_PTR(error); 4661 4662 for_each_node(node) 4663 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4664 goto free_out; 4665 4666 /* root ? */ 4667 if (parent_css == NULL) { 4668 root_mem_cgroup = memcg; 4669 page_counter_init(&memcg->memory, NULL); 4670 memcg->soft_limit = PAGE_COUNTER_MAX; 4671 page_counter_init(&memcg->memsw, NULL); 4672 page_counter_init(&memcg->kmem, NULL); 4673 } 4674 4675 memcg->last_scanned_node = MAX_NUMNODES; 4676 INIT_LIST_HEAD(&memcg->oom_notify); 4677 memcg->move_charge_at_immigrate = 0; 4678 mutex_init(&memcg->thresholds_lock); 4679 spin_lock_init(&memcg->move_lock); 4680 vmpressure_init(&memcg->vmpressure); 4681 INIT_LIST_HEAD(&memcg->event_list); 4682 spin_lock_init(&memcg->event_list_lock); 4683 #ifdef CONFIG_MEMCG_KMEM 4684 memcg->kmemcg_id = -1; 4685 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4686 #endif 4687 4688 return &memcg->css; 4689 4690 free_out: 4691 __mem_cgroup_free(memcg); 4692 return ERR_PTR(error); 4693 } 4694 4695 static int 4696 mem_cgroup_css_online(struct cgroup_subsys_state *css) 4697 { 4698 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4699 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 4700 int ret; 4701 4702 if (css->id > MEM_CGROUP_ID_MAX) 4703 return -ENOSPC; 4704 4705 if (!parent) 4706 return 0; 4707 4708 mutex_lock(&memcg_create_mutex); 4709 4710 memcg->use_hierarchy = parent->use_hierarchy; 4711 memcg->oom_kill_disable = parent->oom_kill_disable; 4712 memcg->swappiness = mem_cgroup_swappiness(parent); 4713 4714 if (parent->use_hierarchy) { 4715 page_counter_init(&memcg->memory, &parent->memory); 4716 memcg->soft_limit = PAGE_COUNTER_MAX; 4717 page_counter_init(&memcg->memsw, &parent->memsw); 4718 page_counter_init(&memcg->kmem, &parent->kmem); 4719 4720 /* 4721 * No need to take a reference to the parent because cgroup 4722 * core guarantees its existence. 4723 */ 4724 } else { 4725 page_counter_init(&memcg->memory, NULL); 4726 memcg->soft_limit = PAGE_COUNTER_MAX; 4727 page_counter_init(&memcg->memsw, NULL); 4728 page_counter_init(&memcg->kmem, NULL); 4729 /* 4730 * Deeper hierachy with use_hierarchy == false doesn't make 4731 * much sense so let cgroup subsystem know about this 4732 * unfortunate state in our controller. 
4733 */ 4734 if (parent != root_mem_cgroup) 4735 memory_cgrp_subsys.broken_hierarchy = true; 4736 } 4737 mutex_unlock(&memcg_create_mutex); 4738 4739 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 4740 if (ret) 4741 return ret; 4742 4743 /* 4744 * Make sure the memcg is initialized: mem_cgroup_iter() 4745 * orders reading memcg->initialized against its callers 4746 * reading the memcg members. 4747 */ 4748 smp_store_release(&memcg->initialized, 1); 4749 4750 return 0; 4751 } 4752 4753 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4754 { 4755 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4756 struct mem_cgroup_event *event, *tmp; 4757 4758 /* 4759 * Unregister events and notify userspace. 4760 * Notify userspace about cgroup removing only after rmdir of cgroup 4761 * directory to avoid race between userspace and kernelspace. 4762 */ 4763 spin_lock(&memcg->event_list_lock); 4764 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4765 list_del_init(&event->list); 4766 schedule_work(&event->remove); 4767 } 4768 spin_unlock(&memcg->event_list_lock); 4769 4770 vmpressure_cleanup(&memcg->vmpressure); 4771 } 4772 4773 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4774 { 4775 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4776 4777 memcg_destroy_kmem(memcg); 4778 __mem_cgroup_free(memcg); 4779 } 4780 4781 /** 4782 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4783 * @css: the target css 4784 * 4785 * Reset the states of the mem_cgroup associated with @css. This is 4786 * invoked when the userland requests disabling on the default hierarchy 4787 * but the memcg is pinned through dependency. The memcg should stop 4788 * applying policies and should revert to the vanilla state as it may be 4789 * made visible again. 4790 * 4791 * The current implementation only resets the essential configurations. 4792 * This needs to be expanded to cover all the visible parts. 4793 */ 4794 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4795 { 4796 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4797 4798 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4799 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4800 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4801 memcg->soft_limit = PAGE_COUNTER_MAX; 4802 } 4803 4804 #ifdef CONFIG_MMU 4805 /* Handlers for move charge at task migration. */ 4806 static int mem_cgroup_do_precharge(unsigned long count) 4807 { 4808 int ret; 4809 4810 /* Try a single bulk charge without reclaim first */ 4811 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 4812 if (!ret) { 4813 mc.precharge += count; 4814 return ret; 4815 } 4816 if (ret == -EINTR) { 4817 cancel_charge(root_mem_cgroup, count); 4818 return ret; 4819 } 4820 4821 /* Try charges one by one with reclaim */ 4822 while (count--) { 4823 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 4824 /* 4825 * In case of failure, any residual charges against 4826 * mc.to will be dropped by mem_cgroup_clear_mc() 4827 * later on. However, cancel any charges that are 4828 * bypassed to root right away or they'll be lost. 
4829 */ 4830 if (ret == -EINTR) 4831 cancel_charge(root_mem_cgroup, 1); 4832 if (ret) 4833 return ret; 4834 mc.precharge++; 4835 cond_resched(); 4836 } 4837 return 0; 4838 } 4839 4840 /** 4841 * get_mctgt_type - get target type of moving charge 4842 * @vma: the vma the pte to be checked belongs 4843 * @addr: the address corresponding to the pte to be checked 4844 * @ptent: the pte to be checked 4845 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4846 * 4847 * Returns 4848 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4849 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4850 * move charge. if @target is not NULL, the page is stored in target->page 4851 * with extra refcnt got(Callers should handle it). 4852 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4853 * target for charge migration. if @target is not NULL, the entry is stored 4854 * in target->ent. 4855 * 4856 * Called with pte lock held. 4857 */ 4858 union mc_target { 4859 struct page *page; 4860 swp_entry_t ent; 4861 }; 4862 4863 enum mc_target_type { 4864 MC_TARGET_NONE = 0, 4865 MC_TARGET_PAGE, 4866 MC_TARGET_SWAP, 4867 }; 4868 4869 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4870 unsigned long addr, pte_t ptent) 4871 { 4872 struct page *page = vm_normal_page(vma, addr, ptent); 4873 4874 if (!page || !page_mapped(page)) 4875 return NULL; 4876 if (PageAnon(page)) { 4877 /* we don't move shared anon */ 4878 if (!move_anon()) 4879 return NULL; 4880 } else if (!move_file()) 4881 /* we ignore mapcount for file pages */ 4882 return NULL; 4883 if (!get_page_unless_zero(page)) 4884 return NULL; 4885 4886 return page; 4887 } 4888 4889 #ifdef CONFIG_SWAP 4890 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4891 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4892 { 4893 struct page *page = NULL; 4894 swp_entry_t ent = pte_to_swp_entry(ptent); 4895 4896 if (!move_anon() || non_swap_entry(ent)) 4897 return NULL; 4898 /* 4899 * Because lookup_swap_cache() updates some statistics counter, 4900 * we call find_get_page() with swapper_space directly. 4901 */ 4902 page = find_get_page(swap_address_space(ent), ent.val); 4903 if (do_swap_account) 4904 entry->val = ent.val; 4905 4906 return page; 4907 } 4908 #else 4909 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4910 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4911 { 4912 return NULL; 4913 } 4914 #endif 4915 4916 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4917 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4918 { 4919 struct page *page = NULL; 4920 struct address_space *mapping; 4921 pgoff_t pgoff; 4922 4923 if (!vma->vm_file) /* anonymous vma */ 4924 return NULL; 4925 if (!move_file()) 4926 return NULL; 4927 4928 mapping = vma->vm_file->f_mapping; 4929 if (pte_none(ptent)) 4930 pgoff = linear_page_index(vma, addr); 4931 else /* pte_file(ptent) is true */ 4932 pgoff = pte_to_pgoff(ptent); 4933 4934 /* page is moved even if it's not RSS of this task(page-faulted). */ 4935 #ifdef CONFIG_SWAP 4936 /* shmem/tmpfs may report page out on swap: account for that too. 
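 * In that case find_get_entry() returns an exceptional radix-tree entry
 * encoding the swap slot, which is translated back via radix_to_swp_entry().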
*/ 4937 if (shmem_mapping(mapping)) { 4938 page = find_get_entry(mapping, pgoff); 4939 if (radix_tree_exceptional_entry(page)) { 4940 swp_entry_t swp = radix_to_swp_entry(page); 4941 if (do_swap_account) 4942 *entry = swp; 4943 page = find_get_page(swap_address_space(swp), swp.val); 4944 } 4945 } else 4946 page = find_get_page(mapping, pgoff); 4947 #else 4948 page = find_get_page(mapping, pgoff); 4949 #endif 4950 return page; 4951 } 4952 4953 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4954 unsigned long addr, pte_t ptent, union mc_target *target) 4955 { 4956 struct page *page = NULL; 4957 enum mc_target_type ret = MC_TARGET_NONE; 4958 swp_entry_t ent = { .val = 0 }; 4959 4960 if (pte_present(ptent)) 4961 page = mc_handle_present_pte(vma, addr, ptent); 4962 else if (is_swap_pte(ptent)) 4963 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4964 else if (pte_none(ptent) || pte_file(ptent)) 4965 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4966 4967 if (!page && !ent.val) 4968 return ret; 4969 if (page) { 4970 /* 4971 * Do only loose check w/o serialization. 4972 * mem_cgroup_move_account() checks the page is valid or 4973 * not under LRU exclusion. 4974 */ 4975 if (page->mem_cgroup == mc.from) { 4976 ret = MC_TARGET_PAGE; 4977 if (target) 4978 target->page = page; 4979 } 4980 if (!ret || !target) 4981 put_page(page); 4982 } 4983 /* There is a swap entry and a page doesn't exist or isn't charged */ 4984 if (ent.val && !ret && 4985 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4986 ret = MC_TARGET_SWAP; 4987 if (target) 4988 target->ent = ent; 4989 } 4990 return ret; 4991 } 4992 4993 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4994 /* 4995 * We don't consider swapping or file mapped pages because THP does not 4996 * support them for now. 4997 * Caller should make sure that pmd_trans_huge(pmd) is true. 
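 * A reference is taken on the returned page only when @target is
 * non-NULL; the precharge walk passes a NULL @target since it only needs
 * the target type.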
4998 */ 4999 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5000 unsigned long addr, pmd_t pmd, union mc_target *target) 5001 { 5002 struct page *page = NULL; 5003 enum mc_target_type ret = MC_TARGET_NONE; 5004 5005 page = pmd_page(pmd); 5006 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5007 if (!move_anon()) 5008 return ret; 5009 if (page->mem_cgroup == mc.from) { 5010 ret = MC_TARGET_PAGE; 5011 if (target) { 5012 get_page(page); 5013 target->page = page; 5014 } 5015 } 5016 return ret; 5017 } 5018 #else 5019 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5020 unsigned long addr, pmd_t pmd, union mc_target *target) 5021 { 5022 return MC_TARGET_NONE; 5023 } 5024 #endif 5025 5026 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5027 unsigned long addr, unsigned long end, 5028 struct mm_walk *walk) 5029 { 5030 struct vm_area_struct *vma = walk->private; 5031 pte_t *pte; 5032 spinlock_t *ptl; 5033 5034 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5035 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5036 mc.precharge += HPAGE_PMD_NR; 5037 spin_unlock(ptl); 5038 return 0; 5039 } 5040 5041 if (pmd_trans_unstable(pmd)) 5042 return 0; 5043 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5044 for (; addr != end; pte++, addr += PAGE_SIZE) 5045 if (get_mctgt_type(vma, addr, *pte, NULL)) 5046 mc.precharge++; /* increment precharge temporarily */ 5047 pte_unmap_unlock(pte - 1, ptl); 5048 cond_resched(); 5049 5050 return 0; 5051 } 5052 5053 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5054 { 5055 unsigned long precharge; 5056 struct vm_area_struct *vma; 5057 5058 down_read(&mm->mmap_sem); 5059 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5060 struct mm_walk mem_cgroup_count_precharge_walk = { 5061 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5062 .mm = mm, 5063 .private = vma, 5064 }; 5065 if (is_vm_hugetlb_page(vma)) 5066 continue; 5067 walk_page_range(vma->vm_start, vma->vm_end, 5068 &mem_cgroup_count_precharge_walk); 5069 } 5070 up_read(&mm->mmap_sem); 5071 5072 precharge = mc.precharge; 5073 mc.precharge = 0; 5074 5075 return precharge; 5076 } 5077 5078 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5079 { 5080 unsigned long precharge = mem_cgroup_count_precharge(mm); 5081 5082 VM_BUG_ON(mc.moving_task); 5083 mc.moving_task = current; 5084 return mem_cgroup_do_precharge(precharge); 5085 } 5086 5087 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5088 static void __mem_cgroup_clear_mc(void) 5089 { 5090 struct mem_cgroup *from = mc.from; 5091 struct mem_cgroup *to = mc.to; 5092 5093 /* we must uncharge all the leftover precharges from mc.to */ 5094 if (mc.precharge) { 5095 cancel_charge(mc.to, mc.precharge); 5096 mc.precharge = 0; 5097 } 5098 /* 5099 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5100 * we must uncharge here. 5101 */ 5102 if (mc.moved_charge) { 5103 cancel_charge(mc.from, mc.moved_charge); 5104 mc.moved_charge = 0; 5105 } 5106 /* we must fixup refcnts and charges */ 5107 if (mc.moved_swap) { 5108 /* uncharge swap account from the old cgroup */ 5109 if (!mem_cgroup_is_root(mc.from)) 5110 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5111 5112 /* 5113 * we charged both to->memory and to->memsw, so we 5114 * should uncharge to->memory. 
5115 */ 5116 if (!mem_cgroup_is_root(mc.to)) 5117 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5118 5119 css_put_many(&mc.from->css, mc.moved_swap); 5120 5121 /* we've already done css_get(mc.to) */ 5122 mc.moved_swap = 0; 5123 } 5124 memcg_oom_recover(from); 5125 memcg_oom_recover(to); 5126 wake_up_all(&mc.waitq); 5127 } 5128 5129 static void mem_cgroup_clear_mc(void) 5130 { 5131 /* 5132 * we must clear moving_task before waking up waiters at the end of 5133 * task migration. 5134 */ 5135 mc.moving_task = NULL; 5136 __mem_cgroup_clear_mc(); 5137 spin_lock(&mc.lock); 5138 mc.from = NULL; 5139 mc.to = NULL; 5140 spin_unlock(&mc.lock); 5141 } 5142 5143 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5144 struct cgroup_taskset *tset) 5145 { 5146 struct task_struct *p = cgroup_taskset_first(tset); 5147 int ret = 0; 5148 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5149 unsigned long move_charge_at_immigrate; 5150 5151 /* 5152 * We are now commited to this value whatever it is. Changes in this 5153 * tunable will only affect upcoming migrations, not the current one. 5154 * So we need to save it, and keep it going. 5155 */ 5156 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5157 if (move_charge_at_immigrate) { 5158 struct mm_struct *mm; 5159 struct mem_cgroup *from = mem_cgroup_from_task(p); 5160 5161 VM_BUG_ON(from == memcg); 5162 5163 mm = get_task_mm(p); 5164 if (!mm) 5165 return 0; 5166 /* We move charges only when we move a owner of the mm */ 5167 if (mm->owner == p) { 5168 VM_BUG_ON(mc.from); 5169 VM_BUG_ON(mc.to); 5170 VM_BUG_ON(mc.precharge); 5171 VM_BUG_ON(mc.moved_charge); 5172 VM_BUG_ON(mc.moved_swap); 5173 5174 spin_lock(&mc.lock); 5175 mc.from = from; 5176 mc.to = memcg; 5177 mc.immigrate_flags = move_charge_at_immigrate; 5178 spin_unlock(&mc.lock); 5179 /* We set mc.moving_task later */ 5180 5181 ret = mem_cgroup_precharge_mc(mm); 5182 if (ret) 5183 mem_cgroup_clear_mc(); 5184 } 5185 mmput(mm); 5186 } 5187 return ret; 5188 } 5189 5190 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5191 struct cgroup_taskset *tset) 5192 { 5193 if (mc.to) 5194 mem_cgroup_clear_mc(); 5195 } 5196 5197 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5198 unsigned long addr, unsigned long end, 5199 struct mm_walk *walk) 5200 { 5201 int ret = 0; 5202 struct vm_area_struct *vma = walk->private; 5203 pte_t *pte; 5204 spinlock_t *ptl; 5205 enum mc_target_type target_type; 5206 union mc_target target; 5207 struct page *page; 5208 5209 /* 5210 * We don't take compound_lock() here but no race with splitting thp 5211 * happens because: 5212 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5213 * under splitting, which means there's no concurrent thp split, 5214 * - if another thread runs into split_huge_page() just after we 5215 * entered this if-block, the thread must wait for page table lock 5216 * to be unlocked in __split_huge_page_splitting(), where the main 5217 * part of thp split is not executed yet. 
5218 */ 5219 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5220 if (mc.precharge < HPAGE_PMD_NR) { 5221 spin_unlock(ptl); 5222 return 0; 5223 } 5224 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5225 if (target_type == MC_TARGET_PAGE) { 5226 page = target.page; 5227 if (!isolate_lru_page(page)) { 5228 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5229 mc.from, mc.to)) { 5230 mc.precharge -= HPAGE_PMD_NR; 5231 mc.moved_charge += HPAGE_PMD_NR; 5232 } 5233 putback_lru_page(page); 5234 } 5235 put_page(page); 5236 } 5237 spin_unlock(ptl); 5238 return 0; 5239 } 5240 5241 if (pmd_trans_unstable(pmd)) 5242 return 0; 5243 retry: 5244 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5245 for (; addr != end; addr += PAGE_SIZE) { 5246 pte_t ptent = *(pte++); 5247 swp_entry_t ent; 5248 5249 if (!mc.precharge) 5250 break; 5251 5252 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5253 case MC_TARGET_PAGE: 5254 page = target.page; 5255 if (isolate_lru_page(page)) 5256 goto put; 5257 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 5258 mc.precharge--; 5259 /* we uncharge from mc.from later. */ 5260 mc.moved_charge++; 5261 } 5262 putback_lru_page(page); 5263 put: /* get_mctgt_type() gets the page */ 5264 put_page(page); 5265 break; 5266 case MC_TARGET_SWAP: 5267 ent = target.ent; 5268 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5269 mc.precharge--; 5270 /* we fixup refcnts and charges later. */ 5271 mc.moved_swap++; 5272 } 5273 break; 5274 default: 5275 break; 5276 } 5277 } 5278 pte_unmap_unlock(pte - 1, ptl); 5279 cond_resched(); 5280 5281 if (addr != end) { 5282 /* 5283 * We have consumed all precharges we got in can_attach(). 5284 * We try charge one by one, but don't do any additional 5285 * charges to mc.to if we have failed in charge once in attach() 5286 * phase. 5287 */ 5288 ret = mem_cgroup_do_precharge(1); 5289 if (!ret) 5290 goto retry; 5291 } 5292 5293 return ret; 5294 } 5295 5296 static void mem_cgroup_move_charge(struct mm_struct *mm) 5297 { 5298 struct vm_area_struct *vma; 5299 5300 lru_add_drain_all(); 5301 /* 5302 * Signal mem_cgroup_begin_page_stat() to take the memcg's 5303 * move_lock while we're moving its pages to another memcg. 5304 * Then wait for already started RCU-only updates to finish. 5305 */ 5306 atomic_inc(&mc.from->moving_account); 5307 synchronize_rcu(); 5308 retry: 5309 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5310 /* 5311 * Someone who are holding the mmap_sem might be waiting in 5312 * waitq. So we cancel all extra charges, wake up all waiters, 5313 * and retry. Because we cancel precharges, we might not be able 5314 * to move enough charges, but moving charge is a best-effort 5315 * feature anyway, so it wouldn't be a big problem. 5316 */ 5317 __mem_cgroup_clear_mc(); 5318 cond_resched(); 5319 goto retry; 5320 } 5321 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5322 int ret; 5323 struct mm_walk mem_cgroup_move_charge_walk = { 5324 .pmd_entry = mem_cgroup_move_charge_pte_range, 5325 .mm = mm, 5326 .private = vma, 5327 }; 5328 if (is_vm_hugetlb_page(vma)) 5329 continue; 5330 ret = walk_page_range(vma->vm_start, vma->vm_end, 5331 &mem_cgroup_move_charge_walk); 5332 if (ret) 5333 /* 5334 * means we have consumed all precharges and failed in 5335 * doing additional charge. Just abandon here. 
5336 */ 5337 break; 5338 } 5339 up_read(&mm->mmap_sem); 5340 atomic_dec(&mc.from->moving_account); 5341 } 5342 5343 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5344 struct cgroup_taskset *tset) 5345 { 5346 struct task_struct *p = cgroup_taskset_first(tset); 5347 struct mm_struct *mm = get_task_mm(p); 5348 5349 if (mm) { 5350 if (mc.to) 5351 mem_cgroup_move_charge(mm); 5352 mmput(mm); 5353 } 5354 if (mc.to) 5355 mem_cgroup_clear_mc(); 5356 } 5357 #else /* !CONFIG_MMU */ 5358 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5359 struct cgroup_taskset *tset) 5360 { 5361 return 0; 5362 } 5363 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5364 struct cgroup_taskset *tset) 5365 { 5366 } 5367 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5368 struct cgroup_taskset *tset) 5369 { 5370 } 5371 #endif 5372 5373 /* 5374 * Cgroup retains root cgroups across [un]mount cycles making it necessary 5375 * to verify whether we're attached to the default hierarchy on each mount 5376 * attempt. 5377 */ 5378 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5379 { 5380 /* 5381 * use_hierarchy is forced on the default hierarchy. cgroup core 5382 * guarantees that @root doesn't have any children, so turning it 5383 * on for the root memcg is enough. 5384 */ 5385 if (cgroup_on_dfl(root_css->cgroup)) 5386 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5387 } 5388 5389 struct cgroup_subsys memory_cgrp_subsys = { 5390 .css_alloc = mem_cgroup_css_alloc, 5391 .css_online = mem_cgroup_css_online, 5392 .css_offline = mem_cgroup_css_offline, 5393 .css_free = mem_cgroup_css_free, 5394 .css_reset = mem_cgroup_css_reset, 5395 .can_attach = mem_cgroup_can_attach, 5396 .cancel_attach = mem_cgroup_cancel_attach, 5397 .attach = mem_cgroup_move_task, 5398 .bind = mem_cgroup_bind, 5399 .legacy_cftypes = mem_cgroup_files, 5400 .early_init = 0, 5401 }; 5402 5403 #ifdef CONFIG_MEMCG_SWAP 5404 static int __init enable_swap_account(char *s) 5405 { 5406 if (!strcmp(s, "1")) 5407 really_do_swap_account = 1; 5408 else if (!strcmp(s, "0")) 5409 really_do_swap_account = 0; 5410 return 1; 5411 } 5412 __setup("swapaccount=", enable_swap_account); 5413 5414 static void __init memsw_file_init(void) 5415 { 5416 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 5417 memsw_cgroup_files)); 5418 } 5419 5420 static void __init enable_swap_cgroup(void) 5421 { 5422 if (!mem_cgroup_disabled() && really_do_swap_account) { 5423 do_swap_account = 1; 5424 memsw_file_init(); 5425 } 5426 } 5427 5428 #else 5429 static void __init enable_swap_cgroup(void) 5430 { 5431 } 5432 #endif 5433 5434 #ifdef CONFIG_MEMCG_SWAP 5435 /** 5436 * mem_cgroup_swapout - transfer a memsw charge to swap 5437 * @page: page whose memsw charge to transfer 5438 * @entry: swap entry to move the charge to 5439 * 5440 * Transfer the memsw charge of @page to @entry. 
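 *
 * As the checks below document, the page is expected to be off the LRU
 * with no remaining references, and the caller holds the IRQ-safe
 * mapping->tree_lock, so interrupts are disabled here.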
5441 */ 5442 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5443 { 5444 struct mem_cgroup *memcg; 5445 unsigned short oldid; 5446 5447 VM_BUG_ON_PAGE(PageLRU(page), page); 5448 VM_BUG_ON_PAGE(page_count(page), page); 5449 5450 if (!do_swap_account) 5451 return; 5452 5453 memcg = page->mem_cgroup; 5454 5455 /* Readahead page, never charged */ 5456 if (!memcg) 5457 return; 5458 5459 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5460 VM_BUG_ON_PAGE(oldid, page); 5461 mem_cgroup_swap_statistics(memcg, true); 5462 5463 page->mem_cgroup = NULL; 5464 5465 if (!mem_cgroup_is_root(memcg)) 5466 page_counter_uncharge(&memcg->memory, 1); 5467 5468 /* XXX: caller holds IRQ-safe mapping->tree_lock */ 5469 VM_BUG_ON(!irqs_disabled()); 5470 5471 mem_cgroup_charge_statistics(memcg, page, -1); 5472 memcg_check_events(memcg, page); 5473 } 5474 5475 /** 5476 * mem_cgroup_uncharge_swap - uncharge a swap entry 5477 * @entry: swap entry to uncharge 5478 * 5479 * Drop the memsw charge associated with @entry. 5480 */ 5481 void mem_cgroup_uncharge_swap(swp_entry_t entry) 5482 { 5483 struct mem_cgroup *memcg; 5484 unsigned short id; 5485 5486 if (!do_swap_account) 5487 return; 5488 5489 id = swap_cgroup_record(entry, 0); 5490 rcu_read_lock(); 5491 memcg = mem_cgroup_lookup(id); 5492 if (memcg) { 5493 if (!mem_cgroup_is_root(memcg)) 5494 page_counter_uncharge(&memcg->memsw, 1); 5495 mem_cgroup_swap_statistics(memcg, false); 5496 css_put(&memcg->css); 5497 } 5498 rcu_read_unlock(); 5499 } 5500 #endif 5501 5502 /** 5503 * mem_cgroup_try_charge - try charging a page 5504 * @page: page to charge 5505 * @mm: mm context of the victim 5506 * @gfp_mask: reclaim mode 5507 * @memcgp: charged memcg return 5508 * 5509 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5510 * pages according to @gfp_mask if necessary. 5511 * 5512 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5513 * Otherwise, an error code is returned. 5514 * 5515 * After page->mapping has been set up, the caller must finalize the 5516 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5517 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5518 */ 5519 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5520 gfp_t gfp_mask, struct mem_cgroup **memcgp) 5521 { 5522 struct mem_cgroup *memcg = NULL; 5523 unsigned int nr_pages = 1; 5524 int ret = 0; 5525 5526 if (mem_cgroup_disabled()) 5527 goto out; 5528 5529 if (PageSwapCache(page)) { 5530 /* 5531 * Every swap fault against a single page tries to charge the 5532 * page, bail as early as possible. shmem_unuse() encounters 5533 * already charged pages, too. The USED bit is protected by 5534 * the page lock, which serializes swap cache removal, which 5535 * in turn serializes uncharging. 
5536 */ 5537 if (page->mem_cgroup) 5538 goto out; 5539 } 5540 5541 if (PageTransHuge(page)) { 5542 nr_pages <<= compound_order(page); 5543 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5544 } 5545 5546 if (do_swap_account && PageSwapCache(page)) 5547 memcg = try_get_mem_cgroup_from_page(page); 5548 if (!memcg) 5549 memcg = get_mem_cgroup_from_mm(mm); 5550 5551 ret = try_charge(memcg, gfp_mask, nr_pages); 5552 5553 css_put(&memcg->css); 5554 5555 if (ret == -EINTR) { 5556 memcg = root_mem_cgroup; 5557 ret = 0; 5558 } 5559 out: 5560 *memcgp = memcg; 5561 return ret; 5562 } 5563 5564 /** 5565 * mem_cgroup_commit_charge - commit a page charge 5566 * @page: page to charge 5567 * @memcg: memcg to charge the page to 5568 * @lrucare: page might be on LRU already 5569 * 5570 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5571 * after page->mapping has been set up. This must happen atomically 5572 * as part of the page instantiation, i.e. under the page table lock 5573 * for anonymous pages, under the page lock for page and swap cache. 5574 * 5575 * In addition, the page must not be on the LRU during the commit, to 5576 * prevent racing with task migration. If it might be, use @lrucare. 5577 * 5578 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5579 */ 5580 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5581 bool lrucare) 5582 { 5583 unsigned int nr_pages = 1; 5584 5585 VM_BUG_ON_PAGE(!page->mapping, page); 5586 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5587 5588 if (mem_cgroup_disabled()) 5589 return; 5590 /* 5591 * Swap faults will attempt to charge the same page multiple 5592 * times. But reuse_swap_page() might have removed the page 5593 * from swapcache already, so we can't check PageSwapCache(). 5594 */ 5595 if (!memcg) 5596 return; 5597 5598 commit_charge(page, memcg, lrucare); 5599 5600 if (PageTransHuge(page)) { 5601 nr_pages <<= compound_order(page); 5602 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5603 } 5604 5605 local_irq_disable(); 5606 mem_cgroup_charge_statistics(memcg, page, nr_pages); 5607 memcg_check_events(memcg, page); 5608 local_irq_enable(); 5609 5610 if (do_swap_account && PageSwapCache(page)) { 5611 swp_entry_t entry = { .val = page_private(page) }; 5612 /* 5613 * The swap entry might not get freed for a long time, 5614 * let's not wait for it. The page already received a 5615 * memory+swap charge, drop the swap entry duplicate. 5616 */ 5617 mem_cgroup_uncharge_swap(entry); 5618 } 5619 } 5620 5621 /** 5622 * mem_cgroup_cancel_charge - cancel a page charge 5623 * @page: page to charge 5624 * @memcg: memcg to charge the page to 5625 * 5626 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5627 */ 5628 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 5629 { 5630 unsigned int nr_pages = 1; 5631 5632 if (mem_cgroup_disabled()) 5633 return; 5634 /* 5635 * Swap faults will attempt to charge the same page multiple 5636 * times. But reuse_swap_page() might have removed the page 5637 * from swapcache already, so we can't check PageSwapCache(). 
5638 */ 5639 if (!memcg) 5640 return; 5641 5642 if (PageTransHuge(page)) { 5643 nr_pages <<= compound_order(page); 5644 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5645 } 5646 5647 cancel_charge(memcg, nr_pages); 5648 } 5649 5650 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5651 unsigned long nr_anon, unsigned long nr_file, 5652 unsigned long nr_huge, struct page *dummy_page) 5653 { 5654 unsigned long nr_pages = nr_anon + nr_file; 5655 unsigned long flags; 5656 5657 if (!mem_cgroup_is_root(memcg)) { 5658 page_counter_uncharge(&memcg->memory, nr_pages); 5659 if (do_swap_account) 5660 page_counter_uncharge(&memcg->memsw, nr_pages); 5661 memcg_oom_recover(memcg); 5662 } 5663 5664 local_irq_save(flags); 5665 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 5666 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5667 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5668 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5669 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 5670 memcg_check_events(memcg, dummy_page); 5671 local_irq_restore(flags); 5672 5673 if (!mem_cgroup_is_root(memcg)) 5674 css_put_many(&memcg->css, nr_pages); 5675 } 5676 5677 static void uncharge_list(struct list_head *page_list) 5678 { 5679 struct mem_cgroup *memcg = NULL; 5680 unsigned long nr_anon = 0; 5681 unsigned long nr_file = 0; 5682 unsigned long nr_huge = 0; 5683 unsigned long pgpgout = 0; 5684 struct list_head *next; 5685 struct page *page; 5686 5687 next = page_list->next; 5688 do { 5689 unsigned int nr_pages = 1; 5690 5691 page = list_entry(next, struct page, lru); 5692 next = page->lru.next; 5693 5694 VM_BUG_ON_PAGE(PageLRU(page), page); 5695 VM_BUG_ON_PAGE(page_count(page), page); 5696 5697 if (!page->mem_cgroup) 5698 continue; 5699 5700 /* 5701 * Nobody should be changing or seriously looking at 5702 * page->mem_cgroup at this point, we have fully 5703 * exclusive access to the page. 5704 */ 5705 5706 if (memcg != page->mem_cgroup) { 5707 if (memcg) { 5708 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5709 nr_huge, page); 5710 pgpgout = nr_anon = nr_file = nr_huge = 0; 5711 } 5712 memcg = page->mem_cgroup; 5713 } 5714 5715 if (PageTransHuge(page)) { 5716 nr_pages <<= compound_order(page); 5717 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 5718 nr_huge += nr_pages; 5719 } 5720 5721 if (PageAnon(page)) 5722 nr_anon += nr_pages; 5723 else 5724 nr_file += nr_pages; 5725 5726 page->mem_cgroup = NULL; 5727 5728 pgpgout++; 5729 } while (next != page_list); 5730 5731 if (memcg) 5732 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5733 nr_huge, page); 5734 } 5735 5736 /** 5737 * mem_cgroup_uncharge - uncharge a page 5738 * @page: page to uncharge 5739 * 5740 * Uncharge a page previously charged with mem_cgroup_try_charge() and 5741 * mem_cgroup_commit_charge(). 5742 */ 5743 void mem_cgroup_uncharge(struct page *page) 5744 { 5745 if (mem_cgroup_disabled()) 5746 return; 5747 5748 /* Don't touch page->lru of any random page, pre-check: */ 5749 if (!page->mem_cgroup) 5750 return; 5751 5752 INIT_LIST_HEAD(&page->lru); 5753 uncharge_list(&page->lru); 5754 } 5755 5756 /** 5757 * mem_cgroup_uncharge_list - uncharge a list of page 5758 * @page_list: list of pages to uncharge 5759 * 5760 * Uncharge a list of pages previously charged with 5761 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
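 *
 * The pages are linked through page->lru and must no longer be referenced
 * (see the page_count() check in uncharge_list()).  Minimal sketch of a
 * hypothetical caller, assuming the pages have already been isolated:
 *
 *	LIST_HEAD(pages_to_free);
 *	...
 *	list_add(&page->lru, &pages_to_free);
 *	...
 *	mem_cgroup_uncharge_list(&pages_to_free);
 *	free_hot_cold_page_list(&pages_to_free, true);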
5762 */ 5763 void mem_cgroup_uncharge_list(struct list_head *page_list) 5764 { 5765 if (mem_cgroup_disabled()) 5766 return; 5767 5768 if (!list_empty(page_list)) 5769 uncharge_list(page_list); 5770 } 5771 5772 /** 5773 * mem_cgroup_migrate - migrate a charge to another page 5774 * @oldpage: currently charged page 5775 * @newpage: page to transfer the charge to 5776 * @lrucare: both pages might be on the LRU already 5777 * 5778 * Migrate the charge from @oldpage to @newpage. 5779 * 5780 * Both pages must be locked, @newpage->mapping must be set up. 5781 */ 5782 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5783 bool lrucare) 5784 { 5785 struct mem_cgroup *memcg; 5786 int isolated; 5787 5788 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5789 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 5790 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 5791 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 5792 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 5793 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 5794 newpage); 5795 5796 if (mem_cgroup_disabled()) 5797 return; 5798 5799 /* Page cache replacement: new page already charged? */ 5800 if (newpage->mem_cgroup) 5801 return; 5802 5803 /* 5804 * Swapcache readahead pages can get migrated before being 5805 * charged, and migration from compaction can happen to an 5806 * uncharged page when the PFN walker finds a page that 5807 * reclaim just put back on the LRU but has not released yet. 5808 */ 5809 memcg = oldpage->mem_cgroup; 5810 if (!memcg) 5811 return; 5812 5813 if (lrucare) 5814 lock_page_lru(oldpage, &isolated); 5815 5816 oldpage->mem_cgroup = NULL; 5817 5818 if (lrucare) 5819 unlock_page_lru(oldpage, isolated); 5820 5821 commit_charge(newpage, memcg, lrucare); 5822 } 5823 5824 /* 5825 * subsys_initcall() for memory controller. 5826 * 5827 * Some parts like hotcpu_notifier() have to be initialized from this context 5828 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 5829 * everything that doesn't depend on a specific mem_cgroup structure should 5830 * be initialized from here. 5831 */ 5832 static int __init mem_cgroup_init(void) 5833 { 5834 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5835 enable_swap_cgroup(); 5836 mem_cgroup_soft_limit_tree_init(); 5837 memcg_stock_init(); 5838 return 0; 5839 } 5840 subsys_initcall(mem_cgroup_init); 5841
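
/*
 * Illustrative userspace sketch (not part of this file): consuming the
 * legacy memory threshold notifications implemented above.  The cgroup
 * mount point, group name and 8M threshold are assumptions, and error
 * handling is omitted for brevity.  The string written to
 * cgroup.event_control is "<event_fd> <control_fd> <args>", where <args>
 * is here the threshold in bytes.
 *
 *	#include <sys/eventfd.h>
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char *grp = "/sys/fs/cgroup/memory/mygroup";
 *		char path[256], cmd[64];
 *		uint64_t ticks;
 *		int efd, ufd, cfd, len;
 *
 *		efd = eventfd(0, 0);
 *		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
 *		ufd = open(path, O_RDONLY);
 *		snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
 *		cfd = open(path, O_WRONLY);
 *
 *		len = snprintf(cmd, sizeof(cmd), "%d %d 8388608", efd, ufd);
 *		write(cfd, cmd, len);
 *
 *		read(efd, &ticks, sizeof(ticks));
 *		printf("threshold crossed %llu time(s)\n",
 *		       (unsigned long long)ticks);
 *		return 0;
 *	}
 */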