1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * Memory thresholds 10 * Copyright (C) 2009 Nokia Corporation 11 * Author: Kirill A. Shutemov 12 * 13 * Kernel Memory Controller 14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 15 * Authors: Glauber Costa and Suleiman Souhlal 16 * 17 * Native page reclaim 18 * Charge lifetime sanitation 19 * Lockless page tracking & accounting 20 * Unified hierarchy configuration model 21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 22 * 23 * This program is free software; you can redistribute it and/or modify 24 * it under the terms of the GNU General Public License as published by 25 * the Free Software Foundation; either version 2 of the License, or 26 * (at your option) any later version. 27 * 28 * This program is distributed in the hope that it will be useful, 29 * but WITHOUT ANY WARRANTY; without even the implied warranty of 30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 * GNU General Public License for more details. 32 */ 33 34 #include <linux/page_counter.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cgroup.h> 37 #include <linux/mm.h> 38 #include <linux/sched/mm.h> 39 #include <linux/shmem_fs.h> 40 #include <linux/hugetlb.h> 41 #include <linux/pagemap.h> 42 #include <linux/smp.h> 43 #include <linux/page-flags.h> 44 #include <linux/backing-dev.h> 45 #include <linux/bit_spinlock.h> 46 #include <linux/rcupdate.h> 47 #include <linux/limits.h> 48 #include <linux/export.h> 49 #include <linux/mutex.h> 50 #include <linux/rbtree.h> 51 #include <linux/slab.h> 52 #include <linux/swap.h> 53 #include <linux/swapops.h> 54 #include <linux/spinlock.h> 55 #include <linux/eventfd.h> 56 #include <linux/poll.h> 57 #include <linux/sort.h> 58 #include <linux/fs.h> 59 #include <linux/seq_file.h> 60 #include <linux/vmpressure.h> 61 #include <linux/mm_inline.h> 62 #include <linux/swap_cgroup.h> 63 #include <linux/cpu.h> 64 #include <linux/oom.h> 65 #include <linux/lockdep.h> 66 #include <linux/file.h> 67 #include <linux/tracehook.h> 68 #include "internal.h" 69 #include <net/sock.h> 70 #include <net/ip.h> 71 #include "slab.h" 72 73 #include <linux/uaccess.h> 74 75 #include <trace/events/vmscan.h> 76 77 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 78 EXPORT_SYMBOL(memory_cgrp_subsys); 79 80 struct mem_cgroup *root_mem_cgroup __read_mostly; 81 82 #define MEM_CGROUP_RECLAIM_RETRIES 5 83 84 /* Socket memory accounting disabled? */ 85 static bool cgroup_memory_nosocket; 86 87 /* Kernel memory accounting disabled? 
*/ 88 static bool cgroup_memory_nokmem; 89 90 /* Whether the swap controller is active */ 91 #ifdef CONFIG_MEMCG_SWAP 92 int do_swap_account __read_mostly; 93 #else 94 #define do_swap_account 0 95 #endif 96 97 /* Whether legacy memory+swap accounting is active */ 98 static bool do_memsw_account(void) 99 { 100 return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; 101 } 102 103 static const char *const mem_cgroup_lru_names[] = { 104 "inactive_anon", 105 "active_anon", 106 "inactive_file", 107 "active_file", 108 "unevictable", 109 }; 110 111 #define THRESHOLDS_EVENTS_TARGET 128 112 #define SOFTLIMIT_EVENTS_TARGET 1024 113 #define NUMAINFO_EVENTS_TARGET 1024 114 115 /* 116 * Cgroups above their limits are maintained in a RB-Tree, independent of 117 * their hierarchy representation 118 */ 119 120 struct mem_cgroup_tree_per_node { 121 struct rb_root rb_root; 122 struct rb_node *rb_rightmost; 123 spinlock_t lock; 124 }; 125 126 struct mem_cgroup_tree { 127 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 128 }; 129 130 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 131 132 /* for OOM */ 133 struct mem_cgroup_eventfd_list { 134 struct list_head list; 135 struct eventfd_ctx *eventfd; 136 }; 137 138 /* 139 * cgroup_event represents events which userspace want to receive. 140 */ 141 struct mem_cgroup_event { 142 /* 143 * memcg which the event belongs to. 144 */ 145 struct mem_cgroup *memcg; 146 /* 147 * eventfd to signal userspace about the event. 148 */ 149 struct eventfd_ctx *eventfd; 150 /* 151 * Each of these stored in a list by the cgroup. 152 */ 153 struct list_head list; 154 /* 155 * register_event() callback will be used to add new userspace 156 * waiter for changes related to this event. Use eventfd_signal() 157 * on eventfd to send notification to userspace. 158 */ 159 int (*register_event)(struct mem_cgroup *memcg, 160 struct eventfd_ctx *eventfd, const char *args); 161 /* 162 * unregister_event() callback will be called when userspace closes 163 * the eventfd or on cgroup removing. This callback must be set, 164 * if you want provide notification functionality. 165 */ 166 void (*unregister_event)(struct mem_cgroup *memcg, 167 struct eventfd_ctx *eventfd); 168 /* 169 * All fields below needed to unregister event when 170 * userspace closes eventfd. 171 */ 172 poll_table pt; 173 wait_queue_head_t *wqh; 174 wait_queue_entry_t wait; 175 struct work_struct remove; 176 }; 177 178 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 179 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 180 181 /* Stuffs for move charges at task migration. */ 182 /* 183 * Types of charges to be moved. 184 */ 185 #define MOVE_ANON 0x1U 186 #define MOVE_FILE 0x2U 187 #define MOVE_MASK (MOVE_ANON | MOVE_FILE) 188 189 /* "mc" and its members are protected by cgroup_mutex */ 190 static struct move_charge_struct { 191 spinlock_t lock; /* for from, to */ 192 struct mm_struct *mm; 193 struct mem_cgroup *from; 194 struct mem_cgroup *to; 195 unsigned long flags; 196 unsigned long precharge; 197 unsigned long moved_charge; 198 unsigned long moved_swap; 199 struct task_struct *moving_task; /* a task moving charges */ 200 wait_queue_head_t waitq; /* a waitq for other context */ 201 } mc = { 202 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 203 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 204 }; 205 206 /* 207 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 208 * limit reclaim to prevent infinite loops, if they ever occur. 
209 */ 210 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 211 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 212 213 enum charge_type { 214 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 215 MEM_CGROUP_CHARGE_TYPE_ANON, 216 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 217 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 218 NR_CHARGE_TYPE, 219 }; 220 221 /* for encoding cft->private value on file */ 222 enum res_type { 223 _MEM, 224 _MEMSWAP, 225 _OOM_TYPE, 226 _KMEM, 227 _TCP, 228 }; 229 230 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 231 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 232 #define MEMFILE_ATTR(val) ((val) & 0xffff) 233 /* Used for OOM nofiier */ 234 #define OOM_CONTROL (0) 235 236 /* 237 * Iteration constructs for visiting all cgroups (under a tree). If 238 * loops are exited prematurely (break), mem_cgroup_iter_break() must 239 * be used for reference counting. 240 */ 241 #define for_each_mem_cgroup_tree(iter, root) \ 242 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 243 iter != NULL; \ 244 iter = mem_cgroup_iter(root, iter, NULL)) 245 246 #define for_each_mem_cgroup(iter) \ 247 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 248 iter != NULL; \ 249 iter = mem_cgroup_iter(NULL, iter, NULL)) 250 251 static inline bool should_force_charge(void) 252 { 253 return tsk_is_oom_victim(current) || fatal_signal_pending(current) || 254 (current->flags & PF_EXITING); 255 } 256 257 /* Some nice accessors for the vmpressure. */ 258 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 259 { 260 if (!memcg) 261 memcg = root_mem_cgroup; 262 return &memcg->vmpressure; 263 } 264 265 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 266 { 267 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 268 } 269 270 #ifdef CONFIG_MEMCG_KMEM 271 /* 272 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 273 * The main reason for not using cgroup id for this: 274 * this works better in sparse environments, where we have a lot of memcgs, 275 * but only a few kmem-limited. Or also, if we have, for instance, 200 276 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 277 * 200 entry array for that. 278 * 279 * The current size of the caches array is stored in memcg_nr_cache_ids. It 280 * will double each time we have to increase it. 281 */ 282 static DEFINE_IDA(memcg_cache_ida); 283 int memcg_nr_cache_ids; 284 285 /* Protects memcg_nr_cache_ids */ 286 static DECLARE_RWSEM(memcg_cache_ids_sem); 287 288 void memcg_get_cache_ids(void) 289 { 290 down_read(&memcg_cache_ids_sem); 291 } 292 293 void memcg_put_cache_ids(void) 294 { 295 up_read(&memcg_cache_ids_sem); 296 } 297 298 /* 299 * MIN_SIZE is different than 1, because we would like to avoid going through 300 * the alloc/free process all the time. In a small machine, 4 kmem-limited 301 * cgroups is a reasonable guess. In the future, it could be a parameter or 302 * tunable, but that is strictly not necessary. 303 * 304 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 305 * this constant directly from cgroup, but it is understandable that this is 306 * better kept as an internal representation in cgroup.c. In any case, the 307 * cgrp_id space is not getting any smaller, and we don't have to necessarily 308 * increase ours as well if it increases. 
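 *
 * As a rough illustration of the sizing policy described above: starting
 * from MEMCG_CACHES_MIN_SIZE the array grows by doubling, so it steps
 * through 4, 8, 16, ... entries as more kmem-limited cgroups show up,
 * and it is never grown past MEMCG_CACHES_MAX_SIZE.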
309 */ 310 #define MEMCG_CACHES_MIN_SIZE 4 311 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 312 313 /* 314 * A lot of the calls to the cache allocation functions are expected to be 315 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 316 * conditional to this static branch, we'll have to allow modules that does 317 * kmem_cache_alloc and the such to see this symbol as well 318 */ 319 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); 320 EXPORT_SYMBOL(memcg_kmem_enabled_key); 321 322 struct workqueue_struct *memcg_kmem_cache_wq; 323 324 static int memcg_shrinker_map_size; 325 static DEFINE_MUTEX(memcg_shrinker_map_mutex); 326 327 static void memcg_free_shrinker_map_rcu(struct rcu_head *head) 328 { 329 kvfree(container_of(head, struct memcg_shrinker_map, rcu)); 330 } 331 332 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, 333 int size, int old_size) 334 { 335 struct memcg_shrinker_map *new, *old; 336 int nid; 337 338 lockdep_assert_held(&memcg_shrinker_map_mutex); 339 340 for_each_node(nid) { 341 old = rcu_dereference_protected( 342 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); 343 /* Not yet online memcg */ 344 if (!old) 345 return 0; 346 347 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); 348 if (!new) 349 return -ENOMEM; 350 351 /* Set all old bits, clear all new bits */ 352 memset(new->map, (int)0xff, old_size); 353 memset((void *)new->map + old_size, 0, size - old_size); 354 355 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); 356 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); 357 } 358 359 return 0; 360 } 361 362 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) 363 { 364 struct mem_cgroup_per_node *pn; 365 struct memcg_shrinker_map *map; 366 int nid; 367 368 if (mem_cgroup_is_root(memcg)) 369 return; 370 371 for_each_node(nid) { 372 pn = mem_cgroup_nodeinfo(memcg, nid); 373 map = rcu_dereference_protected(pn->shrinker_map, true); 374 if (map) 375 kvfree(map); 376 rcu_assign_pointer(pn->shrinker_map, NULL); 377 } 378 } 379 380 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 381 { 382 struct memcg_shrinker_map *map; 383 int nid, size, ret = 0; 384 385 if (mem_cgroup_is_root(memcg)) 386 return 0; 387 388 mutex_lock(&memcg_shrinker_map_mutex); 389 size = memcg_shrinker_map_size; 390 for_each_node(nid) { 391 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); 392 if (!map) { 393 memcg_free_shrinker_maps(memcg); 394 ret = -ENOMEM; 395 break; 396 } 397 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); 398 } 399 mutex_unlock(&memcg_shrinker_map_mutex); 400 401 return ret; 402 } 403 404 int memcg_expand_shrinker_maps(int new_id) 405 { 406 int size, old_size, ret = 0; 407 struct mem_cgroup *memcg; 408 409 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); 410 old_size = memcg_shrinker_map_size; 411 if (size <= old_size) 412 return 0; 413 414 mutex_lock(&memcg_shrinker_map_mutex); 415 if (!root_mem_cgroup) 416 goto unlock; 417 418 for_each_mem_cgroup(memcg) { 419 if (mem_cgroup_is_root(memcg)) 420 continue; 421 ret = memcg_expand_one_shrinker_map(memcg, size, old_size); 422 if (ret) 423 goto unlock; 424 } 425 unlock: 426 if (!ret) 427 memcg_shrinker_map_size = size; 428 mutex_unlock(&memcg_shrinker_map_mutex); 429 return ret; 430 } 431 432 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 433 { 434 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 435 struct memcg_shrinker_map *map; 436 437 rcu_read_lock(); 438 map = 
rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); 439 /* Pairs with smp mb in shrink_slab() */ 440 smp_mb__before_atomic(); 441 set_bit(shrinker_id, map->map); 442 rcu_read_unlock(); 443 } 444 } 445 446 #else /* CONFIG_MEMCG_KMEM */ 447 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 448 { 449 return 0; 450 } 451 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } 452 #endif /* CONFIG_MEMCG_KMEM */ 453 454 /** 455 * mem_cgroup_css_from_page - css of the memcg associated with a page 456 * @page: page of interest 457 * 458 * If memcg is bound to the default hierarchy, css of the memcg associated 459 * with @page is returned. The returned css remains associated with @page 460 * until it is released. 461 * 462 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 463 * is returned. 464 */ 465 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 466 { 467 struct mem_cgroup *memcg; 468 469 memcg = page->mem_cgroup; 470 471 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 472 memcg = root_mem_cgroup; 473 474 return &memcg->css; 475 } 476 477 /** 478 * page_cgroup_ino - return inode number of the memcg a page is charged to 479 * @page: the page 480 * 481 * Look up the closest online ancestor of the memory cgroup @page is charged to 482 * and return its inode number or 0 if @page is not charged to any cgroup. It 483 * is safe to call this function without holding a reference to @page. 484 * 485 * Note, this function is inherently racy, because there is nothing to prevent 486 * the cgroup inode from getting torn down and potentially reallocated a moment 487 * after page_cgroup_ino() returns, so it only should be used by callers that 488 * do not care (such as procfs interfaces). 
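 *
 * A minimal usage sketch, assuming a caller that only needs a best-effort
 * hint for reporting (the seq_file *m below is a hypothetical output sink)
 * and that tolerates the race described above:
 *
 *	ino_t ino = page_cgroup_ino(page);
 *
 *	if (ino)
 *		seq_printf(m, "memcg inode %lu\n", (unsigned long)ino);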
489 */ 490 ino_t page_cgroup_ino(struct page *page) 491 { 492 struct mem_cgroup *memcg; 493 unsigned long ino = 0; 494 495 rcu_read_lock(); 496 memcg = READ_ONCE(page->mem_cgroup); 497 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 498 memcg = parent_mem_cgroup(memcg); 499 if (memcg) 500 ino = cgroup_ino(memcg->css.cgroup); 501 rcu_read_unlock(); 502 return ino; 503 } 504 505 static struct mem_cgroup_per_node * 506 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) 507 { 508 int nid = page_to_nid(page); 509 510 return memcg->nodeinfo[nid]; 511 } 512 513 static struct mem_cgroup_tree_per_node * 514 soft_limit_tree_node(int nid) 515 { 516 return soft_limit_tree.rb_tree_per_node[nid]; 517 } 518 519 static struct mem_cgroup_tree_per_node * 520 soft_limit_tree_from_page(struct page *page) 521 { 522 int nid = page_to_nid(page); 523 524 return soft_limit_tree.rb_tree_per_node[nid]; 525 } 526 527 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 528 struct mem_cgroup_tree_per_node *mctz, 529 unsigned long new_usage_in_excess) 530 { 531 struct rb_node **p = &mctz->rb_root.rb_node; 532 struct rb_node *parent = NULL; 533 struct mem_cgroup_per_node *mz_node; 534 bool rightmost = true; 535 536 if (mz->on_tree) 537 return; 538 539 mz->usage_in_excess = new_usage_in_excess; 540 if (!mz->usage_in_excess) 541 return; 542 while (*p) { 543 parent = *p; 544 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 545 tree_node); 546 if (mz->usage_in_excess < mz_node->usage_in_excess) { 547 p = &(*p)->rb_left; 548 rightmost = false; 549 } 550 551 /* 552 * We can't avoid mem cgroups that are over their soft 553 * limit by the same amount 554 */ 555 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 556 p = &(*p)->rb_right; 557 } 558 559 if (rightmost) 560 mctz->rb_rightmost = &mz->tree_node; 561 562 rb_link_node(&mz->tree_node, parent, p); 563 rb_insert_color(&mz->tree_node, &mctz->rb_root); 564 mz->on_tree = true; 565 } 566 567 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 568 struct mem_cgroup_tree_per_node *mctz) 569 { 570 if (!mz->on_tree) 571 return; 572 573 if (&mz->tree_node == mctz->rb_rightmost) 574 mctz->rb_rightmost = rb_prev(&mz->tree_node); 575 576 rb_erase(&mz->tree_node, &mctz->rb_root); 577 mz->on_tree = false; 578 } 579 580 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 581 struct mem_cgroup_tree_per_node *mctz) 582 { 583 unsigned long flags; 584 585 spin_lock_irqsave(&mctz->lock, flags); 586 __mem_cgroup_remove_exceeded(mz, mctz); 587 spin_unlock_irqrestore(&mctz->lock, flags); 588 } 589 590 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 591 { 592 unsigned long nr_pages = page_counter_read(&memcg->memory); 593 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 594 unsigned long excess = 0; 595 596 if (nr_pages > soft_limit) 597 excess = nr_pages - soft_limit; 598 599 return excess; 600 } 601 602 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 603 { 604 unsigned long excess; 605 struct mem_cgroup_per_node *mz; 606 struct mem_cgroup_tree_per_node *mctz; 607 608 mctz = soft_limit_tree_from_page(page); 609 if (!mctz) 610 return; 611 /* 612 * Necessary to update all ancestors when hierarchy is used. 613 * because their event counter is not touched. 
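 *
 * For example, for a page charged to cgroup a/b/c the loop below visits
 * c, then b, then a (via parent_mem_cgroup()), re-inserting each level
 * whose usage exceeds its soft limit into the per-node RB-tree.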
614 */ 615 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 616 mz = mem_cgroup_page_nodeinfo(memcg, page); 617 excess = soft_limit_excess(memcg); 618 /* 619 * We have to update the tree if mz is on RB-tree or 620 * mem is over its softlimit. 621 */ 622 if (excess || mz->on_tree) { 623 unsigned long flags; 624 625 spin_lock_irqsave(&mctz->lock, flags); 626 /* if on-tree, remove it */ 627 if (mz->on_tree) 628 __mem_cgroup_remove_exceeded(mz, mctz); 629 /* 630 * Insert again. mz->usage_in_excess will be updated. 631 * If excess is 0, no tree ops. 632 */ 633 __mem_cgroup_insert_exceeded(mz, mctz, excess); 634 spin_unlock_irqrestore(&mctz->lock, flags); 635 } 636 } 637 } 638 639 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 640 { 641 struct mem_cgroup_tree_per_node *mctz; 642 struct mem_cgroup_per_node *mz; 643 int nid; 644 645 for_each_node(nid) { 646 mz = mem_cgroup_nodeinfo(memcg, nid); 647 mctz = soft_limit_tree_node(nid); 648 if (mctz) 649 mem_cgroup_remove_exceeded(mz, mctz); 650 } 651 } 652 653 static struct mem_cgroup_per_node * 654 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 655 { 656 struct mem_cgroup_per_node *mz; 657 658 retry: 659 mz = NULL; 660 if (!mctz->rb_rightmost) 661 goto done; /* Nothing to reclaim from */ 662 663 mz = rb_entry(mctz->rb_rightmost, 664 struct mem_cgroup_per_node, tree_node); 665 /* 666 * Remove the node now but someone else can add it back, 667 * we will to add it back at the end of reclaim to its correct 668 * position in the tree. 669 */ 670 __mem_cgroup_remove_exceeded(mz, mctz); 671 if (!soft_limit_excess(mz->memcg) || 672 !css_tryget_online(&mz->memcg->css)) 673 goto retry; 674 done: 675 return mz; 676 } 677 678 static struct mem_cgroup_per_node * 679 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 680 { 681 struct mem_cgroup_per_node *mz; 682 683 spin_lock_irq(&mctz->lock); 684 mz = __mem_cgroup_largest_soft_limit_node(mctz); 685 spin_unlock_irq(&mctz->lock); 686 return mz; 687 } 688 689 static unsigned long memcg_sum_events(struct mem_cgroup *memcg, 690 int event) 691 { 692 return atomic_long_read(&memcg->events[event]); 693 } 694 695 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 696 struct page *page, 697 bool compound, int nr_pages) 698 { 699 /* 700 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 701 * counted as CACHE even if it's on ANON LRU. 702 */ 703 if (PageAnon(page)) 704 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); 705 else { 706 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); 707 if (PageSwapBacked(page)) 708 __mod_memcg_state(memcg, NR_SHMEM, nr_pages); 709 } 710 711 if (compound) { 712 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 713 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); 714 } 715 716 /* pagein of a big page is an event. 
So, ignore page size */ 717 if (nr_pages > 0) 718 __count_memcg_events(memcg, PGPGIN, 1); 719 else { 720 __count_memcg_events(memcg, PGPGOUT, 1); 721 nr_pages = -nr_pages; /* for event */ 722 } 723 724 __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); 725 } 726 727 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 728 int nid, unsigned int lru_mask) 729 { 730 struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); 731 unsigned long nr = 0; 732 enum lru_list lru; 733 734 VM_BUG_ON((unsigned)nid >= nr_node_ids); 735 736 for_each_lru(lru) { 737 if (!(BIT(lru) & lru_mask)) 738 continue; 739 nr += mem_cgroup_get_lru_size(lruvec, lru); 740 } 741 return nr; 742 } 743 744 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 745 unsigned int lru_mask) 746 { 747 unsigned long nr = 0; 748 int nid; 749 750 for_each_node_state(nid, N_MEMORY) 751 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 752 return nr; 753 } 754 755 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 756 enum mem_cgroup_events_target target) 757 { 758 unsigned long val, next; 759 760 val = __this_cpu_read(memcg->stat_cpu->nr_page_events); 761 next = __this_cpu_read(memcg->stat_cpu->targets[target]); 762 /* from time_after() in jiffies.h */ 763 if ((long)(next - val) < 0) { 764 switch (target) { 765 case MEM_CGROUP_TARGET_THRESH: 766 next = val + THRESHOLDS_EVENTS_TARGET; 767 break; 768 case MEM_CGROUP_TARGET_SOFTLIMIT: 769 next = val + SOFTLIMIT_EVENTS_TARGET; 770 break; 771 case MEM_CGROUP_TARGET_NUMAINFO: 772 next = val + NUMAINFO_EVENTS_TARGET; 773 break; 774 default: 775 break; 776 } 777 __this_cpu_write(memcg->stat_cpu->targets[target], next); 778 return true; 779 } 780 return false; 781 } 782 783 /* 784 * Check events in order. 785 * 786 */ 787 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 788 { 789 /* threshold event is triggered in finer grain than soft limit */ 790 if (unlikely(mem_cgroup_event_ratelimit(memcg, 791 MEM_CGROUP_TARGET_THRESH))) { 792 bool do_softlimit; 793 bool do_numainfo __maybe_unused; 794 795 do_softlimit = mem_cgroup_event_ratelimit(memcg, 796 MEM_CGROUP_TARGET_SOFTLIMIT); 797 #if MAX_NUMNODES > 1 798 do_numainfo = mem_cgroup_event_ratelimit(memcg, 799 MEM_CGROUP_TARGET_NUMAINFO); 800 #endif 801 mem_cgroup_threshold(memcg); 802 if (unlikely(do_softlimit)) 803 mem_cgroup_update_tree(memcg, page); 804 #if MAX_NUMNODES > 1 805 if (unlikely(do_numainfo)) 806 atomic_inc(&memcg->numainfo_events); 807 #endif 808 } 809 } 810 811 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 812 { 813 /* 814 * mm_update_next_owner() may clear mm->owner to NULL 815 * if it races with swapoff, page migration, etc. 816 * So this can be called with p == NULL. 817 */ 818 if (unlikely(!p)) 819 return NULL; 820 821 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 822 } 823 EXPORT_SYMBOL(mem_cgroup_from_task); 824 825 /** 826 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. 827 * @mm: mm from which memcg should be extracted. It can be NULL. 828 * 829 * Obtain a reference on mm->memcg and returns it if successful. Otherwise 830 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is 831 * returned. 
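 *
 * A typical call pattern (mem_cgroup_handle_over_high() below follows the
 * same shape) pairs the lookup with a css_put() on the returned memcg:
 *
 *	memcg = get_mem_cgroup_from_mm(current->mm);
 *	if (memcg) {
 *		...
 *		css_put(&memcg->css);
 *	}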
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_disabled())
                return NULL;

        rcu_read_lock();
        do {
                /*
                 * Page cache insertions can happen without an
                 * actual mm context, e.g. during disk probing
                 * on boot, loopback IO, acct() writes etc.
                 */
                if (unlikely(!mm))
                        memcg = root_mem_cgroup;
                else {
                        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                        if (unlikely(!memcg))
                                memcg = root_mem_cgroup;
                }
        } while (!css_tryget_online(&memcg->css));
        rcu_read_unlock();
        return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
        struct mem_cgroup *memcg = page->mem_cgroup;

        if (mem_cgroup_disabled())
                return NULL;

        rcu_read_lock();
        if (!memcg || !css_tryget_online(&memcg->css))
                memcg = root_mem_cgroup;
        rcu_read_unlock();
        return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
        if (unlikely(current->active_memcg)) {
                struct mem_cgroup *memcg = root_mem_cgroup;

                rcu_read_lock();
                if (css_tryget_online(&current->active_memcg->css))
                        memcg = current->active_memcg;
                rcu_read_unlock();
                return memcg;
        }
        return get_mem_cgroup_from_mm(current->mm);
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
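 *
 * A full hierarchy walk, as wrapped by for_each_mem_cgroup_tree() above,
 * looks roughly like this (some_stop_condition stands for whatever the
 * caller checks); note the explicit break-out helper:
 *
 *	struct mem_cgroup *iter = mem_cgroup_iter(root, NULL, NULL);
 *
 *	while (iter) {
 *		if (some_stop_condition) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *		iter = mem_cgroup_iter(root, iter, NULL);
 *	}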
915 */ 916 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 917 struct mem_cgroup *prev, 918 struct mem_cgroup_reclaim_cookie *reclaim) 919 { 920 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 921 struct cgroup_subsys_state *css = NULL; 922 struct mem_cgroup *memcg = NULL; 923 struct mem_cgroup *pos = NULL; 924 925 if (mem_cgroup_disabled()) 926 return NULL; 927 928 if (!root) 929 root = root_mem_cgroup; 930 931 if (prev && !reclaim) 932 pos = prev; 933 934 if (!root->use_hierarchy && root != root_mem_cgroup) { 935 if (prev) 936 goto out; 937 return root; 938 } 939 940 rcu_read_lock(); 941 942 if (reclaim) { 943 struct mem_cgroup_per_node *mz; 944 945 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); 946 iter = &mz->iter[reclaim->priority]; 947 948 if (prev && reclaim->generation != iter->generation) 949 goto out_unlock; 950 951 while (1) { 952 pos = READ_ONCE(iter->position); 953 if (!pos || css_tryget(&pos->css)) 954 break; 955 /* 956 * css reference reached zero, so iter->position will 957 * be cleared by ->css_released. However, we should not 958 * rely on this happening soon, because ->css_released 959 * is called from a work queue, and by busy-waiting we 960 * might block it. So we clear iter->position right 961 * away. 962 */ 963 (void)cmpxchg(&iter->position, pos, NULL); 964 } 965 } 966 967 if (pos) 968 css = &pos->css; 969 970 for (;;) { 971 css = css_next_descendant_pre(css, &root->css); 972 if (!css) { 973 /* 974 * Reclaimers share the hierarchy walk, and a 975 * new one might jump in right at the end of 976 * the hierarchy - make sure they see at least 977 * one group and restart from the beginning. 978 */ 979 if (!prev) 980 continue; 981 break; 982 } 983 984 /* 985 * Verify the css and acquire a reference. The root 986 * is provided by the caller, so we know it's alive 987 * and kicking, and don't take an extra reference. 988 */ 989 memcg = mem_cgroup_from_css(css); 990 991 if (css == &root->css) 992 break; 993 994 if (css_tryget(css)) 995 break; 996 997 memcg = NULL; 998 } 999 1000 if (reclaim) { 1001 /* 1002 * The position could have already been updated by a competing 1003 * thread, so check that the value hasn't changed since we read 1004 * it to avoid reclaiming from the same cgroup twice. 
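 *
 * In other words, the cmpxchg() below only advances the shared cursor
 * away from the value we started from (@pos); if another reclaimer has
 * already moved it, their update is left in place.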
1005 */ 1006 (void)cmpxchg(&iter->position, pos, memcg); 1007 1008 if (pos) 1009 css_put(&pos->css); 1010 1011 if (!memcg) 1012 iter->generation++; 1013 else if (!prev) 1014 reclaim->generation = iter->generation; 1015 } 1016 1017 out_unlock: 1018 rcu_read_unlock(); 1019 out: 1020 if (prev && prev != root) 1021 css_put(&prev->css); 1022 1023 return memcg; 1024 } 1025 1026 /** 1027 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1028 * @root: hierarchy root 1029 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1030 */ 1031 void mem_cgroup_iter_break(struct mem_cgroup *root, 1032 struct mem_cgroup *prev) 1033 { 1034 if (!root) 1035 root = root_mem_cgroup; 1036 if (prev && prev != root) 1037 css_put(&prev->css); 1038 } 1039 1040 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 1041 { 1042 struct mem_cgroup *memcg = dead_memcg; 1043 struct mem_cgroup_reclaim_iter *iter; 1044 struct mem_cgroup_per_node *mz; 1045 int nid; 1046 int i; 1047 1048 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 1049 for_each_node(nid) { 1050 mz = mem_cgroup_nodeinfo(memcg, nid); 1051 for (i = 0; i <= DEF_PRIORITY; i++) { 1052 iter = &mz->iter[i]; 1053 cmpxchg(&iter->position, 1054 dead_memcg, NULL); 1055 } 1056 } 1057 } 1058 } 1059 1060 /** 1061 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy 1062 * @memcg: hierarchy root 1063 * @fn: function to call for each task 1064 * @arg: argument passed to @fn 1065 * 1066 * This function iterates over tasks attached to @memcg or to any of its 1067 * descendants and calls @fn for each task. If @fn returns a non-zero 1068 * value, the function breaks the iteration loop and returns the value. 1069 * Otherwise, it will iterate over all tasks and return 0. 1070 * 1071 * This function must not be called for the root memory cgroup. 1072 */ 1073 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1074 int (*fn)(struct task_struct *, void *), void *arg) 1075 { 1076 struct mem_cgroup *iter; 1077 int ret = 0; 1078 1079 BUG_ON(memcg == root_mem_cgroup); 1080 1081 for_each_mem_cgroup_tree(iter, memcg) { 1082 struct css_task_iter it; 1083 struct task_struct *task; 1084 1085 css_task_iter_start(&iter->css, 0, &it); 1086 while (!ret && (task = css_task_iter_next(&it))) 1087 ret = fn(task, arg); 1088 css_task_iter_end(&it); 1089 if (ret) { 1090 mem_cgroup_iter_break(memcg, iter); 1091 break; 1092 } 1093 } 1094 return ret; 1095 } 1096 1097 /** 1098 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1099 * @page: the page 1100 * @pgdat: pgdat of the page 1101 * 1102 * This function is only safe when following the LRU page isolation 1103 * and putback protocol: the LRU lock must be held, and the page must 1104 * either be PageLRU() or the caller must have isolated/allocated it. 1105 */ 1106 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) 1107 { 1108 struct mem_cgroup_per_node *mz; 1109 struct mem_cgroup *memcg; 1110 struct lruvec *lruvec; 1111 1112 if (mem_cgroup_disabled()) { 1113 lruvec = &pgdat->lruvec; 1114 goto out; 1115 } 1116 1117 memcg = page->mem_cgroup; 1118 /* 1119 * Swapcache readahead pages are added to the LRU - and 1120 * possibly migrated - before they are charged. 
1121 */ 1122 if (!memcg) 1123 memcg = root_mem_cgroup; 1124 1125 mz = mem_cgroup_page_nodeinfo(memcg, page); 1126 lruvec = &mz->lruvec; 1127 out: 1128 /* 1129 * Since a node can be onlined after the mem_cgroup was created, 1130 * we have to be prepared to initialize lruvec->zone here; 1131 * and if offlined then reonlined, we need to reinitialize it. 1132 */ 1133 if (unlikely(lruvec->pgdat != pgdat)) 1134 lruvec->pgdat = pgdat; 1135 return lruvec; 1136 } 1137 1138 /** 1139 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1140 * @lruvec: mem_cgroup per zone lru vector 1141 * @lru: index of lru list the page is sitting on 1142 * @zid: zone id of the accounted pages 1143 * @nr_pages: positive when adding or negative when removing 1144 * 1145 * This function must be called under lru_lock, just before a page is added 1146 * to or just after a page is removed from an lru list (that ordering being 1147 * so as to allow it to check that lru_size 0 is consistent with list_empty). 1148 */ 1149 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1150 int zid, int nr_pages) 1151 { 1152 struct mem_cgroup_per_node *mz; 1153 unsigned long *lru_size; 1154 long size; 1155 1156 if (mem_cgroup_disabled()) 1157 return; 1158 1159 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1160 lru_size = &mz->lru_zone_size[zid][lru]; 1161 1162 if (nr_pages < 0) 1163 *lru_size += nr_pages; 1164 1165 size = *lru_size; 1166 if (WARN_ONCE(size < 0, 1167 "%s(%p, %d, %d): lru_size %ld\n", 1168 __func__, lruvec, lru, nr_pages, size)) { 1169 VM_BUG_ON(1); 1170 *lru_size = 0; 1171 } 1172 1173 if (nr_pages > 0) 1174 *lru_size += nr_pages; 1175 } 1176 1177 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) 1178 { 1179 struct mem_cgroup *task_memcg; 1180 struct task_struct *p; 1181 bool ret; 1182 1183 p = find_lock_task_mm(task); 1184 if (p) { 1185 task_memcg = get_mem_cgroup_from_mm(p->mm); 1186 task_unlock(p); 1187 } else { 1188 /* 1189 * All threads may have already detached their mm's, but the oom 1190 * killer still needs to detect if they have already been oom 1191 * killed to prevent needlessly killing additional tasks. 1192 */ 1193 rcu_read_lock(); 1194 task_memcg = mem_cgroup_from_task(task); 1195 css_get(&task_memcg->css); 1196 rcu_read_unlock(); 1197 } 1198 ret = mem_cgroup_is_descendant(task_memcg, memcg); 1199 css_put(&task_memcg->css); 1200 return ret; 1201 } 1202 1203 /** 1204 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1205 * @memcg: the memory cgroup 1206 * 1207 * Returns the maximum amount of memory @mem can be charged with, in 1208 * pages. 1209 */ 1210 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1211 { 1212 unsigned long margin = 0; 1213 unsigned long count; 1214 unsigned long limit; 1215 1216 count = page_counter_read(&memcg->memory); 1217 limit = READ_ONCE(memcg->memory.max); 1218 if (count < limit) 1219 margin = limit - count; 1220 1221 if (do_memsw_account()) { 1222 count = page_counter_read(&memcg->memsw); 1223 limit = READ_ONCE(memcg->memsw.max); 1224 if (count <= limit) 1225 margin = min(margin, limit - count); 1226 else 1227 margin = 0; 1228 } 1229 1230 return margin; 1231 } 1232 1233 /* 1234 * A routine for checking "mem" is under move_account() or not. 1235 * 1236 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1237 * moving cgroups. This is for waiting at high-memory pressure 1238 * caused by "move". 
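 *
 * For example, while charges are being moved from a/b to a/c, this check
 * is true for a/b and a/c themselves and for any of their ancestors
 * (such as a), which is where chargers may end up waiting in
 * mem_cgroup_wait_acct_move().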
1239 */ 1240 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1241 { 1242 struct mem_cgroup *from; 1243 struct mem_cgroup *to; 1244 bool ret = false; 1245 /* 1246 * Unlike task_move routines, we access mc.to, mc.from not under 1247 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1248 */ 1249 spin_lock(&mc.lock); 1250 from = mc.from; 1251 to = mc.to; 1252 if (!from) 1253 goto unlock; 1254 1255 ret = mem_cgroup_is_descendant(from, memcg) || 1256 mem_cgroup_is_descendant(to, memcg); 1257 unlock: 1258 spin_unlock(&mc.lock); 1259 return ret; 1260 } 1261 1262 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1263 { 1264 if (mc.moving_task && current != mc.moving_task) { 1265 if (mem_cgroup_under_move(memcg)) { 1266 DEFINE_WAIT(wait); 1267 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1268 /* moving charge context might have finished. */ 1269 if (mc.moving_task) 1270 schedule(); 1271 finish_wait(&mc.waitq, &wait); 1272 return true; 1273 } 1274 } 1275 return false; 1276 } 1277 1278 static const unsigned int memcg1_stats[] = { 1279 MEMCG_CACHE, 1280 MEMCG_RSS, 1281 MEMCG_RSS_HUGE, 1282 NR_SHMEM, 1283 NR_FILE_MAPPED, 1284 NR_FILE_DIRTY, 1285 NR_WRITEBACK, 1286 MEMCG_SWAP, 1287 }; 1288 1289 static const char *const memcg1_stat_names[] = { 1290 "cache", 1291 "rss", 1292 "rss_huge", 1293 "shmem", 1294 "mapped_file", 1295 "dirty", 1296 "writeback", 1297 "swap", 1298 }; 1299 1300 #define K(x) ((x) << (PAGE_SHIFT-10)) 1301 /** 1302 * mem_cgroup_print_oom_context: Print OOM information relevant to 1303 * memory controller. 1304 * @memcg: The memory cgroup that went over limit 1305 * @p: Task that is going to be killed 1306 * 1307 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1308 * enabled 1309 */ 1310 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1311 { 1312 rcu_read_lock(); 1313 1314 if (memcg) { 1315 pr_cont(",oom_memcg="); 1316 pr_cont_cgroup_path(memcg->css.cgroup); 1317 } else 1318 pr_cont(",global_oom"); 1319 if (p) { 1320 pr_cont(",task_memcg="); 1321 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1322 } 1323 rcu_read_unlock(); 1324 } 1325 1326 /** 1327 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1328 * memory controller. 
1329 * @memcg: The memory cgroup that went over limit 1330 */ 1331 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1332 { 1333 struct mem_cgroup *iter; 1334 unsigned int i; 1335 1336 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1337 K((u64)page_counter_read(&memcg->memory)), 1338 K((u64)memcg->memory.max), memcg->memory.failcnt); 1339 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1340 K((u64)page_counter_read(&memcg->memsw)), 1341 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1342 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1343 K((u64)page_counter_read(&memcg->kmem)), 1344 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1345 1346 for_each_mem_cgroup_tree(iter, memcg) { 1347 pr_info("Memory cgroup stats for "); 1348 pr_cont_cgroup_path(iter->css.cgroup); 1349 pr_cont(":"); 1350 1351 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1352 if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) 1353 continue; 1354 pr_cont(" %s:%luKB", memcg1_stat_names[i], 1355 K(memcg_page_state(iter, memcg1_stats[i]))); 1356 } 1357 1358 for (i = 0; i < NR_LRU_LISTS; i++) 1359 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1360 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1361 1362 pr_cont("\n"); 1363 } 1364 } 1365 1366 /* 1367 * Return the memory (and swap, if configured) limit for a memcg. 1368 */ 1369 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1370 { 1371 unsigned long max; 1372 1373 max = memcg->memory.max; 1374 if (mem_cgroup_swappiness(memcg)) { 1375 unsigned long memsw_max; 1376 unsigned long swap_max; 1377 1378 memsw_max = memcg->memsw.max; 1379 swap_max = memcg->swap.max; 1380 swap_max = min(swap_max, (unsigned long)total_swap_pages); 1381 max = min(max + swap_max, memsw_max); 1382 } 1383 return max; 1384 } 1385 1386 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1387 int order) 1388 { 1389 struct oom_control oc = { 1390 .zonelist = NULL, 1391 .nodemask = NULL, 1392 .memcg = memcg, 1393 .gfp_mask = gfp_mask, 1394 .order = order, 1395 }; 1396 bool ret; 1397 1398 if (mutex_lock_killable(&oom_lock)) 1399 return true; 1400 /* 1401 * A few threads which were not waiting at mutex_lock_killable() can 1402 * fail to bail out. Therefore, check again after holding oom_lock. 1403 */ 1404 ret = should_force_charge() || out_of_memory(&oc); 1405 mutex_unlock(&oom_lock); 1406 return ret; 1407 } 1408 1409 #if MAX_NUMNODES > 1 1410 1411 /** 1412 * test_mem_cgroup_node_reclaimable 1413 * @memcg: the target memcg 1414 * @nid: the node ID to be checked. 1415 * @noswap : specify true here if the user wants flle only information. 1416 * 1417 * This function returns whether the specified memcg contains any 1418 * reclaimable pages on a node. Returns true if there are any reclaimable 1419 * pages in the node. 1420 */ 1421 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1422 int nid, bool noswap) 1423 { 1424 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1425 return true; 1426 if (noswap || !total_swap_pages) 1427 return false; 1428 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1429 return true; 1430 return false; 1431 1432 } 1433 1434 /* 1435 * Always updating the nodemask is not very good - even if we have an empty 1436 * list or the wrong list here, we can start from some node and traverse all 1437 * nodes based on the zonelist. So update the list loosely once per 10 secs. 
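 *
 * Concretely, the function below starts from node_states[N_MEMORY] and
 * clears every node on which this memcg has nothing reclaimable, giving
 * the scan_nodes mask that mem_cgroup_select_victim_node() round-robins
 * over.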
1438 * 1439 */ 1440 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1441 { 1442 int nid; 1443 /* 1444 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1445 * pagein/pageout changes since the last update. 1446 */ 1447 if (!atomic_read(&memcg->numainfo_events)) 1448 return; 1449 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1450 return; 1451 1452 /* make a nodemask where this memcg uses memory from */ 1453 memcg->scan_nodes = node_states[N_MEMORY]; 1454 1455 for_each_node_mask(nid, node_states[N_MEMORY]) { 1456 1457 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1458 node_clear(nid, memcg->scan_nodes); 1459 } 1460 1461 atomic_set(&memcg->numainfo_events, 0); 1462 atomic_set(&memcg->numainfo_updating, 0); 1463 } 1464 1465 /* 1466 * Selecting a node where we start reclaim from. Because what we need is just 1467 * reducing usage counter, start from anywhere is O,K. Considering 1468 * memory reclaim from current node, there are pros. and cons. 1469 * 1470 * Freeing memory from current node means freeing memory from a node which 1471 * we'll use or we've used. So, it may make LRU bad. And if several threads 1472 * hit limits, it will see a contention on a node. But freeing from remote 1473 * node means more costs for memory reclaim because of memory latency. 1474 * 1475 * Now, we use round-robin. Better algorithm is welcomed. 1476 */ 1477 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1478 { 1479 int node; 1480 1481 mem_cgroup_may_update_nodemask(memcg); 1482 node = memcg->last_scanned_node; 1483 1484 node = next_node_in(node, memcg->scan_nodes); 1485 /* 1486 * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages 1487 * last time it really checked all the LRUs due to rate limiting. 1488 * Fallback to the current node in that case for simplicity. 1489 */ 1490 if (unlikely(node == MAX_NUMNODES)) 1491 node = numa_node_id(); 1492 1493 memcg->last_scanned_node = node; 1494 return node; 1495 } 1496 #else 1497 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1498 { 1499 return 0; 1500 } 1501 #endif 1502 1503 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1504 pg_data_t *pgdat, 1505 gfp_t gfp_mask, 1506 unsigned long *total_scanned) 1507 { 1508 struct mem_cgroup *victim = NULL; 1509 int total = 0; 1510 int loop = 0; 1511 unsigned long excess; 1512 unsigned long nr_scanned; 1513 struct mem_cgroup_reclaim_cookie reclaim = { 1514 .pgdat = pgdat, 1515 .priority = 0, 1516 }; 1517 1518 excess = soft_limit_excess(root_memcg); 1519 1520 while (1) { 1521 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1522 if (!victim) { 1523 loop++; 1524 if (loop >= 2) { 1525 /* 1526 * If we have not been able to reclaim 1527 * anything, it might because there are 1528 * no reclaimable pages under this hierarchy 1529 */ 1530 if (!total) 1531 break; 1532 /* 1533 * We want to do more targeted reclaim. 
1534 * excess >> 2 is not to excessive so as to 1535 * reclaim too much, nor too less that we keep 1536 * coming back to reclaim from this cgroup 1537 */ 1538 if (total >= (excess >> 2) || 1539 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1540 break; 1541 } 1542 continue; 1543 } 1544 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 1545 pgdat, &nr_scanned); 1546 *total_scanned += nr_scanned; 1547 if (!soft_limit_excess(root_memcg)) 1548 break; 1549 } 1550 mem_cgroup_iter_break(root_memcg, victim); 1551 return total; 1552 } 1553 1554 #ifdef CONFIG_LOCKDEP 1555 static struct lockdep_map memcg_oom_lock_dep_map = { 1556 .name = "memcg_oom_lock", 1557 }; 1558 #endif 1559 1560 static DEFINE_SPINLOCK(memcg_oom_lock); 1561 1562 /* 1563 * Check OOM-Killer is already running under our hierarchy. 1564 * If someone is running, return false. 1565 */ 1566 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1567 { 1568 struct mem_cgroup *iter, *failed = NULL; 1569 1570 spin_lock(&memcg_oom_lock); 1571 1572 for_each_mem_cgroup_tree(iter, memcg) { 1573 if (iter->oom_lock) { 1574 /* 1575 * this subtree of our hierarchy is already locked 1576 * so we cannot give a lock. 1577 */ 1578 failed = iter; 1579 mem_cgroup_iter_break(memcg, iter); 1580 break; 1581 } else 1582 iter->oom_lock = true; 1583 } 1584 1585 if (failed) { 1586 /* 1587 * OK, we failed to lock the whole subtree so we have 1588 * to clean up what we set up to the failing subtree 1589 */ 1590 for_each_mem_cgroup_tree(iter, memcg) { 1591 if (iter == failed) { 1592 mem_cgroup_iter_break(memcg, iter); 1593 break; 1594 } 1595 iter->oom_lock = false; 1596 } 1597 } else 1598 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1599 1600 spin_unlock(&memcg_oom_lock); 1601 1602 return !failed; 1603 } 1604 1605 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1606 { 1607 struct mem_cgroup *iter; 1608 1609 spin_lock(&memcg_oom_lock); 1610 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1611 for_each_mem_cgroup_tree(iter, memcg) 1612 iter->oom_lock = false; 1613 spin_unlock(&memcg_oom_lock); 1614 } 1615 1616 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1617 { 1618 struct mem_cgroup *iter; 1619 1620 spin_lock(&memcg_oom_lock); 1621 for_each_mem_cgroup_tree(iter, memcg) 1622 iter->under_oom++; 1623 spin_unlock(&memcg_oom_lock); 1624 } 1625 1626 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1627 { 1628 struct mem_cgroup *iter; 1629 1630 /* 1631 * When a new child is created while the hierarchy is under oom, 1632 * mem_cgroup_oom_lock() may not be called. Watch for underflow. 
1633 */ 1634 spin_lock(&memcg_oom_lock); 1635 for_each_mem_cgroup_tree(iter, memcg) 1636 if (iter->under_oom > 0) 1637 iter->under_oom--; 1638 spin_unlock(&memcg_oom_lock); 1639 } 1640 1641 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1642 1643 struct oom_wait_info { 1644 struct mem_cgroup *memcg; 1645 wait_queue_entry_t wait; 1646 }; 1647 1648 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1649 unsigned mode, int sync, void *arg) 1650 { 1651 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1652 struct mem_cgroup *oom_wait_memcg; 1653 struct oom_wait_info *oom_wait_info; 1654 1655 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1656 oom_wait_memcg = oom_wait_info->memcg; 1657 1658 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1659 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1660 return 0; 1661 return autoremove_wake_function(wait, mode, sync, arg); 1662 } 1663 1664 static void memcg_oom_recover(struct mem_cgroup *memcg) 1665 { 1666 /* 1667 * For the following lockless ->under_oom test, the only required 1668 * guarantee is that it must see the state asserted by an OOM when 1669 * this function is called as a result of userland actions 1670 * triggered by the notification of the OOM. This is trivially 1671 * achieved by invoking mem_cgroup_mark_under_oom() before 1672 * triggering notification. 1673 */ 1674 if (memcg && memcg->under_oom) 1675 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1676 } 1677 1678 enum oom_status { 1679 OOM_SUCCESS, 1680 OOM_FAILED, 1681 OOM_ASYNC, 1682 OOM_SKIPPED 1683 }; 1684 1685 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1686 { 1687 enum oom_status ret; 1688 bool locked; 1689 1690 if (order > PAGE_ALLOC_COSTLY_ORDER) 1691 return OOM_SKIPPED; 1692 1693 memcg_memory_event(memcg, MEMCG_OOM); 1694 1695 /* 1696 * We are in the middle of the charge context here, so we 1697 * don't want to block when potentially sitting on a callstack 1698 * that holds all kinds of filesystem and mm locks. 1699 * 1700 * cgroup1 allows disabling the OOM killer and waiting for outside 1701 * handling until the charge can succeed; remember the context and put 1702 * the task to sleep at the end of the page fault when all locks are 1703 * released. 1704 * 1705 * On the other hand, in-kernel OOM killer allows for an async victim 1706 * memory reclaim (oom_reaper) and that means that we are not solely 1707 * relying on the oom victim to make a forward progress and we can 1708 * invoke the oom killer here. 1709 * 1710 * Please note that mem_cgroup_out_of_memory might fail to find a 1711 * victim and then we have to bail out from the charge path. 
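 *
 * Summarizing the return values used below: OOM_SKIPPED for costly
 * orders (checked at the top of this function) and for oom_kill_disable
 * outside of a user fault, OOM_ASYNC when the OOM is recorded in the
 * task and deferred to mem_cgroup_oom_synchronize(), and
 * OOM_SUCCESS/OOM_FAILED depending on whether mem_cgroup_out_of_memory()
 * managed to pick a victim.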
1712 */ 1713 if (memcg->oom_kill_disable) { 1714 if (!current->in_user_fault) 1715 return OOM_SKIPPED; 1716 css_get(&memcg->css); 1717 current->memcg_in_oom = memcg; 1718 current->memcg_oom_gfp_mask = mask; 1719 current->memcg_oom_order = order; 1720 1721 return OOM_ASYNC; 1722 } 1723 1724 mem_cgroup_mark_under_oom(memcg); 1725 1726 locked = mem_cgroup_oom_trylock(memcg); 1727 1728 if (locked) 1729 mem_cgroup_oom_notify(memcg); 1730 1731 mem_cgroup_unmark_under_oom(memcg); 1732 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1733 ret = OOM_SUCCESS; 1734 else 1735 ret = OOM_FAILED; 1736 1737 if (locked) 1738 mem_cgroup_oom_unlock(memcg); 1739 1740 return ret; 1741 } 1742 1743 /** 1744 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1745 * @handle: actually kill/wait or just clean up the OOM state 1746 * 1747 * This has to be called at the end of a page fault if the memcg OOM 1748 * handler was enabled. 1749 * 1750 * Memcg supports userspace OOM handling where failed allocations must 1751 * sleep on a waitqueue until the userspace task resolves the 1752 * situation. Sleeping directly in the charge context with all kinds 1753 * of locks held is not a good idea, instead we remember an OOM state 1754 * in the task and mem_cgroup_oom_synchronize() has to be called at 1755 * the end of the page fault to complete the OOM handling. 1756 * 1757 * Returns %true if an ongoing memcg OOM situation was detected and 1758 * completed, %false otherwise. 1759 */ 1760 bool mem_cgroup_oom_synchronize(bool handle) 1761 { 1762 struct mem_cgroup *memcg = current->memcg_in_oom; 1763 struct oom_wait_info owait; 1764 bool locked; 1765 1766 /* OOM is global, do not handle */ 1767 if (!memcg) 1768 return false; 1769 1770 if (!handle) 1771 goto cleanup; 1772 1773 owait.memcg = memcg; 1774 owait.wait.flags = 0; 1775 owait.wait.func = memcg_oom_wake_function; 1776 owait.wait.private = current; 1777 INIT_LIST_HEAD(&owait.wait.entry); 1778 1779 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1780 mem_cgroup_mark_under_oom(memcg); 1781 1782 locked = mem_cgroup_oom_trylock(memcg); 1783 1784 if (locked) 1785 mem_cgroup_oom_notify(memcg); 1786 1787 if (locked && !memcg->oom_kill_disable) { 1788 mem_cgroup_unmark_under_oom(memcg); 1789 finish_wait(&memcg_oom_waitq, &owait.wait); 1790 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, 1791 current->memcg_oom_order); 1792 } else { 1793 schedule(); 1794 mem_cgroup_unmark_under_oom(memcg); 1795 finish_wait(&memcg_oom_waitq, &owait.wait); 1796 } 1797 1798 if (locked) { 1799 mem_cgroup_oom_unlock(memcg); 1800 /* 1801 * There is no guarantee that an OOM-lock contender 1802 * sees the wakeups triggered by the OOM kill 1803 * uncharges. Wake any sleepers explicitely. 1804 */ 1805 memcg_oom_recover(memcg); 1806 } 1807 cleanup: 1808 current->memcg_in_oom = NULL; 1809 css_put(&memcg->css); 1810 return true; 1811 } 1812 1813 /** 1814 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 1815 * @victim: task to be killed by the OOM killer 1816 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 1817 * 1818 * Returns a pointer to a memory cgroup, which has to be cleaned up 1819 * by killing all belonging OOM-killable tasks. 1820 * 1821 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
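 *
 * For instance, with a hierarchy a/b/c where memory.oom.group is set on
 * a and the OOM is scoped to a, a victim running in c makes this return
 * a (with a reference held), signalling that every OOM-killable task in
 * a should be cleaned up together.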
1822 */ 1823 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 1824 struct mem_cgroup *oom_domain) 1825 { 1826 struct mem_cgroup *oom_group = NULL; 1827 struct mem_cgroup *memcg; 1828 1829 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1830 return NULL; 1831 1832 if (!oom_domain) 1833 oom_domain = root_mem_cgroup; 1834 1835 rcu_read_lock(); 1836 1837 memcg = mem_cgroup_from_task(victim); 1838 if (memcg == root_mem_cgroup) 1839 goto out; 1840 1841 /* 1842 * Traverse the memory cgroup hierarchy from the victim task's 1843 * cgroup up to the OOMing cgroup (or root) to find the 1844 * highest-level memory cgroup with oom.group set. 1845 */ 1846 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 1847 if (memcg->oom_group) 1848 oom_group = memcg; 1849 1850 if (memcg == oom_domain) 1851 break; 1852 } 1853 1854 if (oom_group) 1855 css_get(&oom_group->css); 1856 out: 1857 rcu_read_unlock(); 1858 1859 return oom_group; 1860 } 1861 1862 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 1863 { 1864 pr_info("Tasks in "); 1865 pr_cont_cgroup_path(memcg->css.cgroup); 1866 pr_cont(" are going to be killed due to memory.oom.group set\n"); 1867 } 1868 1869 /** 1870 * lock_page_memcg - lock a page->mem_cgroup binding 1871 * @page: the page 1872 * 1873 * This function protects unlocked LRU pages from being moved to 1874 * another cgroup. 1875 * 1876 * It ensures lifetime of the returned memcg. Caller is responsible 1877 * for the lifetime of the page; __unlock_page_memcg() is available 1878 * when @page might get freed inside the locked section. 1879 */ 1880 struct mem_cgroup *lock_page_memcg(struct page *page) 1881 { 1882 struct mem_cgroup *memcg; 1883 unsigned long flags; 1884 1885 /* 1886 * The RCU lock is held throughout the transaction. The fast 1887 * path can get away without acquiring the memcg->move_lock 1888 * because page moving starts with an RCU grace period. 1889 * 1890 * The RCU lock also protects the memcg from being freed when 1891 * the page state that is going to change is the only thing 1892 * preventing the page itself from being freed. E.g. writeback 1893 * doesn't hold a page reference and relies on PG_writeback to 1894 * keep off truncation, migration and so forth. 1895 */ 1896 rcu_read_lock(); 1897 1898 if (mem_cgroup_disabled()) 1899 return NULL; 1900 again: 1901 memcg = page->mem_cgroup; 1902 if (unlikely(!memcg)) 1903 return NULL; 1904 1905 if (atomic_read(&memcg->moving_account) <= 0) 1906 return memcg; 1907 1908 spin_lock_irqsave(&memcg->move_lock, flags); 1909 if (memcg != page->mem_cgroup) { 1910 spin_unlock_irqrestore(&memcg->move_lock, flags); 1911 goto again; 1912 } 1913 1914 /* 1915 * When charge migration first begins, we can have locked and 1916 * unlocked page stat updates happening concurrently. Track 1917 * the task who has the lock for unlock_page_memcg(). 1918 */ 1919 memcg->move_lock_task = current; 1920 memcg->move_lock_flags = flags; 1921 1922 return memcg; 1923 } 1924 EXPORT_SYMBOL(lock_page_memcg); 1925 1926 /** 1927 * __unlock_page_memcg - unlock and unpin a memcg 1928 * @memcg: the memcg 1929 * 1930 * Unlock and unpin a memcg returned by lock_page_memcg(). 
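 *
 * A sketch of the intended pairing, for the case where the page may be
 * freed inside the critical section:
 *
 *	memcg = lock_page_memcg(page);
 *	... update page state ...
 *	__unlock_page_memcg(memcg);	(rather than unlock_page_memcg(page))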
1931 */ 1932 void __unlock_page_memcg(struct mem_cgroup *memcg) 1933 { 1934 if (memcg && memcg->move_lock_task == current) { 1935 unsigned long flags = memcg->move_lock_flags; 1936 1937 memcg->move_lock_task = NULL; 1938 memcg->move_lock_flags = 0; 1939 1940 spin_unlock_irqrestore(&memcg->move_lock, flags); 1941 } 1942 1943 rcu_read_unlock(); 1944 } 1945 1946 /** 1947 * unlock_page_memcg - unlock a page->mem_cgroup binding 1948 * @page: the page 1949 */ 1950 void unlock_page_memcg(struct page *page) 1951 { 1952 __unlock_page_memcg(page->mem_cgroup); 1953 } 1954 EXPORT_SYMBOL(unlock_page_memcg); 1955 1956 struct memcg_stock_pcp { 1957 struct mem_cgroup *cached; /* this never be root cgroup */ 1958 unsigned int nr_pages; 1959 struct work_struct work; 1960 unsigned long flags; 1961 #define FLUSHING_CACHED_CHARGE 0 1962 }; 1963 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1964 static DEFINE_MUTEX(percpu_charge_mutex); 1965 1966 /** 1967 * consume_stock: Try to consume stocked charge on this cpu. 1968 * @memcg: memcg to consume from. 1969 * @nr_pages: how many pages to charge. 1970 * 1971 * The charges will only happen if @memcg matches the current cpu's memcg 1972 * stock, and at least @nr_pages are available in that stock. Failure to 1973 * service an allocation will refill the stock. 1974 * 1975 * returns true if successful, false otherwise. 1976 */ 1977 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1978 { 1979 struct memcg_stock_pcp *stock; 1980 unsigned long flags; 1981 bool ret = false; 1982 1983 if (nr_pages > MEMCG_CHARGE_BATCH) 1984 return ret; 1985 1986 local_irq_save(flags); 1987 1988 stock = this_cpu_ptr(&memcg_stock); 1989 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 1990 stock->nr_pages -= nr_pages; 1991 ret = true; 1992 } 1993 1994 local_irq_restore(flags); 1995 1996 return ret; 1997 } 1998 1999 /* 2000 * Returns stocks cached in percpu and reset cached information. 2001 */ 2002 static void drain_stock(struct memcg_stock_pcp *stock) 2003 { 2004 struct mem_cgroup *old = stock->cached; 2005 2006 if (stock->nr_pages) { 2007 page_counter_uncharge(&old->memory, stock->nr_pages); 2008 if (do_memsw_account()) 2009 page_counter_uncharge(&old->memsw, stock->nr_pages); 2010 css_put_many(&old->css, stock->nr_pages); 2011 stock->nr_pages = 0; 2012 } 2013 stock->cached = NULL; 2014 } 2015 2016 static void drain_local_stock(struct work_struct *dummy) 2017 { 2018 struct memcg_stock_pcp *stock; 2019 unsigned long flags; 2020 2021 /* 2022 * The only protection from memory hotplug vs. drain_stock races is 2023 * that we always operate on local CPU stock here with IRQ disabled 2024 */ 2025 local_irq_save(flags); 2026 2027 stock = this_cpu_ptr(&memcg_stock); 2028 drain_stock(stock); 2029 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2030 2031 local_irq_restore(flags); 2032 } 2033 2034 /* 2035 * Cache charges(val) to local per_cpu area. 2036 * This will be consumed by consume_stock() function, later. 
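 *
 * Note that the stock only ever caches charges for a single memcg per
 * CPU: refilling for a different memcg first drains the old one, and
 * anything above MEMCG_CHARGE_BATCH pages is drained back immediately.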
2037 */ 2038 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2039 { 2040 struct memcg_stock_pcp *stock; 2041 unsigned long flags; 2042 2043 local_irq_save(flags); 2044 2045 stock = this_cpu_ptr(&memcg_stock); 2046 if (stock->cached != memcg) { /* reset if necessary */ 2047 drain_stock(stock); 2048 stock->cached = memcg; 2049 } 2050 stock->nr_pages += nr_pages; 2051 2052 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2053 drain_stock(stock); 2054 2055 local_irq_restore(flags); 2056 } 2057 2058 /* 2059 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2060 * of the hierarchy under it. 2061 */ 2062 static void drain_all_stock(struct mem_cgroup *root_memcg) 2063 { 2064 int cpu, curcpu; 2065 2066 /* If someone's already draining, avoid adding running more workers. */ 2067 if (!mutex_trylock(&percpu_charge_mutex)) 2068 return; 2069 /* 2070 * Notify other cpus that system-wide "drain" is running 2071 * We do not care about races with the cpu hotplug because cpu down 2072 * as well as workers from this path always operate on the local 2073 * per-cpu data. CPU up doesn't touch memcg_stock at all. 2074 */ 2075 curcpu = get_cpu(); 2076 for_each_online_cpu(cpu) { 2077 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2078 struct mem_cgroup *memcg; 2079 2080 memcg = stock->cached; 2081 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) 2082 continue; 2083 if (!mem_cgroup_is_descendant(memcg, root_memcg)) { 2084 css_put(&memcg->css); 2085 continue; 2086 } 2087 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2088 if (cpu == curcpu) 2089 drain_local_stock(&stock->work); 2090 else 2091 schedule_work_on(cpu, &stock->work); 2092 } 2093 css_put(&memcg->css); 2094 } 2095 put_cpu(); 2096 mutex_unlock(&percpu_charge_mutex); 2097 } 2098 2099 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2100 { 2101 struct memcg_stock_pcp *stock; 2102 struct mem_cgroup *memcg; 2103 2104 stock = &per_cpu(memcg_stock, cpu); 2105 drain_stock(stock); 2106 2107 for_each_mem_cgroup(memcg) { 2108 int i; 2109 2110 for (i = 0; i < MEMCG_NR_STAT; i++) { 2111 int nid; 2112 long x; 2113 2114 x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); 2115 if (x) 2116 atomic_long_add(x, &memcg->stat[i]); 2117 2118 if (i >= NR_VM_NODE_STAT_ITEMS) 2119 continue; 2120 2121 for_each_node(nid) { 2122 struct mem_cgroup_per_node *pn; 2123 2124 pn = mem_cgroup_nodeinfo(memcg, nid); 2125 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 2126 if (x) 2127 atomic_long_add(x, &pn->lruvec_stat[i]); 2128 } 2129 } 2130 2131 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 2132 long x; 2133 2134 x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); 2135 if (x) 2136 atomic_long_add(x, &memcg->events[i]); 2137 } 2138 } 2139 2140 return 0; 2141 } 2142 2143 static void reclaim_high(struct mem_cgroup *memcg, 2144 unsigned int nr_pages, 2145 gfp_t gfp_mask) 2146 { 2147 do { 2148 if (page_counter_read(&memcg->memory) <= memcg->high) 2149 continue; 2150 memcg_memory_event(memcg, MEMCG_HIGH); 2151 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); 2152 } while ((memcg = parent_mem_cgroup(memcg))); 2153 } 2154 2155 static void high_work_func(struct work_struct *work) 2156 { 2157 struct mem_cgroup *memcg; 2158 2159 memcg = container_of(work, struct mem_cgroup, high_work); 2160 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2161 } 2162 2163 /* 2164 * Scheduled by try_charge() to be executed from the userland return path 2165 * and reclaims memory over the high limit. 
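 *
 * Condensed sketch of the hand-off from try_charge()'s done_restock
 * path (see there for the real code):
 *
 *	current->memcg_nr_pages_over_high += batch;
 *	set_notify_resume(current);
 *		... return towards user space ...
 *	tracehook_notify_resume()
 *		mem_cgroup_handle_over_high()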
2166 */ 2167 void mem_cgroup_handle_over_high(void) 2168 { 2169 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2170 struct mem_cgroup *memcg; 2171 2172 if (likely(!nr_pages)) 2173 return; 2174 2175 memcg = get_mem_cgroup_from_mm(current->mm); 2176 reclaim_high(memcg, nr_pages, GFP_KERNEL); 2177 css_put(&memcg->css); 2178 current->memcg_nr_pages_over_high = 0; 2179 } 2180 2181 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2182 unsigned int nr_pages) 2183 { 2184 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2185 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2186 struct mem_cgroup *mem_over_limit; 2187 struct page_counter *counter; 2188 unsigned long nr_reclaimed; 2189 bool may_swap = true; 2190 bool drained = false; 2191 bool oomed = false; 2192 enum oom_status oom_status; 2193 2194 if (mem_cgroup_is_root(memcg)) 2195 return 0; 2196 retry: 2197 if (consume_stock(memcg, nr_pages)) 2198 return 0; 2199 2200 if (!do_memsw_account() || 2201 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2202 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2203 goto done_restock; 2204 if (do_memsw_account()) 2205 page_counter_uncharge(&memcg->memsw, batch); 2206 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2207 } else { 2208 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2209 may_swap = false; 2210 } 2211 2212 if (batch > nr_pages) { 2213 batch = nr_pages; 2214 goto retry; 2215 } 2216 2217 /* 2218 * Unlike in global OOM situations, memcg is not in a physical 2219 * memory shortage. Allow dying and OOM-killed tasks to 2220 * bypass the last charges so that they can exit quickly and 2221 * free their memory. 2222 */ 2223 if (unlikely(should_force_charge())) 2224 goto force; 2225 2226 /* 2227 * Prevent unbounded recursion when reclaim operations need to 2228 * allocate memory. This might exceed the limits temporarily, 2229 * but we prefer facilitating memory reclaim and getting back 2230 * under the limit over triggering OOM kills in these cases. 2231 */ 2232 if (unlikely(current->flags & PF_MEMALLOC)) 2233 goto force; 2234 2235 if (unlikely(task_in_memcg_oom(current))) 2236 goto nomem; 2237 2238 if (!gfpflags_allow_blocking(gfp_mask)) 2239 goto nomem; 2240 2241 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2242 2243 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2244 gfp_mask, may_swap); 2245 2246 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2247 goto retry; 2248 2249 if (!drained) { 2250 drain_all_stock(mem_over_limit); 2251 drained = true; 2252 goto retry; 2253 } 2254 2255 if (gfp_mask & __GFP_NORETRY) 2256 goto nomem; 2257 /* 2258 * Even though the limit is exceeded at this point, reclaim 2259 * may have been able to free some pages. Retry the charge 2260 * before killing the task. 2261 * 2262 * Only for regular pages, though: huge pages are rather 2263 * unlikely to succeed so close to the limit, and we fall back 2264 * to regular pages anyway in case of failure. 2265 */ 2266 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2267 goto retry; 2268 /* 2269 * At task move, charge accounts can be doubly counted. So, it's 2270 * better to wait until the end of task_move if something is going on. 
2271 */ 2272 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2273 goto retry; 2274 2275 if (nr_retries--) 2276 goto retry; 2277 2278 if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) 2279 goto nomem; 2280 2281 if (gfp_mask & __GFP_NOFAIL) 2282 goto force; 2283 2284 if (fatal_signal_pending(current)) 2285 goto force; 2286 2287 /* 2288 * Keep retrying as long as the memcg OOM killer is able to make 2289 * forward progress, or bypass the charge if the OOM killer 2290 * couldn't make any progress. 2291 */ 2292 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 2293 get_order(nr_pages * PAGE_SIZE)); 2294 switch (oom_status) { 2295 case OOM_SUCCESS: 2296 nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2297 oomed = true; 2298 goto retry; 2299 case OOM_FAILED: 2300 goto force; 2301 default: 2302 goto nomem; 2303 } 2304 nomem: 2305 if (!(gfp_mask & __GFP_NOFAIL)) 2306 return -ENOMEM; 2307 force: 2308 /* 2309 * The allocation either can't fail or will lead to more memory 2310 * being freed very soon. Allow memory usage to go over the limit 2311 * temporarily by force charging it. 2312 */ 2313 page_counter_charge(&memcg->memory, nr_pages); 2314 if (do_memsw_account()) 2315 page_counter_charge(&memcg->memsw, nr_pages); 2316 css_get_many(&memcg->css, nr_pages); 2317 2318 return 0; 2319 2320 done_restock: 2321 css_get_many(&memcg->css, batch); 2322 if (batch > nr_pages) 2323 refill_stock(memcg, batch - nr_pages); 2324 2325 /* 2326 * If the hierarchy is above the normal consumption range, schedule 2327 * reclaim on returning to userland. We can perform reclaim here 2328 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2329 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2330 * not recorded as it most likely matches current's and won't 2331 * change in the meantime. As the high limit is checked again before 2332 * reclaim, the cost of mismatch is negligible.
2333 */ 2334 do { 2335 if (page_counter_read(&memcg->memory) > memcg->high) { 2336 /* Don't bother a random interrupted task */ 2337 if (in_interrupt()) { 2338 schedule_work(&memcg->high_work); 2339 break; 2340 } 2341 current->memcg_nr_pages_over_high += batch; 2342 set_notify_resume(current); 2343 break; 2344 } 2345 } while ((memcg = parent_mem_cgroup(memcg))); 2346 2347 return 0; 2348 } 2349 2350 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2351 { 2352 if (mem_cgroup_is_root(memcg)) 2353 return; 2354 2355 page_counter_uncharge(&memcg->memory, nr_pages); 2356 if (do_memsw_account()) 2357 page_counter_uncharge(&memcg->memsw, nr_pages); 2358 2359 css_put_many(&memcg->css, nr_pages); 2360 } 2361 2362 static void lock_page_lru(struct page *page, int *isolated) 2363 { 2364 struct zone *zone = page_zone(page); 2365 2366 spin_lock_irq(zone_lru_lock(zone)); 2367 if (PageLRU(page)) { 2368 struct lruvec *lruvec; 2369 2370 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2371 ClearPageLRU(page); 2372 del_page_from_lru_list(page, lruvec, page_lru(page)); 2373 *isolated = 1; 2374 } else 2375 *isolated = 0; 2376 } 2377 2378 static void unlock_page_lru(struct page *page, int isolated) 2379 { 2380 struct zone *zone = page_zone(page); 2381 2382 if (isolated) { 2383 struct lruvec *lruvec; 2384 2385 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2386 VM_BUG_ON_PAGE(PageLRU(page), page); 2387 SetPageLRU(page); 2388 add_page_to_lru_list(page, lruvec, page_lru(page)); 2389 } 2390 spin_unlock_irq(zone_lru_lock(zone)); 2391 } 2392 2393 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2394 bool lrucare) 2395 { 2396 int isolated; 2397 2398 VM_BUG_ON_PAGE(page->mem_cgroup, page); 2399 2400 /* 2401 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2402 * may already be on some other mem_cgroup's LRU. Take care of it. 2403 */ 2404 if (lrucare) 2405 lock_page_lru(page, &isolated); 2406 2407 /* 2408 * Nobody should be changing or seriously looking at 2409 * page->mem_cgroup at this point: 2410 * 2411 * - the page is uncharged 2412 * 2413 * - the page is off-LRU 2414 * 2415 * - an anonymous fault has exclusive page access, except for 2416 * a locked page table 2417 * 2418 * - a page cache insertion, a swapin fault, or a migration 2419 * have the page locked 2420 */ 2421 page->mem_cgroup = memcg; 2422 2423 if (lrucare) 2424 unlock_page_lru(page, isolated); 2425 } 2426 2427 #ifdef CONFIG_MEMCG_KMEM 2428 static int memcg_alloc_cache_id(void) 2429 { 2430 int id, size; 2431 int err; 2432 2433 id = ida_simple_get(&memcg_cache_ida, 2434 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2435 if (id < 0) 2436 return id; 2437 2438 if (id < memcg_nr_cache_ids) 2439 return id; 2440 2441 /* 2442 * There's no space for the new id in memcg_caches arrays, 2443 * so we have to grow them. 
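 *
 * Worked example (illustrative numbers): if memcg_nr_cache_ids is 64
 * and ida_simple_get() returns id 64, the arrays below are resized to
 * size = 2 * (64 + 1) = 130, clamped into the
 * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE] range, before the new
 * id is considered usable.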
2444 */ 2445 down_write(&memcg_cache_ids_sem); 2446 2447 size = 2 * (id + 1); 2448 if (size < MEMCG_CACHES_MIN_SIZE) 2449 size = MEMCG_CACHES_MIN_SIZE; 2450 else if (size > MEMCG_CACHES_MAX_SIZE) 2451 size = MEMCG_CACHES_MAX_SIZE; 2452 2453 err = memcg_update_all_caches(size); 2454 if (!err) 2455 err = memcg_update_all_list_lrus(size); 2456 if (!err) 2457 memcg_nr_cache_ids = size; 2458 2459 up_write(&memcg_cache_ids_sem); 2460 2461 if (err) { 2462 ida_simple_remove(&memcg_cache_ida, id); 2463 return err; 2464 } 2465 return id; 2466 } 2467 2468 static void memcg_free_cache_id(int id) 2469 { 2470 ida_simple_remove(&memcg_cache_ida, id); 2471 } 2472 2473 struct memcg_kmem_cache_create_work { 2474 struct mem_cgroup *memcg; 2475 struct kmem_cache *cachep; 2476 struct work_struct work; 2477 }; 2478 2479 static void memcg_kmem_cache_create_func(struct work_struct *w) 2480 { 2481 struct memcg_kmem_cache_create_work *cw = 2482 container_of(w, struct memcg_kmem_cache_create_work, work); 2483 struct mem_cgroup *memcg = cw->memcg; 2484 struct kmem_cache *cachep = cw->cachep; 2485 2486 memcg_create_kmem_cache(memcg, cachep); 2487 2488 css_put(&memcg->css); 2489 kfree(cw); 2490 } 2491 2492 /* 2493 * Enqueue the creation of a per-memcg kmem_cache. 2494 */ 2495 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2496 struct kmem_cache *cachep) 2497 { 2498 struct memcg_kmem_cache_create_work *cw; 2499 2500 cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); 2501 if (!cw) 2502 return; 2503 2504 css_get(&memcg->css); 2505 2506 cw->memcg = memcg; 2507 cw->cachep = cachep; 2508 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2509 2510 queue_work(memcg_kmem_cache_wq, &cw->work); 2511 } 2512 2513 static inline bool memcg_kmem_bypass(void) 2514 { 2515 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 2516 return true; 2517 return false; 2518 } 2519 2520 /** 2521 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation 2522 * @cachep: the original global kmem cache 2523 * 2524 * Return the kmem_cache we're supposed to use for a slab allocation. 2525 * We try to use the current memcg's version of the cache. 2526 * 2527 * If the cache does not exist yet, if we are the first user of it, we 2528 * create it asynchronously in a workqueue and let the current allocation 2529 * go through with the original cache. 2530 * 2531 * This function takes a reference to the cache it returns to assure it 2532 * won't get destroyed while we are working with it. Once the caller is 2533 * done with it, memcg_kmem_put_cache() must be called to release the 2534 * reference. 2535 */ 2536 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) 2537 { 2538 struct mem_cgroup *memcg; 2539 struct kmem_cache *memcg_cachep; 2540 int kmemcg_id; 2541 2542 VM_BUG_ON(!is_root_cache(cachep)); 2543 2544 if (memcg_kmem_bypass()) 2545 return cachep; 2546 2547 memcg = get_mem_cgroup_from_current(); 2548 kmemcg_id = READ_ONCE(memcg->kmemcg_id); 2549 if (kmemcg_id < 0) 2550 goto out; 2551 2552 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2553 if (likely(memcg_cachep)) 2554 return memcg_cachep; 2555 2556 /* 2557 * If we are in a safe context (can wait, and not in interrupt 2558 * context), we could be be predictable and return right away. 2559 * This would guarantee that the allocation being performed 2560 * already belongs in the new cache. 2561 * 2562 * However, there are some clashes that can arrive from locking. 
2563 * For instance, because we acquire the slab_mutex while doing 2564 * memcg_create_kmem_cache, this means no further allocation 2565 * could happen with the slab_mutex held. So it's better to 2566 * defer everything. 2567 */ 2568 memcg_schedule_kmem_cache_create(memcg, cachep); 2569 out: 2570 css_put(&memcg->css); 2571 return cachep; 2572 } 2573 2574 /** 2575 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache 2576 * @cachep: the cache returned by memcg_kmem_get_cache 2577 */ 2578 void memcg_kmem_put_cache(struct kmem_cache *cachep) 2579 { 2580 if (!is_root_cache(cachep)) 2581 css_put(&cachep->memcg_params.memcg->css); 2582 } 2583 2584 /** 2585 * __memcg_kmem_charge_memcg: charge a kmem page 2586 * @page: page to charge 2587 * @gfp: reclaim mode 2588 * @order: allocation order 2589 * @memcg: memory cgroup to charge 2590 * 2591 * Returns 0 on success, an error code on failure. 2592 */ 2593 int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2594 struct mem_cgroup *memcg) 2595 { 2596 unsigned int nr_pages = 1 << order; 2597 struct page_counter *counter; 2598 int ret; 2599 2600 ret = try_charge(memcg, gfp, nr_pages); 2601 if (ret) 2602 return ret; 2603 2604 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 2605 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 2606 cancel_charge(memcg, nr_pages); 2607 return -ENOMEM; 2608 } 2609 2610 page->mem_cgroup = memcg; 2611 2612 return 0; 2613 } 2614 2615 /** 2616 * __memcg_kmem_charge: charge a kmem page to the current memory cgroup 2617 * @page: page to charge 2618 * @gfp: reclaim mode 2619 * @order: allocation order 2620 * 2621 * Returns 0 on success, an error code on failure. 2622 */ 2623 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2624 { 2625 struct mem_cgroup *memcg; 2626 int ret = 0; 2627 2628 if (memcg_kmem_bypass()) 2629 return 0; 2630 2631 memcg = get_mem_cgroup_from_current(); 2632 if (!mem_cgroup_is_root(memcg)) { 2633 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); 2634 if (!ret) 2635 __SetPageKmemcg(page); 2636 } 2637 css_put(&memcg->css); 2638 return ret; 2639 } 2640 /** 2641 * __memcg_kmem_uncharge: uncharge a kmem page 2642 * @page: page to uncharge 2643 * @order: allocation order 2644 */ 2645 void __memcg_kmem_uncharge(struct page *page, int order) 2646 { 2647 struct mem_cgroup *memcg = page->mem_cgroup; 2648 unsigned int nr_pages = 1 << order; 2649 2650 if (!memcg) 2651 return; 2652 2653 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2654 2655 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2656 page_counter_uncharge(&memcg->kmem, nr_pages); 2657 2658 page_counter_uncharge(&memcg->memory, nr_pages); 2659 if (do_memsw_account()) 2660 page_counter_uncharge(&memcg->memsw, nr_pages); 2661 2662 page->mem_cgroup = NULL; 2663 2664 /* slab pages do not have PageKmemcg flag set */ 2665 if (PageKmemcg(page)) 2666 __ClearPageKmemcg(page); 2667 2668 css_put_many(&memcg->css, nr_pages); 2669 } 2670 #endif /* CONFIG_MEMCG_KMEM */ 2671 2672 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2673 2674 /* 2675 * Because tail pages are not marked as "used", set it. We're under 2676 * zone_lru_lock and migration entries setup in all page mappings. 
2677 */ 2678 void mem_cgroup_split_huge_fixup(struct page *head) 2679 { 2680 int i; 2681 2682 if (mem_cgroup_disabled()) 2683 return; 2684 2685 for (i = 1; i < HPAGE_PMD_NR; i++) 2686 head[i].mem_cgroup = head->mem_cgroup; 2687 2688 __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); 2689 } 2690 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2691 2692 #ifdef CONFIG_MEMCG_SWAP 2693 /** 2694 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2695 * @entry: swap entry to be moved 2696 * @from: mem_cgroup which the entry is moved from 2697 * @to: mem_cgroup which the entry is moved to 2698 * 2699 * It succeeds only when the swap_cgroup's record for this entry is the same 2700 * as the mem_cgroup's id of @from. 2701 * 2702 * Returns 0 on success, -EINVAL on failure. 2703 * 2704 * The caller must have charged to @to, IOW, called page_counter_charge() about 2705 * both res and memsw, and called css_get(). 2706 */ 2707 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2708 struct mem_cgroup *from, struct mem_cgroup *to) 2709 { 2710 unsigned short old_id, new_id; 2711 2712 old_id = mem_cgroup_id(from); 2713 new_id = mem_cgroup_id(to); 2714 2715 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2716 mod_memcg_state(from, MEMCG_SWAP, -1); 2717 mod_memcg_state(to, MEMCG_SWAP, 1); 2718 return 0; 2719 } 2720 return -EINVAL; 2721 } 2722 #else 2723 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2724 struct mem_cgroup *from, struct mem_cgroup *to) 2725 { 2726 return -EINVAL; 2727 } 2728 #endif 2729 2730 static DEFINE_MUTEX(memcg_max_mutex); 2731 2732 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 2733 unsigned long max, bool memsw) 2734 { 2735 bool enlarge = false; 2736 bool drained = false; 2737 int ret; 2738 bool limits_invariant; 2739 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 2740 2741 do { 2742 if (signal_pending(current)) { 2743 ret = -EINTR; 2744 break; 2745 } 2746 2747 mutex_lock(&memcg_max_mutex); 2748 /* 2749 * Make sure that the new limit (memsw or memory limit) doesn't 2750 * break our basic invariant rule memory.max <= memsw.max. 2751 */ 2752 limits_invariant = memsw ? max >= memcg->memory.max : 2753 max <= memcg->memsw.max; 2754 if (!limits_invariant) { 2755 mutex_unlock(&memcg_max_mutex); 2756 ret = -EINVAL; 2757 break; 2758 } 2759 if (max > counter->max) 2760 enlarge = true; 2761 ret = page_counter_set_max(counter, max); 2762 mutex_unlock(&memcg_max_mutex); 2763 2764 if (!ret) 2765 break; 2766 2767 if (!drained) { 2768 drain_all_stock(memcg); 2769 drained = true; 2770 continue; 2771 } 2772 2773 if (!try_to_free_mem_cgroup_pages(memcg, 1, 2774 GFP_KERNEL, !memsw)) { 2775 ret = -EBUSY; 2776 break; 2777 } 2778 } while (true); 2779 2780 if (!ret && enlarge) 2781 memcg_oom_recover(memcg); 2782 2783 return ret; 2784 } 2785 2786 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 2787 gfp_t gfp_mask, 2788 unsigned long *total_scanned) 2789 { 2790 unsigned long nr_reclaimed = 0; 2791 struct mem_cgroup_per_node *mz, *next_mz = NULL; 2792 unsigned long reclaimed; 2793 int loop = 0; 2794 struct mem_cgroup_tree_per_node *mctz; 2795 unsigned long excess; 2796 unsigned long nr_scanned; 2797 2798 if (order > 0) 2799 return 0; 2800 2801 mctz = soft_limit_tree_node(pgdat->node_id); 2802 2803 /* 2804 * Do not even bother to check the largest node if the root 2805 * is empty. Do it lockless to prevent lock bouncing. 
Races 2806 * are acceptable as the soft limit is best effort anyway. 2807 */ 2808 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 2809 return 0; 2810 2811 /* 2812 * This loop can run for a while, especially if mem_cgroups continuously 2813 * keep exceeding their soft limit and putting the system under 2814 * pressure. 2815 */ 2816 do { 2817 if (next_mz) 2818 mz = next_mz; 2819 else 2820 mz = mem_cgroup_largest_soft_limit_node(mctz); 2821 if (!mz) 2822 break; 2823 2824 nr_scanned = 0; 2825 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 2826 gfp_mask, &nr_scanned); 2827 nr_reclaimed += reclaimed; 2828 *total_scanned += nr_scanned; 2829 spin_lock_irq(&mctz->lock); 2830 __mem_cgroup_remove_exceeded(mz, mctz); 2831 2832 /* 2833 * If we failed to reclaim anything from this memory cgroup 2834 * it is time to move on to the next cgroup. 2835 */ 2836 next_mz = NULL; 2837 if (!reclaimed) 2838 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 2839 2840 excess = soft_limit_excess(mz->memcg); 2841 /* 2842 * One school of thought says that we should not add 2843 * back the node to the tree if reclaim returns 0. 2844 * But our reclaim could return 0 simply because, due 2845 * to priority, we are exposing a smaller subset of 2846 * memory to reclaim from. Consider this as a longer 2847 * term TODO. 2848 */ 2849 /* If excess == 0, no tree ops */ 2850 __mem_cgroup_insert_exceeded(mz, mctz, excess); 2851 spin_unlock_irq(&mctz->lock); 2852 css_put(&mz->memcg->css); 2853 loop++; 2854 /* 2855 * Could not reclaim anything and there are no more 2856 * mem cgroups to try, or we seem to be looping without 2857 * reclaiming anything. 2858 */ 2859 if (!nr_reclaimed && 2860 (next_mz == NULL || 2861 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2862 break; 2863 } while (!nr_reclaimed); 2864 if (next_mz) 2865 css_put(&next_mz->memcg->css); 2866 return nr_reclaimed; 2867 } 2868 2869 /* 2870 * Test whether @memcg has children, dead or alive. Note that this 2871 * function doesn't care whether @memcg has use_hierarchy enabled and 2872 * returns %true if there are child csses according to the cgroup 2873 * hierarchy. Testing use_hierarchy is the caller's responsibility. 2874 */ 2875 static inline bool memcg_has_children(struct mem_cgroup *memcg) 2876 { 2877 bool ret; 2878 2879 rcu_read_lock(); 2880 ret = css_next_child(NULL, &memcg->css); 2881 rcu_read_unlock(); 2882 return ret; 2883 } 2884 2885 /* 2886 * Reclaims as many pages from the given memcg as possible. 2887 * 2888 * Caller is responsible for holding css reference for memcg.
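 *
 * A minimal caller sketch (illustrative; the force_empty write handler
 * below gets its reference implicitly from the open cgroup file):
 *
 *	if (!css_tryget_online(&memcg->css))
 *		return -ENOENT;
 *	ret = mem_cgroup_force_empty(memcg);
 *	css_put(&memcg->css);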
2889 */ 2890 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 2891 { 2892 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2893 2894 /* we call try-to-free pages for make this cgroup empty */ 2895 lru_add_drain_all(); 2896 2897 drain_all_stock(memcg); 2898 2899 /* try to free all pages in this cgroup */ 2900 while (nr_retries && page_counter_read(&memcg->memory)) { 2901 int progress; 2902 2903 if (signal_pending(current)) 2904 return -EINTR; 2905 2906 progress = try_to_free_mem_cgroup_pages(memcg, 1, 2907 GFP_KERNEL, true); 2908 if (!progress) { 2909 nr_retries--; 2910 /* maybe some writeback is necessary */ 2911 congestion_wait(BLK_RW_ASYNC, HZ/10); 2912 } 2913 2914 } 2915 2916 return 0; 2917 } 2918 2919 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 2920 char *buf, size_t nbytes, 2921 loff_t off) 2922 { 2923 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 2924 2925 if (mem_cgroup_is_root(memcg)) 2926 return -EINVAL; 2927 return mem_cgroup_force_empty(memcg) ?: nbytes; 2928 } 2929 2930 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 2931 struct cftype *cft) 2932 { 2933 return mem_cgroup_from_css(css)->use_hierarchy; 2934 } 2935 2936 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 2937 struct cftype *cft, u64 val) 2938 { 2939 int retval = 0; 2940 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2941 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 2942 2943 if (memcg->use_hierarchy == val) 2944 return 0; 2945 2946 /* 2947 * If parent's use_hierarchy is set, we can't make any modifications 2948 * in the child subtrees. If it is unset, then the change can 2949 * occur, provided the current cgroup has no children. 2950 * 2951 * For the root cgroup, parent_mem is NULL, we allow value to be 2952 * set if there are no children. 2953 */ 2954 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 2955 (val == 1 || val == 0)) { 2956 if (!memcg_has_children(memcg)) 2957 memcg->use_hierarchy = val; 2958 else 2959 retval = -EBUSY; 2960 } else 2961 retval = -EINVAL; 2962 2963 return retval; 2964 } 2965 2966 struct accumulated_stats { 2967 unsigned long stat[MEMCG_NR_STAT]; 2968 unsigned long events[NR_VM_EVENT_ITEMS]; 2969 unsigned long lru_pages[NR_LRU_LISTS]; 2970 const unsigned int *stats_array; 2971 const unsigned int *events_array; 2972 int stats_size; 2973 int events_size; 2974 }; 2975 2976 static void accumulate_memcg_tree(struct mem_cgroup *memcg, 2977 struct accumulated_stats *acc) 2978 { 2979 struct mem_cgroup *mi; 2980 int i; 2981 2982 for_each_mem_cgroup_tree(mi, memcg) { 2983 for (i = 0; i < acc->stats_size; i++) 2984 acc->stat[i] += memcg_page_state(mi, 2985 acc->stats_array ? acc->stats_array[i] : i); 2986 2987 for (i = 0; i < acc->events_size; i++) 2988 acc->events[i] += memcg_sum_events(mi, 2989 acc->events_array ? 
acc->events_array[i] : i); 2990 2991 for (i = 0; i < NR_LRU_LISTS; i++) 2992 acc->lru_pages[i] += 2993 mem_cgroup_nr_lru_pages(mi, BIT(i)); 2994 } 2995 } 2996 2997 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 2998 { 2999 unsigned long val = 0; 3000 3001 if (mem_cgroup_is_root(memcg)) { 3002 struct mem_cgroup *iter; 3003 3004 for_each_mem_cgroup_tree(iter, memcg) { 3005 val += memcg_page_state(iter, MEMCG_CACHE); 3006 val += memcg_page_state(iter, MEMCG_RSS); 3007 if (swap) 3008 val += memcg_page_state(iter, MEMCG_SWAP); 3009 } 3010 } else { 3011 if (!swap) 3012 val = page_counter_read(&memcg->memory); 3013 else 3014 val = page_counter_read(&memcg->memsw); 3015 } 3016 return val; 3017 } 3018 3019 enum { 3020 RES_USAGE, 3021 RES_LIMIT, 3022 RES_MAX_USAGE, 3023 RES_FAILCNT, 3024 RES_SOFT_LIMIT, 3025 }; 3026 3027 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3028 struct cftype *cft) 3029 { 3030 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3031 struct page_counter *counter; 3032 3033 switch (MEMFILE_TYPE(cft->private)) { 3034 case _MEM: 3035 counter = &memcg->memory; 3036 break; 3037 case _MEMSWAP: 3038 counter = &memcg->memsw; 3039 break; 3040 case _KMEM: 3041 counter = &memcg->kmem; 3042 break; 3043 case _TCP: 3044 counter = &memcg->tcpmem; 3045 break; 3046 default: 3047 BUG(); 3048 } 3049 3050 switch (MEMFILE_ATTR(cft->private)) { 3051 case RES_USAGE: 3052 if (counter == &memcg->memory) 3053 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3054 if (counter == &memcg->memsw) 3055 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3056 return (u64)page_counter_read(counter) * PAGE_SIZE; 3057 case RES_LIMIT: 3058 return (u64)counter->max * PAGE_SIZE; 3059 case RES_MAX_USAGE: 3060 return (u64)counter->watermark * PAGE_SIZE; 3061 case RES_FAILCNT: 3062 return counter->failcnt; 3063 case RES_SOFT_LIMIT: 3064 return (u64)memcg->soft_limit * PAGE_SIZE; 3065 default: 3066 BUG(); 3067 } 3068 } 3069 3070 #ifdef CONFIG_MEMCG_KMEM 3071 static int memcg_online_kmem(struct mem_cgroup *memcg) 3072 { 3073 int memcg_id; 3074 3075 if (cgroup_memory_nokmem) 3076 return 0; 3077 3078 BUG_ON(memcg->kmemcg_id >= 0); 3079 BUG_ON(memcg->kmem_state); 3080 3081 memcg_id = memcg_alloc_cache_id(); 3082 if (memcg_id < 0) 3083 return memcg_id; 3084 3085 static_branch_inc(&memcg_kmem_enabled_key); 3086 /* 3087 * A memory cgroup is considered kmem-online as soon as it gets 3088 * kmemcg_id. Setting the id after enabling static branching will 3089 * guarantee no one starts accounting before all call sites are 3090 * patched. 3091 */ 3092 memcg->kmemcg_id = memcg_id; 3093 memcg->kmem_state = KMEM_ONLINE; 3094 INIT_LIST_HEAD(&memcg->kmem_caches); 3095 3096 return 0; 3097 } 3098 3099 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3100 { 3101 struct cgroup_subsys_state *css; 3102 struct mem_cgroup *parent, *child; 3103 int kmemcg_id; 3104 3105 if (memcg->kmem_state != KMEM_ONLINE) 3106 return; 3107 /* 3108 * Clear the online state before clearing memcg_caches array 3109 * entries. The slab_mutex in memcg_deactivate_kmem_caches() 3110 * guarantees that no cache will be created for this cgroup 3111 * after we are done (see memcg_create_kmem_cache()). 
3112 */ 3113 memcg->kmem_state = KMEM_ALLOCATED; 3114 3115 memcg_deactivate_kmem_caches(memcg); 3116 3117 kmemcg_id = memcg->kmemcg_id; 3118 BUG_ON(kmemcg_id < 0); 3119 3120 parent = parent_mem_cgroup(memcg); 3121 if (!parent) 3122 parent = root_mem_cgroup; 3123 3124 /* 3125 * Change kmemcg_id of this cgroup and all its descendants to the 3126 * parent's id, and then move all entries from this cgroup's list_lrus 3127 * to ones of the parent. After we have finished, all list_lrus 3128 * corresponding to this cgroup are guaranteed to remain empty. The 3129 * ordering is imposed by list_lru_node->lock taken by 3130 * memcg_drain_all_list_lrus(). 3131 */ 3132 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3133 css_for_each_descendant_pre(css, &memcg->css) { 3134 child = mem_cgroup_from_css(css); 3135 BUG_ON(child->kmemcg_id != kmemcg_id); 3136 child->kmemcg_id = parent->kmemcg_id; 3137 if (!memcg->use_hierarchy) 3138 break; 3139 } 3140 rcu_read_unlock(); 3141 3142 memcg_drain_all_list_lrus(kmemcg_id, parent); 3143 3144 memcg_free_cache_id(kmemcg_id); 3145 } 3146 3147 static void memcg_free_kmem(struct mem_cgroup *memcg) 3148 { 3149 /* css_alloc() failed, offlining didn't happen */ 3150 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3151 memcg_offline_kmem(memcg); 3152 3153 if (memcg->kmem_state == KMEM_ALLOCATED) { 3154 memcg_destroy_kmem_caches(memcg); 3155 static_branch_dec(&memcg_kmem_enabled_key); 3156 WARN_ON(page_counter_read(&memcg->kmem)); 3157 } 3158 } 3159 #else 3160 static int memcg_online_kmem(struct mem_cgroup *memcg) 3161 { 3162 return 0; 3163 } 3164 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3165 { 3166 } 3167 static void memcg_free_kmem(struct mem_cgroup *memcg) 3168 { 3169 } 3170 #endif /* CONFIG_MEMCG_KMEM */ 3171 3172 static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3173 unsigned long max) 3174 { 3175 int ret; 3176 3177 mutex_lock(&memcg_max_mutex); 3178 ret = page_counter_set_max(&memcg->kmem, max); 3179 mutex_unlock(&memcg_max_mutex); 3180 return ret; 3181 } 3182 3183 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3184 { 3185 int ret; 3186 3187 mutex_lock(&memcg_max_mutex); 3188 3189 ret = page_counter_set_max(&memcg->tcpmem, max); 3190 if (ret) 3191 goto out; 3192 3193 if (!memcg->tcpmem_active) { 3194 /* 3195 * The active flag needs to be written after the static_key 3196 * update. This is what guarantees that the socket activation 3197 * function is the last one to run. See mem_cgroup_sk_alloc() 3198 * for details, and note that we don't mark any socket as 3199 * belonging to this memcg until that flag is up. 3200 * 3201 * We need to do this, because static_keys will span multiple 3202 * sites, but we can't control their order. If we mark a socket 3203 * as accounted, but the accounting functions are not patched in 3204 * yet, we'll lose accounting. 3205 * 3206 * We never race with the readers in mem_cgroup_sk_alloc(), 3207 * because when this value change, the code to process it is not 3208 * patched in yet. 3209 */ 3210 static_branch_inc(&memcg_sockets_enabled_key); 3211 memcg->tcpmem_active = true; 3212 } 3213 out: 3214 mutex_unlock(&memcg_max_mutex); 3215 return ret; 3216 } 3217 3218 /* 3219 * The user of this function is... 3220 * RES_LIMIT. 
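 *
 * For instance (illustrative, assuming 4K pages): writing "512M" to
 * memory.limit_in_bytes is parsed by page_counter_memparse() into
 * nr_pages == 131072, while writing "-1" yields PAGE_COUNTER_MAX,
 * i.e. "no limit".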
3221 */ 3222 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3223 char *buf, size_t nbytes, loff_t off) 3224 { 3225 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3226 unsigned long nr_pages; 3227 int ret; 3228 3229 buf = strstrip(buf); 3230 ret = page_counter_memparse(buf, "-1", &nr_pages); 3231 if (ret) 3232 return ret; 3233 3234 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3235 case RES_LIMIT: 3236 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3237 ret = -EINVAL; 3238 break; 3239 } 3240 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3241 case _MEM: 3242 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3243 break; 3244 case _MEMSWAP: 3245 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3246 break; 3247 case _KMEM: 3248 ret = memcg_update_kmem_max(memcg, nr_pages); 3249 break; 3250 case _TCP: 3251 ret = memcg_update_tcp_max(memcg, nr_pages); 3252 break; 3253 } 3254 break; 3255 case RES_SOFT_LIMIT: 3256 memcg->soft_limit = nr_pages; 3257 ret = 0; 3258 break; 3259 } 3260 return ret ?: nbytes; 3261 } 3262 3263 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3264 size_t nbytes, loff_t off) 3265 { 3266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3267 struct page_counter *counter; 3268 3269 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3270 case _MEM: 3271 counter = &memcg->memory; 3272 break; 3273 case _MEMSWAP: 3274 counter = &memcg->memsw; 3275 break; 3276 case _KMEM: 3277 counter = &memcg->kmem; 3278 break; 3279 case _TCP: 3280 counter = &memcg->tcpmem; 3281 break; 3282 default: 3283 BUG(); 3284 } 3285 3286 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3287 case RES_MAX_USAGE: 3288 page_counter_reset_watermark(counter); 3289 break; 3290 case RES_FAILCNT: 3291 counter->failcnt = 0; 3292 break; 3293 default: 3294 BUG(); 3295 } 3296 3297 return nbytes; 3298 } 3299 3300 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3301 struct cftype *cft) 3302 { 3303 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3304 } 3305 3306 #ifdef CONFIG_MMU 3307 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3308 struct cftype *cft, u64 val) 3309 { 3310 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3311 3312 if (val & ~MOVE_MASK) 3313 return -EINVAL; 3314 3315 /* 3316 * No kind of locking is needed in here, because ->can_attach() will 3317 * check this value once in the beginning of the process, and then carry 3318 * on with stale data. This means that changes to this value will only 3319 * affect task migrations starting after the change. 
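 *
 * Accepted values mirror the MOVE_* flags defined above: writing 1
 * (MOVE_ANON) migrates anonymous charges, 2 (MOVE_FILE) migrates file
 * charges, 3 migrates both, and any bit outside MOVE_MASK is rejected
 * with -EINVAL below.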
3320 */ 3321 memcg->move_charge_at_immigrate = val; 3322 return 0; 3323 } 3324 #else 3325 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3326 struct cftype *cft, u64 val) 3327 { 3328 return -ENOSYS; 3329 } 3330 #endif 3331 3332 #ifdef CONFIG_NUMA 3333 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3334 { 3335 struct numa_stat { 3336 const char *name; 3337 unsigned int lru_mask; 3338 }; 3339 3340 static const struct numa_stat stats[] = { 3341 { "total", LRU_ALL }, 3342 { "file", LRU_ALL_FILE }, 3343 { "anon", LRU_ALL_ANON }, 3344 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3345 }; 3346 const struct numa_stat *stat; 3347 int nid; 3348 unsigned long nr; 3349 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3350 3351 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3352 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 3353 seq_printf(m, "%s=%lu", stat->name, nr); 3354 for_each_node_state(nid, N_MEMORY) { 3355 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 3356 stat->lru_mask); 3357 seq_printf(m, " N%d=%lu", nid, nr); 3358 } 3359 seq_putc(m, '\n'); 3360 } 3361 3362 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3363 struct mem_cgroup *iter; 3364 3365 nr = 0; 3366 for_each_mem_cgroup_tree(iter, memcg) 3367 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 3368 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 3369 for_each_node_state(nid, N_MEMORY) { 3370 nr = 0; 3371 for_each_mem_cgroup_tree(iter, memcg) 3372 nr += mem_cgroup_node_nr_lru_pages( 3373 iter, nid, stat->lru_mask); 3374 seq_printf(m, " N%d=%lu", nid, nr); 3375 } 3376 seq_putc(m, '\n'); 3377 } 3378 3379 return 0; 3380 } 3381 #endif /* CONFIG_NUMA */ 3382 3383 /* Universal VM events cgroup1 shows, original sort order */ 3384 static const unsigned int memcg1_events[] = { 3385 PGPGIN, 3386 PGPGOUT, 3387 PGFAULT, 3388 PGMAJFAULT, 3389 }; 3390 3391 static const char *const memcg1_event_names[] = { 3392 "pgpgin", 3393 "pgpgout", 3394 "pgfault", 3395 "pgmajfault", 3396 }; 3397 3398 static int memcg_stat_show(struct seq_file *m, void *v) 3399 { 3400 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3401 unsigned long memory, memsw; 3402 struct mem_cgroup *mi; 3403 unsigned int i; 3404 struct accumulated_stats acc; 3405 3406 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 3407 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3408 3409 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3410 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3411 continue; 3412 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], 3413 memcg_page_state(memcg, memcg1_stats[i]) * 3414 PAGE_SIZE); 3415 } 3416 3417 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3418 seq_printf(m, "%s %lu\n", memcg1_event_names[i], 3419 memcg_sum_events(memcg, memcg1_events[i])); 3420 3421 for (i = 0; i < NR_LRU_LISTS; i++) 3422 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3423 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3424 3425 /* Hierarchical information */ 3426 memory = memsw = PAGE_COUNTER_MAX; 3427 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3428 memory = min(memory, mi->memory.max); 3429 memsw = min(memsw, mi->memsw.max); 3430 } 3431 seq_printf(m, "hierarchical_memory_limit %llu\n", 3432 (u64)memory * PAGE_SIZE); 3433 if (do_memsw_account()) 3434 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3435 (u64)memsw * PAGE_SIZE); 3436 3437 memset(&acc, 0, sizeof(acc)); 3438 acc.stats_size = ARRAY_SIZE(memcg1_stats); 3439 acc.stats_array = 
memcg1_stats; 3440 acc.events_size = ARRAY_SIZE(memcg1_events); 3441 acc.events_array = memcg1_events; 3442 accumulate_memcg_tree(memcg, &acc); 3443 3444 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3445 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3446 continue; 3447 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 3448 (u64)acc.stat[i] * PAGE_SIZE); 3449 } 3450 3451 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3452 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], 3453 (u64)acc.events[i]); 3454 3455 for (i = 0; i < NR_LRU_LISTS; i++) 3456 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], 3457 (u64)acc.lru_pages[i] * PAGE_SIZE); 3458 3459 #ifdef CONFIG_DEBUG_VM 3460 { 3461 pg_data_t *pgdat; 3462 struct mem_cgroup_per_node *mz; 3463 struct zone_reclaim_stat *rstat; 3464 unsigned long recent_rotated[2] = {0, 0}; 3465 unsigned long recent_scanned[2] = {0, 0}; 3466 3467 for_each_online_pgdat(pgdat) { 3468 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 3469 rstat = &mz->lruvec.reclaim_stat; 3470 3471 recent_rotated[0] += rstat->recent_rotated[0]; 3472 recent_rotated[1] += rstat->recent_rotated[1]; 3473 recent_scanned[0] += rstat->recent_scanned[0]; 3474 recent_scanned[1] += rstat->recent_scanned[1]; 3475 } 3476 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 3477 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 3478 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 3479 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 3480 } 3481 #endif 3482 3483 return 0; 3484 } 3485 3486 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3487 struct cftype *cft) 3488 { 3489 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3490 3491 return mem_cgroup_swappiness(memcg); 3492 } 3493 3494 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3495 struct cftype *cft, u64 val) 3496 { 3497 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3498 3499 if (val > 100) 3500 return -EINVAL; 3501 3502 if (css->parent) 3503 memcg->swappiness = val; 3504 else 3505 vm_swappiness = val; 3506 3507 return 0; 3508 } 3509 3510 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3511 { 3512 struct mem_cgroup_threshold_ary *t; 3513 unsigned long usage; 3514 int i; 3515 3516 rcu_read_lock(); 3517 if (!swap) 3518 t = rcu_dereference(memcg->thresholds.primary); 3519 else 3520 t = rcu_dereference(memcg->memsw_thresholds.primary); 3521 3522 if (!t) 3523 goto unlock; 3524 3525 usage = mem_cgroup_usage(memcg, swap); 3526 3527 /* 3528 * current_threshold points to threshold just below or equal to usage. 3529 * If it's not true, a threshold was crossed after last 3530 * call of __mem_cgroup_threshold(). 3531 */ 3532 i = t->current_threshold; 3533 3534 /* 3535 * Iterate backward over array of thresholds starting from 3536 * current_threshold and check if a threshold is crossed. 3537 * If none of thresholds below usage is crossed, we read 3538 * only one element of the array here. 3539 */ 3540 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3541 eventfd_signal(t->entries[i].eventfd, 1); 3542 3543 /* i = current_threshold + 1 */ 3544 i++; 3545 3546 /* 3547 * Iterate forward over array of thresholds starting from 3548 * current_threshold+1 and check if a threshold is crossed. 3549 * If none of thresholds above usage is crossed, we read 3550 * only one element of the array here. 
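 *
 * Worked example (illustrative numbers, in pages): with sorted
 * thresholds {100, 200, 300} and current_threshold == 1, a usage of
 * 350 makes this forward walk signal the 300 entry and leave
 * current_threshold at 2; had usage dropped to 150 instead, the
 * backward walk above would have signalled the 200 entry and
 * current_threshold would end up at 0.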
3551 */ 3552 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3553 eventfd_signal(t->entries[i].eventfd, 1); 3554 3555 /* Update current_threshold */ 3556 t->current_threshold = i - 1; 3557 unlock: 3558 rcu_read_unlock(); 3559 } 3560 3561 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3562 { 3563 while (memcg) { 3564 __mem_cgroup_threshold(memcg, false); 3565 if (do_memsw_account()) 3566 __mem_cgroup_threshold(memcg, true); 3567 3568 memcg = parent_mem_cgroup(memcg); 3569 } 3570 } 3571 3572 static int compare_thresholds(const void *a, const void *b) 3573 { 3574 const struct mem_cgroup_threshold *_a = a; 3575 const struct mem_cgroup_threshold *_b = b; 3576 3577 if (_a->threshold > _b->threshold) 3578 return 1; 3579 3580 if (_a->threshold < _b->threshold) 3581 return -1; 3582 3583 return 0; 3584 } 3585 3586 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 3587 { 3588 struct mem_cgroup_eventfd_list *ev; 3589 3590 spin_lock(&memcg_oom_lock); 3591 3592 list_for_each_entry(ev, &memcg->oom_notify, list) 3593 eventfd_signal(ev->eventfd, 1); 3594 3595 spin_unlock(&memcg_oom_lock); 3596 return 0; 3597 } 3598 3599 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 3600 { 3601 struct mem_cgroup *iter; 3602 3603 for_each_mem_cgroup_tree(iter, memcg) 3604 mem_cgroup_oom_notify_cb(iter); 3605 } 3606 3607 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3608 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 3609 { 3610 struct mem_cgroup_thresholds *thresholds; 3611 struct mem_cgroup_threshold_ary *new; 3612 unsigned long threshold; 3613 unsigned long usage; 3614 int i, size, ret; 3615 3616 ret = page_counter_memparse(args, "-1", &threshold); 3617 if (ret) 3618 return ret; 3619 3620 mutex_lock(&memcg->thresholds_lock); 3621 3622 if (type == _MEM) { 3623 thresholds = &memcg->thresholds; 3624 usage = mem_cgroup_usage(memcg, false); 3625 } else if (type == _MEMSWAP) { 3626 thresholds = &memcg->memsw_thresholds; 3627 usage = mem_cgroup_usage(memcg, true); 3628 } else 3629 BUG(); 3630 3631 /* Check if a threshold crossed before adding a new one */ 3632 if (thresholds->primary) 3633 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3634 3635 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 3636 3637 /* Allocate memory for new array of thresholds */ 3638 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 3639 if (!new) { 3640 ret = -ENOMEM; 3641 goto unlock; 3642 } 3643 new->size = size; 3644 3645 /* Copy thresholds (if any) to new array */ 3646 if (thresholds->primary) { 3647 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 3648 sizeof(struct mem_cgroup_threshold)); 3649 } 3650 3651 /* Add new threshold */ 3652 new->entries[size - 1].eventfd = eventfd; 3653 new->entries[size - 1].threshold = threshold; 3654 3655 /* Sort thresholds. Registering of new threshold isn't time-critical */ 3656 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 3657 compare_thresholds, NULL); 3658 3659 /* Find current threshold */ 3660 new->current_threshold = -1; 3661 for (i = 0; i < size; i++) { 3662 if (new->entries[i].threshold <= usage) { 3663 /* 3664 * new->current_threshold will not be used until 3665 * rcu_assign_pointer(), so it's safe to increment 3666 * it here. 
3667 */ 3668 ++new->current_threshold; 3669 } else 3670 break; 3671 } 3672 3673 /* Free old spare buffer and save old primary buffer as spare */ 3674 kfree(thresholds->spare); 3675 thresholds->spare = thresholds->primary; 3676 3677 rcu_assign_pointer(thresholds->primary, new); 3678 3679 /* To be sure that nobody uses thresholds */ 3680 synchronize_rcu(); 3681 3682 unlock: 3683 mutex_unlock(&memcg->thresholds_lock); 3684 3685 return ret; 3686 } 3687 3688 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 3689 struct eventfd_ctx *eventfd, const char *args) 3690 { 3691 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 3692 } 3693 3694 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 3695 struct eventfd_ctx *eventfd, const char *args) 3696 { 3697 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 3698 } 3699 3700 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3701 struct eventfd_ctx *eventfd, enum res_type type) 3702 { 3703 struct mem_cgroup_thresholds *thresholds; 3704 struct mem_cgroup_threshold_ary *new; 3705 unsigned long usage; 3706 int i, j, size; 3707 3708 mutex_lock(&memcg->thresholds_lock); 3709 3710 if (type == _MEM) { 3711 thresholds = &memcg->thresholds; 3712 usage = mem_cgroup_usage(memcg, false); 3713 } else if (type == _MEMSWAP) { 3714 thresholds = &memcg->memsw_thresholds; 3715 usage = mem_cgroup_usage(memcg, true); 3716 } else 3717 BUG(); 3718 3719 if (!thresholds->primary) 3720 goto unlock; 3721 3722 /* Check if a threshold crossed before removing */ 3723 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 3724 3725 /* Calculate new number of threshold */ 3726 size = 0; 3727 for (i = 0; i < thresholds->primary->size; i++) { 3728 if (thresholds->primary->entries[i].eventfd != eventfd) 3729 size++; 3730 } 3731 3732 new = thresholds->spare; 3733 3734 /* Set thresholds array to NULL if we don't have thresholds */ 3735 if (!size) { 3736 kfree(new); 3737 new = NULL; 3738 goto swap_buffers; 3739 } 3740 3741 new->size = size; 3742 3743 /* Copy thresholds and find current threshold */ 3744 new->current_threshold = -1; 3745 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 3746 if (thresholds->primary->entries[i].eventfd == eventfd) 3747 continue; 3748 3749 new->entries[j] = thresholds->primary->entries[i]; 3750 if (new->entries[j].threshold <= usage) { 3751 /* 3752 * new->current_threshold will not be used 3753 * until rcu_assign_pointer(), so it's safe to increment 3754 * it here. 
3755 */ 3756 ++new->current_threshold; 3757 } 3758 j++; 3759 } 3760 3761 swap_buffers: 3762 /* Swap primary and spare array */ 3763 thresholds->spare = thresholds->primary; 3764 3765 rcu_assign_pointer(thresholds->primary, new); 3766 3767 /* To be sure that nobody uses thresholds */ 3768 synchronize_rcu(); 3769 3770 /* If all events are unregistered, free the spare array */ 3771 if (!new) { 3772 kfree(thresholds->spare); 3773 thresholds->spare = NULL; 3774 } 3775 unlock: 3776 mutex_unlock(&memcg->thresholds_lock); 3777 } 3778 3779 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3780 struct eventfd_ctx *eventfd) 3781 { 3782 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 3783 } 3784 3785 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 3786 struct eventfd_ctx *eventfd) 3787 { 3788 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 3789 } 3790 3791 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 3792 struct eventfd_ctx *eventfd, const char *args) 3793 { 3794 struct mem_cgroup_eventfd_list *event; 3795 3796 event = kmalloc(sizeof(*event), GFP_KERNEL); 3797 if (!event) 3798 return -ENOMEM; 3799 3800 spin_lock(&memcg_oom_lock); 3801 3802 event->eventfd = eventfd; 3803 list_add(&event->list, &memcg->oom_notify); 3804 3805 /* already in OOM ? */ 3806 if (memcg->under_oom) 3807 eventfd_signal(eventfd, 1); 3808 spin_unlock(&memcg_oom_lock); 3809 3810 return 0; 3811 } 3812 3813 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 3814 struct eventfd_ctx *eventfd) 3815 { 3816 struct mem_cgroup_eventfd_list *ev, *tmp; 3817 3818 spin_lock(&memcg_oom_lock); 3819 3820 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 3821 if (ev->eventfd == eventfd) { 3822 list_del(&ev->list); 3823 kfree(ev); 3824 } 3825 } 3826 3827 spin_unlock(&memcg_oom_lock); 3828 } 3829 3830 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3831 { 3832 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 3833 3834 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3835 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 3836 seq_printf(sf, "oom_kill %lu\n", 3837 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 3838 return 0; 3839 } 3840 3841 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 3842 struct cftype *cft, u64 val) 3843 { 3844 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3845 3846 /* cannot set to root cgroup and only 0 and 1 are allowed */ 3847 if (!css->parent || !((val == 0) || (val == 1))) 3848 return -EINVAL; 3849 3850 memcg->oom_kill_disable = val; 3851 if (!val) 3852 memcg_oom_recover(memcg); 3853 3854 return 0; 3855 } 3856 3857 #ifdef CONFIG_CGROUP_WRITEBACK 3858 3859 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3860 { 3861 return wb_domain_init(&memcg->cgwb_domain, gfp); 3862 } 3863 3864 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3865 { 3866 wb_domain_exit(&memcg->cgwb_domain); 3867 } 3868 3869 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3870 { 3871 wb_domain_size_changed(&memcg->cgwb_domain); 3872 } 3873 3874 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 3875 { 3876 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3877 3878 if (!memcg->css.parent) 3879 return NULL; 3880 3881 return &memcg->cgwb_domain; 3882 } 3883 3884 /** 3885 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 3886 * @wb: 
bdi_writeback in question 3887 * @pfilepages: out parameter for number of file pages 3888 * @pheadroom: out parameter for number of allocatable pages according to memcg 3889 * @pdirty: out parameter for number of dirty pages 3890 * @pwriteback: out parameter for number of pages under writeback 3891 * 3892 * Determine the numbers of file, headroom, dirty, and writeback pages in 3893 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 3894 * is a bit more involved. 3895 * 3896 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 3897 * headroom is calculated as the lowest headroom of itself and the 3898 * ancestors. Note that this doesn't consider the actual amount of 3899 * available memory in the system. The caller should further cap 3900 * *@pheadroom accordingly. 3901 */ 3902 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 3903 unsigned long *pheadroom, unsigned long *pdirty, 3904 unsigned long *pwriteback) 3905 { 3906 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3907 struct mem_cgroup *parent; 3908 3909 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 3910 3911 /* this should eventually include NR_UNSTABLE_NFS */ 3912 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 3913 *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3914 (1 << LRU_ACTIVE_FILE)); 3915 *pheadroom = PAGE_COUNTER_MAX; 3916 3917 while ((parent = parent_mem_cgroup(memcg))) { 3918 unsigned long ceiling = min(memcg->memory.max, memcg->high); 3919 unsigned long used = page_counter_read(&memcg->memory); 3920 3921 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 3922 memcg = parent; 3923 } 3924 } 3925 3926 #else /* CONFIG_CGROUP_WRITEBACK */ 3927 3928 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3929 { 3930 return 0; 3931 } 3932 3933 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3934 { 3935 } 3936 3937 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3938 { 3939 } 3940 3941 #endif /* CONFIG_CGROUP_WRITEBACK */ 3942 3943 /* 3944 * DO NOT USE IN NEW FILES. 3945 * 3946 * "cgroup.event_control" implementation. 3947 * 3948 * This is way over-engineered. It tries to support fully configurable 3949 * events for each user. Such level of flexibility is completely 3950 * unnecessary especially in the light of the planned unified hierarchy. 3951 * 3952 * Please deprecate this and replace with something simpler if at all 3953 * possible. 3954 */ 3955 3956 /* 3957 * Unregister event and free resources. 3958 * 3959 * Gets called from workqueue. 3960 */ 3961 static void memcg_event_remove(struct work_struct *work) 3962 { 3963 struct mem_cgroup_event *event = 3964 container_of(work, struct mem_cgroup_event, remove); 3965 struct mem_cgroup *memcg = event->memcg; 3966 3967 remove_wait_queue(event->wqh, &event->wait); 3968 3969 event->unregister_event(memcg, event->eventfd); 3970 3971 /* Notify userspace the event is going away. */ 3972 eventfd_signal(event->eventfd, 1); 3973 3974 eventfd_ctx_put(event->eventfd); 3975 kfree(event); 3976 css_put(&memcg->css); 3977 } 3978 3979 /* 3980 * Gets called on EPOLLHUP on eventfd when user closes it. 3981 * 3982 * Called with wqh->lock held and interrupts disabled. 
3983 */ 3984 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 3985 int sync, void *key) 3986 { 3987 struct mem_cgroup_event *event = 3988 container_of(wait, struct mem_cgroup_event, wait); 3989 struct mem_cgroup *memcg = event->memcg; 3990 __poll_t flags = key_to_poll(key); 3991 3992 if (flags & EPOLLHUP) { 3993 /* 3994 * If the event has been detached at cgroup removal, we 3995 * can simply return knowing the other side will cleanup 3996 * for us. 3997 * 3998 * We can't race against event freeing since the other 3999 * side will require wqh->lock via remove_wait_queue(), 4000 * which we hold. 4001 */ 4002 spin_lock(&memcg->event_list_lock); 4003 if (!list_empty(&event->list)) { 4004 list_del_init(&event->list); 4005 /* 4006 * We are in atomic context, but cgroup_event_remove() 4007 * may sleep, so we have to call it in workqueue. 4008 */ 4009 schedule_work(&event->remove); 4010 } 4011 spin_unlock(&memcg->event_list_lock); 4012 } 4013 4014 return 0; 4015 } 4016 4017 static void memcg_event_ptable_queue_proc(struct file *file, 4018 wait_queue_head_t *wqh, poll_table *pt) 4019 { 4020 struct mem_cgroup_event *event = 4021 container_of(pt, struct mem_cgroup_event, pt); 4022 4023 event->wqh = wqh; 4024 add_wait_queue(wqh, &event->wait); 4025 } 4026 4027 /* 4028 * DO NOT USE IN NEW FILES. 4029 * 4030 * Parse input and register new cgroup event handler. 4031 * 4032 * Input must be in format '<event_fd> <control_fd> <args>'. 4033 * Interpretation of args is defined by control file implementation. 4034 */ 4035 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4036 char *buf, size_t nbytes, loff_t off) 4037 { 4038 struct cgroup_subsys_state *css = of_css(of); 4039 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4040 struct mem_cgroup_event *event; 4041 struct cgroup_subsys_state *cfile_css; 4042 unsigned int efd, cfd; 4043 struct fd efile; 4044 struct fd cfile; 4045 const char *name; 4046 char *endp; 4047 int ret; 4048 4049 buf = strstrip(buf); 4050 4051 efd = simple_strtoul(buf, &endp, 10); 4052 if (*endp != ' ') 4053 return -EINVAL; 4054 buf = endp + 1; 4055 4056 cfd = simple_strtoul(buf, &endp, 10); 4057 if ((*endp != ' ') && (*endp != '\0')) 4058 return -EINVAL; 4059 buf = endp + 1; 4060 4061 event = kzalloc(sizeof(*event), GFP_KERNEL); 4062 if (!event) 4063 return -ENOMEM; 4064 4065 event->memcg = memcg; 4066 INIT_LIST_HEAD(&event->list); 4067 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4068 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4069 INIT_WORK(&event->remove, memcg_event_remove); 4070 4071 efile = fdget(efd); 4072 if (!efile.file) { 4073 ret = -EBADF; 4074 goto out_kfree; 4075 } 4076 4077 event->eventfd = eventfd_ctx_fileget(efile.file); 4078 if (IS_ERR(event->eventfd)) { 4079 ret = PTR_ERR(event->eventfd); 4080 goto out_put_efile; 4081 } 4082 4083 cfile = fdget(cfd); 4084 if (!cfile.file) { 4085 ret = -EBADF; 4086 goto out_put_eventfd; 4087 } 4088 4089 /* the process need read permission on control file */ 4090 /* AV: shouldn't we check that it's been opened for read instead? */ 4091 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4092 if (ret < 0) 4093 goto out_put_cfile; 4094 4095 /* 4096 * Determine the event callbacks and set them in @event. This used 4097 * to be done via struct cftype but cgroup core no longer knows 4098 * about these events. The following is crude but the whole thing 4099 * is for compatibility anyway. 4100 * 4101 * DO NOT ADD NEW FILES. 
4102 */ 4103 name = cfile.file->f_path.dentry->d_name.name; 4104 4105 if (!strcmp(name, "memory.usage_in_bytes")) { 4106 event->register_event = mem_cgroup_usage_register_event; 4107 event->unregister_event = mem_cgroup_usage_unregister_event; 4108 } else if (!strcmp(name, "memory.oom_control")) { 4109 event->register_event = mem_cgroup_oom_register_event; 4110 event->unregister_event = mem_cgroup_oom_unregister_event; 4111 } else if (!strcmp(name, "memory.pressure_level")) { 4112 event->register_event = vmpressure_register_event; 4113 event->unregister_event = vmpressure_unregister_event; 4114 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4115 event->register_event = memsw_cgroup_usage_register_event; 4116 event->unregister_event = memsw_cgroup_usage_unregister_event; 4117 } else { 4118 ret = -EINVAL; 4119 goto out_put_cfile; 4120 } 4121 4122 /* 4123 * Verify @cfile should belong to @css. Also, remaining events are 4124 * automatically removed on cgroup destruction but the removal is 4125 * asynchronous, so take an extra ref on @css. 4126 */ 4127 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4128 &memory_cgrp_subsys); 4129 ret = -EINVAL; 4130 if (IS_ERR(cfile_css)) 4131 goto out_put_cfile; 4132 if (cfile_css != css) { 4133 css_put(cfile_css); 4134 goto out_put_cfile; 4135 } 4136 4137 ret = event->register_event(memcg, event->eventfd, buf); 4138 if (ret) 4139 goto out_put_css; 4140 4141 vfs_poll(efile.file, &event->pt); 4142 4143 spin_lock(&memcg->event_list_lock); 4144 list_add(&event->list, &memcg->event_list); 4145 spin_unlock(&memcg->event_list_lock); 4146 4147 fdput(cfile); 4148 fdput(efile); 4149 4150 return nbytes; 4151 4152 out_put_css: 4153 css_put(css); 4154 out_put_cfile: 4155 fdput(cfile); 4156 out_put_eventfd: 4157 eventfd_ctx_put(event->eventfd); 4158 out_put_efile: 4159 fdput(efile); 4160 out_kfree: 4161 kfree(event); 4162 4163 return ret; 4164 } 4165 4166 static struct cftype mem_cgroup_legacy_files[] = { 4167 { 4168 .name = "usage_in_bytes", 4169 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4170 .read_u64 = mem_cgroup_read_u64, 4171 }, 4172 { 4173 .name = "max_usage_in_bytes", 4174 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4175 .write = mem_cgroup_reset, 4176 .read_u64 = mem_cgroup_read_u64, 4177 }, 4178 { 4179 .name = "limit_in_bytes", 4180 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4181 .write = mem_cgroup_write, 4182 .read_u64 = mem_cgroup_read_u64, 4183 }, 4184 { 4185 .name = "soft_limit_in_bytes", 4186 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4187 .write = mem_cgroup_write, 4188 .read_u64 = mem_cgroup_read_u64, 4189 }, 4190 { 4191 .name = "failcnt", 4192 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4193 .write = mem_cgroup_reset, 4194 .read_u64 = mem_cgroup_read_u64, 4195 }, 4196 { 4197 .name = "stat", 4198 .seq_show = memcg_stat_show, 4199 }, 4200 { 4201 .name = "force_empty", 4202 .write = mem_cgroup_force_empty_write, 4203 }, 4204 { 4205 .name = "use_hierarchy", 4206 .write_u64 = mem_cgroup_hierarchy_write, 4207 .read_u64 = mem_cgroup_hierarchy_read, 4208 }, 4209 { 4210 .name = "cgroup.event_control", /* XXX: for compat */ 4211 .write = memcg_write_event_control, 4212 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4213 }, 4214 { 4215 .name = "swappiness", 4216 .read_u64 = mem_cgroup_swappiness_read, 4217 .write_u64 = mem_cgroup_swappiness_write, 4218 }, 4219 { 4220 .name = "move_charge_at_immigrate", 4221 .read_u64 = mem_cgroup_move_charge_read, 4222 .write_u64 = mem_cgroup_move_charge_write, 4223 
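		/*
		 * The value written here is a bitmask of the MOVE_ANON and
		 * MOVE_FILE flags; it is sampled as move_flags in
		 * mem_cgroup_can_attach() below.
		 */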
}, 4224 { 4225 .name = "oom_control", 4226 .seq_show = mem_cgroup_oom_control_read, 4227 .write_u64 = mem_cgroup_oom_control_write, 4228 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4229 }, 4230 { 4231 .name = "pressure_level", 4232 }, 4233 #ifdef CONFIG_NUMA 4234 { 4235 .name = "numa_stat", 4236 .seq_show = memcg_numa_stat_show, 4237 }, 4238 #endif 4239 { 4240 .name = "kmem.limit_in_bytes", 4241 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4242 .write = mem_cgroup_write, 4243 .read_u64 = mem_cgroup_read_u64, 4244 }, 4245 { 4246 .name = "kmem.usage_in_bytes", 4247 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4248 .read_u64 = mem_cgroup_read_u64, 4249 }, 4250 { 4251 .name = "kmem.failcnt", 4252 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4253 .write = mem_cgroup_reset, 4254 .read_u64 = mem_cgroup_read_u64, 4255 }, 4256 { 4257 .name = "kmem.max_usage_in_bytes", 4258 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4259 .write = mem_cgroup_reset, 4260 .read_u64 = mem_cgroup_read_u64, 4261 }, 4262 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) 4263 { 4264 .name = "kmem.slabinfo", 4265 .seq_start = memcg_slab_start, 4266 .seq_next = memcg_slab_next, 4267 .seq_stop = memcg_slab_stop, 4268 .seq_show = memcg_slab_show, 4269 }, 4270 #endif 4271 { 4272 .name = "kmem.tcp.limit_in_bytes", 4273 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 4274 .write = mem_cgroup_write, 4275 .read_u64 = mem_cgroup_read_u64, 4276 }, 4277 { 4278 .name = "kmem.tcp.usage_in_bytes", 4279 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 4280 .read_u64 = mem_cgroup_read_u64, 4281 }, 4282 { 4283 .name = "kmem.tcp.failcnt", 4284 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 4285 .write = mem_cgroup_reset, 4286 .read_u64 = mem_cgroup_read_u64, 4287 }, 4288 { 4289 .name = "kmem.tcp.max_usage_in_bytes", 4290 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 4291 .write = mem_cgroup_reset, 4292 .read_u64 = mem_cgroup_read_u64, 4293 }, 4294 { }, /* terminate */ 4295 }; 4296 4297 /* 4298 * Private memory cgroup IDR 4299 * 4300 * Swap-out records and page cache shadow entries need to store memcg 4301 * references in constrained space, so we maintain an ID space that is 4302 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 4303 * memory-controlled cgroups to 64k. 4304 * 4305 * However, there usually are many references to the oflline CSS after 4306 * the cgroup has been destroyed, such as page cache or reclaimable 4307 * slab objects, that don't need to hang on to the ID. We want to keep 4308 * those dead CSS from occupying IDs, or we might quickly exhaust the 4309 * relatively small ID space and prevent the creation of new cgroups 4310 * even when there are much fewer than 64k cgroups - possibly none. 4311 * 4312 * Maintain a private 16-bit ID space for memcg, and allow the ID to 4313 * be freed and recycled when it's no longer needed, which is usually 4314 * when the CSS is offlined. 4315 * 4316 * The only exception to that are records of swapped out tmpfs/shmem 4317 * pages that need to be attributed to live ancestors on swapin. But 4318 * those references are manageable from userspace. 
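 *
 * Lifecycle, as implemented below: the ID is allocated in
 * mem_cgroup_alloc() (pointing at NULL until idr_replace() publishes the
 * fully initialized memcg), its base reference is set in
 * mem_cgroup_css_online() and dropped in mem_cgroup_css_offline(), and
 * additional references for swap-out records are taken and released with
 * mem_cgroup_id_get_many()/mem_cgroup_id_put_many().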
4319 */ 4320 4321 static DEFINE_IDR(mem_cgroup_idr); 4322 4323 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 4324 { 4325 if (memcg->id.id > 0) { 4326 idr_remove(&mem_cgroup_idr, memcg->id.id); 4327 memcg->id.id = 0; 4328 } 4329 } 4330 4331 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) 4332 { 4333 refcount_add(n, &memcg->id.ref); 4334 } 4335 4336 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 4337 { 4338 if (refcount_sub_and_test(n, &memcg->id.ref)) { 4339 mem_cgroup_id_remove(memcg); 4340 4341 /* Memcg ID pins CSS */ 4342 css_put(&memcg->css); 4343 } 4344 } 4345 4346 static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) 4347 { 4348 mem_cgroup_id_get_many(memcg, 1); 4349 } 4350 4351 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4352 { 4353 mem_cgroup_id_put_many(memcg, 1); 4354 } 4355 4356 /** 4357 * mem_cgroup_from_id - look up a memcg from a memcg id 4358 * @id: the memcg id to look up 4359 * 4360 * Caller must hold rcu_read_lock(). 4361 */ 4362 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 4363 { 4364 WARN_ON_ONCE(!rcu_read_lock_held()); 4365 return idr_find(&mem_cgroup_idr, id); 4366 } 4367 4368 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4369 { 4370 struct mem_cgroup_per_node *pn; 4371 int tmp = node; 4372 /* 4373 * This routine is called against possible nodes. 4374 * But it's BUG to call kmalloc() against offline node. 4375 * 4376 * TODO: this routine can waste much memory for nodes which will 4377 * never be onlined. It's better to use memory hotplug callback 4378 * function. 4379 */ 4380 if (!node_state(node, N_NORMAL_MEMORY)) 4381 tmp = -1; 4382 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4383 if (!pn) 4384 return 1; 4385 4386 pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); 4387 if (!pn->lruvec_stat_cpu) { 4388 kfree(pn); 4389 return 1; 4390 } 4391 4392 lruvec_init(&pn->lruvec); 4393 pn->usage_in_excess = 0; 4394 pn->on_tree = false; 4395 pn->memcg = memcg; 4396 4397 memcg->nodeinfo[node] = pn; 4398 return 0; 4399 } 4400 4401 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4402 { 4403 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 4404 4405 if (!pn) 4406 return; 4407 4408 free_percpu(pn->lruvec_stat_cpu); 4409 kfree(pn); 4410 } 4411 4412 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4413 { 4414 int node; 4415 4416 for_each_node(node) 4417 free_mem_cgroup_per_node_info(memcg, node); 4418 free_percpu(memcg->stat_cpu); 4419 kfree(memcg); 4420 } 4421 4422 static void mem_cgroup_free(struct mem_cgroup *memcg) 4423 { 4424 memcg_wb_domain_exit(memcg); 4425 __mem_cgroup_free(memcg); 4426 } 4427 4428 static struct mem_cgroup *mem_cgroup_alloc(void) 4429 { 4430 struct mem_cgroup *memcg; 4431 size_t size; 4432 int node; 4433 4434 size = sizeof(struct mem_cgroup); 4435 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 4436 4437 memcg = kzalloc(size, GFP_KERNEL); 4438 if (!memcg) 4439 return NULL; 4440 4441 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 4442 1, MEM_CGROUP_ID_MAX, 4443 GFP_KERNEL); 4444 if (memcg->id.id < 0) 4445 goto fail; 4446 4447 memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); 4448 if (!memcg->stat_cpu) 4449 goto fail; 4450 4451 for_each_node(node) 4452 if (alloc_mem_cgroup_per_node_info(memcg, node)) 4453 goto fail; 4454 4455 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 4456 goto fail; 4457 4458 INIT_WORK(&memcg->high_work, high_work_func); 4459 
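	/*
	 * Note that the ID slot reserved above still maps to NULL; the memcg
	 * only becomes visible to mem_cgroup_from_id() through the
	 * idr_replace() call at the end of this function, once all of the
	 * initialization has succeeded.
	 */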
memcg->last_scanned_node = MAX_NUMNODES; 4460 INIT_LIST_HEAD(&memcg->oom_notify); 4461 mutex_init(&memcg->thresholds_lock); 4462 spin_lock_init(&memcg->move_lock); 4463 vmpressure_init(&memcg->vmpressure); 4464 INIT_LIST_HEAD(&memcg->event_list); 4465 spin_lock_init(&memcg->event_list_lock); 4466 memcg->socket_pressure = jiffies; 4467 #ifdef CONFIG_MEMCG_KMEM 4468 memcg->kmemcg_id = -1; 4469 #endif 4470 #ifdef CONFIG_CGROUP_WRITEBACK 4471 INIT_LIST_HEAD(&memcg->cgwb_list); 4472 #endif 4473 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 4474 return memcg; 4475 fail: 4476 mem_cgroup_id_remove(memcg); 4477 __mem_cgroup_free(memcg); 4478 return NULL; 4479 } 4480 4481 static struct cgroup_subsys_state * __ref 4482 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4483 { 4484 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 4485 struct mem_cgroup *memcg; 4486 long error = -ENOMEM; 4487 4488 memcg = mem_cgroup_alloc(); 4489 if (!memcg) 4490 return ERR_PTR(error); 4491 4492 memcg->high = PAGE_COUNTER_MAX; 4493 memcg->soft_limit = PAGE_COUNTER_MAX; 4494 if (parent) { 4495 memcg->swappiness = mem_cgroup_swappiness(parent); 4496 memcg->oom_kill_disable = parent->oom_kill_disable; 4497 } 4498 if (parent && parent->use_hierarchy) { 4499 memcg->use_hierarchy = true; 4500 page_counter_init(&memcg->memory, &parent->memory); 4501 page_counter_init(&memcg->swap, &parent->swap); 4502 page_counter_init(&memcg->memsw, &parent->memsw); 4503 page_counter_init(&memcg->kmem, &parent->kmem); 4504 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 4505 } else { 4506 page_counter_init(&memcg->memory, NULL); 4507 page_counter_init(&memcg->swap, NULL); 4508 page_counter_init(&memcg->memsw, NULL); 4509 page_counter_init(&memcg->kmem, NULL); 4510 page_counter_init(&memcg->tcpmem, NULL); 4511 /* 4512 * Deeper hierachy with use_hierarchy == false doesn't make 4513 * much sense so let cgroup subsystem know about this 4514 * unfortunate state in our controller. 4515 */ 4516 if (parent != root_mem_cgroup) 4517 memory_cgrp_subsys.broken_hierarchy = true; 4518 } 4519 4520 /* The following stuff does not apply to the root */ 4521 if (!parent) { 4522 root_mem_cgroup = memcg; 4523 return &memcg->css; 4524 } 4525 4526 error = memcg_online_kmem(memcg); 4527 if (error) 4528 goto fail; 4529 4530 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 4531 static_branch_inc(&memcg_sockets_enabled_key); 4532 4533 return &memcg->css; 4534 fail: 4535 mem_cgroup_id_remove(memcg); 4536 mem_cgroup_free(memcg); 4537 return ERR_PTR(-ENOMEM); 4538 } 4539 4540 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 4541 { 4542 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4543 4544 /* 4545 * A memcg must be visible for memcg_expand_shrinker_maps() 4546 * by the time the maps are allocated. So, we allocate maps 4547 * here, when for_each_mem_cgroup() can't skip it. 4548 */ 4549 if (memcg_alloc_shrinker_maps(memcg)) { 4550 mem_cgroup_id_remove(memcg); 4551 return -ENOMEM; 4552 } 4553 4554 /* Online state pins memcg ID, memcg ID pins CSS */ 4555 refcount_set(&memcg->id.ref, 1); 4556 css_get(css); 4557 return 0; 4558 } 4559 4560 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4561 { 4562 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4563 struct mem_cgroup_event *event, *tmp; 4564 4565 /* 4566 * Unregister events and notify userspace. 
4567 * Notify userspace about cgroup removing only after rmdir of cgroup 4568 * directory to avoid race between userspace and kernelspace. 4569 */ 4570 spin_lock(&memcg->event_list_lock); 4571 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 4572 list_del_init(&event->list); 4573 schedule_work(&event->remove); 4574 } 4575 spin_unlock(&memcg->event_list_lock); 4576 4577 page_counter_set_min(&memcg->memory, 0); 4578 page_counter_set_low(&memcg->memory, 0); 4579 4580 memcg_offline_kmem(memcg); 4581 wb_memcg_offline(memcg); 4582 4583 drain_all_stock(memcg); 4584 4585 mem_cgroup_id_put(memcg); 4586 } 4587 4588 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 4589 { 4590 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4591 4592 invalidate_reclaim_iterators(memcg); 4593 } 4594 4595 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4596 { 4597 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4598 4599 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 4600 static_branch_dec(&memcg_sockets_enabled_key); 4601 4602 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 4603 static_branch_dec(&memcg_sockets_enabled_key); 4604 4605 vmpressure_cleanup(&memcg->vmpressure); 4606 cancel_work_sync(&memcg->high_work); 4607 mem_cgroup_remove_from_trees(memcg); 4608 memcg_free_shrinker_maps(memcg); 4609 memcg_free_kmem(memcg); 4610 mem_cgroup_free(memcg); 4611 } 4612 4613 /** 4614 * mem_cgroup_css_reset - reset the states of a mem_cgroup 4615 * @css: the target css 4616 * 4617 * Reset the states of the mem_cgroup associated with @css. This is 4618 * invoked when the userland requests disabling on the default hierarchy 4619 * but the memcg is pinned through dependency. The memcg should stop 4620 * applying policies and should revert to the vanilla state as it may be 4621 * made visible again. 4622 * 4623 * The current implementation only resets the essential configurations. 4624 * This needs to be expanded to cover all the visible parts. 4625 */ 4626 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 4627 { 4628 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4629 4630 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 4631 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 4632 page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); 4633 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 4634 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 4635 page_counter_set_min(&memcg->memory, 0); 4636 page_counter_set_low(&memcg->memory, 0); 4637 memcg->high = PAGE_COUNTER_MAX; 4638 memcg->soft_limit = PAGE_COUNTER_MAX; 4639 memcg_wb_domain_size_changed(memcg); 4640 } 4641 4642 #ifdef CONFIG_MMU 4643 /* Handlers for move charge at task migration. 
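 *
 * Rough flow, as implemented below: mem_cgroup_can_attach() records the
 * source and destination memcgs in "mc" and precharges the destination
 * for every movable pte found by mem_cgroup_count_precharge();
 * mem_cgroup_move_task() (the post_attach hook) then walks the page
 * tables again and moves the accounting page by page, and
 * mem_cgroup_cancel_attach()/mem_cgroup_clear_mc() return any leftover
 * precharges.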
*/ 4644 static int mem_cgroup_do_precharge(unsigned long count) 4645 { 4646 int ret; 4647 4648 /* Try a single bulk charge without reclaim first, kswapd may wake */ 4649 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 4650 if (!ret) { 4651 mc.precharge += count; 4652 return ret; 4653 } 4654 4655 /* Try charges one by one with reclaim, but do not retry */ 4656 while (count--) { 4657 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 4658 if (ret) 4659 return ret; 4660 mc.precharge++; 4661 cond_resched(); 4662 } 4663 return 0; 4664 } 4665 4666 union mc_target { 4667 struct page *page; 4668 swp_entry_t ent; 4669 }; 4670 4671 enum mc_target_type { 4672 MC_TARGET_NONE = 0, 4673 MC_TARGET_PAGE, 4674 MC_TARGET_SWAP, 4675 MC_TARGET_DEVICE, 4676 }; 4677 4678 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 4679 unsigned long addr, pte_t ptent) 4680 { 4681 struct page *page = _vm_normal_page(vma, addr, ptent, true); 4682 4683 if (!page || !page_mapped(page)) 4684 return NULL; 4685 if (PageAnon(page)) { 4686 if (!(mc.flags & MOVE_ANON)) 4687 return NULL; 4688 } else { 4689 if (!(mc.flags & MOVE_FILE)) 4690 return NULL; 4691 } 4692 if (!get_page_unless_zero(page)) 4693 return NULL; 4694 4695 return page; 4696 } 4697 4698 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 4699 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4700 pte_t ptent, swp_entry_t *entry) 4701 { 4702 struct page *page = NULL; 4703 swp_entry_t ent = pte_to_swp_entry(ptent); 4704 4705 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) 4706 return NULL; 4707 4708 /* 4709 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 4710 * a device and because they are not accessible by CPU they are store 4711 * as special swap entry in the CPU page table. 4712 */ 4713 if (is_device_private_entry(ent)) { 4714 page = device_private_entry_to_page(ent); 4715 /* 4716 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 4717 * a refcount of 1 when free (unlike normal page) 4718 */ 4719 if (!page_ref_add_unless(page, 1, 1)) 4720 return NULL; 4721 return page; 4722 } 4723 4724 /* 4725 * Because lookup_swap_cache() updates some statistics counter, 4726 * we call find_get_page() with swapper_space directly. 4727 */ 4728 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 4729 if (do_memsw_account()) 4730 entry->val = ent.val; 4731 4732 return page; 4733 } 4734 #else 4735 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4736 pte_t ptent, swp_entry_t *entry) 4737 { 4738 return NULL; 4739 } 4740 #endif 4741 4742 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 4743 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4744 { 4745 struct page *page = NULL; 4746 struct address_space *mapping; 4747 pgoff_t pgoff; 4748 4749 if (!vma->vm_file) /* anonymous vma */ 4750 return NULL; 4751 if (!(mc.flags & MOVE_FILE)) 4752 return NULL; 4753 4754 mapping = vma->vm_file->f_mapping; 4755 pgoff = linear_page_index(vma, addr); 4756 4757 /* page is moved even if it's not RSS of this task(page-faulted). */ 4758 #ifdef CONFIG_SWAP 4759 /* shmem/tmpfs may report page out on swap: account for that too. 
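 *
 * A swapped-out shmem page is represented in the page cache by a value
 * entry (see the xa_is_value() check below), so the swap entry is decoded
 * with radix_to_swp_entry() and the page is looked up in the swap address
 * space instead.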
*/ 4760 if (shmem_mapping(mapping)) { 4761 page = find_get_entry(mapping, pgoff); 4762 if (xa_is_value(page)) { 4763 swp_entry_t swp = radix_to_swp_entry(page); 4764 if (do_memsw_account()) 4765 *entry = swp; 4766 page = find_get_page(swap_address_space(swp), 4767 swp_offset(swp)); 4768 } 4769 } else 4770 page = find_get_page(mapping, pgoff); 4771 #else 4772 page = find_get_page(mapping, pgoff); 4773 #endif 4774 return page; 4775 } 4776 4777 /** 4778 * mem_cgroup_move_account - move account of the page 4779 * @page: the page 4780 * @compound: charge the page as compound or small page 4781 * @from: mem_cgroup which the page is moved from. 4782 * @to: mem_cgroup which the page is moved to. @from != @to. 4783 * 4784 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 4785 * 4786 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 4787 * from old cgroup. 4788 */ 4789 static int mem_cgroup_move_account(struct page *page, 4790 bool compound, 4791 struct mem_cgroup *from, 4792 struct mem_cgroup *to) 4793 { 4794 unsigned long flags; 4795 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 4796 int ret; 4797 bool anon; 4798 4799 VM_BUG_ON(from == to); 4800 VM_BUG_ON_PAGE(PageLRU(page), page); 4801 VM_BUG_ON(compound && !PageTransHuge(page)); 4802 4803 /* 4804 * Prevent mem_cgroup_migrate() from looking at 4805 * page->mem_cgroup of its source page while we change it. 4806 */ 4807 ret = -EBUSY; 4808 if (!trylock_page(page)) 4809 goto out; 4810 4811 ret = -EINVAL; 4812 if (page->mem_cgroup != from) 4813 goto out_unlock; 4814 4815 anon = PageAnon(page); 4816 4817 spin_lock_irqsave(&from->move_lock, flags); 4818 4819 if (!anon && page_mapped(page)) { 4820 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); 4821 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); 4822 } 4823 4824 /* 4825 * move_lock grabbed above and caller set from->moving_account, so 4826 * mod_memcg_page_state will serialize updates to PageDirty. 4827 * So mapping should be stable for dirty pages. 4828 */ 4829 if (!anon && PageDirty(page)) { 4830 struct address_space *mapping = page_mapping(page); 4831 4832 if (mapping_cap_account_dirty(mapping)) { 4833 __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); 4834 __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); 4835 } 4836 } 4837 4838 if (PageWriteback(page)) { 4839 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); 4840 __mod_memcg_state(to, NR_WRITEBACK, nr_pages); 4841 } 4842 4843 /* 4844 * It is safe to change page->mem_cgroup here because the page 4845 * is referenced, charged, and isolated - we can't race with 4846 * uncharging, charging, migration, or LRU putback. 
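 *
 * Only page->mem_cgroup and the per-memcg statistics are switched here;
 * the page_counter charges themselves are not transferred - the
 * move-charge caller settles those later (see "moved_charge" in
 * __mem_cgroup_clear_mc()).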
4847 */ 4848 4849 /* caller should have done css_get */ 4850 page->mem_cgroup = to; 4851 spin_unlock_irqrestore(&from->move_lock, flags); 4852 4853 ret = 0; 4854 4855 local_irq_disable(); 4856 mem_cgroup_charge_statistics(to, page, compound, nr_pages); 4857 memcg_check_events(to, page); 4858 mem_cgroup_charge_statistics(from, page, compound, -nr_pages); 4859 memcg_check_events(from, page); 4860 local_irq_enable(); 4861 out_unlock: 4862 unlock_page(page); 4863 out: 4864 return ret; 4865 } 4866 4867 /** 4868 * get_mctgt_type - get target type of moving charge 4869 * @vma: the vma the pte to be checked belongs 4870 * @addr: the address corresponding to the pte to be checked 4871 * @ptent: the pte to be checked 4872 * @target: the pointer the target page or swap ent will be stored(can be NULL) 4873 * 4874 * Returns 4875 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4876 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4877 * move charge. if @target is not NULL, the page is stored in target->page 4878 * with extra refcnt got(Callers should handle it). 4879 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4880 * target for charge migration. if @target is not NULL, the entry is stored 4881 * in target->ent. 4882 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC 4883 * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). 4884 * For now we such page is charge like a regular page would be as for all 4885 * intent and purposes it is just special memory taking the place of a 4886 * regular page. 4887 * 4888 * See Documentations/vm/hmm.txt and include/linux/hmm.h 4889 * 4890 * Called with pte lock held. 4891 */ 4892 4893 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4894 unsigned long addr, pte_t ptent, union mc_target *target) 4895 { 4896 struct page *page = NULL; 4897 enum mc_target_type ret = MC_TARGET_NONE; 4898 swp_entry_t ent = { .val = 0 }; 4899 4900 if (pte_present(ptent)) 4901 page = mc_handle_present_pte(vma, addr, ptent); 4902 else if (is_swap_pte(ptent)) 4903 page = mc_handle_swap_pte(vma, ptent, &ent); 4904 else if (pte_none(ptent)) 4905 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4906 4907 if (!page && !ent.val) 4908 return ret; 4909 if (page) { 4910 /* 4911 * Do only loose check w/o serialization. 4912 * mem_cgroup_move_account() checks the page is valid or 4913 * not under LRU exclusion. 4914 */ 4915 if (page->mem_cgroup == mc.from) { 4916 ret = MC_TARGET_PAGE; 4917 if (is_device_private_page(page) || 4918 is_device_public_page(page)) 4919 ret = MC_TARGET_DEVICE; 4920 if (target) 4921 target->page = page; 4922 } 4923 if (!ret || !target) 4924 put_page(page); 4925 } 4926 /* 4927 * There is a swap entry and a page doesn't exist or isn't charged. 4928 * But we cannot move a tail-page in a THP. 4929 */ 4930 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 4931 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 4932 ret = MC_TARGET_SWAP; 4933 if (target) 4934 target->ent = ent; 4935 } 4936 return ret; 4937 } 4938 4939 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4940 /* 4941 * We don't consider PMD mapped swapping or file mapped pages because THP does 4942 * not support them for now. 4943 * Caller should make sure that pmd_trans_huge(pmd) is true. 
4944 */ 4945 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4946 unsigned long addr, pmd_t pmd, union mc_target *target) 4947 { 4948 struct page *page = NULL; 4949 enum mc_target_type ret = MC_TARGET_NONE; 4950 4951 if (unlikely(is_swap_pmd(pmd))) { 4952 VM_BUG_ON(thp_migration_supported() && 4953 !is_pmd_migration_entry(pmd)); 4954 return ret; 4955 } 4956 page = pmd_page(pmd); 4957 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4958 if (!(mc.flags & MOVE_ANON)) 4959 return ret; 4960 if (page->mem_cgroup == mc.from) { 4961 ret = MC_TARGET_PAGE; 4962 if (target) { 4963 get_page(page); 4964 target->page = page; 4965 } 4966 } 4967 return ret; 4968 } 4969 #else 4970 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 4971 unsigned long addr, pmd_t pmd, union mc_target *target) 4972 { 4973 return MC_TARGET_NONE; 4974 } 4975 #endif 4976 4977 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 4978 unsigned long addr, unsigned long end, 4979 struct mm_walk *walk) 4980 { 4981 struct vm_area_struct *vma = walk->vma; 4982 pte_t *pte; 4983 spinlock_t *ptl; 4984 4985 ptl = pmd_trans_huge_lock(pmd, vma); 4986 if (ptl) { 4987 /* 4988 * Note their can not be MC_TARGET_DEVICE for now as we do not 4989 * support transparent huge page with MEMORY_DEVICE_PUBLIC or 4990 * MEMORY_DEVICE_PRIVATE but this might change. 4991 */ 4992 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 4993 mc.precharge += HPAGE_PMD_NR; 4994 spin_unlock(ptl); 4995 return 0; 4996 } 4997 4998 if (pmd_trans_unstable(pmd)) 4999 return 0; 5000 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5001 for (; addr != end; pte++, addr += PAGE_SIZE) 5002 if (get_mctgt_type(vma, addr, *pte, NULL)) 5003 mc.precharge++; /* increment precharge temporarily */ 5004 pte_unmap_unlock(pte - 1, ptl); 5005 cond_resched(); 5006 5007 return 0; 5008 } 5009 5010 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5011 { 5012 unsigned long precharge; 5013 5014 struct mm_walk mem_cgroup_count_precharge_walk = { 5015 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5016 .mm = mm, 5017 }; 5018 down_read(&mm->mmap_sem); 5019 walk_page_range(0, mm->highest_vm_end, 5020 &mem_cgroup_count_precharge_walk); 5021 up_read(&mm->mmap_sem); 5022 5023 precharge = mc.precharge; 5024 mc.precharge = 0; 5025 5026 return precharge; 5027 } 5028 5029 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5030 { 5031 unsigned long precharge = mem_cgroup_count_precharge(mm); 5032 5033 VM_BUG_ON(mc.moving_task); 5034 mc.moving_task = current; 5035 return mem_cgroup_do_precharge(precharge); 5036 } 5037 5038 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5039 static void __mem_cgroup_clear_mc(void) 5040 { 5041 struct mem_cgroup *from = mc.from; 5042 struct mem_cgroup *to = mc.to; 5043 5044 /* we must uncharge all the leftover precharges from mc.to */ 5045 if (mc.precharge) { 5046 cancel_charge(mc.to, mc.precharge); 5047 mc.precharge = 0; 5048 } 5049 /* 5050 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5051 * we must uncharge here. 
5052 */ 5053 if (mc.moved_charge) { 5054 cancel_charge(mc.from, mc.moved_charge); 5055 mc.moved_charge = 0; 5056 } 5057 /* we must fixup refcnts and charges */ 5058 if (mc.moved_swap) { 5059 /* uncharge swap account from the old cgroup */ 5060 if (!mem_cgroup_is_root(mc.from)) 5061 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5062 5063 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 5064 5065 /* 5066 * we charged both to->memory and to->memsw, so we 5067 * should uncharge to->memory. 5068 */ 5069 if (!mem_cgroup_is_root(mc.to)) 5070 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5071 5072 mem_cgroup_id_get_many(mc.to, mc.moved_swap); 5073 css_put_many(&mc.to->css, mc.moved_swap); 5074 5075 mc.moved_swap = 0; 5076 } 5077 memcg_oom_recover(from); 5078 memcg_oom_recover(to); 5079 wake_up_all(&mc.waitq); 5080 } 5081 5082 static void mem_cgroup_clear_mc(void) 5083 { 5084 struct mm_struct *mm = mc.mm; 5085 5086 /* 5087 * we must clear moving_task before waking up waiters at the end of 5088 * task migration. 5089 */ 5090 mc.moving_task = NULL; 5091 __mem_cgroup_clear_mc(); 5092 spin_lock(&mc.lock); 5093 mc.from = NULL; 5094 mc.to = NULL; 5095 mc.mm = NULL; 5096 spin_unlock(&mc.lock); 5097 5098 mmput(mm); 5099 } 5100 5101 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5102 { 5103 struct cgroup_subsys_state *css; 5104 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 5105 struct mem_cgroup *from; 5106 struct task_struct *leader, *p; 5107 struct mm_struct *mm; 5108 unsigned long move_flags; 5109 int ret = 0; 5110 5111 /* charge immigration isn't supported on the default hierarchy */ 5112 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5113 return 0; 5114 5115 /* 5116 * Multi-process migrations only happen on the default hierarchy 5117 * where charge immigration is not used. Perform charge 5118 * immigration if @tset contains a leader and whine if there are 5119 * multiple. 5120 */ 5121 p = NULL; 5122 cgroup_taskset_for_each_leader(leader, css, tset) { 5123 WARN_ON_ONCE(p); 5124 p = leader; 5125 memcg = mem_cgroup_from_css(css); 5126 } 5127 if (!p) 5128 return 0; 5129 5130 /* 5131 * We are now commited to this value whatever it is. Changes in this 5132 * tunable will only affect upcoming migrations, not the current one. 5133 * So we need to save it, and keep it going. 
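 *
 * For example (hypothetical configuration): with move_charge_at_immigrate
 * set to MOVE_ANON | MOVE_FILE, both anonymous and file-backed pages of
 * the migrating task become candidates in get_mctgt_type(); with 0, the
 * whole move-charge machinery below is skipped.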
5134 */ 5135 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 5136 if (!move_flags) 5137 return 0; 5138 5139 from = mem_cgroup_from_task(p); 5140 5141 VM_BUG_ON(from == memcg); 5142 5143 mm = get_task_mm(p); 5144 if (!mm) 5145 return 0; 5146 /* We move charges only when we move a owner of the mm */ 5147 if (mm->owner == p) { 5148 VM_BUG_ON(mc.from); 5149 VM_BUG_ON(mc.to); 5150 VM_BUG_ON(mc.precharge); 5151 VM_BUG_ON(mc.moved_charge); 5152 VM_BUG_ON(mc.moved_swap); 5153 5154 spin_lock(&mc.lock); 5155 mc.mm = mm; 5156 mc.from = from; 5157 mc.to = memcg; 5158 mc.flags = move_flags; 5159 spin_unlock(&mc.lock); 5160 /* We set mc.moving_task later */ 5161 5162 ret = mem_cgroup_precharge_mc(mm); 5163 if (ret) 5164 mem_cgroup_clear_mc(); 5165 } else { 5166 mmput(mm); 5167 } 5168 return ret; 5169 } 5170 5171 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5172 { 5173 if (mc.to) 5174 mem_cgroup_clear_mc(); 5175 } 5176 5177 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5178 unsigned long addr, unsigned long end, 5179 struct mm_walk *walk) 5180 { 5181 int ret = 0; 5182 struct vm_area_struct *vma = walk->vma; 5183 pte_t *pte; 5184 spinlock_t *ptl; 5185 enum mc_target_type target_type; 5186 union mc_target target; 5187 struct page *page; 5188 5189 ptl = pmd_trans_huge_lock(pmd, vma); 5190 if (ptl) { 5191 if (mc.precharge < HPAGE_PMD_NR) { 5192 spin_unlock(ptl); 5193 return 0; 5194 } 5195 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5196 if (target_type == MC_TARGET_PAGE) { 5197 page = target.page; 5198 if (!isolate_lru_page(page)) { 5199 if (!mem_cgroup_move_account(page, true, 5200 mc.from, mc.to)) { 5201 mc.precharge -= HPAGE_PMD_NR; 5202 mc.moved_charge += HPAGE_PMD_NR; 5203 } 5204 putback_lru_page(page); 5205 } 5206 put_page(page); 5207 } else if (target_type == MC_TARGET_DEVICE) { 5208 page = target.page; 5209 if (!mem_cgroup_move_account(page, true, 5210 mc.from, mc.to)) { 5211 mc.precharge -= HPAGE_PMD_NR; 5212 mc.moved_charge += HPAGE_PMD_NR; 5213 } 5214 put_page(page); 5215 } 5216 spin_unlock(ptl); 5217 return 0; 5218 } 5219 5220 if (pmd_trans_unstable(pmd)) 5221 return 0; 5222 retry: 5223 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5224 for (; addr != end; addr += PAGE_SIZE) { 5225 pte_t ptent = *(pte++); 5226 bool device = false; 5227 swp_entry_t ent; 5228 5229 if (!mc.precharge) 5230 break; 5231 5232 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5233 case MC_TARGET_DEVICE: 5234 device = true; 5235 /* fall through */ 5236 case MC_TARGET_PAGE: 5237 page = target.page; 5238 /* 5239 * We can have a part of the split pmd here. Moving it 5240 * can be done but it would be too convoluted so simply 5241 * ignore such a partial THP and keep it in original 5242 * memcg. There should be somebody mapping the head. 5243 */ 5244 if (PageTransCompound(page)) 5245 goto put; 5246 if (!device && isolate_lru_page(page)) 5247 goto put; 5248 if (!mem_cgroup_move_account(page, false, 5249 mc.from, mc.to)) { 5250 mc.precharge--; 5251 /* we uncharge from mc.from later. */ 5252 mc.moved_charge++; 5253 } 5254 if (!device) 5255 putback_lru_page(page); 5256 put: /* get_mctgt_type() gets the page */ 5257 put_page(page); 5258 break; 5259 case MC_TARGET_SWAP: 5260 ent = target.ent; 5261 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5262 mc.precharge--; 5263 /* we fixup refcnts and charges later. 
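 * The memsw uncharge and the css/id references for mc.moved_swap
 * are settled in __mem_cgroup_clear_mc().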
*/ 5264 mc.moved_swap++; 5265 } 5266 break; 5267 default: 5268 break; 5269 } 5270 } 5271 pte_unmap_unlock(pte - 1, ptl); 5272 cond_resched(); 5273 5274 if (addr != end) { 5275 /* 5276 * We have consumed all precharges we got in can_attach(). 5277 * We try charge one by one, but don't do any additional 5278 * charges to mc.to if we have failed in charge once in attach() 5279 * phase. 5280 */ 5281 ret = mem_cgroup_do_precharge(1); 5282 if (!ret) 5283 goto retry; 5284 } 5285 5286 return ret; 5287 } 5288 5289 static void mem_cgroup_move_charge(void) 5290 { 5291 struct mm_walk mem_cgroup_move_charge_walk = { 5292 .pmd_entry = mem_cgroup_move_charge_pte_range, 5293 .mm = mc.mm, 5294 }; 5295 5296 lru_add_drain_all(); 5297 /* 5298 * Signal lock_page_memcg() to take the memcg's move_lock 5299 * while we're moving its pages to another memcg. Then wait 5300 * for already started RCU-only updates to finish. 5301 */ 5302 atomic_inc(&mc.from->moving_account); 5303 synchronize_rcu(); 5304 retry: 5305 if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { 5306 /* 5307 * Someone who are holding the mmap_sem might be waiting in 5308 * waitq. So we cancel all extra charges, wake up all waiters, 5309 * and retry. Because we cancel precharges, we might not be able 5310 * to move enough charges, but moving charge is a best-effort 5311 * feature anyway, so it wouldn't be a big problem. 5312 */ 5313 __mem_cgroup_clear_mc(); 5314 cond_resched(); 5315 goto retry; 5316 } 5317 /* 5318 * When we have consumed all precharges and failed in doing 5319 * additional charge, the page walk just aborts. 5320 */ 5321 walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); 5322 5323 up_read(&mc.mm->mmap_sem); 5324 atomic_dec(&mc.from->moving_account); 5325 } 5326 5327 static void mem_cgroup_move_task(void) 5328 { 5329 if (mc.to) { 5330 mem_cgroup_move_charge(); 5331 mem_cgroup_clear_mc(); 5332 } 5333 } 5334 #else /* !CONFIG_MMU */ 5335 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5336 { 5337 return 0; 5338 } 5339 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5340 { 5341 } 5342 static void mem_cgroup_move_task(void) 5343 { 5344 } 5345 #endif 5346 5347 /* 5348 * Cgroup retains root cgroups across [un]mount cycles making it necessary 5349 * to verify whether we're attached to the default hierarchy on each mount 5350 * attempt. 5351 */ 5352 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 5353 { 5354 /* 5355 * use_hierarchy is forced on the default hierarchy. cgroup core 5356 * guarantees that @root doesn't have any children, so turning it 5357 * on for the root memcg is enough. 
5358 */ 5359 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5360 root_mem_cgroup->use_hierarchy = true; 5361 else 5362 root_mem_cgroup->use_hierarchy = false; 5363 } 5364 5365 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 5366 { 5367 if (value == PAGE_COUNTER_MAX) 5368 seq_puts(m, "max\n"); 5369 else 5370 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 5371 5372 return 0; 5373 } 5374 5375 static u64 memory_current_read(struct cgroup_subsys_state *css, 5376 struct cftype *cft) 5377 { 5378 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5379 5380 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 5381 } 5382 5383 static int memory_min_show(struct seq_file *m, void *v) 5384 { 5385 return seq_puts_memcg_tunable(m, 5386 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 5387 } 5388 5389 static ssize_t memory_min_write(struct kernfs_open_file *of, 5390 char *buf, size_t nbytes, loff_t off) 5391 { 5392 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5393 unsigned long min; 5394 int err; 5395 5396 buf = strstrip(buf); 5397 err = page_counter_memparse(buf, "max", &min); 5398 if (err) 5399 return err; 5400 5401 page_counter_set_min(&memcg->memory, min); 5402 5403 return nbytes; 5404 } 5405 5406 static int memory_low_show(struct seq_file *m, void *v) 5407 { 5408 return seq_puts_memcg_tunable(m, 5409 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 5410 } 5411 5412 static ssize_t memory_low_write(struct kernfs_open_file *of, 5413 char *buf, size_t nbytes, loff_t off) 5414 { 5415 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5416 unsigned long low; 5417 int err; 5418 5419 buf = strstrip(buf); 5420 err = page_counter_memparse(buf, "max", &low); 5421 if (err) 5422 return err; 5423 5424 page_counter_set_low(&memcg->memory, low); 5425 5426 return nbytes; 5427 } 5428 5429 static int memory_high_show(struct seq_file *m, void *v) 5430 { 5431 return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); 5432 } 5433 5434 static ssize_t memory_high_write(struct kernfs_open_file *of, 5435 char *buf, size_t nbytes, loff_t off) 5436 { 5437 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5438 unsigned long nr_pages; 5439 unsigned long high; 5440 int err; 5441 5442 buf = strstrip(buf); 5443 err = page_counter_memparse(buf, "max", &high); 5444 if (err) 5445 return err; 5446 5447 memcg->high = high; 5448 5449 nr_pages = page_counter_read(&memcg->memory); 5450 if (nr_pages > high) 5451 try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 5452 GFP_KERNEL, true); 5453 5454 memcg_wb_domain_size_changed(memcg); 5455 return nbytes; 5456 } 5457 5458 static int memory_max_show(struct seq_file *m, void *v) 5459 { 5460 return seq_puts_memcg_tunable(m, 5461 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 5462 } 5463 5464 static ssize_t memory_max_write(struct kernfs_open_file *of, 5465 char *buf, size_t nbytes, loff_t off) 5466 { 5467 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5468 unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; 5469 bool drained = false; 5470 unsigned long max; 5471 int err; 5472 5473 buf = strstrip(buf); 5474 err = page_counter_memparse(buf, "max", &max); 5475 if (err) 5476 return err; 5477 5478 xchg(&memcg->memory.max, max); 5479 5480 for (;;) { 5481 unsigned long nr_pages = page_counter_read(&memcg->memory); 5482 5483 if (nr_pages <= max) 5484 break; 5485 5486 if (signal_pending(current)) { 5487 err = -EINTR; 5488 break; 5489 } 5490 5491 if (!drained) { 5492 drain_all_stock(memcg); 5493 drained = 
true; 5494 continue; 5495 } 5496 5497 if (nr_reclaims) { 5498 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 5499 GFP_KERNEL, true)) 5500 nr_reclaims--; 5501 continue; 5502 } 5503 5504 memcg_memory_event(memcg, MEMCG_OOM); 5505 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 5506 break; 5507 } 5508 5509 memcg_wb_domain_size_changed(memcg); 5510 return nbytes; 5511 } 5512 5513 static int memory_events_show(struct seq_file *m, void *v) 5514 { 5515 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5516 5517 seq_printf(m, "low %lu\n", 5518 atomic_long_read(&memcg->memory_events[MEMCG_LOW])); 5519 seq_printf(m, "high %lu\n", 5520 atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); 5521 seq_printf(m, "max %lu\n", 5522 atomic_long_read(&memcg->memory_events[MEMCG_MAX])); 5523 seq_printf(m, "oom %lu\n", 5524 atomic_long_read(&memcg->memory_events[MEMCG_OOM])); 5525 seq_printf(m, "oom_kill %lu\n", 5526 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 5527 5528 return 0; 5529 } 5530 5531 static int memory_stat_show(struct seq_file *m, void *v) 5532 { 5533 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5534 struct accumulated_stats acc; 5535 int i; 5536 5537 /* 5538 * Provide statistics on the state of the memory subsystem as 5539 * well as cumulative event counters that show past behavior. 5540 * 5541 * This list is ordered following a combination of these gradients: 5542 * 1) generic big picture -> specifics and details 5543 * 2) reflecting userspace activity -> reflecting kernel heuristics 5544 * 5545 * Current memory state: 5546 */ 5547 5548 memset(&acc, 0, sizeof(acc)); 5549 acc.stats_size = MEMCG_NR_STAT; 5550 acc.events_size = NR_VM_EVENT_ITEMS; 5551 accumulate_memcg_tree(memcg, &acc); 5552 5553 seq_printf(m, "anon %llu\n", 5554 (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); 5555 seq_printf(m, "file %llu\n", 5556 (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); 5557 seq_printf(m, "kernel_stack %llu\n", 5558 (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); 5559 seq_printf(m, "slab %llu\n", 5560 (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + 5561 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); 5562 seq_printf(m, "sock %llu\n", 5563 (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); 5564 5565 seq_printf(m, "shmem %llu\n", 5566 (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); 5567 seq_printf(m, "file_mapped %llu\n", 5568 (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); 5569 seq_printf(m, "file_dirty %llu\n", 5570 (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); 5571 seq_printf(m, "file_writeback %llu\n", 5572 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); 5573 5574 for (i = 0; i < NR_LRU_LISTS; i++) 5575 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], 5576 (u64)acc.lru_pages[i] * PAGE_SIZE); 5577 5578 seq_printf(m, "slab_reclaimable %llu\n", 5579 (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); 5580 seq_printf(m, "slab_unreclaimable %llu\n", 5581 (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); 5582 5583 /* Accumulated memory events */ 5584 5585 seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); 5586 seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); 5587 5588 seq_printf(m, "workingset_refault %lu\n", 5589 acc.stat[WORKINGSET_REFAULT]); 5590 seq_printf(m, "workingset_activate %lu\n", 5591 acc.stat[WORKINGSET_ACTIVATE]); 5592 seq_printf(m, "workingset_nodereclaim %lu\n", 5593 acc.stat[WORKINGSET_NODERECLAIM]); 5594 5595 seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); 5596 seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + 5597 acc.events[PGSCAN_DIRECT]); 5598 seq_printf(m, "pgsteal %lu\n", 
acc.events[PGSTEAL_KSWAPD] + 5599 acc.events[PGSTEAL_DIRECT]); 5600 seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); 5601 seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); 5602 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); 5603 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); 5604 5605 return 0; 5606 } 5607 5608 static int memory_oom_group_show(struct seq_file *m, void *v) 5609 { 5610 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5611 5612 seq_printf(m, "%d\n", memcg->oom_group); 5613 5614 return 0; 5615 } 5616 5617 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 5618 char *buf, size_t nbytes, loff_t off) 5619 { 5620 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5621 int ret, oom_group; 5622 5623 buf = strstrip(buf); 5624 if (!buf) 5625 return -EINVAL; 5626 5627 ret = kstrtoint(buf, 0, &oom_group); 5628 if (ret) 5629 return ret; 5630 5631 if (oom_group != 0 && oom_group != 1) 5632 return -EINVAL; 5633 5634 memcg->oom_group = oom_group; 5635 5636 return nbytes; 5637 } 5638 5639 static struct cftype memory_files[] = { 5640 { 5641 .name = "current", 5642 .flags = CFTYPE_NOT_ON_ROOT, 5643 .read_u64 = memory_current_read, 5644 }, 5645 { 5646 .name = "min", 5647 .flags = CFTYPE_NOT_ON_ROOT, 5648 .seq_show = memory_min_show, 5649 .write = memory_min_write, 5650 }, 5651 { 5652 .name = "low", 5653 .flags = CFTYPE_NOT_ON_ROOT, 5654 .seq_show = memory_low_show, 5655 .write = memory_low_write, 5656 }, 5657 { 5658 .name = "high", 5659 .flags = CFTYPE_NOT_ON_ROOT, 5660 .seq_show = memory_high_show, 5661 .write = memory_high_write, 5662 }, 5663 { 5664 .name = "max", 5665 .flags = CFTYPE_NOT_ON_ROOT, 5666 .seq_show = memory_max_show, 5667 .write = memory_max_write, 5668 }, 5669 { 5670 .name = "events", 5671 .flags = CFTYPE_NOT_ON_ROOT, 5672 .file_offset = offsetof(struct mem_cgroup, events_file), 5673 .seq_show = memory_events_show, 5674 }, 5675 { 5676 .name = "stat", 5677 .flags = CFTYPE_NOT_ON_ROOT, 5678 .seq_show = memory_stat_show, 5679 }, 5680 { 5681 .name = "oom.group", 5682 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 5683 .seq_show = memory_oom_group_show, 5684 .write = memory_oom_group_write, 5685 }, 5686 { } /* terminate */ 5687 }; 5688 5689 struct cgroup_subsys memory_cgrp_subsys = { 5690 .css_alloc = mem_cgroup_css_alloc, 5691 .css_online = mem_cgroup_css_online, 5692 .css_offline = mem_cgroup_css_offline, 5693 .css_released = mem_cgroup_css_released, 5694 .css_free = mem_cgroup_css_free, 5695 .css_reset = mem_cgroup_css_reset, 5696 .can_attach = mem_cgroup_can_attach, 5697 .cancel_attach = mem_cgroup_cancel_attach, 5698 .post_attach = mem_cgroup_move_task, 5699 .bind = mem_cgroup_bind, 5700 .dfl_cftypes = memory_files, 5701 .legacy_cftypes = mem_cgroup_legacy_files, 5702 .early_init = 0, 5703 }; 5704 5705 /** 5706 * mem_cgroup_protected - check if memory consumption is in the normal range 5707 * @root: the top ancestor of the sub-tree being checked 5708 * @memcg: the memory cgroup to check 5709 * 5710 * WARNING: This function is not stateless! It can only be used as part 5711 * of a top-down tree iteration, not for isolated queries. 5712 * 5713 * Returns one of the following: 5714 * MEMCG_PROT_NONE: cgroup memory is not protected 5715 * MEMCG_PROT_LOW: cgroup memory is protected as long there is 5716 * an unprotected supply of reclaimable memory from other cgroups. 
5717 * MEMCG_PROT_MIN: cgroup memory is protected 5718 * 5719 * @root is exclusive; it is never protected when looked at directly 5720 * 5721 * To provide a proper hierarchical behavior, effective memory.min/low values 5722 * are used. Below is the description of how effective memory.low is calculated. 5723 * The effective memory.min value is calculated in the same way. 5724 * 5725 * Effective memory.low is always less than or equal to the original memory.low. 5726 * If there is no memory.low overcommitment (which is always true for 5727 * top-level memory cgroups), these two values are equal. 5728 * Otherwise, it's a part of the parent's effective memory.low, 5729 * calculated as the cgroup's memory.low usage divided by the sum of the siblings' 5730 * memory.low usages, where memory.low usage is the size of actually 5731 * protected memory. 5732 * 5733 * low_usage 5734 * elow = min( memory.low, parent->elow * ------------------ ), 5735 * siblings_low_usage 5736 * 5737 * low_usage = min(memory.current, memory.low) 5738 * 5739 * (this matches the low_usage computation in the code below). 5740 * 5741 * 5742 * This definition of the effective memory.low provides the expected 5743 * hierarchical behavior: the parent's memory.low value limits its 5744 * children, unprotected memory is reclaimed first, and cgroups 5745 * which are not using their guarantee do not affect the actual memory 5746 * distribution. 5747 * 5748 * For example, if there are memcgs A, A/B, A/C, A/D and A/E: 5749 * 5750 * A A/memory.low = 2G, A/memory.current = 6G 5751 * //\\ 5752 * BC DE B/memory.low = 3G B/memory.current = 2G 5753 * C/memory.low = 1G C/memory.current = 2G 5754 * D/memory.low = 0 D/memory.current = 2G 5755 * E/memory.low = 10G E/memory.current = 0 5756 * 5757 * and memory pressure is applied, the following memory distribution 5758 * is expected (approximately): 5759 * 5760 * A/memory.current = 2G 5761 * 5762 * B/memory.current = 1.3G 5763 * C/memory.current = 0.6G 5764 * D/memory.current = 0 5765 * E/memory.current = 0 5766 * 5767 * These calculations require constant tracking of the actual low usages 5768 * (see propagate_protected_usage()), as well as recursive calculation of 5769 * effective memory.low values. But as we call the mem_cgroup_protected() 5770 * path for each memory cgroup top-down during reclaim, 5771 * it's possible to optimize this part and save the calculated elow 5772 * for the next usage. This part is intentionally racy, but that's ok, 5773 * as memory.low is a best-effort mechanism.
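 *
 * Worked example for the tree above (illustrative arithmetic only):
 * low_usage(B) = min(2G, 3G) = 2G, low_usage(C) = min(2G, 1G) = 1G and
 * low_usage(D) = low_usage(E) = 0, so siblings_low_usage under A is 3G.
 * With A's elow = 2G this yields elow(B) = min(3G, 2G * 2G/3G) ~= 1.3G
 * and elow(C) = min(1G, 2G * 1G/3G) ~= 0.6G, which matches the expected
 * distribution quoted above.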
5774 */ 5775 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, 5776 struct mem_cgroup *memcg) 5777 { 5778 struct mem_cgroup *parent; 5779 unsigned long emin, parent_emin; 5780 unsigned long elow, parent_elow; 5781 unsigned long usage; 5782 5783 if (mem_cgroup_disabled()) 5784 return MEMCG_PROT_NONE; 5785 5786 if (!root) 5787 root = root_mem_cgroup; 5788 if (memcg == root) 5789 return MEMCG_PROT_NONE; 5790 5791 usage = page_counter_read(&memcg->memory); 5792 if (!usage) 5793 return MEMCG_PROT_NONE; 5794 5795 emin = memcg->memory.min; 5796 elow = memcg->memory.low; 5797 5798 parent = parent_mem_cgroup(memcg); 5799 /* No parent means a non-hierarchical mode on v1 memcg */ 5800 if (!parent) 5801 return MEMCG_PROT_NONE; 5802 5803 if (parent == root) 5804 goto exit; 5805 5806 parent_emin = READ_ONCE(parent->memory.emin); 5807 emin = min(emin, parent_emin); 5808 if (emin && parent_emin) { 5809 unsigned long min_usage, siblings_min_usage; 5810 5811 min_usage = min(usage, memcg->memory.min); 5812 siblings_min_usage = atomic_long_read( 5813 &parent->memory.children_min_usage); 5814 5815 if (min_usage && siblings_min_usage) 5816 emin = min(emin, parent_emin * min_usage / 5817 siblings_min_usage); 5818 } 5819 5820 parent_elow = READ_ONCE(parent->memory.elow); 5821 elow = min(elow, parent_elow); 5822 if (elow && parent_elow) { 5823 unsigned long low_usage, siblings_low_usage; 5824 5825 low_usage = min(usage, memcg->memory.low); 5826 siblings_low_usage = atomic_long_read( 5827 &parent->memory.children_low_usage); 5828 5829 if (low_usage && siblings_low_usage) 5830 elow = min(elow, parent_elow * low_usage / 5831 siblings_low_usage); 5832 } 5833 5834 exit: 5835 memcg->memory.emin = emin; 5836 memcg->memory.elow = elow; 5837 5838 if (usage <= emin) 5839 return MEMCG_PROT_MIN; 5840 else if (usage <= elow) 5841 return MEMCG_PROT_LOW; 5842 else 5843 return MEMCG_PROT_NONE; 5844 } 5845 5846 /** 5847 * mem_cgroup_try_charge - try charging a page 5848 * @page: page to charge 5849 * @mm: mm context of the victim 5850 * @gfp_mask: reclaim mode 5851 * @memcgp: charged memcg return 5852 * @compound: charge the page as compound or small page 5853 * 5854 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5855 * pages according to @gfp_mask if necessary. 5856 * 5857 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 5858 * Otherwise, an error code is returned. 5859 * 5860 * After page->mapping has been set up, the caller must finalize the 5861 * charge with mem_cgroup_commit_charge(). Or abort the transaction 5862 * with mem_cgroup_cancel_charge() in case page instantiation fails. 5863 */ 5864 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 5865 gfp_t gfp_mask, struct mem_cgroup **memcgp, 5866 bool compound) 5867 { 5868 struct mem_cgroup *memcg = NULL; 5869 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5870 int ret = 0; 5871 5872 if (mem_cgroup_disabled()) 5873 goto out; 5874 5875 if (PageSwapCache(page)) { 5876 /* 5877 * Every swap fault against a single page tries to charge the 5878 * page, bail as early as possible. shmem_unuse() encounters 5879 * already charged pages, too. The USED bit is protected by 5880 * the page lock, which serializes swap cache removal, which 5881 * in turn serializes uncharging. 
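 *
 * (The check below uses compound_head(page) because the charge state of
 * a compound page is kept on its head page.)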
5882 */ 5883 VM_BUG_ON_PAGE(!PageLocked(page), page); 5884 if (compound_head(page)->mem_cgroup) 5885 goto out; 5886 5887 if (do_swap_account) { 5888 swp_entry_t ent = { .val = page_private(page), }; 5889 unsigned short id = lookup_swap_cgroup_id(ent); 5890 5891 rcu_read_lock(); 5892 memcg = mem_cgroup_from_id(id); 5893 if (memcg && !css_tryget_online(&memcg->css)) 5894 memcg = NULL; 5895 rcu_read_unlock(); 5896 } 5897 } 5898 5899 if (!memcg) 5900 memcg = get_mem_cgroup_from_mm(mm); 5901 5902 ret = try_charge(memcg, gfp_mask, nr_pages); 5903 5904 css_put(&memcg->css); 5905 out: 5906 *memcgp = memcg; 5907 return ret; 5908 } 5909 5910 int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, 5911 gfp_t gfp_mask, struct mem_cgroup **memcgp, 5912 bool compound) 5913 { 5914 struct mem_cgroup *memcg; 5915 int ret; 5916 5917 ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); 5918 memcg = *memcgp; 5919 mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); 5920 return ret; 5921 } 5922 5923 /** 5924 * mem_cgroup_commit_charge - commit a page charge 5925 * @page: page to charge 5926 * @memcg: memcg to charge the page to 5927 * @lrucare: page might be on LRU already 5928 * @compound: charge the page as compound or small page 5929 * 5930 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5931 * after page->mapping has been set up. This must happen atomically 5932 * as part of the page instantiation, i.e. under the page table lock 5933 * for anonymous pages, under the page lock for page and swap cache. 5934 * 5935 * In addition, the page must not be on the LRU during the commit, to 5936 * prevent racing with task migration. If it might be, use @lrucare. 5937 * 5938 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 5939 */ 5940 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 5941 bool lrucare, bool compound) 5942 { 5943 unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; 5944 5945 VM_BUG_ON_PAGE(!page->mapping, page); 5946 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 5947 5948 if (mem_cgroup_disabled()) 5949 return; 5950 /* 5951 * Swap faults will attempt to charge the same page multiple 5952 * times. But reuse_swap_page() might have removed the page 5953 * from swapcache already, so we can't check PageSwapCache(). 5954 */ 5955 if (!memcg) 5956 return; 5957 5958 commit_charge(page, memcg, lrucare); 5959 5960 local_irq_disable(); 5961 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); 5962 memcg_check_events(memcg, page); 5963 local_irq_enable(); 5964 5965 if (do_memsw_account() && PageSwapCache(page)) { 5966 swp_entry_t entry = { .val = page_private(page) }; 5967 /* 5968 * The swap entry might not get freed for a long time, 5969 * let's not wait for it. The page already received a 5970 * memory+swap charge, drop the swap entry duplicate. 5971 */ 5972 mem_cgroup_uncharge_swap(entry, nr_pages); 5973 } 5974 } 5975 5976 /** 5977 * mem_cgroup_cancel_charge - cancel a page charge 5978 * @page: page to charge 5979 * @memcg: memcg to charge the page to 5980 * @compound: charge the page as compound or small page 5981 * 5982 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5983 */ 5984 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, 5985 bool compound) 5986 { 5987 unsigned int nr_pages = compound ? 
hpage_nr_pages(page) : 1; 5988 5989 if (mem_cgroup_disabled()) 5990 return; 5991 /* 5992 * Swap faults will attempt to charge the same page multiple 5993 * times. But reuse_swap_page() might have removed the page 5994 * from swapcache already, so we can't check PageSwapCache(). 5995 */ 5996 if (!memcg) 5997 return; 5998 5999 cancel_charge(memcg, nr_pages); 6000 } 6001 6002 struct uncharge_gather { 6003 struct mem_cgroup *memcg; 6004 unsigned long pgpgout; 6005 unsigned long nr_anon; 6006 unsigned long nr_file; 6007 unsigned long nr_kmem; 6008 unsigned long nr_huge; 6009 unsigned long nr_shmem; 6010 struct page *dummy_page; 6011 }; 6012 6013 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6014 { 6015 memset(ug, 0, sizeof(*ug)); 6016 } 6017 6018 static void uncharge_batch(const struct uncharge_gather *ug) 6019 { 6020 unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; 6021 unsigned long flags; 6022 6023 if (!mem_cgroup_is_root(ug->memcg)) { 6024 page_counter_uncharge(&ug->memcg->memory, nr_pages); 6025 if (do_memsw_account()) 6026 page_counter_uncharge(&ug->memcg->memsw, nr_pages); 6027 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6028 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6029 memcg_oom_recover(ug->memcg); 6030 } 6031 6032 local_irq_save(flags); 6033 __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); 6034 __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); 6035 __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); 6036 __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); 6037 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6038 __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); 6039 memcg_check_events(ug->memcg, ug->dummy_page); 6040 local_irq_restore(flags); 6041 6042 if (!mem_cgroup_is_root(ug->memcg)) 6043 css_put_many(&ug->memcg->css, nr_pages); 6044 } 6045 6046 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6047 { 6048 VM_BUG_ON_PAGE(PageLRU(page), page); 6049 VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && 6050 !PageHWPoison(page) , page); 6051 6052 if (!page->mem_cgroup) 6053 return; 6054 6055 /* 6056 * Nobody should be changing or seriously looking at 6057 * page->mem_cgroup at this point, we have fully 6058 * exclusive access to the page. 6059 */ 6060 6061 if (ug->memcg != page->mem_cgroup) { 6062 if (ug->memcg) { 6063 uncharge_batch(ug); 6064 uncharge_gather_clear(ug); 6065 } 6066 ug->memcg = page->mem_cgroup; 6067 } 6068 6069 if (!PageKmemcg(page)) { 6070 unsigned int nr_pages = 1; 6071 6072 if (PageTransHuge(page)) { 6073 nr_pages <<= compound_order(page); 6074 ug->nr_huge += nr_pages; 6075 } 6076 if (PageAnon(page)) 6077 ug->nr_anon += nr_pages; 6078 else { 6079 ug->nr_file += nr_pages; 6080 if (PageSwapBacked(page)) 6081 ug->nr_shmem += nr_pages; 6082 } 6083 ug->pgpgout++; 6084 } else { 6085 ug->nr_kmem += 1 << compound_order(page); 6086 __ClearPageKmemcg(page); 6087 } 6088 6089 ug->dummy_page = page; 6090 page->mem_cgroup = NULL; 6091 } 6092 6093 static void uncharge_list(struct list_head *page_list) 6094 { 6095 struct uncharge_gather ug; 6096 struct list_head *next; 6097 6098 uncharge_gather_clear(&ug); 6099 6100 /* 6101 * Note that the list can be a single page->lru; hence the 6102 * do-while loop instead of a simple list_for_each_entry(). 
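	 * (If page_list is in fact a lone, self-linked page->lru, the head
	 * itself is the one entry that must be visited; list_for_each_entry()
	 * never looks at the head and would process nothing.)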
6103 */ 6104 next = page_list->next; 6105 do { 6106 struct page *page; 6107 6108 page = list_entry(next, struct page, lru); 6109 next = page->lru.next; 6110 6111 uncharge_page(page, &ug); 6112 } while (next != page_list); 6113 6114 if (ug.memcg) 6115 uncharge_batch(&ug); 6116 } 6117 6118 /** 6119 * mem_cgroup_uncharge - uncharge a page 6120 * @page: page to uncharge 6121 * 6122 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6123 * mem_cgroup_commit_charge(). 6124 */ 6125 void mem_cgroup_uncharge(struct page *page) 6126 { 6127 struct uncharge_gather ug; 6128 6129 if (mem_cgroup_disabled()) 6130 return; 6131 6132 /* Don't touch page->lru of any random page, pre-check: */ 6133 if (!page->mem_cgroup) 6134 return; 6135 6136 uncharge_gather_clear(&ug); 6137 uncharge_page(page, &ug); 6138 uncharge_batch(&ug); 6139 } 6140 6141 /** 6142 * mem_cgroup_uncharge_list - uncharge a list of page 6143 * @page_list: list of pages to uncharge 6144 * 6145 * Uncharge a list of pages previously charged with 6146 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6147 */ 6148 void mem_cgroup_uncharge_list(struct list_head *page_list) 6149 { 6150 if (mem_cgroup_disabled()) 6151 return; 6152 6153 if (!list_empty(page_list)) 6154 uncharge_list(page_list); 6155 } 6156 6157 /** 6158 * mem_cgroup_migrate - charge a page's replacement 6159 * @oldpage: currently circulating page 6160 * @newpage: replacement page 6161 * 6162 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6163 * be uncharged upon free. 6164 * 6165 * Both pages must be locked, @newpage->mapping must be set up. 6166 */ 6167 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6168 { 6169 struct mem_cgroup *memcg; 6170 unsigned int nr_pages; 6171 bool compound; 6172 unsigned long flags; 6173 6174 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6175 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6176 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6177 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6178 newpage); 6179 6180 if (mem_cgroup_disabled()) 6181 return; 6182 6183 /* Page cache replacement: new page already charged? */ 6184 if (newpage->mem_cgroup) 6185 return; 6186 6187 /* Swapcache readahead pages can get replaced before being charged */ 6188 memcg = oldpage->mem_cgroup; 6189 if (!memcg) 6190 return; 6191 6192 /* Force-charge the new page. The old one will be freed soon */ 6193 compound = PageTransHuge(newpage); 6194 nr_pages = compound ? hpage_nr_pages(newpage) : 1; 6195 6196 page_counter_charge(&memcg->memory, nr_pages); 6197 if (do_memsw_account()) 6198 page_counter_charge(&memcg->memsw, nr_pages); 6199 css_get_many(&memcg->css, nr_pages); 6200 6201 commit_charge(newpage, memcg, false); 6202 6203 local_irq_save(flags); 6204 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); 6205 memcg_check_events(memcg, newpage); 6206 local_irq_restore(flags); 6207 } 6208 6209 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6210 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6211 6212 void mem_cgroup_sk_alloc(struct sock *sk) 6213 { 6214 struct mem_cgroup *memcg; 6215 6216 if (!mem_cgroup_sockets_enabled) 6217 return; 6218 6219 /* 6220 * Socket cloning can throw us here with sk_memcg already 6221 * filled. It won't however, necessarily happen from 6222 * process context. So the test for root memcg given 6223 * the current task's memcg won't help us in this case. 
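	 * (e.g. a TCP child socket cloned from a listening socket in softirq
	 * context already carries the listener's sk_memcg, while "current"
	 * is merely whichever task the softirq happened to interrupt.)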
6224 * 6225 * Respecting the original socket's memcg is a better 6226 * decision in this case. 6227 */ 6228 if (sk->sk_memcg) { 6229 css_get(&sk->sk_memcg->css); 6230 return; 6231 } 6232 6233 rcu_read_lock(); 6234 memcg = mem_cgroup_from_task(current); 6235 if (memcg == root_mem_cgroup) 6236 goto out; 6237 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6238 goto out; 6239 if (css_tryget_online(&memcg->css)) 6240 sk->sk_memcg = memcg; 6241 out: 6242 rcu_read_unlock(); 6243 } 6244 6245 void mem_cgroup_sk_free(struct sock *sk) 6246 { 6247 if (sk->sk_memcg) 6248 css_put(&sk->sk_memcg->css); 6249 } 6250 6251 /** 6252 * mem_cgroup_charge_skmem - charge socket memory 6253 * @memcg: memcg to charge 6254 * @nr_pages: number of pages to charge 6255 * 6256 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 6257 * @memcg's configured limit, %false if the charge had to be forced. 6258 */ 6259 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6260 { 6261 gfp_t gfp_mask = GFP_KERNEL; 6262 6263 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6264 struct page_counter *fail; 6265 6266 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 6267 memcg->tcpmem_pressure = 0; 6268 return true; 6269 } 6270 page_counter_charge(&memcg->tcpmem, nr_pages); 6271 memcg->tcpmem_pressure = 1; 6272 return false; 6273 } 6274 6275 /* Don't block in the packet receive path */ 6276 if (in_softirq()) 6277 gfp_mask = GFP_NOWAIT; 6278 6279 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 6280 6281 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 6282 return true; 6283 6284 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 6285 return false; 6286 } 6287 6288 /** 6289 * mem_cgroup_uncharge_skmem - uncharge socket memory 6290 * @memcg: memcg to uncharge 6291 * @nr_pages: number of pages to uncharge 6292 */ 6293 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6294 { 6295 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6296 page_counter_uncharge(&memcg->tcpmem, nr_pages); 6297 return; 6298 } 6299 6300 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 6301 6302 refill_stock(memcg, nr_pages); 6303 } 6304 6305 static int __init cgroup_memory(char *s) 6306 { 6307 char *token; 6308 6309 while ((token = strsep(&s, ",")) != NULL) { 6310 if (!*token) 6311 continue; 6312 if (!strcmp(token, "nosocket")) 6313 cgroup_memory_nosocket = true; 6314 if (!strcmp(token, "nokmem")) 6315 cgroup_memory_nokmem = true; 6316 } 6317 return 0; 6318 } 6319 __setup("cgroup.memory=", cgroup_memory); 6320 6321 /* 6322 * subsys_initcall() for memory controller. 6323 * 6324 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 6325 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 6326 * basically everything that doesn't depend on a specific mem_cgroup structure 6327 * should be initialized from here. 6328 */ 6329 static int __init mem_cgroup_init(void) 6330 { 6331 int cpu, node; 6332 6333 #ifdef CONFIG_MEMCG_KMEM 6334 /* 6335 * Kmem cache creation is mostly done with the slab_mutex held, 6336 * so use a workqueue with limited concurrency to avoid stalling 6337 * all worker threads in case lots of cgroups are created and 6338 * destroyed simultaneously. 
6339 */ 6340 memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); 6341 BUG_ON(!memcg_kmem_cache_wq); 6342 #endif 6343 6344 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 6345 memcg_hotplug_cpu_dead); 6346 6347 for_each_possible_cpu(cpu) 6348 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 6349 drain_local_stock); 6350 6351 for_each_node(node) { 6352 struct mem_cgroup_tree_per_node *rtpn; 6353 6354 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 6355 node_online(node) ? node : NUMA_NO_NODE); 6356 6357 rtpn->rb_root = RB_ROOT; 6358 rtpn->rb_rightmost = NULL; 6359 spin_lock_init(&rtpn->lock); 6360 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6361 } 6362 6363 return 0; 6364 } 6365 subsys_initcall(mem_cgroup_init); 6366 6367 #ifdef CONFIG_MEMCG_SWAP 6368 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 6369 { 6370 while (!refcount_inc_not_zero(&memcg->id.ref)) { 6371 /* 6372 * The root cgroup cannot be destroyed, so it's refcount must 6373 * always be >= 1. 6374 */ 6375 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 6376 VM_BUG_ON(1); 6377 break; 6378 } 6379 memcg = parent_mem_cgroup(memcg); 6380 if (!memcg) 6381 memcg = root_mem_cgroup; 6382 } 6383 return memcg; 6384 } 6385 6386 /** 6387 * mem_cgroup_swapout - transfer a memsw charge to swap 6388 * @page: page whose memsw charge to transfer 6389 * @entry: swap entry to move the charge to 6390 * 6391 * Transfer the memsw charge of @page to @entry. 6392 */ 6393 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6394 { 6395 struct mem_cgroup *memcg, *swap_memcg; 6396 unsigned int nr_entries; 6397 unsigned short oldid; 6398 6399 VM_BUG_ON_PAGE(PageLRU(page), page); 6400 VM_BUG_ON_PAGE(page_count(page), page); 6401 6402 if (!do_memsw_account()) 6403 return; 6404 6405 memcg = page->mem_cgroup; 6406 6407 /* Readahead page, never charged */ 6408 if (!memcg) 6409 return; 6410 6411 /* 6412 * In case the memcg owning these pages has been offlined and doesn't 6413 * have an ID allocated to it anymore, charge the closest online 6414 * ancestor for the swap instead and transfer the memory+swap charge. 6415 */ 6416 swap_memcg = mem_cgroup_id_get_online(memcg); 6417 nr_entries = hpage_nr_pages(page); 6418 /* Get references for the tail pages, too */ 6419 if (nr_entries > 1) 6420 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 6421 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 6422 nr_entries); 6423 VM_BUG_ON_PAGE(oldid, page); 6424 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 6425 6426 page->mem_cgroup = NULL; 6427 6428 if (!mem_cgroup_is_root(memcg)) 6429 page_counter_uncharge(&memcg->memory, nr_entries); 6430 6431 if (memcg != swap_memcg) { 6432 if (!mem_cgroup_is_root(swap_memcg)) 6433 page_counter_charge(&swap_memcg->memsw, nr_entries); 6434 page_counter_uncharge(&memcg->memsw, nr_entries); 6435 } 6436 6437 /* 6438 * Interrupts should be disabled here because the caller holds the 6439 * i_pages lock which is taken with interrupts-off. It is 6440 * important here to have the interrupts disabled because it is the 6441 * only synchronisation we have for updating the per-CPU variables. 
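	 * (The statistics update below goes through
	 * mem_cgroup_charge_statistics(), which uses the non-irq-safe
	 * __mod_memcg_state()/__this_cpu_add() helpers; that is only safe
	 * because the caller, typically reclaim's __remove_mapping() holding
	 * xa_lock_irqsave(&mapping->i_pages, flags), already has interrupts
	 * disabled.)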
6442 */ 6443 VM_BUG_ON(!irqs_disabled()); 6444 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), 6445 -nr_entries); 6446 memcg_check_events(memcg, page); 6447 6448 if (!mem_cgroup_is_root(memcg)) 6449 css_put_many(&memcg->css, nr_entries); 6450 } 6451 6452 /** 6453 * mem_cgroup_try_charge_swap - try charging swap space for a page 6454 * @page: page being added to swap 6455 * @entry: swap entry to charge 6456 * 6457 * Try to charge @page's memcg for the swap space at @entry. 6458 * 6459 * Returns 0 on success, -ENOMEM on failure. 6460 */ 6461 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 6462 { 6463 unsigned int nr_pages = hpage_nr_pages(page); 6464 struct page_counter *counter; 6465 struct mem_cgroup *memcg; 6466 unsigned short oldid; 6467 6468 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) 6469 return 0; 6470 6471 memcg = page->mem_cgroup; 6472 6473 /* Readahead page, never charged */ 6474 if (!memcg) 6475 return 0; 6476 6477 if (!entry.val) { 6478 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 6479 return 0; 6480 } 6481 6482 memcg = mem_cgroup_id_get_online(memcg); 6483 6484 if (!mem_cgroup_is_root(memcg) && 6485 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 6486 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 6487 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 6488 mem_cgroup_id_put(memcg); 6489 return -ENOMEM; 6490 } 6491 6492 /* Get references for the tail pages, too */ 6493 if (nr_pages > 1) 6494 mem_cgroup_id_get_many(memcg, nr_pages - 1); 6495 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 6496 VM_BUG_ON_PAGE(oldid, page); 6497 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 6498 6499 return 0; 6500 } 6501 6502 /** 6503 * mem_cgroup_uncharge_swap - uncharge swap space 6504 * @entry: swap entry to uncharge 6505 * @nr_pages: the amount of swap space to uncharge 6506 */ 6507 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 6508 { 6509 struct mem_cgroup *memcg; 6510 unsigned short id; 6511 6512 if (!do_swap_account) 6513 return; 6514 6515 id = swap_cgroup_record(entry, 0, nr_pages); 6516 rcu_read_lock(); 6517 memcg = mem_cgroup_from_id(id); 6518 if (memcg) { 6519 if (!mem_cgroup_is_root(memcg)) { 6520 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6521 page_counter_uncharge(&memcg->swap, nr_pages); 6522 else 6523 page_counter_uncharge(&memcg->memsw, nr_pages); 6524 } 6525 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 6526 mem_cgroup_id_put_many(memcg, nr_pages); 6527 } 6528 rcu_read_unlock(); 6529 } 6530 6531 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 6532 { 6533 long nr_swap_pages = get_nr_swap_pages(); 6534 6535 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6536 return nr_swap_pages; 6537 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 6538 nr_swap_pages = min_t(long, nr_swap_pages, 6539 READ_ONCE(memcg->swap.max) - 6540 page_counter_read(&memcg->swap)); 6541 return nr_swap_pages; 6542 } 6543 6544 bool mem_cgroup_swap_full(struct page *page) 6545 { 6546 struct mem_cgroup *memcg; 6547 6548 VM_BUG_ON_PAGE(!PageLocked(page), page); 6549 6550 if (vm_swap_full()) 6551 return true; 6552 if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6553 return false; 6554 6555 memcg = page->mem_cgroup; 6556 if (!memcg) 6557 return false; 6558 6559 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 6560 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) 6561 return true; 6562 6563 return false; 6564 } 6565 
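
/*
 * Worked example for mem_cgroup_get_nr_swap_pages() above (the numbers are
 * made up for illustration): on a v2 hierarchy A/B with swap accounting
 * enabled, A->swap.max set to 100 pages of which 40 are charged, and
 * B->swap.max left at "max", the walk from B towards the root yields
 *
 *	min(get_nr_swap_pages(), PAGE_COUNTER_MAX - usage(B), 100 - 40)
 *
 * i.e. at most 60 more of B's pages may be swapped out, no matter how much
 * swap space the system as a whole still has.
 */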
6566 /* for remember boot option*/ 6567 #ifdef CONFIG_MEMCG_SWAP_ENABLED 6568 static int really_do_swap_account __initdata = 1; 6569 #else 6570 static int really_do_swap_account __initdata; 6571 #endif 6572 6573 static int __init enable_swap_account(char *s) 6574 { 6575 if (!strcmp(s, "1")) 6576 really_do_swap_account = 1; 6577 else if (!strcmp(s, "0")) 6578 really_do_swap_account = 0; 6579 return 1; 6580 } 6581 __setup("swapaccount=", enable_swap_account); 6582 6583 static u64 swap_current_read(struct cgroup_subsys_state *css, 6584 struct cftype *cft) 6585 { 6586 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6587 6588 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 6589 } 6590 6591 static int swap_max_show(struct seq_file *m, void *v) 6592 { 6593 return seq_puts_memcg_tunable(m, 6594 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 6595 } 6596 6597 static ssize_t swap_max_write(struct kernfs_open_file *of, 6598 char *buf, size_t nbytes, loff_t off) 6599 { 6600 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6601 unsigned long max; 6602 int err; 6603 6604 buf = strstrip(buf); 6605 err = page_counter_memparse(buf, "max", &max); 6606 if (err) 6607 return err; 6608 6609 xchg(&memcg->swap.max, max); 6610 6611 return nbytes; 6612 } 6613 6614 static int swap_events_show(struct seq_file *m, void *v) 6615 { 6616 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6617 6618 seq_printf(m, "max %lu\n", 6619 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 6620 seq_printf(m, "fail %lu\n", 6621 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 6622 6623 return 0; 6624 } 6625 6626 static struct cftype swap_files[] = { 6627 { 6628 .name = "swap.current", 6629 .flags = CFTYPE_NOT_ON_ROOT, 6630 .read_u64 = swap_current_read, 6631 }, 6632 { 6633 .name = "swap.max", 6634 .flags = CFTYPE_NOT_ON_ROOT, 6635 .seq_show = swap_max_show, 6636 .write = swap_max_write, 6637 }, 6638 { 6639 .name = "swap.events", 6640 .flags = CFTYPE_NOT_ON_ROOT, 6641 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 6642 .seq_show = swap_events_show, 6643 }, 6644 { } /* terminate */ 6645 }; 6646 6647 static struct cftype memsw_cgroup_files[] = { 6648 { 6649 .name = "memsw.usage_in_bytes", 6650 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6651 .read_u64 = mem_cgroup_read_u64, 6652 }, 6653 { 6654 .name = "memsw.max_usage_in_bytes", 6655 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6656 .write = mem_cgroup_reset, 6657 .read_u64 = mem_cgroup_read_u64, 6658 }, 6659 { 6660 .name = "memsw.limit_in_bytes", 6661 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6662 .write = mem_cgroup_write, 6663 .read_u64 = mem_cgroup_read_u64, 6664 }, 6665 { 6666 .name = "memsw.failcnt", 6667 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6668 .write = mem_cgroup_reset, 6669 .read_u64 = mem_cgroup_read_u64, 6670 }, 6671 { }, /* terminate */ 6672 }; 6673 6674 static int __init mem_cgroup_swap_init(void) 6675 { 6676 if (!mem_cgroup_disabled() && really_do_swap_account) { 6677 do_swap_account = 1; 6678 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, 6679 swap_files)); 6680 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6681 memsw_cgroup_files)); 6682 } 6683 return 0; 6684 } 6685 subsys_initcall(mem_cgroup_swap_init); 6686 6687 #endif /* CONFIG_MEMCG_SWAP */ 6688
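
/*
 * Administrative notes on the swap controller pieces above (illustrative;
 * the paths assume the default cgroup v2 mount at /sys/fs/cgroup and
 * "mygroup" is a made-up cgroup name):
 *
 *  - "swapaccount=0" on the kernel command line keeps do_swap_account off
 *    even when CONFIG_MEMCG_SWAP_ENABLED is set; "swapaccount=1" enables it.
 *  - On the default hierarchy the swap_files[] entries appear in every
 *    non-root cgroup as memory.swap.current, memory.swap.max and
 *    memory.swap.events, e.g.:
 *
 *	echo 512M > /sys/fs/cgroup/mygroup/memory.swap.max
 *	cat /sys/fs/cgroup/mygroup/memory.swap.events
 *
 *  - The legacy memsw_cgroup_files[] counterparts are only registered for
 *    cgroup v1 mounts, where they appear as memory.memsw.* files.
 */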