// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation.
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or when the cgroup is removed. This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, objcg->nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	spin_lock_irqsave(&css_set_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&css_set_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&css_set_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&css_set_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 * this works better in sparse environments, where we have a lot of memcgs,
 * but only a few kmem-limited. Or also, if we have, for instance, 200
 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
 * 200-entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is not strictly necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.
 * Since the calls to memcg_slab_pre_alloc_hook() are conditional to this
 * static branch, we'll have to allow modules that do kmem_cache_alloc and
 * the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned. The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page_memcg(page);

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page_memcg_check(page);

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
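
/*
 * Illustrative example of the ordering maintained above (hypothetical
 * numbers, not taken from this file): with a soft limit of 100 pages and a
 * usage of 150 pages, a memcg is inserted with usage_in_excess == 50. The
 * memcg with the largest excess always ends up at mctz->rb_rightmost, which
 * is what __mem_cgroup_largest_soft_limit_node() picks first for soft limit
 * reclaim.
 */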
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}

/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats.state[idx]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return parent->nodeinfo[nid];
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
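
/*
 * Illustrative note on the batching above (not taken from this file): the
 * per-cpu lruvec delta is only folded into the atomic lruvec counters
 * (pn->lruvec_stat, for the memcg and all of its ancestors) once its
 * absolute value exceeds MEMCG_CHARGE_BATCH; for byte-granular items
 * (vmstat_item_in_bytes(), e.g. the slab counters) the threshold is shifted
 * by PAGE_SHIFT so that it still corresponds to MEMCG_CHARGE_BATCH pages
 * worth of bytes.
 */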
/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return READ_ONCE(memcg->vmstats.events[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtains a reference on mm->memcg and returns it if successful. Otherwise
 * root_mem_cgroup is returned. However, if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (in_interrupt())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined. */
	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
		return true;

	return false;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = page_memcg(page);

	if (!memcg)
		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
	else
		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
}
#endif
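
/*
 * Illustrative usage sketch for the lruvec locking helpers below (assumed
 * caller pattern, not taken from this file):
 *
 *	lruvec = lock_page_lruvec_irqsave(page, &flags);
 *	... manipulate the page's LRU state ...
 *	unlock_page_lruvec_irqrestore(lruvec, flags);
 *
 * where unlock_page_lruvec_irqrestore() is assumed to be the inline helper
 * from the memcontrol header that drops lruvec->lru_lock and restores IRQs.
 */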
/**
 * lock_page_lruvec - lock and return lruvec for a given page.
 * @page: the page
 *
 * These functions are safe to use under any of the following conditions:
 * - page locked
 * - PageLRU cleared
 * - lock_page_memcg()
 * - page->_refcount is zero
 */
struct lruvec *lock_page_lruvec(struct page *page)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = page_pgdat(page);

	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	spin_lock(&lruvec->lru_lock);

	lruvec_memcg_debug(lruvec, page);

	return lruvec;
}

struct lruvec *lock_page_lruvec_irq(struct page *page)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = page_pgdat(page);

	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	spin_lock_irq(&lruvec->lru_lock);

	lruvec_memcg_debug(lruvec, page);

	return lruvec;
}

struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = page_pgdat(page);

	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	spin_lock_irqsave(&lruvec->lru_lock, *flags);

	lruvec_memcg_debug(lruvec, page);

	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * the moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

struct memory_stat {
	const char *name;
	unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
	{ "anon",			NR_ANON_MAPPED },
	{ "file",			NR_FILE_PAGES },
	{ "kernel_stack",		NR_KERNEL_STACK_KB },
	{ "pagetables",			NR_PAGETABLE },
	{ "percpu",			MEMCG_PERCPU_B },
	{ "sock",			MEMCG_SOCK },
	{ "shmem",			NR_SHMEM },
	{ "file_mapped",		NR_FILE_MAPPED },
	{ "file_dirty",			NR_FILE_DIRTY },
	{ "file_writeback",		NR_WRITEBACK },
#ifdef CONFIG_SWAP
	{ "swapcached",			NR_SWAPCACHE },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp",			NR_ANON_THPS },
	{ "file_thp",			NR_FILE_THPS },
	{ "shmem_thp",			NR_SHMEM_THPS },
#endif
	{ "inactive_anon",		NR_INACTIVE_ANON },
	{ "active_anon",		NR_ACTIVE_ANON },
	{ "inactive_file",		NR_INACTIVE_FILE },
	{ "active_file",		NR_ACTIVE_FILE },
	{ "unevictable",		NR_UNEVICTABLE },
	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM },
};

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	case NR_KERNEL_STACK_KB:
		return SZ_1K;
	default:
		return PAGE_SIZE;
	}
}

static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}

static char *memory_stat_format(struct mem_cgroup *memcg)
{
	struct seq_buf s;
	int i;

	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!s.buffer)
		return NULL;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	cgroup_rstat_flush(memcg->css.cgroup);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);

		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(&s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */

	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
		       memcg_events(memcg, PGFAULT));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
		       memcg_events(memcg, PGMAJFAULT));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
		       memcg_events(memcg, PGREFILL));
	seq_buf_printf(&s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT));
	seq_buf_printf(&s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
		       memcg_events(memcg, PGACTIVATE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
		       memcg_events(memcg, PGDEACTIVATE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
		       memcg_events(memcg, PGLAZYFREE));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
		       memcg_events(memcg, PGLAZYFREED));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
		       memcg_events(memcg, THP_FAULT_ALLOC));
	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));

	return s.buffer;
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	char *buf;

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	buf = memory_stat_format(memcg);
	if (!buf)
		return;
	pr_info("%s", buf);
	kfree(buf);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	} else { /* v1 */
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	}
	return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = should_force_charge() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so as not
				 * to reclaim too much, nor so little that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

enum oom_status {
	OOM_SUCCESS,
	OOM_FAILED,
	OOM_ASYNC,
	OOM_SKIPPED
};

static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	enum oom_status ret;
	bool locked;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return OOM_SKIPPED;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (memcg->oom_kill_disable) {
		if (!current->in_user_fault)
			return OOM_SKIPPED;
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		current->memcg_oom_gfp_mask = mask;
		current->memcg_oom_order = order;

		return OOM_ASYNC;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	if (mem_cgroup_out_of_memory(memcg, mask, order))
		ret = OOM_SUCCESS;
	else
		ret = OOM_FAILED;

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
					 current->memcg_oom_order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges. Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}
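
/*
 * Illustrative example for the helper below (hypothetical hierarchy, not
 * taken from this file): if the OOM happens in /A (oom_domain == /A), /A has
 * memory.oom.group set, and the victim lives in /A/B/C, the upward walk from
 * the victim's memcg returns /A, so the OOM killer cleans up every task in
 * /A's subtree rather than only the single victim.
 */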
/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group = NULL;
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;

	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	memcg = mem_cgroup_from_task(victim);
	if (memcg == root_mem_cgroup)
		goto out;

	/*
	 * If the victim task has been asynchronously moved to a different
	 * memory cgroup, we might end up killing tasks outside oom_domain.
	 * In this case it's better to ignore memory.oom.group.
	 */
	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
		goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		if (memcg->oom_group)
			oom_group = memcg;

		if (memcg == oom_domain)
			break;
	}

	if (oom_group)
		css_get(&oom_group->css);
out:
	rcu_read_unlock();

	return oom_group;
}

void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}

/**
 * lock_page_memcg - lock a page and memcg binding
 * @page: the page
 *
 * This function protects unlocked LRU pages from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the locked memcg. Caller is responsible
 * for the lifetime of the page.
 */
void lock_page_memcg(struct page *page)
{
	struct page *head = compound_head(page);	/* rmap on tail pages */
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction. The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 */
	rcu_read_lock();

	if (mem_cgroup_disabled())
		return;
again:
	memcg = page_memcg(head);
	if (unlikely(!memcg))
		return;

#ifdef CONFIG_PROVE_LOCKING
	local_irq_save(flags);
	might_lock(&memcg->move_lock);
	local_irq_restore(flags);
#endif

	if (atomic_read(&memcg->moving_account) <= 0)
		return;

	spin_lock_irqsave(&memcg->move_lock, flags);
	if (memcg != page_memcg(head)) {
		spin_unlock_irqrestore(&memcg->move_lock, flags);
		goto again;
	}

	/*
	 * When charge migration first begins, we can have multiple
	 * critical sections holding the fast-path RCU lock and one
	 * holding the slowpath move_lock. Track the task who has the
	 * move_lock for unlock_page_memcg().
	 */
2011 */ 2012 memcg->move_lock_task = current; 2013 memcg->move_lock_flags = flags; 2014 } 2015 EXPORT_SYMBOL(lock_page_memcg); 2016 2017 static void __unlock_page_memcg(struct mem_cgroup *memcg) 2018 { 2019 if (memcg && memcg->move_lock_task == current) { 2020 unsigned long flags = memcg->move_lock_flags; 2021 2022 memcg->move_lock_task = NULL; 2023 memcg->move_lock_flags = 0; 2024 2025 spin_unlock_irqrestore(&memcg->move_lock, flags); 2026 } 2027 2028 rcu_read_unlock(); 2029 } 2030 2031 /** 2032 * unlock_page_memcg - unlock a page and memcg binding 2033 * @page: the page 2034 */ 2035 void unlock_page_memcg(struct page *page) 2036 { 2037 struct page *head = compound_head(page); 2038 2039 __unlock_page_memcg(page_memcg(head)); 2040 } 2041 EXPORT_SYMBOL(unlock_page_memcg); 2042 2043 struct memcg_stock_pcp { 2044 struct mem_cgroup *cached; /* this never be root cgroup */ 2045 unsigned int nr_pages; 2046 2047 #ifdef CONFIG_MEMCG_KMEM 2048 struct obj_cgroup *cached_objcg; 2049 unsigned int nr_bytes; 2050 #endif 2051 2052 struct work_struct work; 2053 unsigned long flags; 2054 #define FLUSHING_CACHED_CHARGE 0 2055 }; 2056 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2057 static DEFINE_MUTEX(percpu_charge_mutex); 2058 2059 #ifdef CONFIG_MEMCG_KMEM 2060 static void drain_obj_stock(struct memcg_stock_pcp *stock); 2061 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2062 struct mem_cgroup *root_memcg); 2063 2064 #else 2065 static inline void drain_obj_stock(struct memcg_stock_pcp *stock) 2066 { 2067 } 2068 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2069 struct mem_cgroup *root_memcg) 2070 { 2071 return false; 2072 } 2073 #endif 2074 2075 /** 2076 * consume_stock: Try to consume stocked charge on this cpu. 2077 * @memcg: memcg to consume from. 2078 * @nr_pages: how many pages to charge. 2079 * 2080 * The charges will only happen if @memcg matches the current cpu's memcg 2081 * stock, and at least @nr_pages are available in that stock. Failure to 2082 * service an allocation will refill the stock. 2083 * 2084 * returns true if successful, false otherwise. 2085 */ 2086 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2087 { 2088 struct memcg_stock_pcp *stock; 2089 unsigned long flags; 2090 bool ret = false; 2091 2092 if (nr_pages > MEMCG_CHARGE_BATCH) 2093 return ret; 2094 2095 local_irq_save(flags); 2096 2097 stock = this_cpu_ptr(&memcg_stock); 2098 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2099 stock->nr_pages -= nr_pages; 2100 ret = true; 2101 } 2102 2103 local_irq_restore(flags); 2104 2105 return ret; 2106 } 2107 2108 /* 2109 * Returns stocks cached in percpu and reset cached information. 2110 */ 2111 static void drain_stock(struct memcg_stock_pcp *stock) 2112 { 2113 struct mem_cgroup *old = stock->cached; 2114 2115 if (!old) 2116 return; 2117 2118 if (stock->nr_pages) { 2119 page_counter_uncharge(&old->memory, stock->nr_pages); 2120 if (do_memsw_account()) 2121 page_counter_uncharge(&old->memsw, stock->nr_pages); 2122 stock->nr_pages = 0; 2123 } 2124 2125 css_put(&old->css); 2126 stock->cached = NULL; 2127 } 2128 2129 static void drain_local_stock(struct work_struct *dummy) 2130 { 2131 struct memcg_stock_pcp *stock; 2132 unsigned long flags; 2133 2134 /* 2135 * The only protection from memory hotplug vs. 
drain_stock races is 2136 * that we always operate on local CPU stock here with IRQ disabled 2137 */ 2138 local_irq_save(flags); 2139 2140 stock = this_cpu_ptr(&memcg_stock); 2141 drain_obj_stock(stock); 2142 drain_stock(stock); 2143 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2144 2145 local_irq_restore(flags); 2146 } 2147 2148 /* 2149 * Cache charges(val) to local per_cpu area. 2150 * This will be consumed by consume_stock() function, later. 2151 */ 2152 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2153 { 2154 struct memcg_stock_pcp *stock; 2155 unsigned long flags; 2156 2157 local_irq_save(flags); 2158 2159 stock = this_cpu_ptr(&memcg_stock); 2160 if (stock->cached != memcg) { /* reset if necessary */ 2161 drain_stock(stock); 2162 css_get(&memcg->css); 2163 stock->cached = memcg; 2164 } 2165 stock->nr_pages += nr_pages; 2166 2167 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2168 drain_stock(stock); 2169 2170 local_irq_restore(flags); 2171 } 2172 2173 /* 2174 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2175 * of the hierarchy under it. 2176 */ 2177 static void drain_all_stock(struct mem_cgroup *root_memcg) 2178 { 2179 int cpu, curcpu; 2180 2181 /* If someone's already draining, avoid adding running more workers. */ 2182 if (!mutex_trylock(&percpu_charge_mutex)) 2183 return; 2184 /* 2185 * Notify other cpus that system-wide "drain" is running 2186 * We do not care about races with the cpu hotplug because cpu down 2187 * as well as workers from this path always operate on the local 2188 * per-cpu data. CPU up doesn't touch memcg_stock at all. 2189 */ 2190 curcpu = get_cpu(); 2191 for_each_online_cpu(cpu) { 2192 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2193 struct mem_cgroup *memcg; 2194 bool flush = false; 2195 2196 rcu_read_lock(); 2197 memcg = stock->cached; 2198 if (memcg && stock->nr_pages && 2199 mem_cgroup_is_descendant(memcg, root_memcg)) 2200 flush = true; 2201 if (obj_stock_flush_required(stock, root_memcg)) 2202 flush = true; 2203 rcu_read_unlock(); 2204 2205 if (flush && 2206 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2207 if (cpu == curcpu) 2208 drain_local_stock(&stock->work); 2209 else 2210 schedule_work_on(cpu, &stock->work); 2211 } 2212 } 2213 put_cpu(); 2214 mutex_unlock(&percpu_charge_mutex); 2215 } 2216 2217 static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) 2218 { 2219 int nid; 2220 2221 for_each_node(nid) { 2222 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 2223 unsigned long stat[NR_VM_NODE_STAT_ITEMS]; 2224 struct batched_lruvec_stat *lstatc; 2225 int i; 2226 2227 lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); 2228 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 2229 stat[i] = lstatc->count[i]; 2230 lstatc->count[i] = 0; 2231 } 2232 2233 do { 2234 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 2235 atomic_long_add(stat[i], &pn->lruvec_stat[i]); 2236 } while ((pn = parent_nodeinfo(pn, nid))); 2237 } 2238 } 2239 2240 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2241 { 2242 struct memcg_stock_pcp *stock; 2243 struct mem_cgroup *memcg; 2244 2245 stock = &per_cpu(memcg_stock, cpu); 2246 drain_stock(stock); 2247 2248 for_each_mem_cgroup(memcg) 2249 memcg_flush_lruvec_page_state(memcg, cpu); 2250 2251 return 0; 2252 } 2253 2254 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2255 unsigned int nr_pages, 2256 gfp_t gfp_mask) 2257 { 2258 unsigned long nr_reclaimed = 0; 2259 2260 do { 2261 unsigned long pflags; 2262 2263 if 
(page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							     gfp_mask, true);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}

static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these on either side of the
 * exponentiation to maintain precision and scale to a reasonable number of
 * jiffies (see the table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce it to a reasonable number of jiffies,
 *   and to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially.
For 2309 * example, with a high of 100 megabytes: 2310 * 2311 * +-------+------------------------+ 2312 * | usage | time to allocate in ms | 2313 * +-------+------------------------+ 2314 * | 100M | 0 | 2315 * | 101M | 6 | 2316 * | 102M | 25 | 2317 * | 103M | 57 | 2318 * | 104M | 102 | 2319 * | 105M | 159 | 2320 * | 106M | 230 | 2321 * | 107M | 313 | 2322 * | 108M | 409 | 2323 * | 109M | 518 | 2324 * | 110M | 639 | 2325 * | 111M | 774 | 2326 * | 112M | 921 | 2327 * | 113M | 1081 | 2328 * | 114M | 1254 | 2329 * | 115M | 1439 | 2330 * | 116M | 1638 | 2331 * | 117M | 1849 | 2332 * | 118M | 2000 | 2333 * | 119M | 2000 | 2334 * | 120M | 2000 | 2335 * +-------+------------------------+ 2336 */ 2337 #define MEMCG_DELAY_PRECISION_SHIFT 20 2338 #define MEMCG_DELAY_SCALING_SHIFT 14 2339 2340 static u64 calculate_overage(unsigned long usage, unsigned long high) 2341 { 2342 u64 overage; 2343 2344 if (usage <= high) 2345 return 0; 2346 2347 /* 2348 * Prevent division by 0 in overage calculation by acting as if 2349 * it was a threshold of 1 page 2350 */ 2351 high = max(high, 1UL); 2352 2353 overage = usage - high; 2354 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2355 return div64_u64(overage, high); 2356 } 2357 2358 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2359 { 2360 u64 overage, max_overage = 0; 2361 2362 do { 2363 overage = calculate_overage(page_counter_read(&memcg->memory), 2364 READ_ONCE(memcg->memory.high)); 2365 max_overage = max(overage, max_overage); 2366 } while ((memcg = parent_mem_cgroup(memcg)) && 2367 !mem_cgroup_is_root(memcg)); 2368 2369 return max_overage; 2370 } 2371 2372 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2373 { 2374 u64 overage, max_overage = 0; 2375 2376 do { 2377 overage = calculate_overage(page_counter_read(&memcg->swap), 2378 READ_ONCE(memcg->swap.high)); 2379 if (overage) 2380 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2381 max_overage = max(overage, max_overage); 2382 } while ((memcg = parent_mem_cgroup(memcg)) && 2383 !mem_cgroup_is_root(memcg)); 2384 2385 return max_overage; 2386 } 2387 2388 /* 2389 * Get the number of jiffies that we should penalise a mischievous cgroup which 2390 * is exceeding its memory.high by checking both it and its ancestors. 2391 */ 2392 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2393 unsigned int nr_pages, 2394 u64 max_overage) 2395 { 2396 unsigned long penalty_jiffies; 2397 2398 if (!max_overage) 2399 return 0; 2400 2401 /* 2402 * We use overage compared to memory.high to calculate the number of 2403 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2404 * fairly lenient on small overages, and increasingly harsh when the 2405 * memcg in question makes it clear that it has no intention of stopping 2406 * its crazy behaviour, so we exponentially increase the delay based on 2407 * overage amount. 2408 */ 2409 penalty_jiffies = max_overage * max_overage * HZ; 2410 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2411 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2412 2413 /* 2414 * Factor in the task's own contribution to the overage, such that four 2415 * N-sized allocations are throttled approximately the same as one 2416 * 4N-sized allocation. 2417 * 2418 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2419 * larger the current charge patch is than that. 
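 *
 * Worked example (matching the table above): at 104M usage against a
 * 100M high, calculate_overage() yields (4M / 100M) << 20 ~= 41943, so
 * penalty_jiffies = 41943 * 41943 * HZ >> (20 + 14) ~= 0.102 * HZ,
 * i.e. about 102ms for a MEMCG_CHARGE_BATCH-sized charge; a charge half
 * that size is throttled for roughly half as long.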
2420 */ 2421 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2422 } 2423 2424 /* 2425 * Scheduled by try_charge() to be executed from the userland return path 2426 * and reclaims memory over the high limit. 2427 */ 2428 void mem_cgroup_handle_over_high(void) 2429 { 2430 unsigned long penalty_jiffies; 2431 unsigned long pflags; 2432 unsigned long nr_reclaimed; 2433 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2434 int nr_retries = MAX_RECLAIM_RETRIES; 2435 struct mem_cgroup *memcg; 2436 bool in_retry = false; 2437 2438 if (likely(!nr_pages)) 2439 return; 2440 2441 memcg = get_mem_cgroup_from_mm(current->mm); 2442 current->memcg_nr_pages_over_high = 0; 2443 2444 retry_reclaim: 2445 /* 2446 * The allocating task should reclaim at least the batch size, but for 2447 * subsequent retries we only want to do what's necessary to prevent oom 2448 * or breaching resource isolation. 2449 * 2450 * This is distinct from memory.max or page allocator behaviour because 2451 * memory.high is currently batched, whereas memory.max and the page 2452 * allocator run every time an allocation is made. 2453 */ 2454 nr_reclaimed = reclaim_high(memcg, 2455 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2456 GFP_KERNEL); 2457 2458 /* 2459 * memory.high is breached and reclaim is unable to keep up. Throttle 2460 * allocators proactively to slow down excessive growth. 2461 */ 2462 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2463 mem_find_max_overage(memcg)); 2464 2465 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2466 swap_find_max_overage(memcg)); 2467 2468 /* 2469 * Clamp the max delay per usermode return so as to still keep the 2470 * application moving forwards and also permit diagnostics, albeit 2471 * extremely slowly. 2472 */ 2473 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2474 2475 /* 2476 * Don't sleep if the amount of jiffies this memcg owes us is so low 2477 * that it's not even worth doing, in an attempt to be nice to those who 2478 * go only a small amount over their memory.high value and maybe haven't 2479 * been aggressively reclaimed enough yet. 2480 */ 2481 if (penalty_jiffies <= HZ / 100) 2482 goto out; 2483 2484 /* 2485 * If reclaim is making forward progress but we're still over 2486 * memory.high, we want to encourage that rather than doing allocator 2487 * throttling. 2488 */ 2489 if (nr_reclaimed || nr_retries--) { 2490 in_retry = true; 2491 goto retry_reclaim; 2492 } 2493 2494 /* 2495 * If we exit early, we're guaranteed to die (since 2496 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2497 * need to account for any ill-begotten jiffies to pay them off later. 
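 *
 * Also note that penalty_jiffies has already been clamped to
 * MEMCG_MAX_HIGH_DELAY_JIFFIES above, so the stall below is bounded to
 * about two seconds per return to userspace (the 2000ms rows in the
 * delay table), no matter how far over memory.high the cgroup is.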
2498 */ 2499 psi_memstall_enter(&pflags); 2500 schedule_timeout_killable(penalty_jiffies); 2501 psi_memstall_leave(&pflags); 2502 2503 out: 2504 css_put(&memcg->css); 2505 } 2506 2507 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2508 unsigned int nr_pages) 2509 { 2510 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2511 int nr_retries = MAX_RECLAIM_RETRIES; 2512 struct mem_cgroup *mem_over_limit; 2513 struct page_counter *counter; 2514 enum oom_status oom_status; 2515 unsigned long nr_reclaimed; 2516 bool may_swap = true; 2517 bool drained = false; 2518 unsigned long pflags; 2519 2520 if (mem_cgroup_is_root(memcg)) 2521 return 0; 2522 retry: 2523 if (consume_stock(memcg, nr_pages)) 2524 return 0; 2525 2526 if (!do_memsw_account() || 2527 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2528 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2529 goto done_restock; 2530 if (do_memsw_account()) 2531 page_counter_uncharge(&memcg->memsw, batch); 2532 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2533 } else { 2534 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2535 may_swap = false; 2536 } 2537 2538 if (batch > nr_pages) { 2539 batch = nr_pages; 2540 goto retry; 2541 } 2542 2543 /* 2544 * Memcg doesn't have a dedicated reserve for atomic 2545 * allocations. But like the global atomic pool, we need to 2546 * put the burden of reclaim on regular allocation requests 2547 * and let these go through as privileged allocations. 2548 */ 2549 if (gfp_mask & __GFP_ATOMIC) 2550 goto force; 2551 2552 /* 2553 * Unlike in global OOM situations, memcg is not in a physical 2554 * memory shortage. Allow dying and OOM-killed tasks to 2555 * bypass the last charges so that they can exit quickly and 2556 * free their memory. 2557 */ 2558 if (unlikely(should_force_charge())) 2559 goto force; 2560 2561 /* 2562 * Prevent unbounded recursion when reclaim operations need to 2563 * allocate memory. This might exceed the limits temporarily, 2564 * but we prefer facilitating memory reclaim and getting back 2565 * under the limit over triggering OOM kills in these cases. 2566 */ 2567 if (unlikely(current->flags & PF_MEMALLOC)) 2568 goto force; 2569 2570 if (unlikely(task_in_memcg_oom(current))) 2571 goto nomem; 2572 2573 if (!gfpflags_allow_blocking(gfp_mask)) 2574 goto nomem; 2575 2576 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2577 2578 psi_memstall_enter(&pflags); 2579 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2580 gfp_mask, may_swap); 2581 psi_memstall_leave(&pflags); 2582 2583 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2584 goto retry; 2585 2586 if (!drained) { 2587 drain_all_stock(mem_over_limit); 2588 drained = true; 2589 goto retry; 2590 } 2591 2592 if (gfp_mask & __GFP_NORETRY) 2593 goto nomem; 2594 /* 2595 * Even though the limit is exceeded at this point, reclaim 2596 * may have been able to free some pages. Retry the charge 2597 * before killing the task. 2598 * 2599 * Only for regular pages, though: huge pages are rather 2600 * unlikely to succeed so close to the limit, and we fall back 2601 * to regular pages anyway in case of failure. 2602 */ 2603 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2604 goto retry; 2605 /* 2606 * At task move, charge accounts can be doubly counted. So, it's 2607 * better to wait until the end of task_move if something is going on. 
2608 */ 2609 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2610 goto retry; 2611 2612 if (nr_retries--) 2613 goto retry; 2614 2615 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2616 goto nomem; 2617 2618 if (fatal_signal_pending(current)) 2619 goto force; 2620 2621 /* 2622 * keep retrying as long as the memcg oom killer is able to make 2623 * a forward progress or bypass the charge if the oom killer 2624 * couldn't make any progress. 2625 */ 2626 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 2627 get_order(nr_pages * PAGE_SIZE)); 2628 switch (oom_status) { 2629 case OOM_SUCCESS: 2630 nr_retries = MAX_RECLAIM_RETRIES; 2631 goto retry; 2632 case OOM_FAILED: 2633 goto force; 2634 default: 2635 goto nomem; 2636 } 2637 nomem: 2638 if (!(gfp_mask & __GFP_NOFAIL)) 2639 return -ENOMEM; 2640 force: 2641 /* 2642 * The allocation either can't fail or will lead to more memory 2643 * being freed very soon. Allow memory usage go over the limit 2644 * temporarily by force charging it. 2645 */ 2646 page_counter_charge(&memcg->memory, nr_pages); 2647 if (do_memsw_account()) 2648 page_counter_charge(&memcg->memsw, nr_pages); 2649 2650 return 0; 2651 2652 done_restock: 2653 if (batch > nr_pages) 2654 refill_stock(memcg, batch - nr_pages); 2655 2656 /* 2657 * If the hierarchy is above the normal consumption range, schedule 2658 * reclaim on returning to userland. We can perform reclaim here 2659 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2660 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2661 * not recorded as it most likely matches current's and won't 2662 * change in the meantime. As high limit is checked again before 2663 * reclaim, the cost of mismatch is negligible. 2664 */ 2665 do { 2666 bool mem_high, swap_high; 2667 2668 mem_high = page_counter_read(&memcg->memory) > 2669 READ_ONCE(memcg->memory.high); 2670 swap_high = page_counter_read(&memcg->swap) > 2671 READ_ONCE(memcg->swap.high); 2672 2673 /* Don't bother a random interrupted task */ 2674 if (in_interrupt()) { 2675 if (mem_high) { 2676 schedule_work(&memcg->high_work); 2677 break; 2678 } 2679 continue; 2680 } 2681 2682 if (mem_high || swap_high) { 2683 /* 2684 * The allocating tasks in this cgroup will need to do 2685 * reclaim or be throttled to prevent further growth 2686 * of the memory or swap footprints. 2687 * 2688 * Target some best-effort fairness between the tasks, 2689 * and distribute reclaim work and delay penalties 2690 * based on how much each task is actually allocating. 
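 *
 * The deferred work itself runs in mem_cgroup_handle_over_high() on the
 * userland return path (see its comment above); all we do here is accrue
 * the pages into current and flag the task.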
2691 */ 2692 current->memcg_nr_pages_over_high += batch; 2693 set_notify_resume(current); 2694 break; 2695 } 2696 } while ((memcg = parent_mem_cgroup(memcg))); 2697 2698 return 0; 2699 } 2700 2701 #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) 2702 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2703 { 2704 if (mem_cgroup_is_root(memcg)) 2705 return; 2706 2707 page_counter_uncharge(&memcg->memory, nr_pages); 2708 if (do_memsw_account()) 2709 page_counter_uncharge(&memcg->memsw, nr_pages); 2710 } 2711 #endif 2712 2713 static void commit_charge(struct page *page, struct mem_cgroup *memcg) 2714 { 2715 VM_BUG_ON_PAGE(page_memcg(page), page); 2716 /* 2717 * Any of the following ensures page's memcg stability: 2718 * 2719 * - the page lock 2720 * - LRU isolation 2721 * - lock_page_memcg() 2722 * - exclusive reference 2723 */ 2724 page->memcg_data = (unsigned long)memcg; 2725 } 2726 2727 static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) 2728 { 2729 struct mem_cgroup *memcg; 2730 2731 rcu_read_lock(); 2732 retry: 2733 memcg = obj_cgroup_memcg(objcg); 2734 if (unlikely(!css_tryget(&memcg->css))) 2735 goto retry; 2736 rcu_read_unlock(); 2737 2738 return memcg; 2739 } 2740 2741 #ifdef CONFIG_MEMCG_KMEM 2742 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 2743 gfp_t gfp, bool new_page) 2744 { 2745 unsigned int objects = objs_per_slab_page(s, page); 2746 unsigned long memcg_data; 2747 void *vec; 2748 2749 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2750 page_to_nid(page)); 2751 if (!vec) 2752 return -ENOMEM; 2753 2754 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2755 if (new_page) { 2756 /* 2757 * If the slab page is brand new and nobody can yet access 2758 * it's memcg_data, no synchronization is required and 2759 * memcg_data can be simply assigned. 2760 */ 2761 page->memcg_data = memcg_data; 2762 } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { 2763 /* 2764 * If the slab page is already in use, somebody can allocate 2765 * and assign obj_cgroups in parallel. In this case the existing 2766 * objcg vector should be reused. 2767 */ 2768 kfree(vec); 2769 return 0; 2770 } 2771 2772 kmemleak_not_leak(vec); 2773 return 0; 2774 } 2775 2776 /* 2777 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2778 * 2779 * A passed kernel object can be a slab object or a generic kernel page, so 2780 * different mechanisms for getting the memory cgroup pointer should be used. 2781 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller 2782 * can not know for sure how the kernel object is implemented. 2783 * mem_cgroup_from_obj() can be safely used in such cases. 2784 * 2785 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2786 * cgroup_mutex, etc. 2787 */ 2788 struct mem_cgroup *mem_cgroup_from_obj(void *p) 2789 { 2790 struct page *page; 2791 2792 if (mem_cgroup_disabled()) 2793 return NULL; 2794 2795 page = virt_to_head_page(p); 2796 2797 /* 2798 * Slab objects are accounted individually, not per-page. 2799 * Memcg membership data for each individual object is saved in 2800 * the page->obj_cgroups. 
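 *
 * As a concrete illustration (assuming 4K pages and an order-0 slab
 * page): a kmalloc-512 page holds 8 objects, so page_objcgs(page)
 * points at an array of 8 obj_cgroup pointers and obj_to_index() below
 * selects the slot that corresponds to p.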
2801 */ 2802 if (page_objcgs_check(page)) { 2803 struct obj_cgroup *objcg; 2804 unsigned int off; 2805 2806 off = obj_to_index(page->slab_cache, page, p); 2807 objcg = page_objcgs(page)[off]; 2808 if (objcg) 2809 return obj_cgroup_memcg(objcg); 2810 2811 return NULL; 2812 } 2813 2814 /* 2815 * page_memcg_check() is used here, because page_has_obj_cgroups() 2816 * check above could fail because the object cgroups vector wasn't set 2817 * at that moment, but it can be set concurrently. 2818 * page_memcg_check(page) will guarantee that a proper memory 2819 * cgroup pointer or NULL will be returned. 2820 */ 2821 return page_memcg_check(page); 2822 } 2823 2824 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 2825 { 2826 struct obj_cgroup *objcg = NULL; 2827 struct mem_cgroup *memcg; 2828 2829 if (memcg_kmem_bypass()) 2830 return NULL; 2831 2832 rcu_read_lock(); 2833 if (unlikely(active_memcg())) 2834 memcg = active_memcg(); 2835 else 2836 memcg = mem_cgroup_from_task(current); 2837 2838 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 2839 objcg = rcu_dereference(memcg->objcg); 2840 if (objcg && obj_cgroup_tryget(objcg)) 2841 break; 2842 objcg = NULL; 2843 } 2844 rcu_read_unlock(); 2845 2846 return objcg; 2847 } 2848 2849 static int memcg_alloc_cache_id(void) 2850 { 2851 int id, size; 2852 int err; 2853 2854 id = ida_simple_get(&memcg_cache_ida, 2855 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2856 if (id < 0) 2857 return id; 2858 2859 if (id < memcg_nr_cache_ids) 2860 return id; 2861 2862 /* 2863 * There's no space for the new id in memcg_caches arrays, 2864 * so we have to grow them. 2865 */ 2866 down_write(&memcg_cache_ids_sem); 2867 2868 size = 2 * (id + 1); 2869 if (size < MEMCG_CACHES_MIN_SIZE) 2870 size = MEMCG_CACHES_MIN_SIZE; 2871 else if (size > MEMCG_CACHES_MAX_SIZE) 2872 size = MEMCG_CACHES_MAX_SIZE; 2873 2874 err = memcg_update_all_list_lrus(size); 2875 if (!err) 2876 memcg_nr_cache_ids = size; 2877 2878 up_write(&memcg_cache_ids_sem); 2879 2880 if (err) { 2881 ida_simple_remove(&memcg_cache_ida, id); 2882 return err; 2883 } 2884 return id; 2885 } 2886 2887 static void memcg_free_cache_id(int id) 2888 { 2889 ida_simple_remove(&memcg_cache_ida, id); 2890 } 2891 2892 /* 2893 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 2894 * @objcg: object cgroup to uncharge 2895 * @nr_pages: number of pages to uncharge 2896 */ 2897 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 2898 unsigned int nr_pages) 2899 { 2900 struct mem_cgroup *memcg; 2901 2902 memcg = get_mem_cgroup_from_objcg(objcg); 2903 2904 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2905 page_counter_uncharge(&memcg->kmem, nr_pages); 2906 refill_stock(memcg, nr_pages); 2907 2908 css_put(&memcg->css); 2909 } 2910 2911 /* 2912 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 2913 * @objcg: object cgroup to charge 2914 * @gfp: reclaim mode 2915 * @nr_pages: number of pages to charge 2916 * 2917 * Returns 0 on success, an error code on failure. 
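 *
 * As an illustration of how byte-granular charges arrive here (assuming
 * 4K pages): obj_cgroup_charge() below rounds a 600-byte request up to
 * one full page, charges that page through this function, and then parks
 * the unused 3496 bytes in the per-cpu objcg stock via refill_obj_stock().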
2918 */ 2919 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 2920 unsigned int nr_pages) 2921 { 2922 struct page_counter *counter; 2923 struct mem_cgroup *memcg; 2924 int ret; 2925 2926 memcg = get_mem_cgroup_from_objcg(objcg); 2927 2928 ret = try_charge(memcg, gfp, nr_pages); 2929 if (ret) 2930 goto out; 2931 2932 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && 2933 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { 2934 2935 /* 2936 * Enforce __GFP_NOFAIL allocation because callers are not 2937 * prepared to see failures and likely do not have any failure 2938 * handling code. 2939 */ 2940 if (gfp & __GFP_NOFAIL) { 2941 page_counter_charge(&memcg->kmem, nr_pages); 2942 goto out; 2943 } 2944 cancel_charge(memcg, nr_pages); 2945 ret = -ENOMEM; 2946 } 2947 out: 2948 css_put(&memcg->css); 2949 2950 return ret; 2951 } 2952 2953 /** 2954 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 2955 * @page: page to charge 2956 * @gfp: reclaim mode 2957 * @order: allocation order 2958 * 2959 * Returns 0 on success, an error code on failure. 2960 */ 2961 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 2962 { 2963 struct obj_cgroup *objcg; 2964 int ret = 0; 2965 2966 objcg = get_obj_cgroup_from_current(); 2967 if (objcg) { 2968 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 2969 if (!ret) { 2970 page->memcg_data = (unsigned long)objcg | 2971 MEMCG_DATA_KMEM; 2972 return 0; 2973 } 2974 obj_cgroup_put(objcg); 2975 } 2976 return ret; 2977 } 2978 2979 /** 2980 * __memcg_kmem_uncharge_page: uncharge a kmem page 2981 * @page: page to uncharge 2982 * @order: allocation order 2983 */ 2984 void __memcg_kmem_uncharge_page(struct page *page, int order) 2985 { 2986 struct obj_cgroup *objcg; 2987 unsigned int nr_pages = 1 << order; 2988 2989 if (!PageMemcgKmem(page)) 2990 return; 2991 2992 objcg = __page_objcg(page); 2993 obj_cgroup_uncharge_pages(objcg, nr_pages); 2994 page->memcg_data = 0; 2995 obj_cgroup_put(objcg); 2996 } 2997 2998 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 2999 { 3000 struct memcg_stock_pcp *stock; 3001 unsigned long flags; 3002 bool ret = false; 3003 3004 local_irq_save(flags); 3005 3006 stock = this_cpu_ptr(&memcg_stock); 3007 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3008 stock->nr_bytes -= nr_bytes; 3009 ret = true; 3010 } 3011 3012 local_irq_restore(flags); 3013 3014 return ret; 3015 } 3016 3017 static void drain_obj_stock(struct memcg_stock_pcp *stock) 3018 { 3019 struct obj_cgroup *old = stock->cached_objcg; 3020 3021 if (!old) 3022 return; 3023 3024 if (stock->nr_bytes) { 3025 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3026 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3027 3028 if (nr_pages) 3029 obj_cgroup_uncharge_pages(old, nr_pages); 3030 3031 /* 3032 * The leftover is flushed to the centralized per-memcg value. 3033 * On the next attempt to refill obj stock it will be moved 3034 * to a per-cpu stock (probably, on an other CPU), see 3035 * refill_obj_stock(). 3036 * 3037 * How often it's flushed is a trade-off between the memory 3038 * limit enforcement accuracy and potential CPU contention, 3039 * so it might be changed in the future. 
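 *
 * Worked example (assuming 4K pages): a cached stock of 10000 bytes
 * splits into nr_pages = 2 full pages, which are uncharged right away,
 * and a 1808-byte leftover, which is what gets added to
 * objcg->nr_charged_bytes below.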
3040 */ 3041 atomic_add(nr_bytes, &old->nr_charged_bytes); 3042 stock->nr_bytes = 0; 3043 } 3044 3045 obj_cgroup_put(old); 3046 stock->cached_objcg = NULL; 3047 } 3048 3049 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3050 struct mem_cgroup *root_memcg) 3051 { 3052 struct mem_cgroup *memcg; 3053 3054 if (stock->cached_objcg) { 3055 memcg = obj_cgroup_memcg(stock->cached_objcg); 3056 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3057 return true; 3058 } 3059 3060 return false; 3061 } 3062 3063 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3064 { 3065 struct memcg_stock_pcp *stock; 3066 unsigned long flags; 3067 3068 local_irq_save(flags); 3069 3070 stock = this_cpu_ptr(&memcg_stock); 3071 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3072 drain_obj_stock(stock); 3073 obj_cgroup_get(objcg); 3074 stock->cached_objcg = objcg; 3075 stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); 3076 } 3077 stock->nr_bytes += nr_bytes; 3078 3079 if (stock->nr_bytes > PAGE_SIZE) 3080 drain_obj_stock(stock); 3081 3082 local_irq_restore(flags); 3083 } 3084 3085 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3086 { 3087 unsigned int nr_pages, nr_bytes; 3088 int ret; 3089 3090 if (consume_obj_stock(objcg, size)) 3091 return 0; 3092 3093 /* 3094 * In theory, memcg->nr_charged_bytes can have enough 3095 * pre-charged bytes to satisfy the allocation. However, 3096 * flushing memcg->nr_charged_bytes requires two atomic 3097 * operations, and memcg->nr_charged_bytes can't be big, 3098 * so it's better to ignore it and try grab some new pages. 3099 * memcg->nr_charged_bytes will be flushed in 3100 * refill_obj_stock(), called from this function or 3101 * independently later. 3102 */ 3103 nr_pages = size >> PAGE_SHIFT; 3104 nr_bytes = size & (PAGE_SIZE - 1); 3105 3106 if (nr_bytes) 3107 nr_pages += 1; 3108 3109 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3110 if (!ret && nr_bytes) 3111 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); 3112 3113 return ret; 3114 } 3115 3116 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3117 { 3118 refill_obj_stock(objcg, size); 3119 } 3120 3121 #endif /* CONFIG_MEMCG_KMEM */ 3122 3123 /* 3124 * Because page_memcg(head) is not set on tails, set it now. 3125 */ 3126 void split_page_memcg(struct page *head, unsigned int nr) 3127 { 3128 struct mem_cgroup *memcg = page_memcg(head); 3129 int i; 3130 3131 if (mem_cgroup_disabled() || !memcg) 3132 return; 3133 3134 for (i = 1; i < nr; i++) 3135 head[i].memcg_data = head->memcg_data; 3136 3137 if (PageMemcgKmem(head)) 3138 obj_cgroup_get_many(__page_objcg(head), nr - 1); 3139 else 3140 css_get_many(&memcg->css, nr - 1); 3141 } 3142 3143 #ifdef CONFIG_MEMCG_SWAP 3144 /** 3145 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3146 * @entry: swap entry to be moved 3147 * @from: mem_cgroup which the entry is moved from 3148 * @to: mem_cgroup which the entry is moved to 3149 * 3150 * It succeeds only when the swap_cgroup's record for this entry is the same 3151 * as the mem_cgroup's id of @from. 3152 * 3153 * Returns 0 on success, -EINVAL on failure. 3154 * 3155 * The caller must have charged to @to, IOW, called page_counter_charge() about 3156 * both res and memsw, and called css_get(). 
3157 */ 3158 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3159 struct mem_cgroup *from, struct mem_cgroup *to) 3160 { 3161 unsigned short old_id, new_id; 3162 3163 old_id = mem_cgroup_id(from); 3164 new_id = mem_cgroup_id(to); 3165 3166 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3167 mod_memcg_state(from, MEMCG_SWAP, -1); 3168 mod_memcg_state(to, MEMCG_SWAP, 1); 3169 return 0; 3170 } 3171 return -EINVAL; 3172 } 3173 #else 3174 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3175 struct mem_cgroup *from, struct mem_cgroup *to) 3176 { 3177 return -EINVAL; 3178 } 3179 #endif 3180 3181 static DEFINE_MUTEX(memcg_max_mutex); 3182 3183 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3184 unsigned long max, bool memsw) 3185 { 3186 bool enlarge = false; 3187 bool drained = false; 3188 int ret; 3189 bool limits_invariant; 3190 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3191 3192 do { 3193 if (signal_pending(current)) { 3194 ret = -EINTR; 3195 break; 3196 } 3197 3198 mutex_lock(&memcg_max_mutex); 3199 /* 3200 * Make sure that the new limit (memsw or memory limit) doesn't 3201 * break our basic invariant rule memory.max <= memsw.max. 3202 */ 3203 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3204 max <= memcg->memsw.max; 3205 if (!limits_invariant) { 3206 mutex_unlock(&memcg_max_mutex); 3207 ret = -EINVAL; 3208 break; 3209 } 3210 if (max > counter->max) 3211 enlarge = true; 3212 ret = page_counter_set_max(counter, max); 3213 mutex_unlock(&memcg_max_mutex); 3214 3215 if (!ret) 3216 break; 3217 3218 if (!drained) { 3219 drain_all_stock(memcg); 3220 drained = true; 3221 continue; 3222 } 3223 3224 if (!try_to_free_mem_cgroup_pages(memcg, 1, 3225 GFP_KERNEL, !memsw)) { 3226 ret = -EBUSY; 3227 break; 3228 } 3229 } while (true); 3230 3231 if (!ret && enlarge) 3232 memcg_oom_recover(memcg); 3233 3234 return ret; 3235 } 3236 3237 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3238 gfp_t gfp_mask, 3239 unsigned long *total_scanned) 3240 { 3241 unsigned long nr_reclaimed = 0; 3242 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3243 unsigned long reclaimed; 3244 int loop = 0; 3245 struct mem_cgroup_tree_per_node *mctz; 3246 unsigned long excess; 3247 unsigned long nr_scanned; 3248 3249 if (order > 0) 3250 return 0; 3251 3252 mctz = soft_limit_tree_node(pgdat->node_id); 3253 3254 /* 3255 * Do not even bother to check the largest node if the root 3256 * is empty. Do it lockless to prevent lock bouncing. Races 3257 * are acceptable as soft limit is best effort anyway. 
3258 */ 3259 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3260 return 0; 3261 3262 /* 3263 * This loop can run a while, specially if mem_cgroup's continuously 3264 * keep exceeding their soft limit and putting the system under 3265 * pressure 3266 */ 3267 do { 3268 if (next_mz) 3269 mz = next_mz; 3270 else 3271 mz = mem_cgroup_largest_soft_limit_node(mctz); 3272 if (!mz) 3273 break; 3274 3275 nr_scanned = 0; 3276 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3277 gfp_mask, &nr_scanned); 3278 nr_reclaimed += reclaimed; 3279 *total_scanned += nr_scanned; 3280 spin_lock_irq(&mctz->lock); 3281 __mem_cgroup_remove_exceeded(mz, mctz); 3282 3283 /* 3284 * If we failed to reclaim anything from this memory cgroup 3285 * it is time to move on to the next cgroup 3286 */ 3287 next_mz = NULL; 3288 if (!reclaimed) 3289 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3290 3291 excess = soft_limit_excess(mz->memcg); 3292 /* 3293 * One school of thought says that we should not add 3294 * back the node to the tree if reclaim returns 0. 3295 * But our reclaim could return 0, simply because due 3296 * to priority we are exposing a smaller subset of 3297 * memory to reclaim from. Consider this as a longer 3298 * term TODO. 3299 */ 3300 /* If excess == 0, no tree ops */ 3301 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3302 spin_unlock_irq(&mctz->lock); 3303 css_put(&mz->memcg->css); 3304 loop++; 3305 /* 3306 * Could not reclaim anything and there are no more 3307 * mem cgroups to try or we seem to be looping without 3308 * reclaiming anything. 3309 */ 3310 if (!nr_reclaimed && 3311 (next_mz == NULL || 3312 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3313 break; 3314 } while (!nr_reclaimed); 3315 if (next_mz) 3316 css_put(&next_mz->memcg->css); 3317 return nr_reclaimed; 3318 } 3319 3320 /* 3321 * Reclaims as many pages from the given memcg as possible. 3322 * 3323 * Caller is responsible for holding css reference for memcg. 3324 */ 3325 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3326 { 3327 int nr_retries = MAX_RECLAIM_RETRIES; 3328 3329 /* we call try-to-free pages for make this cgroup empty */ 3330 lru_add_drain_all(); 3331 3332 drain_all_stock(memcg); 3333 3334 /* try to free all pages in this cgroup */ 3335 while (nr_retries && page_counter_read(&memcg->memory)) { 3336 int progress; 3337 3338 if (signal_pending(current)) 3339 return -EINTR; 3340 3341 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3342 GFP_KERNEL, true); 3343 if (!progress) { 3344 nr_retries--; 3345 /* maybe some writeback is necessary */ 3346 congestion_wait(BLK_RW_ASYNC, HZ/10); 3347 } 3348 3349 } 3350 3351 return 0; 3352 } 3353 3354 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3355 char *buf, size_t nbytes, 3356 loff_t off) 3357 { 3358 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3359 3360 if (mem_cgroup_is_root(memcg)) 3361 return -EINVAL; 3362 return mem_cgroup_force_empty(memcg) ?: nbytes; 3363 } 3364 3365 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3366 struct cftype *cft) 3367 { 3368 return 1; 3369 } 3370 3371 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3372 struct cftype *cft, u64 val) 3373 { 3374 if (val == 1) 3375 return 0; 3376 3377 pr_warn_once("Non-hierarchical mode is deprecated. 
" 3378 "Please report your usecase to linux-mm@kvack.org if you " 3379 "depend on this functionality.\n"); 3380 3381 return -EINVAL; 3382 } 3383 3384 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3385 { 3386 unsigned long val; 3387 3388 if (mem_cgroup_is_root(memcg)) { 3389 cgroup_rstat_flush(memcg->css.cgroup); 3390 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3391 memcg_page_state(memcg, NR_ANON_MAPPED); 3392 if (swap) 3393 val += memcg_page_state(memcg, MEMCG_SWAP); 3394 } else { 3395 if (!swap) 3396 val = page_counter_read(&memcg->memory); 3397 else 3398 val = page_counter_read(&memcg->memsw); 3399 } 3400 return val; 3401 } 3402 3403 enum { 3404 RES_USAGE, 3405 RES_LIMIT, 3406 RES_MAX_USAGE, 3407 RES_FAILCNT, 3408 RES_SOFT_LIMIT, 3409 }; 3410 3411 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3412 struct cftype *cft) 3413 { 3414 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3415 struct page_counter *counter; 3416 3417 switch (MEMFILE_TYPE(cft->private)) { 3418 case _MEM: 3419 counter = &memcg->memory; 3420 break; 3421 case _MEMSWAP: 3422 counter = &memcg->memsw; 3423 break; 3424 case _KMEM: 3425 counter = &memcg->kmem; 3426 break; 3427 case _TCP: 3428 counter = &memcg->tcpmem; 3429 break; 3430 default: 3431 BUG(); 3432 } 3433 3434 switch (MEMFILE_ATTR(cft->private)) { 3435 case RES_USAGE: 3436 if (counter == &memcg->memory) 3437 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3438 if (counter == &memcg->memsw) 3439 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3440 return (u64)page_counter_read(counter) * PAGE_SIZE; 3441 case RES_LIMIT: 3442 return (u64)counter->max * PAGE_SIZE; 3443 case RES_MAX_USAGE: 3444 return (u64)counter->watermark * PAGE_SIZE; 3445 case RES_FAILCNT: 3446 return counter->failcnt; 3447 case RES_SOFT_LIMIT: 3448 return (u64)memcg->soft_limit * PAGE_SIZE; 3449 default: 3450 BUG(); 3451 } 3452 } 3453 3454 #ifdef CONFIG_MEMCG_KMEM 3455 static int memcg_online_kmem(struct mem_cgroup *memcg) 3456 { 3457 struct obj_cgroup *objcg; 3458 int memcg_id; 3459 3460 if (cgroup_memory_nokmem) 3461 return 0; 3462 3463 BUG_ON(memcg->kmemcg_id >= 0); 3464 BUG_ON(memcg->kmem_state); 3465 3466 memcg_id = memcg_alloc_cache_id(); 3467 if (memcg_id < 0) 3468 return memcg_id; 3469 3470 objcg = obj_cgroup_alloc(); 3471 if (!objcg) { 3472 memcg_free_cache_id(memcg_id); 3473 return -ENOMEM; 3474 } 3475 objcg->memcg = memcg; 3476 rcu_assign_pointer(memcg->objcg, objcg); 3477 3478 static_branch_enable(&memcg_kmem_enabled_key); 3479 3480 memcg->kmemcg_id = memcg_id; 3481 memcg->kmem_state = KMEM_ONLINE; 3482 3483 return 0; 3484 } 3485 3486 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3487 { 3488 struct cgroup_subsys_state *css; 3489 struct mem_cgroup *parent, *child; 3490 int kmemcg_id; 3491 3492 if (memcg->kmem_state != KMEM_ONLINE) 3493 return; 3494 3495 memcg->kmem_state = KMEM_ALLOCATED; 3496 3497 parent = parent_mem_cgroup(memcg); 3498 if (!parent) 3499 parent = root_mem_cgroup; 3500 3501 memcg_reparent_objcgs(memcg, parent); 3502 3503 kmemcg_id = memcg->kmemcg_id; 3504 BUG_ON(kmemcg_id < 0); 3505 3506 /* 3507 * Change kmemcg_id of this cgroup and all its descendants to the 3508 * parent's id, and then move all entries from this cgroup's list_lrus 3509 * to ones of the parent. After we have finished, all list_lrus 3510 * corresponding to this cgroup are guaranteed to remain empty. The 3511 * ordering is imposed by list_lru_node->lock taken by 3512 * memcg_drain_all_list_lrus(). 
3513 */ 3514 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3515 css_for_each_descendant_pre(css, &memcg->css) { 3516 child = mem_cgroup_from_css(css); 3517 BUG_ON(child->kmemcg_id != kmemcg_id); 3518 child->kmemcg_id = parent->kmemcg_id; 3519 } 3520 rcu_read_unlock(); 3521 3522 memcg_drain_all_list_lrus(kmemcg_id, parent); 3523 3524 memcg_free_cache_id(kmemcg_id); 3525 } 3526 3527 static void memcg_free_kmem(struct mem_cgroup *memcg) 3528 { 3529 /* css_alloc() failed, offlining didn't happen */ 3530 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3531 memcg_offline_kmem(memcg); 3532 } 3533 #else 3534 static int memcg_online_kmem(struct mem_cgroup *memcg) 3535 { 3536 return 0; 3537 } 3538 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3539 { 3540 } 3541 static void memcg_free_kmem(struct mem_cgroup *memcg) 3542 { 3543 } 3544 #endif /* CONFIG_MEMCG_KMEM */ 3545 3546 static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3547 unsigned long max) 3548 { 3549 int ret; 3550 3551 mutex_lock(&memcg_max_mutex); 3552 ret = page_counter_set_max(&memcg->kmem, max); 3553 mutex_unlock(&memcg_max_mutex); 3554 return ret; 3555 } 3556 3557 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3558 { 3559 int ret; 3560 3561 mutex_lock(&memcg_max_mutex); 3562 3563 ret = page_counter_set_max(&memcg->tcpmem, max); 3564 if (ret) 3565 goto out; 3566 3567 if (!memcg->tcpmem_active) { 3568 /* 3569 * The active flag needs to be written after the static_key 3570 * update. This is what guarantees that the socket activation 3571 * function is the last one to run. See mem_cgroup_sk_alloc() 3572 * for details, and note that we don't mark any socket as 3573 * belonging to this memcg until that flag is up. 3574 * 3575 * We need to do this, because static_keys will span multiple 3576 * sites, but we can't control their order. If we mark a socket 3577 * as accounted, but the accounting functions are not patched in 3578 * yet, we'll lose accounting. 3579 * 3580 * We never race with the readers in mem_cgroup_sk_alloc(), 3581 * because when this value change, the code to process it is not 3582 * patched in yet. 3583 */ 3584 static_branch_inc(&memcg_sockets_enabled_key); 3585 memcg->tcpmem_active = true; 3586 } 3587 out: 3588 mutex_unlock(&memcg_max_mutex); 3589 return ret; 3590 } 3591 3592 /* 3593 * The user of this function is... 3594 * RES_LIMIT. 3595 */ 3596 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3597 char *buf, size_t nbytes, loff_t off) 3598 { 3599 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3600 unsigned long nr_pages; 3601 int ret; 3602 3603 buf = strstrip(buf); 3604 ret = page_counter_memparse(buf, "-1", &nr_pages); 3605 if (ret) 3606 return ret; 3607 3608 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3609 case RES_LIMIT: 3610 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3611 ret = -EINVAL; 3612 break; 3613 } 3614 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3615 case _MEM: 3616 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3617 break; 3618 case _MEMSWAP: 3619 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3620 break; 3621 case _KMEM: 3622 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. 
" 3623 "Please report your usecase to linux-mm@kvack.org if you " 3624 "depend on this functionality.\n"); 3625 ret = memcg_update_kmem_max(memcg, nr_pages); 3626 break; 3627 case _TCP: 3628 ret = memcg_update_tcp_max(memcg, nr_pages); 3629 break; 3630 } 3631 break; 3632 case RES_SOFT_LIMIT: 3633 memcg->soft_limit = nr_pages; 3634 ret = 0; 3635 break; 3636 } 3637 return ret ?: nbytes; 3638 } 3639 3640 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3641 size_t nbytes, loff_t off) 3642 { 3643 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3644 struct page_counter *counter; 3645 3646 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3647 case _MEM: 3648 counter = &memcg->memory; 3649 break; 3650 case _MEMSWAP: 3651 counter = &memcg->memsw; 3652 break; 3653 case _KMEM: 3654 counter = &memcg->kmem; 3655 break; 3656 case _TCP: 3657 counter = &memcg->tcpmem; 3658 break; 3659 default: 3660 BUG(); 3661 } 3662 3663 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3664 case RES_MAX_USAGE: 3665 page_counter_reset_watermark(counter); 3666 break; 3667 case RES_FAILCNT: 3668 counter->failcnt = 0; 3669 break; 3670 default: 3671 BUG(); 3672 } 3673 3674 return nbytes; 3675 } 3676 3677 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3678 struct cftype *cft) 3679 { 3680 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3681 } 3682 3683 #ifdef CONFIG_MMU 3684 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3685 struct cftype *cft, u64 val) 3686 { 3687 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3688 3689 if (val & ~MOVE_MASK) 3690 return -EINVAL; 3691 3692 /* 3693 * No kind of locking is needed in here, because ->can_attach() will 3694 * check this value once in the beginning of the process, and then carry 3695 * on with stale data. This means that changes to this value will only 3696 * affect task migrations starting after the change. 
3697 */ 3698 memcg->move_charge_at_immigrate = val; 3699 return 0; 3700 } 3701 #else 3702 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3703 struct cftype *cft, u64 val) 3704 { 3705 return -ENOSYS; 3706 } 3707 #endif 3708 3709 #ifdef CONFIG_NUMA 3710 3711 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3712 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3713 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3714 3715 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3716 int nid, unsigned int lru_mask, bool tree) 3717 { 3718 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3719 unsigned long nr = 0; 3720 enum lru_list lru; 3721 3722 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3723 3724 for_each_lru(lru) { 3725 if (!(BIT(lru) & lru_mask)) 3726 continue; 3727 if (tree) 3728 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3729 else 3730 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3731 } 3732 return nr; 3733 } 3734 3735 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3736 unsigned int lru_mask, 3737 bool tree) 3738 { 3739 unsigned long nr = 0; 3740 enum lru_list lru; 3741 3742 for_each_lru(lru) { 3743 if (!(BIT(lru) & lru_mask)) 3744 continue; 3745 if (tree) 3746 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3747 else 3748 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3749 } 3750 return nr; 3751 } 3752 3753 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3754 { 3755 struct numa_stat { 3756 const char *name; 3757 unsigned int lru_mask; 3758 }; 3759 3760 static const struct numa_stat stats[] = { 3761 { "total", LRU_ALL }, 3762 { "file", LRU_ALL_FILE }, 3763 { "anon", LRU_ALL_ANON }, 3764 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3765 }; 3766 const struct numa_stat *stat; 3767 int nid; 3768 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3769 3770 cgroup_rstat_flush(memcg->css.cgroup); 3771 3772 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3773 seq_printf(m, "%s=%lu", stat->name, 3774 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3775 false)); 3776 for_each_node_state(nid, N_MEMORY) 3777 seq_printf(m, " N%d=%lu", nid, 3778 mem_cgroup_node_nr_lru_pages(memcg, nid, 3779 stat->lru_mask, false)); 3780 seq_putc(m, '\n'); 3781 } 3782 3783 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3784 3785 seq_printf(m, "hierarchical_%s=%lu", stat->name, 3786 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3787 true)); 3788 for_each_node_state(nid, N_MEMORY) 3789 seq_printf(m, " N%d=%lu", nid, 3790 mem_cgroup_node_nr_lru_pages(memcg, nid, 3791 stat->lru_mask, true)); 3792 seq_putc(m, '\n'); 3793 } 3794 3795 return 0; 3796 } 3797 #endif /* CONFIG_NUMA */ 3798 3799 static const unsigned int memcg1_stats[] = { 3800 NR_FILE_PAGES, 3801 NR_ANON_MAPPED, 3802 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3803 NR_ANON_THPS, 3804 #endif 3805 NR_SHMEM, 3806 NR_FILE_MAPPED, 3807 NR_FILE_DIRTY, 3808 NR_WRITEBACK, 3809 MEMCG_SWAP, 3810 }; 3811 3812 static const char *const memcg1_stat_names[] = { 3813 "cache", 3814 "rss", 3815 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3816 "rss_huge", 3817 #endif 3818 "shmem", 3819 "mapped_file", 3820 "dirty", 3821 "writeback", 3822 "swap", 3823 }; 3824 3825 /* Universal VM events cgroup1 shows, original sort order */ 3826 static const unsigned int memcg1_events[] = { 3827 PGPGIN, 3828 PGPGOUT, 3829 PGFAULT, 3830 PGMAJFAULT, 3831 }; 3832 3833 static int memcg_stat_show(struct seq_file *m, void *v) 3834 { 3835 
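	/*
	 * This backs the cgroup-v1 memory.stat file.  The output is
	 * "<name> <value>" pairs; an excerpt with made-up numbers looks
	 * like:
	 *
	 *	cache 1234567168
	 *	rss 104857600
	 *	...
	 *	pgpgin 52183
	 *	...
	 *	hierarchical_memory_limit 9223372036854771712
	 *	total_cache 1234567168
	 *	...
	 *
	 * Page-based counters are reported in bytes, VM event counters as
	 * raw counts, and the "total_" entries cover the whole subtree.
	 */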
struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3836 unsigned long memory, memsw; 3837 struct mem_cgroup *mi; 3838 unsigned int i; 3839 3840 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 3841 3842 cgroup_rstat_flush(memcg->css.cgroup); 3843 3844 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3845 unsigned long nr; 3846 3847 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3848 continue; 3849 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 3850 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 3851 } 3852 3853 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3854 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 3855 memcg_events_local(memcg, memcg1_events[i])); 3856 3857 for (i = 0; i < NR_LRU_LISTS; i++) 3858 seq_printf(m, "%s %lu\n", lru_list_name(i), 3859 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 3860 PAGE_SIZE); 3861 3862 /* Hierarchical information */ 3863 memory = memsw = PAGE_COUNTER_MAX; 3864 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3865 memory = min(memory, READ_ONCE(mi->memory.max)); 3866 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 3867 } 3868 seq_printf(m, "hierarchical_memory_limit %llu\n", 3869 (u64)memory * PAGE_SIZE); 3870 if (do_memsw_account()) 3871 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3872 (u64)memsw * PAGE_SIZE); 3873 3874 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3875 unsigned long nr; 3876 3877 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3878 continue; 3879 nr = memcg_page_state(memcg, memcg1_stats[i]); 3880 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 3881 (u64)nr * PAGE_SIZE); 3882 } 3883 3884 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3885 seq_printf(m, "total_%s %llu\n", 3886 vm_event_name(memcg1_events[i]), 3887 (u64)memcg_events(memcg, memcg1_events[i])); 3888 3889 for (i = 0; i < NR_LRU_LISTS; i++) 3890 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 3891 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 3892 PAGE_SIZE); 3893 3894 #ifdef CONFIG_DEBUG_VM 3895 { 3896 pg_data_t *pgdat; 3897 struct mem_cgroup_per_node *mz; 3898 unsigned long anon_cost = 0; 3899 unsigned long file_cost = 0; 3900 3901 for_each_online_pgdat(pgdat) { 3902 mz = memcg->nodeinfo[pgdat->node_id]; 3903 3904 anon_cost += mz->lruvec.anon_cost; 3905 file_cost += mz->lruvec.file_cost; 3906 } 3907 seq_printf(m, "anon_cost %lu\n", anon_cost); 3908 seq_printf(m, "file_cost %lu\n", file_cost); 3909 } 3910 #endif 3911 3912 return 0; 3913 } 3914 3915 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 3916 struct cftype *cft) 3917 { 3918 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3919 3920 return mem_cgroup_swappiness(memcg); 3921 } 3922 3923 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 3924 struct cftype *cft, u64 val) 3925 { 3926 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3927 3928 if (val > 100) 3929 return -EINVAL; 3930 3931 if (!mem_cgroup_is_root(memcg)) 3932 memcg->swappiness = val; 3933 else 3934 vm_swappiness = val; 3935 3936 return 0; 3937 } 3938 3939 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3940 { 3941 struct mem_cgroup_threshold_ary *t; 3942 unsigned long usage; 3943 int i; 3944 3945 rcu_read_lock(); 3946 if (!swap) 3947 t = rcu_dereference(memcg->thresholds.primary); 3948 else 3949 t = rcu_dereference(memcg->memsw_thresholds.primary); 3950 3951 if (!t) 3952 goto unlock; 3953 3954 usage = mem_cgroup_usage(memcg, swap); 3955 3956 /* 3957 * current_threshold 
points to threshold just below or equal to usage. 3958 * If it's not true, a threshold was crossed after last 3959 * call of __mem_cgroup_threshold(). 3960 */ 3961 i = t->current_threshold; 3962 3963 /* 3964 * Iterate backward over array of thresholds starting from 3965 * current_threshold and check if a threshold is crossed. 3966 * If none of thresholds below usage is crossed, we read 3967 * only one element of the array here. 3968 */ 3969 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3970 eventfd_signal(t->entries[i].eventfd, 1); 3971 3972 /* i = current_threshold + 1 */ 3973 i++; 3974 3975 /* 3976 * Iterate forward over array of thresholds starting from 3977 * current_threshold+1 and check if a threshold is crossed. 3978 * If none of thresholds above usage is crossed, we read 3979 * only one element of the array here. 3980 */ 3981 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 3982 eventfd_signal(t->entries[i].eventfd, 1); 3983 3984 /* Update current_threshold */ 3985 t->current_threshold = i - 1; 3986 unlock: 3987 rcu_read_unlock(); 3988 } 3989 3990 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3991 { 3992 while (memcg) { 3993 __mem_cgroup_threshold(memcg, false); 3994 if (do_memsw_account()) 3995 __mem_cgroup_threshold(memcg, true); 3996 3997 memcg = parent_mem_cgroup(memcg); 3998 } 3999 } 4000 4001 static int compare_thresholds(const void *a, const void *b) 4002 { 4003 const struct mem_cgroup_threshold *_a = a; 4004 const struct mem_cgroup_threshold *_b = b; 4005 4006 if (_a->threshold > _b->threshold) 4007 return 1; 4008 4009 if (_a->threshold < _b->threshold) 4010 return -1; 4011 4012 return 0; 4013 } 4014 4015 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4016 { 4017 struct mem_cgroup_eventfd_list *ev; 4018 4019 spin_lock(&memcg_oom_lock); 4020 4021 list_for_each_entry(ev, &memcg->oom_notify, list) 4022 eventfd_signal(ev->eventfd, 1); 4023 4024 spin_unlock(&memcg_oom_lock); 4025 return 0; 4026 } 4027 4028 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4029 { 4030 struct mem_cgroup *iter; 4031 4032 for_each_mem_cgroup_tree(iter, memcg) 4033 mem_cgroup_oom_notify_cb(iter); 4034 } 4035 4036 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4037 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4038 { 4039 struct mem_cgroup_thresholds *thresholds; 4040 struct mem_cgroup_threshold_ary *new; 4041 unsigned long threshold; 4042 unsigned long usage; 4043 int i, size, ret; 4044 4045 ret = page_counter_memparse(args, "-1", &threshold); 4046 if (ret) 4047 return ret; 4048 4049 mutex_lock(&memcg->thresholds_lock); 4050 4051 if (type == _MEM) { 4052 thresholds = &memcg->thresholds; 4053 usage = mem_cgroup_usage(memcg, false); 4054 } else if (type == _MEMSWAP) { 4055 thresholds = &memcg->memsw_thresholds; 4056 usage = mem_cgroup_usage(memcg, true); 4057 } else 4058 BUG(); 4059 4060 /* Check if a threshold crossed before adding a new one */ 4061 if (thresholds->primary) 4062 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4063 4064 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4065 4066 /* Allocate memory for new array of thresholds */ 4067 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4068 if (!new) { 4069 ret = -ENOMEM; 4070 goto unlock; 4071 } 4072 new->size = size; 4073 4074 /* Copy thresholds (if any) to new array */ 4075 if (thresholds->primary) 4076 memcpy(new->entries, thresholds->primary->entries, 4077 flex_array_size(new, entries, size - 1)); 4078 4079 /* Add new threshold */ 4080 new->entries[size - 1].eventfd = eventfd; 4081 new->entries[size - 1].threshold = threshold; 4082 4083 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4084 sort(new->entries, size, sizeof(*new->entries), 4085 compare_thresholds, NULL); 4086 4087 /* Find current threshold */ 4088 new->current_threshold = -1; 4089 for (i = 0; i < size; i++) { 4090 if (new->entries[i].threshold <= usage) { 4091 /* 4092 * new->current_threshold will not be used until 4093 * rcu_assign_pointer(), so it's safe to increment 4094 * it here. 4095 */ 4096 ++new->current_threshold; 4097 } else 4098 break; 4099 } 4100 4101 /* Free old spare buffer and save old primary buffer as spare */ 4102 kfree(thresholds->spare); 4103 thresholds->spare = thresholds->primary; 4104 4105 rcu_assign_pointer(thresholds->primary, new); 4106 4107 /* To be sure that nobody uses thresholds */ 4108 synchronize_rcu(); 4109 4110 unlock: 4111 mutex_unlock(&memcg->thresholds_lock); 4112 4113 return ret; 4114 } 4115 4116 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4117 struct eventfd_ctx *eventfd, const char *args) 4118 { 4119 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4120 } 4121 4122 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4123 struct eventfd_ctx *eventfd, const char *args) 4124 { 4125 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4126 } 4127 4128 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4129 struct eventfd_ctx *eventfd, enum res_type type) 4130 { 4131 struct mem_cgroup_thresholds *thresholds; 4132 struct mem_cgroup_threshold_ary *new; 4133 unsigned long usage; 4134 int i, j, size, entries; 4135 4136 mutex_lock(&memcg->thresholds_lock); 4137 4138 if (type == _MEM) { 4139 thresholds = &memcg->thresholds; 4140 usage = mem_cgroup_usage(memcg, false); 4141 } else if (type == _MEMSWAP) { 4142 thresholds = &memcg->memsw_thresholds; 4143 usage = mem_cgroup_usage(memcg, true); 4144 } else 4145 BUG(); 4146 4147 if (!thresholds->primary) 4148 goto unlock; 4149 4150 /* Check if a threshold crossed before removing */ 4151 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4152 4153 /* Calculate new number of threshold */ 4154 size = entries = 0; 4155 for (i = 0; i < thresholds->primary->size; i++) { 4156 if (thresholds->primary->entries[i].eventfd != eventfd) 4157 size++; 4158 else 4159 entries++; 4160 } 4161 4162 new = thresholds->spare; 4163 4164 /* If no items related to eventfd have been cleared, nothing to do */ 4165 if (!entries) 4166 goto unlock; 4167 4168 /* Set thresholds array to NULL if we don't have thresholds */ 4169 if (!size) { 4170 kfree(new); 4171 new = NULL; 4172 goto swap_buffers; 4173 } 4174 4175 new->size = size; 4176 4177 /* Copy thresholds and find current threshold */ 4178 new->current_threshold = -1; 4179 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4180 if (thresholds->primary->entries[i].eventfd == eventfd) 4181 continue; 4182 4183 new->entries[j] = thresholds->primary->entries[i]; 4184 if 
(new->entries[j].threshold <= usage) { 4185 /* 4186 * new->current_threshold will not be used 4187 * until rcu_assign_pointer(), so it's safe to increment 4188 * it here. 4189 */ 4190 ++new->current_threshold; 4191 } 4192 j++; 4193 } 4194 4195 swap_buffers: 4196 /* Swap primary and spare array */ 4197 thresholds->spare = thresholds->primary; 4198 4199 rcu_assign_pointer(thresholds->primary, new); 4200 4201 /* To be sure that nobody uses thresholds */ 4202 synchronize_rcu(); 4203 4204 /* If all events are unregistered, free the spare array */ 4205 if (!new) { 4206 kfree(thresholds->spare); 4207 thresholds->spare = NULL; 4208 } 4209 unlock: 4210 mutex_unlock(&memcg->thresholds_lock); 4211 } 4212 4213 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4214 struct eventfd_ctx *eventfd) 4215 { 4216 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4217 } 4218 4219 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4220 struct eventfd_ctx *eventfd) 4221 { 4222 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4223 } 4224 4225 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4226 struct eventfd_ctx *eventfd, const char *args) 4227 { 4228 struct mem_cgroup_eventfd_list *event; 4229 4230 event = kmalloc(sizeof(*event), GFP_KERNEL); 4231 if (!event) 4232 return -ENOMEM; 4233 4234 spin_lock(&memcg_oom_lock); 4235 4236 event->eventfd = eventfd; 4237 list_add(&event->list, &memcg->oom_notify); 4238 4239 /* already in OOM ? */ 4240 if (memcg->under_oom) 4241 eventfd_signal(eventfd, 1); 4242 spin_unlock(&memcg_oom_lock); 4243 4244 return 0; 4245 } 4246 4247 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4248 struct eventfd_ctx *eventfd) 4249 { 4250 struct mem_cgroup_eventfd_list *ev, *tmp; 4251 4252 spin_lock(&memcg_oom_lock); 4253 4254 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4255 if (ev->eventfd == eventfd) { 4256 list_del(&ev->list); 4257 kfree(ev); 4258 } 4259 } 4260 4261 spin_unlock(&memcg_oom_lock); 4262 } 4263 4264 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4265 { 4266 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4267 4268 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4269 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4270 seq_printf(sf, "oom_kill %lu\n", 4271 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4272 return 0; 4273 } 4274 4275 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4276 struct cftype *cft, u64 val) 4277 { 4278 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4279 4280 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4281 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4282 return -EINVAL; 4283 4284 memcg->oom_kill_disable = val; 4285 if (!val) 4286 memcg_oom_recover(memcg); 4287 4288 return 0; 4289 } 4290 4291 #ifdef CONFIG_CGROUP_WRITEBACK 4292 4293 #include <trace/events/writeback.h> 4294 4295 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4296 { 4297 return wb_domain_init(&memcg->cgwb_domain, gfp); 4298 } 4299 4300 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4301 { 4302 wb_domain_exit(&memcg->cgwb_domain); 4303 } 4304 4305 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4306 { 4307 wb_domain_size_changed(&memcg->cgwb_domain); 4308 } 4309 4310 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4311 { 4312 struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); 4313 4314 if (!memcg->css.parent) 4315 return NULL; 4316 4317 return &memcg->cgwb_domain; 4318 } 4319 4320 /** 4321 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4322 * @wb: bdi_writeback in question 4323 * @pfilepages: out parameter for number of file pages 4324 * @pheadroom: out parameter for number of allocatable pages according to memcg 4325 * @pdirty: out parameter for number of dirty pages 4326 * @pwriteback: out parameter for number of pages under writeback 4327 * 4328 * Determine the numbers of file, headroom, dirty, and writeback pages in 4329 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4330 * is a bit more involved. 4331 * 4332 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4333 * headroom is calculated as the lowest headroom of itself and the 4334 * ancestors. Note that this doesn't consider the actual amount of 4335 * available memory in the system. The caller should further cap 4336 * *@pheadroom accordingly. 4337 */ 4338 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4339 unsigned long *pheadroom, unsigned long *pdirty, 4340 unsigned long *pwriteback) 4341 { 4342 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4343 struct mem_cgroup *parent; 4344 4345 cgroup_rstat_flush_irqsafe(memcg->css.cgroup); 4346 4347 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4348 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4349 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4350 memcg_page_state(memcg, NR_ACTIVE_FILE); 4351 4352 *pheadroom = PAGE_COUNTER_MAX; 4353 while ((parent = parent_mem_cgroup(memcg))) { 4354 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4355 READ_ONCE(memcg->memory.high)); 4356 unsigned long used = page_counter_read(&memcg->memory); 4357 4358 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4359 memcg = parent; 4360 } 4361 } 4362 4363 /* 4364 * Foreign dirty flushing 4365 * 4366 * There's an inherent mismatch between memcg and writeback. The former 4367 * tracks ownership per-page while the latter per-inode. This was a 4368 * deliberate design decision because honoring per-page ownership in the 4369 * writeback path is complicated, may lead to higher CPU and IO overheads 4370 * and deemed unnecessary given that write-sharing an inode across 4371 * different cgroups isn't a common use-case. 4372 * 4373 * Combined with inode majority-writer ownership switching, this works well 4374 * enough in most cases but there are some pathological cases. For 4375 * example, let's say there are two cgroups A and B which keep writing to 4376 * different but confined parts of the same inode. B owns the inode and 4377 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4378 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4379 * triggering background writeback. A will be slowed down without a way to 4380 * make writeback of the dirty pages happen. 4381 * 4382 * Conditions like the above can lead to a cgroup getting repeatedly and 4383 * severely throttled after making some progress after each 4384 * dirty_expire_interval while the underlying IO device is almost 4385 * completely idle. 4386 * 4387 * Solving this problem completely requires matching the ownership tracking 4388 * granularities between memcg and writeback in either direction. 
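* That would mean either tracking writeback ownership per page or charging
* memory per inode; both are invasive changes to their respective subsystems.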
However, 4389 * the more egregious behaviors can be avoided by simply remembering the 4390 * most recent foreign dirtying events and initiating remote flushes on 4391 * them when local writeback isn't enough to keep the memory clean enough. 4392 * 4393 * The following two functions implement such mechanism. When a foreign 4394 * page - a page whose memcg and writeback ownerships don't match - is 4395 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4396 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4397 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4398 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4399 * foreign bdi_writebacks which haven't expired. Both the numbers of 4400 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4401 * limited to MEMCG_CGWB_FRN_CNT. 4402 * 4403 * The mechanism only remembers IDs and doesn't hold any object references. 4404 * As being wrong occasionally doesn't matter, updates and accesses to the 4405 * records are lockless and racy. 4406 */ 4407 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4408 struct bdi_writeback *wb) 4409 { 4410 struct mem_cgroup *memcg = page_memcg(page); 4411 struct memcg_cgwb_frn *frn; 4412 u64 now = get_jiffies_64(); 4413 u64 oldest_at = now; 4414 int oldest = -1; 4415 int i; 4416 4417 trace_track_foreign_dirty(page, wb); 4418 4419 /* 4420 * Pick the slot to use. If there is already a slot for @wb, keep 4421 * using it. If not replace the oldest one which isn't being 4422 * written out. 4423 */ 4424 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4425 frn = &memcg->cgwb_frn[i]; 4426 if (frn->bdi_id == wb->bdi->id && 4427 frn->memcg_id == wb->memcg_css->id) 4428 break; 4429 if (time_before64(frn->at, oldest_at) && 4430 atomic_read(&frn->done.cnt) == 1) { 4431 oldest = i; 4432 oldest_at = frn->at; 4433 } 4434 } 4435 4436 if (i < MEMCG_CGWB_FRN_CNT) { 4437 /* 4438 * Re-using an existing one. Update timestamp lazily to 4439 * avoid making the cacheline hot. We want them to be 4440 * reasonably up-to-date and significantly shorter than 4441 * dirty_expire_interval as that's what expires the record. 4442 * Use the shorter of 1s and dirty_expire_interval / 8. 4443 */ 4444 unsigned long update_intv = 4445 min_t(unsigned long, HZ, 4446 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4447 4448 if (time_before64(frn->at, now - update_intv)) 4449 frn->at = now; 4450 } else if (oldest >= 0) { 4451 /* replace the oldest free one */ 4452 frn = &memcg->cgwb_frn[oldest]; 4453 frn->bdi_id = wb->bdi->id; 4454 frn->memcg_id = wb->memcg_css->id; 4455 frn->at = now; 4456 } 4457 } 4458 4459 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4460 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4461 { 4462 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4463 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4464 u64 now = jiffies_64; 4465 int i; 4466 4467 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4468 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4469 4470 /* 4471 * If the record is older than dirty_expire_interval, 4472 * writeback on it has already started. No need to kick it 4473 * off again. Also, don't start a new one if there's 4474 * already one in flight. 
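* Being "in flight" is tracked via frn->done: the completion count starts at
* one, so a count above one means writeback work queued by a previous flush
* has not finished yet.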
4475 */ 4476 if (time_after64(frn->at, now - intv) && 4477 atomic_read(&frn->done.cnt) == 1) { 4478 frn->at = 0; 4479 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4480 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 4481 WB_REASON_FOREIGN_FLUSH, 4482 &frn->done); 4483 } 4484 } 4485 } 4486 4487 #else /* CONFIG_CGROUP_WRITEBACK */ 4488 4489 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4490 { 4491 return 0; 4492 } 4493 4494 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4495 { 4496 } 4497 4498 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4499 { 4500 } 4501 4502 #endif /* CONFIG_CGROUP_WRITEBACK */ 4503 4504 /* 4505 * DO NOT USE IN NEW FILES. 4506 * 4507 * "cgroup.event_control" implementation. 4508 * 4509 * This is way over-engineered. It tries to support fully configurable 4510 * events for each user. Such level of flexibility is completely 4511 * unnecessary especially in the light of the planned unified hierarchy. 4512 * 4513 * Please deprecate this and replace with something simpler if at all 4514 * possible. 4515 */ 4516 4517 /* 4518 * Unregister event and free resources. 4519 * 4520 * Gets called from workqueue. 4521 */ 4522 static void memcg_event_remove(struct work_struct *work) 4523 { 4524 struct mem_cgroup_event *event = 4525 container_of(work, struct mem_cgroup_event, remove); 4526 struct mem_cgroup *memcg = event->memcg; 4527 4528 remove_wait_queue(event->wqh, &event->wait); 4529 4530 event->unregister_event(memcg, event->eventfd); 4531 4532 /* Notify userspace the event is going away. */ 4533 eventfd_signal(event->eventfd, 1); 4534 4535 eventfd_ctx_put(event->eventfd); 4536 kfree(event); 4537 css_put(&memcg->css); 4538 } 4539 4540 /* 4541 * Gets called on EPOLLHUP on eventfd when user closes it. 4542 * 4543 * Called with wqh->lock held and interrupts disabled. 4544 */ 4545 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4546 int sync, void *key) 4547 { 4548 struct mem_cgroup_event *event = 4549 container_of(wait, struct mem_cgroup_event, wait); 4550 struct mem_cgroup *memcg = event->memcg; 4551 __poll_t flags = key_to_poll(key); 4552 4553 if (flags & EPOLLHUP) { 4554 /* 4555 * If the event has been detached at cgroup removal, we 4556 * can simply return knowing the other side will cleanup 4557 * for us. 4558 * 4559 * We can't race against event freeing since the other 4560 * side will require wqh->lock via remove_wait_queue(), 4561 * which we hold. 4562 */ 4563 spin_lock(&memcg->event_list_lock); 4564 if (!list_empty(&event->list)) { 4565 list_del_init(&event->list); 4566 /* 4567 * We are in atomic context, but cgroup_event_remove() 4568 * may sleep, so we have to call it in workqueue. 4569 */ 4570 schedule_work(&event->remove); 4571 } 4572 spin_unlock(&memcg->event_list_lock); 4573 } 4574 4575 return 0; 4576 } 4577 4578 static void memcg_event_ptable_queue_proc(struct file *file, 4579 wait_queue_head_t *wqh, poll_table *pt) 4580 { 4581 struct mem_cgroup_event *event = 4582 container_of(pt, struct mem_cgroup_event, pt); 4583 4584 event->wqh = wqh; 4585 add_wait_queue(wqh, &event->wait); 4586 } 4587 4588 /* 4589 * DO NOT USE IN NEW FILES. 4590 * 4591 * Parse input and register new cgroup event handler. 4592 * 4593 * Input must be in format '<event_fd> <control_fd> <args>'. 4594 * Interpretation of args is defined by control file implementation. 
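*
* For example, writing "<event_fd> <fd of memory.usage_in_bytes> 1G" sets up
* a usage threshold notification, and "<event_fd> <fd of memory.oom_control>"
* registers an OOM notification; see the handlers selected by file name below.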
4595 */ 4596 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4597 char *buf, size_t nbytes, loff_t off) 4598 { 4599 struct cgroup_subsys_state *css = of_css(of); 4600 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4601 struct mem_cgroup_event *event; 4602 struct cgroup_subsys_state *cfile_css; 4603 unsigned int efd, cfd; 4604 struct fd efile; 4605 struct fd cfile; 4606 const char *name; 4607 char *endp; 4608 int ret; 4609 4610 buf = strstrip(buf); 4611 4612 efd = simple_strtoul(buf, &endp, 10); 4613 if (*endp != ' ') 4614 return -EINVAL; 4615 buf = endp + 1; 4616 4617 cfd = simple_strtoul(buf, &endp, 10); 4618 if ((*endp != ' ') && (*endp != '\0')) 4619 return -EINVAL; 4620 buf = endp + 1; 4621 4622 event = kzalloc(sizeof(*event), GFP_KERNEL); 4623 if (!event) 4624 return -ENOMEM; 4625 4626 event->memcg = memcg; 4627 INIT_LIST_HEAD(&event->list); 4628 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4629 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4630 INIT_WORK(&event->remove, memcg_event_remove); 4631 4632 efile = fdget(efd); 4633 if (!efile.file) { 4634 ret = -EBADF; 4635 goto out_kfree; 4636 } 4637 4638 event->eventfd = eventfd_ctx_fileget(efile.file); 4639 if (IS_ERR(event->eventfd)) { 4640 ret = PTR_ERR(event->eventfd); 4641 goto out_put_efile; 4642 } 4643 4644 cfile = fdget(cfd); 4645 if (!cfile.file) { 4646 ret = -EBADF; 4647 goto out_put_eventfd; 4648 } 4649 4650 /* the process need read permission on control file */ 4651 /* AV: shouldn't we check that it's been opened for read instead? */ 4652 ret = file_permission(cfile.file, MAY_READ); 4653 if (ret < 0) 4654 goto out_put_cfile; 4655 4656 /* 4657 * Determine the event callbacks and set them in @event. This used 4658 * to be done via struct cftype but cgroup core no longer knows 4659 * about these events. The following is crude but the whole thing 4660 * is for compatibility anyway. 4661 * 4662 * DO NOT ADD NEW FILES. 4663 */ 4664 name = cfile.file->f_path.dentry->d_name.name; 4665 4666 if (!strcmp(name, "memory.usage_in_bytes")) { 4667 event->register_event = mem_cgroup_usage_register_event; 4668 event->unregister_event = mem_cgroup_usage_unregister_event; 4669 } else if (!strcmp(name, "memory.oom_control")) { 4670 event->register_event = mem_cgroup_oom_register_event; 4671 event->unregister_event = mem_cgroup_oom_unregister_event; 4672 } else if (!strcmp(name, "memory.pressure_level")) { 4673 event->register_event = vmpressure_register_event; 4674 event->unregister_event = vmpressure_unregister_event; 4675 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4676 event->register_event = memsw_cgroup_usage_register_event; 4677 event->unregister_event = memsw_cgroup_usage_unregister_event; 4678 } else { 4679 ret = -EINVAL; 4680 goto out_put_cfile; 4681 } 4682 4683 /* 4684 * Verify @cfile should belong to @css. Also, remaining events are 4685 * automatically removed on cgroup destruction but the removal is 4686 * asynchronous, so take an extra ref on @css. 
4687 */ 4688 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4689 &memory_cgrp_subsys); 4690 ret = -EINVAL; 4691 if (IS_ERR(cfile_css)) 4692 goto out_put_cfile; 4693 if (cfile_css != css) { 4694 css_put(cfile_css); 4695 goto out_put_cfile; 4696 } 4697 4698 ret = event->register_event(memcg, event->eventfd, buf); 4699 if (ret) 4700 goto out_put_css; 4701 4702 vfs_poll(efile.file, &event->pt); 4703 4704 spin_lock(&memcg->event_list_lock); 4705 list_add(&event->list, &memcg->event_list); 4706 spin_unlock(&memcg->event_list_lock); 4707 4708 fdput(cfile); 4709 fdput(efile); 4710 4711 return nbytes; 4712 4713 out_put_css: 4714 css_put(css); 4715 out_put_cfile: 4716 fdput(cfile); 4717 out_put_eventfd: 4718 eventfd_ctx_put(event->eventfd); 4719 out_put_efile: 4720 fdput(efile); 4721 out_kfree: 4722 kfree(event); 4723 4724 return ret; 4725 } 4726 4727 static struct cftype mem_cgroup_legacy_files[] = { 4728 { 4729 .name = "usage_in_bytes", 4730 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4731 .read_u64 = mem_cgroup_read_u64, 4732 }, 4733 { 4734 .name = "max_usage_in_bytes", 4735 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4736 .write = mem_cgroup_reset, 4737 .read_u64 = mem_cgroup_read_u64, 4738 }, 4739 { 4740 .name = "limit_in_bytes", 4741 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4742 .write = mem_cgroup_write, 4743 .read_u64 = mem_cgroup_read_u64, 4744 }, 4745 { 4746 .name = "soft_limit_in_bytes", 4747 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4748 .write = mem_cgroup_write, 4749 .read_u64 = mem_cgroup_read_u64, 4750 }, 4751 { 4752 .name = "failcnt", 4753 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4754 .write = mem_cgroup_reset, 4755 .read_u64 = mem_cgroup_read_u64, 4756 }, 4757 { 4758 .name = "stat", 4759 .seq_show = memcg_stat_show, 4760 }, 4761 { 4762 .name = "force_empty", 4763 .write = mem_cgroup_force_empty_write, 4764 }, 4765 { 4766 .name = "use_hierarchy", 4767 .write_u64 = mem_cgroup_hierarchy_write, 4768 .read_u64 = mem_cgroup_hierarchy_read, 4769 }, 4770 { 4771 .name = "cgroup.event_control", /* XXX: for compat */ 4772 .write = memcg_write_event_control, 4773 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4774 }, 4775 { 4776 .name = "swappiness", 4777 .read_u64 = mem_cgroup_swappiness_read, 4778 .write_u64 = mem_cgroup_swappiness_write, 4779 }, 4780 { 4781 .name = "move_charge_at_immigrate", 4782 .read_u64 = mem_cgroup_move_charge_read, 4783 .write_u64 = mem_cgroup_move_charge_write, 4784 }, 4785 { 4786 .name = "oom_control", 4787 .seq_show = mem_cgroup_oom_control_read, 4788 .write_u64 = mem_cgroup_oom_control_write, 4789 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4790 }, 4791 { 4792 .name = "pressure_level", 4793 }, 4794 #ifdef CONFIG_NUMA 4795 { 4796 .name = "numa_stat", 4797 .seq_show = memcg_numa_stat_show, 4798 }, 4799 #endif 4800 { 4801 .name = "kmem.limit_in_bytes", 4802 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4803 .write = mem_cgroup_write, 4804 .read_u64 = mem_cgroup_read_u64, 4805 }, 4806 { 4807 .name = "kmem.usage_in_bytes", 4808 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4809 .read_u64 = mem_cgroup_read_u64, 4810 }, 4811 { 4812 .name = "kmem.failcnt", 4813 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4814 .write = mem_cgroup_reset, 4815 .read_u64 = mem_cgroup_read_u64, 4816 }, 4817 { 4818 .name = "kmem.max_usage_in_bytes", 4819 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4820 .write = mem_cgroup_reset, 4821 .read_u64 = mem_cgroup_read_u64, 4822 }, 4823 #if defined(CONFIG_MEMCG_KMEM) && \ 4824 
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 4825 { 4826 .name = "kmem.slabinfo", 4827 .seq_show = memcg_slab_show, 4828 }, 4829 #endif 4830 { 4831 .name = "kmem.tcp.limit_in_bytes", 4832 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 4833 .write = mem_cgroup_write, 4834 .read_u64 = mem_cgroup_read_u64, 4835 }, 4836 { 4837 .name = "kmem.tcp.usage_in_bytes", 4838 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 4839 .read_u64 = mem_cgroup_read_u64, 4840 }, 4841 { 4842 .name = "kmem.tcp.failcnt", 4843 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 4844 .write = mem_cgroup_reset, 4845 .read_u64 = mem_cgroup_read_u64, 4846 }, 4847 { 4848 .name = "kmem.tcp.max_usage_in_bytes", 4849 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 4850 .write = mem_cgroup_reset, 4851 .read_u64 = mem_cgroup_read_u64, 4852 }, 4853 { }, /* terminate */ 4854 }; 4855 4856 /* 4857 * Private memory cgroup IDR 4858 * 4859 * Swap-out records and page cache shadow entries need to store memcg 4860 * references in constrained space, so we maintain an ID space that is 4861 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 4862 * memory-controlled cgroups to 64k. 4863 * 4864 * However, there usually are many references to the offline CSS after 4865 * the cgroup has been destroyed, such as page cache or reclaimable 4866 * slab objects, that don't need to hang on to the ID. We want to keep 4867 * those dead CSS from occupying IDs, or we might quickly exhaust the 4868 * relatively small ID space and prevent the creation of new cgroups 4869 * even when there are much fewer than 64k cgroups - possibly none. 4870 * 4871 * Maintain a private 16-bit ID space for memcg, and allow the ID to 4872 * be freed and recycled when it's no longer needed, which is usually 4873 * when the CSS is offlined. 4874 * 4875 * The only exception to that are records of swapped out tmpfs/shmem 4876 * pages that need to be attributed to live ancestors on swapin. But 4877 * those references are manageable from userspace. 4878 */ 4879 4880 static DEFINE_IDR(mem_cgroup_idr); 4881 4882 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 4883 { 4884 if (memcg->id.id > 0) { 4885 idr_remove(&mem_cgroup_idr, memcg->id.id); 4886 memcg->id.id = 0; 4887 } 4888 } 4889 4890 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 4891 unsigned int n) 4892 { 4893 refcount_add(n, &memcg->id.ref); 4894 } 4895 4896 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 4897 { 4898 if (refcount_sub_and_test(n, &memcg->id.ref)) { 4899 mem_cgroup_id_remove(memcg); 4900 4901 /* Memcg ID pins CSS */ 4902 css_put(&memcg->css); 4903 } 4904 } 4905 4906 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4907 { 4908 mem_cgroup_id_put_many(memcg, 1); 4909 } 4910 4911 /** 4912 * mem_cgroup_from_id - look up a memcg from a memcg id 4913 * @id: the memcg id to look up 4914 * 4915 * Caller must hold rcu_read_lock(). 4916 */ 4917 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 4918 { 4919 WARN_ON_ONCE(!rcu_read_lock_held()); 4920 return idr_find(&mem_cgroup_idr, id); 4921 } 4922 4923 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4924 { 4925 struct mem_cgroup_per_node *pn; 4926 int tmp = node; 4927 /* 4928 * This routine is called against possible nodes. 4929 * But it's BUG to call kmalloc() against offline node. 4930 * 4931 * TODO: this routine can waste much memory for nodes which will 4932 * never be onlined. 
It's better to use memory hotplug callback 4933 * function. 4934 */ 4935 if (!node_state(node, N_NORMAL_MEMORY)) 4936 tmp = -1; 4937 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4938 if (!pn) 4939 return 1; 4940 4941 pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, 4942 GFP_KERNEL_ACCOUNT); 4943 if (!pn->lruvec_stat_local) { 4944 kfree(pn); 4945 return 1; 4946 } 4947 4948 pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, 4949 GFP_KERNEL_ACCOUNT); 4950 if (!pn->lruvec_stat_cpu) { 4951 free_percpu(pn->lruvec_stat_local); 4952 kfree(pn); 4953 return 1; 4954 } 4955 4956 lruvec_init(&pn->lruvec); 4957 pn->usage_in_excess = 0; 4958 pn->on_tree = false; 4959 pn->memcg = memcg; 4960 4961 memcg->nodeinfo[node] = pn; 4962 return 0; 4963 } 4964 4965 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 4966 { 4967 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 4968 4969 if (!pn) 4970 return; 4971 4972 free_percpu(pn->lruvec_stat_cpu); 4973 free_percpu(pn->lruvec_stat_local); 4974 kfree(pn); 4975 } 4976 4977 static void __mem_cgroup_free(struct mem_cgroup *memcg) 4978 { 4979 int node; 4980 4981 for_each_node(node) 4982 free_mem_cgroup_per_node_info(memcg, node); 4983 free_percpu(memcg->vmstats_percpu); 4984 kfree(memcg); 4985 } 4986 4987 static void mem_cgroup_free(struct mem_cgroup *memcg) 4988 { 4989 int cpu; 4990 4991 memcg_wb_domain_exit(memcg); 4992 /* 4993 * Flush percpu lruvec stats to guarantee the value 4994 * correctness on parent's and all ancestor levels. 4995 */ 4996 for_each_online_cpu(cpu) 4997 memcg_flush_lruvec_page_state(memcg, cpu); 4998 __mem_cgroup_free(memcg); 4999 } 5000 5001 static struct mem_cgroup *mem_cgroup_alloc(void) 5002 { 5003 struct mem_cgroup *memcg; 5004 unsigned int size; 5005 int node; 5006 int __maybe_unused i; 5007 long error = -ENOMEM; 5008 5009 size = sizeof(struct mem_cgroup); 5010 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5011 5012 memcg = kzalloc(size, GFP_KERNEL); 5013 if (!memcg) 5014 return ERR_PTR(error); 5015 5016 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5017 1, MEM_CGROUP_ID_MAX, 5018 GFP_KERNEL); 5019 if (memcg->id.id < 0) { 5020 error = memcg->id.id; 5021 goto fail; 5022 } 5023 5024 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5025 GFP_KERNEL_ACCOUNT); 5026 if (!memcg->vmstats_percpu) 5027 goto fail; 5028 5029 for_each_node(node) 5030 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5031 goto fail; 5032 5033 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5034 goto fail; 5035 5036 INIT_WORK(&memcg->high_work, high_work_func); 5037 INIT_LIST_HEAD(&memcg->oom_notify); 5038 mutex_init(&memcg->thresholds_lock); 5039 spin_lock_init(&memcg->move_lock); 5040 vmpressure_init(&memcg->vmpressure); 5041 INIT_LIST_HEAD(&memcg->event_list); 5042 spin_lock_init(&memcg->event_list_lock); 5043 memcg->socket_pressure = jiffies; 5044 #ifdef CONFIG_MEMCG_KMEM 5045 memcg->kmemcg_id = -1; 5046 INIT_LIST_HEAD(&memcg->objcg_list); 5047 #endif 5048 #ifdef CONFIG_CGROUP_WRITEBACK 5049 INIT_LIST_HEAD(&memcg->cgwb_list); 5050 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5051 memcg->cgwb_frn[i].done = 5052 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5053 #endif 5054 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5055 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5056 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5057 memcg->deferred_split_queue.split_queue_len = 0; 5058 #endif 5059 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5060 return memcg; 
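/*
* Error unwind: release the reserved ID (if one was allocated) and free
* the per-node info and per-cpu counters set up before the failure.
*/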
5061 fail: 5062 mem_cgroup_id_remove(memcg); 5063 __mem_cgroup_free(memcg); 5064 return ERR_PTR(error); 5065 } 5066 5067 static struct cgroup_subsys_state * __ref 5068 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5069 { 5070 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5071 struct mem_cgroup *memcg, *old_memcg; 5072 long error = -ENOMEM; 5073 5074 old_memcg = set_active_memcg(parent); 5075 memcg = mem_cgroup_alloc(); 5076 set_active_memcg(old_memcg); 5077 if (IS_ERR(memcg)) 5078 return ERR_CAST(memcg); 5079 5080 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5081 memcg->soft_limit = PAGE_COUNTER_MAX; 5082 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5083 if (parent) { 5084 memcg->swappiness = mem_cgroup_swappiness(parent); 5085 memcg->oom_kill_disable = parent->oom_kill_disable; 5086 5087 page_counter_init(&memcg->memory, &parent->memory); 5088 page_counter_init(&memcg->swap, &parent->swap); 5089 page_counter_init(&memcg->kmem, &parent->kmem); 5090 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5091 } else { 5092 page_counter_init(&memcg->memory, NULL); 5093 page_counter_init(&memcg->swap, NULL); 5094 page_counter_init(&memcg->kmem, NULL); 5095 page_counter_init(&memcg->tcpmem, NULL); 5096 5097 root_mem_cgroup = memcg; 5098 return &memcg->css; 5099 } 5100 5101 /* The following stuff does not apply to the root */ 5102 error = memcg_online_kmem(memcg); 5103 if (error) 5104 goto fail; 5105 5106 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5107 static_branch_inc(&memcg_sockets_enabled_key); 5108 5109 return &memcg->css; 5110 fail: 5111 mem_cgroup_id_remove(memcg); 5112 mem_cgroup_free(memcg); 5113 return ERR_PTR(error); 5114 } 5115 5116 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5117 { 5118 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5119 5120 /* 5121 * A memcg must be visible for expand_shrinker_info() 5122 * by the time the maps are allocated. So, we allocate maps 5123 * here, when for_each_mem_cgroup() can't skip it. 5124 */ 5125 if (alloc_shrinker_info(memcg)) { 5126 mem_cgroup_id_remove(memcg); 5127 return -ENOMEM; 5128 } 5129 5130 /* Online state pins memcg ID, memcg ID pins CSS */ 5131 refcount_set(&memcg->id.ref, 1); 5132 css_get(css); 5133 return 0; 5134 } 5135 5136 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5137 { 5138 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5139 struct mem_cgroup_event *event, *tmp; 5140 5141 /* 5142 * Unregister events and notify userspace. 5143 * Notify userspace about cgroup removing only after rmdir of cgroup 5144 * directory to avoid race between userspace and kernelspace. 
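* Each registered event pins the css; memcg_event_remove(), run from the
* workqueue, calls the event's unregister callback and drops that reference.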
5145 */ 5146 spin_lock(&memcg->event_list_lock); 5147 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5148 list_del_init(&event->list); 5149 schedule_work(&event->remove); 5150 } 5151 spin_unlock(&memcg->event_list_lock); 5152 5153 page_counter_set_min(&memcg->memory, 0); 5154 page_counter_set_low(&memcg->memory, 0); 5155 5156 memcg_offline_kmem(memcg); 5157 reparent_shrinker_deferred(memcg); 5158 wb_memcg_offline(memcg); 5159 5160 drain_all_stock(memcg); 5161 5162 mem_cgroup_id_put(memcg); 5163 } 5164 5165 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5166 { 5167 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5168 5169 invalidate_reclaim_iterators(memcg); 5170 } 5171 5172 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5173 { 5174 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5175 int __maybe_unused i; 5176 5177 #ifdef CONFIG_CGROUP_WRITEBACK 5178 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5179 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5180 #endif 5181 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5182 static_branch_dec(&memcg_sockets_enabled_key); 5183 5184 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5185 static_branch_dec(&memcg_sockets_enabled_key); 5186 5187 vmpressure_cleanup(&memcg->vmpressure); 5188 cancel_work_sync(&memcg->high_work); 5189 mem_cgroup_remove_from_trees(memcg); 5190 free_shrinker_info(memcg); 5191 memcg_free_kmem(memcg); 5192 mem_cgroup_free(memcg); 5193 } 5194 5195 /** 5196 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5197 * @css: the target css 5198 * 5199 * Reset the states of the mem_cgroup associated with @css. This is 5200 * invoked when the userland requests disabling on the default hierarchy 5201 * but the memcg is pinned through dependency. The memcg should stop 5202 * applying policies and should revert to the vanilla state as it may be 5203 * made visible again. 5204 * 5205 * The current implementation only resets the essential configurations. 5206 * This needs to be expanded to cover all the visible parts. 5207 */ 5208 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5209 { 5210 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5211 5212 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5213 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5214 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5215 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5216 page_counter_set_min(&memcg->memory, 0); 5217 page_counter_set_low(&memcg->memory, 0); 5218 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5219 memcg->soft_limit = PAGE_COUNTER_MAX; 5220 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5221 memcg_wb_domain_size_changed(memcg); 5222 } 5223 5224 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5225 { 5226 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5227 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5228 struct memcg_vmstats_percpu *statc; 5229 long delta, v; 5230 int i; 5231 5232 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5233 5234 for (i = 0; i < MEMCG_NR_STAT; i++) { 5235 /* 5236 * Collect the aggregated propagation counts of groups 5237 * below us. We're in a per-cpu loop here and this is 5238 * a global counter, so the first cycle will get them. 
5239 */ 5240 delta = memcg->vmstats.state_pending[i]; 5241 if (delta) 5242 memcg->vmstats.state_pending[i] = 0; 5243 5244 /* Add CPU changes on this level since the last flush */ 5245 v = READ_ONCE(statc->state[i]); 5246 if (v != statc->state_prev[i]) { 5247 delta += v - statc->state_prev[i]; 5248 statc->state_prev[i] = v; 5249 } 5250 5251 if (!delta) 5252 continue; 5253 5254 /* Aggregate counts on this level and propagate upwards */ 5255 memcg->vmstats.state[i] += delta; 5256 if (parent) 5257 parent->vmstats.state_pending[i] += delta; 5258 } 5259 5260 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 5261 delta = memcg->vmstats.events_pending[i]; 5262 if (delta) 5263 memcg->vmstats.events_pending[i] = 0; 5264 5265 v = READ_ONCE(statc->events[i]); 5266 if (v != statc->events_prev[i]) { 5267 delta += v - statc->events_prev[i]; 5268 statc->events_prev[i] = v; 5269 } 5270 5271 if (!delta) 5272 continue; 5273 5274 memcg->vmstats.events[i] += delta; 5275 if (parent) 5276 parent->vmstats.events_pending[i] += delta; 5277 } 5278 } 5279 5280 #ifdef CONFIG_MMU 5281 /* Handlers for move charge at task migration. */ 5282 static int mem_cgroup_do_precharge(unsigned long count) 5283 { 5284 int ret; 5285 5286 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5287 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5288 if (!ret) { 5289 mc.precharge += count; 5290 return ret; 5291 } 5292 5293 /* Try charges one by one with reclaim, but do not retry */ 5294 while (count--) { 5295 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5296 if (ret) 5297 return ret; 5298 mc.precharge++; 5299 cond_resched(); 5300 } 5301 return 0; 5302 } 5303 5304 union mc_target { 5305 struct page *page; 5306 swp_entry_t ent; 5307 }; 5308 5309 enum mc_target_type { 5310 MC_TARGET_NONE = 0, 5311 MC_TARGET_PAGE, 5312 MC_TARGET_SWAP, 5313 MC_TARGET_DEVICE, 5314 }; 5315 5316 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5317 unsigned long addr, pte_t ptent) 5318 { 5319 struct page *page = vm_normal_page(vma, addr, ptent); 5320 5321 if (!page || !page_mapped(page)) 5322 return NULL; 5323 if (PageAnon(page)) { 5324 if (!(mc.flags & MOVE_ANON)) 5325 return NULL; 5326 } else { 5327 if (!(mc.flags & MOVE_FILE)) 5328 return NULL; 5329 } 5330 if (!get_page_unless_zero(page)) 5331 return NULL; 5332 5333 return page; 5334 } 5335 5336 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5337 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5338 pte_t ptent, swp_entry_t *entry) 5339 { 5340 struct page *page = NULL; 5341 swp_entry_t ent = pte_to_swp_entry(ptent); 5342 5343 if (!(mc.flags & MOVE_ANON)) 5344 return NULL; 5345 5346 /* 5347 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 5348 * a device and because they are not accessible by CPU they are store 5349 * as special swap entry in the CPU page table. 5350 */ 5351 if (is_device_private_entry(ent)) { 5352 page = device_private_entry_to_page(ent); 5353 /* 5354 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 5355 * a refcount of 1 when free (unlike normal page) 5356 */ 5357 if (!page_ref_add_unless(page, 1, 1)) 5358 return NULL; 5359 return page; 5360 } 5361 5362 if (non_swap_entry(ent)) 5363 return NULL; 5364 5365 /* 5366 * Because lookup_swap_cache() updates some statistics counter, 5367 * we call find_get_page() with swapper_space directly. 
5368 */ 5369 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5370 entry->val = ent.val; 5371 5372 return page; 5373 } 5374 #else 5375 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5376 pte_t ptent, swp_entry_t *entry) 5377 { 5378 return NULL; 5379 } 5380 #endif 5381 5382 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5383 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5384 { 5385 if (!vma->vm_file) /* anonymous vma */ 5386 return NULL; 5387 if (!(mc.flags & MOVE_FILE)) 5388 return NULL; 5389 5390 /* page is moved even if it's not RSS of this task(page-faulted). */ 5391 /* shmem/tmpfs may report page out on swap: account for that too. */ 5392 return find_get_incore_page(vma->vm_file->f_mapping, 5393 linear_page_index(vma, addr)); 5394 } 5395 5396 /** 5397 * mem_cgroup_move_account - move account of the page 5398 * @page: the page 5399 * @compound: charge the page as compound or small page 5400 * @from: mem_cgroup which the page is moved from. 5401 * @to: mem_cgroup which the page is moved to. @from != @to. 5402 * 5403 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 5404 * 5405 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5406 * from old cgroup. 5407 */ 5408 static int mem_cgroup_move_account(struct page *page, 5409 bool compound, 5410 struct mem_cgroup *from, 5411 struct mem_cgroup *to) 5412 { 5413 struct lruvec *from_vec, *to_vec; 5414 struct pglist_data *pgdat; 5415 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; 5416 int ret; 5417 5418 VM_BUG_ON(from == to); 5419 VM_BUG_ON_PAGE(PageLRU(page), page); 5420 VM_BUG_ON(compound && !PageTransHuge(page)); 5421 5422 /* 5423 * Prevent mem_cgroup_migrate() from looking at 5424 * page's memory cgroup of its source page while we change it. 5425 */ 5426 ret = -EBUSY; 5427 if (!trylock_page(page)) 5428 goto out; 5429 5430 ret = -EINVAL; 5431 if (page_memcg(page) != from) 5432 goto out_unlock; 5433 5434 pgdat = page_pgdat(page); 5435 from_vec = mem_cgroup_lruvec(from, pgdat); 5436 to_vec = mem_cgroup_lruvec(to, pgdat); 5437 5438 lock_page_memcg(page); 5439 5440 if (PageAnon(page)) { 5441 if (page_mapped(page)) { 5442 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5443 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5444 if (PageTransHuge(page)) { 5445 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5446 -nr_pages); 5447 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5448 nr_pages); 5449 } 5450 } 5451 } else { 5452 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5453 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5454 5455 if (PageSwapBacked(page)) { 5456 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5457 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5458 } 5459 5460 if (page_mapped(page)) { 5461 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5462 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5463 } 5464 5465 if (PageDirty(page)) { 5466 struct address_space *mapping = page_mapping(page); 5467 5468 if (mapping_can_writeback(mapping)) { 5469 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5470 -nr_pages); 5471 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5472 nr_pages); 5473 } 5474 } 5475 } 5476 5477 if (PageWriteback(page)) { 5478 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5479 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5480 } 5481 5482 /* 5483 * All state has been migrated, let's switch to the new memcg. 
5484 * 5485 * It is safe to change page's memcg here because the page 5486 * is referenced, charged, isolated, and locked: we can't race 5487 * with (un)charging, migration, LRU putback, or anything else 5488 * that would rely on a stable page's memory cgroup. 5489 * 5490 * Note that lock_page_memcg is a memcg lock, not a page lock, 5491 * to save space. As soon as we switch page's memory cgroup to a 5492 * new memcg that isn't locked, the above state can change 5493 * concurrently again. Make sure we're truly done with it. 5494 */ 5495 smp_mb(); 5496 5497 css_get(&to->css); 5498 css_put(&from->css); 5499 5500 page->memcg_data = (unsigned long)to; 5501 5502 __unlock_page_memcg(from); 5503 5504 ret = 0; 5505 5506 local_irq_disable(); 5507 mem_cgroup_charge_statistics(to, page, nr_pages); 5508 memcg_check_events(to, page); 5509 mem_cgroup_charge_statistics(from, page, -nr_pages); 5510 memcg_check_events(from, page); 5511 local_irq_enable(); 5512 out_unlock: 5513 unlock_page(page); 5514 out: 5515 return ret; 5516 } 5517 5518 /** 5519 * get_mctgt_type - get target type of moving charge 5520 * @vma: the vma the pte to be checked belongs 5521 * @addr: the address corresponding to the pte to be checked 5522 * @ptent: the pte to be checked 5523 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5524 * 5525 * Returns 5526 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5527 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5528 * move charge. if @target is not NULL, the page is stored in target->page 5529 * with extra refcnt got(Callers should handle it). 5530 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5531 * target for charge migration. if @target is not NULL, the entry is stored 5532 * in target->ent. 5533 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE 5534 * (so ZONE_DEVICE page and thus not on the lru). 5535 * For now we such page is charge like a regular page would be as for all 5536 * intent and purposes it is just special memory taking the place of a 5537 * regular page. 5538 * 5539 * See Documentations/vm/hmm.txt and include/linux/hmm.h 5540 * 5541 * Called with pte lock held. 5542 */ 5543 5544 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5545 unsigned long addr, pte_t ptent, union mc_target *target) 5546 { 5547 struct page *page = NULL; 5548 enum mc_target_type ret = MC_TARGET_NONE; 5549 swp_entry_t ent = { .val = 0 }; 5550 5551 if (pte_present(ptent)) 5552 page = mc_handle_present_pte(vma, addr, ptent); 5553 else if (is_swap_pte(ptent)) 5554 page = mc_handle_swap_pte(vma, ptent, &ent); 5555 else if (pte_none(ptent)) 5556 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5557 5558 if (!page && !ent.val) 5559 return ret; 5560 if (page) { 5561 /* 5562 * Do only loose check w/o serialization. 5563 * mem_cgroup_move_account() checks the page is valid or 5564 * not under LRU exclusion. 5565 */ 5566 if (page_memcg(page) == mc.from) { 5567 ret = MC_TARGET_PAGE; 5568 if (is_device_private_page(page)) 5569 ret = MC_TARGET_DEVICE; 5570 if (target) 5571 target->page = page; 5572 } 5573 if (!ret || !target) 5574 put_page(page); 5575 } 5576 /* 5577 * There is a swap entry and a page doesn't exist or isn't charged. 5578 * But we cannot move a tail-page in a THP. 
5579 */ 5580 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5581 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5582 ret = MC_TARGET_SWAP; 5583 if (target) 5584 target->ent = ent; 5585 } 5586 return ret; 5587 } 5588 5589 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5590 /* 5591 * We don't consider PMD mapped swapping or file mapped pages because THP does 5592 * not support them for now. 5593 * Caller should make sure that pmd_trans_huge(pmd) is true. 5594 */ 5595 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5596 unsigned long addr, pmd_t pmd, union mc_target *target) 5597 { 5598 struct page *page = NULL; 5599 enum mc_target_type ret = MC_TARGET_NONE; 5600 5601 if (unlikely(is_swap_pmd(pmd))) { 5602 VM_BUG_ON(thp_migration_supported() && 5603 !is_pmd_migration_entry(pmd)); 5604 return ret; 5605 } 5606 page = pmd_page(pmd); 5607 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5608 if (!(mc.flags & MOVE_ANON)) 5609 return ret; 5610 if (page_memcg(page) == mc.from) { 5611 ret = MC_TARGET_PAGE; 5612 if (target) { 5613 get_page(page); 5614 target->page = page; 5615 } 5616 } 5617 return ret; 5618 } 5619 #else 5620 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5621 unsigned long addr, pmd_t pmd, union mc_target *target) 5622 { 5623 return MC_TARGET_NONE; 5624 } 5625 #endif 5626 5627 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5628 unsigned long addr, unsigned long end, 5629 struct mm_walk *walk) 5630 { 5631 struct vm_area_struct *vma = walk->vma; 5632 pte_t *pte; 5633 spinlock_t *ptl; 5634 5635 ptl = pmd_trans_huge_lock(pmd, vma); 5636 if (ptl) { 5637 /* 5638 * Note their can not be MC_TARGET_DEVICE for now as we do not 5639 * support transparent huge page with MEMORY_DEVICE_PRIVATE but 5640 * this might change. 5641 */ 5642 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5643 mc.precharge += HPAGE_PMD_NR; 5644 spin_unlock(ptl); 5645 return 0; 5646 } 5647 5648 if (pmd_trans_unstable(pmd)) 5649 return 0; 5650 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5651 for (; addr != end; pte++, addr += PAGE_SIZE) 5652 if (get_mctgt_type(vma, addr, *pte, NULL)) 5653 mc.precharge++; /* increment precharge temporarily */ 5654 pte_unmap_unlock(pte - 1, ptl); 5655 cond_resched(); 5656 5657 return 0; 5658 } 5659 5660 static const struct mm_walk_ops precharge_walk_ops = { 5661 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5662 }; 5663 5664 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5665 { 5666 unsigned long precharge; 5667 5668 mmap_read_lock(mm); 5669 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 5670 mmap_read_unlock(mm); 5671 5672 precharge = mc.precharge; 5673 mc.precharge = 0; 5674 5675 return precharge; 5676 } 5677 5678 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5679 { 5680 unsigned long precharge = mem_cgroup_count_precharge(mm); 5681 5682 VM_BUG_ON(mc.moving_task); 5683 mc.moving_task = current; 5684 return mem_cgroup_do_precharge(precharge); 5685 } 5686 5687 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/ 5688 static void __mem_cgroup_clear_mc(void) 5689 { 5690 struct mem_cgroup *from = mc.from; 5691 struct mem_cgroup *to = mc.to; 5692 5693 /* we must uncharge all the leftover precharges from mc.to */ 5694 if (mc.precharge) { 5695 cancel_charge(mc.to, mc.precharge); 5696 mc.precharge = 0; 5697 } 5698 /* 5699 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5700 * we must uncharge here. 5701 */ 5702 if (mc.moved_charge) { 5703 cancel_charge(mc.from, mc.moved_charge); 5704 mc.moved_charge = 0; 5705 } 5706 /* we must fixup refcnts and charges */ 5707 if (mc.moved_swap) { 5708 /* uncharge swap account from the old cgroup */ 5709 if (!mem_cgroup_is_root(mc.from)) 5710 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5711 5712 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 5713 5714 /* 5715 * we charged both to->memory and to->memsw, so we 5716 * should uncharge to->memory. 5717 */ 5718 if (!mem_cgroup_is_root(mc.to)) 5719 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5720 5721 mc.moved_swap = 0; 5722 } 5723 memcg_oom_recover(from); 5724 memcg_oom_recover(to); 5725 wake_up_all(&mc.waitq); 5726 } 5727 5728 static void mem_cgroup_clear_mc(void) 5729 { 5730 struct mm_struct *mm = mc.mm; 5731 5732 /* 5733 * we must clear moving_task before waking up waiters at the end of 5734 * task migration. 5735 */ 5736 mc.moving_task = NULL; 5737 __mem_cgroup_clear_mc(); 5738 spin_lock(&mc.lock); 5739 mc.from = NULL; 5740 mc.to = NULL; 5741 mc.mm = NULL; 5742 spin_unlock(&mc.lock); 5743 5744 mmput(mm); 5745 } 5746 5747 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5748 { 5749 struct cgroup_subsys_state *css; 5750 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 5751 struct mem_cgroup *from; 5752 struct task_struct *leader, *p; 5753 struct mm_struct *mm; 5754 unsigned long move_flags; 5755 int ret = 0; 5756 5757 /* charge immigration isn't supported on the default hierarchy */ 5758 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5759 return 0; 5760 5761 /* 5762 * Multi-process migrations only happen on the default hierarchy 5763 * where charge immigration is not used. Perform charge 5764 * immigration if @tset contains a leader and whine if there are 5765 * multiple. 5766 */ 5767 p = NULL; 5768 cgroup_taskset_for_each_leader(leader, css, tset) { 5769 WARN_ON_ONCE(p); 5770 p = leader; 5771 memcg = mem_cgroup_from_css(css); 5772 } 5773 if (!p) 5774 return 0; 5775 5776 /* 5777 * We are now committed to this value whatever it is. Changes in this 5778 * tunable will only affect upcoming migrations, not the current one. 5779 * So we need to save it, and keep it going. 
5780 */ 5781 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 5782 if (!move_flags) 5783 return 0; 5784 5785 from = mem_cgroup_from_task(p); 5786 5787 VM_BUG_ON(from == memcg); 5788 5789 mm = get_task_mm(p); 5790 if (!mm) 5791 return 0; 5792 /* We move charges only when we move a owner of the mm */ 5793 if (mm->owner == p) { 5794 VM_BUG_ON(mc.from); 5795 VM_BUG_ON(mc.to); 5796 VM_BUG_ON(mc.precharge); 5797 VM_BUG_ON(mc.moved_charge); 5798 VM_BUG_ON(mc.moved_swap); 5799 5800 spin_lock(&mc.lock); 5801 mc.mm = mm; 5802 mc.from = from; 5803 mc.to = memcg; 5804 mc.flags = move_flags; 5805 spin_unlock(&mc.lock); 5806 /* We set mc.moving_task later */ 5807 5808 ret = mem_cgroup_precharge_mc(mm); 5809 if (ret) 5810 mem_cgroup_clear_mc(); 5811 } else { 5812 mmput(mm); 5813 } 5814 return ret; 5815 } 5816 5817 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5818 { 5819 if (mc.to) 5820 mem_cgroup_clear_mc(); 5821 } 5822 5823 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5824 unsigned long addr, unsigned long end, 5825 struct mm_walk *walk) 5826 { 5827 int ret = 0; 5828 struct vm_area_struct *vma = walk->vma; 5829 pte_t *pte; 5830 spinlock_t *ptl; 5831 enum mc_target_type target_type; 5832 union mc_target target; 5833 struct page *page; 5834 5835 ptl = pmd_trans_huge_lock(pmd, vma); 5836 if (ptl) { 5837 if (mc.precharge < HPAGE_PMD_NR) { 5838 spin_unlock(ptl); 5839 return 0; 5840 } 5841 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5842 if (target_type == MC_TARGET_PAGE) { 5843 page = target.page; 5844 if (!isolate_lru_page(page)) { 5845 if (!mem_cgroup_move_account(page, true, 5846 mc.from, mc.to)) { 5847 mc.precharge -= HPAGE_PMD_NR; 5848 mc.moved_charge += HPAGE_PMD_NR; 5849 } 5850 putback_lru_page(page); 5851 } 5852 put_page(page); 5853 } else if (target_type == MC_TARGET_DEVICE) { 5854 page = target.page; 5855 if (!mem_cgroup_move_account(page, true, 5856 mc.from, mc.to)) { 5857 mc.precharge -= HPAGE_PMD_NR; 5858 mc.moved_charge += HPAGE_PMD_NR; 5859 } 5860 put_page(page); 5861 } 5862 spin_unlock(ptl); 5863 return 0; 5864 } 5865 5866 if (pmd_trans_unstable(pmd)) 5867 return 0; 5868 retry: 5869 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5870 for (; addr != end; addr += PAGE_SIZE) { 5871 pte_t ptent = *(pte++); 5872 bool device = false; 5873 swp_entry_t ent; 5874 5875 if (!mc.precharge) 5876 break; 5877 5878 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5879 case MC_TARGET_DEVICE: 5880 device = true; 5881 fallthrough; 5882 case MC_TARGET_PAGE: 5883 page = target.page; 5884 /* 5885 * We can have a part of the split pmd here. Moving it 5886 * can be done but it would be too convoluted so simply 5887 * ignore such a partial THP and keep it in original 5888 * memcg. There should be somebody mapping the head. 5889 */ 5890 if (PageTransCompound(page)) 5891 goto put; 5892 if (!device && isolate_lru_page(page)) 5893 goto put; 5894 if (!mem_cgroup_move_account(page, false, 5895 mc.from, mc.to)) { 5896 mc.precharge--; 5897 /* we uncharge from mc.from later. */ 5898 mc.moved_charge++; 5899 } 5900 if (!device) 5901 putback_lru_page(page); 5902 put: /* get_mctgt_type() gets the page */ 5903 put_page(page); 5904 break; 5905 case MC_TARGET_SWAP: 5906 ent = target.ent; 5907 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5908 mc.precharge--; 5909 mem_cgroup_id_get_many(mc.to, 1); 5910 /* we fixup other refcnts and charges later. 
*/ 5911 mc.moved_swap++; 5912 } 5913 break; 5914 default: 5915 break; 5916 } 5917 } 5918 pte_unmap_unlock(pte - 1, ptl); 5919 cond_resched(); 5920 5921 if (addr != end) { 5922 /* 5923 * We have consumed all precharges we got in can_attach(). 5924 * We try charge one by one, but don't do any additional 5925 * charges to mc.to if we have failed in charge once in attach() 5926 * phase. 5927 */ 5928 ret = mem_cgroup_do_precharge(1); 5929 if (!ret) 5930 goto retry; 5931 } 5932 5933 return ret; 5934 } 5935 5936 static const struct mm_walk_ops charge_walk_ops = { 5937 .pmd_entry = mem_cgroup_move_charge_pte_range, 5938 }; 5939 5940 static void mem_cgroup_move_charge(void) 5941 { 5942 lru_add_drain_all(); 5943 /* 5944 * Signal lock_page_memcg() to take the memcg's move_lock 5945 * while we're moving its pages to another memcg. Then wait 5946 * for already started RCU-only updates to finish. 5947 */ 5948 atomic_inc(&mc.from->moving_account); 5949 synchronize_rcu(); 5950 retry: 5951 if (unlikely(!mmap_read_trylock(mc.mm))) { 5952 /* 5953 * Someone who are holding the mmap_lock might be waiting in 5954 * waitq. So we cancel all extra charges, wake up all waiters, 5955 * and retry. Because we cancel precharges, we might not be able 5956 * to move enough charges, but moving charge is a best-effort 5957 * feature anyway, so it wouldn't be a big problem. 5958 */ 5959 __mem_cgroup_clear_mc(); 5960 cond_resched(); 5961 goto retry; 5962 } 5963 /* 5964 * When we have consumed all precharges and failed in doing 5965 * additional charge, the page walk just aborts. 5966 */ 5967 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 5968 NULL); 5969 5970 mmap_read_unlock(mc.mm); 5971 atomic_dec(&mc.from->moving_account); 5972 } 5973 5974 static void mem_cgroup_move_task(void) 5975 { 5976 if (mc.to) { 5977 mem_cgroup_move_charge(); 5978 mem_cgroup_clear_mc(); 5979 } 5980 } 5981 #else /* !CONFIG_MMU */ 5982 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5983 { 5984 return 0; 5985 } 5986 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5987 { 5988 } 5989 static void mem_cgroup_move_task(void) 5990 { 5991 } 5992 #endif 5993 5994 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 5995 { 5996 if (value == PAGE_COUNTER_MAX) 5997 seq_puts(m, "max\n"); 5998 else 5999 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6000 6001 return 0; 6002 } 6003 6004 static u64 memory_current_read(struct cgroup_subsys_state *css, 6005 struct cftype *cft) 6006 { 6007 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6008 6009 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6010 } 6011 6012 static int memory_min_show(struct seq_file *m, void *v) 6013 { 6014 return seq_puts_memcg_tunable(m, 6015 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6016 } 6017 6018 static ssize_t memory_min_write(struct kernfs_open_file *of, 6019 char *buf, size_t nbytes, loff_t off) 6020 { 6021 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6022 unsigned long min; 6023 int err; 6024 6025 buf = strstrip(buf); 6026 err = page_counter_memparse(buf, "max", &min); 6027 if (err) 6028 return err; 6029 6030 page_counter_set_min(&memcg->memory, min); 6031 6032 return nbytes; 6033 } 6034 6035 static int memory_low_show(struct seq_file *m, void *v) 6036 { 6037 return seq_puts_memcg_tunable(m, 6038 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6039 } 6040 6041 static ssize_t memory_low_write(struct kernfs_open_file *of, 6042 char *buf, size_t nbytes, loff_t off) 6043 { 6044 
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6045 unsigned long low; 6046 int err; 6047 6048 buf = strstrip(buf); 6049 err = page_counter_memparse(buf, "max", &low); 6050 if (err) 6051 return err; 6052 6053 page_counter_set_low(&memcg->memory, low); 6054 6055 return nbytes; 6056 } 6057 6058 static int memory_high_show(struct seq_file *m, void *v) 6059 { 6060 return seq_puts_memcg_tunable(m, 6061 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6062 } 6063 6064 static ssize_t memory_high_write(struct kernfs_open_file *of, 6065 char *buf, size_t nbytes, loff_t off) 6066 { 6067 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6068 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6069 bool drained = false; 6070 unsigned long high; 6071 int err; 6072 6073 buf = strstrip(buf); 6074 err = page_counter_memparse(buf, "max", &high); 6075 if (err) 6076 return err; 6077 6078 page_counter_set_high(&memcg->memory, high); 6079 6080 for (;;) { 6081 unsigned long nr_pages = page_counter_read(&memcg->memory); 6082 unsigned long reclaimed; 6083 6084 if (nr_pages <= high) 6085 break; 6086 6087 if (signal_pending(current)) 6088 break; 6089 6090 if (!drained) { 6091 drain_all_stock(memcg); 6092 drained = true; 6093 continue; 6094 } 6095 6096 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6097 GFP_KERNEL, true); 6098 6099 if (!reclaimed && !nr_retries--) 6100 break; 6101 } 6102 6103 memcg_wb_domain_size_changed(memcg); 6104 return nbytes; 6105 } 6106 6107 static int memory_max_show(struct seq_file *m, void *v) 6108 { 6109 return seq_puts_memcg_tunable(m, 6110 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6111 } 6112 6113 static ssize_t memory_max_write(struct kernfs_open_file *of, 6114 char *buf, size_t nbytes, loff_t off) 6115 { 6116 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6117 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6118 bool drained = false; 6119 unsigned long max; 6120 int err; 6121 6122 buf = strstrip(buf); 6123 err = page_counter_memparse(buf, "max", &max); 6124 if (err) 6125 return err; 6126 6127 xchg(&memcg->memory.max, max); 6128 6129 for (;;) { 6130 unsigned long nr_pages = page_counter_read(&memcg->memory); 6131 6132 if (nr_pages <= max) 6133 break; 6134 6135 if (signal_pending(current)) 6136 break; 6137 6138 if (!drained) { 6139 drain_all_stock(memcg); 6140 drained = true; 6141 continue; 6142 } 6143 6144 if (nr_reclaims) { 6145 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6146 GFP_KERNEL, true)) 6147 nr_reclaims--; 6148 continue; 6149 } 6150 6151 memcg_memory_event(memcg, MEMCG_OOM); 6152 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6153 break; 6154 } 6155 6156 memcg_wb_domain_size_changed(memcg); 6157 return nbytes; 6158 } 6159 6160 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6161 { 6162 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6163 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6164 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6165 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6166 seq_printf(m, "oom_kill %lu\n", 6167 atomic_long_read(&events[MEMCG_OOM_KILL])); 6168 } 6169 6170 static int memory_events_show(struct seq_file *m, void *v) 6171 { 6172 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6173 6174 __memory_events_show(m, memcg->memory_events); 6175 return 0; 6176 } 6177 6178 static int memory_events_local_show(struct seq_file *m, void *v) 6179 { 6180 struct mem_cgroup *memcg = 
mem_cgroup_from_seq(m); 6181 6182 __memory_events_show(m, memcg->memory_events_local); 6183 return 0; 6184 } 6185 6186 static int memory_stat_show(struct seq_file *m, void *v) 6187 { 6188 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6189 char *buf; 6190 6191 buf = memory_stat_format(memcg); 6192 if (!buf) 6193 return -ENOMEM; 6194 seq_puts(m, buf); 6195 kfree(buf); 6196 return 0; 6197 } 6198 6199 #ifdef CONFIG_NUMA 6200 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6201 int item) 6202 { 6203 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); 6204 } 6205 6206 static int memory_numa_stat_show(struct seq_file *m, void *v) 6207 { 6208 int i; 6209 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6210 6211 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6212 int nid; 6213 6214 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6215 continue; 6216 6217 seq_printf(m, "%s", memory_stats[i].name); 6218 for_each_node_state(nid, N_MEMORY) { 6219 u64 size; 6220 struct lruvec *lruvec; 6221 6222 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6223 size = lruvec_page_state_output(lruvec, 6224 memory_stats[i].idx); 6225 seq_printf(m, " N%d=%llu", nid, size); 6226 } 6227 seq_putc(m, '\n'); 6228 } 6229 6230 return 0; 6231 } 6232 #endif 6233 6234 static int memory_oom_group_show(struct seq_file *m, void *v) 6235 { 6236 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6237 6238 seq_printf(m, "%d\n", memcg->oom_group); 6239 6240 return 0; 6241 } 6242 6243 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6244 char *buf, size_t nbytes, loff_t off) 6245 { 6246 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6247 int ret, oom_group; 6248 6249 buf = strstrip(buf); 6250 if (!buf) 6251 return -EINVAL; 6252 6253 ret = kstrtoint(buf, 0, &oom_group); 6254 if (ret) 6255 return ret; 6256 6257 if (oom_group != 0 && oom_group != 1) 6258 return -EINVAL; 6259 6260 memcg->oom_group = oom_group; 6261 6262 return nbytes; 6263 } 6264 6265 static struct cftype memory_files[] = { 6266 { 6267 .name = "current", 6268 .flags = CFTYPE_NOT_ON_ROOT, 6269 .read_u64 = memory_current_read, 6270 }, 6271 { 6272 .name = "min", 6273 .flags = CFTYPE_NOT_ON_ROOT, 6274 .seq_show = memory_min_show, 6275 .write = memory_min_write, 6276 }, 6277 { 6278 .name = "low", 6279 .flags = CFTYPE_NOT_ON_ROOT, 6280 .seq_show = memory_low_show, 6281 .write = memory_low_write, 6282 }, 6283 { 6284 .name = "high", 6285 .flags = CFTYPE_NOT_ON_ROOT, 6286 .seq_show = memory_high_show, 6287 .write = memory_high_write, 6288 }, 6289 { 6290 .name = "max", 6291 .flags = CFTYPE_NOT_ON_ROOT, 6292 .seq_show = memory_max_show, 6293 .write = memory_max_write, 6294 }, 6295 { 6296 .name = "events", 6297 .flags = CFTYPE_NOT_ON_ROOT, 6298 .file_offset = offsetof(struct mem_cgroup, events_file), 6299 .seq_show = memory_events_show, 6300 }, 6301 { 6302 .name = "events.local", 6303 .flags = CFTYPE_NOT_ON_ROOT, 6304 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6305 .seq_show = memory_events_local_show, 6306 }, 6307 { 6308 .name = "stat", 6309 .seq_show = memory_stat_show, 6310 }, 6311 #ifdef CONFIG_NUMA 6312 { 6313 .name = "numa_stat", 6314 .seq_show = memory_numa_stat_show, 6315 }, 6316 #endif 6317 { 6318 .name = "oom.group", 6319 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6320 .seq_show = memory_oom_group_show, 6321 .write = memory_oom_group_write, 6322 }, 6323 { } /* terminate */ 6324 }; 6325 6326 struct cgroup_subsys memory_cgrp_subsys = { 6327 .css_alloc = 
mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.css_rstat_flush = mem_cgroup_css_rstat_flush,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/*
 * This function calculates an individual cgroup's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one cgroup claims much more protection than the memory it
 *    uses, the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A cgroup's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual cgroup's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each cgroup's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected)
{
	unsigned long protected;
	unsigned long ep;

	protected = min(usage, setting);
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/*
	 * Ok, utilized protection of all children is within what the
	 * parent affords them, so we know whatever this child claims
	 * and utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	ep = protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * We're using unprotected memory for the weight so that if
	 * some cgroups DO claim explicit protection, we don't protect
	 * the same bytes twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
	 */
	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
		return ep;
	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = parent_effective - siblings_protected;
		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;

		ep += unclaimed;
	}

	return ep;
}

/**
 * mem_cgroup_calculate_protection - calculate a cgroup's effective memory protection
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 * of a top-down tree iteration, not for isolated queries.
 */
void mem_cgroup_calculate_protection(struct mem_cgroup *root,
				     struct mem_cgroup *memcg)
{
	unsigned long usage, parent_usage;
	struct mem_cgroup *parent;

	if (mem_cgroup_disabled())
		return;

	if (!root)
		root = root_mem_cgroup;

	/*
	 * Effective values of the reclaim targets are ignored so they
	 * can be stale. Have a look at mem_cgroup_protection for more
	 * details.
	 * TODO: calculation should be more robust so that we do not need
	 * that special casing.
6484 */ 6485 if (memcg == root) 6486 return; 6487 6488 usage = page_counter_read(&memcg->memory); 6489 if (!usage) 6490 return; 6491 6492 parent = parent_mem_cgroup(memcg); 6493 /* No parent means a non-hierarchical mode on v1 memcg */ 6494 if (!parent) 6495 return; 6496 6497 if (parent == root) { 6498 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6499 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6500 return; 6501 } 6502 6503 parent_usage = page_counter_read(&parent->memory); 6504 6505 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6506 READ_ONCE(memcg->memory.min), 6507 READ_ONCE(parent->memory.emin), 6508 atomic_long_read(&parent->memory.children_min_usage))); 6509 6510 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6511 READ_ONCE(memcg->memory.low), 6512 READ_ONCE(parent->memory.elow), 6513 atomic_long_read(&parent->memory.children_low_usage))); 6514 } 6515 6516 static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, 6517 gfp_t gfp) 6518 { 6519 unsigned int nr_pages = thp_nr_pages(page); 6520 int ret; 6521 6522 ret = try_charge(memcg, gfp, nr_pages); 6523 if (ret) 6524 goto out; 6525 6526 css_get(&memcg->css); 6527 commit_charge(page, memcg); 6528 6529 local_irq_disable(); 6530 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6531 memcg_check_events(memcg, page); 6532 local_irq_enable(); 6533 out: 6534 return ret; 6535 } 6536 6537 /** 6538 * mem_cgroup_charge - charge a newly allocated page to a cgroup 6539 * @page: page to charge 6540 * @mm: mm context of the victim 6541 * @gfp_mask: reclaim mode 6542 * 6543 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6544 * pages according to @gfp_mask if necessary. 6545 * 6546 * Do not use this for pages allocated for swapin. 6547 * 6548 * Returns 0 on success. Otherwise, an error code is returned. 6549 */ 6550 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 6551 { 6552 struct mem_cgroup *memcg; 6553 int ret; 6554 6555 if (mem_cgroup_disabled()) 6556 return 0; 6557 6558 memcg = get_mem_cgroup_from_mm(mm); 6559 ret = __mem_cgroup_charge(page, memcg, gfp_mask); 6560 css_put(&memcg->css); 6561 6562 return ret; 6563 } 6564 6565 /** 6566 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin 6567 * @page: page to charge 6568 * @mm: mm context of the victim 6569 * @gfp: reclaim mode 6570 * @entry: swap entry for which the page is allocated 6571 * 6572 * This function charges a page allocated for swapin. Please call this before 6573 * adding the page to the swapcache. 6574 * 6575 * Returns 0 on success. Otherwise, an error code is returned. 6576 */ 6577 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, 6578 gfp_t gfp, swp_entry_t entry) 6579 { 6580 struct mem_cgroup *memcg; 6581 unsigned short id; 6582 int ret; 6583 6584 if (mem_cgroup_disabled()) 6585 return 0; 6586 6587 id = lookup_swap_cgroup_id(entry); 6588 rcu_read_lock(); 6589 memcg = mem_cgroup_from_id(id); 6590 if (!memcg || !css_tryget_online(&memcg->css)) 6591 memcg = get_mem_cgroup_from_mm(mm); 6592 rcu_read_unlock(); 6593 6594 ret = __mem_cgroup_charge(page, memcg, gfp); 6595 6596 css_put(&memcg->css); 6597 return ret; 6598 } 6599 6600 /* 6601 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 6602 * @entry: swap entry for which the page is charged 6603 * 6604 * Call this function after successfully adding the charged page to swapcache. 
6605 * 6606 * Note: This function assumes the page for which swap slot is being uncharged 6607 * is order 0 page. 6608 */ 6609 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 6610 { 6611 /* 6612 * Cgroup1's unified memory+swap counter has been charged with the 6613 * new swapcache page, finish the transfer by uncharging the swap 6614 * slot. The swap slot would also get uncharged when it dies, but 6615 * it can stick around indefinitely and we'd count the page twice 6616 * the entire time. 6617 * 6618 * Cgroup2 has separate resource counters for memory and swap, 6619 * so this is a non-issue here. Memory and swap charge lifetimes 6620 * correspond 1:1 to page and swap slot lifetimes: we charge the 6621 * page to memory here, and uncharge swap when the slot is freed. 6622 */ 6623 if (!mem_cgroup_disabled() && do_memsw_account()) { 6624 /* 6625 * The swap entry might not get freed for a long time, 6626 * let's not wait for it. The page already received a 6627 * memory+swap charge, drop the swap entry duplicate. 6628 */ 6629 mem_cgroup_uncharge_swap(entry, 1); 6630 } 6631 } 6632 6633 struct uncharge_gather { 6634 struct mem_cgroup *memcg; 6635 unsigned long nr_memory; 6636 unsigned long pgpgout; 6637 unsigned long nr_kmem; 6638 struct page *dummy_page; 6639 }; 6640 6641 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6642 { 6643 memset(ug, 0, sizeof(*ug)); 6644 } 6645 6646 static void uncharge_batch(const struct uncharge_gather *ug) 6647 { 6648 unsigned long flags; 6649 6650 if (ug->nr_memory) { 6651 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 6652 if (do_memsw_account()) 6653 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 6654 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6655 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6656 memcg_oom_recover(ug->memcg); 6657 } 6658 6659 local_irq_save(flags); 6660 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6661 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 6662 memcg_check_events(ug->memcg, ug->dummy_page); 6663 local_irq_restore(flags); 6664 6665 /* drop reference from uncharge_page */ 6666 css_put(&ug->memcg->css); 6667 } 6668 6669 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6670 { 6671 unsigned long nr_pages; 6672 struct mem_cgroup *memcg; 6673 struct obj_cgroup *objcg; 6674 6675 VM_BUG_ON_PAGE(PageLRU(page), page); 6676 6677 /* 6678 * Nobody should be changing or seriously looking at 6679 * page memcg or objcg at this point, we have fully 6680 * exclusive access to the page. 6681 */ 6682 if (PageMemcgKmem(page)) { 6683 objcg = __page_objcg(page); 6684 /* 6685 * This get matches the put at the end of the function and 6686 * kmem pages do not hold memcg references anymore. 
6687 */ 6688 memcg = get_mem_cgroup_from_objcg(objcg); 6689 } else { 6690 memcg = __page_memcg(page); 6691 } 6692 6693 if (!memcg) 6694 return; 6695 6696 if (ug->memcg != memcg) { 6697 if (ug->memcg) { 6698 uncharge_batch(ug); 6699 uncharge_gather_clear(ug); 6700 } 6701 ug->memcg = memcg; 6702 ug->dummy_page = page; 6703 6704 /* pairs with css_put in uncharge_batch */ 6705 css_get(&memcg->css); 6706 } 6707 6708 nr_pages = compound_nr(page); 6709 6710 if (PageMemcgKmem(page)) { 6711 ug->nr_memory += nr_pages; 6712 ug->nr_kmem += nr_pages; 6713 6714 page->memcg_data = 0; 6715 obj_cgroup_put(objcg); 6716 } else { 6717 /* LRU pages aren't accounted at the root level */ 6718 if (!mem_cgroup_is_root(memcg)) 6719 ug->nr_memory += nr_pages; 6720 ug->pgpgout++; 6721 6722 page->memcg_data = 0; 6723 } 6724 6725 css_put(&memcg->css); 6726 } 6727 6728 /** 6729 * mem_cgroup_uncharge - uncharge a page 6730 * @page: page to uncharge 6731 * 6732 * Uncharge a page previously charged with mem_cgroup_charge(). 6733 */ 6734 void mem_cgroup_uncharge(struct page *page) 6735 { 6736 struct uncharge_gather ug; 6737 6738 if (mem_cgroup_disabled()) 6739 return; 6740 6741 /* Don't touch page->lru of any random page, pre-check: */ 6742 if (!page_memcg(page)) 6743 return; 6744 6745 uncharge_gather_clear(&ug); 6746 uncharge_page(page, &ug); 6747 uncharge_batch(&ug); 6748 } 6749 6750 /** 6751 * mem_cgroup_uncharge_list - uncharge a list of page 6752 * @page_list: list of pages to uncharge 6753 * 6754 * Uncharge a list of pages previously charged with 6755 * mem_cgroup_charge(). 6756 */ 6757 void mem_cgroup_uncharge_list(struct list_head *page_list) 6758 { 6759 struct uncharge_gather ug; 6760 struct page *page; 6761 6762 if (mem_cgroup_disabled()) 6763 return; 6764 6765 uncharge_gather_clear(&ug); 6766 list_for_each_entry(page, page_list, lru) 6767 uncharge_page(page, &ug); 6768 if (ug.memcg) 6769 uncharge_batch(&ug); 6770 } 6771 6772 /** 6773 * mem_cgroup_migrate - charge a page's replacement 6774 * @oldpage: currently circulating page 6775 * @newpage: replacement page 6776 * 6777 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6778 * be uncharged upon free. 6779 * 6780 * Both pages must be locked, @newpage->mapping must be set up. 6781 */ 6782 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6783 { 6784 struct mem_cgroup *memcg; 6785 unsigned int nr_pages; 6786 unsigned long flags; 6787 6788 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6789 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6790 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6791 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6792 newpage); 6793 6794 if (mem_cgroup_disabled()) 6795 return; 6796 6797 /* Page cache replacement: new page already charged? */ 6798 if (page_memcg(newpage)) 6799 return; 6800 6801 memcg = page_memcg(oldpage); 6802 VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); 6803 if (!memcg) 6804 return; 6805 6806 /* Force-charge the new page. 
The old one will be freed soon */ 6807 nr_pages = thp_nr_pages(newpage); 6808 6809 page_counter_charge(&memcg->memory, nr_pages); 6810 if (do_memsw_account()) 6811 page_counter_charge(&memcg->memsw, nr_pages); 6812 6813 css_get(&memcg->css); 6814 commit_charge(newpage, memcg); 6815 6816 local_irq_save(flags); 6817 mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 6818 memcg_check_events(memcg, newpage); 6819 local_irq_restore(flags); 6820 } 6821 6822 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6823 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6824 6825 void mem_cgroup_sk_alloc(struct sock *sk) 6826 { 6827 struct mem_cgroup *memcg; 6828 6829 if (!mem_cgroup_sockets_enabled) 6830 return; 6831 6832 /* Do not associate the sock with unrelated interrupted task's memcg. */ 6833 if (in_interrupt()) 6834 return; 6835 6836 rcu_read_lock(); 6837 memcg = mem_cgroup_from_task(current); 6838 if (memcg == root_mem_cgroup) 6839 goto out; 6840 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6841 goto out; 6842 if (css_tryget(&memcg->css)) 6843 sk->sk_memcg = memcg; 6844 out: 6845 rcu_read_unlock(); 6846 } 6847 6848 void mem_cgroup_sk_free(struct sock *sk) 6849 { 6850 if (sk->sk_memcg) 6851 css_put(&sk->sk_memcg->css); 6852 } 6853 6854 /** 6855 * mem_cgroup_charge_skmem - charge socket memory 6856 * @memcg: memcg to charge 6857 * @nr_pages: number of pages to charge 6858 * 6859 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 6860 * @memcg's configured limit, %false if the charge had to be forced. 6861 */ 6862 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6863 { 6864 gfp_t gfp_mask = GFP_KERNEL; 6865 6866 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6867 struct page_counter *fail; 6868 6869 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 6870 memcg->tcpmem_pressure = 0; 6871 return true; 6872 } 6873 page_counter_charge(&memcg->tcpmem, nr_pages); 6874 memcg->tcpmem_pressure = 1; 6875 return false; 6876 } 6877 6878 /* Don't block in the packet receive path */ 6879 if (in_softirq()) 6880 gfp_mask = GFP_NOWAIT; 6881 6882 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 6883 6884 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 6885 return true; 6886 6887 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 6888 return false; 6889 } 6890 6891 /** 6892 * mem_cgroup_uncharge_skmem - uncharge socket memory 6893 * @memcg: memcg to uncharge 6894 * @nr_pages: number of pages to uncharge 6895 */ 6896 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6897 { 6898 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6899 page_counter_uncharge(&memcg->tcpmem, nr_pages); 6900 return; 6901 } 6902 6903 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 6904 6905 refill_stock(memcg, nr_pages); 6906 } 6907 6908 static int __init cgroup_memory(char *s) 6909 { 6910 char *token; 6911 6912 while ((token = strsep(&s, ",")) != NULL) { 6913 if (!*token) 6914 continue; 6915 if (!strcmp(token, "nosocket")) 6916 cgroup_memory_nosocket = true; 6917 if (!strcmp(token, "nokmem")) 6918 cgroup_memory_nokmem = true; 6919 } 6920 return 0; 6921 } 6922 __setup("cgroup.memory=", cgroup_memory); 6923 6924 /* 6925 * subsys_initcall() for memory controller. 
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

	/*
	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
	 * used for per-memcg-per-cpu caching of per-node statistics. To work
	 * correctly, we should make sure that the overfill threshold can't
	 * exceed S32_MAX / PAGE_SIZE.
	 */
	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
				    node_online(node) ? node : NUMA_NO_NODE);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}

/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned int nr_entries;
	unsigned short oldid;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (mem_cgroup_disabled())
		return;

	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	memcg = page_memcg(page);

	VM_WARN_ON_ONCE_PAGE(!memcg, page);
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
7018 */ 7019 swap_memcg = mem_cgroup_id_get_online(memcg); 7020 nr_entries = thp_nr_pages(page); 7021 /* Get references for the tail pages, too */ 7022 if (nr_entries > 1) 7023 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7024 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7025 nr_entries); 7026 VM_BUG_ON_PAGE(oldid, page); 7027 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7028 7029 page->memcg_data = 0; 7030 7031 if (!mem_cgroup_is_root(memcg)) 7032 page_counter_uncharge(&memcg->memory, nr_entries); 7033 7034 if (!cgroup_memory_noswap && memcg != swap_memcg) { 7035 if (!mem_cgroup_is_root(swap_memcg)) 7036 page_counter_charge(&swap_memcg->memsw, nr_entries); 7037 page_counter_uncharge(&memcg->memsw, nr_entries); 7038 } 7039 7040 /* 7041 * Interrupts should be disabled here because the caller holds the 7042 * i_pages lock which is taken with interrupts-off. It is 7043 * important here to have the interrupts disabled because it is the 7044 * only synchronisation we have for updating the per-CPU variables. 7045 */ 7046 VM_BUG_ON(!irqs_disabled()); 7047 mem_cgroup_charge_statistics(memcg, page, -nr_entries); 7048 memcg_check_events(memcg, page); 7049 7050 css_put(&memcg->css); 7051 } 7052 7053 /** 7054 * mem_cgroup_try_charge_swap - try charging swap space for a page 7055 * @page: page being added to swap 7056 * @entry: swap entry to charge 7057 * 7058 * Try to charge @page's memcg for the swap space at @entry. 7059 * 7060 * Returns 0 on success, -ENOMEM on failure. 7061 */ 7062 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 7063 { 7064 unsigned int nr_pages = thp_nr_pages(page); 7065 struct page_counter *counter; 7066 struct mem_cgroup *memcg; 7067 unsigned short oldid; 7068 7069 if (mem_cgroup_disabled()) 7070 return 0; 7071 7072 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7073 return 0; 7074 7075 memcg = page_memcg(page); 7076 7077 VM_WARN_ON_ONCE_PAGE(!memcg, page); 7078 if (!memcg) 7079 return 0; 7080 7081 if (!entry.val) { 7082 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7083 return 0; 7084 } 7085 7086 memcg = mem_cgroup_id_get_online(memcg); 7087 7088 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 7089 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7090 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7091 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7092 mem_cgroup_id_put(memcg); 7093 return -ENOMEM; 7094 } 7095 7096 /* Get references for the tail pages, too */ 7097 if (nr_pages > 1) 7098 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7099 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7100 VM_BUG_ON_PAGE(oldid, page); 7101 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7102 7103 return 0; 7104 } 7105 7106 /** 7107 * mem_cgroup_uncharge_swap - uncharge swap space 7108 * @entry: swap entry to uncharge 7109 * @nr_pages: the amount of swap space to uncharge 7110 */ 7111 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7112 { 7113 struct mem_cgroup *memcg; 7114 unsigned short id; 7115 7116 id = swap_cgroup_record(entry, 0, nr_pages); 7117 rcu_read_lock(); 7118 memcg = mem_cgroup_from_id(id); 7119 if (memcg) { 7120 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 7121 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7122 page_counter_uncharge(&memcg->swap, nr_pages); 7123 else 7124 page_counter_uncharge(&memcg->memsw, nr_pages); 7125 } 7126 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7127 mem_cgroup_id_put_many(memcg, nr_pages); 7128 } 7129 rcu_read_unlock(); 7130 } 
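/*
 * mem_cgroup_get_nr_swap_pages() below clamps the globally available
 * swap by the remaining swap allowance of each ancestor (cgroup v2
 * with swap accounting enabled only). As a rough example, assuming
 * 4KiB pages, ~8GiB of free swap (2097152 pages) and a hierarchy
 *
 *	A	swap.max = 1G (262144 pages), usage = 256M (65536 pages)
 *	A/B	swap.max = "max", usage = 64M
 *
 * a request on behalf of B yields min(2097152, 262144 - 65536) =
 * 196608 pages (768M): the tightest ancestor allowance wins, not the
 * global figure.
 */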
7131 7132 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7133 { 7134 long nr_swap_pages = get_nr_swap_pages(); 7135 7136 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7137 return nr_swap_pages; 7138 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7139 nr_swap_pages = min_t(long, nr_swap_pages, 7140 READ_ONCE(memcg->swap.max) - 7141 page_counter_read(&memcg->swap)); 7142 return nr_swap_pages; 7143 } 7144 7145 bool mem_cgroup_swap_full(struct page *page) 7146 { 7147 struct mem_cgroup *memcg; 7148 7149 VM_BUG_ON_PAGE(!PageLocked(page), page); 7150 7151 if (vm_swap_full()) 7152 return true; 7153 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7154 return false; 7155 7156 memcg = page_memcg(page); 7157 if (!memcg) 7158 return false; 7159 7160 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 7161 unsigned long usage = page_counter_read(&memcg->swap); 7162 7163 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7164 usage * 2 >= READ_ONCE(memcg->swap.max)) 7165 return true; 7166 } 7167 7168 return false; 7169 } 7170 7171 static int __init setup_swap_account(char *s) 7172 { 7173 if (!strcmp(s, "1")) 7174 cgroup_memory_noswap = false; 7175 else if (!strcmp(s, "0")) 7176 cgroup_memory_noswap = true; 7177 return 1; 7178 } 7179 __setup("swapaccount=", setup_swap_account); 7180 7181 static u64 swap_current_read(struct cgroup_subsys_state *css, 7182 struct cftype *cft) 7183 { 7184 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7185 7186 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7187 } 7188 7189 static int swap_high_show(struct seq_file *m, void *v) 7190 { 7191 return seq_puts_memcg_tunable(m, 7192 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7193 } 7194 7195 static ssize_t swap_high_write(struct kernfs_open_file *of, 7196 char *buf, size_t nbytes, loff_t off) 7197 { 7198 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7199 unsigned long high; 7200 int err; 7201 7202 buf = strstrip(buf); 7203 err = page_counter_memparse(buf, "max", &high); 7204 if (err) 7205 return err; 7206 7207 page_counter_set_high(&memcg->swap, high); 7208 7209 return nbytes; 7210 } 7211 7212 static int swap_max_show(struct seq_file *m, void *v) 7213 { 7214 return seq_puts_memcg_tunable(m, 7215 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7216 } 7217 7218 static ssize_t swap_max_write(struct kernfs_open_file *of, 7219 char *buf, size_t nbytes, loff_t off) 7220 { 7221 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7222 unsigned long max; 7223 int err; 7224 7225 buf = strstrip(buf); 7226 err = page_counter_memparse(buf, "max", &max); 7227 if (err) 7228 return err; 7229 7230 xchg(&memcg->swap.max, max); 7231 7232 return nbytes; 7233 } 7234 7235 static int swap_events_show(struct seq_file *m, void *v) 7236 { 7237 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7238 7239 seq_printf(m, "high %lu\n", 7240 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7241 seq_printf(m, "max %lu\n", 7242 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7243 seq_printf(m, "fail %lu\n", 7244 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7245 7246 return 0; 7247 } 7248 7249 static struct cftype swap_files[] = { 7250 { 7251 .name = "swap.current", 7252 .flags = CFTYPE_NOT_ON_ROOT, 7253 .read_u64 = swap_current_read, 7254 }, 7255 { 7256 .name = "swap.high", 7257 .flags = CFTYPE_NOT_ON_ROOT, 7258 .seq_show = swap_high_show, 7259 .write = swap_high_write, 7260 }, 7261 { 7262 .name = 
"swap.max", 7263 .flags = CFTYPE_NOT_ON_ROOT, 7264 .seq_show = swap_max_show, 7265 .write = swap_max_write, 7266 }, 7267 { 7268 .name = "swap.events", 7269 .flags = CFTYPE_NOT_ON_ROOT, 7270 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7271 .seq_show = swap_events_show, 7272 }, 7273 { } /* terminate */ 7274 }; 7275 7276 static struct cftype memsw_files[] = { 7277 { 7278 .name = "memsw.usage_in_bytes", 7279 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7280 .read_u64 = mem_cgroup_read_u64, 7281 }, 7282 { 7283 .name = "memsw.max_usage_in_bytes", 7284 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7285 .write = mem_cgroup_reset, 7286 .read_u64 = mem_cgroup_read_u64, 7287 }, 7288 { 7289 .name = "memsw.limit_in_bytes", 7290 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7291 .write = mem_cgroup_write, 7292 .read_u64 = mem_cgroup_read_u64, 7293 }, 7294 { 7295 .name = "memsw.failcnt", 7296 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7297 .write = mem_cgroup_reset, 7298 .read_u64 = mem_cgroup_read_u64, 7299 }, 7300 { }, /* terminate */ 7301 }; 7302 7303 /* 7304 * If mem_cgroup_swap_init() is implemented as a subsys_initcall() 7305 * instead of a core_initcall(), this could mean cgroup_memory_noswap still 7306 * remains set to false even when memcg is disabled via "cgroup_disable=memory" 7307 * boot parameter. This may result in premature OOPS inside 7308 * mem_cgroup_get_nr_swap_pages() function in corner cases. 7309 */ 7310 static int __init mem_cgroup_swap_init(void) 7311 { 7312 /* No memory control -> no swap control */ 7313 if (mem_cgroup_disabled()) 7314 cgroup_memory_noswap = true; 7315 7316 if (cgroup_memory_noswap) 7317 return 0; 7318 7319 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7320 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7321 7322 return 0; 7323 } 7324 core_initcall(mem_cgroup_swap_init); 7325 7326 #endif /* CONFIG_MEMCG_SWAP */ 7327