// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
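
/*
 * Example (illustrative, not in the original source): how the MEMFILE_*
 * helpers pack and unpack cft->private. Pairing the _MEMSWAP resource
 * type with a placeholder attribute index of 1:
 *
 *	unsigned long priv = MEMFILE_PRIVATE(_MEMSWAP, 1);	// 0x10001
 *
 *	MEMFILE_TYPE(priv);	// == _MEMSWAP, from (priv >> 16) & 0xffff
 *	MEMFILE_ATTR(priv);	// == 1, from priv & 0xffff
 *
 * The attribute index "1" is only a placeholder here; the real attribute
 * constants are defined elsewhere in this file.
 */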

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *    PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *    the stock is flushed,
	 *    objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *    92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *    92 bytes are added to objcg->nr_charged_bytes
	 *
	 * In the result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned. The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page_memcg(page);

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = page_memcg_check(page);

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
 * adding stats flushing in such codepaths is not desirable. So, to optimize the
 * flushing the kernel does:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds so the
 *    rstat update tree does not grow unbounded.
 *
 * 2) Flush the stats synchronously on the reader side only when there are more
 *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets
 *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates,
 *    but only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
static u64 flush_next_time;

#define FLUSH_TIME (2UL*HZ)

/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because it
 * cannot rely on this as part of an acquired spinlock_t lock. These functions
 * are never used in hardirq context on PREEMPT_RT and therefore disabling
 * preemption is sufficient.
 */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}

static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	unsigned int x;

	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	x = __this_cpu_add_return(stats_updates, abs(val));
	if (x > MEMCG_CHARGE_BATCH) {
		/*
		 * If stats_flush_threshold exceeds the threshold
		 * (>num_online_cpus()), cgroup stats update will be triggered
		 * in __mem_cgroup_flush_stats(). Increasing this var further
		 * is redundant and simply adds overhead in atomic update.
		 */
		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
		__this_cpu_write(stats_updates, 0);
	}
}

static void __mem_cgroup_flush_stats(void)
{
	unsigned long flag;

	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
		return;

	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
	atomic_set(&stats_flush_threshold, 0);
	spin_unlock_irqrestore(&stats_flush_lock, flag);
}

void mem_cgroup_flush_stats(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		__mem_cgroup_flush_stats();
}

void mem_cgroup_flush_stats_delayed(void)
{
	if (time_after64(jiffies_64, flush_next_time))
		mem_cgroup_flush_stats();
}

static void flush_memcg_stats_dwork(struct work_struct *w)
{
	__mem_cgroup_flush_stats();
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	PGPGIN,
	PGPGOUT,
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	ZSWPIN,
	ZSWPOUT,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

static void init_memcg_events(void)
{
	int i;

	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
}

static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx] - 1;
}

struct memcg_vmstats_percpu {
	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_NR_STAT];
	unsigned long events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_NR_STAT];
	unsigned long events_pending[NR_MEMCG_EVENTS];
};

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats->state[idx]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	memcg_rstat_updated(memcg, val);
}
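
/*
 * Example (illustrative, not in the original source): a reader that wants
 * reasonably fresh counters ratelimits the flush through the helper above
 * before sampling, e.g.:
 *
 *	mem_cgroup_flush_stats();
 *	nr_file = memcg_page_state(memcg, NR_FILE_PAGES);
 *
 * The flush only happens once enough update events have accumulated
 * (stats_flush_threshold > num_online_cpus()); otherwise the read may lag
 * by at most MEMCG_CHARGE_BATCH * nr_cpus updates, for up to 2 seconds.
 */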

/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For these counters
	 * we check that the update is never performed from an interrupt
	 * context, while other callers need to have interrupts disabled.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
		case NR_SHMEM_PMDMAPPED:
		case NR_FILE_PMDMAPPED:
			WARN_ON_ONCE(!in_task());
			break;
		default:
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

	memcg_rstat_updated(memcg, val);
	memcg_stats_unlock();
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);
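
/*
 * Example (illustrative, not in the original source): a typical update site
 * charges a node-level counter against the page's memcg and lruvec in one
 * call. The __ variant above expects interrupts (or, on PREEMPT_RT,
 * preemption) to already be disabled; callers that cannot guarantee that
 * would use the plain mod_lruvec_page_state() wrapper instead, e.g.:
 *
 *	mod_lruvec_page_state(page, NR_FILE_DIRTY, 1);	// page dirtied
 *	...
 *	mod_lruvec_page_state(page, NR_FILE_DIRTY, -1);	// page cleaned
 */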

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int index = memcg_events_index(idx);

	if (mem_cgroup_disabled() || index < 0)
		return;

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;
	return READ_ONCE(memcg->vmstats->events[index]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
	return x;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, nid);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

static __always_inline bool memcg_kmem_bypass(void)
{
	/* Allow remote memcg charging from any context. */
	if (unlikely(active_memcg()))
		return false;

	/* Memcg to charge can't be determined. */
	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
		return true;

	return false;
}
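
/*
 * Example (illustrative, not in the original source): "remote charging"
 * from a context with no usable mm, e.g. a kthread doing work on behalf of
 * a cgroup. The caller installs an active memcg, and subsequent charges
 * (including get_mem_cgroup_from_mm() with a NULL mm) are attributed to it:
 *
 *	struct mem_cgroup *old;
 *
 *	old = set_active_memcg(memcg);	// "memcg" is a hypothetical target
 *	// ... allocations / get_mem_cgroup_from_mm(NULL) charge "memcg" ...
 *	set_active_memcg(old);
 *
 * set_active_memcg() is declared in <linux/sched/mm.h>.
 */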

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
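
/*
 * Example (illustrative, not in the original source): the canonical walk
 * over a subtree. Each returned memcg carries a reference that is handed
 * back via @prev on the next call; an early exit must drop it with
 * mem_cgroup_iter_break():
 *
 *	struct mem_cgroup *iter = mem_cgroup_iter(root, NULL, NULL);
 *
 *	while (iter) {
 *		if (should_stop(iter)) {	// should_stop() is hypothetical
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *		iter = mem_cgroup_iter(root, iter, NULL);
 *	}
 *
 * The for_each_mem_cgroup_tree() macro earlier in this file wraps the same
 * pattern.
 */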

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (!mem_cgroup_is_root(last))
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);

	if (!memcg)
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
	else
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif

/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held.
 */
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irq(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 * @flags: Pointer to irqsave flags.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
					 unsigned long *flags)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irqsave(&lruvec->lru_lock, *flags);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}
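
/*
 * Example (illustrative, not in the original source): callers that need to
 * touch a folio's LRU state pair the helpers above with
 * unlock_page_lruvec_irqrestore() from <linux/memcontrol.h>, keeping the
 * whole manipulation under lru_lock:
 *
 *	unsigned long flags;
 *	struct lruvec *lruvec;
 *
 *	lruvec = folio_lruvec_lock_irqsave(folio, &flags);
 *	// ... move the folio between LRU lists, update counters ...
 *	unlock_page_lruvec_irqrestore(lruvec, flags);
 */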

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "memcg" is under move_account() or not.
 *
 * Checking whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

struct memory_stat {
	const char *name;
	unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
	{ "anon", NR_ANON_MAPPED },
	{ "file", NR_FILE_PAGES },
	{ "kernel", MEMCG_KMEM },
	{ "kernel_stack", NR_KERNEL_STACK_KB },
	{ "pagetables", NR_PAGETABLE },
	{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
	{ "percpu", MEMCG_PERCPU_B },
	{ "sock", MEMCG_SOCK },
	{ "vmalloc", MEMCG_VMALLOC },
	{ "shmem", NR_SHMEM },
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	{ "zswap", MEMCG_ZSWAP_B },
	{ "zswapped", MEMCG_ZSWAPPED },
#endif
	{ "file_mapped", NR_FILE_MAPPED },
	{ "file_dirty", NR_FILE_DIRTY },
	{ "file_writeback", NR_WRITEBACK },
#ifdef CONFIG_SWAP
	{ "swapcached", NR_SWAPCACHE },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp", NR_ANON_THPS },
	{ "file_thp", NR_FILE_THPS },
	{ "shmem_thp", NR_SHMEM_THPS },
#endif
	{ "inactive_anon", NR_INACTIVE_ANON },
	{ "active_anon", NR_ACTIVE_ANON },
	{ "inactive_file", NR_INACTIVE_FILE },
	{ "active_file", NR_ACTIVE_FILE },
	{ "unevictable", NR_UNEVICTABLE },
	{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file", WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	case NR_KERNEL_STACK_KB:
		return SZ_1K;
	default:
		return PAGE_SIZE;
	}
}

static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}

static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
{
	struct seq_buf s;
	int i;

	seq_buf_init(&s, buf, bufsize);

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);

		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(&s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(&s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(&s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;

		seq_buf_printf(&s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
}
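
/*
 * Example (illustrative, not in the original source): memory.stat values
 * are emitted in bytes (or plain event counts), so memcg_page_state_output()
 * scales each raw counter by its native unit:
 *
 *	"file"         counts pages      -> value * PAGE_SIZE
 *	"kernel_stack" counts KiB        -> value * SZ_1K
 *	"slab_*"       already in bytes  -> value * 1
 *
 * e.g. a "file" count of 3 with a 4KiB page size is reported as 12288.
 */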

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	/* Use static buffer, for the caller is holding oom_lock. */
	static char buf[PAGE_SIZE];

	lockdep_assert_held(&oom_lock);

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	memory_stat_format(memcg, buf, sizeof(buf));
	pr_info("%s", buf);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (do_memsw_account()) {
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	} else {
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	}
	return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}
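
/*
 * Example (illustrative, not in the original source): with the legacy memsw
 * counter active, a memcg with memory.max = 256M and memsw.max = 384M (and
 * swappiness > 0) has a swap excess of 128M, so mem_cgroup_get_max() reports
 * 256M + min(128M, total_swap_pages) worth of pages. On the default
 * hierarchy, the swap.max limit is added instead of the memsw excess.
 */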

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we do
				 * not reclaim too much, nor too little so
				 * that we keep coming back to reclaim from
				 * this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (memcg->oom_kill_disable) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
			current->memcg_oom_gfp_mask = mask;
			current->memcg_oom_order = order;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);
	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	if (locked)
		mem_cgroup_oom_unlock(memcg);

	return ret;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
					 current->memcg_oom_order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges. Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}
2027 */ 2028 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2029 struct mem_cgroup *oom_domain) 2030 { 2031 struct mem_cgroup *oom_group = NULL; 2032 struct mem_cgroup *memcg; 2033 2034 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2035 return NULL; 2036 2037 if (!oom_domain) 2038 oom_domain = root_mem_cgroup; 2039 2040 rcu_read_lock(); 2041 2042 memcg = mem_cgroup_from_task(victim); 2043 if (mem_cgroup_is_root(memcg)) 2044 goto out; 2045 2046 /* 2047 * If the victim task has been asynchronously moved to a different 2048 * memory cgroup, we might end up killing tasks outside oom_domain. 2049 * In this case it's better to ignore memory.group.oom. 2050 */ 2051 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2052 goto out; 2053 2054 /* 2055 * Traverse the memory cgroup hierarchy from the victim task's 2056 * cgroup up to the OOMing cgroup (or root) to find the 2057 * highest-level memory cgroup with oom.group set. 2058 */ 2059 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2060 if (memcg->oom_group) 2061 oom_group = memcg; 2062 2063 if (memcg == oom_domain) 2064 break; 2065 } 2066 2067 if (oom_group) 2068 css_get(&oom_group->css); 2069 out: 2070 rcu_read_unlock(); 2071 2072 return oom_group; 2073 } 2074 2075 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2076 { 2077 pr_info("Tasks in "); 2078 pr_cont_cgroup_path(memcg->css.cgroup); 2079 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2080 } 2081 2082 /** 2083 * folio_memcg_lock - Bind a folio to its memcg. 2084 * @folio: The folio. 2085 * 2086 * This function prevents unlocked LRU folios from being moved to 2087 * another cgroup. 2088 * 2089 * It ensures lifetime of the bound memcg. The caller is responsible 2090 * for the lifetime of the folio. 2091 */ 2092 void folio_memcg_lock(struct folio *folio) 2093 { 2094 struct mem_cgroup *memcg; 2095 unsigned long flags; 2096 2097 /* 2098 * The RCU lock is held throughout the transaction. The fast 2099 * path can get away without acquiring the memcg->move_lock 2100 * because page moving starts with an RCU grace period. 2101 */ 2102 rcu_read_lock(); 2103 2104 if (mem_cgroup_disabled()) 2105 return; 2106 again: 2107 memcg = folio_memcg(folio); 2108 if (unlikely(!memcg)) 2109 return; 2110 2111 #ifdef CONFIG_PROVE_LOCKING 2112 local_irq_save(flags); 2113 might_lock(&memcg->move_lock); 2114 local_irq_restore(flags); 2115 #endif 2116 2117 if (atomic_read(&memcg->moving_account) <= 0) 2118 return; 2119 2120 spin_lock_irqsave(&memcg->move_lock, flags); 2121 if (memcg != folio_memcg(folio)) { 2122 spin_unlock_irqrestore(&memcg->move_lock, flags); 2123 goto again; 2124 } 2125 2126 /* 2127 * When charge migration first begins, we can have multiple 2128 * critical sections holding the fast-path RCU lock and one 2129 * holding the slowpath move_lock. Track the task who has the 2130 * move_lock for unlock_page_memcg(). 
2131 */ 2132 memcg->move_lock_task = current; 2133 memcg->move_lock_flags = flags; 2134 } 2135 2136 void lock_page_memcg(struct page *page) 2137 { 2138 folio_memcg_lock(page_folio(page)); 2139 } 2140 2141 static void __folio_memcg_unlock(struct mem_cgroup *memcg) 2142 { 2143 if (memcg && memcg->move_lock_task == current) { 2144 unsigned long flags = memcg->move_lock_flags; 2145 2146 memcg->move_lock_task = NULL; 2147 memcg->move_lock_flags = 0; 2148 2149 spin_unlock_irqrestore(&memcg->move_lock, flags); 2150 } 2151 2152 rcu_read_unlock(); 2153 } 2154 2155 /** 2156 * folio_memcg_unlock - Release the binding between a folio and its memcg. 2157 * @folio: The folio. 2158 * 2159 * This releases the binding created by folio_memcg_lock(). This does 2160 * not change the accounting of this folio to its memcg, but it does 2161 * permit others to change it. 2162 */ 2163 void folio_memcg_unlock(struct folio *folio) 2164 { 2165 __folio_memcg_unlock(folio_memcg(folio)); 2166 } 2167 2168 void unlock_page_memcg(struct page *page) 2169 { 2170 folio_memcg_unlock(page_folio(page)); 2171 } 2172 2173 struct memcg_stock_pcp { 2174 local_lock_t stock_lock; 2175 struct mem_cgroup *cached; /* this never be root cgroup */ 2176 unsigned int nr_pages; 2177 2178 #ifdef CONFIG_MEMCG_KMEM 2179 struct obj_cgroup *cached_objcg; 2180 struct pglist_data *cached_pgdat; 2181 unsigned int nr_bytes; 2182 int nr_slab_reclaimable_b; 2183 int nr_slab_unreclaimable_b; 2184 #endif 2185 2186 struct work_struct work; 2187 unsigned long flags; 2188 #define FLUSHING_CACHED_CHARGE 0 2189 }; 2190 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { 2191 .stock_lock = INIT_LOCAL_LOCK(stock_lock), 2192 }; 2193 static DEFINE_MUTEX(percpu_charge_mutex); 2194 2195 #ifdef CONFIG_MEMCG_KMEM 2196 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); 2197 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2198 struct mem_cgroup *root_memcg); 2199 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); 2200 2201 #else 2202 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 2203 { 2204 return NULL; 2205 } 2206 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2207 struct mem_cgroup *root_memcg) 2208 { 2209 return false; 2210 } 2211 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2212 { 2213 } 2214 #endif 2215 2216 /** 2217 * consume_stock: Try to consume stocked charge on this cpu. 2218 * @memcg: memcg to consume from. 2219 * @nr_pages: how many pages to charge. 2220 * 2221 * The charges will only happen if @memcg matches the current cpu's memcg 2222 * stock, and at least @nr_pages are available in that stock. Failure to 2223 * service an allocation will refill the stock. 2224 * 2225 * returns true if successful, false otherwise. 2226 */ 2227 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2228 { 2229 struct memcg_stock_pcp *stock; 2230 unsigned long flags; 2231 bool ret = false; 2232 2233 if (nr_pages > MEMCG_CHARGE_BATCH) 2234 return ret; 2235 2236 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2237 2238 stock = this_cpu_ptr(&memcg_stock); 2239 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2240 stock->nr_pages -= nr_pages; 2241 ret = true; 2242 } 2243 2244 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2245 2246 return ret; 2247 } 2248 2249 /* 2250 * Returns stocks cached in percpu and reset cached information. 
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (!old)
		return;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		stock->nr_pages = 0;
	}

	css_put(&old->css);
	stock->cached = NULL;
}

static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	/*
	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
	 * drain_stock races is that we always operate on the local CPU stock
	 * here with IRQs disabled.
	 */
	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	old = drain_obj_stock(stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	if (old)
		obj_cgroup_put(old);
}

/*
 * Cache charges (nr_pages) in the local per-CPU area.
 * They will be consumed later by consume_stock().
 */
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;

	stock = this_cpu_ptr(&memcg_stock);
	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		css_get(&memcg->css);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;

	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
		drain_stock(stock);
}

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	__refill_stock(memcg, nr_pages);
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}

/*
 * Drain all per-CPU charge caches for the given root_memcg and the
 * subtree of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that a system-wide "drain" is running.
	 * We do not care about races with cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;
		bool flush = false;

		rcu_read_lock();
		memcg = stock->cached;
		if (memcg && stock->nr_pages &&
		    mem_cgroup_is_descendant(memcg, root_memcg))
			flush = true;
		else if (obj_stock_flush_required(stock, root_memcg))
			flush = true;
		rcu_read_unlock();

		if (flush &&
		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	migrate_enable();
	mutex_unlock(&percpu_charge_mutex);
}

static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);

	return 0;
}

static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  unsigned int nr_pages,
				  gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	do {
		unsigned long pflags;

		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							     gfp_mask,
							     MEMCG_RECLAIM_MAY_SWAP);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}

static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these on either side of the
 * exponentiation to maintain precision and scale to a reasonable number of
 * jiffies (see the table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce it to a reasonable number of jiffies,
 *   and to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially.
For 2433 * example, with a high of 100 megabytes: 2434 * 2435 * +-------+------------------------+ 2436 * | usage | time to allocate in ms | 2437 * +-------+------------------------+ 2438 * | 100M | 0 | 2439 * | 101M | 6 | 2440 * | 102M | 25 | 2441 * | 103M | 57 | 2442 * | 104M | 102 | 2443 * | 105M | 159 | 2444 * | 106M | 230 | 2445 * | 107M | 313 | 2446 * | 108M | 409 | 2447 * | 109M | 518 | 2448 * | 110M | 639 | 2449 * | 111M | 774 | 2450 * | 112M | 921 | 2451 * | 113M | 1081 | 2452 * | 114M | 1254 | 2453 * | 115M | 1439 | 2454 * | 116M | 1638 | 2455 * | 117M | 1849 | 2456 * | 118M | 2000 | 2457 * | 119M | 2000 | 2458 * | 120M | 2000 | 2459 * +-------+------------------------+ 2460 */ 2461 #define MEMCG_DELAY_PRECISION_SHIFT 20 2462 #define MEMCG_DELAY_SCALING_SHIFT 14 2463 2464 static u64 calculate_overage(unsigned long usage, unsigned long high) 2465 { 2466 u64 overage; 2467 2468 if (usage <= high) 2469 return 0; 2470 2471 /* 2472 * Prevent division by 0 in overage calculation by acting as if 2473 * it was a threshold of 1 page 2474 */ 2475 high = max(high, 1UL); 2476 2477 overage = usage - high; 2478 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2479 return div64_u64(overage, high); 2480 } 2481 2482 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2483 { 2484 u64 overage, max_overage = 0; 2485 2486 do { 2487 overage = calculate_overage(page_counter_read(&memcg->memory), 2488 READ_ONCE(memcg->memory.high)); 2489 max_overage = max(overage, max_overage); 2490 } while ((memcg = parent_mem_cgroup(memcg)) && 2491 !mem_cgroup_is_root(memcg)); 2492 2493 return max_overage; 2494 } 2495 2496 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2497 { 2498 u64 overage, max_overage = 0; 2499 2500 do { 2501 overage = calculate_overage(page_counter_read(&memcg->swap), 2502 READ_ONCE(memcg->swap.high)); 2503 if (overage) 2504 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2505 max_overage = max(overage, max_overage); 2506 } while ((memcg = parent_mem_cgroup(memcg)) && 2507 !mem_cgroup_is_root(memcg)); 2508 2509 return max_overage; 2510 } 2511 2512 /* 2513 * Get the number of jiffies that we should penalise a mischievous cgroup which 2514 * is exceeding its memory.high by checking both it and its ancestors. 2515 */ 2516 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2517 unsigned int nr_pages, 2518 u64 max_overage) 2519 { 2520 unsigned long penalty_jiffies; 2521 2522 if (!max_overage) 2523 return 0; 2524 2525 /* 2526 * We use overage compared to memory.high to calculate the number of 2527 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2528 * fairly lenient on small overages, and increasingly harsh when the 2529 * memcg in question makes it clear that it has no intention of stopping 2530 * its crazy behaviour, so we exponentially increase the delay based on 2531 * overage amount. 2532 */ 2533 penalty_jiffies = max_overage * max_overage * HZ; 2534 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2535 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2536 2537 /* 2538 * Factor in the task's own contribution to the overage, such that four 2539 * N-sized allocations are throttled approximately the same as one 2540 * 4N-sized allocation. 2541 * 2542 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2543 * larger the current charge patch is than that. 
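	 *
	 * A rough worked instance of the maths above (illustrative only,
	 * assuming HZ=1000 and the 100M memory.high used in the table
	 * further up): at 110M of usage the overage ratio is 0.1, so
	 *
	 *	max_overage     = ((110M - 100M) << 20) / 100M          ~= 104857
	 *	penalty_jiffies = (104857 * 104857 * HZ) >> 20 >> 14    ~= 640
	 *
	 * which is the ~639ms row in the table. A task charging a full
	 * MEMCG_CHARGE_BATCH sleeps for all of it; one charging a single
	 * page is scaled down proportionally by the division below.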
2544 */ 2545 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2546 } 2547 2548 /* 2549 * Scheduled by try_charge() to be executed from the userland return path 2550 * and reclaims memory over the high limit. 2551 */ 2552 void mem_cgroup_handle_over_high(void) 2553 { 2554 unsigned long penalty_jiffies; 2555 unsigned long pflags; 2556 unsigned long nr_reclaimed; 2557 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2558 int nr_retries = MAX_RECLAIM_RETRIES; 2559 struct mem_cgroup *memcg; 2560 bool in_retry = false; 2561 2562 if (likely(!nr_pages)) 2563 return; 2564 2565 memcg = get_mem_cgroup_from_mm(current->mm); 2566 current->memcg_nr_pages_over_high = 0; 2567 2568 retry_reclaim: 2569 /* 2570 * The allocating task should reclaim at least the batch size, but for 2571 * subsequent retries we only want to do what's necessary to prevent oom 2572 * or breaching resource isolation. 2573 * 2574 * This is distinct from memory.max or page allocator behaviour because 2575 * memory.high is currently batched, whereas memory.max and the page 2576 * allocator run every time an allocation is made. 2577 */ 2578 nr_reclaimed = reclaim_high(memcg, 2579 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2580 GFP_KERNEL); 2581 2582 /* 2583 * memory.high is breached and reclaim is unable to keep up. Throttle 2584 * allocators proactively to slow down excessive growth. 2585 */ 2586 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2587 mem_find_max_overage(memcg)); 2588 2589 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2590 swap_find_max_overage(memcg)); 2591 2592 /* 2593 * Clamp the max delay per usermode return so as to still keep the 2594 * application moving forwards and also permit diagnostics, albeit 2595 * extremely slowly. 2596 */ 2597 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2598 2599 /* 2600 * Don't sleep if the amount of jiffies this memcg owes us is so low 2601 * that it's not even worth doing, in an attempt to be nice to those who 2602 * go only a small amount over their memory.high value and maybe haven't 2603 * been aggressively reclaimed enough yet. 2604 */ 2605 if (penalty_jiffies <= HZ / 100) 2606 goto out; 2607 2608 /* 2609 * If reclaim is making forward progress but we're still over 2610 * memory.high, we want to encourage that rather than doing allocator 2611 * throttling. 2612 */ 2613 if (nr_reclaimed || nr_retries--) { 2614 in_retry = true; 2615 goto retry_reclaim; 2616 } 2617 2618 /* 2619 * If we exit early, we're guaranteed to die (since 2620 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2621 * need to account for any ill-begotten jiffies to pay them off later. 
2622 */ 2623 psi_memstall_enter(&pflags); 2624 schedule_timeout_killable(penalty_jiffies); 2625 psi_memstall_leave(&pflags); 2626 2627 out: 2628 css_put(&memcg->css); 2629 } 2630 2631 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 2632 unsigned int nr_pages) 2633 { 2634 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2635 int nr_retries = MAX_RECLAIM_RETRIES; 2636 struct mem_cgroup *mem_over_limit; 2637 struct page_counter *counter; 2638 unsigned long nr_reclaimed; 2639 bool passed_oom = false; 2640 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; 2641 bool drained = false; 2642 bool raised_max_event = false; 2643 unsigned long pflags; 2644 2645 retry: 2646 if (consume_stock(memcg, nr_pages)) 2647 return 0; 2648 2649 if (!do_memsw_account() || 2650 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2651 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2652 goto done_restock; 2653 if (do_memsw_account()) 2654 page_counter_uncharge(&memcg->memsw, batch); 2655 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2656 } else { 2657 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2658 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; 2659 } 2660 2661 if (batch > nr_pages) { 2662 batch = nr_pages; 2663 goto retry; 2664 } 2665 2666 /* 2667 * Prevent unbounded recursion when reclaim operations need to 2668 * allocate memory. This might exceed the limits temporarily, 2669 * but we prefer facilitating memory reclaim and getting back 2670 * under the limit over triggering OOM kills in these cases. 2671 */ 2672 if (unlikely(current->flags & PF_MEMALLOC)) 2673 goto force; 2674 2675 if (unlikely(task_in_memcg_oom(current))) 2676 goto nomem; 2677 2678 if (!gfpflags_allow_blocking(gfp_mask)) 2679 goto nomem; 2680 2681 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2682 raised_max_event = true; 2683 2684 psi_memstall_enter(&pflags); 2685 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2686 gfp_mask, reclaim_options); 2687 psi_memstall_leave(&pflags); 2688 2689 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2690 goto retry; 2691 2692 if (!drained) { 2693 drain_all_stock(mem_over_limit); 2694 drained = true; 2695 goto retry; 2696 } 2697 2698 if (gfp_mask & __GFP_NORETRY) 2699 goto nomem; 2700 /* 2701 * Even though the limit is exceeded at this point, reclaim 2702 * may have been able to free some pages. Retry the charge 2703 * before killing the task. 2704 * 2705 * Only for regular pages, though: huge pages are rather 2706 * unlikely to succeed so close to the limit, and we fall back 2707 * to regular pages anyway in case of failure. 2708 */ 2709 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2710 goto retry; 2711 /* 2712 * At task move, charge accounts can be doubly counted. So, it's 2713 * better to wait until the end of task_move if something is going on. 2714 */ 2715 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2716 goto retry; 2717 2718 if (nr_retries--) 2719 goto retry; 2720 2721 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2722 goto nomem; 2723 2724 /* Avoid endless loop for tasks bypassed by the oom killer */ 2725 if (passed_oom && task_is_dying()) 2726 goto nomem; 2727 2728 /* 2729 * keep retrying as long as the memcg oom killer is able to make 2730 * a forward progress or bypass the charge if the oom killer 2731 * couldn't make any progress. 
2732 */ 2733 if (mem_cgroup_oom(mem_over_limit, gfp_mask, 2734 get_order(nr_pages * PAGE_SIZE))) { 2735 passed_oom = true; 2736 nr_retries = MAX_RECLAIM_RETRIES; 2737 goto retry; 2738 } 2739 nomem: 2740 /* 2741 * Memcg doesn't have a dedicated reserve for atomic 2742 * allocations. But like the global atomic pool, we need to 2743 * put the burden of reclaim on regular allocation requests 2744 * and let these go through as privileged allocations. 2745 */ 2746 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 2747 return -ENOMEM; 2748 force: 2749 /* 2750 * If the allocation has to be enforced, don't forget to raise 2751 * a MEMCG_MAX event. 2752 */ 2753 if (!raised_max_event) 2754 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2755 2756 /* 2757 * The allocation either can't fail or will lead to more memory 2758 * being freed very soon. Allow memory usage go over the limit 2759 * temporarily by force charging it. 2760 */ 2761 page_counter_charge(&memcg->memory, nr_pages); 2762 if (do_memsw_account()) 2763 page_counter_charge(&memcg->memsw, nr_pages); 2764 2765 return 0; 2766 2767 done_restock: 2768 if (batch > nr_pages) 2769 refill_stock(memcg, batch - nr_pages); 2770 2771 /* 2772 * If the hierarchy is above the normal consumption range, schedule 2773 * reclaim on returning to userland. We can perform reclaim here 2774 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2775 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2776 * not recorded as it most likely matches current's and won't 2777 * change in the meantime. As high limit is checked again before 2778 * reclaim, the cost of mismatch is negligible. 2779 */ 2780 do { 2781 bool mem_high, swap_high; 2782 2783 mem_high = page_counter_read(&memcg->memory) > 2784 READ_ONCE(memcg->memory.high); 2785 swap_high = page_counter_read(&memcg->swap) > 2786 READ_ONCE(memcg->swap.high); 2787 2788 /* Don't bother a random interrupted task */ 2789 if (!in_task()) { 2790 if (mem_high) { 2791 schedule_work(&memcg->high_work); 2792 break; 2793 } 2794 continue; 2795 } 2796 2797 if (mem_high || swap_high) { 2798 /* 2799 * The allocating tasks in this cgroup will need to do 2800 * reclaim or be throttled to prevent further growth 2801 * of the memory or swap footprints. 2802 * 2803 * Target some best-effort fairness between the tasks, 2804 * and distribute reclaim work and delay penalties 2805 * based on how much each task is actually allocating. 
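			 *
			 * Mechanically (a descriptive note, not a behavioural
			 * change): the batch is added to
			 * current->memcg_nr_pages_over_high and
			 * set_notify_resume() arranges for
			 * mem_cgroup_handle_over_high() to run on the way
			 * back to userspace, where sleeping is safe.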
2806 */ 2807 current->memcg_nr_pages_over_high += batch; 2808 set_notify_resume(current); 2809 break; 2810 } 2811 } while ((memcg = parent_mem_cgroup(memcg))); 2812 2813 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 2814 !(current->flags & PF_MEMALLOC) && 2815 gfpflags_allow_blocking(gfp_mask)) { 2816 mem_cgroup_handle_over_high(); 2817 } 2818 return 0; 2819 } 2820 2821 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2822 unsigned int nr_pages) 2823 { 2824 if (mem_cgroup_is_root(memcg)) 2825 return 0; 2826 2827 return try_charge_memcg(memcg, gfp_mask, nr_pages); 2828 } 2829 2830 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2831 { 2832 if (mem_cgroup_is_root(memcg)) 2833 return; 2834 2835 page_counter_uncharge(&memcg->memory, nr_pages); 2836 if (do_memsw_account()) 2837 page_counter_uncharge(&memcg->memsw, nr_pages); 2838 } 2839 2840 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) 2841 { 2842 VM_BUG_ON_FOLIO(folio_memcg(folio), folio); 2843 /* 2844 * Any of the following ensures page's memcg stability: 2845 * 2846 * - the page lock 2847 * - LRU isolation 2848 * - lock_page_memcg() 2849 * - exclusive reference 2850 * - mem_cgroup_trylock_pages() 2851 */ 2852 folio->memcg_data = (unsigned long)memcg; 2853 } 2854 2855 #ifdef CONFIG_MEMCG_KMEM 2856 /* 2857 * The allocated objcg pointers array is not accounted directly. 2858 * Moreover, it should not come from DMA buffer and is not readily 2859 * reclaimable. So those GFP bits should be masked off. 2860 */ 2861 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) 2862 2863 /* 2864 * mod_objcg_mlstate() may be called with irq enabled, so 2865 * mod_memcg_lruvec_state() should be used. 2866 */ 2867 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 2868 struct pglist_data *pgdat, 2869 enum node_stat_item idx, int nr) 2870 { 2871 struct mem_cgroup *memcg; 2872 struct lruvec *lruvec; 2873 2874 rcu_read_lock(); 2875 memcg = obj_cgroup_memcg(objcg); 2876 lruvec = mem_cgroup_lruvec(memcg, pgdat); 2877 mod_memcg_lruvec_state(lruvec, idx, nr); 2878 rcu_read_unlock(); 2879 } 2880 2881 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, 2882 gfp_t gfp, bool new_slab) 2883 { 2884 unsigned int objects = objs_per_slab(s, slab); 2885 unsigned long memcg_data; 2886 void *vec; 2887 2888 gfp &= ~OBJCGS_CLEAR_MASK; 2889 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2890 slab_nid(slab)); 2891 if (!vec) 2892 return -ENOMEM; 2893 2894 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2895 if (new_slab) { 2896 /* 2897 * If the slab is brand new and nobody can yet access its 2898 * memcg_data, no synchronization is required and memcg_data can 2899 * be simply assigned. 2900 */ 2901 slab->memcg_data = memcg_data; 2902 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { 2903 /* 2904 * If the slab is already in use, somebody can allocate and 2905 * assign obj_cgroups in parallel. In this case the existing 2906 * objcg vector should be reused. 2907 */ 2908 kfree(vec); 2909 return 0; 2910 } 2911 2912 kmemleak_not_leak(vec); 2913 return 0; 2914 } 2915 2916 static __always_inline 2917 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2918 { 2919 /* 2920 * Slab objects are accounted individually, not per-page. 2921 * Memcg membership data for each individual object is saved in 2922 * slab->memcg_data. 
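	 *
	 * Concretely (a descriptive note): for a slab folio, slab->memcg_data
	 * points to an array with one obj_cgroup pointer per object, and
	 * obj_to_index() below picks the slot covering @p.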
 */
	if (folio_test_slab(folio)) {
		struct obj_cgroup **objcgs;
		struct slab *slab;
		unsigned int off;

		slab = folio_slab(folio);
		objcgs = slab_objcgs(slab);
		if (!objcgs)
			return NULL;

		off = obj_to_index(slab->slab_cache, slab, p);
		if (objcgs[off])
			return obj_cgroup_memcg(objcgs[off]);

		return NULL;
	}

	/*
	 * page_memcg_check() is used here, because in theory we can encounter
	 * a folio where the slab flag has been cleared already, but
	 * slab->memcg_data has not been freed yet. page_memcg_check() will
	 * guarantee that a proper memory cgroup pointer or NULL will be
	 * returned.
	 */
	return page_memcg_check(folio_page(folio, 0));
}

/*
 * Returns a pointer to the memory cgroup to which the kernel object is charged.
 *
 * A passed kernel object can be a slab object, vmalloc object or a generic
 * kernel page, so different mechanisms for getting the memory cgroup pointer
 * should be used.
 *
 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
 * cannot know for sure how the kernel object is implemented.
 * mem_cgroup_from_obj() can be safely used in such cases.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
 */
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
	struct folio *folio;

	if (mem_cgroup_disabled())
		return NULL;

	if (unlikely(is_vmalloc_addr(p)))
		folio = page_folio(vmalloc_to_page(p));
	else
		folio = virt_to_folio(p);

	return mem_cgroup_from_obj_folio(folio, p);
}

/*
 * Returns a pointer to the memory cgroup to which the kernel object is charged.
 * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
 * allocated using vmalloc().
 *
 * A passed kernel object must be a slab object or a generic kernel page.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
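 *
 * A minimal caller sketch (illustrative only; "obj" and "do_something()"
 * are assumed names):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_slab_obj(obj);
 *	if (memcg)
 *		do_something(memcg);	// still under rcu_read_lock()
 *	rcu_read_unlock();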
2989 */ 2990 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 2991 { 2992 if (mem_cgroup_disabled()) 2993 return NULL; 2994 2995 return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 2996 } 2997 2998 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) 2999 { 3000 struct obj_cgroup *objcg = NULL; 3001 3002 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 3003 objcg = rcu_dereference(memcg->objcg); 3004 if (objcg && obj_cgroup_tryget(objcg)) 3005 break; 3006 objcg = NULL; 3007 } 3008 return objcg; 3009 } 3010 3011 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 3012 { 3013 struct obj_cgroup *objcg = NULL; 3014 struct mem_cgroup *memcg; 3015 3016 if (memcg_kmem_bypass()) 3017 return NULL; 3018 3019 rcu_read_lock(); 3020 if (unlikely(active_memcg())) 3021 memcg = active_memcg(); 3022 else 3023 memcg = mem_cgroup_from_task(current); 3024 objcg = __get_obj_cgroup_from_memcg(memcg); 3025 rcu_read_unlock(); 3026 return objcg; 3027 } 3028 3029 struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) 3030 { 3031 struct obj_cgroup *objcg; 3032 3033 if (!memcg_kmem_enabled()) 3034 return NULL; 3035 3036 if (PageMemcgKmem(page)) { 3037 objcg = __folio_objcg(page_folio(page)); 3038 obj_cgroup_get(objcg); 3039 } else { 3040 struct mem_cgroup *memcg; 3041 3042 rcu_read_lock(); 3043 memcg = __folio_memcg(page_folio(page)); 3044 if (memcg) 3045 objcg = __get_obj_cgroup_from_memcg(memcg); 3046 else 3047 objcg = NULL; 3048 rcu_read_unlock(); 3049 } 3050 return objcg; 3051 } 3052 3053 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 3054 { 3055 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); 3056 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 3057 if (nr_pages > 0) 3058 page_counter_charge(&memcg->kmem, nr_pages); 3059 else 3060 page_counter_uncharge(&memcg->kmem, -nr_pages); 3061 } 3062 } 3063 3064 3065 /* 3066 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 3067 * @objcg: object cgroup to uncharge 3068 * @nr_pages: number of pages to uncharge 3069 */ 3070 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 3071 unsigned int nr_pages) 3072 { 3073 struct mem_cgroup *memcg; 3074 3075 memcg = get_mem_cgroup_from_objcg(objcg); 3076 3077 memcg_account_kmem(memcg, -nr_pages); 3078 refill_stock(memcg, nr_pages); 3079 3080 css_put(&memcg->css); 3081 } 3082 3083 /* 3084 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 3085 * @objcg: object cgroup to charge 3086 * @gfp: reclaim mode 3087 * @nr_pages: number of pages to charge 3088 * 3089 * Returns 0 on success, an error code on failure. 3090 */ 3091 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 3092 unsigned int nr_pages) 3093 { 3094 struct mem_cgroup *memcg; 3095 int ret; 3096 3097 memcg = get_mem_cgroup_from_objcg(objcg); 3098 3099 ret = try_charge_memcg(memcg, gfp, nr_pages); 3100 if (ret) 3101 goto out; 3102 3103 memcg_account_kmem(memcg, nr_pages); 3104 out: 3105 css_put(&memcg->css); 3106 3107 return ret; 3108 } 3109 3110 /** 3111 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3112 * @page: page to charge 3113 * @gfp: reclaim mode 3114 * @order: allocation order 3115 * 3116 * Returns 0 on success, an error code on failure. 
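 *
 * Note (descriptive only): this is normally reached via the
 * memcg_kmem_charge_page() wrapper, i.e. for page allocations that pass
 * __GFP_ACCOUNT while kmem accounting is enabled; other allocations never
 * enter this path.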
3117 */ 3118 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3119 { 3120 struct obj_cgroup *objcg; 3121 int ret = 0; 3122 3123 objcg = get_obj_cgroup_from_current(); 3124 if (objcg) { 3125 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 3126 if (!ret) { 3127 page->memcg_data = (unsigned long)objcg | 3128 MEMCG_DATA_KMEM; 3129 return 0; 3130 } 3131 obj_cgroup_put(objcg); 3132 } 3133 return ret; 3134 } 3135 3136 /** 3137 * __memcg_kmem_uncharge_page: uncharge a kmem page 3138 * @page: page to uncharge 3139 * @order: allocation order 3140 */ 3141 void __memcg_kmem_uncharge_page(struct page *page, int order) 3142 { 3143 struct folio *folio = page_folio(page); 3144 struct obj_cgroup *objcg; 3145 unsigned int nr_pages = 1 << order; 3146 3147 if (!folio_memcg_kmem(folio)) 3148 return; 3149 3150 objcg = __folio_objcg(folio); 3151 obj_cgroup_uncharge_pages(objcg, nr_pages); 3152 folio->memcg_data = 0; 3153 obj_cgroup_put(objcg); 3154 } 3155 3156 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 3157 enum node_stat_item idx, int nr) 3158 { 3159 struct memcg_stock_pcp *stock; 3160 struct obj_cgroup *old = NULL; 3161 unsigned long flags; 3162 int *bytes; 3163 3164 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3165 stock = this_cpu_ptr(&memcg_stock); 3166 3167 /* 3168 * Save vmstat data in stock and skip vmstat array update unless 3169 * accumulating over a page of vmstat data or when pgdat or idx 3170 * changes. 3171 */ 3172 if (stock->cached_objcg != objcg) { 3173 old = drain_obj_stock(stock); 3174 obj_cgroup_get(objcg); 3175 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3176 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3177 stock->cached_objcg = objcg; 3178 stock->cached_pgdat = pgdat; 3179 } else if (stock->cached_pgdat != pgdat) { 3180 /* Flush the existing cached vmstat data */ 3181 struct pglist_data *oldpg = stock->cached_pgdat; 3182 3183 if (stock->nr_slab_reclaimable_b) { 3184 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 3185 stock->nr_slab_reclaimable_b); 3186 stock->nr_slab_reclaimable_b = 0; 3187 } 3188 if (stock->nr_slab_unreclaimable_b) { 3189 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 3190 stock->nr_slab_unreclaimable_b); 3191 stock->nr_slab_unreclaimable_b = 0; 3192 } 3193 stock->cached_pgdat = pgdat; 3194 } 3195 3196 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 3197 : &stock->nr_slab_unreclaimable_b; 3198 /* 3199 * Even for large object >= PAGE_SIZE, the vmstat data will still be 3200 * cached locally at least once before pushing it out. 
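	 *
	 * For example (illustrative numbers): a stream of +512 byte
	 * NR_SLAB_UNRECLAIMABLE_B updates only accumulates in *bytes; once
	 * the running total exceeds PAGE_SIZE, the whole amount is pushed
	 * out via mod_objcg_mlstate() below and *bytes starts again from 0.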
3201 */ 3202 if (!*bytes) { 3203 *bytes = nr; 3204 nr = 0; 3205 } else { 3206 *bytes += nr; 3207 if (abs(*bytes) > PAGE_SIZE) { 3208 nr = *bytes; 3209 *bytes = 0; 3210 } else { 3211 nr = 0; 3212 } 3213 } 3214 if (nr) 3215 mod_objcg_mlstate(objcg, pgdat, idx, nr); 3216 3217 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3218 if (old) 3219 obj_cgroup_put(old); 3220 } 3221 3222 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3223 { 3224 struct memcg_stock_pcp *stock; 3225 unsigned long flags; 3226 bool ret = false; 3227 3228 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3229 3230 stock = this_cpu_ptr(&memcg_stock); 3231 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3232 stock->nr_bytes -= nr_bytes; 3233 ret = true; 3234 } 3235 3236 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3237 3238 return ret; 3239 } 3240 3241 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 3242 { 3243 struct obj_cgroup *old = stock->cached_objcg; 3244 3245 if (!old) 3246 return NULL; 3247 3248 if (stock->nr_bytes) { 3249 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3250 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3251 3252 if (nr_pages) { 3253 struct mem_cgroup *memcg; 3254 3255 memcg = get_mem_cgroup_from_objcg(old); 3256 3257 memcg_account_kmem(memcg, -nr_pages); 3258 __refill_stock(memcg, nr_pages); 3259 3260 css_put(&memcg->css); 3261 } 3262 3263 /* 3264 * The leftover is flushed to the centralized per-memcg value. 3265 * On the next attempt to refill obj stock it will be moved 3266 * to a per-cpu stock (probably, on an other CPU), see 3267 * refill_obj_stock(). 3268 * 3269 * How often it's flushed is a trade-off between the memory 3270 * limit enforcement accuracy and potential CPU contention, 3271 * so it might be changed in the future. 3272 */ 3273 atomic_add(nr_bytes, &old->nr_charged_bytes); 3274 stock->nr_bytes = 0; 3275 } 3276 3277 /* 3278 * Flush the vmstat data in current stock 3279 */ 3280 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 3281 if (stock->nr_slab_reclaimable_b) { 3282 mod_objcg_mlstate(old, stock->cached_pgdat, 3283 NR_SLAB_RECLAIMABLE_B, 3284 stock->nr_slab_reclaimable_b); 3285 stock->nr_slab_reclaimable_b = 0; 3286 } 3287 if (stock->nr_slab_unreclaimable_b) { 3288 mod_objcg_mlstate(old, stock->cached_pgdat, 3289 NR_SLAB_UNRECLAIMABLE_B, 3290 stock->nr_slab_unreclaimable_b); 3291 stock->nr_slab_unreclaimable_b = 0; 3292 } 3293 stock->cached_pgdat = NULL; 3294 } 3295 3296 stock->cached_objcg = NULL; 3297 /* 3298 * The `old' objects needs to be released by the caller via 3299 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
3300 */ 3301 return old; 3302 } 3303 3304 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3305 struct mem_cgroup *root_memcg) 3306 { 3307 struct mem_cgroup *memcg; 3308 3309 if (stock->cached_objcg) { 3310 memcg = obj_cgroup_memcg(stock->cached_objcg); 3311 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3312 return true; 3313 } 3314 3315 return false; 3316 } 3317 3318 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3319 bool allow_uncharge) 3320 { 3321 struct memcg_stock_pcp *stock; 3322 struct obj_cgroup *old = NULL; 3323 unsigned long flags; 3324 unsigned int nr_pages = 0; 3325 3326 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3327 3328 stock = this_cpu_ptr(&memcg_stock); 3329 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3330 old = drain_obj_stock(stock); 3331 obj_cgroup_get(objcg); 3332 stock->cached_objcg = objcg; 3333 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3334 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3335 allow_uncharge = true; /* Allow uncharge when objcg changes */ 3336 } 3337 stock->nr_bytes += nr_bytes; 3338 3339 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 3340 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3341 stock->nr_bytes &= (PAGE_SIZE - 1); 3342 } 3343 3344 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3345 if (old) 3346 obj_cgroup_put(old); 3347 3348 if (nr_pages) 3349 obj_cgroup_uncharge_pages(objcg, nr_pages); 3350 } 3351 3352 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3353 { 3354 unsigned int nr_pages, nr_bytes; 3355 int ret; 3356 3357 if (consume_obj_stock(objcg, size)) 3358 return 0; 3359 3360 /* 3361 * In theory, objcg->nr_charged_bytes can have enough 3362 * pre-charged bytes to satisfy the allocation. However, 3363 * flushing objcg->nr_charged_bytes requires two atomic 3364 * operations, and objcg->nr_charged_bytes can't be big. 3365 * The shared objcg->nr_charged_bytes can also become a 3366 * performance bottleneck if all tasks of the same memcg are 3367 * trying to update it. So it's better to ignore it and try 3368 * grab some new pages. The stock's nr_bytes will be flushed to 3369 * objcg->nr_charged_bytes later on when objcg changes. 3370 * 3371 * The stock's nr_bytes may contain enough pre-charged bytes 3372 * to allow one less page from being charged, but we can't rely 3373 * on the pre-charged bytes not being changed outside of 3374 * consume_obj_stock() or refill_obj_stock(). So ignore those 3375 * pre-charged bytes as well when charging pages. To avoid a 3376 * page uncharge right after a page charge, we set the 3377 * allow_uncharge flag to false when calling refill_obj_stock() 3378 * to temporarily allow the pre-charged bytes to exceed the page 3379 * size limit. The maximum reachable value of the pre-charged 3380 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 3381 * race. 3382 */ 3383 nr_pages = size >> PAGE_SHIFT; 3384 nr_bytes = size & (PAGE_SIZE - 1); 3385 3386 if (nr_bytes) 3387 nr_pages += 1; 3388 3389 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3390 if (!ret && nr_bytes) 3391 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 3392 3393 return ret; 3394 } 3395 3396 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3397 { 3398 refill_obj_stock(objcg, size, true); 3399 } 3400 3401 #endif /* CONFIG_MEMCG_KMEM */ 3402 3403 /* 3404 * Because page_memcg(head) is not set on tails, set it now. 
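 *
 * In other words (a descriptive note): when a compound page is split,
 * each of the nr - 1 tail pages inherits the head's memcg_data, and a
 * matching number of references is taken on the objcg (for kmem pages)
 * or on the memcg css, so every resulting page holds its own reference.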
3405 */ 3406 void split_page_memcg(struct page *head, unsigned int nr) 3407 { 3408 struct folio *folio = page_folio(head); 3409 struct mem_cgroup *memcg = folio_memcg(folio); 3410 int i; 3411 3412 if (mem_cgroup_disabled() || !memcg) 3413 return; 3414 3415 for (i = 1; i < nr; i++) 3416 folio_page(folio, i)->memcg_data = folio->memcg_data; 3417 3418 if (folio_memcg_kmem(folio)) 3419 obj_cgroup_get_many(__folio_objcg(folio), nr - 1); 3420 else 3421 css_get_many(&memcg->css, nr - 1); 3422 } 3423 3424 #ifdef CONFIG_SWAP 3425 /** 3426 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3427 * @entry: swap entry to be moved 3428 * @from: mem_cgroup which the entry is moved from 3429 * @to: mem_cgroup which the entry is moved to 3430 * 3431 * It succeeds only when the swap_cgroup's record for this entry is the same 3432 * as the mem_cgroup's id of @from. 3433 * 3434 * Returns 0 on success, -EINVAL on failure. 3435 * 3436 * The caller must have charged to @to, IOW, called page_counter_charge() about 3437 * both res and memsw, and called css_get(). 3438 */ 3439 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3440 struct mem_cgroup *from, struct mem_cgroup *to) 3441 { 3442 unsigned short old_id, new_id; 3443 3444 old_id = mem_cgroup_id(from); 3445 new_id = mem_cgroup_id(to); 3446 3447 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3448 mod_memcg_state(from, MEMCG_SWAP, -1); 3449 mod_memcg_state(to, MEMCG_SWAP, 1); 3450 return 0; 3451 } 3452 return -EINVAL; 3453 } 3454 #else 3455 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3456 struct mem_cgroup *from, struct mem_cgroup *to) 3457 { 3458 return -EINVAL; 3459 } 3460 #endif 3461 3462 static DEFINE_MUTEX(memcg_max_mutex); 3463 3464 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3465 unsigned long max, bool memsw) 3466 { 3467 bool enlarge = false; 3468 bool drained = false; 3469 int ret; 3470 bool limits_invariant; 3471 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3472 3473 do { 3474 if (signal_pending(current)) { 3475 ret = -EINTR; 3476 break; 3477 } 3478 3479 mutex_lock(&memcg_max_mutex); 3480 /* 3481 * Make sure that the new limit (memsw or memory limit) doesn't 3482 * break our basic invariant rule memory.max <= memsw.max. 3483 */ 3484 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3485 max <= memcg->memsw.max; 3486 if (!limits_invariant) { 3487 mutex_unlock(&memcg_max_mutex); 3488 ret = -EINVAL; 3489 break; 3490 } 3491 if (max > counter->max) 3492 enlarge = true; 3493 ret = page_counter_set_max(counter, max); 3494 mutex_unlock(&memcg_max_mutex); 3495 3496 if (!ret) 3497 break; 3498 3499 if (!drained) { 3500 drain_all_stock(memcg); 3501 drained = true; 3502 continue; 3503 } 3504 3505 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3506 memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { 3507 ret = -EBUSY; 3508 break; 3509 } 3510 } while (true); 3511 3512 if (!ret && enlarge) 3513 memcg_oom_recover(memcg); 3514 3515 return ret; 3516 } 3517 3518 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3519 gfp_t gfp_mask, 3520 unsigned long *total_scanned) 3521 { 3522 unsigned long nr_reclaimed = 0; 3523 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3524 unsigned long reclaimed; 3525 int loop = 0; 3526 struct mem_cgroup_tree_per_node *mctz; 3527 unsigned long excess; 3528 3529 if (order > 0) 3530 return 0; 3531 3532 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 3533 3534 /* 3535 * Do not even bother to check the largest node if the root 3536 * is empty. Do it lockless to prevent lock bouncing. Races 3537 * are acceptable as soft limit is best effort anyway. 3538 */ 3539 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3540 return 0; 3541 3542 /* 3543 * This loop can run a while, specially if mem_cgroup's continuously 3544 * keep exceeding their soft limit and putting the system under 3545 * pressure 3546 */ 3547 do { 3548 if (next_mz) 3549 mz = next_mz; 3550 else 3551 mz = mem_cgroup_largest_soft_limit_node(mctz); 3552 if (!mz) 3553 break; 3554 3555 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3556 gfp_mask, total_scanned); 3557 nr_reclaimed += reclaimed; 3558 spin_lock_irq(&mctz->lock); 3559 3560 /* 3561 * If we failed to reclaim anything from this memory cgroup 3562 * it is time to move on to the next cgroup 3563 */ 3564 next_mz = NULL; 3565 if (!reclaimed) 3566 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3567 3568 excess = soft_limit_excess(mz->memcg); 3569 /* 3570 * One school of thought says that we should not add 3571 * back the node to the tree if reclaim returns 0. 3572 * But our reclaim could return 0, simply because due 3573 * to priority we are exposing a smaller subset of 3574 * memory to reclaim from. Consider this as a longer 3575 * term TODO. 3576 */ 3577 /* If excess == 0, no tree ops */ 3578 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3579 spin_unlock_irq(&mctz->lock); 3580 css_put(&mz->memcg->css); 3581 loop++; 3582 /* 3583 * Could not reclaim anything and there are no more 3584 * mem cgroups to try or we seem to be looping without 3585 * reclaiming anything. 3586 */ 3587 if (!nr_reclaimed && 3588 (next_mz == NULL || 3589 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3590 break; 3591 } while (!nr_reclaimed); 3592 if (next_mz) 3593 css_put(&next_mz->memcg->css); 3594 return nr_reclaimed; 3595 } 3596 3597 /* 3598 * Reclaims as many pages from the given memcg as possible. 3599 * 3600 * Caller is responsible for holding css reference for memcg. 
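 *
 * This backs the cgroup1 "memory.force_empty" control file; writes to it
 * end up in mem_cgroup_force_empty_write() below.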
3601 */ 3602 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3603 { 3604 int nr_retries = MAX_RECLAIM_RETRIES; 3605 3606 /* we call try-to-free pages for make this cgroup empty */ 3607 lru_add_drain_all(); 3608 3609 drain_all_stock(memcg); 3610 3611 /* try to free all pages in this cgroup */ 3612 while (nr_retries && page_counter_read(&memcg->memory)) { 3613 if (signal_pending(current)) 3614 return -EINTR; 3615 3616 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3617 MEMCG_RECLAIM_MAY_SWAP)) 3618 nr_retries--; 3619 } 3620 3621 return 0; 3622 } 3623 3624 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3625 char *buf, size_t nbytes, 3626 loff_t off) 3627 { 3628 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3629 3630 if (mem_cgroup_is_root(memcg)) 3631 return -EINVAL; 3632 return mem_cgroup_force_empty(memcg) ?: nbytes; 3633 } 3634 3635 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3636 struct cftype *cft) 3637 { 3638 return 1; 3639 } 3640 3641 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3642 struct cftype *cft, u64 val) 3643 { 3644 if (val == 1) 3645 return 0; 3646 3647 pr_warn_once("Non-hierarchical mode is deprecated. " 3648 "Please report your usecase to linux-mm@kvack.org if you " 3649 "depend on this functionality.\n"); 3650 3651 return -EINVAL; 3652 } 3653 3654 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3655 { 3656 unsigned long val; 3657 3658 if (mem_cgroup_is_root(memcg)) { 3659 mem_cgroup_flush_stats(); 3660 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3661 memcg_page_state(memcg, NR_ANON_MAPPED); 3662 if (swap) 3663 val += memcg_page_state(memcg, MEMCG_SWAP); 3664 } else { 3665 if (!swap) 3666 val = page_counter_read(&memcg->memory); 3667 else 3668 val = page_counter_read(&memcg->memsw); 3669 } 3670 return val; 3671 } 3672 3673 enum { 3674 RES_USAGE, 3675 RES_LIMIT, 3676 RES_MAX_USAGE, 3677 RES_FAILCNT, 3678 RES_SOFT_LIMIT, 3679 }; 3680 3681 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3682 struct cftype *cft) 3683 { 3684 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3685 struct page_counter *counter; 3686 3687 switch (MEMFILE_TYPE(cft->private)) { 3688 case _MEM: 3689 counter = &memcg->memory; 3690 break; 3691 case _MEMSWAP: 3692 counter = &memcg->memsw; 3693 break; 3694 case _KMEM: 3695 counter = &memcg->kmem; 3696 break; 3697 case _TCP: 3698 counter = &memcg->tcpmem; 3699 break; 3700 default: 3701 BUG(); 3702 } 3703 3704 switch (MEMFILE_ATTR(cft->private)) { 3705 case RES_USAGE: 3706 if (counter == &memcg->memory) 3707 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3708 if (counter == &memcg->memsw) 3709 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3710 return (u64)page_counter_read(counter) * PAGE_SIZE; 3711 case RES_LIMIT: 3712 return (u64)counter->max * PAGE_SIZE; 3713 case RES_MAX_USAGE: 3714 return (u64)counter->watermark * PAGE_SIZE; 3715 case RES_FAILCNT: 3716 return counter->failcnt; 3717 case RES_SOFT_LIMIT: 3718 return (u64)memcg->soft_limit * PAGE_SIZE; 3719 default: 3720 BUG(); 3721 } 3722 } 3723 3724 #ifdef CONFIG_MEMCG_KMEM 3725 static int memcg_online_kmem(struct mem_cgroup *memcg) 3726 { 3727 struct obj_cgroup *objcg; 3728 3729 if (mem_cgroup_kmem_disabled()) 3730 return 0; 3731 3732 if (unlikely(mem_cgroup_is_root(memcg))) 3733 return 0; 3734 3735 objcg = obj_cgroup_alloc(); 3736 if (!objcg) 3737 return -ENOMEM; 3738 3739 objcg->memcg = memcg; 3740 
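	/*
	 * Publish the objcg only after ->memcg has been set: the
	 * rcu_assign_pointer() below orders the initialisation against the
	 * store, so lockless readers of memcg->objcg (e.g.
	 * __get_obj_cgroup_from_memcg()) never see a half-initialised objcg.
	 */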
rcu_assign_pointer(memcg->objcg, objcg); 3741 3742 static_branch_enable(&memcg_kmem_enabled_key); 3743 3744 memcg->kmemcg_id = memcg->id.id; 3745 3746 return 0; 3747 } 3748 3749 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3750 { 3751 struct mem_cgroup *parent; 3752 3753 if (mem_cgroup_kmem_disabled()) 3754 return; 3755 3756 if (unlikely(mem_cgroup_is_root(memcg))) 3757 return; 3758 3759 parent = parent_mem_cgroup(memcg); 3760 if (!parent) 3761 parent = root_mem_cgroup; 3762 3763 memcg_reparent_objcgs(memcg, parent); 3764 3765 /* 3766 * After we have finished memcg_reparent_objcgs(), all list_lrus 3767 * corresponding to this cgroup are guaranteed to remain empty. 3768 * The ordering is imposed by list_lru_node->lock taken by 3769 * memcg_reparent_list_lrus(). 3770 */ 3771 memcg_reparent_list_lrus(memcg, parent); 3772 } 3773 #else 3774 static int memcg_online_kmem(struct mem_cgroup *memcg) 3775 { 3776 return 0; 3777 } 3778 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3779 { 3780 } 3781 #endif /* CONFIG_MEMCG_KMEM */ 3782 3783 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3784 { 3785 int ret; 3786 3787 mutex_lock(&memcg_max_mutex); 3788 3789 ret = page_counter_set_max(&memcg->tcpmem, max); 3790 if (ret) 3791 goto out; 3792 3793 if (!memcg->tcpmem_active) { 3794 /* 3795 * The active flag needs to be written after the static_key 3796 * update. This is what guarantees that the socket activation 3797 * function is the last one to run. See mem_cgroup_sk_alloc() 3798 * for details, and note that we don't mark any socket as 3799 * belonging to this memcg until that flag is up. 3800 * 3801 * We need to do this, because static_keys will span multiple 3802 * sites, but we can't control their order. If we mark a socket 3803 * as accounted, but the accounting functions are not patched in 3804 * yet, we'll lose accounting. 3805 * 3806 * We never race with the readers in mem_cgroup_sk_alloc(), 3807 * because when this value change, the code to process it is not 3808 * patched in yet. 3809 */ 3810 static_branch_inc(&memcg_sockets_enabled_key); 3811 memcg->tcpmem_active = true; 3812 } 3813 out: 3814 mutex_unlock(&memcg_max_mutex); 3815 return ret; 3816 } 3817 3818 /* 3819 * The user of this function is... 3820 * RES_LIMIT. 3821 */ 3822 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3823 char *buf, size_t nbytes, loff_t off) 3824 { 3825 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3826 unsigned long nr_pages; 3827 int ret; 3828 3829 buf = strstrip(buf); 3830 ret = page_counter_memparse(buf, "-1", &nr_pages); 3831 if (ret) 3832 return ret; 3833 3834 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3835 case RES_LIMIT: 3836 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3837 ret = -EINVAL; 3838 break; 3839 } 3840 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3841 case _MEM: 3842 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3843 break; 3844 case _MEMSWAP: 3845 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3846 break; 3847 case _KMEM: 3848 /* kmem.limit_in_bytes is deprecated. 
*/ 3849 ret = -EOPNOTSUPP; 3850 break; 3851 case _TCP: 3852 ret = memcg_update_tcp_max(memcg, nr_pages); 3853 break; 3854 } 3855 break; 3856 case RES_SOFT_LIMIT: 3857 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 3858 ret = -EOPNOTSUPP; 3859 } else { 3860 memcg->soft_limit = nr_pages; 3861 ret = 0; 3862 } 3863 break; 3864 } 3865 return ret ?: nbytes; 3866 } 3867 3868 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3869 size_t nbytes, loff_t off) 3870 { 3871 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3872 struct page_counter *counter; 3873 3874 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3875 case _MEM: 3876 counter = &memcg->memory; 3877 break; 3878 case _MEMSWAP: 3879 counter = &memcg->memsw; 3880 break; 3881 case _KMEM: 3882 counter = &memcg->kmem; 3883 break; 3884 case _TCP: 3885 counter = &memcg->tcpmem; 3886 break; 3887 default: 3888 BUG(); 3889 } 3890 3891 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3892 case RES_MAX_USAGE: 3893 page_counter_reset_watermark(counter); 3894 break; 3895 case RES_FAILCNT: 3896 counter->failcnt = 0; 3897 break; 3898 default: 3899 BUG(); 3900 } 3901 3902 return nbytes; 3903 } 3904 3905 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3906 struct cftype *cft) 3907 { 3908 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3909 } 3910 3911 #ifdef CONFIG_MMU 3912 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3913 struct cftype *cft, u64 val) 3914 { 3915 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3916 3917 if (val & ~MOVE_MASK) 3918 return -EINVAL; 3919 3920 /* 3921 * No kind of locking is needed in here, because ->can_attach() will 3922 * check this value once in the beginning of the process, and then carry 3923 * on with stale data. This means that changes to this value will only 3924 * affect task migrations starting after the change. 
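	 *
	 * For reference (descriptive only): @val is a bitmask of MOVE_ANON
	 * and MOVE_FILE, so e.g. writing 3 to the cgroup1 file
	 * memory.move_charge_at_immigrate asks for both anonymous and file
	 * charges to follow a task that is migrated into this memcg.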
3925 */ 3926 memcg->move_charge_at_immigrate = val; 3927 return 0; 3928 } 3929 #else 3930 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3931 struct cftype *cft, u64 val) 3932 { 3933 return -ENOSYS; 3934 } 3935 #endif 3936 3937 #ifdef CONFIG_NUMA 3938 3939 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3940 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3941 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3942 3943 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3944 int nid, unsigned int lru_mask, bool tree) 3945 { 3946 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3947 unsigned long nr = 0; 3948 enum lru_list lru; 3949 3950 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3951 3952 for_each_lru(lru) { 3953 if (!(BIT(lru) & lru_mask)) 3954 continue; 3955 if (tree) 3956 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3957 else 3958 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3959 } 3960 return nr; 3961 } 3962 3963 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3964 unsigned int lru_mask, 3965 bool tree) 3966 { 3967 unsigned long nr = 0; 3968 enum lru_list lru; 3969 3970 for_each_lru(lru) { 3971 if (!(BIT(lru) & lru_mask)) 3972 continue; 3973 if (tree) 3974 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3975 else 3976 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3977 } 3978 return nr; 3979 } 3980 3981 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3982 { 3983 struct numa_stat { 3984 const char *name; 3985 unsigned int lru_mask; 3986 }; 3987 3988 static const struct numa_stat stats[] = { 3989 { "total", LRU_ALL }, 3990 { "file", LRU_ALL_FILE }, 3991 { "anon", LRU_ALL_ANON }, 3992 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3993 }; 3994 const struct numa_stat *stat; 3995 int nid; 3996 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3997 3998 mem_cgroup_flush_stats(); 3999 4000 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4001 seq_printf(m, "%s=%lu", stat->name, 4002 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4003 false)); 4004 for_each_node_state(nid, N_MEMORY) 4005 seq_printf(m, " N%d=%lu", nid, 4006 mem_cgroup_node_nr_lru_pages(memcg, nid, 4007 stat->lru_mask, false)); 4008 seq_putc(m, '\n'); 4009 } 4010 4011 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4012 4013 seq_printf(m, "hierarchical_%s=%lu", stat->name, 4014 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4015 true)); 4016 for_each_node_state(nid, N_MEMORY) 4017 seq_printf(m, " N%d=%lu", nid, 4018 mem_cgroup_node_nr_lru_pages(memcg, nid, 4019 stat->lru_mask, true)); 4020 seq_putc(m, '\n'); 4021 } 4022 4023 return 0; 4024 } 4025 #endif /* CONFIG_NUMA */ 4026 4027 static const unsigned int memcg1_stats[] = { 4028 NR_FILE_PAGES, 4029 NR_ANON_MAPPED, 4030 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4031 NR_ANON_THPS, 4032 #endif 4033 NR_SHMEM, 4034 NR_FILE_MAPPED, 4035 NR_FILE_DIRTY, 4036 NR_WRITEBACK, 4037 WORKINGSET_REFAULT_ANON, 4038 WORKINGSET_REFAULT_FILE, 4039 MEMCG_SWAP, 4040 }; 4041 4042 static const char *const memcg1_stat_names[] = { 4043 "cache", 4044 "rss", 4045 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4046 "rss_huge", 4047 #endif 4048 "shmem", 4049 "mapped_file", 4050 "dirty", 4051 "writeback", 4052 "workingset_refault_anon", 4053 "workingset_refault_file", 4054 "swap", 4055 }; 4056 4057 /* Universal VM events cgroup1 shows, original sort order */ 4058 static const unsigned int memcg1_events[] = { 4059 PGPGIN, 4060 PGPGOUT, 4061 
PGFAULT, 4062 PGMAJFAULT, 4063 }; 4064 4065 static int memcg_stat_show(struct seq_file *m, void *v) 4066 { 4067 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4068 unsigned long memory, memsw; 4069 struct mem_cgroup *mi; 4070 unsigned int i; 4071 4072 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4073 4074 mem_cgroup_flush_stats(); 4075 4076 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4077 unsigned long nr; 4078 4079 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4080 continue; 4081 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4082 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], 4083 nr * memcg_page_state_unit(memcg1_stats[i])); 4084 } 4085 4086 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4087 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 4088 memcg_events_local(memcg, memcg1_events[i])); 4089 4090 for (i = 0; i < NR_LRU_LISTS; i++) 4091 seq_printf(m, "%s %lu\n", lru_list_name(i), 4092 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4093 PAGE_SIZE); 4094 4095 /* Hierarchical information */ 4096 memory = memsw = PAGE_COUNTER_MAX; 4097 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4098 memory = min(memory, READ_ONCE(mi->memory.max)); 4099 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4100 } 4101 seq_printf(m, "hierarchical_memory_limit %llu\n", 4102 (u64)memory * PAGE_SIZE); 4103 if (do_memsw_account()) 4104 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4105 (u64)memsw * PAGE_SIZE); 4106 4107 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4108 unsigned long nr; 4109 4110 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4111 continue; 4112 nr = memcg_page_state(memcg, memcg1_stats[i]); 4113 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 4114 (u64)nr * memcg_page_state_unit(memcg1_stats[i])); 4115 } 4116 4117 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4118 seq_printf(m, "total_%s %llu\n", 4119 vm_event_name(memcg1_events[i]), 4120 (u64)memcg_events(memcg, memcg1_events[i])); 4121 4122 for (i = 0; i < NR_LRU_LISTS; i++) 4123 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4124 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4125 PAGE_SIZE); 4126 4127 #ifdef CONFIG_DEBUG_VM 4128 { 4129 pg_data_t *pgdat; 4130 struct mem_cgroup_per_node *mz; 4131 unsigned long anon_cost = 0; 4132 unsigned long file_cost = 0; 4133 4134 for_each_online_pgdat(pgdat) { 4135 mz = memcg->nodeinfo[pgdat->node_id]; 4136 4137 anon_cost += mz->lruvec.anon_cost; 4138 file_cost += mz->lruvec.file_cost; 4139 } 4140 seq_printf(m, "anon_cost %lu\n", anon_cost); 4141 seq_printf(m, "file_cost %lu\n", file_cost); 4142 } 4143 #endif 4144 4145 return 0; 4146 } 4147 4148 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4149 struct cftype *cft) 4150 { 4151 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4152 4153 return mem_cgroup_swappiness(memcg); 4154 } 4155 4156 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4157 struct cftype *cft, u64 val) 4158 { 4159 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4160 4161 if (val > 200) 4162 return -EINVAL; 4163 4164 if (!mem_cgroup_is_root(memcg)) 4165 memcg->swappiness = val; 4166 else 4167 vm_swappiness = val; 4168 4169 return 0; 4170 } 4171 4172 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4173 { 4174 struct mem_cgroup_threshold_ary *t; 4175 unsigned long usage; 4176 int i; 4177 4178 rcu_read_lock(); 4179 if (!swap) 4180 t = rcu_dereference(memcg->thresholds.primary); 4181 else 4182 t = 
rcu_dereference(memcg->memsw_thresholds.primary); 4183 4184 if (!t) 4185 goto unlock; 4186 4187 usage = mem_cgroup_usage(memcg, swap); 4188 4189 /* 4190 * current_threshold points to threshold just below or equal to usage. 4191 * If it's not true, a threshold was crossed after last 4192 * call of __mem_cgroup_threshold(). 4193 */ 4194 i = t->current_threshold; 4195 4196 /* 4197 * Iterate backward over array of thresholds starting from 4198 * current_threshold and check if a threshold is crossed. 4199 * If none of thresholds below usage is crossed, we read 4200 * only one element of the array here. 4201 */ 4202 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4203 eventfd_signal(t->entries[i].eventfd, 1); 4204 4205 /* i = current_threshold + 1 */ 4206 i++; 4207 4208 /* 4209 * Iterate forward over array of thresholds starting from 4210 * current_threshold+1 and check if a threshold is crossed. 4211 * If none of thresholds above usage is crossed, we read 4212 * only one element of the array here. 4213 */ 4214 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4215 eventfd_signal(t->entries[i].eventfd, 1); 4216 4217 /* Update current_threshold */ 4218 t->current_threshold = i - 1; 4219 unlock: 4220 rcu_read_unlock(); 4221 } 4222 4223 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4224 { 4225 while (memcg) { 4226 __mem_cgroup_threshold(memcg, false); 4227 if (do_memsw_account()) 4228 __mem_cgroup_threshold(memcg, true); 4229 4230 memcg = parent_mem_cgroup(memcg); 4231 } 4232 } 4233 4234 static int compare_thresholds(const void *a, const void *b) 4235 { 4236 const struct mem_cgroup_threshold *_a = a; 4237 const struct mem_cgroup_threshold *_b = b; 4238 4239 if (_a->threshold > _b->threshold) 4240 return 1; 4241 4242 if (_a->threshold < _b->threshold) 4243 return -1; 4244 4245 return 0; 4246 } 4247 4248 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4249 { 4250 struct mem_cgroup_eventfd_list *ev; 4251 4252 spin_lock(&memcg_oom_lock); 4253 4254 list_for_each_entry(ev, &memcg->oom_notify, list) 4255 eventfd_signal(ev->eventfd, 1); 4256 4257 spin_unlock(&memcg_oom_lock); 4258 return 0; 4259 } 4260 4261 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4262 { 4263 struct mem_cgroup *iter; 4264 4265 for_each_mem_cgroup_tree(iter, memcg) 4266 mem_cgroup_oom_notify_cb(iter); 4267 } 4268 4269 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4270 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4271 { 4272 struct mem_cgroup_thresholds *thresholds; 4273 struct mem_cgroup_threshold_ary *new; 4274 unsigned long threshold; 4275 unsigned long usage; 4276 int i, size, ret; 4277 4278 ret = page_counter_memparse(args, "-1", &threshold); 4279 if (ret) 4280 return ret; 4281 4282 mutex_lock(&memcg->thresholds_lock); 4283 4284 if (type == _MEM) { 4285 thresholds = &memcg->thresholds; 4286 usage = mem_cgroup_usage(memcg, false); 4287 } else if (type == _MEMSWAP) { 4288 thresholds = &memcg->memsw_thresholds; 4289 usage = mem_cgroup_usage(memcg, true); 4290 } else 4291 BUG(); 4292 4293 /* Check if a threshold crossed before adding a new one */ 4294 if (thresholds->primary) 4295 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4296 4297 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4298 4299 /* Allocate memory for new array of thresholds */ 4300 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4301 if (!new) { 4302 ret = -ENOMEM; 4303 goto unlock; 4304 } 4305 new->size = size; 4306 4307 /* Copy thresholds (if any) to new array */ 4308 if (thresholds->primary) 4309 memcpy(new->entries, thresholds->primary->entries, 4310 flex_array_size(new, entries, size - 1)); 4311 4312 /* Add new threshold */ 4313 new->entries[size - 1].eventfd = eventfd; 4314 new->entries[size - 1].threshold = threshold; 4315 4316 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4317 sort(new->entries, size, sizeof(*new->entries), 4318 compare_thresholds, NULL); 4319 4320 /* Find current threshold */ 4321 new->current_threshold = -1; 4322 for (i = 0; i < size; i++) { 4323 if (new->entries[i].threshold <= usage) { 4324 /* 4325 * new->current_threshold will not be used until 4326 * rcu_assign_pointer(), so it's safe to increment 4327 * it here. 4328 */ 4329 ++new->current_threshold; 4330 } else 4331 break; 4332 } 4333 4334 /* Free old spare buffer and save old primary buffer as spare */ 4335 kfree(thresholds->spare); 4336 thresholds->spare = thresholds->primary; 4337 4338 rcu_assign_pointer(thresholds->primary, new); 4339 4340 /* To be sure that nobody uses thresholds */ 4341 synchronize_rcu(); 4342 4343 unlock: 4344 mutex_unlock(&memcg->thresholds_lock); 4345 4346 return ret; 4347 } 4348 4349 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4350 struct eventfd_ctx *eventfd, const char *args) 4351 { 4352 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4353 } 4354 4355 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4356 struct eventfd_ctx *eventfd, const char *args) 4357 { 4358 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4359 } 4360 4361 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4362 struct eventfd_ctx *eventfd, enum res_type type) 4363 { 4364 struct mem_cgroup_thresholds *thresholds; 4365 struct mem_cgroup_threshold_ary *new; 4366 unsigned long usage; 4367 int i, j, size, entries; 4368 4369 mutex_lock(&memcg->thresholds_lock); 4370 4371 if (type == _MEM) { 4372 thresholds = &memcg->thresholds; 4373 usage = mem_cgroup_usage(memcg, false); 4374 } else if (type == _MEMSWAP) { 4375 thresholds = &memcg->memsw_thresholds; 4376 usage = mem_cgroup_usage(memcg, true); 4377 } else 4378 BUG(); 4379 4380 if (!thresholds->primary) 4381 goto unlock; 4382 4383 /* Check if a threshold crossed before removing */ 4384 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4385 4386 /* Calculate new number of threshold */ 4387 size = entries = 0; 4388 for (i = 0; i < thresholds->primary->size; i++) { 4389 if (thresholds->primary->entries[i].eventfd != eventfd) 4390 size++; 4391 else 4392 entries++; 4393 } 4394 4395 new = thresholds->spare; 4396 4397 /* If no items related to eventfd have been cleared, nothing to do */ 4398 if (!entries) 4399 goto unlock; 4400 4401 /* Set thresholds array to NULL if we don't have thresholds */ 4402 if (!size) { 4403 kfree(new); 4404 new = NULL; 4405 goto swap_buffers; 4406 } 4407 4408 new->size = size; 4409 4410 /* Copy thresholds and find current threshold */ 4411 new->current_threshold = -1; 4412 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4413 if (thresholds->primary->entries[i].eventfd == eventfd) 4414 continue; 4415 4416 new->entries[j] = thresholds->primary->entries[i]; 4417 if 
(new->entries[j].threshold <= usage) { 4418 /* 4419 * new->current_threshold will not be used 4420 * until rcu_assign_pointer(), so it's safe to increment 4421 * it here. 4422 */ 4423 ++new->current_threshold; 4424 } 4425 j++; 4426 } 4427 4428 swap_buffers: 4429 /* Swap primary and spare array */ 4430 thresholds->spare = thresholds->primary; 4431 4432 rcu_assign_pointer(thresholds->primary, new); 4433 4434 /* To be sure that nobody uses thresholds */ 4435 synchronize_rcu(); 4436 4437 /* If all events are unregistered, free the spare array */ 4438 if (!new) { 4439 kfree(thresholds->spare); 4440 thresholds->spare = NULL; 4441 } 4442 unlock: 4443 mutex_unlock(&memcg->thresholds_lock); 4444 } 4445 4446 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4447 struct eventfd_ctx *eventfd) 4448 { 4449 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4450 } 4451 4452 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4453 struct eventfd_ctx *eventfd) 4454 { 4455 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4456 } 4457 4458 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4459 struct eventfd_ctx *eventfd, const char *args) 4460 { 4461 struct mem_cgroup_eventfd_list *event; 4462 4463 event = kmalloc(sizeof(*event), GFP_KERNEL); 4464 if (!event) 4465 return -ENOMEM; 4466 4467 spin_lock(&memcg_oom_lock); 4468 4469 event->eventfd = eventfd; 4470 list_add(&event->list, &memcg->oom_notify); 4471 4472 /* already in OOM ? */ 4473 if (memcg->under_oom) 4474 eventfd_signal(eventfd, 1); 4475 spin_unlock(&memcg_oom_lock); 4476 4477 return 0; 4478 } 4479 4480 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4481 struct eventfd_ctx *eventfd) 4482 { 4483 struct mem_cgroup_eventfd_list *ev, *tmp; 4484 4485 spin_lock(&memcg_oom_lock); 4486 4487 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4488 if (ev->eventfd == eventfd) { 4489 list_del(&ev->list); 4490 kfree(ev); 4491 } 4492 } 4493 4494 spin_unlock(&memcg_oom_lock); 4495 } 4496 4497 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4498 { 4499 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4500 4501 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4502 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4503 seq_printf(sf, "oom_kill %lu\n", 4504 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4505 return 0; 4506 } 4507 4508 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4509 struct cftype *cft, u64 val) 4510 { 4511 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4512 4513 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4514 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4515 return -EINVAL; 4516 4517 memcg->oom_kill_disable = val; 4518 if (!val) 4519 memcg_oom_recover(memcg); 4520 4521 return 0; 4522 } 4523 4524 #ifdef CONFIG_CGROUP_WRITEBACK 4525 4526 #include <trace/events/writeback.h> 4527 4528 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4529 { 4530 return wb_domain_init(&memcg->cgwb_domain, gfp); 4531 } 4532 4533 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4534 { 4535 wb_domain_exit(&memcg->cgwb_domain); 4536 } 4537 4538 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4539 { 4540 wb_domain_size_changed(&memcg->cgwb_domain); 4541 } 4542 4543 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4544 { 4545 struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); 4546 4547 if (!memcg->css.parent) 4548 return NULL; 4549 4550 return &memcg->cgwb_domain; 4551 } 4552 4553 /** 4554 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4555 * @wb: bdi_writeback in question 4556 * @pfilepages: out parameter for number of file pages 4557 * @pheadroom: out parameter for number of allocatable pages according to memcg 4558 * @pdirty: out parameter for number of dirty pages 4559 * @pwriteback: out parameter for number of pages under writeback 4560 * 4561 * Determine the numbers of file, headroom, dirty, and writeback pages in 4562 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4563 * is a bit more involved. 4564 * 4565 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4566 * headroom is calculated as the lowest headroom of itself and the 4567 * ancestors. Note that this doesn't consider the actual amount of 4568 * available memory in the system. The caller should further cap 4569 * *@pheadroom accordingly. 4570 */ 4571 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4572 unsigned long *pheadroom, unsigned long *pdirty, 4573 unsigned long *pwriteback) 4574 { 4575 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4576 struct mem_cgroup *parent; 4577 4578 mem_cgroup_flush_stats(); 4579 4580 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4581 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4582 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4583 memcg_page_state(memcg, NR_ACTIVE_FILE); 4584 4585 *pheadroom = PAGE_COUNTER_MAX; 4586 while ((parent = parent_mem_cgroup(memcg))) { 4587 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4588 READ_ONCE(memcg->memory.high)); 4589 unsigned long used = page_counter_read(&memcg->memory); 4590 4591 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4592 memcg = parent; 4593 } 4594 } 4595 4596 /* 4597 * Foreign dirty flushing 4598 * 4599 * There's an inherent mismatch between memcg and writeback. The former 4600 * tracks ownership per-page while the latter per-inode. This was a 4601 * deliberate design decision because honoring per-page ownership in the 4602 * writeback path is complicated, may lead to higher CPU and IO overheads 4603 * and deemed unnecessary given that write-sharing an inode across 4604 * different cgroups isn't a common use-case. 4605 * 4606 * Combined with inode majority-writer ownership switching, this works well 4607 * enough in most cases but there are some pathological cases. For 4608 * example, let's say there are two cgroups A and B which keep writing to 4609 * different but confined parts of the same inode. B owns the inode and 4610 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4611 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4612 * triggering background writeback. A will be slowed down without a way to 4613 * make writeback of the dirty pages happen. 4614 * 4615 * Conditions like the above can lead to a cgroup getting repeatedly and 4616 * severely throttled after making some progress after each 4617 * dirty_expire_interval while the underlying IO device is almost 4618 * completely idle. 4619 * 4620 * Solving this problem completely requires matching the ownership tracking 4621 * granularities between memcg and writeback in either direction. 
* granularities between memcg and writeback in either direction.
However, 4622 * the more egregious behaviors can be avoided by simply remembering the 4623 * most recent foreign dirtying events and initiating remote flushes on 4624 * them when local writeback isn't enough to keep the memory clean enough. 4625 * 4626 * The following two functions implement such mechanism. When a foreign 4627 * page - a page whose memcg and writeback ownerships don't match - is 4628 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4629 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4630 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4631 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4632 * foreign bdi_writebacks which haven't expired. Both the numbers of 4633 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4634 * limited to MEMCG_CGWB_FRN_CNT. 4635 * 4636 * The mechanism only remembers IDs and doesn't hold any object references. 4637 * As being wrong occasionally doesn't matter, updates and accesses to the 4638 * records are lockless and racy. 4639 */ 4640 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, 4641 struct bdi_writeback *wb) 4642 { 4643 struct mem_cgroup *memcg = folio_memcg(folio); 4644 struct memcg_cgwb_frn *frn; 4645 u64 now = get_jiffies_64(); 4646 u64 oldest_at = now; 4647 int oldest = -1; 4648 int i; 4649 4650 trace_track_foreign_dirty(folio, wb); 4651 4652 /* 4653 * Pick the slot to use. If there is already a slot for @wb, keep 4654 * using it. If not replace the oldest one which isn't being 4655 * written out. 4656 */ 4657 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4658 frn = &memcg->cgwb_frn[i]; 4659 if (frn->bdi_id == wb->bdi->id && 4660 frn->memcg_id == wb->memcg_css->id) 4661 break; 4662 if (time_before64(frn->at, oldest_at) && 4663 atomic_read(&frn->done.cnt) == 1) { 4664 oldest = i; 4665 oldest_at = frn->at; 4666 } 4667 } 4668 4669 if (i < MEMCG_CGWB_FRN_CNT) { 4670 /* 4671 * Re-using an existing one. Update timestamp lazily to 4672 * avoid making the cacheline hot. We want them to be 4673 * reasonably up-to-date and significantly shorter than 4674 * dirty_expire_interval as that's what expires the record. 4675 * Use the shorter of 1s and dirty_expire_interval / 8. 4676 */ 4677 unsigned long update_intv = 4678 min_t(unsigned long, HZ, 4679 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4680 4681 if (time_before64(frn->at, now - update_intv)) 4682 frn->at = now; 4683 } else if (oldest >= 0) { 4684 /* replace the oldest free one */ 4685 frn = &memcg->cgwb_frn[oldest]; 4686 frn->bdi_id = wb->bdi->id; 4687 frn->memcg_id = wb->memcg_css->id; 4688 frn->at = now; 4689 } 4690 } 4691 4692 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4693 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4694 { 4695 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4696 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4697 u64 now = jiffies_64; 4698 int i; 4699 4700 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4701 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4702 4703 /* 4704 * If the record is older than dirty_expire_interval, 4705 * writeback on it has already started. No need to kick it 4706 * off again. Also, don't start a new one if there's 4707 * already one in flight. 
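* (A done.cnt of 1 means no foreign writeback is in flight for this record; frn->at is cleared below so the same record is not flushed again until it is re-dirtied.)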
4708 */ 4709 if (time_after64(frn->at, now - intv) && 4710 atomic_read(&frn->done.cnt) == 1) { 4711 frn->at = 0; 4712 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4713 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 4714 WB_REASON_FOREIGN_FLUSH, 4715 &frn->done); 4716 } 4717 } 4718 } 4719 4720 #else /* CONFIG_CGROUP_WRITEBACK */ 4721 4722 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4723 { 4724 return 0; 4725 } 4726 4727 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4728 { 4729 } 4730 4731 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4732 { 4733 } 4734 4735 #endif /* CONFIG_CGROUP_WRITEBACK */ 4736 4737 /* 4738 * DO NOT USE IN NEW FILES. 4739 * 4740 * "cgroup.event_control" implementation. 4741 * 4742 * This is way over-engineered. It tries to support fully configurable 4743 * events for each user. Such level of flexibility is completely 4744 * unnecessary especially in the light of the planned unified hierarchy. 4745 * 4746 * Please deprecate this and replace with something simpler if at all 4747 * possible. 4748 */ 4749 4750 /* 4751 * Unregister event and free resources. 4752 * 4753 * Gets called from workqueue. 4754 */ 4755 static void memcg_event_remove(struct work_struct *work) 4756 { 4757 struct mem_cgroup_event *event = 4758 container_of(work, struct mem_cgroup_event, remove); 4759 struct mem_cgroup *memcg = event->memcg; 4760 4761 remove_wait_queue(event->wqh, &event->wait); 4762 4763 event->unregister_event(memcg, event->eventfd); 4764 4765 /* Notify userspace the event is going away. */ 4766 eventfd_signal(event->eventfd, 1); 4767 4768 eventfd_ctx_put(event->eventfd); 4769 kfree(event); 4770 css_put(&memcg->css); 4771 } 4772 4773 /* 4774 * Gets called on EPOLLHUP on eventfd when user closes it. 4775 * 4776 * Called with wqh->lock held and interrupts disabled. 4777 */ 4778 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4779 int sync, void *key) 4780 { 4781 struct mem_cgroup_event *event = 4782 container_of(wait, struct mem_cgroup_event, wait); 4783 struct mem_cgroup *memcg = event->memcg; 4784 __poll_t flags = key_to_poll(key); 4785 4786 if (flags & EPOLLHUP) { 4787 /* 4788 * If the event has been detached at cgroup removal, we 4789 * can simply return knowing the other side will cleanup 4790 * for us. 4791 * 4792 * We can't race against event freeing since the other 4793 * side will require wqh->lock via remove_wait_queue(), 4794 * which we hold. 4795 */ 4796 spin_lock(&memcg->event_list_lock); 4797 if (!list_empty(&event->list)) { 4798 list_del_init(&event->list); 4799 /* 4800 * We are in atomic context, but cgroup_event_remove() 4801 * may sleep, so we have to call it in workqueue. 4802 */ 4803 schedule_work(&event->remove); 4804 } 4805 spin_unlock(&memcg->event_list_lock); 4806 } 4807 4808 return 0; 4809 } 4810 4811 static void memcg_event_ptable_queue_proc(struct file *file, 4812 wait_queue_head_t *wqh, poll_table *pt) 4813 { 4814 struct mem_cgroup_event *event = 4815 container_of(pt, struct mem_cgroup_event, pt); 4816 4817 event->wqh = wqh; 4818 add_wait_queue(wqh, &event->wait); 4819 } 4820 4821 /* 4822 * DO NOT USE IN NEW FILES. 4823 * 4824 * Parse input and register new cgroup event handler. 4825 * 4826 * Input must be in format '<event_fd> <control_fd> <args>'. 4827 * Interpretation of args is defined by control file implementation. 
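* For example, assuming <control_fd> refers to an open memory.usage_in_bytes file, writing "<event_fd> <control_fd> 1M" arms a 1M usage threshold; the trailing <args> string is passed unmodified to the register_event() callback of the matched control file.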
4828 */ 4829 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4830 char *buf, size_t nbytes, loff_t off) 4831 { 4832 struct cgroup_subsys_state *css = of_css(of); 4833 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4834 struct mem_cgroup_event *event; 4835 struct cgroup_subsys_state *cfile_css; 4836 unsigned int efd, cfd; 4837 struct fd efile; 4838 struct fd cfile; 4839 struct dentry *cdentry; 4840 const char *name; 4841 char *endp; 4842 int ret; 4843 4844 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 4845 return -EOPNOTSUPP; 4846 4847 buf = strstrip(buf); 4848 4849 efd = simple_strtoul(buf, &endp, 10); 4850 if (*endp != ' ') 4851 return -EINVAL; 4852 buf = endp + 1; 4853 4854 cfd = simple_strtoul(buf, &endp, 10); 4855 if ((*endp != ' ') && (*endp != '\0')) 4856 return -EINVAL; 4857 buf = endp + 1; 4858 4859 event = kzalloc(sizeof(*event), GFP_KERNEL); 4860 if (!event) 4861 return -ENOMEM; 4862 4863 event->memcg = memcg; 4864 INIT_LIST_HEAD(&event->list); 4865 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4866 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4867 INIT_WORK(&event->remove, memcg_event_remove); 4868 4869 efile = fdget(efd); 4870 if (!efile.file) { 4871 ret = -EBADF; 4872 goto out_kfree; 4873 } 4874 4875 event->eventfd = eventfd_ctx_fileget(efile.file); 4876 if (IS_ERR(event->eventfd)) { 4877 ret = PTR_ERR(event->eventfd); 4878 goto out_put_efile; 4879 } 4880 4881 cfile = fdget(cfd); 4882 if (!cfile.file) { 4883 ret = -EBADF; 4884 goto out_put_eventfd; 4885 } 4886 4887 /* the process needs read permission on the control file */ 4888 /* AV: shouldn't we check that it's been opened for read instead? */ 4889 ret = file_permission(cfile.file, MAY_READ); 4890 if (ret < 0) 4891 goto out_put_cfile; 4892 4893 /* 4894 * The control file must be a regular cgroup1 file. As a regular cgroup 4895 * file can't be renamed, it's safe to access its name afterwards. 4896 */ 4897 cdentry = cfile.file->f_path.dentry; 4898 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 4899 ret = -EINVAL; 4900 goto out_put_cfile; 4901 } 4902 4903 /* 4904 * Determine the event callbacks and set them in @event. This used 4905 * to be done via struct cftype but cgroup core no longer knows 4906 * about these events. The following is crude but the whole thing 4907 * is for compatibility anyway. 4908 * 4909 * DO NOT ADD NEW FILES. 4910 */ 4911 name = cdentry->d_name.name; 4912 4913 if (!strcmp(name, "memory.usage_in_bytes")) { 4914 event->register_event = mem_cgroup_usage_register_event; 4915 event->unregister_event = mem_cgroup_usage_unregister_event; 4916 } else if (!strcmp(name, "memory.oom_control")) { 4917 event->register_event = mem_cgroup_oom_register_event; 4918 event->unregister_event = mem_cgroup_oom_unregister_event; 4919 } else if (!strcmp(name, "memory.pressure_level")) { 4920 event->register_event = vmpressure_register_event; 4921 event->unregister_event = vmpressure_unregister_event; 4922 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4923 event->register_event = memsw_cgroup_usage_register_event; 4924 event->unregister_event = memsw_cgroup_usage_unregister_event; 4925 } else { 4926 ret = -EINVAL; 4927 goto out_put_cfile; 4928 } 4929 4930 /* 4931 * Verify that @cfile belongs to @css. Also, remaining events are 4932 * automatically removed on cgroup destruction but the removal is 4933 * asynchronous, so take an extra ref on @css.
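* That reference is dropped again by the css_put() in memcg_event_remove() once the event is torn down.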
4934 */ 4935 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 4936 &memory_cgrp_subsys); 4937 ret = -EINVAL; 4938 if (IS_ERR(cfile_css)) 4939 goto out_put_cfile; 4940 if (cfile_css != css) { 4941 css_put(cfile_css); 4942 goto out_put_cfile; 4943 } 4944 4945 ret = event->register_event(memcg, event->eventfd, buf); 4946 if (ret) 4947 goto out_put_css; 4948 4949 vfs_poll(efile.file, &event->pt); 4950 4951 spin_lock_irq(&memcg->event_list_lock); 4952 list_add(&event->list, &memcg->event_list); 4953 spin_unlock_irq(&memcg->event_list_lock); 4954 4955 fdput(cfile); 4956 fdput(efile); 4957 4958 return nbytes; 4959 4960 out_put_css: 4961 css_put(css); 4962 out_put_cfile: 4963 fdput(cfile); 4964 out_put_eventfd: 4965 eventfd_ctx_put(event->eventfd); 4966 out_put_efile: 4967 fdput(efile); 4968 out_kfree: 4969 kfree(event); 4970 4971 return ret; 4972 } 4973 4974 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 4975 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 4976 { 4977 /* 4978 * Deprecated. 4979 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 4980 */ 4981 return 0; 4982 } 4983 #endif 4984 4985 static struct cftype mem_cgroup_legacy_files[] = { 4986 { 4987 .name = "usage_in_bytes", 4988 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4989 .read_u64 = mem_cgroup_read_u64, 4990 }, 4991 { 4992 .name = "max_usage_in_bytes", 4993 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4994 .write = mem_cgroup_reset, 4995 .read_u64 = mem_cgroup_read_u64, 4996 }, 4997 { 4998 .name = "limit_in_bytes", 4999 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5000 .write = mem_cgroup_write, 5001 .read_u64 = mem_cgroup_read_u64, 5002 }, 5003 { 5004 .name = "soft_limit_in_bytes", 5005 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5006 .write = mem_cgroup_write, 5007 .read_u64 = mem_cgroup_read_u64, 5008 }, 5009 { 5010 .name = "failcnt", 5011 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5012 .write = mem_cgroup_reset, 5013 .read_u64 = mem_cgroup_read_u64, 5014 }, 5015 { 5016 .name = "stat", 5017 .seq_show = memcg_stat_show, 5018 }, 5019 { 5020 .name = "force_empty", 5021 .write = mem_cgroup_force_empty_write, 5022 }, 5023 { 5024 .name = "use_hierarchy", 5025 .write_u64 = mem_cgroup_hierarchy_write, 5026 .read_u64 = mem_cgroup_hierarchy_read, 5027 }, 5028 { 5029 .name = "cgroup.event_control", /* XXX: for compat */ 5030 .write = memcg_write_event_control, 5031 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 5032 }, 5033 { 5034 .name = "swappiness", 5035 .read_u64 = mem_cgroup_swappiness_read, 5036 .write_u64 = mem_cgroup_swappiness_write, 5037 }, 5038 { 5039 .name = "move_charge_at_immigrate", 5040 .read_u64 = mem_cgroup_move_charge_read, 5041 .write_u64 = mem_cgroup_move_charge_write, 5042 }, 5043 { 5044 .name = "oom_control", 5045 .seq_show = mem_cgroup_oom_control_read, 5046 .write_u64 = mem_cgroup_oom_control_write, 5047 }, 5048 { 5049 .name = "pressure_level", 5050 }, 5051 #ifdef CONFIG_NUMA 5052 { 5053 .name = "numa_stat", 5054 .seq_show = memcg_numa_stat_show, 5055 }, 5056 #endif 5057 { 5058 .name = "kmem.limit_in_bytes", 5059 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5060 .write = mem_cgroup_write, 5061 .read_u64 = mem_cgroup_read_u64, 5062 }, 5063 { 5064 .name = "kmem.usage_in_bytes", 5065 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5066 .read_u64 = mem_cgroup_read_u64, 5067 }, 5068 { 5069 .name = "kmem.failcnt", 5070 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5071 .write = mem_cgroup_reset, 5072 .read_u64 = mem_cgroup_read_u64, 
5073 }, 5074 { 5075 .name = "kmem.max_usage_in_bytes", 5076 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5077 .write = mem_cgroup_reset, 5078 .read_u64 = mem_cgroup_read_u64, 5079 }, 5080 #if defined(CONFIG_MEMCG_KMEM) && \ 5081 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5082 { 5083 .name = "kmem.slabinfo", 5084 .seq_show = mem_cgroup_slab_show, 5085 }, 5086 #endif 5087 { 5088 .name = "kmem.tcp.limit_in_bytes", 5089 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5090 .write = mem_cgroup_write, 5091 .read_u64 = mem_cgroup_read_u64, 5092 }, 5093 { 5094 .name = "kmem.tcp.usage_in_bytes", 5095 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5096 .read_u64 = mem_cgroup_read_u64, 5097 }, 5098 { 5099 .name = "kmem.tcp.failcnt", 5100 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5101 .write = mem_cgroup_reset, 5102 .read_u64 = mem_cgroup_read_u64, 5103 }, 5104 { 5105 .name = "kmem.tcp.max_usage_in_bytes", 5106 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5107 .write = mem_cgroup_reset, 5108 .read_u64 = mem_cgroup_read_u64, 5109 }, 5110 { }, /* terminate */ 5111 }; 5112 5113 /* 5114 * Private memory cgroup IDR 5115 * 5116 * Swap-out records and page cache shadow entries need to store memcg 5117 * references in constrained space, so we maintain an ID space that is 5118 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5119 * memory-controlled cgroups to 64k. 5120 * 5121 * However, there usually are many references to the offline CSS after 5122 * the cgroup has been destroyed, such as page cache or reclaimable 5123 * slab objects, that don't need to hang on to the ID. We want to keep 5124 * those dead CSS from occupying IDs, or we might quickly exhaust the 5125 * relatively small ID space and prevent the creation of new cgroups 5126 * even when there are much fewer than 64k cgroups - possibly none. 5127 * 5128 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5129 * be freed and recycled when it's no longer needed, which is usually 5130 * when the CSS is offlined. 5131 * 5132 * The only exception to that are records of swapped out tmpfs/shmem 5133 * pages that need to be attributed to live ancestors on swapin. But 5134 * those references are manageable from userspace. 5135 */ 5136 5137 static DEFINE_IDR(mem_cgroup_idr); 5138 5139 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5140 { 5141 if (memcg->id.id > 0) { 5142 idr_remove(&mem_cgroup_idr, memcg->id.id); 5143 memcg->id.id = 0; 5144 } 5145 } 5146 5147 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5148 unsigned int n) 5149 { 5150 refcount_add(n, &memcg->id.ref); 5151 } 5152 5153 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5154 { 5155 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5156 mem_cgroup_id_remove(memcg); 5157 5158 /* Memcg ID pins CSS */ 5159 css_put(&memcg->css); 5160 } 5161 } 5162 5163 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5164 { 5165 mem_cgroup_id_put_many(memcg, 1); 5166 } 5167 5168 /** 5169 * mem_cgroup_from_id - look up a memcg from a memcg id 5170 * @id: the memcg id to look up 5171 * 5172 * Caller must hold rcu_read_lock(). 
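* Returns NULL if no memcg is currently installed for @id (for example, once the ID has been removed from the IDR).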
5173 */ 5174 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5175 { 5176 WARN_ON_ONCE(!rcu_read_lock_held()); 5177 return idr_find(&mem_cgroup_idr, id); 5178 } 5179 5180 #ifdef CONFIG_SHRINKER_DEBUG 5181 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 5182 { 5183 struct cgroup *cgrp; 5184 struct cgroup_subsys_state *css; 5185 struct mem_cgroup *memcg; 5186 5187 cgrp = cgroup_get_from_id(ino); 5188 if (IS_ERR(cgrp)) 5189 return ERR_CAST(cgrp); 5190 5191 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 5192 if (css) 5193 memcg = container_of(css, struct mem_cgroup, css); 5194 else 5195 memcg = ERR_PTR(-ENOENT); 5196 5197 cgroup_put(cgrp); 5198 5199 return memcg; 5200 } 5201 #endif 5202 5203 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5204 { 5205 struct mem_cgroup_per_node *pn; 5206 5207 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 5208 if (!pn) 5209 return 1; 5210 5211 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 5212 GFP_KERNEL_ACCOUNT); 5213 if (!pn->lruvec_stats_percpu) { 5214 kfree(pn); 5215 return 1; 5216 } 5217 5218 lruvec_init(&pn->lruvec); 5219 pn->memcg = memcg; 5220 5221 memcg->nodeinfo[node] = pn; 5222 return 0; 5223 } 5224 5225 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5226 { 5227 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5228 5229 if (!pn) 5230 return; 5231 5232 free_percpu(pn->lruvec_stats_percpu); 5233 kfree(pn); 5234 } 5235 5236 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5237 { 5238 int node; 5239 5240 for_each_node(node) 5241 free_mem_cgroup_per_node_info(memcg, node); 5242 kfree(memcg->vmstats); 5243 free_percpu(memcg->vmstats_percpu); 5244 kfree(memcg); 5245 } 5246 5247 static void mem_cgroup_free(struct mem_cgroup *memcg) 5248 { 5249 lru_gen_exit_memcg(memcg); 5250 memcg_wb_domain_exit(memcg); 5251 __mem_cgroup_free(memcg); 5252 } 5253 5254 static struct mem_cgroup *mem_cgroup_alloc(void) 5255 { 5256 struct mem_cgroup *memcg; 5257 int node; 5258 int __maybe_unused i; 5259 long error = -ENOMEM; 5260 5261 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 5262 if (!memcg) 5263 return ERR_PTR(error); 5264 5265 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5266 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); 5267 if (memcg->id.id < 0) { 5268 error = memcg->id.id; 5269 goto fail; 5270 } 5271 5272 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); 5273 if (!memcg->vmstats) 5274 goto fail; 5275 5276 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5277 GFP_KERNEL_ACCOUNT); 5278 if (!memcg->vmstats_percpu) 5279 goto fail; 5280 5281 for_each_node(node) 5282 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5283 goto fail; 5284 5285 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5286 goto fail; 5287 5288 INIT_WORK(&memcg->high_work, high_work_func); 5289 INIT_LIST_HEAD(&memcg->oom_notify); 5290 mutex_init(&memcg->thresholds_lock); 5291 spin_lock_init(&memcg->move_lock); 5292 vmpressure_init(&memcg->vmpressure); 5293 INIT_LIST_HEAD(&memcg->event_list); 5294 spin_lock_init(&memcg->event_list_lock); 5295 memcg->socket_pressure = jiffies; 5296 #ifdef CONFIG_MEMCG_KMEM 5297 memcg->kmemcg_id = -1; 5298 INIT_LIST_HEAD(&memcg->objcg_list); 5299 #endif 5300 #ifdef CONFIG_CGROUP_WRITEBACK 5301 INIT_LIST_HEAD(&memcg->cgwb_list); 5302 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5303 memcg->cgwb_frn[i].done = 5304 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5305 #endif 5306 #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE 5307 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5308 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5309 memcg->deferred_split_queue.split_queue_len = 0; 5310 #endif 5311 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5312 lru_gen_init_memcg(memcg); 5313 return memcg; 5314 fail: 5315 mem_cgroup_id_remove(memcg); 5316 __mem_cgroup_free(memcg); 5317 return ERR_PTR(error); 5318 } 5319 5320 static struct cgroup_subsys_state * __ref 5321 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5322 { 5323 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5324 struct mem_cgroup *memcg, *old_memcg; 5325 5326 old_memcg = set_active_memcg(parent); 5327 memcg = mem_cgroup_alloc(); 5328 set_active_memcg(old_memcg); 5329 if (IS_ERR(memcg)) 5330 return ERR_CAST(memcg); 5331 5332 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5333 memcg->soft_limit = PAGE_COUNTER_MAX; 5334 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 5335 memcg->zswap_max = PAGE_COUNTER_MAX; 5336 #endif 5337 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5338 if (parent) { 5339 memcg->swappiness = mem_cgroup_swappiness(parent); 5340 memcg->oom_kill_disable = parent->oom_kill_disable; 5341 5342 page_counter_init(&memcg->memory, &parent->memory); 5343 page_counter_init(&memcg->swap, &parent->swap); 5344 page_counter_init(&memcg->kmem, &parent->kmem); 5345 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5346 } else { 5347 init_memcg_events(); 5348 page_counter_init(&memcg->memory, NULL); 5349 page_counter_init(&memcg->swap, NULL); 5350 page_counter_init(&memcg->kmem, NULL); 5351 page_counter_init(&memcg->tcpmem, NULL); 5352 5353 root_mem_cgroup = memcg; 5354 return &memcg->css; 5355 } 5356 5357 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5358 static_branch_inc(&memcg_sockets_enabled_key); 5359 5360 return &memcg->css; 5361 } 5362 5363 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5364 { 5365 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5366 5367 if (memcg_online_kmem(memcg)) 5368 goto remove_id; 5369 5370 /* 5371 * A memcg must be visible for expand_shrinker_info() 5372 * by the time the maps are allocated. So, we allocate maps 5373 * here, when for_each_mem_cgroup() can't skip it. 5374 */ 5375 if (alloc_shrinker_info(memcg)) 5376 goto offline_kmem; 5377 5378 /* Online state pins memcg ID, memcg ID pins CSS */ 5379 refcount_set(&memcg->id.ref, 1); 5380 css_get(css); 5381 5382 if (unlikely(mem_cgroup_is_root(memcg))) 5383 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 5384 2UL*HZ); 5385 return 0; 5386 offline_kmem: 5387 memcg_offline_kmem(memcg); 5388 remove_id: 5389 mem_cgroup_id_remove(memcg); 5390 return -ENOMEM; 5391 } 5392 5393 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5394 { 5395 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5396 struct mem_cgroup_event *event, *tmp; 5397 5398 /* 5399 * Unregister events and notify userspace. 5400 * Notify userspace about cgroup removing only after rmdir of cgroup 5401 * directory to avoid race between userspace and kernelspace. 
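* The events are only unlinked here; the actual unregistration and the final eventfd signal happen in memcg_event_remove(), which runs from the work scheduled below.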
5402 */ 5403 spin_lock_irq(&memcg->event_list_lock); 5404 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5405 list_del_init(&event->list); 5406 schedule_work(&event->remove); 5407 } 5408 spin_unlock_irq(&memcg->event_list_lock); 5409 5410 page_counter_set_min(&memcg->memory, 0); 5411 page_counter_set_low(&memcg->memory, 0); 5412 5413 memcg_offline_kmem(memcg); 5414 reparent_shrinker_deferred(memcg); 5415 wb_memcg_offline(memcg); 5416 5417 drain_all_stock(memcg); 5418 5419 mem_cgroup_id_put(memcg); 5420 } 5421 5422 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5423 { 5424 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5425 5426 invalidate_reclaim_iterators(memcg); 5427 } 5428 5429 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5430 { 5431 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5432 int __maybe_unused i; 5433 5434 #ifdef CONFIG_CGROUP_WRITEBACK 5435 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5436 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5437 #endif 5438 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5439 static_branch_dec(&memcg_sockets_enabled_key); 5440 5441 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5442 static_branch_dec(&memcg_sockets_enabled_key); 5443 5444 vmpressure_cleanup(&memcg->vmpressure); 5445 cancel_work_sync(&memcg->high_work); 5446 mem_cgroup_remove_from_trees(memcg); 5447 free_shrinker_info(memcg); 5448 mem_cgroup_free(memcg); 5449 } 5450 5451 /** 5452 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5453 * @css: the target css 5454 * 5455 * Reset the states of the mem_cgroup associated with @css. This is 5456 * invoked when the userland requests disabling on the default hierarchy 5457 * but the memcg is pinned through dependency. The memcg should stop 5458 * applying policies and should revert to the vanilla state as it may be 5459 * made visible again. 5460 * 5461 * The current implementation only resets the essential configurations. 5462 * This needs to be expanded to cover all the visible parts. 5463 */ 5464 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5465 { 5466 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5467 5468 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5469 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5470 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5471 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5472 page_counter_set_min(&memcg->memory, 0); 5473 page_counter_set_low(&memcg->memory, 0); 5474 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5475 memcg->soft_limit = PAGE_COUNTER_MAX; 5476 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5477 memcg_wb_domain_size_changed(memcg); 5478 } 5479 5480 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5481 { 5482 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5483 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5484 struct memcg_vmstats_percpu *statc; 5485 long delta, v; 5486 int i, nid; 5487 5488 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5489 5490 for (i = 0; i < MEMCG_NR_STAT; i++) { 5491 /* 5492 * Collect the aggregated propagation counts of groups 5493 * below us. We're in a per-cpu loop here and this is 5494 * a global counter, so the first cycle will get them. 
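* The per-counter flush is: take the delta our children have pushed into state_pending, add this CPU's change since the previous flush, fold the sum into our own state, and push it into the parent's state_pending in turn.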
5495 */ 5496 delta = memcg->vmstats->state_pending[i]; 5497 if (delta) 5498 memcg->vmstats->state_pending[i] = 0; 5499 5500 /* Add CPU changes on this level since the last flush */ 5501 v = READ_ONCE(statc->state[i]); 5502 if (v != statc->state_prev[i]) { 5503 delta += v - statc->state_prev[i]; 5504 statc->state_prev[i] = v; 5505 } 5506 5507 if (!delta) 5508 continue; 5509 5510 /* Aggregate counts on this level and propagate upwards */ 5511 memcg->vmstats->state[i] += delta; 5512 if (parent) 5513 parent->vmstats->state_pending[i] += delta; 5514 } 5515 5516 for (i = 0; i < NR_MEMCG_EVENTS; i++) { 5517 delta = memcg->vmstats->events_pending[i]; 5518 if (delta) 5519 memcg->vmstats->events_pending[i] = 0; 5520 5521 v = READ_ONCE(statc->events[i]); 5522 if (v != statc->events_prev[i]) { 5523 delta += v - statc->events_prev[i]; 5524 statc->events_prev[i] = v; 5525 } 5526 5527 if (!delta) 5528 continue; 5529 5530 memcg->vmstats->events[i] += delta; 5531 if (parent) 5532 parent->vmstats->events_pending[i] += delta; 5533 } 5534 5535 for_each_node_state(nid, N_MEMORY) { 5536 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 5537 struct mem_cgroup_per_node *ppn = NULL; 5538 struct lruvec_stats_percpu *lstatc; 5539 5540 if (parent) 5541 ppn = parent->nodeinfo[nid]; 5542 5543 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 5544 5545 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 5546 delta = pn->lruvec_stats.state_pending[i]; 5547 if (delta) 5548 pn->lruvec_stats.state_pending[i] = 0; 5549 5550 v = READ_ONCE(lstatc->state[i]); 5551 if (v != lstatc->state_prev[i]) { 5552 delta += v - lstatc->state_prev[i]; 5553 lstatc->state_prev[i] = v; 5554 } 5555 5556 if (!delta) 5557 continue; 5558 5559 pn->lruvec_stats.state[i] += delta; 5560 if (ppn) 5561 ppn->lruvec_stats.state_pending[i] += delta; 5562 } 5563 } 5564 } 5565 5566 #ifdef CONFIG_MMU 5567 /* Handlers for move charge at task migration. 
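* mem_cgroup_do_precharge() below charges 'count' pages to mc.to up front: first as a single bulk try without reclaim, then, if that fails, one page at a time with reclaim but without retrying.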
*/ 5568 static int mem_cgroup_do_precharge(unsigned long count) 5569 { 5570 int ret; 5571 5572 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5573 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5574 if (!ret) { 5575 mc.precharge += count; 5576 return ret; 5577 } 5578 5579 /* Try charges one by one with reclaim, but do not retry */ 5580 while (count--) { 5581 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5582 if (ret) 5583 return ret; 5584 mc.precharge++; 5585 cond_resched(); 5586 } 5587 return 0; 5588 } 5589 5590 union mc_target { 5591 struct page *page; 5592 swp_entry_t ent; 5593 }; 5594 5595 enum mc_target_type { 5596 MC_TARGET_NONE = 0, 5597 MC_TARGET_PAGE, 5598 MC_TARGET_SWAP, 5599 MC_TARGET_DEVICE, 5600 }; 5601 5602 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5603 unsigned long addr, pte_t ptent) 5604 { 5605 struct page *page = vm_normal_page(vma, addr, ptent); 5606 5607 if (!page || !page_mapped(page)) 5608 return NULL; 5609 if (PageAnon(page)) { 5610 if (!(mc.flags & MOVE_ANON)) 5611 return NULL; 5612 } else { 5613 if (!(mc.flags & MOVE_FILE)) 5614 return NULL; 5615 } 5616 if (!get_page_unless_zero(page)) 5617 return NULL; 5618 5619 return page; 5620 } 5621 5622 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5623 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5624 pte_t ptent, swp_entry_t *entry) 5625 { 5626 struct page *page = NULL; 5627 swp_entry_t ent = pte_to_swp_entry(ptent); 5628 5629 if (!(mc.flags & MOVE_ANON)) 5630 return NULL; 5631 5632 /* 5633 * Handle device private pages that are not accessible by the CPU, but 5634 * stored as special swap entries in the page table. 5635 */ 5636 if (is_device_private_entry(ent)) { 5637 page = pfn_swap_entry_to_page(ent); 5638 if (!get_page_unless_zero(page)) 5639 return NULL; 5640 return page; 5641 } 5642 5643 if (non_swap_entry(ent)) 5644 return NULL; 5645 5646 /* 5647 * Because swap_cache_get_folio() updates some statistics counter, 5648 * we call find_get_page() with swapper_space directly. 5649 */ 5650 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5651 entry->val = ent.val; 5652 5653 return page; 5654 } 5655 #else 5656 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5657 pte_t ptent, swp_entry_t *entry) 5658 { 5659 return NULL; 5660 } 5661 #endif 5662 5663 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5664 unsigned long addr, pte_t ptent) 5665 { 5666 unsigned long index; 5667 struct folio *folio; 5668 5669 if (!vma->vm_file) /* anonymous vma */ 5670 return NULL; 5671 if (!(mc.flags & MOVE_FILE)) 5672 return NULL; 5673 5674 /* folio is moved even if it's not RSS of this task(page-faulted). */ 5675 /* shmem/tmpfs may report page out on swap: account for that too. */ 5676 index = linear_page_index(vma, addr); 5677 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); 5678 if (!folio) 5679 return NULL; 5680 return folio_file_page(folio, index); 5681 } 5682 5683 /** 5684 * mem_cgroup_move_account - move account of the page 5685 * @page: the page 5686 * @compound: charge the page as compound or small page 5687 * @from: mem_cgroup which the page is moved from. 5688 * @to: mem_cgroup which the page is moved to. @from != @to. 5689 * 5690 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 5691 * 5692 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5693 * from old cgroup. 
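* Returns 0 on success, -EBUSY if the folio cannot be locked, or -EINVAL if the folio is no longer charged to @from.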
5694 */ 5695 static int mem_cgroup_move_account(struct page *page, 5696 bool compound, 5697 struct mem_cgroup *from, 5698 struct mem_cgroup *to) 5699 { 5700 struct folio *folio = page_folio(page); 5701 struct lruvec *from_vec, *to_vec; 5702 struct pglist_data *pgdat; 5703 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; 5704 int nid, ret; 5705 5706 VM_BUG_ON(from == to); 5707 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 5708 VM_BUG_ON(compound && !folio_test_large(folio)); 5709 5710 /* 5711 * Prevent mem_cgroup_migrate() from looking at 5712 * page's memory cgroup of its source page while we change it. 5713 */ 5714 ret = -EBUSY; 5715 if (!folio_trylock(folio)) 5716 goto out; 5717 5718 ret = -EINVAL; 5719 if (folio_memcg(folio) != from) 5720 goto out_unlock; 5721 5722 pgdat = folio_pgdat(folio); 5723 from_vec = mem_cgroup_lruvec(from, pgdat); 5724 to_vec = mem_cgroup_lruvec(to, pgdat); 5725 5726 folio_memcg_lock(folio); 5727 5728 if (folio_test_anon(folio)) { 5729 if (folio_mapped(folio)) { 5730 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5731 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5732 if (folio_test_transhuge(folio)) { 5733 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5734 -nr_pages); 5735 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5736 nr_pages); 5737 } 5738 } 5739 } else { 5740 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5741 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5742 5743 if (folio_test_swapbacked(folio)) { 5744 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5745 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5746 } 5747 5748 if (folio_mapped(folio)) { 5749 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5750 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5751 } 5752 5753 if (folio_test_dirty(folio)) { 5754 struct address_space *mapping = folio_mapping(folio); 5755 5756 if (mapping_can_writeback(mapping)) { 5757 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5758 -nr_pages); 5759 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5760 nr_pages); 5761 } 5762 } 5763 } 5764 5765 #ifdef CONFIG_SWAP 5766 if (folio_test_swapcache(folio)) { 5767 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); 5768 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); 5769 } 5770 #endif 5771 if (folio_test_writeback(folio)) { 5772 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5773 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5774 } 5775 5776 /* 5777 * All state has been migrated, let's switch to the new memcg. 5778 * 5779 * It is safe to change page's memcg here because the page 5780 * is referenced, charged, isolated, and locked: we can't race 5781 * with (un)charging, migration, LRU putback, or anything else 5782 * that would rely on a stable page's memory cgroup. 5783 * 5784 * Note that lock_page_memcg is a memcg lock, not a page lock, 5785 * to save space. As soon as we switch page's memory cgroup to a 5786 * new memcg that isn't locked, the above state can change 5787 * concurrently again. Make sure we're truly done with it. 
5788 */ 5789 smp_mb(); 5790 5791 css_get(&to->css); 5792 css_put(&from->css); 5793 5794 folio->memcg_data = (unsigned long)to; 5795 5796 __folio_memcg_unlock(from); 5797 5798 ret = 0; 5799 nid = folio_nid(folio); 5800 5801 local_irq_disable(); 5802 mem_cgroup_charge_statistics(to, nr_pages); 5803 memcg_check_events(to, nid); 5804 mem_cgroup_charge_statistics(from, -nr_pages); 5805 memcg_check_events(from, nid); 5806 local_irq_enable(); 5807 out_unlock: 5808 folio_unlock(folio); 5809 out: 5810 return ret; 5811 } 5812 5813 /** 5814 * get_mctgt_type - get target type of moving charge 5815 * @vma: the vma the pte to be checked belongs to 5816 * @addr: the address corresponding to the pte to be checked 5817 * @ptent: the pte to be checked 5818 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 5819 * 5820 * Returns 5821 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5822 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5823 * move charge. If @target is not NULL, the page is stored in target->page 5824 * with an extra refcount taken (callers should handle it). 5825 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5826 * target for charge migration. If @target is not NULL, the entry is stored 5827 * in target->ent. 5828 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is device memory and 5829 * thus not on the LRU. 5830 * For now such a page is charged like a regular page would be, as for all 5831 * intents and purposes it is just special memory taking the place of a 5832 * regular page. 5833 * 5834 * See Documentation/vm/hmm.txt and include/linux/hmm.h 5835 * 5836 * Called with pte lock held. 5837 */ 5838 5839 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5840 unsigned long addr, pte_t ptent, union mc_target *target) 5841 { 5842 struct page *page = NULL; 5843 enum mc_target_type ret = MC_TARGET_NONE; 5844 swp_entry_t ent = { .val = 0 }; 5845 5846 if (pte_present(ptent)) 5847 page = mc_handle_present_pte(vma, addr, ptent); 5848 else if (pte_none_mostly(ptent)) 5849 /* 5850 * PTE markers should be treated as a none pte here, separated 5851 * from other swap handling below. 5852 */ 5853 page = mc_handle_file_pte(vma, addr, ptent); 5854 else if (is_swap_pte(ptent)) 5855 page = mc_handle_swap_pte(vma, ptent, &ent); 5856 5857 if (!page && !ent.val) 5858 return ret; 5859 if (page) { 5860 /* 5861 * Only do a loose check without serialization; 5862 * mem_cgroup_move_account() checks whether the page is 5863 * valid under LRU exclusion. 5864 */ 5865 if (page_memcg(page) == mc.from) { 5866 ret = MC_TARGET_PAGE; 5867 if (is_device_private_page(page) || 5868 is_device_coherent_page(page)) 5869 ret = MC_TARGET_DEVICE; 5870 if (target) 5871 target->page = page; 5872 } 5873 if (!ret || !target) 5874 put_page(page); 5875 } 5876 /* 5877 * There is a swap entry and a page doesn't exist or isn't charged. 5878 * But we cannot move a tail-page in a THP. 5879 */ 5880 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5881 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5882 ret = MC_TARGET_SWAP; 5883 if (target) 5884 target->ent = ent; 5885 } 5886 return ret; 5887 } 5888 5889 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5890 /* 5891 * We don't consider PMD mapped swapping or file mapped pages because THP does 5892 * not support them for now. 5893 * Caller should make sure that pmd_trans_huge(pmd) is true.
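* Returns MC_TARGET_PAGE (taking a reference on the head page when @target is non-NULL) if the huge page is charged to mc.from and MOVE_ANON is requested; MC_TARGET_NONE otherwise, including for PMD migration entries.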
5894 */ 5895 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5896 unsigned long addr, pmd_t pmd, union mc_target *target) 5897 { 5898 struct page *page = NULL; 5899 enum mc_target_type ret = MC_TARGET_NONE; 5900 5901 if (unlikely(is_swap_pmd(pmd))) { 5902 VM_BUG_ON(thp_migration_supported() && 5903 !is_pmd_migration_entry(pmd)); 5904 return ret; 5905 } 5906 page = pmd_page(pmd); 5907 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5908 if (!(mc.flags & MOVE_ANON)) 5909 return ret; 5910 if (page_memcg(page) == mc.from) { 5911 ret = MC_TARGET_PAGE; 5912 if (target) { 5913 get_page(page); 5914 target->page = page; 5915 } 5916 } 5917 return ret; 5918 } 5919 #else 5920 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5921 unsigned long addr, pmd_t pmd, union mc_target *target) 5922 { 5923 return MC_TARGET_NONE; 5924 } 5925 #endif 5926 5927 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5928 unsigned long addr, unsigned long end, 5929 struct mm_walk *walk) 5930 { 5931 struct vm_area_struct *vma = walk->vma; 5932 pte_t *pte; 5933 spinlock_t *ptl; 5934 5935 ptl = pmd_trans_huge_lock(pmd, vma); 5936 if (ptl) { 5937 /* 5938 * Note there cannot be MC_TARGET_DEVICE for now as we do not 5939 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but 5940 * this might change. 5941 */ 5942 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5943 mc.precharge += HPAGE_PMD_NR; 5944 spin_unlock(ptl); 5945 return 0; 5946 } 5947 5948 if (pmd_trans_unstable(pmd)) 5949 return 0; 5950 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5951 for (; addr != end; pte++, addr += PAGE_SIZE) 5952 if (get_mctgt_type(vma, addr, *pte, NULL)) 5953 mc.precharge++; /* increment precharge temporarily */ 5954 pte_unmap_unlock(pte - 1, ptl); 5955 cond_resched(); 5956 5957 return 0; 5958 } 5959 5960 static const struct mm_walk_ops precharge_walk_ops = { 5961 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5962 }; 5963 5964 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5965 { 5966 unsigned long precharge; 5967 5968 mmap_read_lock(mm); 5969 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 5970 mmap_read_unlock(mm); 5971 5972 precharge = mc.precharge; 5973 mc.precharge = 0; 5974 5975 return precharge; 5976 } 5977 5978 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5979 { 5980 unsigned long precharge = mem_cgroup_count_precharge(mm); 5981 5982 VM_BUG_ON(mc.moving_task); 5983 mc.moving_task = current; 5984 return mem_cgroup_do_precharge(precharge); 5985 } 5986 5987 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5988 static void __mem_cgroup_clear_mc(void) 5989 { 5990 struct mem_cgroup *from = mc.from; 5991 struct mem_cgroup *to = mc.to; 5992 5993 /* we must uncharge all the leftover precharges from mc.to */ 5994 if (mc.precharge) { 5995 cancel_charge(mc.to, mc.precharge); 5996 mc.precharge = 0; 5997 } 5998 /* 5999 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6000 * we must uncharge here.
6001 */ 6002 if (mc.moved_charge) { 6003 cancel_charge(mc.from, mc.moved_charge); 6004 mc.moved_charge = 0; 6005 } 6006 /* we must fixup refcnts and charges */ 6007 if (mc.moved_swap) { 6008 /* uncharge swap account from the old cgroup */ 6009 if (!mem_cgroup_is_root(mc.from)) 6010 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 6011 6012 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 6013 6014 /* 6015 * we charged both to->memory and to->memsw, so we 6016 * should uncharge to->memory. 6017 */ 6018 if (!mem_cgroup_is_root(mc.to)) 6019 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 6020 6021 mc.moved_swap = 0; 6022 } 6023 memcg_oom_recover(from); 6024 memcg_oom_recover(to); 6025 wake_up_all(&mc.waitq); 6026 } 6027 6028 static void mem_cgroup_clear_mc(void) 6029 { 6030 struct mm_struct *mm = mc.mm; 6031 6032 /* 6033 * we must clear moving_task before waking up waiters at the end of 6034 * task migration. 6035 */ 6036 mc.moving_task = NULL; 6037 __mem_cgroup_clear_mc(); 6038 spin_lock(&mc.lock); 6039 mc.from = NULL; 6040 mc.to = NULL; 6041 mc.mm = NULL; 6042 spin_unlock(&mc.lock); 6043 6044 mmput(mm); 6045 } 6046 6047 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6048 { 6049 struct cgroup_subsys_state *css; 6050 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 6051 struct mem_cgroup *from; 6052 struct task_struct *leader, *p; 6053 struct mm_struct *mm; 6054 unsigned long move_flags; 6055 int ret = 0; 6056 6057 /* charge immigration isn't supported on the default hierarchy */ 6058 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6059 return 0; 6060 6061 /* 6062 * Multi-process migrations only happen on the default hierarchy 6063 * where charge immigration is not used. Perform charge 6064 * immigration if @tset contains a leader and whine if there are 6065 * multiple. 6066 */ 6067 p = NULL; 6068 cgroup_taskset_for_each_leader(leader, css, tset) { 6069 WARN_ON_ONCE(p); 6070 p = leader; 6071 memcg = mem_cgroup_from_css(css); 6072 } 6073 if (!p) 6074 return 0; 6075 6076 /* 6077 * We are now committed to this value whatever it is. Changes in this 6078 * tunable will only affect upcoming migrations, not the current one. 6079 * So we need to save it, and keep it going. 
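 *
 * (move_charge_at_immigrate is a cgroup1-only tunable; on the default
 * hierarchy we already bailed out at the top of this function.)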
6080 */ 6081 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 6082 if (!move_flags) 6083 return 0; 6084 6085 from = mem_cgroup_from_task(p); 6086 6087 VM_BUG_ON(from == memcg); 6088 6089 mm = get_task_mm(p); 6090 if (!mm) 6091 return 0; 6092 /* We move charges only when we move a owner of the mm */ 6093 if (mm->owner == p) { 6094 VM_BUG_ON(mc.from); 6095 VM_BUG_ON(mc.to); 6096 VM_BUG_ON(mc.precharge); 6097 VM_BUG_ON(mc.moved_charge); 6098 VM_BUG_ON(mc.moved_swap); 6099 6100 spin_lock(&mc.lock); 6101 mc.mm = mm; 6102 mc.from = from; 6103 mc.to = memcg; 6104 mc.flags = move_flags; 6105 spin_unlock(&mc.lock); 6106 /* We set mc.moving_task later */ 6107 6108 ret = mem_cgroup_precharge_mc(mm); 6109 if (ret) 6110 mem_cgroup_clear_mc(); 6111 } else { 6112 mmput(mm); 6113 } 6114 return ret; 6115 } 6116 6117 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6118 { 6119 if (mc.to) 6120 mem_cgroup_clear_mc(); 6121 } 6122 6123 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6124 unsigned long addr, unsigned long end, 6125 struct mm_walk *walk) 6126 { 6127 int ret = 0; 6128 struct vm_area_struct *vma = walk->vma; 6129 pte_t *pte; 6130 spinlock_t *ptl; 6131 enum mc_target_type target_type; 6132 union mc_target target; 6133 struct page *page; 6134 6135 ptl = pmd_trans_huge_lock(pmd, vma); 6136 if (ptl) { 6137 if (mc.precharge < HPAGE_PMD_NR) { 6138 spin_unlock(ptl); 6139 return 0; 6140 } 6141 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6142 if (target_type == MC_TARGET_PAGE) { 6143 page = target.page; 6144 if (!isolate_lru_page(page)) { 6145 if (!mem_cgroup_move_account(page, true, 6146 mc.from, mc.to)) { 6147 mc.precharge -= HPAGE_PMD_NR; 6148 mc.moved_charge += HPAGE_PMD_NR; 6149 } 6150 putback_lru_page(page); 6151 } 6152 put_page(page); 6153 } else if (target_type == MC_TARGET_DEVICE) { 6154 page = target.page; 6155 if (!mem_cgroup_move_account(page, true, 6156 mc.from, mc.to)) { 6157 mc.precharge -= HPAGE_PMD_NR; 6158 mc.moved_charge += HPAGE_PMD_NR; 6159 } 6160 put_page(page); 6161 } 6162 spin_unlock(ptl); 6163 return 0; 6164 } 6165 6166 if (pmd_trans_unstable(pmd)) 6167 return 0; 6168 retry: 6169 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6170 for (; addr != end; addr += PAGE_SIZE) { 6171 pte_t ptent = *(pte++); 6172 bool device = false; 6173 swp_entry_t ent; 6174 6175 if (!mc.precharge) 6176 break; 6177 6178 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6179 case MC_TARGET_DEVICE: 6180 device = true; 6181 fallthrough; 6182 case MC_TARGET_PAGE: 6183 page = target.page; 6184 /* 6185 * We can have a part of the split pmd here. Moving it 6186 * can be done but it would be too convoluted so simply 6187 * ignore such a partial THP and keep it in original 6188 * memcg. There should be somebody mapping the head. 6189 */ 6190 if (PageTransCompound(page)) 6191 goto put; 6192 if (!device && isolate_lru_page(page)) 6193 goto put; 6194 if (!mem_cgroup_move_account(page, false, 6195 mc.from, mc.to)) { 6196 mc.precharge--; 6197 /* we uncharge from mc.from later. */ 6198 mc.moved_charge++; 6199 } 6200 if (!device) 6201 putback_lru_page(page); 6202 put: /* get_mctgt_type() gets the page */ 6203 put_page(page); 6204 break; 6205 case MC_TARGET_SWAP: 6206 ent = target.ent; 6207 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6208 mc.precharge--; 6209 mem_cgroup_id_get_many(mc.to, 1); 6210 /* we fixup other refcnts and charges later. 
*/ 6211 mc.moved_swap++; 6212 } 6213 break; 6214 default: 6215 break; 6216 } 6217 } 6218 pte_unmap_unlock(pte - 1, ptl); 6219 cond_resched(); 6220 6221 if (addr != end) { 6222 /* 6223 * We have consumed all precharges we got in can_attach(). 6224 * We try charge one by one, but don't do any additional 6225 * charges to mc.to if we have failed in charge once in attach() 6226 * phase. 6227 */ 6228 ret = mem_cgroup_do_precharge(1); 6229 if (!ret) 6230 goto retry; 6231 } 6232 6233 return ret; 6234 } 6235 6236 static const struct mm_walk_ops charge_walk_ops = { 6237 .pmd_entry = mem_cgroup_move_charge_pte_range, 6238 }; 6239 6240 static void mem_cgroup_move_charge(void) 6241 { 6242 lru_add_drain_all(); 6243 /* 6244 * Signal lock_page_memcg() to take the memcg's move_lock 6245 * while we're moving its pages to another memcg. Then wait 6246 * for already started RCU-only updates to finish. 6247 */ 6248 atomic_inc(&mc.from->moving_account); 6249 synchronize_rcu(); 6250 retry: 6251 if (unlikely(!mmap_read_trylock(mc.mm))) { 6252 /* 6253 * Someone who are holding the mmap_lock might be waiting in 6254 * waitq. So we cancel all extra charges, wake up all waiters, 6255 * and retry. Because we cancel precharges, we might not be able 6256 * to move enough charges, but moving charge is a best-effort 6257 * feature anyway, so it wouldn't be a big problem. 6258 */ 6259 __mem_cgroup_clear_mc(); 6260 cond_resched(); 6261 goto retry; 6262 } 6263 /* 6264 * When we have consumed all precharges and failed in doing 6265 * additional charge, the page walk just aborts. 6266 */ 6267 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 6268 mmap_read_unlock(mc.mm); 6269 atomic_dec(&mc.from->moving_account); 6270 } 6271 6272 static void mem_cgroup_move_task(void) 6273 { 6274 if (mc.to) { 6275 mem_cgroup_move_charge(); 6276 mem_cgroup_clear_mc(); 6277 } 6278 } 6279 #else /* !CONFIG_MMU */ 6280 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6281 { 6282 return 0; 6283 } 6284 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6285 { 6286 } 6287 static void mem_cgroup_move_task(void) 6288 { 6289 } 6290 #endif 6291 6292 #ifdef CONFIG_LRU_GEN 6293 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6294 { 6295 struct task_struct *task; 6296 struct cgroup_subsys_state *css; 6297 6298 /* find the first leader if there is any */ 6299 cgroup_taskset_for_each_leader(task, css, tset) 6300 break; 6301 6302 if (!task) 6303 return; 6304 6305 task_lock(task); 6306 if (task->mm && READ_ONCE(task->mm->owner) == task) 6307 lru_gen_migrate_mm(task->mm); 6308 task_unlock(task); 6309 } 6310 #else 6311 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6312 { 6313 } 6314 #endif /* CONFIG_LRU_GEN */ 6315 6316 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6317 { 6318 if (value == PAGE_COUNTER_MAX) 6319 seq_puts(m, "max\n"); 6320 else 6321 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6322 6323 return 0; 6324 } 6325 6326 static u64 memory_current_read(struct cgroup_subsys_state *css, 6327 struct cftype *cft) 6328 { 6329 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6330 6331 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6332 } 6333 6334 static u64 memory_peak_read(struct cgroup_subsys_state *css, 6335 struct cftype *cft) 6336 { 6337 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6338 6339 return (u64)memcg->memory.watermark * PAGE_SIZE; 6340 } 6341 6342 static int memory_min_show(struct seq_file *m, void *v) 6343 { 6344 return 
seq_puts_memcg_tunable(m, 6345 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6346 } 6347 6348 static ssize_t memory_min_write(struct kernfs_open_file *of, 6349 char *buf, size_t nbytes, loff_t off) 6350 { 6351 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6352 unsigned long min; 6353 int err; 6354 6355 buf = strstrip(buf); 6356 err = page_counter_memparse(buf, "max", &min); 6357 if (err) 6358 return err; 6359 6360 page_counter_set_min(&memcg->memory, min); 6361 6362 return nbytes; 6363 } 6364 6365 static int memory_low_show(struct seq_file *m, void *v) 6366 { 6367 return seq_puts_memcg_tunable(m, 6368 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6369 } 6370 6371 static ssize_t memory_low_write(struct kernfs_open_file *of, 6372 char *buf, size_t nbytes, loff_t off) 6373 { 6374 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6375 unsigned long low; 6376 int err; 6377 6378 buf = strstrip(buf); 6379 err = page_counter_memparse(buf, "max", &low); 6380 if (err) 6381 return err; 6382 6383 page_counter_set_low(&memcg->memory, low); 6384 6385 return nbytes; 6386 } 6387 6388 static int memory_high_show(struct seq_file *m, void *v) 6389 { 6390 return seq_puts_memcg_tunable(m, 6391 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6392 } 6393 6394 static ssize_t memory_high_write(struct kernfs_open_file *of, 6395 char *buf, size_t nbytes, loff_t off) 6396 { 6397 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6398 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6399 bool drained = false; 6400 unsigned long high; 6401 int err; 6402 6403 buf = strstrip(buf); 6404 err = page_counter_memparse(buf, "max", &high); 6405 if (err) 6406 return err; 6407 6408 page_counter_set_high(&memcg->memory, high); 6409 6410 for (;;) { 6411 unsigned long nr_pages = page_counter_read(&memcg->memory); 6412 unsigned long reclaimed; 6413 6414 if (nr_pages <= high) 6415 break; 6416 6417 if (signal_pending(current)) 6418 break; 6419 6420 if (!drained) { 6421 drain_all_stock(memcg); 6422 drained = true; 6423 continue; 6424 } 6425 6426 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6427 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 6428 6429 if (!reclaimed && !nr_retries--) 6430 break; 6431 } 6432 6433 memcg_wb_domain_size_changed(memcg); 6434 return nbytes; 6435 } 6436 6437 static int memory_max_show(struct seq_file *m, void *v) 6438 { 6439 return seq_puts_memcg_tunable(m, 6440 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6441 } 6442 6443 static ssize_t memory_max_write(struct kernfs_open_file *of, 6444 char *buf, size_t nbytes, loff_t off) 6445 { 6446 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6447 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6448 bool drained = false; 6449 unsigned long max; 6450 int err; 6451 6452 buf = strstrip(buf); 6453 err = page_counter_memparse(buf, "max", &max); 6454 if (err) 6455 return err; 6456 6457 xchg(&memcg->memory.max, max); 6458 6459 for (;;) { 6460 unsigned long nr_pages = page_counter_read(&memcg->memory); 6461 6462 if (nr_pages <= max) 6463 break; 6464 6465 if (signal_pending(current)) 6466 break; 6467 6468 if (!drained) { 6469 drain_all_stock(memcg); 6470 drained = true; 6471 continue; 6472 } 6473 6474 if (nr_reclaims) { 6475 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6476 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 6477 nr_reclaims--; 6478 continue; 6479 } 6480 6481 memcg_memory_event(memcg, MEMCG_OOM); 6482 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6483 break; 6484 } 6485 6486 
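/*
 * memory.max feeds into the cgroup writeback domain sizing; let
 * writeback know that the inputs have changed.
 */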
memcg_wb_domain_size_changed(memcg); 6487 return nbytes; 6488 } 6489 6490 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6491 { 6492 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6493 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6494 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6495 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6496 seq_printf(m, "oom_kill %lu\n", 6497 atomic_long_read(&events[MEMCG_OOM_KILL])); 6498 seq_printf(m, "oom_group_kill %lu\n", 6499 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); 6500 } 6501 6502 static int memory_events_show(struct seq_file *m, void *v) 6503 { 6504 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6505 6506 __memory_events_show(m, memcg->memory_events); 6507 return 0; 6508 } 6509 6510 static int memory_events_local_show(struct seq_file *m, void *v) 6511 { 6512 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6513 6514 __memory_events_show(m, memcg->memory_events_local); 6515 return 0; 6516 } 6517 6518 static int memory_stat_show(struct seq_file *m, void *v) 6519 { 6520 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6521 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 6522 6523 if (!buf) 6524 return -ENOMEM; 6525 memory_stat_format(memcg, buf, PAGE_SIZE); 6526 seq_puts(m, buf); 6527 kfree(buf); 6528 return 0; 6529 } 6530 6531 #ifdef CONFIG_NUMA 6532 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6533 int item) 6534 { 6535 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); 6536 } 6537 6538 static int memory_numa_stat_show(struct seq_file *m, void *v) 6539 { 6540 int i; 6541 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6542 6543 mem_cgroup_flush_stats(); 6544 6545 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6546 int nid; 6547 6548 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6549 continue; 6550 6551 seq_printf(m, "%s", memory_stats[i].name); 6552 for_each_node_state(nid, N_MEMORY) { 6553 u64 size; 6554 struct lruvec *lruvec; 6555 6556 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6557 size = lruvec_page_state_output(lruvec, 6558 memory_stats[i].idx); 6559 seq_printf(m, " N%d=%llu", nid, size); 6560 } 6561 seq_putc(m, '\n'); 6562 } 6563 6564 return 0; 6565 } 6566 #endif 6567 6568 static int memory_oom_group_show(struct seq_file *m, void *v) 6569 { 6570 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6571 6572 seq_printf(m, "%d\n", memcg->oom_group); 6573 6574 return 0; 6575 } 6576 6577 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6578 char *buf, size_t nbytes, loff_t off) 6579 { 6580 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6581 int ret, oom_group; 6582 6583 buf = strstrip(buf); 6584 if (!buf) 6585 return -EINVAL; 6586 6587 ret = kstrtoint(buf, 0, &oom_group); 6588 if (ret) 6589 return ret; 6590 6591 if (oom_group != 0 && oom_group != 1) 6592 return -EINVAL; 6593 6594 memcg->oom_group = oom_group; 6595 6596 return nbytes; 6597 } 6598 6599 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 6600 size_t nbytes, loff_t off) 6601 { 6602 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6603 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6604 unsigned long nr_to_reclaim, nr_reclaimed = 0; 6605 unsigned int reclaim_options; 6606 int err; 6607 6608 buf = strstrip(buf); 6609 err = page_counter_memparse(buf, "", &nr_to_reclaim); 6610 if (err) 6611 return err; 6612 6613 reclaim_options = 
MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; 6614 while (nr_reclaimed < nr_to_reclaim) { 6615 unsigned long reclaimed; 6616 6617 if (signal_pending(current)) 6618 return -EINTR; 6619 6620 /* 6621 * This is the final attempt, drain percpu lru caches in the 6622 * hope of introducing more evictable pages for 6623 * try_to_free_mem_cgroup_pages(). 6624 */ 6625 if (!nr_retries) 6626 lru_add_drain_all(); 6627 6628 reclaimed = try_to_free_mem_cgroup_pages(memcg, 6629 nr_to_reclaim - nr_reclaimed, 6630 GFP_KERNEL, reclaim_options); 6631 6632 if (!reclaimed && !nr_retries--) 6633 return -EAGAIN; 6634 6635 nr_reclaimed += reclaimed; 6636 } 6637 6638 return nbytes; 6639 } 6640 6641 static struct cftype memory_files[] = { 6642 { 6643 .name = "current", 6644 .flags = CFTYPE_NOT_ON_ROOT, 6645 .read_u64 = memory_current_read, 6646 }, 6647 { 6648 .name = "peak", 6649 .flags = CFTYPE_NOT_ON_ROOT, 6650 .read_u64 = memory_peak_read, 6651 }, 6652 { 6653 .name = "min", 6654 .flags = CFTYPE_NOT_ON_ROOT, 6655 .seq_show = memory_min_show, 6656 .write = memory_min_write, 6657 }, 6658 { 6659 .name = "low", 6660 .flags = CFTYPE_NOT_ON_ROOT, 6661 .seq_show = memory_low_show, 6662 .write = memory_low_write, 6663 }, 6664 { 6665 .name = "high", 6666 .flags = CFTYPE_NOT_ON_ROOT, 6667 .seq_show = memory_high_show, 6668 .write = memory_high_write, 6669 }, 6670 { 6671 .name = "max", 6672 .flags = CFTYPE_NOT_ON_ROOT, 6673 .seq_show = memory_max_show, 6674 .write = memory_max_write, 6675 }, 6676 { 6677 .name = "events", 6678 .flags = CFTYPE_NOT_ON_ROOT, 6679 .file_offset = offsetof(struct mem_cgroup, events_file), 6680 .seq_show = memory_events_show, 6681 }, 6682 { 6683 .name = "events.local", 6684 .flags = CFTYPE_NOT_ON_ROOT, 6685 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6686 .seq_show = memory_events_local_show, 6687 }, 6688 { 6689 .name = "stat", 6690 .seq_show = memory_stat_show, 6691 }, 6692 #ifdef CONFIG_NUMA 6693 { 6694 .name = "numa_stat", 6695 .seq_show = memory_numa_stat_show, 6696 }, 6697 #endif 6698 { 6699 .name = "oom.group", 6700 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6701 .seq_show = memory_oom_group_show, 6702 .write = memory_oom_group_write, 6703 }, 6704 { 6705 .name = "reclaim", 6706 .flags = CFTYPE_NS_DELEGATABLE, 6707 .write = memory_reclaim, 6708 }, 6709 { } /* terminate */ 6710 }; 6711 6712 struct cgroup_subsys memory_cgrp_subsys = { 6713 .css_alloc = mem_cgroup_css_alloc, 6714 .css_online = mem_cgroup_css_online, 6715 .css_offline = mem_cgroup_css_offline, 6716 .css_released = mem_cgroup_css_released, 6717 .css_free = mem_cgroup_css_free, 6718 .css_reset = mem_cgroup_css_reset, 6719 .css_rstat_flush = mem_cgroup_css_rstat_flush, 6720 .can_attach = mem_cgroup_can_attach, 6721 .attach = mem_cgroup_attach, 6722 .cancel_attach = mem_cgroup_cancel_attach, 6723 .post_attach = mem_cgroup_move_task, 6724 .dfl_cftypes = memory_files, 6725 .legacy_cftypes = mem_cgroup_legacy_files, 6726 .early_init = 0, 6727 }; 6728 6729 /* 6730 * This function calculates an individual cgroup's effective 6731 * protection which is derived from its own memory.min/low, its 6732 * parent's and siblings' settings, as well as the actual memory 6733 * distribution in the tree. 6734 * 6735 * The following rules apply to the effective protection values: 6736 * 6737 * 1. At the first level of reclaim, effective protection is equal to 6738 * the declared protection in memory.min and memory.low. 6739 * 6740 * 2. 
To enable safe delegation of the protection configuration, at 6741 * subsequent levels the effective protection is capped to the 6742 * parent's effective protection. 6743 * 6744 * 3. To make complex and dynamic subtrees easier to configure, the 6745 * user is allowed to overcommit the declared protection at a given 6746 * level. If that is the case, the parent's effective protection is 6747 * distributed to the children in proportion to how much protection 6748 * they have declared and how much of it they are utilizing. 6749 * 6750 * This makes distribution proportional, but also work-conserving: 6751 * if one cgroup claims much more protection than the memory it uses, 6752 * the unused remainder is available to its siblings. 6753 * 6754 * 4. Conversely, when the declared protection is undercommitted at a 6755 * given level, the distribution of the larger parental protection 6756 * budget is NOT proportional. A cgroup's protection from a sibling 6757 * is capped to its own memory.min/low setting. 6758 * 6759 * 5. However, to allow protecting recursive subtrees from each other 6760 * without having to declare each individual cgroup's fixed share 6761 * of the ancestor's claim to protection, any unutilized - 6762 * "floating" - protection from up the tree is distributed in 6763 * proportion to each cgroup's *usage*. This makes the protection 6764 * neutral wrt sibling cgroups and lets them compete freely over 6765 * the shared parental protection budget, but it protects the 6766 * subtree as a whole from neighboring subtrees. 6767 * 6768 * Note that 4. and 5. are not in conflict: 4. is about protecting 6769 * against immediate siblings whereas 5. is about protecting against 6770 * neighboring subtrees. 6771 */ 6772 static unsigned long effective_protection(unsigned long usage, 6773 unsigned long parent_usage, 6774 unsigned long setting, 6775 unsigned long parent_effective, 6776 unsigned long siblings_protected) 6777 { 6778 unsigned long protected; 6779 unsigned long ep; 6780 6781 protected = min(usage, setting); 6782 /* 6783 * If all cgroups at this level combined claim and use more 6784 * protection than what the parent affords them, distribute 6785 * shares in proportion to utilization. 6786 * 6787 * We are using actual utilization rather than the statically 6788 * claimed protection in order to be work-conserving: claimed 6789 * but unused protection is available to siblings that would 6790 * otherwise get a smaller chunk than what they claimed. 6791 */ 6792 if (siblings_protected > parent_effective) 6793 return protected * parent_effective / siblings_protected; 6794 6795 /* 6796 * Ok, utilized protection of all children is within what the 6797 * parent affords them, so we know whatever this child claims 6798 * and utilizes is effectively protected. 6799 * 6800 * If there is unprotected usage beyond this value, reclaim 6801 * will apply pressure in proportion to that amount. 6802 * 6803 * If there is unutilized protection, the cgroup will be fully 6804 * shielded from reclaim, but we do return a smaller value for 6805 * protection than what the group could enjoy in theory. This 6806 * is okay. With the overcommit distribution above, effective 6807 * protection is always dependent on how memory is actually 6808 * consumed among the siblings anyway. 6809 */ 6810 ep = protected; 6811 6812 /* 6813 * If the children aren't claiming (all of) the protection 6814 * afforded to them by the parent, distribute the remainder in 6815 * proportion to the (unprotected) memory of each cgroup.
That 6816 * way, cgroups that aren't explicitly prioritized wrt each 6817 * other compete freely over the allowance, but they are 6818 * collectively protected from neighboring trees. 6819 * 6820 * We're using unprotected memory for the weight so that if 6821 * some cgroups DO claim explicit protection, we don't protect 6822 * the same bytes twice. 6823 * 6824 * Check both usage and parent_usage against the respective 6825 * protected values. One should imply the other, but they 6826 * aren't read atomically - make sure the division is sane. 6827 */ 6828 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6829 return ep; 6830 if (parent_effective > siblings_protected && 6831 parent_usage > siblings_protected && 6832 usage > protected) { 6833 unsigned long unclaimed; 6834 6835 unclaimed = parent_effective - siblings_protected; 6836 unclaimed *= usage - protected; 6837 unclaimed /= parent_usage - siblings_protected; 6838 6839 ep += unclaimed; 6840 } 6841 6842 return ep; 6843 } 6844 6845 /** 6846 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 6847 * @root: the top ancestor of the sub-tree being checked 6848 * @memcg: the memory cgroup to check 6849 * 6850 * WARNING: This function is not stateless! It can only be used as part 6851 * of a top-down tree iteration, not for isolated queries. 6852 */ 6853 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6854 struct mem_cgroup *memcg) 6855 { 6856 unsigned long usage, parent_usage; 6857 struct mem_cgroup *parent; 6858 6859 if (mem_cgroup_disabled()) 6860 return; 6861 6862 if (!root) 6863 root = root_mem_cgroup; 6864 6865 /* 6866 * Effective values of the reclaim targets are ignored so they 6867 * can be stale. Have a look at mem_cgroup_protection for more 6868 * details. 6869 * TODO: calculation should be more robust so that we do not need 6870 * that special casing. 
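 *
 * A rough sketch of the intended call pattern on the reclaim side
 * (illustrative only; see shrink_node_memcgs() in mm/vmscan.c for the
 * real, more involved version):
 *
 *	mem_cgroup_calculate_protection(target_memcg, memcg);
 *	if (mem_cgroup_below_min(memcg))
 *		continue;
 *	else if (mem_cgroup_below_low(memcg))
 *		memcg_memory_event(memcg, MEMCG_LOW);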
6871 */ 6872 if (memcg == root) 6873 return; 6874 6875 usage = page_counter_read(&memcg->memory); 6876 if (!usage) 6877 return; 6878 6879 parent = parent_mem_cgroup(memcg); 6880 6881 if (parent == root) { 6882 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6883 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6884 return; 6885 } 6886 6887 parent_usage = page_counter_read(&parent->memory); 6888 6889 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6890 READ_ONCE(memcg->memory.min), 6891 READ_ONCE(parent->memory.emin), 6892 atomic_long_read(&parent->memory.children_min_usage))); 6893 6894 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6895 READ_ONCE(memcg->memory.low), 6896 READ_ONCE(parent->memory.elow), 6897 atomic_long_read(&parent->memory.children_low_usage))); 6898 } 6899 6900 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 6901 gfp_t gfp) 6902 { 6903 long nr_pages = folio_nr_pages(folio); 6904 int ret; 6905 6906 ret = try_charge(memcg, gfp, nr_pages); 6907 if (ret) 6908 goto out; 6909 6910 css_get(&memcg->css); 6911 commit_charge(folio, memcg); 6912 6913 local_irq_disable(); 6914 mem_cgroup_charge_statistics(memcg, nr_pages); 6915 memcg_check_events(memcg, folio_nid(folio)); 6916 local_irq_enable(); 6917 out: 6918 return ret; 6919 } 6920 6921 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 6922 { 6923 struct mem_cgroup *memcg; 6924 int ret; 6925 6926 memcg = get_mem_cgroup_from_mm(mm); 6927 ret = charge_memcg(folio, memcg, gfp); 6928 css_put(&memcg->css); 6929 6930 return ret; 6931 } 6932 6933 /** 6934 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 6935 * @folio: folio to charge. 6936 * @mm: mm context of the victim 6937 * @gfp: reclaim mode 6938 * @entry: swap entry for which the folio is allocated 6939 * 6940 * This function charges a folio allocated for swapin. Please call this before 6941 * adding the folio to the swapcache. 6942 * 6943 * Returns 0 on success. Otherwise, an error code is returned. 6944 */ 6945 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 6946 gfp_t gfp, swp_entry_t entry) 6947 { 6948 struct mem_cgroup *memcg; 6949 unsigned short id; 6950 int ret; 6951 6952 if (mem_cgroup_disabled()) 6953 return 0; 6954 6955 id = lookup_swap_cgroup_id(entry); 6956 rcu_read_lock(); 6957 memcg = mem_cgroup_from_id(id); 6958 if (!memcg || !css_tryget_online(&memcg->css)) 6959 memcg = get_mem_cgroup_from_mm(mm); 6960 rcu_read_unlock(); 6961 6962 ret = charge_memcg(folio, memcg, gfp); 6963 6964 css_put(&memcg->css); 6965 return ret; 6966 } 6967 6968 /* 6969 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 6970 * @entry: swap entry for which the page is charged 6971 * 6972 * Call this function after successfully adding the charged page to swapcache. 6973 * 6974 * Note: This function assumes the page for which swap slot is being uncharged 6975 * is order 0 page. 6976 */ 6977 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 6978 { 6979 /* 6980 * Cgroup1's unified memory+swap counter has been charged with the 6981 * new swapcache page, finish the transfer by uncharging the swap 6982 * slot. The swap slot would also get uncharged when it dies, but 6983 * it can stick around indefinitely and we'd count the page twice 6984 * the entire time. 6985 * 6986 * Cgroup2 has separate resource counters for memory and swap, 6987 * so this is a non-issue here. 
Memory and swap charge lifetimes 6988 * correspond 1:1 to page and swap slot lifetimes: we charge the 6989 * page to memory here, and uncharge swap when the slot is freed. 6990 */ 6991 if (!mem_cgroup_disabled() && do_memsw_account()) { 6992 /* 6993 * The swap entry might not get freed for a long time, 6994 * let's not wait for it. The page already received a 6995 * memory+swap charge, drop the swap entry duplicate. 6996 */ 6997 mem_cgroup_uncharge_swap(entry, 1); 6998 } 6999 } 7000 7001 struct uncharge_gather { 7002 struct mem_cgroup *memcg; 7003 unsigned long nr_memory; 7004 unsigned long pgpgout; 7005 unsigned long nr_kmem; 7006 int nid; 7007 }; 7008 7009 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 7010 { 7011 memset(ug, 0, sizeof(*ug)); 7012 } 7013 7014 static void uncharge_batch(const struct uncharge_gather *ug) 7015 { 7016 unsigned long flags; 7017 7018 if (ug->nr_memory) { 7019 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 7020 if (do_memsw_account()) 7021 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 7022 if (ug->nr_kmem) 7023 memcg_account_kmem(ug->memcg, -ug->nr_kmem); 7024 memcg_oom_recover(ug->memcg); 7025 } 7026 7027 local_irq_save(flags); 7028 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 7029 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 7030 memcg_check_events(ug->memcg, ug->nid); 7031 local_irq_restore(flags); 7032 7033 /* drop reference from uncharge_folio */ 7034 css_put(&ug->memcg->css); 7035 } 7036 7037 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 7038 { 7039 long nr_pages; 7040 struct mem_cgroup *memcg; 7041 struct obj_cgroup *objcg; 7042 7043 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7044 7045 /* 7046 * Nobody should be changing or seriously looking at 7047 * folio memcg or objcg at this point, we have fully 7048 * exclusive access to the folio. 7049 */ 7050 if (folio_memcg_kmem(folio)) { 7051 objcg = __folio_objcg(folio); 7052 /* 7053 * This get matches the put at the end of the function and 7054 * kmem pages do not hold memcg references anymore. 7055 */ 7056 memcg = get_mem_cgroup_from_objcg(objcg); 7057 } else { 7058 memcg = __folio_memcg(folio); 7059 } 7060 7061 if (!memcg) 7062 return; 7063 7064 if (ug->memcg != memcg) { 7065 if (ug->memcg) { 7066 uncharge_batch(ug); 7067 uncharge_gather_clear(ug); 7068 } 7069 ug->memcg = memcg; 7070 ug->nid = folio_nid(folio); 7071 7072 /* pairs with css_put in uncharge_batch */ 7073 css_get(&memcg->css); 7074 } 7075 7076 nr_pages = folio_nr_pages(folio); 7077 7078 if (folio_memcg_kmem(folio)) { 7079 ug->nr_memory += nr_pages; 7080 ug->nr_kmem += nr_pages; 7081 7082 folio->memcg_data = 0; 7083 obj_cgroup_put(objcg); 7084 } else { 7085 /* LRU pages aren't accounted at the root level */ 7086 if (!mem_cgroup_is_root(memcg)) 7087 ug->nr_memory += nr_pages; 7088 ug->pgpgout++; 7089 7090 folio->memcg_data = 0; 7091 } 7092 7093 css_put(&memcg->css); 7094 } 7095 7096 void __mem_cgroup_uncharge(struct folio *folio) 7097 { 7098 struct uncharge_gather ug; 7099 7100 /* Don't touch folio->lru of any random page, pre-check: */ 7101 if (!folio_memcg(folio)) 7102 return; 7103 7104 uncharge_gather_clear(&ug); 7105 uncharge_folio(folio, &ug); 7106 uncharge_batch(&ug); 7107 } 7108 7109 /** 7110 * __mem_cgroup_uncharge_list - uncharge a list of page 7111 * @page_list: list of pages to uncharge 7112 * 7113 * Uncharge a list of pages previously charged with 7114 * __mem_cgroup_charge(). 
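 *
 * The folios must already be off the LRU; uncharge_folio() asserts
 * that folio_test_lru() is clear for each entry.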
7115 */ 7116 void __mem_cgroup_uncharge_list(struct list_head *page_list) 7117 { 7118 struct uncharge_gather ug; 7119 struct folio *folio; 7120 7121 uncharge_gather_clear(&ug); 7122 list_for_each_entry(folio, page_list, lru) 7123 uncharge_folio(folio, &ug); 7124 if (ug.memcg) 7125 uncharge_batch(&ug); 7126 } 7127 7128 /** 7129 * mem_cgroup_migrate - Charge a folio's replacement. 7130 * @old: Currently circulating folio. 7131 * @new: Replacement folio. 7132 * 7133 * Charge @new as a replacement folio for @old. @old will 7134 * be uncharged upon free. 7135 * 7136 * Both folios must be locked, @new->mapping must be set up. 7137 */ 7138 void mem_cgroup_migrate(struct folio *old, struct folio *new) 7139 { 7140 struct mem_cgroup *memcg; 7141 long nr_pages = folio_nr_pages(new); 7142 unsigned long flags; 7143 7144 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 7145 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 7146 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 7147 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 7148 7149 if (mem_cgroup_disabled()) 7150 return; 7151 7152 /* Page cache replacement: new folio already charged? */ 7153 if (folio_memcg(new)) 7154 return; 7155 7156 memcg = folio_memcg(old); 7157 VM_WARN_ON_ONCE_FOLIO(!memcg, old); 7158 if (!memcg) 7159 return; 7160 7161 /* Force-charge the new page. The old one will be freed soon */ 7162 if (!mem_cgroup_is_root(memcg)) { 7163 page_counter_charge(&memcg->memory, nr_pages); 7164 if (do_memsw_account()) 7165 page_counter_charge(&memcg->memsw, nr_pages); 7166 } 7167 7168 css_get(&memcg->css); 7169 commit_charge(new, memcg); 7170 7171 local_irq_save(flags); 7172 mem_cgroup_charge_statistics(memcg, nr_pages); 7173 memcg_check_events(memcg, folio_nid(new)); 7174 local_irq_restore(flags); 7175 } 7176 7177 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 7178 EXPORT_SYMBOL(memcg_sockets_enabled_key); 7179 7180 void mem_cgroup_sk_alloc(struct sock *sk) 7181 { 7182 struct mem_cgroup *memcg; 7183 7184 if (!mem_cgroup_sockets_enabled) 7185 return; 7186 7187 /* Do not associate the sock with unrelated interrupted task's memcg. */ 7188 if (!in_task()) 7189 return; 7190 7191 rcu_read_lock(); 7192 memcg = mem_cgroup_from_task(current); 7193 if (mem_cgroup_is_root(memcg)) 7194 goto out; 7195 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 7196 goto out; 7197 if (css_tryget(&memcg->css)) 7198 sk->sk_memcg = memcg; 7199 out: 7200 rcu_read_unlock(); 7201 } 7202 7203 void mem_cgroup_sk_free(struct sock *sk) 7204 { 7205 if (sk->sk_memcg) 7206 css_put(&sk->sk_memcg->css); 7207 } 7208 7209 /** 7210 * mem_cgroup_charge_skmem - charge socket memory 7211 * @memcg: memcg to charge 7212 * @nr_pages: number of pages to charge 7213 * @gfp_mask: reclaim mode 7214 * 7215 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7216 * @memcg's configured limit, %false if it doesn't. 
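 *
 * A rough caller-side sketch (illustrative only; the real users live
 * under net/): charge before committing the memory and treat a %false
 * return as memory pressure:
 *
 *	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 *	    !mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages, gfp_mask))
 *		goto suppress_allocation;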
7217 */ 7218 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 7219 gfp_t gfp_mask) 7220 { 7221 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7222 struct page_counter *fail; 7223 7224 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7225 memcg->tcpmem_pressure = 0; 7226 return true; 7227 } 7228 memcg->tcpmem_pressure = 1; 7229 if (gfp_mask & __GFP_NOFAIL) { 7230 page_counter_charge(&memcg->tcpmem, nr_pages); 7231 return true; 7232 } 7233 return false; 7234 } 7235 7236 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 7237 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7238 return true; 7239 } 7240 7241 return false; 7242 } 7243 7244 /** 7245 * mem_cgroup_uncharge_skmem - uncharge socket memory 7246 * @memcg: memcg to uncharge 7247 * @nr_pages: number of pages to uncharge 7248 */ 7249 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7250 { 7251 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7252 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7253 return; 7254 } 7255 7256 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7257 7258 refill_stock(memcg, nr_pages); 7259 } 7260 7261 static int __init cgroup_memory(char *s) 7262 { 7263 char *token; 7264 7265 while ((token = strsep(&s, ",")) != NULL) { 7266 if (!*token) 7267 continue; 7268 if (!strcmp(token, "nosocket")) 7269 cgroup_memory_nosocket = true; 7270 if (!strcmp(token, "nokmem")) 7271 cgroup_memory_nokmem = true; 7272 } 7273 return 1; 7274 } 7275 __setup("cgroup.memory=", cgroup_memory); 7276 7277 /* 7278 * subsys_initcall() for memory controller. 7279 * 7280 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7281 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7282 * basically everything that doesn't depend on a specific mem_cgroup structure 7283 * should be initialized from here. 7284 */ 7285 static int __init mem_cgroup_init(void) 7286 { 7287 int cpu, node; 7288 7289 /* 7290 * Currently s32 type (can refer to struct batched_lruvec_stat) is 7291 * used for per-memcg-per-cpu caching of per-node statistics. In order 7292 * to work fine, we should make sure that the overfill threshold can't 7293 * exceed S32_MAX / PAGE_SIZE. 7294 */ 7295 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); 7296 7297 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7298 memcg_hotplug_cpu_dead); 7299 7300 for_each_possible_cpu(cpu) 7301 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7302 drain_local_stock); 7303 7304 for_each_node(node) { 7305 struct mem_cgroup_tree_per_node *rtpn; 7306 7307 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7308 node_online(node) ? node : NUMA_NO_NODE); 7309 7310 rtpn->rb_root = RB_ROOT; 7311 rtpn->rb_rightmost = NULL; 7312 spin_lock_init(&rtpn->lock); 7313 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7314 } 7315 7316 return 0; 7317 } 7318 subsys_initcall(mem_cgroup_init); 7319 7320 #ifdef CONFIG_SWAP 7321 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7322 { 7323 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7324 /* 7325 * The root cgroup cannot be destroyed, so it's refcount must 7326 * always be >= 1. 
7327 */ 7328 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { 7329 VM_BUG_ON(1); 7330 break; 7331 } 7332 memcg = parent_mem_cgroup(memcg); 7333 if (!memcg) 7334 memcg = root_mem_cgroup; 7335 } 7336 return memcg; 7337 } 7338 7339 /** 7340 * mem_cgroup_swapout - transfer a memsw charge to swap 7341 * @folio: folio whose memsw charge to transfer 7342 * @entry: swap entry to move the charge to 7343 * 7344 * Transfer the memsw charge of @folio to @entry. 7345 */ 7346 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 7347 { 7348 struct mem_cgroup *memcg, *swap_memcg; 7349 unsigned int nr_entries; 7350 unsigned short oldid; 7351 7352 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7353 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 7354 7355 if (mem_cgroup_disabled()) 7356 return; 7357 7358 if (!do_memsw_account()) 7359 return; 7360 7361 memcg = folio_memcg(folio); 7362 7363 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7364 if (!memcg) 7365 return; 7366 7367 /* 7368 * In case the memcg owning these pages has been offlined and doesn't 7369 * have an ID allocated to it anymore, charge the closest online 7370 * ancestor for the swap instead and transfer the memory+swap charge. 7371 */ 7372 swap_memcg = mem_cgroup_id_get_online(memcg); 7373 nr_entries = folio_nr_pages(folio); 7374 /* Get references for the tail pages, too */ 7375 if (nr_entries > 1) 7376 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7377 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7378 nr_entries); 7379 VM_BUG_ON_FOLIO(oldid, folio); 7380 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7381 7382 folio->memcg_data = 0; 7383 7384 if (!mem_cgroup_is_root(memcg)) 7385 page_counter_uncharge(&memcg->memory, nr_entries); 7386 7387 if (memcg != swap_memcg) { 7388 if (!mem_cgroup_is_root(swap_memcg)) 7389 page_counter_charge(&swap_memcg->memsw, nr_entries); 7390 page_counter_uncharge(&memcg->memsw, nr_entries); 7391 } 7392 7393 /* 7394 * Interrupts should be disabled here because the caller holds the 7395 * i_pages lock which is taken with interrupts-off. It is 7396 * important here to have the interrupts disabled because it is the 7397 * only synchronisation we have for updating the per-CPU variables. 7398 */ 7399 memcg_stats_lock(); 7400 mem_cgroup_charge_statistics(memcg, -nr_entries); 7401 memcg_stats_unlock(); 7402 memcg_check_events(memcg, folio_nid(folio)); 7403 7404 css_put(&memcg->css); 7405 } 7406 7407 /** 7408 * __mem_cgroup_try_charge_swap - try charging swap space for a folio 7409 * @folio: folio being added to swap 7410 * @entry: swap entry to charge 7411 * 7412 * Try to charge @folio's memcg for the swap space at @entry. 7413 * 7414 * Returns 0 on success, -ENOMEM on failure. 
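 *
 * Note that in the cgroup1 memory+swap mode swap is already covered by
 * the memsw counter, so this returns 0 without charging anything (see
 * the do_memsw_account() check below).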
7415 */ 7416 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) 7417 { 7418 unsigned int nr_pages = folio_nr_pages(folio); 7419 struct page_counter *counter; 7420 struct mem_cgroup *memcg; 7421 unsigned short oldid; 7422 7423 if (do_memsw_account()) 7424 return 0; 7425 7426 memcg = folio_memcg(folio); 7427 7428 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7429 if (!memcg) 7430 return 0; 7431 7432 if (!entry.val) { 7433 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7434 return 0; 7435 } 7436 7437 memcg = mem_cgroup_id_get_online(memcg); 7438 7439 if (!mem_cgroup_is_root(memcg) && 7440 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7441 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7442 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7443 mem_cgroup_id_put(memcg); 7444 return -ENOMEM; 7445 } 7446 7447 /* Get references for the tail pages, too */ 7448 if (nr_pages > 1) 7449 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7450 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7451 VM_BUG_ON_FOLIO(oldid, folio); 7452 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7453 7454 return 0; 7455 } 7456 7457 /** 7458 * __mem_cgroup_uncharge_swap - uncharge swap space 7459 * @entry: swap entry to uncharge 7460 * @nr_pages: the amount of swap space to uncharge 7461 */ 7462 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7463 { 7464 struct mem_cgroup *memcg; 7465 unsigned short id; 7466 7467 if (mem_cgroup_disabled()) 7468 return; 7469 7470 id = swap_cgroup_record(entry, 0, nr_pages); 7471 rcu_read_lock(); 7472 memcg = mem_cgroup_from_id(id); 7473 if (memcg) { 7474 if (!mem_cgroup_is_root(memcg)) { 7475 if (do_memsw_account()) 7476 page_counter_uncharge(&memcg->memsw, nr_pages); 7477 else 7478 page_counter_uncharge(&memcg->swap, nr_pages); 7479 } 7480 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7481 mem_cgroup_id_put_many(memcg, nr_pages); 7482 } 7483 rcu_read_unlock(); 7484 } 7485 7486 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7487 { 7488 long nr_swap_pages = get_nr_swap_pages(); 7489 7490 if (mem_cgroup_disabled() || do_memsw_account()) 7491 return nr_swap_pages; 7492 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) 7493 nr_swap_pages = min_t(long, nr_swap_pages, 7494 READ_ONCE(memcg->swap.max) - 7495 page_counter_read(&memcg->swap)); 7496 return nr_swap_pages; 7497 } 7498 7499 bool mem_cgroup_swap_full(struct folio *folio) 7500 { 7501 struct mem_cgroup *memcg; 7502 7503 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 7504 7505 if (vm_swap_full()) 7506 return true; 7507 if (do_memsw_account()) 7508 return false; 7509 7510 memcg = folio_memcg(folio); 7511 if (!memcg) 7512 return false; 7513 7514 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 7515 unsigned long usage = page_counter_read(&memcg->swap); 7516 7517 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7518 usage * 2 >= READ_ONCE(memcg->swap.max)) 7519 return true; 7520 } 7521 7522 return false; 7523 } 7524 7525 static int __init setup_swap_account(char *s) 7526 { 7527 pr_warn_once("The swapaccount= commandline option is deprecated. 
" 7528 "Please report your usecase to linux-mm@kvack.org if you " 7529 "depend on this functionality.\n"); 7530 return 1; 7531 } 7532 __setup("swapaccount=", setup_swap_account); 7533 7534 static u64 swap_current_read(struct cgroup_subsys_state *css, 7535 struct cftype *cft) 7536 { 7537 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7538 7539 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7540 } 7541 7542 static int swap_high_show(struct seq_file *m, void *v) 7543 { 7544 return seq_puts_memcg_tunable(m, 7545 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7546 } 7547 7548 static ssize_t swap_high_write(struct kernfs_open_file *of, 7549 char *buf, size_t nbytes, loff_t off) 7550 { 7551 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7552 unsigned long high; 7553 int err; 7554 7555 buf = strstrip(buf); 7556 err = page_counter_memparse(buf, "max", &high); 7557 if (err) 7558 return err; 7559 7560 page_counter_set_high(&memcg->swap, high); 7561 7562 return nbytes; 7563 } 7564 7565 static int swap_max_show(struct seq_file *m, void *v) 7566 { 7567 return seq_puts_memcg_tunable(m, 7568 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7569 } 7570 7571 static ssize_t swap_max_write(struct kernfs_open_file *of, 7572 char *buf, size_t nbytes, loff_t off) 7573 { 7574 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7575 unsigned long max; 7576 int err; 7577 7578 buf = strstrip(buf); 7579 err = page_counter_memparse(buf, "max", &max); 7580 if (err) 7581 return err; 7582 7583 xchg(&memcg->swap.max, max); 7584 7585 return nbytes; 7586 } 7587 7588 static int swap_events_show(struct seq_file *m, void *v) 7589 { 7590 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7591 7592 seq_printf(m, "high %lu\n", 7593 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7594 seq_printf(m, "max %lu\n", 7595 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7596 seq_printf(m, "fail %lu\n", 7597 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7598 7599 return 0; 7600 } 7601 7602 static struct cftype swap_files[] = { 7603 { 7604 .name = "swap.current", 7605 .flags = CFTYPE_NOT_ON_ROOT, 7606 .read_u64 = swap_current_read, 7607 }, 7608 { 7609 .name = "swap.high", 7610 .flags = CFTYPE_NOT_ON_ROOT, 7611 .seq_show = swap_high_show, 7612 .write = swap_high_write, 7613 }, 7614 { 7615 .name = "swap.max", 7616 .flags = CFTYPE_NOT_ON_ROOT, 7617 .seq_show = swap_max_show, 7618 .write = swap_max_write, 7619 }, 7620 { 7621 .name = "swap.events", 7622 .flags = CFTYPE_NOT_ON_ROOT, 7623 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7624 .seq_show = swap_events_show, 7625 }, 7626 { } /* terminate */ 7627 }; 7628 7629 static struct cftype memsw_files[] = { 7630 { 7631 .name = "memsw.usage_in_bytes", 7632 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7633 .read_u64 = mem_cgroup_read_u64, 7634 }, 7635 { 7636 .name = "memsw.max_usage_in_bytes", 7637 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7638 .write = mem_cgroup_reset, 7639 .read_u64 = mem_cgroup_read_u64, 7640 }, 7641 { 7642 .name = "memsw.limit_in_bytes", 7643 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7644 .write = mem_cgroup_write, 7645 .read_u64 = mem_cgroup_read_u64, 7646 }, 7647 { 7648 .name = "memsw.failcnt", 7649 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7650 .write = mem_cgroup_reset, 7651 .read_u64 = mem_cgroup_read_u64, 7652 }, 7653 { }, /* terminate */ 7654 }; 7655 7656 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 7657 /** 7658 * obj_cgroup_may_zswap - check if 
this cgroup can zswap 7659 * @objcg: the object cgroup 7660 * 7661 * Check if the hierarchical zswap limit has been reached. 7662 * 7663 * This doesn't check for specific headroom, and it is not atomic 7664 * either. But with zswap, the size of the allocation is only known 7665 * once compression has occurred, and this optimistic pre-check avoids 7666 * spending cycles on compression when there is already no room left 7667 * or zswap is disabled altogether somewhere in the hierarchy. 7668 */ 7669 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) 7670 { 7671 struct mem_cgroup *memcg, *original_memcg; 7672 bool ret = true; 7673 7674 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7675 return true; 7676 7677 original_memcg = get_mem_cgroup_from_objcg(objcg); 7678 for (memcg = original_memcg; !mem_cgroup_is_root(memcg); 7679 memcg = parent_mem_cgroup(memcg)) { 7680 unsigned long max = READ_ONCE(memcg->zswap_max); 7681 unsigned long pages; 7682 7683 if (max == PAGE_COUNTER_MAX) 7684 continue; 7685 if (max == 0) { 7686 ret = false; 7687 break; 7688 } 7689 7690 cgroup_rstat_flush(memcg->css.cgroup); 7691 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; 7692 if (pages < max) 7693 continue; 7694 ret = false; 7695 break; 7696 } 7697 mem_cgroup_put(original_memcg); 7698 return ret; 7699 } 7700 7701 /** 7702 * obj_cgroup_charge_zswap - charge compression backend memory 7703 * @objcg: the object cgroup 7704 * @size: size of compressed object 7705 * 7706 * This forces the charge after obj_cgroup_may_zswap() allowed 7707 * compression and storage in zswap for this cgroup to go ahead. 7708 */ 7709 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) 7710 { 7711 struct mem_cgroup *memcg; 7712 7713 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7714 return; 7715 7716 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); 7717 7718 /* PF_MEMALLOC context, charging must succeed */ 7719 if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) 7720 VM_WARN_ON_ONCE(1); 7721 7722 rcu_read_lock(); 7723 memcg = obj_cgroup_memcg(objcg); 7724 mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); 7725 mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); 7726 rcu_read_unlock(); 7727 } 7728 7729 /** 7730 * obj_cgroup_uncharge_zswap - uncharge compression backend memory 7731 * @objcg: the object cgroup 7732 * @size: size of compressed object 7733 * 7734 * Uncharges zswap memory on page in.
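 *
 * @size must match what was previously charged for the same object via
 * obj_cgroup_charge_zswap(), so that the MEMCG_ZSWAP_B and
 * MEMCG_ZSWAPPED counters stay balanced.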
7735 */ 7736 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) 7737 { 7738 struct mem_cgroup *memcg; 7739 7740 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7741 return; 7742 7743 obj_cgroup_uncharge(objcg, size); 7744 7745 rcu_read_lock(); 7746 memcg = obj_cgroup_memcg(objcg); 7747 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); 7748 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); 7749 rcu_read_unlock(); 7750 } 7751 7752 static u64 zswap_current_read(struct cgroup_subsys_state *css, 7753 struct cftype *cft) 7754 { 7755 cgroup_rstat_flush(css->cgroup); 7756 return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); 7757 } 7758 7759 static int zswap_max_show(struct seq_file *m, void *v) 7760 { 7761 return seq_puts_memcg_tunable(m, 7762 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); 7763 } 7764 7765 static ssize_t zswap_max_write(struct kernfs_open_file *of, 7766 char *buf, size_t nbytes, loff_t off) 7767 { 7768 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7769 unsigned long max; 7770 int err; 7771 7772 buf = strstrip(buf); 7773 err = page_counter_memparse(buf, "max", &max); 7774 if (err) 7775 return err; 7776 7777 xchg(&memcg->zswap_max, max); 7778 7779 return nbytes; 7780 } 7781 7782 static struct cftype zswap_files[] = { 7783 { 7784 .name = "zswap.current", 7785 .flags = CFTYPE_NOT_ON_ROOT, 7786 .read_u64 = zswap_current_read, 7787 }, 7788 { 7789 .name = "zswap.max", 7790 .flags = CFTYPE_NOT_ON_ROOT, 7791 .seq_show = zswap_max_show, 7792 .write = zswap_max_write, 7793 }, 7794 { } /* terminate */ 7795 }; 7796 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ 7797 7798 static int __init mem_cgroup_swap_init(void) 7799 { 7800 if (mem_cgroup_disabled()) 7801 return 0; 7802 7803 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7804 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7805 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 7806 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); 7807 #endif 7808 return 0; 7809 } 7810 subsys_initcall(mem_cgroup_swap_init); 7811 7812 #endif /* CONFIG_SWAP */ 7813