// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal. This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#define CURRENT_OBJCG_UPDATE_BIT	0
#define CURRENT_OBJCG_UPDATE_FLAG	(1UL << CURRENT_OBJCG_UPDATE_BIT)

#ifdef CONFIG_MEMCG_KMEM
static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * As a result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
EXPORT_SYMBOL(memcg_kmem_online_key);

DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
#endif

/**
 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
 * @folio: folio of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @folio is returned. The returned css remains associated with @folio
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it should only be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	/* page_folio() is racy here, but the entire function is racy anyway */
	memcg = folio_memcg_check(page_folio(page));

	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
				   tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * the memcg is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
 * adding stats flushing in such codepaths is not desirable. So, to optimize the
 * flushing, the kernel does:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
 *    the rstat update tree grow unbounded.
 *
 * 2) Flush the stats synchronously on the reader side only when there are more
 *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets
 *    the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus), but
 *    only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
static u64 flush_next_time;

#define FLUSH_TIME (2UL*HZ)

/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
 * not rely on this as part of an acquired spinlock_t lock. These functions are
 * never used in hardirq context on PREEMPT_RT and therefore disabling
 * preemption is sufficient.
 */
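/*
 * Typical usage pattern (see e.g. __count_memcg_events() below):
 *
 *	memcg_stats_lock();
 *	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
 *	memcg_rstat_updated(memcg, count);
 *	memcg_stats_unlock();
 */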
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}

static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	unsigned int x;

	if (!val)
		return;

	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	x = __this_cpu_add_return(stats_updates, abs(val));
	if (x > MEMCG_CHARGE_BATCH) {
		/*
		 * If stats_flush_threshold exceeds the threshold
		 * (>num_online_cpus()), cgroup stats update will be triggered
		 * in __mem_cgroup_flush_stats(). Increasing this var further
		 * is redundant and simply adds overhead in atomic update.
		 */
		if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
			atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
		__this_cpu_write(stats_updates, 0);
	}
}

static void do_flush_stats(void)
{
	/*
	 * We always flush the entire tree, so concurrent flushers can just
	 * skip. This avoids a thundering herd problem on the rstat global lock
	 * from memcg flushers (e.g. reclaim, refault, etc).
	 */
	if (atomic_read(&stats_flush_ongoing) ||
	    atomic_xchg(&stats_flush_ongoing, 1))
		return;

	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);

	cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

	atomic_set(&stats_flush_threshold, 0);
	atomic_set(&stats_flush_ongoing, 0);
}

void mem_cgroup_flush_stats(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		do_flush_stats();
}

void mem_cgroup_flush_stats_ratelimited(void)
{
	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
		mem_cgroup_flush_stats();
}

static void flush_memcg_stats_dwork(struct work_struct *w)
{
	/*
	 * Always flush here so that flushing in latency-sensitive paths is
	 * as cheap as possible.
	 */
	do_flush_stats();
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	PGPGIN,
	PGPGOUT,
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	ZSWPIN,
	ZSWPOUT,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
	THP_SWPOUT,
	THP_SWPOUT_FALLBACK,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

static void init_memcg_events(void)
{
	int i;

	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
}

static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx] - 1;
}

struct memcg_vmstats_percpu {
	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_NR_STAT];
	unsigned long events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_NR_STAT];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Non-hierarchical (CPU aggregated) page state & events */
	long state_local[MEMCG_NR_STAT];
	unsigned long events_local[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_NR_STAT];
	unsigned long events_pending[NR_MEMCG_EVENTS];
};

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats->state[idx]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

static int memcg_page_state_unit(int item);

/*
 * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
 * up non-zero sub-page updates to 1 page as zero page updates are ignored.
 */
static int memcg_state_val_in_pages(int idx, int val)
{
	int unit = memcg_page_state_unit(idx);

	if (!val || unit == PAGE_SIZE)
		return val;
	else
		return max(val * unit / PAGE_SIZE, 1UL);
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	if (mem_cgroup_disabled())
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
}

/*
 * idx can be of type enum memcg_stat_item or node_stat_item.
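 * The returned value is the CPU-aggregated, non-hierarchical ("local")
 * count for this memcg, i.e. it does not include descendant cgroups.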
 */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x = READ_ONCE(memcg->vmstats->state_local[idx]);

#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The callers from rmap rely on disabled preemption because they never
	 * update their counter from in-interrupt context. For those counters
	 * we check that the update is never performed from an interrupt
	 * context, while other callers need to have interrupts disabled.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
		case NR_SHMEM_PMDMAPPED:
		case NR_FILE_PMDMAPPED:
			WARN_ON_ONCE(!in_task());
			break;
		default:
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
	memcg_stats_unlock();
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mod_lruvec_page_state);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int index = memcg_events_index(idx);

	if (mem_cgroup_disabled() || index < 0)
		return;

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[index], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;
	return READ_ONCE(memcg->vmstats->events[index]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	int index = memcg_events_index(event);

	if (index < 0)
		return 0;

	return READ_ONCE(memcg->vmstats->events_local[index]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, nid);
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
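	 *
	 * No reference is taken on the returned memcg; callers are expected
	 * to be in an RCU read-side critical section (or to otherwise pin
	 * the css), as get_mem_cgroup_from_mm() below does before calling
	 * css_tryget().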
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on a given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_current - Obtain a reference on the current task's memcg.
 */
struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

again:
	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (!css_tryget(&memcg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
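 *
 * For a plain walk over the whole tree, the for_each_mem_cgroup_tree()
 * and for_each_mem_cgroup() helpers above wrap this function, e.g.:
 *
 *	for_each_mem_cgroup_tree(iter, root)
 *		...	(use mem_cgroup_iter_break() on an early break)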
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * On start, join the current reclaim iteration cycle.
		 * Exit when a concurrent walker completes it.
		 */
		if (!prev)
			reclaim->generation = iter->generation;
		else if (reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	} else if (prev) {
		pos = prev;
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference. The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		if (css == &root->css || css_tryget(css)) {
			memcg = mem_cgroup_from_css(css);
			break;
		}
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					   struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (!mem_cgroup_is_root(last))
		__invalidate_reclaim_iterators(root_mem_cgroup,
					       dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop. Otherwise, it iterates
 * over all tasks.
 *
 * This function must not be called for the root memory cgroup.
 */
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			   int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
}

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);

	if (!memcg)
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
	else
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif

/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held.
 */
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irq(&lruvec->lru_lock);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
 * @flags: Pointer to irqsave flags.
 *
 * These functions are safe to use under any of the following conditions:
 * - folio locked
 * - folio_test_lru false
 * - folio_memcg_lock()
 * - folio frozen (refcount of 0)
 *
 * Return: The lruvec this folio is on with its lock held and interrupts
 * disabled.
 */
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
					 unsigned long *flags)
{
	struct lruvec *lruvec = folio_lruvec(folio);

	spin_lock_irqsave(&lruvec->lru_lock, *flags);
	lruvec_memcg_debug(lruvec, folio);

	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the
 * moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

struct memory_stat {
	const char *name;
	unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
	{ "anon", NR_ANON_MAPPED },
	{ "file", NR_FILE_PAGES },
	{ "kernel", MEMCG_KMEM },
	{ "kernel_stack", NR_KERNEL_STACK_KB },
	{ "pagetables", NR_PAGETABLE },
	{ "sec_pagetables", NR_SECONDARY_PAGETABLE },
	{ "percpu", MEMCG_PERCPU_B },
	{ "sock", MEMCG_SOCK },
	{ "vmalloc", MEMCG_VMALLOC },
	{ "shmem", NR_SHMEM },
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	{ "zswap", MEMCG_ZSWAP_B },
	{ "zswapped", MEMCG_ZSWAPPED },
#endif
	{ "file_mapped", NR_FILE_MAPPED },
	{ "file_dirty", NR_FILE_DIRTY },
	{ "file_writeback", NR_WRITEBACK },
#ifdef CONFIG_SWAP
	{ "swapcached", NR_SWAPCACHE },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp", NR_ANON_THPS },
	{ "file_thp", NR_FILE_THPS },
	{ "shmem_thp", NR_SHMEM_THPS },
#endif
	{ "inactive_anon", NR_INACTIVE_ANON },
	{ "active_anon", NR_ACTIVE_ANON },
	{ "inactive_file", NR_INACTIVE_FILE },
	{ "active_file", NR_ACTIVE_FILE },
	{ "unevictable", NR_UNEVICTABLE },
	{ "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file", WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};

/* The actual unit of the state item, not the same as the output unit */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
		return 1;
	case NR_KERNEL_STACK_KB:
		return SZ_1K;
	default:
		return PAGE_SIZE;
	}
}

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_output_unit(int item)
{
	/*
	 * Workingset state is actually in pages, but we export it to userspace
	 * as a scalar count of events, so special case it here.
	 */
	switch (item) {
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	default:
		return memcg_page_state_unit(item);
	}
}

static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	return memcg_page_state(memcg, item) *
		memcg_page_state_output_unit(item);
}

static inline unsigned long memcg_page_state_local_output(
		struct mem_cgroup *memcg, int item)
{
	return memcg_page_state_local(memcg, item) *
		memcg_page_state_output_unit(item);
}

static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	int i;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats();

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);

		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;

		seq_buf_printf(s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}

	/* The above should easily fit into one page */
	WARN_ON_ONCE(seq_buf_has_overflowed(s));
}

static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);

static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg_stat_format(memcg, s);
	else
		memcg1_stat_format(memcg, s);
	WARN_ON_ONCE(seq_buf_has_overflowed(s));
}

/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	/* Use a static buffer, since the caller is holding oom_lock. */
	static char buf[PAGE_SIZE];
	struct seq_buf s;

	lockdep_assert_held(&oom_lock);

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
	else {
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	seq_buf_init(&s, buf, sizeof(buf));
	memory_stat_format(memcg, &s);
	seq_buf_do_printk(&s, KERN_INFO);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (do_memsw_account()) {
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	} else {
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	}
	return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we do
				 * not reclaim too much, nor so little that we
				 * keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to clean up what we already set up, up to the failing
		 * subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows, because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom().
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async
	 * victim memory reclaim (oom_reaper), which means that we are not
	 * solely relying on the oom victim to make forward progress, so we
	 * can invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
2002 */ 2003 if (READ_ONCE(memcg->oom_kill_disable)) { 2004 if (current->in_user_fault) { 2005 css_get(&memcg->css); 2006 current->memcg_in_oom = memcg; 2007 current->memcg_oom_gfp_mask = mask; 2008 current->memcg_oom_order = order; 2009 } 2010 return false; 2011 } 2012 2013 mem_cgroup_mark_under_oom(memcg); 2014 2015 locked = mem_cgroup_oom_trylock(memcg); 2016 2017 if (locked) 2018 mem_cgroup_oom_notify(memcg); 2019 2020 mem_cgroup_unmark_under_oom(memcg); 2021 ret = mem_cgroup_out_of_memory(memcg, mask, order); 2022 2023 if (locked) 2024 mem_cgroup_oom_unlock(memcg); 2025 2026 return ret; 2027 } 2028 2029 /** 2030 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2031 * @handle: actually kill/wait or just clean up the OOM state 2032 * 2033 * This has to be called at the end of a page fault if the memcg OOM 2034 * handler was enabled. 2035 * 2036 * Memcg supports userspace OOM handling where failed allocations must 2037 * sleep on a waitqueue until the userspace task resolves the 2038 * situation. Sleeping directly in the charge context with all kinds 2039 * of locks held is not a good idea, instead we remember an OOM state 2040 * in the task and mem_cgroup_oom_synchronize() has to be called at 2041 * the end of the page fault to complete the OOM handling. 2042 * 2043 * Returns %true if an ongoing memcg OOM situation was detected and 2044 * completed, %false otherwise. 2045 */ 2046 bool mem_cgroup_oom_synchronize(bool handle) 2047 { 2048 struct mem_cgroup *memcg = current->memcg_in_oom; 2049 struct oom_wait_info owait; 2050 bool locked; 2051 2052 /* OOM is global, do not handle */ 2053 if (!memcg) 2054 return false; 2055 2056 if (!handle) 2057 goto cleanup; 2058 2059 owait.memcg = memcg; 2060 owait.wait.flags = 0; 2061 owait.wait.func = memcg_oom_wake_function; 2062 owait.wait.private = current; 2063 INIT_LIST_HEAD(&owait.wait.entry); 2064 2065 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2066 mem_cgroup_mark_under_oom(memcg); 2067 2068 locked = mem_cgroup_oom_trylock(memcg); 2069 2070 if (locked) 2071 mem_cgroup_oom_notify(memcg); 2072 2073 schedule(); 2074 mem_cgroup_unmark_under_oom(memcg); 2075 finish_wait(&memcg_oom_waitq, &owait.wait); 2076 2077 if (locked) 2078 mem_cgroup_oom_unlock(memcg); 2079 cleanup: 2080 current->memcg_in_oom = NULL; 2081 css_put(&memcg->css); 2082 return true; 2083 } 2084 2085 /** 2086 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2087 * @victim: task to be killed by the OOM killer 2088 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2089 * 2090 * Returns a pointer to a memory cgroup, which has to be cleaned up 2091 * by killing all belonging OOM-killable tasks. 2092 * 2093 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 2094 */ 2095 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2096 struct mem_cgroup *oom_domain) 2097 { 2098 struct mem_cgroup *oom_group = NULL; 2099 struct mem_cgroup *memcg; 2100 2101 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2102 return NULL; 2103 2104 if (!oom_domain) 2105 oom_domain = root_mem_cgroup; 2106 2107 rcu_read_lock(); 2108 2109 memcg = mem_cgroup_from_task(victim); 2110 if (mem_cgroup_is_root(memcg)) 2111 goto out; 2112 2113 /* 2114 * If the victim task has been asynchronously moved to a different 2115 * memory cgroup, we might end up killing tasks outside oom_domain. 2116 * In this case it's better to ignore memory.group.oom. 
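 * Leaving oom_group at NULL here means no group is cleaned up, so only
 * the already selected victim gets killed.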
2117 */ 2118 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2119 goto out; 2120 2121 /* 2122 * Traverse the memory cgroup hierarchy from the victim task's 2123 * cgroup up to the OOMing cgroup (or root) to find the 2124 * highest-level memory cgroup with oom.group set. 2125 */ 2126 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2127 if (READ_ONCE(memcg->oom_group)) 2128 oom_group = memcg; 2129 2130 if (memcg == oom_domain) 2131 break; 2132 } 2133 2134 if (oom_group) 2135 css_get(&oom_group->css); 2136 out: 2137 rcu_read_unlock(); 2138 2139 return oom_group; 2140 } 2141 2142 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2143 { 2144 pr_info("Tasks in "); 2145 pr_cont_cgroup_path(memcg->css.cgroup); 2146 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2147 } 2148 2149 /** 2150 * folio_memcg_lock - Bind a folio to its memcg. 2151 * @folio: The folio. 2152 * 2153 * This function prevents unlocked LRU folios from being moved to 2154 * another cgroup. 2155 * 2156 * It ensures lifetime of the bound memcg. The caller is responsible 2157 * for the lifetime of the folio. 2158 */ 2159 void folio_memcg_lock(struct folio *folio) 2160 { 2161 struct mem_cgroup *memcg; 2162 unsigned long flags; 2163 2164 /* 2165 * The RCU lock is held throughout the transaction. The fast 2166 * path can get away without acquiring the memcg->move_lock 2167 * because page moving starts with an RCU grace period. 2168 */ 2169 rcu_read_lock(); 2170 2171 if (mem_cgroup_disabled()) 2172 return; 2173 again: 2174 memcg = folio_memcg(folio); 2175 if (unlikely(!memcg)) 2176 return; 2177 2178 #ifdef CONFIG_PROVE_LOCKING 2179 local_irq_save(flags); 2180 might_lock(&memcg->move_lock); 2181 local_irq_restore(flags); 2182 #endif 2183 2184 if (atomic_read(&memcg->moving_account) <= 0) 2185 return; 2186 2187 spin_lock_irqsave(&memcg->move_lock, flags); 2188 if (memcg != folio_memcg(folio)) { 2189 spin_unlock_irqrestore(&memcg->move_lock, flags); 2190 goto again; 2191 } 2192 2193 /* 2194 * When charge migration first begins, we can have multiple 2195 * critical sections holding the fast-path RCU lock and one 2196 * holding the slowpath move_lock. Track the task who has the 2197 * move_lock for folio_memcg_unlock(). 2198 */ 2199 memcg->move_lock_task = current; 2200 memcg->move_lock_flags = flags; 2201 } 2202 2203 static void __folio_memcg_unlock(struct mem_cgroup *memcg) 2204 { 2205 if (memcg && memcg->move_lock_task == current) { 2206 unsigned long flags = memcg->move_lock_flags; 2207 2208 memcg->move_lock_task = NULL; 2209 memcg->move_lock_flags = 0; 2210 2211 spin_unlock_irqrestore(&memcg->move_lock, flags); 2212 } 2213 2214 rcu_read_unlock(); 2215 } 2216 2217 /** 2218 * folio_memcg_unlock - Release the binding between a folio and its memcg. 2219 * @folio: The folio. 2220 * 2221 * This releases the binding created by folio_memcg_lock(). This does 2222 * not change the accounting of this folio to its memcg, but it does 2223 * permit others to change it. 
2224 */ 2225 void folio_memcg_unlock(struct folio *folio) 2226 { 2227 __folio_memcg_unlock(folio_memcg(folio)); 2228 } 2229 2230 struct memcg_stock_pcp { 2231 local_lock_t stock_lock; 2232 struct mem_cgroup *cached; /* this never be root cgroup */ 2233 unsigned int nr_pages; 2234 2235 #ifdef CONFIG_MEMCG_KMEM 2236 struct obj_cgroup *cached_objcg; 2237 struct pglist_data *cached_pgdat; 2238 unsigned int nr_bytes; 2239 int nr_slab_reclaimable_b; 2240 int nr_slab_unreclaimable_b; 2241 #endif 2242 2243 struct work_struct work; 2244 unsigned long flags; 2245 #define FLUSHING_CACHED_CHARGE 0 2246 }; 2247 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { 2248 .stock_lock = INIT_LOCAL_LOCK(stock_lock), 2249 }; 2250 static DEFINE_MUTEX(percpu_charge_mutex); 2251 2252 #ifdef CONFIG_MEMCG_KMEM 2253 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); 2254 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2255 struct mem_cgroup *root_memcg); 2256 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); 2257 2258 #else 2259 static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 2260 { 2261 return NULL; 2262 } 2263 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2264 struct mem_cgroup *root_memcg) 2265 { 2266 return false; 2267 } 2268 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2269 { 2270 } 2271 #endif 2272 2273 /** 2274 * consume_stock: Try to consume stocked charge on this cpu. 2275 * @memcg: memcg to consume from. 2276 * @nr_pages: how many pages to charge. 2277 * 2278 * The charges will only happen if @memcg matches the current cpu's memcg 2279 * stock, and at least @nr_pages are available in that stock. Failure to 2280 * service an allocation will refill the stock. 2281 * 2282 * returns true if successful, false otherwise. 2283 */ 2284 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2285 { 2286 struct memcg_stock_pcp *stock; 2287 unsigned long flags; 2288 bool ret = false; 2289 2290 if (nr_pages > MEMCG_CHARGE_BATCH) 2291 return ret; 2292 2293 local_lock_irqsave(&memcg_stock.stock_lock, flags); 2294 2295 stock = this_cpu_ptr(&memcg_stock); 2296 if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { 2297 stock->nr_pages -= nr_pages; 2298 ret = true; 2299 } 2300 2301 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2302 2303 return ret; 2304 } 2305 2306 /* 2307 * Returns stocks cached in percpu and reset cached information. 2308 */ 2309 static void drain_stock(struct memcg_stock_pcp *stock) 2310 { 2311 struct mem_cgroup *old = READ_ONCE(stock->cached); 2312 2313 if (!old) 2314 return; 2315 2316 if (stock->nr_pages) { 2317 page_counter_uncharge(&old->memory, stock->nr_pages); 2318 if (do_memsw_account()) 2319 page_counter_uncharge(&old->memsw, stock->nr_pages); 2320 stock->nr_pages = 0; 2321 } 2322 2323 css_put(&old->css); 2324 WRITE_ONCE(stock->cached, NULL); 2325 } 2326 2327 static void drain_local_stock(struct work_struct *dummy) 2328 { 2329 struct memcg_stock_pcp *stock; 2330 struct obj_cgroup *old = NULL; 2331 unsigned long flags; 2332 2333 /* 2334 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. 
2335 * drain_stock races is that we always operate on the local CPU stock
2336 * here with IRQs disabled.
2337 */
2338 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2339
2340 stock = this_cpu_ptr(&memcg_stock);
2341 old = drain_obj_stock(stock);
2342 drain_stock(stock);
2343 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2344
2345 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2346 if (old)
2347 obj_cgroup_put(old);
2348 }
2349
2350 /*
2351 * Cache charges (nr_pages) in the local per-CPU area.
2352 * They will be consumed by consume_stock() later.
2353 */
2354 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2355 {
2356 struct memcg_stock_pcp *stock;
2357
2358 stock = this_cpu_ptr(&memcg_stock);
2359 if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
2360 drain_stock(stock);
2361 css_get(&memcg->css);
2362 WRITE_ONCE(stock->cached, memcg);
2363 }
2364 stock->nr_pages += nr_pages;
2365
2366 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2367 drain_stock(stock);
2368 }
2369
2370 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2371 {
2372 unsigned long flags;
2373
2374 local_lock_irqsave(&memcg_stock.stock_lock, flags);
2375 __refill_stock(memcg, nr_pages);
2376 local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2377 }
2378
2379 /*
2380 * Drain all per-CPU charge caches for the given root_memcg and the
2381 * subtree of the hierarchy under it.
2382 */
2383 static void drain_all_stock(struct mem_cgroup *root_memcg)
2384 {
2385 int cpu, curcpu;
2386
2387 /* If someone's already draining, avoid adding more workers. */
2388 if (!mutex_trylock(&percpu_charge_mutex))
2389 return;
2390 /*
2391 * Notify other CPUs that a system-wide "drain" is running.
2392 * We do not care about races with cpu hotplug because cpu down
2393 * as well as workers from this path always operate on the local
2394 * per-cpu data. CPU up doesn't touch memcg_stock at all.
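 * Note that stocks on isolated CPUs are not flushed here; only the local
 * CPU's stock and stocks on non-isolated CPUs are drained below.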
2395 */ 2396 migrate_disable(); 2397 curcpu = smp_processor_id(); 2398 for_each_online_cpu(cpu) { 2399 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2400 struct mem_cgroup *memcg; 2401 bool flush = false; 2402 2403 rcu_read_lock(); 2404 memcg = READ_ONCE(stock->cached); 2405 if (memcg && stock->nr_pages && 2406 mem_cgroup_is_descendant(memcg, root_memcg)) 2407 flush = true; 2408 else if (obj_stock_flush_required(stock, root_memcg)) 2409 flush = true; 2410 rcu_read_unlock(); 2411 2412 if (flush && 2413 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2414 if (cpu == curcpu) 2415 drain_local_stock(&stock->work); 2416 else if (!cpu_is_isolated(cpu)) 2417 schedule_work_on(cpu, &stock->work); 2418 } 2419 } 2420 migrate_enable(); 2421 mutex_unlock(&percpu_charge_mutex); 2422 } 2423 2424 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2425 { 2426 struct memcg_stock_pcp *stock; 2427 2428 stock = &per_cpu(memcg_stock, cpu); 2429 drain_stock(stock); 2430 2431 return 0; 2432 } 2433 2434 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2435 unsigned int nr_pages, 2436 gfp_t gfp_mask) 2437 { 2438 unsigned long nr_reclaimed = 0; 2439 2440 do { 2441 unsigned long pflags; 2442 2443 if (page_counter_read(&memcg->memory) <= 2444 READ_ONCE(memcg->memory.high)) 2445 continue; 2446 2447 memcg_memory_event(memcg, MEMCG_HIGH); 2448 2449 psi_memstall_enter(&pflags); 2450 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2451 gfp_mask, 2452 MEMCG_RECLAIM_MAY_SWAP); 2453 psi_memstall_leave(&pflags); 2454 } while ((memcg = parent_mem_cgroup(memcg)) && 2455 !mem_cgroup_is_root(memcg)); 2456 2457 return nr_reclaimed; 2458 } 2459 2460 static void high_work_func(struct work_struct *work) 2461 { 2462 struct mem_cgroup *memcg; 2463 2464 memcg = container_of(work, struct mem_cgroup, high_work); 2465 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2466 } 2467 2468 /* 2469 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2470 * enough to still cause a significant slowdown in most cases, while still 2471 * allowing diagnostics and tracing to proceed without becoming stuck. 2472 */ 2473 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2474 2475 /* 2476 * When calculating the delay, we use these either side of the exponentiation to 2477 * maintain precision and scale to a reasonable number of jiffies (see the table 2478 * below. 2479 * 2480 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2481 * overage ratio to a delay. 2482 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2483 * proposed penalty in order to reduce to a reasonable number of jiffies, and 2484 * to produce a reasonable delay curve. 2485 * 2486 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2487 * reasonable delay curve compared to precision-adjusted overage, not 2488 * penalising heavily at first, but still making sure that growth beyond the 2489 * limit penalises misbehaviour cgroups by slowing them down exponentially. 
For 2490 * example, with a high of 100 megabytes: 2491 * 2492 * +-------+------------------------+ 2493 * | usage | time to allocate in ms | 2494 * +-------+------------------------+ 2495 * | 100M | 0 | 2496 * | 101M | 6 | 2497 * | 102M | 25 | 2498 * | 103M | 57 | 2499 * | 104M | 102 | 2500 * | 105M | 159 | 2501 * | 106M | 230 | 2502 * | 107M | 313 | 2503 * | 108M | 409 | 2504 * | 109M | 518 | 2505 * | 110M | 639 | 2506 * | 111M | 774 | 2507 * | 112M | 921 | 2508 * | 113M | 1081 | 2509 * | 114M | 1254 | 2510 * | 115M | 1439 | 2511 * | 116M | 1638 | 2512 * | 117M | 1849 | 2513 * | 118M | 2000 | 2514 * | 119M | 2000 | 2515 * | 120M | 2000 | 2516 * +-------+------------------------+ 2517 */ 2518 #define MEMCG_DELAY_PRECISION_SHIFT 20 2519 #define MEMCG_DELAY_SCALING_SHIFT 14 2520 2521 static u64 calculate_overage(unsigned long usage, unsigned long high) 2522 { 2523 u64 overage; 2524 2525 if (usage <= high) 2526 return 0; 2527 2528 /* 2529 * Prevent division by 0 in overage calculation by acting as if 2530 * it was a threshold of 1 page 2531 */ 2532 high = max(high, 1UL); 2533 2534 overage = usage - high; 2535 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2536 return div64_u64(overage, high); 2537 } 2538 2539 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2540 { 2541 u64 overage, max_overage = 0; 2542 2543 do { 2544 overage = calculate_overage(page_counter_read(&memcg->memory), 2545 READ_ONCE(memcg->memory.high)); 2546 max_overage = max(overage, max_overage); 2547 } while ((memcg = parent_mem_cgroup(memcg)) && 2548 !mem_cgroup_is_root(memcg)); 2549 2550 return max_overage; 2551 } 2552 2553 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2554 { 2555 u64 overage, max_overage = 0; 2556 2557 do { 2558 overage = calculate_overage(page_counter_read(&memcg->swap), 2559 READ_ONCE(memcg->swap.high)); 2560 if (overage) 2561 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2562 max_overage = max(overage, max_overage); 2563 } while ((memcg = parent_mem_cgroup(memcg)) && 2564 !mem_cgroup_is_root(memcg)); 2565 2566 return max_overage; 2567 } 2568 2569 /* 2570 * Get the number of jiffies that we should penalise a mischievous cgroup which 2571 * is exceeding its memory.high by checking both it and its ancestors. 2572 */ 2573 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2574 unsigned int nr_pages, 2575 u64 max_overage) 2576 { 2577 unsigned long penalty_jiffies; 2578 2579 if (!max_overage) 2580 return 0; 2581 2582 /* 2583 * We use overage compared to memory.high to calculate the number of 2584 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2585 * fairly lenient on small overages, and increasingly harsh when the 2586 * memcg in question makes it clear that it has no intention of stopping 2587 * its crazy behaviour, so we exponentially increase the delay based on 2588 * overage amount. 2589 */ 2590 penalty_jiffies = max_overage * max_overage * HZ; 2591 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2592 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2593 2594 /* 2595 * Factor in the task's own contribution to the overage, such that four 2596 * N-sized allocations are throttled approximately the same as one 2597 * 4N-sized allocation. 2598 * 2599 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2600 * larger the current charge patch is than that. 
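 * For example, at 110M of usage against a 100M high (see the table above),
 * a charge of a full MEMCG_CHARGE_BATCH accrues the full ~639ms penalty,
 * while a single-page charge accrues only 1/MEMCG_CHARGE_BATCH of it.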
2601 */ 2602 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2603 } 2604 2605 /* 2606 * Scheduled by try_charge() to be executed from the userland return path 2607 * and reclaims memory over the high limit. 2608 */ 2609 void mem_cgroup_handle_over_high(gfp_t gfp_mask) 2610 { 2611 unsigned long penalty_jiffies; 2612 unsigned long pflags; 2613 unsigned long nr_reclaimed; 2614 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2615 int nr_retries = MAX_RECLAIM_RETRIES; 2616 struct mem_cgroup *memcg; 2617 bool in_retry = false; 2618 2619 if (likely(!nr_pages)) 2620 return; 2621 2622 memcg = get_mem_cgroup_from_mm(current->mm); 2623 current->memcg_nr_pages_over_high = 0; 2624 2625 retry_reclaim: 2626 /* 2627 * The allocating task should reclaim at least the batch size, but for 2628 * subsequent retries we only want to do what's necessary to prevent oom 2629 * or breaching resource isolation. 2630 * 2631 * This is distinct from memory.max or page allocator behaviour because 2632 * memory.high is currently batched, whereas memory.max and the page 2633 * allocator run every time an allocation is made. 2634 */ 2635 nr_reclaimed = reclaim_high(memcg, 2636 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2637 gfp_mask); 2638 2639 /* 2640 * memory.high is breached and reclaim is unable to keep up. Throttle 2641 * allocators proactively to slow down excessive growth. 2642 */ 2643 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2644 mem_find_max_overage(memcg)); 2645 2646 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2647 swap_find_max_overage(memcg)); 2648 2649 /* 2650 * Clamp the max delay per usermode return so as to still keep the 2651 * application moving forwards and also permit diagnostics, albeit 2652 * extremely slowly. 2653 */ 2654 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2655 2656 /* 2657 * Don't sleep if the amount of jiffies this memcg owes us is so low 2658 * that it's not even worth doing, in an attempt to be nice to those who 2659 * go only a small amount over their memory.high value and maybe haven't 2660 * been aggressively reclaimed enough yet. 2661 */ 2662 if (penalty_jiffies <= HZ / 100) 2663 goto out; 2664 2665 /* 2666 * If reclaim is making forward progress but we're still over 2667 * memory.high, we want to encourage that rather than doing allocator 2668 * throttling. 2669 */ 2670 if (nr_reclaimed || nr_retries--) { 2671 in_retry = true; 2672 goto retry_reclaim; 2673 } 2674 2675 /* 2676 * If we exit early, we're guaranteed to die (since 2677 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2678 * need to account for any ill-begotten jiffies to pay them off later. 
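 * The sleep below is wrapped in psi_memstall_enter()/psi_memstall_leave()
 * so the stall is accounted as memory pressure.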
2679 */ 2680 psi_memstall_enter(&pflags); 2681 schedule_timeout_killable(penalty_jiffies); 2682 psi_memstall_leave(&pflags); 2683 2684 out: 2685 css_put(&memcg->css); 2686 } 2687 2688 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 2689 unsigned int nr_pages) 2690 { 2691 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2692 int nr_retries = MAX_RECLAIM_RETRIES; 2693 struct mem_cgroup *mem_over_limit; 2694 struct page_counter *counter; 2695 unsigned long nr_reclaimed; 2696 bool passed_oom = false; 2697 unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; 2698 bool drained = false; 2699 bool raised_max_event = false; 2700 unsigned long pflags; 2701 2702 retry: 2703 if (consume_stock(memcg, nr_pages)) 2704 return 0; 2705 2706 if (!do_memsw_account() || 2707 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2708 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2709 goto done_restock; 2710 if (do_memsw_account()) 2711 page_counter_uncharge(&memcg->memsw, batch); 2712 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2713 } else { 2714 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2715 reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; 2716 } 2717 2718 if (batch > nr_pages) { 2719 batch = nr_pages; 2720 goto retry; 2721 } 2722 2723 /* 2724 * Prevent unbounded recursion when reclaim operations need to 2725 * allocate memory. This might exceed the limits temporarily, 2726 * but we prefer facilitating memory reclaim and getting back 2727 * under the limit over triggering OOM kills in these cases. 2728 */ 2729 if (unlikely(current->flags & PF_MEMALLOC)) 2730 goto force; 2731 2732 if (unlikely(task_in_memcg_oom(current))) 2733 goto nomem; 2734 2735 if (!gfpflags_allow_blocking(gfp_mask)) 2736 goto nomem; 2737 2738 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2739 raised_max_event = true; 2740 2741 psi_memstall_enter(&pflags); 2742 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2743 gfp_mask, reclaim_options); 2744 psi_memstall_leave(&pflags); 2745 2746 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2747 goto retry; 2748 2749 if (!drained) { 2750 drain_all_stock(mem_over_limit); 2751 drained = true; 2752 goto retry; 2753 } 2754 2755 if (gfp_mask & __GFP_NORETRY) 2756 goto nomem; 2757 /* 2758 * Even though the limit is exceeded at this point, reclaim 2759 * may have been able to free some pages. Retry the charge 2760 * before killing the task. 2761 * 2762 * Only for regular pages, though: huge pages are rather 2763 * unlikely to succeed so close to the limit, and we fall back 2764 * to regular pages anyway in case of failure. 2765 */ 2766 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2767 goto retry; 2768 /* 2769 * At task move, charge accounts can be doubly counted. So, it's 2770 * better to wait until the end of task_move if something is going on. 2771 */ 2772 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2773 goto retry; 2774 2775 if (nr_retries--) 2776 goto retry; 2777 2778 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2779 goto nomem; 2780 2781 /* Avoid endless loop for tasks bypassed by the oom killer */ 2782 if (passed_oom && task_is_dying()) 2783 goto nomem; 2784 2785 /* 2786 * keep retrying as long as the memcg oom killer is able to make 2787 * a forward progress or bypass the charge if the oom killer 2788 * couldn't make any progress. 
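 * A successful kill also resets nr_retries, giving the charge another
 * full round of reclaim attempts before we give up.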
2789 */ 2790 if (mem_cgroup_oom(mem_over_limit, gfp_mask, 2791 get_order(nr_pages * PAGE_SIZE))) { 2792 passed_oom = true; 2793 nr_retries = MAX_RECLAIM_RETRIES; 2794 goto retry; 2795 } 2796 nomem: 2797 /* 2798 * Memcg doesn't have a dedicated reserve for atomic 2799 * allocations. But like the global atomic pool, we need to 2800 * put the burden of reclaim on regular allocation requests 2801 * and let these go through as privileged allocations. 2802 */ 2803 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 2804 return -ENOMEM; 2805 force: 2806 /* 2807 * If the allocation has to be enforced, don't forget to raise 2808 * a MEMCG_MAX event. 2809 */ 2810 if (!raised_max_event) 2811 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2812 2813 /* 2814 * The allocation either can't fail or will lead to more memory 2815 * being freed very soon. Allow memory usage go over the limit 2816 * temporarily by force charging it. 2817 */ 2818 page_counter_charge(&memcg->memory, nr_pages); 2819 if (do_memsw_account()) 2820 page_counter_charge(&memcg->memsw, nr_pages); 2821 2822 return 0; 2823 2824 done_restock: 2825 if (batch > nr_pages) 2826 refill_stock(memcg, batch - nr_pages); 2827 2828 /* 2829 * If the hierarchy is above the normal consumption range, schedule 2830 * reclaim on returning to userland. We can perform reclaim here 2831 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2832 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2833 * not recorded as it most likely matches current's and won't 2834 * change in the meantime. As high limit is checked again before 2835 * reclaim, the cost of mismatch is negligible. 2836 */ 2837 do { 2838 bool mem_high, swap_high; 2839 2840 mem_high = page_counter_read(&memcg->memory) > 2841 READ_ONCE(memcg->memory.high); 2842 swap_high = page_counter_read(&memcg->swap) > 2843 READ_ONCE(memcg->swap.high); 2844 2845 /* Don't bother a random interrupted task */ 2846 if (!in_task()) { 2847 if (mem_high) { 2848 schedule_work(&memcg->high_work); 2849 break; 2850 } 2851 continue; 2852 } 2853 2854 if (mem_high || swap_high) { 2855 /* 2856 * The allocating tasks in this cgroup will need to do 2857 * reclaim or be throttled to prevent further growth 2858 * of the memory or swap footprints. 2859 * 2860 * Target some best-effort fairness between the tasks, 2861 * and distribute reclaim work and delay penalties 2862 * based on how much each task is actually allocating. 2863 */ 2864 current->memcg_nr_pages_over_high += batch; 2865 set_notify_resume(current); 2866 break; 2867 } 2868 } while ((memcg = parent_mem_cgroup(memcg))); 2869 2870 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 2871 !(current->flags & PF_MEMALLOC) && 2872 gfpflags_allow_blocking(gfp_mask)) { 2873 mem_cgroup_handle_over_high(gfp_mask); 2874 } 2875 return 0; 2876 } 2877 2878 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2879 unsigned int nr_pages) 2880 { 2881 if (mem_cgroup_is_root(memcg)) 2882 return 0; 2883 2884 return try_charge_memcg(memcg, gfp_mask, nr_pages); 2885 } 2886 2887 /** 2888 * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. 2889 * @memcg: memcg previously charged. 2890 * @nr_pages: number of pages previously charged. 
2891 */ 2892 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2893 { 2894 if (mem_cgroup_is_root(memcg)) 2895 return; 2896 2897 page_counter_uncharge(&memcg->memory, nr_pages); 2898 if (do_memsw_account()) 2899 page_counter_uncharge(&memcg->memsw, nr_pages); 2900 } 2901 2902 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) 2903 { 2904 VM_BUG_ON_FOLIO(folio_memcg(folio), folio); 2905 /* 2906 * Any of the following ensures page's memcg stability: 2907 * 2908 * - the page lock 2909 * - LRU isolation 2910 * - folio_memcg_lock() 2911 * - exclusive reference 2912 * - mem_cgroup_trylock_pages() 2913 */ 2914 folio->memcg_data = (unsigned long)memcg; 2915 } 2916 2917 /** 2918 * mem_cgroup_commit_charge - commit a previously successful try_charge(). 2919 * @folio: folio to commit the charge to. 2920 * @memcg: memcg previously charged. 2921 */ 2922 void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 2923 { 2924 css_get(&memcg->css); 2925 commit_charge(folio, memcg); 2926 2927 local_irq_disable(); 2928 mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); 2929 memcg_check_events(memcg, folio_nid(folio)); 2930 local_irq_enable(); 2931 } 2932 2933 #ifdef CONFIG_MEMCG_KMEM 2934 /* 2935 * The allocated objcg pointers array is not accounted directly. 2936 * Moreover, it should not come from DMA buffer and is not readily 2937 * reclaimable. So those GFP bits should be masked off. 2938 */ 2939 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) 2940 2941 /* 2942 * mod_objcg_mlstate() may be called with irq enabled, so 2943 * mod_memcg_lruvec_state() should be used. 2944 */ 2945 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 2946 struct pglist_data *pgdat, 2947 enum node_stat_item idx, int nr) 2948 { 2949 struct mem_cgroup *memcg; 2950 struct lruvec *lruvec; 2951 2952 rcu_read_lock(); 2953 memcg = obj_cgroup_memcg(objcg); 2954 lruvec = mem_cgroup_lruvec(memcg, pgdat); 2955 mod_memcg_lruvec_state(lruvec, idx, nr); 2956 rcu_read_unlock(); 2957 } 2958 2959 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, 2960 gfp_t gfp, bool new_slab) 2961 { 2962 unsigned int objects = objs_per_slab(s, slab); 2963 unsigned long memcg_data; 2964 void *vec; 2965 2966 gfp &= ~OBJCGS_CLEAR_MASK; 2967 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2968 slab_nid(slab)); 2969 if (!vec) 2970 return -ENOMEM; 2971 2972 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2973 if (new_slab) { 2974 /* 2975 * If the slab is brand new and nobody can yet access its 2976 * memcg_data, no synchronization is required and memcg_data can 2977 * be simply assigned. 2978 */ 2979 slab->memcg_data = memcg_data; 2980 } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { 2981 /* 2982 * If the slab is already in use, somebody can allocate and 2983 * assign obj_cgroups in parallel. In this case the existing 2984 * objcg vector should be reused. 2985 */ 2986 kfree(vec); 2987 return 0; 2988 } 2989 2990 kmemleak_not_leak(vec); 2991 return 0; 2992 } 2993 2994 static __always_inline 2995 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2996 { 2997 /* 2998 * Slab objects are accounted individually, not per-page. 2999 * Memcg membership data for each individual object is saved in 3000 * slab->memcg_data. 
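 * obj_to_index() below maps the object pointer to its slot in that
 * objcg vector so the owning obj_cgroup can be looked up.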
3001 */
3002 if (folio_test_slab(folio)) {
3003 struct obj_cgroup **objcgs;
3004 struct slab *slab;
3005 unsigned int off;
3006
3007 slab = folio_slab(folio);
3008 objcgs = slab_objcgs(slab);
3009 if (!objcgs)
3010 return NULL;
3011
3012 off = obj_to_index(slab->slab_cache, slab, p);
3013 if (objcgs[off])
3014 return obj_cgroup_memcg(objcgs[off]);
3015
3016 return NULL;
3017 }
3018
3019 /*
3020 * folio_memcg_check() is used here, because in theory we can encounter
3021 * a folio where the slab flag has been cleared already, but
3022 * slab->memcg_data has not been freed yet.
3023 * folio_memcg_check() will guarantee that a proper memory
3024 * cgroup pointer or NULL will be returned.
3025 */
3026 return folio_memcg_check(folio);
3027 }
3028
3029 /*
3030 * Returns a pointer to the memory cgroup to which the kernel object is charged.
3031 *
3032 * A passed kernel object can be a slab object, vmalloc object or a generic
3033 * kernel page, so different mechanisms for getting the memory cgroup pointer
3034 * should be used.
3035 *
3036 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
3037 * cannot know for sure how the kernel object is implemented.
3038 * mem_cgroup_from_obj() can be safely used in such cases.
3039 *
3040 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
3041 * cgroup_mutex, etc.
3042 */
3043 struct mem_cgroup *mem_cgroup_from_obj(void *p)
3044 {
3045 struct folio *folio;
3046
3047 if (mem_cgroup_disabled())
3048 return NULL;
3049
3050 if (unlikely(is_vmalloc_addr(p)))
3051 folio = page_folio(vmalloc_to_page(p));
3052 else
3053 folio = virt_to_folio(p);
3054
3055 return mem_cgroup_from_obj_folio(folio, p);
3056 }
3057
3058 /*
3059 * Returns a pointer to the memory cgroup to which the kernel object is charged.
3060 * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
3061 * allocated using vmalloc().
3062 *
3063 * A passed kernel object must be a slab object or a generic kernel page.
3064 *
3065 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
3066 * cgroup_mutex, etc.
3067 */
3068 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
3069 {
3070 if (mem_cgroup_disabled())
3071 return NULL;
3072
3073 return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
3074 }
3075
3076 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
3077 {
3078 struct obj_cgroup *objcg = NULL;
3079
3080 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
3081 objcg = rcu_dereference(memcg->objcg);
3082 if (likely(objcg && obj_cgroup_tryget(objcg)))
3083 break;
3084 objcg = NULL;
3085 }
3086 return objcg;
3087 }
3088
3089 static struct obj_cgroup *current_objcg_update(void)
3090 {
3091 struct mem_cgroup *memcg;
3092 struct obj_cgroup *old, *objcg = NULL;
3093
3094 do {
3095 /* Atomically drop the update bit. */
3096 old = xchg(&current->objcg, NULL);
3097 if (old) {
3098 old = (struct obj_cgroup *)
3099 ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
3100 if (old)
3101 obj_cgroup_put(old);
3102
3103 old = NULL;
3104 }
3105
3106 /* If new objcg is NULL, no reason for the second atomic update. */
3107 if (!current->mm || (current->flags & PF_KTHREAD))
3108 return NULL;
3109
3110 /*
3111 * Release the objcg pointer from the previous iteration,
3112 * if try_cmpxchg() below fails.
3113 */
3114 if (unlikely(objcg)) {
3115 obj_cgroup_put(objcg);
3116 objcg = NULL;
3117 }
3118
3119 /*
3120 * Obtain the new objcg pointer. The current task can be
3121 * asynchronously moved to another memcg and the previous
3122 * memcg can be offlined. So let's get the memcg pointer
3123 * and try to get a reference to the objcg under an RCU read lock.
3124 */
3125
3126 rcu_read_lock();
3127 memcg = mem_cgroup_from_task(current);
3128 objcg = __get_obj_cgroup_from_memcg(memcg);
3129 rcu_read_unlock();
3130
3131 /*
3132 * Try to set up a new objcg pointer atomically. If it
3133 * fails, it means the update flag was set concurrently, so
3134 * the whole procedure should be repeated.
3135 */
3136 } while (!try_cmpxchg(&current->objcg, &old, objcg));
3137
3138 return objcg;
3139 }
3140
3141 __always_inline struct obj_cgroup *current_obj_cgroup(void)
3142 {
3143 struct mem_cgroup *memcg;
3144 struct obj_cgroup *objcg;
3145
3146 if (in_task()) {
3147 memcg = current->active_memcg;
3148 if (unlikely(memcg))
3149 goto from_memcg;
3150
3151 objcg = READ_ONCE(current->objcg);
3152 if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
3153 objcg = current_objcg_update();
3154 /*
3155 * The objcg reference is kept by the task, so it's safe
3156 * for the current task to use the objcg.
3157 */
3158 return objcg;
3159 }
3160
3161 memcg = this_cpu_read(int_active_memcg);
3162 if (unlikely(memcg))
3163 goto from_memcg;
3164
3165 return NULL;
3166
3167 from_memcg:
3168 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
3169 /*
3170 * Memcg pointer is protected by scope (see set_active_memcg())
3171 * and is pinning the corresponding objcg, so objcg can't go
3172 * away and can be used within the scope without any additional
3173 * protection.
3174 */
3175 objcg = rcu_dereference_check(memcg->objcg, 1);
3176 if (likely(objcg))
3177 break;
3178 objcg = NULL;
3179 }
3180
3181 return objcg;
3182 }
3183
3184 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
3185 {
3186 struct obj_cgroup *objcg;
3187
3188 if (!memcg_kmem_online())
3189 return NULL;
3190
3191 if (folio_memcg_kmem(folio)) {
3192 objcg = __folio_objcg(folio);
3193 obj_cgroup_get(objcg);
3194 } else {
3195 struct mem_cgroup *memcg;
3196
3197 rcu_read_lock();
3198 memcg = __folio_memcg(folio);
3199 if (memcg)
3200 objcg = __get_obj_cgroup_from_memcg(memcg);
3201 else
3202 objcg = NULL;
3203 rcu_read_unlock();
3204 }
3205 return objcg;
3206 }
3207
3208 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3209 {
3210 mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
3211 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3212 if (nr_pages > 0)
3213 page_counter_charge(&memcg->kmem, nr_pages);
3214 else
3215 page_counter_uncharge(&memcg->kmem, -nr_pages);
3216 }
3217 }
3218
3219
3220 /*
3221 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
3222 * @objcg: object cgroup to uncharge
3223 * @nr_pages: number of pages to uncharge
3224 */
3225 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
3226 unsigned int nr_pages)
3227 {
3228 struct mem_cgroup *memcg;
3229
3230 memcg = get_mem_cgroup_from_objcg(objcg);
3231
3232 memcg_account_kmem(memcg, -nr_pages);
3233 refill_stock(memcg, nr_pages);
3234
3235 css_put(&memcg->css);
3236 }
3237
3238 /*
3239 * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
3240 * @objcg: object cgroup to charge
3241 * @gfp: reclaim mode
3242 * @nr_pages: number of pages to charge
3243 *
3244 * Returns 0 on success, an error code on failure.
3245 */ 3246 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 3247 unsigned int nr_pages) 3248 { 3249 struct mem_cgroup *memcg; 3250 int ret; 3251 3252 memcg = get_mem_cgroup_from_objcg(objcg); 3253 3254 ret = try_charge_memcg(memcg, gfp, nr_pages); 3255 if (ret) 3256 goto out; 3257 3258 memcg_account_kmem(memcg, nr_pages); 3259 out: 3260 css_put(&memcg->css); 3261 3262 return ret; 3263 } 3264 3265 /** 3266 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3267 * @page: page to charge 3268 * @gfp: reclaim mode 3269 * @order: allocation order 3270 * 3271 * Returns 0 on success, an error code on failure. 3272 */ 3273 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3274 { 3275 struct obj_cgroup *objcg; 3276 int ret = 0; 3277 3278 objcg = current_obj_cgroup(); 3279 if (objcg) { 3280 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 3281 if (!ret) { 3282 obj_cgroup_get(objcg); 3283 page->memcg_data = (unsigned long)objcg | 3284 MEMCG_DATA_KMEM; 3285 return 0; 3286 } 3287 } 3288 return ret; 3289 } 3290 3291 /** 3292 * __memcg_kmem_uncharge_page: uncharge a kmem page 3293 * @page: page to uncharge 3294 * @order: allocation order 3295 */ 3296 void __memcg_kmem_uncharge_page(struct page *page, int order) 3297 { 3298 struct folio *folio = page_folio(page); 3299 struct obj_cgroup *objcg; 3300 unsigned int nr_pages = 1 << order; 3301 3302 if (!folio_memcg_kmem(folio)) 3303 return; 3304 3305 objcg = __folio_objcg(folio); 3306 obj_cgroup_uncharge_pages(objcg, nr_pages); 3307 folio->memcg_data = 0; 3308 obj_cgroup_put(objcg); 3309 } 3310 3311 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 3312 enum node_stat_item idx, int nr) 3313 { 3314 struct memcg_stock_pcp *stock; 3315 struct obj_cgroup *old = NULL; 3316 unsigned long flags; 3317 int *bytes; 3318 3319 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3320 stock = this_cpu_ptr(&memcg_stock); 3321 3322 /* 3323 * Save vmstat data in stock and skip vmstat array update unless 3324 * accumulating over a page of vmstat data or when pgdat or idx 3325 * changes. 3326 */ 3327 if (READ_ONCE(stock->cached_objcg) != objcg) { 3328 old = drain_obj_stock(stock); 3329 obj_cgroup_get(objcg); 3330 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3331 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3332 WRITE_ONCE(stock->cached_objcg, objcg); 3333 stock->cached_pgdat = pgdat; 3334 } else if (stock->cached_pgdat != pgdat) { 3335 /* Flush the existing cached vmstat data */ 3336 struct pglist_data *oldpg = stock->cached_pgdat; 3337 3338 if (stock->nr_slab_reclaimable_b) { 3339 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 3340 stock->nr_slab_reclaimable_b); 3341 stock->nr_slab_reclaimable_b = 0; 3342 } 3343 if (stock->nr_slab_unreclaimable_b) { 3344 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 3345 stock->nr_slab_unreclaimable_b); 3346 stock->nr_slab_unreclaimable_b = 0; 3347 } 3348 stock->cached_pgdat = pgdat; 3349 } 3350 3351 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 3352 : &stock->nr_slab_unreclaimable_b; 3353 /* 3354 * Even for large object >= PAGE_SIZE, the vmstat data will still be 3355 * cached locally at least once before pushing it out. 
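 * Afterwards, updates are accumulated in the stock and only pushed to the
 * memcg vmstat counters once the batched value exceeds a page worth of bytes.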
3356 */ 3357 if (!*bytes) { 3358 *bytes = nr; 3359 nr = 0; 3360 } else { 3361 *bytes += nr; 3362 if (abs(*bytes) > PAGE_SIZE) { 3363 nr = *bytes; 3364 *bytes = 0; 3365 } else { 3366 nr = 0; 3367 } 3368 } 3369 if (nr) 3370 mod_objcg_mlstate(objcg, pgdat, idx, nr); 3371 3372 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3373 if (old) 3374 obj_cgroup_put(old); 3375 } 3376 3377 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3378 { 3379 struct memcg_stock_pcp *stock; 3380 unsigned long flags; 3381 bool ret = false; 3382 3383 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3384 3385 stock = this_cpu_ptr(&memcg_stock); 3386 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { 3387 stock->nr_bytes -= nr_bytes; 3388 ret = true; 3389 } 3390 3391 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3392 3393 return ret; 3394 } 3395 3396 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) 3397 { 3398 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); 3399 3400 if (!old) 3401 return NULL; 3402 3403 if (stock->nr_bytes) { 3404 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3405 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3406 3407 if (nr_pages) { 3408 struct mem_cgroup *memcg; 3409 3410 memcg = get_mem_cgroup_from_objcg(old); 3411 3412 memcg_account_kmem(memcg, -nr_pages); 3413 __refill_stock(memcg, nr_pages); 3414 3415 css_put(&memcg->css); 3416 } 3417 3418 /* 3419 * The leftover is flushed to the centralized per-memcg value. 3420 * On the next attempt to refill obj stock it will be moved 3421 * to a per-cpu stock (probably, on an other CPU), see 3422 * refill_obj_stock(). 3423 * 3424 * How often it's flushed is a trade-off between the memory 3425 * limit enforcement accuracy and potential CPU contention, 3426 * so it might be changed in the future. 3427 */ 3428 atomic_add(nr_bytes, &old->nr_charged_bytes); 3429 stock->nr_bytes = 0; 3430 } 3431 3432 /* 3433 * Flush the vmstat data in current stock 3434 */ 3435 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 3436 if (stock->nr_slab_reclaimable_b) { 3437 mod_objcg_mlstate(old, stock->cached_pgdat, 3438 NR_SLAB_RECLAIMABLE_B, 3439 stock->nr_slab_reclaimable_b); 3440 stock->nr_slab_reclaimable_b = 0; 3441 } 3442 if (stock->nr_slab_unreclaimable_b) { 3443 mod_objcg_mlstate(old, stock->cached_pgdat, 3444 NR_SLAB_UNRECLAIMABLE_B, 3445 stock->nr_slab_unreclaimable_b); 3446 stock->nr_slab_unreclaimable_b = 0; 3447 } 3448 stock->cached_pgdat = NULL; 3449 } 3450 3451 WRITE_ONCE(stock->cached_objcg, NULL); 3452 /* 3453 * The `old' objects needs to be released by the caller via 3454 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
3455 */ 3456 return old; 3457 } 3458 3459 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3460 struct mem_cgroup *root_memcg) 3461 { 3462 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); 3463 struct mem_cgroup *memcg; 3464 3465 if (objcg) { 3466 memcg = obj_cgroup_memcg(objcg); 3467 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3468 return true; 3469 } 3470 3471 return false; 3472 } 3473 3474 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3475 bool allow_uncharge) 3476 { 3477 struct memcg_stock_pcp *stock; 3478 struct obj_cgroup *old = NULL; 3479 unsigned long flags; 3480 unsigned int nr_pages = 0; 3481 3482 local_lock_irqsave(&memcg_stock.stock_lock, flags); 3483 3484 stock = this_cpu_ptr(&memcg_stock); 3485 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ 3486 old = drain_obj_stock(stock); 3487 obj_cgroup_get(objcg); 3488 WRITE_ONCE(stock->cached_objcg, objcg); 3489 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3490 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3491 allow_uncharge = true; /* Allow uncharge when objcg changes */ 3492 } 3493 stock->nr_bytes += nr_bytes; 3494 3495 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 3496 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3497 stock->nr_bytes &= (PAGE_SIZE - 1); 3498 } 3499 3500 local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 3501 if (old) 3502 obj_cgroup_put(old); 3503 3504 if (nr_pages) 3505 obj_cgroup_uncharge_pages(objcg, nr_pages); 3506 } 3507 3508 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3509 { 3510 unsigned int nr_pages, nr_bytes; 3511 int ret; 3512 3513 if (consume_obj_stock(objcg, size)) 3514 return 0; 3515 3516 /* 3517 * In theory, objcg->nr_charged_bytes can have enough 3518 * pre-charged bytes to satisfy the allocation. However, 3519 * flushing objcg->nr_charged_bytes requires two atomic 3520 * operations, and objcg->nr_charged_bytes can't be big. 3521 * The shared objcg->nr_charged_bytes can also become a 3522 * performance bottleneck if all tasks of the same memcg are 3523 * trying to update it. So it's better to ignore it and try 3524 * grab some new pages. The stock's nr_bytes will be flushed to 3525 * objcg->nr_charged_bytes later on when objcg changes. 3526 * 3527 * The stock's nr_bytes may contain enough pre-charged bytes 3528 * to allow one less page from being charged, but we can't rely 3529 * on the pre-charged bytes not being changed outside of 3530 * consume_obj_stock() or refill_obj_stock(). So ignore those 3531 * pre-charged bytes as well when charging pages. To avoid a 3532 * page uncharge right after a page charge, we set the 3533 * allow_uncharge flag to false when calling refill_obj_stock() 3534 * to temporarily allow the pre-charged bytes to exceed the page 3535 * size limit. The maximum reachable value of the pre-charged 3536 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 3537 * race. 3538 */ 3539 nr_pages = size >> PAGE_SHIFT; 3540 nr_bytes = size & (PAGE_SIZE - 1); 3541 3542 if (nr_bytes) 3543 nr_pages += 1; 3544 3545 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3546 if (!ret && nr_bytes) 3547 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 3548 3549 return ret; 3550 } 3551 3552 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3553 { 3554 refill_obj_stock(objcg, size, true); 3555 } 3556 3557 #endif /* CONFIG_MEMCG_KMEM */ 3558 3559 /* 3560 * Because page_memcg(head) is not set on tails, set it now. 
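 * Each tail page gets a copy of the head's memcg_data, so nr - 1 extra
 * references are taken on the memcg or objcg below.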
3561 */ 3562 void split_page_memcg(struct page *head, unsigned int nr) 3563 { 3564 struct folio *folio = page_folio(head); 3565 struct mem_cgroup *memcg = folio_memcg(folio); 3566 int i; 3567 3568 if (mem_cgroup_disabled() || !memcg) 3569 return; 3570 3571 for (i = 1; i < nr; i++) 3572 folio_page(folio, i)->memcg_data = folio->memcg_data; 3573 3574 if (folio_memcg_kmem(folio)) 3575 obj_cgroup_get_many(__folio_objcg(folio), nr - 1); 3576 else 3577 css_get_many(&memcg->css, nr - 1); 3578 } 3579 3580 #ifdef CONFIG_SWAP 3581 /** 3582 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3583 * @entry: swap entry to be moved 3584 * @from: mem_cgroup which the entry is moved from 3585 * @to: mem_cgroup which the entry is moved to 3586 * 3587 * It succeeds only when the swap_cgroup's record for this entry is the same 3588 * as the mem_cgroup's id of @from. 3589 * 3590 * Returns 0 on success, -EINVAL on failure. 3591 * 3592 * The caller must have charged to @to, IOW, called page_counter_charge() about 3593 * both res and memsw, and called css_get(). 3594 */ 3595 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3596 struct mem_cgroup *from, struct mem_cgroup *to) 3597 { 3598 unsigned short old_id, new_id; 3599 3600 old_id = mem_cgroup_id(from); 3601 new_id = mem_cgroup_id(to); 3602 3603 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3604 mod_memcg_state(from, MEMCG_SWAP, -1); 3605 mod_memcg_state(to, MEMCG_SWAP, 1); 3606 return 0; 3607 } 3608 return -EINVAL; 3609 } 3610 #else 3611 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3612 struct mem_cgroup *from, struct mem_cgroup *to) 3613 { 3614 return -EINVAL; 3615 } 3616 #endif 3617 3618 static DEFINE_MUTEX(memcg_max_mutex); 3619 3620 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3621 unsigned long max, bool memsw) 3622 { 3623 bool enlarge = false; 3624 bool drained = false; 3625 int ret; 3626 bool limits_invariant; 3627 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3628 3629 do { 3630 if (signal_pending(current)) { 3631 ret = -EINTR; 3632 break; 3633 } 3634 3635 mutex_lock(&memcg_max_mutex); 3636 /* 3637 * Make sure that the new limit (memsw or memory limit) doesn't 3638 * break our basic invariant rule memory.max <= memsw.max. 3639 */ 3640 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3641 max <= memcg->memsw.max; 3642 if (!limits_invariant) { 3643 mutex_unlock(&memcg_max_mutex); 3644 ret = -EINVAL; 3645 break; 3646 } 3647 if (max > counter->max) 3648 enlarge = true; 3649 ret = page_counter_set_max(counter, max); 3650 mutex_unlock(&memcg_max_mutex); 3651 3652 if (!ret) 3653 break; 3654 3655 if (!drained) { 3656 drain_all_stock(memcg); 3657 drained = true; 3658 continue; 3659 } 3660 3661 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3662 memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { 3663 ret = -EBUSY; 3664 break; 3665 } 3666 } while (true); 3667 3668 if (!ret && enlarge) 3669 memcg_oom_recover(memcg); 3670 3671 return ret; 3672 } 3673 3674 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3675 gfp_t gfp_mask, 3676 unsigned long *total_scanned) 3677 { 3678 unsigned long nr_reclaimed = 0; 3679 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3680 unsigned long reclaimed; 3681 int loop = 0; 3682 struct mem_cgroup_tree_per_node *mctz; 3683 unsigned long excess; 3684 3685 if (lru_gen_enabled()) 3686 return 0; 3687 3688 if (order > 0) 3689 return 0; 3690 3691 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 3692 3693 /* 3694 * Do not even bother to check the largest node if the root 3695 * is empty. Do it lockless to prevent lock bouncing. Races 3696 * are acceptable as soft limit is best effort anyway. 3697 */ 3698 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3699 return 0; 3700 3701 /* 3702 * This loop can run a while, specially if mem_cgroup's continuously 3703 * keep exceeding their soft limit and putting the system under 3704 * pressure 3705 */ 3706 do { 3707 if (next_mz) 3708 mz = next_mz; 3709 else 3710 mz = mem_cgroup_largest_soft_limit_node(mctz); 3711 if (!mz) 3712 break; 3713 3714 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3715 gfp_mask, total_scanned); 3716 nr_reclaimed += reclaimed; 3717 spin_lock_irq(&mctz->lock); 3718 3719 /* 3720 * If we failed to reclaim anything from this memory cgroup 3721 * it is time to move on to the next cgroup 3722 */ 3723 next_mz = NULL; 3724 if (!reclaimed) 3725 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3726 3727 excess = soft_limit_excess(mz->memcg); 3728 /* 3729 * One school of thought says that we should not add 3730 * back the node to the tree if reclaim returns 0. 3731 * But our reclaim could return 0, simply because due 3732 * to priority we are exposing a smaller subset of 3733 * memory to reclaim from. Consider this as a longer 3734 * term TODO. 3735 */ 3736 /* If excess == 0, no tree ops */ 3737 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3738 spin_unlock_irq(&mctz->lock); 3739 css_put(&mz->memcg->css); 3740 loop++; 3741 /* 3742 * Could not reclaim anything and there are no more 3743 * mem cgroups to try or we seem to be looping without 3744 * reclaiming anything. 3745 */ 3746 if (!nr_reclaimed && 3747 (next_mz == NULL || 3748 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3749 break; 3750 } while (!nr_reclaimed); 3751 if (next_mz) 3752 css_put(&next_mz->memcg->css); 3753 return nr_reclaimed; 3754 } 3755 3756 /* 3757 * Reclaims as many pages from the given memcg as possible. 3758 * 3759 * Caller is responsible for holding css reference for memcg. 
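 * Returns 0 on success, or -EINTR if reclaim was interrupted by a
 * pending signal.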
3760 */ 3761 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3762 { 3763 int nr_retries = MAX_RECLAIM_RETRIES; 3764 3765 /* we call try-to-free pages for make this cgroup empty */ 3766 lru_add_drain_all(); 3767 3768 drain_all_stock(memcg); 3769 3770 /* try to free all pages in this cgroup */ 3771 while (nr_retries && page_counter_read(&memcg->memory)) { 3772 if (signal_pending(current)) 3773 return -EINTR; 3774 3775 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3776 MEMCG_RECLAIM_MAY_SWAP)) 3777 nr_retries--; 3778 } 3779 3780 return 0; 3781 } 3782 3783 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3784 char *buf, size_t nbytes, 3785 loff_t off) 3786 { 3787 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3788 3789 if (mem_cgroup_is_root(memcg)) 3790 return -EINVAL; 3791 return mem_cgroup_force_empty(memcg) ?: nbytes; 3792 } 3793 3794 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3795 struct cftype *cft) 3796 { 3797 return 1; 3798 } 3799 3800 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3801 struct cftype *cft, u64 val) 3802 { 3803 if (val == 1) 3804 return 0; 3805 3806 pr_warn_once("Non-hierarchical mode is deprecated. " 3807 "Please report your usecase to linux-mm@kvack.org if you " 3808 "depend on this functionality.\n"); 3809 3810 return -EINVAL; 3811 } 3812 3813 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3814 { 3815 unsigned long val; 3816 3817 if (mem_cgroup_is_root(memcg)) { 3818 /* 3819 * Approximate root's usage from global state. This isn't 3820 * perfect, but the root usage was always an approximation. 3821 */ 3822 val = global_node_page_state(NR_FILE_PAGES) + 3823 global_node_page_state(NR_ANON_MAPPED); 3824 if (swap) 3825 val += total_swap_pages - get_nr_swap_pages(); 3826 } else { 3827 if (!swap) 3828 val = page_counter_read(&memcg->memory); 3829 else 3830 val = page_counter_read(&memcg->memsw); 3831 } 3832 return val; 3833 } 3834 3835 enum { 3836 RES_USAGE, 3837 RES_LIMIT, 3838 RES_MAX_USAGE, 3839 RES_FAILCNT, 3840 RES_SOFT_LIMIT, 3841 }; 3842 3843 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3844 struct cftype *cft) 3845 { 3846 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3847 struct page_counter *counter; 3848 3849 switch (MEMFILE_TYPE(cft->private)) { 3850 case _MEM: 3851 counter = &memcg->memory; 3852 break; 3853 case _MEMSWAP: 3854 counter = &memcg->memsw; 3855 break; 3856 case _KMEM: 3857 counter = &memcg->kmem; 3858 break; 3859 case _TCP: 3860 counter = &memcg->tcpmem; 3861 break; 3862 default: 3863 BUG(); 3864 } 3865 3866 switch (MEMFILE_ATTR(cft->private)) { 3867 case RES_USAGE: 3868 if (counter == &memcg->memory) 3869 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3870 if (counter == &memcg->memsw) 3871 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3872 return (u64)page_counter_read(counter) * PAGE_SIZE; 3873 case RES_LIMIT: 3874 return (u64)counter->max * PAGE_SIZE; 3875 case RES_MAX_USAGE: 3876 return (u64)counter->watermark * PAGE_SIZE; 3877 case RES_FAILCNT: 3878 return counter->failcnt; 3879 case RES_SOFT_LIMIT: 3880 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; 3881 default: 3882 BUG(); 3883 } 3884 } 3885 3886 /* 3887 * This function doesn't do anything useful. Its only job is to provide a read 3888 * handler for a file so that cgroup_file_mode() will add read permissions. 
3889 */ 3890 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 3891 __always_unused void *v) 3892 { 3893 return -EINVAL; 3894 } 3895 3896 #ifdef CONFIG_MEMCG_KMEM 3897 static int memcg_online_kmem(struct mem_cgroup *memcg) 3898 { 3899 struct obj_cgroup *objcg; 3900 3901 if (mem_cgroup_kmem_disabled()) 3902 return 0; 3903 3904 if (unlikely(mem_cgroup_is_root(memcg))) 3905 return 0; 3906 3907 objcg = obj_cgroup_alloc(); 3908 if (!objcg) 3909 return -ENOMEM; 3910 3911 objcg->memcg = memcg; 3912 rcu_assign_pointer(memcg->objcg, objcg); 3913 obj_cgroup_get(objcg); 3914 memcg->orig_objcg = objcg; 3915 3916 static_branch_enable(&memcg_kmem_online_key); 3917 3918 memcg->kmemcg_id = memcg->id.id; 3919 3920 return 0; 3921 } 3922 3923 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3924 { 3925 struct mem_cgroup *parent; 3926 3927 if (mem_cgroup_kmem_disabled()) 3928 return; 3929 3930 if (unlikely(mem_cgroup_is_root(memcg))) 3931 return; 3932 3933 parent = parent_mem_cgroup(memcg); 3934 if (!parent) 3935 parent = root_mem_cgroup; 3936 3937 memcg_reparent_objcgs(memcg, parent); 3938 3939 /* 3940 * After we have finished memcg_reparent_objcgs(), all list_lrus 3941 * corresponding to this cgroup are guaranteed to remain empty. 3942 * The ordering is imposed by list_lru_node->lock taken by 3943 * memcg_reparent_list_lrus(). 3944 */ 3945 memcg_reparent_list_lrus(memcg, parent); 3946 } 3947 #else 3948 static int memcg_online_kmem(struct mem_cgroup *memcg) 3949 { 3950 return 0; 3951 } 3952 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3953 { 3954 } 3955 #endif /* CONFIG_MEMCG_KMEM */ 3956 3957 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3958 { 3959 int ret; 3960 3961 mutex_lock(&memcg_max_mutex); 3962 3963 ret = page_counter_set_max(&memcg->tcpmem, max); 3964 if (ret) 3965 goto out; 3966 3967 if (!memcg->tcpmem_active) { 3968 /* 3969 * The active flag needs to be written after the static_key 3970 * update. This is what guarantees that the socket activation 3971 * function is the last one to run. See mem_cgroup_sk_alloc() 3972 * for details, and note that we don't mark any socket as 3973 * belonging to this memcg until that flag is up. 3974 * 3975 * We need to do this, because static_keys will span multiple 3976 * sites, but we can't control their order. If we mark a socket 3977 * as accounted, but the accounting functions are not patched in 3978 * yet, we'll lose accounting. 3979 * 3980 * We never race with the readers in mem_cgroup_sk_alloc(), 3981 * because when this value change, the code to process it is not 3982 * patched in yet. 3983 */ 3984 static_branch_inc(&memcg_sockets_enabled_key); 3985 memcg->tcpmem_active = true; 3986 } 3987 out: 3988 mutex_unlock(&memcg_max_mutex); 3989 return ret; 3990 } 3991 3992 /* 3993 * The user of this function is... 3994 * RES_LIMIT. 
3995 */ 3996 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3997 char *buf, size_t nbytes, loff_t off) 3998 { 3999 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4000 unsigned long nr_pages; 4001 int ret; 4002 4003 buf = strstrip(buf); 4004 ret = page_counter_memparse(buf, "-1", &nr_pages); 4005 if (ret) 4006 return ret; 4007 4008 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4009 case RES_LIMIT: 4010 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4011 ret = -EINVAL; 4012 break; 4013 } 4014 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4015 case _MEM: 4016 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 4017 break; 4018 case _MEMSWAP: 4019 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 4020 break; 4021 case _KMEM: 4022 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 4023 "Writing any value to this file has no effect. " 4024 "Please report your usecase to linux-mm@kvack.org if you " 4025 "depend on this functionality.\n"); 4026 ret = 0; 4027 break; 4028 case _TCP: 4029 ret = memcg_update_tcp_max(memcg, nr_pages); 4030 break; 4031 } 4032 break; 4033 case RES_SOFT_LIMIT: 4034 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 4035 ret = -EOPNOTSUPP; 4036 } else { 4037 WRITE_ONCE(memcg->soft_limit, nr_pages); 4038 ret = 0; 4039 } 4040 break; 4041 } 4042 return ret ?: nbytes; 4043 } 4044 4045 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4046 size_t nbytes, loff_t off) 4047 { 4048 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4049 struct page_counter *counter; 4050 4051 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4052 case _MEM: 4053 counter = &memcg->memory; 4054 break; 4055 case _MEMSWAP: 4056 counter = &memcg->memsw; 4057 break; 4058 case _KMEM: 4059 counter = &memcg->kmem; 4060 break; 4061 case _TCP: 4062 counter = &memcg->tcpmem; 4063 break; 4064 default: 4065 BUG(); 4066 } 4067 4068 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4069 case RES_MAX_USAGE: 4070 page_counter_reset_watermark(counter); 4071 break; 4072 case RES_FAILCNT: 4073 counter->failcnt = 0; 4074 break; 4075 default: 4076 BUG(); 4077 } 4078 4079 return nbytes; 4080 } 4081 4082 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 4083 struct cftype *cft) 4084 { 4085 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 4086 } 4087 4088 #ifdef CONFIG_MMU 4089 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4090 struct cftype *cft, u64 val) 4091 { 4092 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4093 4094 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 4095 "Please report your usecase to linux-mm@kvack.org if you " 4096 "depend on this functionality.\n"); 4097 4098 if (val & ~MOVE_MASK) 4099 return -EINVAL; 4100 4101 /* 4102 * No kind of locking is needed in here, because ->can_attach() will 4103 * check this value once in the beginning of the process, and then carry 4104 * on with stale data. This means that changes to this value will only 4105 * affect task migrations starting after the change. 
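 *
 * The value itself is a bitmask of MOVE_ANON and MOVE_FILE (see
 * mc_handle_present_pte() for how the bits are honoured); anything
 * outside MOVE_MASK was rejected above.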
4106 */ 4107 memcg->move_charge_at_immigrate = val; 4108 return 0; 4109 } 4110 #else 4111 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4112 struct cftype *cft, u64 val) 4113 { 4114 return -ENOSYS; 4115 } 4116 #endif 4117 4118 #ifdef CONFIG_NUMA 4119 4120 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 4121 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 4122 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 4123 4124 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 4125 int nid, unsigned int lru_mask, bool tree) 4126 { 4127 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 4128 unsigned long nr = 0; 4129 enum lru_list lru; 4130 4131 VM_BUG_ON((unsigned)nid >= nr_node_ids); 4132 4133 for_each_lru(lru) { 4134 if (!(BIT(lru) & lru_mask)) 4135 continue; 4136 if (tree) 4137 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 4138 else 4139 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 4140 } 4141 return nr; 4142 } 4143 4144 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 4145 unsigned int lru_mask, 4146 bool tree) 4147 { 4148 unsigned long nr = 0; 4149 enum lru_list lru; 4150 4151 for_each_lru(lru) { 4152 if (!(BIT(lru) & lru_mask)) 4153 continue; 4154 if (tree) 4155 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 4156 else 4157 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 4158 } 4159 return nr; 4160 } 4161 4162 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4163 { 4164 struct numa_stat { 4165 const char *name; 4166 unsigned int lru_mask; 4167 }; 4168 4169 static const struct numa_stat stats[] = { 4170 { "total", LRU_ALL }, 4171 { "file", LRU_ALL_FILE }, 4172 { "anon", LRU_ALL_ANON }, 4173 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4174 }; 4175 const struct numa_stat *stat; 4176 int nid; 4177 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4178 4179 mem_cgroup_flush_stats(); 4180 4181 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4182 seq_printf(m, "%s=%lu", stat->name, 4183 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4184 false)); 4185 for_each_node_state(nid, N_MEMORY) 4186 seq_printf(m, " N%d=%lu", nid, 4187 mem_cgroup_node_nr_lru_pages(memcg, nid, 4188 stat->lru_mask, false)); 4189 seq_putc(m, '\n'); 4190 } 4191 4192 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4193 4194 seq_printf(m, "hierarchical_%s=%lu", stat->name, 4195 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 4196 true)); 4197 for_each_node_state(nid, N_MEMORY) 4198 seq_printf(m, " N%d=%lu", nid, 4199 mem_cgroup_node_nr_lru_pages(memcg, nid, 4200 stat->lru_mask, true)); 4201 seq_putc(m, '\n'); 4202 } 4203 4204 return 0; 4205 } 4206 #endif /* CONFIG_NUMA */ 4207 4208 static const unsigned int memcg1_stats[] = { 4209 NR_FILE_PAGES, 4210 NR_ANON_MAPPED, 4211 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4212 NR_ANON_THPS, 4213 #endif 4214 NR_SHMEM, 4215 NR_FILE_MAPPED, 4216 NR_FILE_DIRTY, 4217 NR_WRITEBACK, 4218 WORKINGSET_REFAULT_ANON, 4219 WORKINGSET_REFAULT_FILE, 4220 #ifdef CONFIG_SWAP 4221 MEMCG_SWAP, 4222 NR_SWAPCACHE, 4223 #endif 4224 }; 4225 4226 static const char *const memcg1_stat_names[] = { 4227 "cache", 4228 "rss", 4229 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4230 "rss_huge", 4231 #endif 4232 "shmem", 4233 "mapped_file", 4234 "dirty", 4235 "writeback", 4236 "workingset_refault_anon", 4237 "workingset_refault_file", 4238 #ifdef CONFIG_SWAP 4239 "swap", 4240 "swapcached", 4241 #endif 4242 }; 4243 4244 /* Universal VM events cgroup1 
shows, original sort order */ 4245 static const unsigned int memcg1_events[] = { 4246 PGPGIN, 4247 PGPGOUT, 4248 PGFAULT, 4249 PGMAJFAULT, 4250 }; 4251 4252 static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 4253 { 4254 unsigned long memory, memsw; 4255 struct mem_cgroup *mi; 4256 unsigned int i; 4257 4258 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4259 4260 mem_cgroup_flush_stats(); 4261 4262 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4263 unsigned long nr; 4264 4265 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 4266 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 4267 } 4268 4269 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4270 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 4271 memcg_events_local(memcg, memcg1_events[i])); 4272 4273 for (i = 0; i < NR_LRU_LISTS; i++) 4274 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 4275 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4276 PAGE_SIZE); 4277 4278 /* Hierarchical information */ 4279 memory = memsw = PAGE_COUNTER_MAX; 4280 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4281 memory = min(memory, READ_ONCE(mi->memory.max)); 4282 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4283 } 4284 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 4285 (u64)memory * PAGE_SIZE); 4286 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 4287 (u64)memsw * PAGE_SIZE); 4288 4289 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4290 unsigned long nr; 4291 4292 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 4293 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 4294 (u64)nr); 4295 } 4296 4297 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4298 seq_buf_printf(s, "total_%s %llu\n", 4299 vm_event_name(memcg1_events[i]), 4300 (u64)memcg_events(memcg, memcg1_events[i])); 4301 4302 for (i = 0; i < NR_LRU_LISTS; i++) 4303 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 4304 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4305 PAGE_SIZE); 4306 4307 #ifdef CONFIG_DEBUG_VM 4308 { 4309 pg_data_t *pgdat; 4310 struct mem_cgroup_per_node *mz; 4311 unsigned long anon_cost = 0; 4312 unsigned long file_cost = 0; 4313 4314 for_each_online_pgdat(pgdat) { 4315 mz = memcg->nodeinfo[pgdat->node_id]; 4316 4317 anon_cost += mz->lruvec.anon_cost; 4318 file_cost += mz->lruvec.file_cost; 4319 } 4320 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 4321 seq_buf_printf(s, "file_cost %lu\n", file_cost); 4322 } 4323 #endif 4324 } 4325 4326 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4327 struct cftype *cft) 4328 { 4329 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4330 4331 return mem_cgroup_swappiness(memcg); 4332 } 4333 4334 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4335 struct cftype *cft, u64 val) 4336 { 4337 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4338 4339 if (val > 200) 4340 return -EINVAL; 4341 4342 if (!mem_cgroup_is_root(memcg)) 4343 WRITE_ONCE(memcg->swappiness, val); 4344 else 4345 WRITE_ONCE(vm_swappiness, val); 4346 4347 return 0; 4348 } 4349 4350 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4351 { 4352 struct mem_cgroup_threshold_ary *t; 4353 unsigned long usage; 4354 int i; 4355 4356 rcu_read_lock(); 4357 if (!swap) 4358 t = rcu_dereference(memcg->thresholds.primary); 4359 else 4360 t = rcu_dereference(memcg->memsw_thresholds.primary); 4361 4362 if (!t) 4363 goto unlock; 4364 4365 usage = mem_cgroup_usage(memcg, swap); 4366 4367 /* 4368 
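 * The thresholds.primary array is kept sorted in ascending order (see
 * compare_thresholds() below) and caches, in current_threshold, the
 * index of the last entry at or below the usage seen on the previous
 * run. As an illustration with made-up numbers: thresholds of
 * {4M, 8M, 16M} and a previous usage of 10M leave the cache at the 8M
 * entry; if usage has since grown to 20M, the forward scan below
 * signals the 16M eventfd and moves the cache there. In other words,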
* current_threshold points to threshold just below or equal to usage. 4369 * If it's not true, a threshold was crossed after last 4370 * call of __mem_cgroup_threshold(). 4371 */ 4372 i = t->current_threshold; 4373 4374 /* 4375 * Iterate backward over array of thresholds starting from 4376 * current_threshold and check if a threshold is crossed. 4377 * If none of thresholds below usage is crossed, we read 4378 * only one element of the array here. 4379 */ 4380 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4381 eventfd_signal(t->entries[i].eventfd, 1); 4382 4383 /* i = current_threshold + 1 */ 4384 i++; 4385 4386 /* 4387 * Iterate forward over array of thresholds starting from 4388 * current_threshold+1 and check if a threshold is crossed. 4389 * If none of thresholds above usage is crossed, we read 4390 * only one element of the array here. 4391 */ 4392 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4393 eventfd_signal(t->entries[i].eventfd, 1); 4394 4395 /* Update current_threshold */ 4396 t->current_threshold = i - 1; 4397 unlock: 4398 rcu_read_unlock(); 4399 } 4400 4401 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4402 { 4403 while (memcg) { 4404 __mem_cgroup_threshold(memcg, false); 4405 if (do_memsw_account()) 4406 __mem_cgroup_threshold(memcg, true); 4407 4408 memcg = parent_mem_cgroup(memcg); 4409 } 4410 } 4411 4412 static int compare_thresholds(const void *a, const void *b) 4413 { 4414 const struct mem_cgroup_threshold *_a = a; 4415 const struct mem_cgroup_threshold *_b = b; 4416 4417 if (_a->threshold > _b->threshold) 4418 return 1; 4419 4420 if (_a->threshold < _b->threshold) 4421 return -1; 4422 4423 return 0; 4424 } 4425 4426 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4427 { 4428 struct mem_cgroup_eventfd_list *ev; 4429 4430 spin_lock(&memcg_oom_lock); 4431 4432 list_for_each_entry(ev, &memcg->oom_notify, list) 4433 eventfd_signal(ev->eventfd, 1); 4434 4435 spin_unlock(&memcg_oom_lock); 4436 return 0; 4437 } 4438 4439 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4440 { 4441 struct mem_cgroup *iter; 4442 4443 for_each_mem_cgroup_tree(iter, memcg) 4444 mem_cgroup_oom_notify_cb(iter); 4445 } 4446 4447 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4448 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4449 { 4450 struct mem_cgroup_thresholds *thresholds; 4451 struct mem_cgroup_threshold_ary *new; 4452 unsigned long threshold; 4453 unsigned long usage; 4454 int i, size, ret; 4455 4456 ret = page_counter_memparse(args, "-1", &threshold); 4457 if (ret) 4458 return ret; 4459 4460 mutex_lock(&memcg->thresholds_lock); 4461 4462 if (type == _MEM) { 4463 thresholds = &memcg->thresholds; 4464 usage = mem_cgroup_usage(memcg, false); 4465 } else if (type == _MEMSWAP) { 4466 thresholds = &memcg->memsw_thresholds; 4467 usage = mem_cgroup_usage(memcg, true); 4468 } else 4469 BUG(); 4470 4471 /* Check if a threshold crossed before adding a new one */ 4472 if (thresholds->primary) 4473 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4474 4475 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4476 4477 /* Allocate memory for new array of thresholds */ 4478 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4479 if (!new) { 4480 ret = -ENOMEM; 4481 goto unlock; 4482 } 4483 new->size = size; 4484 4485 /* Copy thresholds (if any) to new array */ 4486 if (thresholds->primary) 4487 memcpy(new->entries, thresholds->primary->entries, 4488 flex_array_size(new, entries, size - 1)); 4489 4490 /* Add new threshold */ 4491 new->entries[size - 1].eventfd = eventfd; 4492 new->entries[size - 1].threshold = threshold; 4493 4494 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4495 sort(new->entries, size, sizeof(*new->entries), 4496 compare_thresholds, NULL); 4497 4498 /* Find current threshold */ 4499 new->current_threshold = -1; 4500 for (i = 0; i < size; i++) { 4501 if (new->entries[i].threshold <= usage) { 4502 /* 4503 * new->current_threshold will not be used until 4504 * rcu_assign_pointer(), so it's safe to increment 4505 * it here. 4506 */ 4507 ++new->current_threshold; 4508 } else 4509 break; 4510 } 4511 4512 /* Free old spare buffer and save old primary buffer as spare */ 4513 kfree(thresholds->spare); 4514 thresholds->spare = thresholds->primary; 4515 4516 rcu_assign_pointer(thresholds->primary, new); 4517 4518 /* To be sure that nobody uses thresholds */ 4519 synchronize_rcu(); 4520 4521 unlock: 4522 mutex_unlock(&memcg->thresholds_lock); 4523 4524 return ret; 4525 } 4526 4527 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4528 struct eventfd_ctx *eventfd, const char *args) 4529 { 4530 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4531 } 4532 4533 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4534 struct eventfd_ctx *eventfd, const char *args) 4535 { 4536 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4537 } 4538 4539 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4540 struct eventfd_ctx *eventfd, enum res_type type) 4541 { 4542 struct mem_cgroup_thresholds *thresholds; 4543 struct mem_cgroup_threshold_ary *new; 4544 unsigned long usage; 4545 int i, j, size, entries; 4546 4547 mutex_lock(&memcg->thresholds_lock); 4548 4549 if (type == _MEM) { 4550 thresholds = &memcg->thresholds; 4551 usage = mem_cgroup_usage(memcg, false); 4552 } else if (type == _MEMSWAP) { 4553 thresholds = &memcg->memsw_thresholds; 4554 usage = mem_cgroup_usage(memcg, true); 4555 } else 4556 BUG(); 4557 4558 if (!thresholds->primary) 4559 goto unlock; 4560 4561 /* Check if a threshold crossed before removing */ 4562 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4563 4564 /* Calculate new number of threshold */ 4565 size = entries = 0; 4566 for (i = 0; i < thresholds->primary->size; i++) { 4567 if (thresholds->primary->entries[i].eventfd != eventfd) 4568 size++; 4569 else 4570 entries++; 4571 } 4572 4573 new = thresholds->spare; 4574 4575 /* If no items related to eventfd have been cleared, nothing to do */ 4576 if (!entries) 4577 goto unlock; 4578 4579 /* Set thresholds array to NULL if we don't have thresholds */ 4580 if (!size) { 4581 kfree(new); 4582 new = NULL; 4583 goto swap_buffers; 4584 } 4585 4586 new->size = size; 4587 4588 /* Copy thresholds and find current threshold */ 4589 new->current_threshold = -1; 4590 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4591 if (thresholds->primary->entries[i].eventfd == eventfd) 4592 continue; 4593 4594 new->entries[j] = thresholds->primary->entries[i]; 4595 if 
(new->entries[j].threshold <= usage) { 4596 /* 4597 * new->current_threshold will not be used 4598 * until rcu_assign_pointer(), so it's safe to increment 4599 * it here. 4600 */ 4601 ++new->current_threshold; 4602 } 4603 j++; 4604 } 4605 4606 swap_buffers: 4607 /* Swap primary and spare array */ 4608 thresholds->spare = thresholds->primary; 4609 4610 rcu_assign_pointer(thresholds->primary, new); 4611 4612 /* To be sure that nobody uses thresholds */ 4613 synchronize_rcu(); 4614 4615 /* If all events are unregistered, free the spare array */ 4616 if (!new) { 4617 kfree(thresholds->spare); 4618 thresholds->spare = NULL; 4619 } 4620 unlock: 4621 mutex_unlock(&memcg->thresholds_lock); 4622 } 4623 4624 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4625 struct eventfd_ctx *eventfd) 4626 { 4627 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4628 } 4629 4630 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4631 struct eventfd_ctx *eventfd) 4632 { 4633 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4634 } 4635 4636 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4637 struct eventfd_ctx *eventfd, const char *args) 4638 { 4639 struct mem_cgroup_eventfd_list *event; 4640 4641 event = kmalloc(sizeof(*event), GFP_KERNEL); 4642 if (!event) 4643 return -ENOMEM; 4644 4645 spin_lock(&memcg_oom_lock); 4646 4647 event->eventfd = eventfd; 4648 list_add(&event->list, &memcg->oom_notify); 4649 4650 /* already in OOM ? */ 4651 if (memcg->under_oom) 4652 eventfd_signal(eventfd, 1); 4653 spin_unlock(&memcg_oom_lock); 4654 4655 return 0; 4656 } 4657 4658 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4659 struct eventfd_ctx *eventfd) 4660 { 4661 struct mem_cgroup_eventfd_list *ev, *tmp; 4662 4663 spin_lock(&memcg_oom_lock); 4664 4665 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4666 if (ev->eventfd == eventfd) { 4667 list_del(&ev->list); 4668 kfree(ev); 4669 } 4670 } 4671 4672 spin_unlock(&memcg_oom_lock); 4673 } 4674 4675 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4676 { 4677 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4678 4679 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 4680 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4681 seq_printf(sf, "oom_kill %lu\n", 4682 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4683 return 0; 4684 } 4685 4686 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4687 struct cftype *cft, u64 val) 4688 { 4689 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4690 4691 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4692 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4693 return -EINVAL; 4694 4695 WRITE_ONCE(memcg->oom_kill_disable, val); 4696 if (!val) 4697 memcg_oom_recover(memcg); 4698 4699 return 0; 4700 } 4701 4702 #ifdef CONFIG_CGROUP_WRITEBACK 4703 4704 #include <trace/events/writeback.h> 4705 4706 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4707 { 4708 return wb_domain_init(&memcg->cgwb_domain, gfp); 4709 } 4710 4711 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4712 { 4713 wb_domain_exit(&memcg->cgwb_domain); 4714 } 4715 4716 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4717 { 4718 wb_domain_size_changed(&memcg->cgwb_domain); 4719 } 4720 4721 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4722 { 4723 struct 
mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4724 4725 if (!memcg->css.parent) 4726 return NULL; 4727 4728 return &memcg->cgwb_domain; 4729 } 4730 4731 /** 4732 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4733 * @wb: bdi_writeback in question 4734 * @pfilepages: out parameter for number of file pages 4735 * @pheadroom: out parameter for number of allocatable pages according to memcg 4736 * @pdirty: out parameter for number of dirty pages 4737 * @pwriteback: out parameter for number of pages under writeback 4738 * 4739 * Determine the numbers of file, headroom, dirty, and writeback pages in 4740 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4741 * is a bit more involved. 4742 * 4743 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4744 * headroom is calculated as the lowest headroom of itself and the 4745 * ancestors. Note that this doesn't consider the actual amount of 4746 * available memory in the system. The caller should further cap 4747 * *@pheadroom accordingly. 4748 */ 4749 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4750 unsigned long *pheadroom, unsigned long *pdirty, 4751 unsigned long *pwriteback) 4752 { 4753 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4754 struct mem_cgroup *parent; 4755 4756 mem_cgroup_flush_stats(); 4757 4758 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4759 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4760 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4761 memcg_page_state(memcg, NR_ACTIVE_FILE); 4762 4763 *pheadroom = PAGE_COUNTER_MAX; 4764 while ((parent = parent_mem_cgroup(memcg))) { 4765 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4766 READ_ONCE(memcg->memory.high)); 4767 unsigned long used = page_counter_read(&memcg->memory); 4768 4769 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4770 memcg = parent; 4771 } 4772 } 4773 4774 /* 4775 * Foreign dirty flushing 4776 * 4777 * There's an inherent mismatch between memcg and writeback. The former 4778 * tracks ownership per-page while the latter per-inode. This was a 4779 * deliberate design decision because honoring per-page ownership in the 4780 * writeback path is complicated, may lead to higher CPU and IO overheads 4781 * and deemed unnecessary given that write-sharing an inode across 4782 * different cgroups isn't a common use-case. 4783 * 4784 * Combined with inode majority-writer ownership switching, this works well 4785 * enough in most cases but there are some pathological cases. For 4786 * example, let's say there are two cgroups A and B which keep writing to 4787 * different but confined parts of the same inode. B owns the inode and 4788 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4789 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4790 * triggering background writeback. A will be slowed down without a way to 4791 * make writeback of the dirty pages happen. 4792 * 4793 * Conditions like the above can lead to a cgroup getting repeatedly and 4794 * severely throttled after making some progress after each 4795 * dirty_expire_interval while the underlying IO device is almost 4796 * completely idle. 4797 * 4798 * Solving this problem completely requires matching the ownership tracking 4799 * granularities between memcg and writeback in either direction. 
However, 4800 * the more egregious behaviors can be avoided by simply remembering the 4801 * most recent foreign dirtying events and initiating remote flushes on 4802 * them when local writeback isn't enough to keep the memory clean enough. 4803 * 4804 * The following two functions implement such mechanism. When a foreign 4805 * page - a page whose memcg and writeback ownerships don't match - is 4806 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4807 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4808 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4809 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4810 * foreign bdi_writebacks which haven't expired. Both the numbers of 4811 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4812 * limited to MEMCG_CGWB_FRN_CNT. 4813 * 4814 * The mechanism only remembers IDs and doesn't hold any object references. 4815 * As being wrong occasionally doesn't matter, updates and accesses to the 4816 * records are lockless and racy. 4817 */ 4818 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, 4819 struct bdi_writeback *wb) 4820 { 4821 struct mem_cgroup *memcg = folio_memcg(folio); 4822 struct memcg_cgwb_frn *frn; 4823 u64 now = get_jiffies_64(); 4824 u64 oldest_at = now; 4825 int oldest = -1; 4826 int i; 4827 4828 trace_track_foreign_dirty(folio, wb); 4829 4830 /* 4831 * Pick the slot to use. If there is already a slot for @wb, keep 4832 * using it. If not replace the oldest one which isn't being 4833 * written out. 4834 */ 4835 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4836 frn = &memcg->cgwb_frn[i]; 4837 if (frn->bdi_id == wb->bdi->id && 4838 frn->memcg_id == wb->memcg_css->id) 4839 break; 4840 if (time_before64(frn->at, oldest_at) && 4841 atomic_read(&frn->done.cnt) == 1) { 4842 oldest = i; 4843 oldest_at = frn->at; 4844 } 4845 } 4846 4847 if (i < MEMCG_CGWB_FRN_CNT) { 4848 /* 4849 * Re-using an existing one. Update timestamp lazily to 4850 * avoid making the cacheline hot. We want them to be 4851 * reasonably up-to-date and significantly shorter than 4852 * dirty_expire_interval as that's what expires the record. 4853 * Use the shorter of 1s and dirty_expire_interval / 8. 4854 */ 4855 unsigned long update_intv = 4856 min_t(unsigned long, HZ, 4857 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4858 4859 if (time_before64(frn->at, now - update_intv)) 4860 frn->at = now; 4861 } else if (oldest >= 0) { 4862 /* replace the oldest free one */ 4863 frn = &memcg->cgwb_frn[oldest]; 4864 frn->bdi_id = wb->bdi->id; 4865 frn->memcg_id = wb->memcg_css->id; 4866 frn->at = now; 4867 } 4868 } 4869 4870 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4871 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4872 { 4873 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4874 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4875 u64 now = jiffies_64; 4876 int i; 4877 4878 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4879 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4880 4881 /* 4882 * If the record is older than dirty_expire_interval, 4883 * writeback on it has already started. No need to kick it 4884 * off again. Also, don't start a new one if there's 4885 * already one in flight. 
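 * (A done.cnt of 1 is the wb_completion's initial value, i.e. no flush
 * queued through this record is still in flight.)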
4886 */ 4887 if (time_after64(frn->at, now - intv) && 4888 atomic_read(&frn->done.cnt) == 1) { 4889 frn->at = 0; 4890 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4891 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 4892 WB_REASON_FOREIGN_FLUSH, 4893 &frn->done); 4894 } 4895 } 4896 } 4897 4898 #else /* CONFIG_CGROUP_WRITEBACK */ 4899 4900 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4901 { 4902 return 0; 4903 } 4904 4905 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4906 { 4907 } 4908 4909 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4910 { 4911 } 4912 4913 #endif /* CONFIG_CGROUP_WRITEBACK */ 4914 4915 /* 4916 * DO NOT USE IN NEW FILES. 4917 * 4918 * "cgroup.event_control" implementation. 4919 * 4920 * This is way over-engineered. It tries to support fully configurable 4921 * events for each user. Such level of flexibility is completely 4922 * unnecessary especially in the light of the planned unified hierarchy. 4923 * 4924 * Please deprecate this and replace with something simpler if at all 4925 * possible. 4926 */ 4927 4928 /* 4929 * Unregister event and free resources. 4930 * 4931 * Gets called from workqueue. 4932 */ 4933 static void memcg_event_remove(struct work_struct *work) 4934 { 4935 struct mem_cgroup_event *event = 4936 container_of(work, struct mem_cgroup_event, remove); 4937 struct mem_cgroup *memcg = event->memcg; 4938 4939 remove_wait_queue(event->wqh, &event->wait); 4940 4941 event->unregister_event(memcg, event->eventfd); 4942 4943 /* Notify userspace the event is going away. */ 4944 eventfd_signal(event->eventfd, 1); 4945 4946 eventfd_ctx_put(event->eventfd); 4947 kfree(event); 4948 css_put(&memcg->css); 4949 } 4950 4951 /* 4952 * Gets called on EPOLLHUP on eventfd when user closes it. 4953 * 4954 * Called with wqh->lock held and interrupts disabled. 4955 */ 4956 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4957 int sync, void *key) 4958 { 4959 struct mem_cgroup_event *event = 4960 container_of(wait, struct mem_cgroup_event, wait); 4961 struct mem_cgroup *memcg = event->memcg; 4962 __poll_t flags = key_to_poll(key); 4963 4964 if (flags & EPOLLHUP) { 4965 /* 4966 * If the event has been detached at cgroup removal, we 4967 * can simply return knowing the other side will cleanup 4968 * for us. 4969 * 4970 * We can't race against event freeing since the other 4971 * side will require wqh->lock via remove_wait_queue(), 4972 * which we hold. 4973 */ 4974 spin_lock(&memcg->event_list_lock); 4975 if (!list_empty(&event->list)) { 4976 list_del_init(&event->list); 4977 /* 4978 * We are in atomic context, but cgroup_event_remove() 4979 * may sleep, so we have to call it in workqueue. 4980 */ 4981 schedule_work(&event->remove); 4982 } 4983 spin_unlock(&memcg->event_list_lock); 4984 } 4985 4986 return 0; 4987 } 4988 4989 static void memcg_event_ptable_queue_proc(struct file *file, 4990 wait_queue_head_t *wqh, poll_table *pt) 4991 { 4992 struct mem_cgroup_event *event = 4993 container_of(pt, struct mem_cgroup_event, pt); 4994 4995 event->wqh = wqh; 4996 add_wait_queue(wqh, &event->wait); 4997 } 4998 4999 /* 5000 * DO NOT USE IN NEW FILES. 5001 * 5002 * Parse input and register new cgroup event handler. 5003 * 5004 * Input must be in format '<event_fd> <control_fd> <args>'. 5005 * Interpretation of args is defined by control file implementation. 
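 *
 * A minimal userspace sketch (error handling and includes omitted; the
 * cgroup v1 mount point and the 100M threshold are illustrative only):
 *
 *	uint64_t ticks;
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ecfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
 *			O_WRONLY);
 *
 *	dprintf(ecfd, "%d %d %llu", efd, cfd, 100ULL << 20);
 *	read(efd, &ticks, sizeof(ticks));	// blocks until usage crosses 100M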
5006 */ 5007 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 5008 char *buf, size_t nbytes, loff_t off) 5009 { 5010 struct cgroup_subsys_state *css = of_css(of); 5011 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5012 struct mem_cgroup_event *event; 5013 struct cgroup_subsys_state *cfile_css; 5014 unsigned int efd, cfd; 5015 struct fd efile; 5016 struct fd cfile; 5017 struct dentry *cdentry; 5018 const char *name; 5019 char *endp; 5020 int ret; 5021 5022 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 5023 return -EOPNOTSUPP; 5024 5025 buf = strstrip(buf); 5026 5027 efd = simple_strtoul(buf, &endp, 10); 5028 if (*endp != ' ') 5029 return -EINVAL; 5030 buf = endp + 1; 5031 5032 cfd = simple_strtoul(buf, &endp, 10); 5033 if ((*endp != ' ') && (*endp != '\0')) 5034 return -EINVAL; 5035 buf = endp + 1; 5036 5037 event = kzalloc(sizeof(*event), GFP_KERNEL); 5038 if (!event) 5039 return -ENOMEM; 5040 5041 event->memcg = memcg; 5042 INIT_LIST_HEAD(&event->list); 5043 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 5044 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 5045 INIT_WORK(&event->remove, memcg_event_remove); 5046 5047 efile = fdget(efd); 5048 if (!efile.file) { 5049 ret = -EBADF; 5050 goto out_kfree; 5051 } 5052 5053 event->eventfd = eventfd_ctx_fileget(efile.file); 5054 if (IS_ERR(event->eventfd)) { 5055 ret = PTR_ERR(event->eventfd); 5056 goto out_put_efile; 5057 } 5058 5059 cfile = fdget(cfd); 5060 if (!cfile.file) { 5061 ret = -EBADF; 5062 goto out_put_eventfd; 5063 } 5064 5065 /* the process need read permission on control file */ 5066 /* AV: shouldn't we check that it's been opened for read instead? */ 5067 ret = file_permission(cfile.file, MAY_READ); 5068 if (ret < 0) 5069 goto out_put_cfile; 5070 5071 /* 5072 * The control file must be a regular cgroup1 file. As a regular cgroup 5073 * file can't be renamed, it's safe to access its name afterwards. 5074 */ 5075 cdentry = cfile.file->f_path.dentry; 5076 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 5077 ret = -EINVAL; 5078 goto out_put_cfile; 5079 } 5080 5081 /* 5082 * Determine the event callbacks and set them in @event. This used 5083 * to be done via struct cftype but cgroup core no longer knows 5084 * about these events. The following is crude but the whole thing 5085 * is for compatibility anyway. 5086 * 5087 * DO NOT ADD NEW FILES. 5088 */ 5089 name = cdentry->d_name.name; 5090 5091 if (!strcmp(name, "memory.usage_in_bytes")) { 5092 event->register_event = mem_cgroup_usage_register_event; 5093 event->unregister_event = mem_cgroup_usage_unregister_event; 5094 } else if (!strcmp(name, "memory.oom_control")) { 5095 event->register_event = mem_cgroup_oom_register_event; 5096 event->unregister_event = mem_cgroup_oom_unregister_event; 5097 } else if (!strcmp(name, "memory.pressure_level")) { 5098 event->register_event = vmpressure_register_event; 5099 event->unregister_event = vmpressure_unregister_event; 5100 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 5101 event->register_event = memsw_cgroup_usage_register_event; 5102 event->unregister_event = memsw_cgroup_usage_unregister_event; 5103 } else { 5104 ret = -EINVAL; 5105 goto out_put_cfile; 5106 } 5107 5108 /* 5109 * Verify @cfile should belong to @css. Also, remaining events are 5110 * automatically removed on cgroup destruction but the removal is 5111 * asynchronous, so take an extra ref on @css. 
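 * That reference is dropped again from memcg_event_remove().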
5112 */ 5113 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 5114 &memory_cgrp_subsys); 5115 ret = -EINVAL; 5116 if (IS_ERR(cfile_css)) 5117 goto out_put_cfile; 5118 if (cfile_css != css) { 5119 css_put(cfile_css); 5120 goto out_put_cfile; 5121 } 5122 5123 ret = event->register_event(memcg, event->eventfd, buf); 5124 if (ret) 5125 goto out_put_css; 5126 5127 vfs_poll(efile.file, &event->pt); 5128 5129 spin_lock_irq(&memcg->event_list_lock); 5130 list_add(&event->list, &memcg->event_list); 5131 spin_unlock_irq(&memcg->event_list_lock); 5132 5133 fdput(cfile); 5134 fdput(efile); 5135 5136 return nbytes; 5137 5138 out_put_css: 5139 css_put(css); 5140 out_put_cfile: 5141 fdput(cfile); 5142 out_put_eventfd: 5143 eventfd_ctx_put(event->eventfd); 5144 out_put_efile: 5145 fdput(efile); 5146 out_kfree: 5147 kfree(event); 5148 5149 return ret; 5150 } 5151 5152 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5153 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 5154 { 5155 /* 5156 * Deprecated. 5157 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 5158 */ 5159 return 0; 5160 } 5161 #endif 5162 5163 static int memory_stat_show(struct seq_file *m, void *v); 5164 5165 static struct cftype mem_cgroup_legacy_files[] = { 5166 { 5167 .name = "usage_in_bytes", 5168 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5169 .read_u64 = mem_cgroup_read_u64, 5170 }, 5171 { 5172 .name = "max_usage_in_bytes", 5173 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5174 .write = mem_cgroup_reset, 5175 .read_u64 = mem_cgroup_read_u64, 5176 }, 5177 { 5178 .name = "limit_in_bytes", 5179 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5180 .write = mem_cgroup_write, 5181 .read_u64 = mem_cgroup_read_u64, 5182 }, 5183 { 5184 .name = "soft_limit_in_bytes", 5185 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5186 .write = mem_cgroup_write, 5187 .read_u64 = mem_cgroup_read_u64, 5188 }, 5189 { 5190 .name = "failcnt", 5191 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5192 .write = mem_cgroup_reset, 5193 .read_u64 = mem_cgroup_read_u64, 5194 }, 5195 { 5196 .name = "stat", 5197 .seq_show = memory_stat_show, 5198 }, 5199 { 5200 .name = "force_empty", 5201 .write = mem_cgroup_force_empty_write, 5202 }, 5203 { 5204 .name = "use_hierarchy", 5205 .write_u64 = mem_cgroup_hierarchy_write, 5206 .read_u64 = mem_cgroup_hierarchy_read, 5207 }, 5208 { 5209 .name = "cgroup.event_control", /* XXX: for compat */ 5210 .write = memcg_write_event_control, 5211 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 5212 }, 5213 { 5214 .name = "swappiness", 5215 .read_u64 = mem_cgroup_swappiness_read, 5216 .write_u64 = mem_cgroup_swappiness_write, 5217 }, 5218 { 5219 .name = "move_charge_at_immigrate", 5220 .read_u64 = mem_cgroup_move_charge_read, 5221 .write_u64 = mem_cgroup_move_charge_write, 5222 }, 5223 { 5224 .name = "oom_control", 5225 .seq_show = mem_cgroup_oom_control_read, 5226 .write_u64 = mem_cgroup_oom_control_write, 5227 }, 5228 { 5229 .name = "pressure_level", 5230 .seq_show = mem_cgroup_dummy_seq_show, 5231 }, 5232 #ifdef CONFIG_NUMA 5233 { 5234 .name = "numa_stat", 5235 .seq_show = memcg_numa_stat_show, 5236 }, 5237 #endif 5238 { 5239 .name = "kmem.limit_in_bytes", 5240 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5241 .write = mem_cgroup_write, 5242 .read_u64 = mem_cgroup_read_u64, 5243 }, 5244 { 5245 .name = "kmem.usage_in_bytes", 5246 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5247 .read_u64 = mem_cgroup_read_u64, 5248 }, 5249 { 5250 .name = "kmem.failcnt", 5251 
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5252 .write = mem_cgroup_reset, 5253 .read_u64 = mem_cgroup_read_u64, 5254 }, 5255 { 5256 .name = "kmem.max_usage_in_bytes", 5257 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5258 .write = mem_cgroup_reset, 5259 .read_u64 = mem_cgroup_read_u64, 5260 }, 5261 #if defined(CONFIG_MEMCG_KMEM) && \ 5262 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5263 { 5264 .name = "kmem.slabinfo", 5265 .seq_show = mem_cgroup_slab_show, 5266 }, 5267 #endif 5268 { 5269 .name = "kmem.tcp.limit_in_bytes", 5270 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5271 .write = mem_cgroup_write, 5272 .read_u64 = mem_cgroup_read_u64, 5273 }, 5274 { 5275 .name = "kmem.tcp.usage_in_bytes", 5276 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5277 .read_u64 = mem_cgroup_read_u64, 5278 }, 5279 { 5280 .name = "kmem.tcp.failcnt", 5281 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5282 .write = mem_cgroup_reset, 5283 .read_u64 = mem_cgroup_read_u64, 5284 }, 5285 { 5286 .name = "kmem.tcp.max_usage_in_bytes", 5287 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5288 .write = mem_cgroup_reset, 5289 .read_u64 = mem_cgroup_read_u64, 5290 }, 5291 { }, /* terminate */ 5292 }; 5293 5294 /* 5295 * Private memory cgroup IDR 5296 * 5297 * Swap-out records and page cache shadow entries need to store memcg 5298 * references in constrained space, so we maintain an ID space that is 5299 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5300 * memory-controlled cgroups to 64k. 5301 * 5302 * However, there usually are many references to the offline CSS after 5303 * the cgroup has been destroyed, such as page cache or reclaimable 5304 * slab objects, that don't need to hang on to the ID. We want to keep 5305 * those dead CSS from occupying IDs, or we might quickly exhaust the 5306 * relatively small ID space and prevent the creation of new cgroups 5307 * even when there are much fewer than 64k cgroups - possibly none. 5308 * 5309 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5310 * be freed and recycled when it's no longer needed, which is usually 5311 * when the CSS is offlined. 5312 * 5313 * The only exception to that are records of swapped out tmpfs/shmem 5314 * pages that need to be attributed to live ancestors on swapin. But 5315 * those references are manageable from userspace. 5316 */ 5317 5318 #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) 5319 static DEFINE_IDR(mem_cgroup_idr); 5320 5321 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5322 { 5323 if (memcg->id.id > 0) { 5324 idr_remove(&mem_cgroup_idr, memcg->id.id); 5325 memcg->id.id = 0; 5326 } 5327 } 5328 5329 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5330 unsigned int n) 5331 { 5332 refcount_add(n, &memcg->id.ref); 5333 } 5334 5335 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5336 { 5337 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5338 mem_cgroup_id_remove(memcg); 5339 5340 /* Memcg ID pins CSS */ 5341 css_put(&memcg->css); 5342 } 5343 } 5344 5345 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5346 { 5347 mem_cgroup_id_put_many(memcg, 1); 5348 } 5349 5350 /** 5351 * mem_cgroup_from_id - look up a memcg from a memcg id 5352 * @id: the memcg id to look up 5353 * 5354 * Caller must hold rcu_read_lock(). 
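 *
 * A typical lookup takes its own reference before dropping the RCU read
 * lock (a sketch, assuming the caller wants an online memcg):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();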
5355 */ 5356 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5357 { 5358 WARN_ON_ONCE(!rcu_read_lock_held()); 5359 return idr_find(&mem_cgroup_idr, id); 5360 } 5361 5362 #ifdef CONFIG_SHRINKER_DEBUG 5363 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 5364 { 5365 struct cgroup *cgrp; 5366 struct cgroup_subsys_state *css; 5367 struct mem_cgroup *memcg; 5368 5369 cgrp = cgroup_get_from_id(ino); 5370 if (IS_ERR(cgrp)) 5371 return ERR_CAST(cgrp); 5372 5373 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 5374 if (css) 5375 memcg = container_of(css, struct mem_cgroup, css); 5376 else 5377 memcg = ERR_PTR(-ENOENT); 5378 5379 cgroup_put(cgrp); 5380 5381 return memcg; 5382 } 5383 #endif 5384 5385 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5386 { 5387 struct mem_cgroup_per_node *pn; 5388 5389 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 5390 if (!pn) 5391 return 1; 5392 5393 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 5394 GFP_KERNEL_ACCOUNT); 5395 if (!pn->lruvec_stats_percpu) { 5396 kfree(pn); 5397 return 1; 5398 } 5399 5400 lruvec_init(&pn->lruvec); 5401 pn->memcg = memcg; 5402 5403 memcg->nodeinfo[node] = pn; 5404 return 0; 5405 } 5406 5407 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5408 { 5409 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5410 5411 if (!pn) 5412 return; 5413 5414 free_percpu(pn->lruvec_stats_percpu); 5415 kfree(pn); 5416 } 5417 5418 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5419 { 5420 int node; 5421 5422 if (memcg->orig_objcg) 5423 obj_cgroup_put(memcg->orig_objcg); 5424 5425 for_each_node(node) 5426 free_mem_cgroup_per_node_info(memcg, node); 5427 kfree(memcg->vmstats); 5428 free_percpu(memcg->vmstats_percpu); 5429 kfree(memcg); 5430 } 5431 5432 static void mem_cgroup_free(struct mem_cgroup *memcg) 5433 { 5434 lru_gen_exit_memcg(memcg); 5435 memcg_wb_domain_exit(memcg); 5436 __mem_cgroup_free(memcg); 5437 } 5438 5439 static struct mem_cgroup *mem_cgroup_alloc(void) 5440 { 5441 struct mem_cgroup *memcg; 5442 int node; 5443 int __maybe_unused i; 5444 long error = -ENOMEM; 5445 5446 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 5447 if (!memcg) 5448 return ERR_PTR(error); 5449 5450 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5451 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); 5452 if (memcg->id.id < 0) { 5453 error = memcg->id.id; 5454 goto fail; 5455 } 5456 5457 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); 5458 if (!memcg->vmstats) 5459 goto fail; 5460 5461 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5462 GFP_KERNEL_ACCOUNT); 5463 if (!memcg->vmstats_percpu) 5464 goto fail; 5465 5466 for_each_node(node) 5467 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5468 goto fail; 5469 5470 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5471 goto fail; 5472 5473 INIT_WORK(&memcg->high_work, high_work_func); 5474 INIT_LIST_HEAD(&memcg->oom_notify); 5475 mutex_init(&memcg->thresholds_lock); 5476 spin_lock_init(&memcg->move_lock); 5477 vmpressure_init(&memcg->vmpressure); 5478 INIT_LIST_HEAD(&memcg->event_list); 5479 spin_lock_init(&memcg->event_list_lock); 5480 memcg->socket_pressure = jiffies; 5481 #ifdef CONFIG_MEMCG_KMEM 5482 memcg->kmemcg_id = -1; 5483 INIT_LIST_HEAD(&memcg->objcg_list); 5484 #endif 5485 #ifdef CONFIG_CGROUP_WRITEBACK 5486 INIT_LIST_HEAD(&memcg->cgwb_list); 5487 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5488 memcg->cgwb_frn[i].done = 5489 
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5490 #endif 5491 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5492 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5493 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5494 memcg->deferred_split_queue.split_queue_len = 0; 5495 #endif 5496 lru_gen_init_memcg(memcg); 5497 return memcg; 5498 fail: 5499 mem_cgroup_id_remove(memcg); 5500 __mem_cgroup_free(memcg); 5501 return ERR_PTR(error); 5502 } 5503 5504 static struct cgroup_subsys_state * __ref 5505 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5506 { 5507 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5508 struct mem_cgroup *memcg, *old_memcg; 5509 5510 old_memcg = set_active_memcg(parent); 5511 memcg = mem_cgroup_alloc(); 5512 set_active_memcg(old_memcg); 5513 if (IS_ERR(memcg)) 5514 return ERR_CAST(memcg); 5515 5516 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5517 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 5518 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) 5519 memcg->zswap_max = PAGE_COUNTER_MAX; 5520 #endif 5521 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5522 if (parent) { 5523 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); 5524 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); 5525 5526 page_counter_init(&memcg->memory, &parent->memory); 5527 page_counter_init(&memcg->swap, &parent->swap); 5528 page_counter_init(&memcg->kmem, &parent->kmem); 5529 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5530 } else { 5531 init_memcg_events(); 5532 page_counter_init(&memcg->memory, NULL); 5533 page_counter_init(&memcg->swap, NULL); 5534 page_counter_init(&memcg->kmem, NULL); 5535 page_counter_init(&memcg->tcpmem, NULL); 5536 5537 root_mem_cgroup = memcg; 5538 return &memcg->css; 5539 } 5540 5541 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5542 static_branch_inc(&memcg_sockets_enabled_key); 5543 5544 #if defined(CONFIG_MEMCG_KMEM) 5545 if (!cgroup_memory_nobpf) 5546 static_branch_inc(&memcg_bpf_enabled_key); 5547 #endif 5548 5549 return &memcg->css; 5550 } 5551 5552 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5553 { 5554 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5555 5556 if (memcg_online_kmem(memcg)) 5557 goto remove_id; 5558 5559 /* 5560 * A memcg must be visible for expand_shrinker_info() 5561 * by the time the maps are allocated. So, we allocate maps 5562 * here, when for_each_mem_cgroup() can't skip it. 5563 */ 5564 if (alloc_shrinker_info(memcg)) 5565 goto offline_kmem; 5566 5567 if (unlikely(mem_cgroup_is_root(memcg))) 5568 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 5569 FLUSH_TIME); 5570 lru_gen_online_memcg(memcg); 5571 5572 /* Online state pins memcg ID, memcg ID pins CSS */ 5573 refcount_set(&memcg->id.ref, 1); 5574 css_get(css); 5575 5576 /* 5577 * Ensure mem_cgroup_from_id() works once we're fully online. 5578 * 5579 * We could do this earlier and require callers to filter with 5580 * css_tryget_online(). But right now there are no users that 5581 * need earlier access, and the workingset code relies on the 5582 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So 5583 * publish it here at the end of onlining. This matches the 5584 * regular ID destruction during offlining. 
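 *
 * mem_cgroup_alloc() reserved the ID with a NULL pointer, so the
 * idr_replace() below is what makes mem_cgroup_from_id() resolve to
 * this memcg.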
5585 */ 5586 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5587 5588 return 0; 5589 offline_kmem: 5590 memcg_offline_kmem(memcg); 5591 remove_id: 5592 mem_cgroup_id_remove(memcg); 5593 return -ENOMEM; 5594 } 5595 5596 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5597 { 5598 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5599 struct mem_cgroup_event *event, *tmp; 5600 5601 /* 5602 * Unregister events and notify userspace. 5603 * Notify userspace about cgroup removing only after rmdir of cgroup 5604 * directory to avoid race between userspace and kernelspace. 5605 */ 5606 spin_lock_irq(&memcg->event_list_lock); 5607 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5608 list_del_init(&event->list); 5609 schedule_work(&event->remove); 5610 } 5611 spin_unlock_irq(&memcg->event_list_lock); 5612 5613 page_counter_set_min(&memcg->memory, 0); 5614 page_counter_set_low(&memcg->memory, 0); 5615 5616 memcg_offline_kmem(memcg); 5617 reparent_shrinker_deferred(memcg); 5618 wb_memcg_offline(memcg); 5619 lru_gen_offline_memcg(memcg); 5620 5621 drain_all_stock(memcg); 5622 5623 mem_cgroup_id_put(memcg); 5624 } 5625 5626 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5627 { 5628 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5629 5630 invalidate_reclaim_iterators(memcg); 5631 lru_gen_release_memcg(memcg); 5632 } 5633 5634 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5635 { 5636 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5637 int __maybe_unused i; 5638 5639 #ifdef CONFIG_CGROUP_WRITEBACK 5640 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5641 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5642 #endif 5643 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5644 static_branch_dec(&memcg_sockets_enabled_key); 5645 5646 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5647 static_branch_dec(&memcg_sockets_enabled_key); 5648 5649 #if defined(CONFIG_MEMCG_KMEM) 5650 if (!cgroup_memory_nobpf) 5651 static_branch_dec(&memcg_bpf_enabled_key); 5652 #endif 5653 5654 vmpressure_cleanup(&memcg->vmpressure); 5655 cancel_work_sync(&memcg->high_work); 5656 mem_cgroup_remove_from_trees(memcg); 5657 free_shrinker_info(memcg); 5658 mem_cgroup_free(memcg); 5659 } 5660 5661 /** 5662 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5663 * @css: the target css 5664 * 5665 * Reset the states of the mem_cgroup associated with @css. This is 5666 * invoked when the userland requests disabling on the default hierarchy 5667 * but the memcg is pinned through dependency. The memcg should stop 5668 * applying policies and should revert to the vanilla state as it may be 5669 * made visible again. 5670 * 5671 * The current implementation only resets the essential configurations. 5672 * This needs to be expanded to cover all the visible parts. 
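 * Today "essential" means the memory, swap, kmem and tcp hard limits,
 * the min/low/high protections and the soft limit, as reset below.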
5673 */ 5674 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5675 { 5676 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5677 5678 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5679 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5680 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5681 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5682 page_counter_set_min(&memcg->memory, 0); 5683 page_counter_set_low(&memcg->memory, 0); 5684 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5685 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 5686 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5687 memcg_wb_domain_size_changed(memcg); 5688 } 5689 5690 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5691 { 5692 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5693 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5694 struct memcg_vmstats_percpu *statc; 5695 long delta, delta_cpu, v; 5696 int i, nid; 5697 5698 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5699 5700 for (i = 0; i < MEMCG_NR_STAT; i++) { 5701 /* 5702 * Collect the aggregated propagation counts of groups 5703 * below us. We're in a per-cpu loop here and this is 5704 * a global counter, so the first cycle will get them. 5705 */ 5706 delta = memcg->vmstats->state_pending[i]; 5707 if (delta) 5708 memcg->vmstats->state_pending[i] = 0; 5709 5710 /* Add CPU changes on this level since the last flush */ 5711 delta_cpu = 0; 5712 v = READ_ONCE(statc->state[i]); 5713 if (v != statc->state_prev[i]) { 5714 delta_cpu = v - statc->state_prev[i]; 5715 delta += delta_cpu; 5716 statc->state_prev[i] = v; 5717 } 5718 5719 /* Aggregate counts on this level and propagate upwards */ 5720 if (delta_cpu) 5721 memcg->vmstats->state_local[i] += delta_cpu; 5722 5723 if (delta) { 5724 memcg->vmstats->state[i] += delta; 5725 if (parent) 5726 parent->vmstats->state_pending[i] += delta; 5727 } 5728 } 5729 5730 for (i = 0; i < NR_MEMCG_EVENTS; i++) { 5731 delta = memcg->vmstats->events_pending[i]; 5732 if (delta) 5733 memcg->vmstats->events_pending[i] = 0; 5734 5735 delta_cpu = 0; 5736 v = READ_ONCE(statc->events[i]); 5737 if (v != statc->events_prev[i]) { 5738 delta_cpu = v - statc->events_prev[i]; 5739 delta += delta_cpu; 5740 statc->events_prev[i] = v; 5741 } 5742 5743 if (delta_cpu) 5744 memcg->vmstats->events_local[i] += delta_cpu; 5745 5746 if (delta) { 5747 memcg->vmstats->events[i] += delta; 5748 if (parent) 5749 parent->vmstats->events_pending[i] += delta; 5750 } 5751 } 5752 5753 for_each_node_state(nid, N_MEMORY) { 5754 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 5755 struct mem_cgroup_per_node *ppn = NULL; 5756 struct lruvec_stats_percpu *lstatc; 5757 5758 if (parent) 5759 ppn = parent->nodeinfo[nid]; 5760 5761 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 5762 5763 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 5764 delta = pn->lruvec_stats.state_pending[i]; 5765 if (delta) 5766 pn->lruvec_stats.state_pending[i] = 0; 5767 5768 delta_cpu = 0; 5769 v = READ_ONCE(lstatc->state[i]); 5770 if (v != lstatc->state_prev[i]) { 5771 delta_cpu = v - lstatc->state_prev[i]; 5772 delta += delta_cpu; 5773 lstatc->state_prev[i] = v; 5774 } 5775 5776 if (delta_cpu) 5777 pn->lruvec_stats.state_local[i] += delta_cpu; 5778 5779 if (delta) { 5780 pn->lruvec_stats.state[i] += delta; 5781 if (ppn) 5782 ppn->lruvec_stats.state_pending[i] += delta; 5783 } 5784 } 5785 } 5786 } 5787 5788 #ifdef CONFIG_MMU 5789 /* Handlers for move charge at task migration. 
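 *
 * Rough flow (hedged summary): ->can_attach() precharges what it expects
 * to move via mem_cgroup_do_precharge(), the pages are then transferred
 * with mem_cgroup_move_account(), and any unused precharge is given back
 * if the attach is cancelled.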
*/ 5790 static int mem_cgroup_do_precharge(unsigned long count) 5791 { 5792 int ret; 5793 5794 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5795 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5796 if (!ret) { 5797 mc.precharge += count; 5798 return ret; 5799 } 5800 5801 /* Try charges one by one with reclaim, but do not retry */ 5802 while (count--) { 5803 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5804 if (ret) 5805 return ret; 5806 mc.precharge++; 5807 cond_resched(); 5808 } 5809 return 0; 5810 } 5811 5812 union mc_target { 5813 struct page *page; 5814 swp_entry_t ent; 5815 }; 5816 5817 enum mc_target_type { 5818 MC_TARGET_NONE = 0, 5819 MC_TARGET_PAGE, 5820 MC_TARGET_SWAP, 5821 MC_TARGET_DEVICE, 5822 }; 5823 5824 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5825 unsigned long addr, pte_t ptent) 5826 { 5827 struct page *page = vm_normal_page(vma, addr, ptent); 5828 5829 if (!page) 5830 return NULL; 5831 if (PageAnon(page)) { 5832 if (!(mc.flags & MOVE_ANON)) 5833 return NULL; 5834 } else { 5835 if (!(mc.flags & MOVE_FILE)) 5836 return NULL; 5837 } 5838 get_page(page); 5839 5840 return page; 5841 } 5842 5843 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5844 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5845 pte_t ptent, swp_entry_t *entry) 5846 { 5847 struct page *page = NULL; 5848 swp_entry_t ent = pte_to_swp_entry(ptent); 5849 5850 if (!(mc.flags & MOVE_ANON)) 5851 return NULL; 5852 5853 /* 5854 * Handle device private pages that are not accessible by the CPU, but 5855 * stored as special swap entries in the page table. 5856 */ 5857 if (is_device_private_entry(ent)) { 5858 page = pfn_swap_entry_to_page(ent); 5859 if (!get_page_unless_zero(page)) 5860 return NULL; 5861 return page; 5862 } 5863 5864 if (non_swap_entry(ent)) 5865 return NULL; 5866 5867 /* 5868 * Because swap_cache_get_folio() updates some statistics counter, 5869 * we call find_get_page() with swapper_space directly. 5870 */ 5871 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5872 entry->val = ent.val; 5873 5874 return page; 5875 } 5876 #else 5877 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5878 pte_t ptent, swp_entry_t *entry) 5879 { 5880 return NULL; 5881 } 5882 #endif 5883 5884 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5885 unsigned long addr, pte_t ptent) 5886 { 5887 unsigned long index; 5888 struct folio *folio; 5889 5890 if (!vma->vm_file) /* anonymous vma */ 5891 return NULL; 5892 if (!(mc.flags & MOVE_FILE)) 5893 return NULL; 5894 5895 /* folio is moved even if it's not RSS of this task(page-faulted). */ 5896 /* shmem/tmpfs may report page out on swap: account for that too. */ 5897 index = linear_page_index(vma, addr); 5898 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); 5899 if (IS_ERR(folio)) 5900 return NULL; 5901 return folio_file_page(folio, index); 5902 } 5903 5904 /** 5905 * mem_cgroup_move_account - move account of the page 5906 * @page: the page 5907 * @compound: charge the page as compound or small page 5908 * @from: mem_cgroup which the page is moved from. 5909 * @to: mem_cgroup which the page is moved to. @from != @to. 5910 * 5911 * The page must be locked and not on the LRU. 5912 * 5913 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5914 * from old cgroup. 
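 *
 * Return: 0 on success, -EINVAL if @page is not charged to @from.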
5915 */ 5916 static int mem_cgroup_move_account(struct page *page, 5917 bool compound, 5918 struct mem_cgroup *from, 5919 struct mem_cgroup *to) 5920 { 5921 struct folio *folio = page_folio(page); 5922 struct lruvec *from_vec, *to_vec; 5923 struct pglist_data *pgdat; 5924 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; 5925 int nid, ret; 5926 5927 VM_BUG_ON(from == to); 5928 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 5929 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 5930 VM_BUG_ON(compound && !folio_test_large(folio)); 5931 5932 ret = -EINVAL; 5933 if (folio_memcg(folio) != from) 5934 goto out; 5935 5936 pgdat = folio_pgdat(folio); 5937 from_vec = mem_cgroup_lruvec(from, pgdat); 5938 to_vec = mem_cgroup_lruvec(to, pgdat); 5939 5940 folio_memcg_lock(folio); 5941 5942 if (folio_test_anon(folio)) { 5943 if (folio_mapped(folio)) { 5944 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5945 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5946 if (folio_test_pmd_mappable(folio)) { 5947 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5948 -nr_pages); 5949 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5950 nr_pages); 5951 } 5952 } 5953 } else { 5954 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5955 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5956 5957 if (folio_test_swapbacked(folio)) { 5958 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5959 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5960 } 5961 5962 if (folio_mapped(folio)) { 5963 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5964 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5965 } 5966 5967 if (folio_test_dirty(folio)) { 5968 struct address_space *mapping = folio_mapping(folio); 5969 5970 if (mapping_can_writeback(mapping)) { 5971 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5972 -nr_pages); 5973 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5974 nr_pages); 5975 } 5976 } 5977 } 5978 5979 #ifdef CONFIG_SWAP 5980 if (folio_test_swapcache(folio)) { 5981 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); 5982 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); 5983 } 5984 #endif 5985 if (folio_test_writeback(folio)) { 5986 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5987 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5988 } 5989 5990 /* 5991 * All state has been migrated, let's switch to the new memcg. 5992 * 5993 * It is safe to change page's memcg here because the page 5994 * is referenced, charged, isolated, and locked: we can't race 5995 * with (un)charging, migration, LRU putback, or anything else 5996 * that would rely on a stable page's memory cgroup. 5997 * 5998 * Note that folio_memcg_lock is a memcg lock, not a page lock, 5999 * to save space. As soon as we switch page's memory cgroup to a 6000 * new memcg that isn't locked, the above state can change 6001 * concurrently again. Make sure we're truly done with it. 
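 *
 * The barrier below orders the stat updates above against the
 * folio->memcg_data store that publishes the new memcg.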
6002 */ 6003 smp_mb(); 6004 6005 css_get(&to->css); 6006 css_put(&from->css); 6007 6008 folio->memcg_data = (unsigned long)to; 6009 6010 __folio_memcg_unlock(from); 6011 6012 ret = 0; 6013 nid = folio_nid(folio); 6014 6015 local_irq_disable(); 6016 mem_cgroup_charge_statistics(to, nr_pages); 6017 memcg_check_events(to, nid); 6018 mem_cgroup_charge_statistics(from, -nr_pages); 6019 memcg_check_events(from, nid); 6020 local_irq_enable(); 6021 out: 6022 return ret; 6023 } 6024 6025 /** 6026 * get_mctgt_type - get target type of moving charge 6027 * @vma: the vma the pte to be checked belongs 6028 * @addr: the address corresponding to the pte to be checked 6029 * @ptent: the pte to be checked 6030 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6031 * 6032 * Context: Called with pte lock held. 6033 * Return: 6034 * * MC_TARGET_NONE - If the pte is not a target for move charge. 6035 * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for 6036 * move charge. If @target is not NULL, the page is stored in target->page 6037 * with extra refcnt taken (Caller should release it). 6038 * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a 6039 * target for charge migration. If @target is not NULL, the entry is 6040 * stored in target->ent. 6041 * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and 6042 * thus not on the lru. For now such page is charged like a regular page 6043 * would be as it is just special memory taking the place of a regular page. 6044 * See Documentations/vm/hmm.txt and include/linux/hmm.h 6045 */ 6046 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6047 unsigned long addr, pte_t ptent, union mc_target *target) 6048 { 6049 struct page *page = NULL; 6050 enum mc_target_type ret = MC_TARGET_NONE; 6051 swp_entry_t ent = { .val = 0 }; 6052 6053 if (pte_present(ptent)) 6054 page = mc_handle_present_pte(vma, addr, ptent); 6055 else if (pte_none_mostly(ptent)) 6056 /* 6057 * PTE markers should be treated as a none pte here, separated 6058 * from other swap handling below. 6059 */ 6060 page = mc_handle_file_pte(vma, addr, ptent); 6061 else if (is_swap_pte(ptent)) 6062 page = mc_handle_swap_pte(vma, ptent, &ent); 6063 6064 if (target && page) { 6065 if (!trylock_page(page)) { 6066 put_page(page); 6067 return ret; 6068 } 6069 /* 6070 * page_mapped() must be stable during the move. This 6071 * pte is locked, so if it's present, the page cannot 6072 * become unmapped. If it isn't, we have only partial 6073 * control over the mapped state: the page lock will 6074 * prevent new faults against pagecache and swapcache, 6075 * so an unmapped page cannot become mapped. However, 6076 * if the page is already mapped elsewhere, it can 6077 * unmap, and there is nothing we can do about it. 6078 * Alas, skip moving the page in this case. 6079 */ 6080 if (!pte_present(ptent) && page_mapped(page)) { 6081 unlock_page(page); 6082 put_page(page); 6083 return ret; 6084 } 6085 } 6086 6087 if (!page && !ent.val) 6088 return ret; 6089 if (page) { 6090 /* 6091 * Do only loose check w/o serialization. 6092 * mem_cgroup_move_account() checks the page is valid or 6093 * not under LRU exclusion. 
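 * The memcg seen here may thus already be stale; the authoritative
 * re-check happens in mem_cgroup_move_account() once the page has been
 * isolated and locked.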
6094 */ 6095 if (page_memcg(page) == mc.from) { 6096 ret = MC_TARGET_PAGE; 6097 if (is_device_private_page(page) || 6098 is_device_coherent_page(page)) 6099 ret = MC_TARGET_DEVICE; 6100 if (target) 6101 target->page = page; 6102 } 6103 if (!ret || !target) { 6104 if (target) 6105 unlock_page(page); 6106 put_page(page); 6107 } 6108 } 6109 /* 6110 * There is a swap entry and a page doesn't exist or isn't charged. 6111 * But we cannot move a tail-page in a THP. 6112 */ 6113 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 6114 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 6115 ret = MC_TARGET_SWAP; 6116 if (target) 6117 target->ent = ent; 6118 } 6119 return ret; 6120 } 6121 6122 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6123 /* 6124 * We don't consider PMD mapped swapping or file mapped pages because THP does 6125 * not support them for now. 6126 * Caller should make sure that pmd_trans_huge(pmd) is true. 6127 */ 6128 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6129 unsigned long addr, pmd_t pmd, union mc_target *target) 6130 { 6131 struct page *page = NULL; 6132 enum mc_target_type ret = MC_TARGET_NONE; 6133 6134 if (unlikely(is_swap_pmd(pmd))) { 6135 VM_BUG_ON(thp_migration_supported() && 6136 !is_pmd_migration_entry(pmd)); 6137 return ret; 6138 } 6139 page = pmd_page(pmd); 6140 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 6141 if (!(mc.flags & MOVE_ANON)) 6142 return ret; 6143 if (page_memcg(page) == mc.from) { 6144 ret = MC_TARGET_PAGE; 6145 if (target) { 6146 get_page(page); 6147 if (!trylock_page(page)) { 6148 put_page(page); 6149 return MC_TARGET_NONE; 6150 } 6151 target->page = page; 6152 } 6153 } 6154 return ret; 6155 } 6156 #else 6157 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6158 unsigned long addr, pmd_t pmd, union mc_target *target) 6159 { 6160 return MC_TARGET_NONE; 6161 } 6162 #endif 6163 6164 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6165 unsigned long addr, unsigned long end, 6166 struct mm_walk *walk) 6167 { 6168 struct vm_area_struct *vma = walk->vma; 6169 pte_t *pte; 6170 spinlock_t *ptl; 6171 6172 ptl = pmd_trans_huge_lock(pmd, vma); 6173 if (ptl) { 6174 /* 6175 * Note there cannot be MC_TARGET_DEVICE for now as we do not 6176 * support transparent huge pages with MEMORY_DEVICE_PRIVATE but 6177 * this might change.
6178 */ 6179 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6180 mc.precharge += HPAGE_PMD_NR; 6181 spin_unlock(ptl); 6182 return 0; 6183 } 6184 6185 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6186 if (!pte) 6187 return 0; 6188 for (; addr != end; pte++, addr += PAGE_SIZE) 6189 if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) 6190 mc.precharge++; /* increment precharge temporarily */ 6191 pte_unmap_unlock(pte - 1, ptl); 6192 cond_resched(); 6193 6194 return 0; 6195 } 6196 6197 static const struct mm_walk_ops precharge_walk_ops = { 6198 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6199 .walk_lock = PGWALK_RDLOCK, 6200 }; 6201 6202 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6203 { 6204 unsigned long precharge; 6205 6206 mmap_read_lock(mm); 6207 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 6208 mmap_read_unlock(mm); 6209 6210 precharge = mc.precharge; 6211 mc.precharge = 0; 6212 6213 return precharge; 6214 } 6215 6216 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6217 { 6218 unsigned long precharge = mem_cgroup_count_precharge(mm); 6219 6220 VM_BUG_ON(mc.moving_task); 6221 mc.moving_task = current; 6222 return mem_cgroup_do_precharge(precharge); 6223 } 6224 6225 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6226 static void __mem_cgroup_clear_mc(void) 6227 { 6228 struct mem_cgroup *from = mc.from; 6229 struct mem_cgroup *to = mc.to; 6230 6231 /* we must uncharge all the leftover precharges from mc.to */ 6232 if (mc.precharge) { 6233 mem_cgroup_cancel_charge(mc.to, mc.precharge); 6234 mc.precharge = 0; 6235 } 6236 /* 6237 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6238 * we must uncharge here. 6239 */ 6240 if (mc.moved_charge) { 6241 mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 6242 mc.moved_charge = 0; 6243 } 6244 /* we must fixup refcnts and charges */ 6245 if (mc.moved_swap) { 6246 /* uncharge swap account from the old cgroup */ 6247 if (!mem_cgroup_is_root(mc.from)) 6248 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 6249 6250 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 6251 6252 /* 6253 * we charged both to->memory and to->memsw, so we 6254 * should uncharge to->memory. 6255 */ 6256 if (!mem_cgroup_is_root(mc.to)) 6257 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 6258 6259 mc.moved_swap = 0; 6260 } 6261 memcg_oom_recover(from); 6262 memcg_oom_recover(to); 6263 wake_up_all(&mc.waitq); 6264 } 6265 6266 static void mem_cgroup_clear_mc(void) 6267 { 6268 struct mm_struct *mm = mc.mm; 6269 6270 /* 6271 * we must clear moving_task before waking up waiters at the end of 6272 * task migration. 6273 */ 6274 mc.moving_task = NULL; 6275 __mem_cgroup_clear_mc(); 6276 spin_lock(&mc.lock); 6277 mc.from = NULL; 6278 mc.to = NULL; 6279 mc.mm = NULL; 6280 spin_unlock(&mc.lock); 6281 6282 mmput(mm); 6283 } 6284 6285 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6286 { 6287 struct cgroup_subsys_state *css; 6288 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 6289 struct mem_cgroup *from; 6290 struct task_struct *leader, *p; 6291 struct mm_struct *mm; 6292 unsigned long move_flags; 6293 int ret = 0; 6294 6295 /* charge immigration isn't supported on the default hierarchy */ 6296 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 6297 return 0; 6298 6299 /* 6300 * Multi-process migrations only happen on the default hierarchy 6301 * where charge immigration is not used. 
Perform charge 6302 * immigration if @tset contains a leader and whine if there are 6303 * multiple. 6304 */ 6305 p = NULL; 6306 cgroup_taskset_for_each_leader(leader, css, tset) { 6307 WARN_ON_ONCE(p); 6308 p = leader; 6309 memcg = mem_cgroup_from_css(css); 6310 } 6311 if (!p) 6312 return 0; 6313 6314 /* 6315 * We are now committed to this value whatever it is. Changes in this 6316 * tunable will only affect upcoming migrations, not the current one. 6317 * So we need to save it, and keep it going. 6318 */ 6319 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 6320 if (!move_flags) 6321 return 0; 6322 6323 from = mem_cgroup_from_task(p); 6324 6325 VM_BUG_ON(from == memcg); 6326 6327 mm = get_task_mm(p); 6328 if (!mm) 6329 return 0; 6330 /* We move charges only when we move a owner of the mm */ 6331 if (mm->owner == p) { 6332 VM_BUG_ON(mc.from); 6333 VM_BUG_ON(mc.to); 6334 VM_BUG_ON(mc.precharge); 6335 VM_BUG_ON(mc.moved_charge); 6336 VM_BUG_ON(mc.moved_swap); 6337 6338 spin_lock(&mc.lock); 6339 mc.mm = mm; 6340 mc.from = from; 6341 mc.to = memcg; 6342 mc.flags = move_flags; 6343 spin_unlock(&mc.lock); 6344 /* We set mc.moving_task later */ 6345 6346 ret = mem_cgroup_precharge_mc(mm); 6347 if (ret) 6348 mem_cgroup_clear_mc(); 6349 } else { 6350 mmput(mm); 6351 } 6352 return ret; 6353 } 6354 6355 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6356 { 6357 if (mc.to) 6358 mem_cgroup_clear_mc(); 6359 } 6360 6361 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6362 unsigned long addr, unsigned long end, 6363 struct mm_walk *walk) 6364 { 6365 int ret = 0; 6366 struct vm_area_struct *vma = walk->vma; 6367 pte_t *pte; 6368 spinlock_t *ptl; 6369 enum mc_target_type target_type; 6370 union mc_target target; 6371 struct page *page; 6372 6373 ptl = pmd_trans_huge_lock(pmd, vma); 6374 if (ptl) { 6375 if (mc.precharge < HPAGE_PMD_NR) { 6376 spin_unlock(ptl); 6377 return 0; 6378 } 6379 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6380 if (target_type == MC_TARGET_PAGE) { 6381 page = target.page; 6382 if (isolate_lru_page(page)) { 6383 if (!mem_cgroup_move_account(page, true, 6384 mc.from, mc.to)) { 6385 mc.precharge -= HPAGE_PMD_NR; 6386 mc.moved_charge += HPAGE_PMD_NR; 6387 } 6388 putback_lru_page(page); 6389 } 6390 unlock_page(page); 6391 put_page(page); 6392 } else if (target_type == MC_TARGET_DEVICE) { 6393 page = target.page; 6394 if (!mem_cgroup_move_account(page, true, 6395 mc.from, mc.to)) { 6396 mc.precharge -= HPAGE_PMD_NR; 6397 mc.moved_charge += HPAGE_PMD_NR; 6398 } 6399 unlock_page(page); 6400 put_page(page); 6401 } 6402 spin_unlock(ptl); 6403 return 0; 6404 } 6405 6406 retry: 6407 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6408 if (!pte) 6409 return 0; 6410 for (; addr != end; addr += PAGE_SIZE) { 6411 pte_t ptent = ptep_get(pte++); 6412 bool device = false; 6413 swp_entry_t ent; 6414 6415 if (!mc.precharge) 6416 break; 6417 6418 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6419 case MC_TARGET_DEVICE: 6420 device = true; 6421 fallthrough; 6422 case MC_TARGET_PAGE: 6423 page = target.page; 6424 /* 6425 * We can have a part of the split pmd here. Moving it 6426 * can be done but it would be too convoluted so simply 6427 * ignore such a partial THP and keep it in original 6428 * memcg. There should be somebody mapping the head. 
6429 */ 6430 if (PageTransCompound(page)) 6431 goto put; 6432 if (!device && !isolate_lru_page(page)) 6433 goto put; 6434 if (!mem_cgroup_move_account(page, false, 6435 mc.from, mc.to)) { 6436 mc.precharge--; 6437 /* we uncharge from mc.from later. */ 6438 mc.moved_charge++; 6439 } 6440 if (!device) 6441 putback_lru_page(page); 6442 put: /* get_mctgt_type() gets & locks the page */ 6443 unlock_page(page); 6444 put_page(page); 6445 break; 6446 case MC_TARGET_SWAP: 6447 ent = target.ent; 6448 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6449 mc.precharge--; 6450 mem_cgroup_id_get_many(mc.to, 1); 6451 /* we fixup other refcnts and charges later. */ 6452 mc.moved_swap++; 6453 } 6454 break; 6455 default: 6456 break; 6457 } 6458 } 6459 pte_unmap_unlock(pte - 1, ptl); 6460 cond_resched(); 6461 6462 if (addr != end) { 6463 /* 6464 * We have consumed all precharges we got in can_attach(). 6465 * We try to charge one by one, but don't do any additional 6466 * charges to mc.to if we have failed to charge once in the 6467 * attach() phase. 6468 */ 6469 ret = mem_cgroup_do_precharge(1); 6470 if (!ret) 6471 goto retry; 6472 } 6473 6474 return ret; 6475 } 6476 6477 static const struct mm_walk_ops charge_walk_ops = { 6478 .pmd_entry = mem_cgroup_move_charge_pte_range, 6479 .walk_lock = PGWALK_RDLOCK, 6480 }; 6481 6482 static void mem_cgroup_move_charge(void) 6483 { 6484 lru_add_drain_all(); 6485 /* 6486 * Signal folio_memcg_lock() to take the memcg's move_lock 6487 * while we're moving its pages to another memcg. Then wait 6488 * for already started RCU-only updates to finish. 6489 */ 6490 atomic_inc(&mc.from->moving_account); 6491 synchronize_rcu(); 6492 retry: 6493 if (unlikely(!mmap_read_trylock(mc.mm))) { 6494 /* 6495 * Someone holding the mmap_lock might be waiting on the 6496 * waitq. So we cancel all extra charges, wake up all waiters, 6497 * and retry. Because we cancel precharges, we might not be able 6498 * to move enough charges, but moving charge is a best-effort 6499 * feature anyway, so it wouldn't be a big problem. 6500 */ 6501 __mem_cgroup_clear_mc(); 6502 cond_resched(); 6503 goto retry; 6504 } 6505 /* 6506 * When we have consumed all precharges and failed to get any 6507 * additional charge, the page walk just aborts. 6508 */ 6509 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 6510 mmap_read_unlock(mc.mm); 6511 atomic_dec(&mc.from->moving_account); 6512 } 6513 6514 static void mem_cgroup_move_task(void) 6515 { 6516 if (mc.to) { 6517 mem_cgroup_move_charge(); 6518 mem_cgroup_clear_mc(); 6519 } 6520 } 6521 6522 #else /* !CONFIG_MMU */ 6523 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6524 { 6525 return 0; 6526 } 6527 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6528 { 6529 } 6530 static void mem_cgroup_move_task(void) 6531 { 6532 } 6533 #endif 6534 6535 #ifdef CONFIG_MEMCG_KMEM 6536 static void mem_cgroup_fork(struct task_struct *task) 6537 { 6538 /* 6539 * Set the update flag to cause task->objcg to be initialized lazily 6540 * on the first allocation. It can be done without any synchronization 6541 * because it's always performed on the current task, as is 6542 * current_objcg_update().
6543 */ 6544 task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG; 6545 } 6546 6547 static void mem_cgroup_exit(struct task_struct *task) 6548 { 6549 struct obj_cgroup *objcg = task->objcg; 6550 6551 objcg = (struct obj_cgroup *) 6552 ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); 6553 if (objcg) 6554 obj_cgroup_put(objcg); 6555 6556 /* 6557 * Some kernel allocations can happen after this point, 6558 * but let's ignore them. It can be done without any synchronization 6559 * because it's always performed on the current task, so does 6560 * current_objcg_update(). 6561 */ 6562 task->objcg = NULL; 6563 } 6564 #endif 6565 6566 #ifdef CONFIG_LRU_GEN 6567 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) 6568 { 6569 struct task_struct *task; 6570 struct cgroup_subsys_state *css; 6571 6572 /* find the first leader if there is any */ 6573 cgroup_taskset_for_each_leader(task, css, tset) 6574 break; 6575 6576 if (!task) 6577 return; 6578 6579 task_lock(task); 6580 if (task->mm && READ_ONCE(task->mm->owner) == task) 6581 lru_gen_migrate_mm(task->mm); 6582 task_unlock(task); 6583 } 6584 #else 6585 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} 6586 #endif /* CONFIG_LRU_GEN */ 6587 6588 #ifdef CONFIG_MEMCG_KMEM 6589 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) 6590 { 6591 struct task_struct *task; 6592 struct cgroup_subsys_state *css; 6593 6594 cgroup_taskset_for_each(task, css, tset) { 6595 /* atomically set the update bit */ 6596 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); 6597 } 6598 } 6599 #else 6600 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {} 6601 #endif /* CONFIG_MEMCG_KMEM */ 6602 6603 #if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) 6604 static void mem_cgroup_attach(struct cgroup_taskset *tset) 6605 { 6606 mem_cgroup_lru_gen_attach(tset); 6607 mem_cgroup_kmem_attach(tset); 6608 } 6609 #endif 6610 6611 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6612 { 6613 if (value == PAGE_COUNTER_MAX) 6614 seq_puts(m, "max\n"); 6615 else 6616 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6617 6618 return 0; 6619 } 6620 6621 static u64 memory_current_read(struct cgroup_subsys_state *css, 6622 struct cftype *cft) 6623 { 6624 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6625 6626 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6627 } 6628 6629 static u64 memory_peak_read(struct cgroup_subsys_state *css, 6630 struct cftype *cft) 6631 { 6632 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6633 6634 return (u64)memcg->memory.watermark * PAGE_SIZE; 6635 } 6636 6637 static int memory_min_show(struct seq_file *m, void *v) 6638 { 6639 return seq_puts_memcg_tunable(m, 6640 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6641 } 6642 6643 static ssize_t memory_min_write(struct kernfs_open_file *of, 6644 char *buf, size_t nbytes, loff_t off) 6645 { 6646 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6647 unsigned long min; 6648 int err; 6649 6650 buf = strstrip(buf); 6651 err = page_counter_memparse(buf, "max", &min); 6652 if (err) 6653 return err; 6654 6655 page_counter_set_min(&memcg->memory, min); 6656 6657 return nbytes; 6658 } 6659 6660 static int memory_low_show(struct seq_file *m, void *v) 6661 { 6662 return seq_puts_memcg_tunable(m, 6663 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6664 } 6665 6666 static ssize_t memory_low_write(struct kernfs_open_file *of, 6667 char *buf, size_t nbytes, loff_t off) 6668 { 6669 
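/*
 * The input is parsed by page_counter_memparse(): either the literal
 * "max" or a byte count with an optional K/M/G suffix, e.g. writing
 * "512M" to memory.low from userspace (illustrative value).
 */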
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6670 unsigned long low; 6671 int err; 6672 6673 buf = strstrip(buf); 6674 err = page_counter_memparse(buf, "max", &low); 6675 if (err) 6676 return err; 6677 6678 page_counter_set_low(&memcg->memory, low); 6679 6680 return nbytes; 6681 } 6682 6683 static int memory_high_show(struct seq_file *m, void *v) 6684 { 6685 return seq_puts_memcg_tunable(m, 6686 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6687 } 6688 6689 static ssize_t memory_high_write(struct kernfs_open_file *of, 6690 char *buf, size_t nbytes, loff_t off) 6691 { 6692 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6693 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6694 bool drained = false; 6695 unsigned long high; 6696 int err; 6697 6698 buf = strstrip(buf); 6699 err = page_counter_memparse(buf, "max", &high); 6700 if (err) 6701 return err; 6702 6703 page_counter_set_high(&memcg->memory, high); 6704 6705 for (;;) { 6706 unsigned long nr_pages = page_counter_read(&memcg->memory); 6707 unsigned long reclaimed; 6708 6709 if (nr_pages <= high) 6710 break; 6711 6712 if (signal_pending(current)) 6713 break; 6714 6715 if (!drained) { 6716 drain_all_stock(memcg); 6717 drained = true; 6718 continue; 6719 } 6720 6721 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6722 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 6723 6724 if (!reclaimed && !nr_retries--) 6725 break; 6726 } 6727 6728 memcg_wb_domain_size_changed(memcg); 6729 return nbytes; 6730 } 6731 6732 static int memory_max_show(struct seq_file *m, void *v) 6733 { 6734 return seq_puts_memcg_tunable(m, 6735 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6736 } 6737 6738 static ssize_t memory_max_write(struct kernfs_open_file *of, 6739 char *buf, size_t nbytes, loff_t off) 6740 { 6741 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6742 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6743 bool drained = false; 6744 unsigned long max; 6745 int err; 6746 6747 buf = strstrip(buf); 6748 err = page_counter_memparse(buf, "max", &max); 6749 if (err) 6750 return err; 6751 6752 xchg(&memcg->memory.max, max); 6753 6754 for (;;) { 6755 unsigned long nr_pages = page_counter_read(&memcg->memory); 6756 6757 if (nr_pages <= max) 6758 break; 6759 6760 if (signal_pending(current)) 6761 break; 6762 6763 if (!drained) { 6764 drain_all_stock(memcg); 6765 drained = true; 6766 continue; 6767 } 6768 6769 if (nr_reclaims) { 6770 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6771 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 6772 nr_reclaims--; 6773 continue; 6774 } 6775 6776 memcg_memory_event(memcg, MEMCG_OOM); 6777 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6778 break; 6779 } 6780 6781 memcg_wb_domain_size_changed(memcg); 6782 return nbytes; 6783 } 6784 6785 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6786 { 6787 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6788 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6789 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6790 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6791 seq_printf(m, "oom_kill %lu\n", 6792 atomic_long_read(&events[MEMCG_OOM_KILL])); 6793 seq_printf(m, "oom_group_kill %lu\n", 6794 atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); 6795 } 6796 6797 static int memory_events_show(struct seq_file *m, void *v) 6798 { 6799 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6800 6801 __memory_events_show(m, memcg->memory_events); 6802 
return 0; 6803 } 6804 6805 static int memory_events_local_show(struct seq_file *m, void *v) 6806 { 6807 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6808 6809 __memory_events_show(m, memcg->memory_events_local); 6810 return 0; 6811 } 6812 6813 static int memory_stat_show(struct seq_file *m, void *v) 6814 { 6815 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6816 char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 6817 struct seq_buf s; 6818 6819 if (!buf) 6820 return -ENOMEM; 6821 seq_buf_init(&s, buf, PAGE_SIZE); 6822 memory_stat_format(memcg, &s); 6823 seq_puts(m, buf); 6824 kfree(buf); 6825 return 0; 6826 } 6827 6828 #ifdef CONFIG_NUMA 6829 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6830 int item) 6831 { 6832 return lruvec_page_state(lruvec, item) * 6833 memcg_page_state_output_unit(item); 6834 } 6835 6836 static int memory_numa_stat_show(struct seq_file *m, void *v) 6837 { 6838 int i; 6839 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6840 6841 mem_cgroup_flush_stats(); 6842 6843 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6844 int nid; 6845 6846 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6847 continue; 6848 6849 seq_printf(m, "%s", memory_stats[i].name); 6850 for_each_node_state(nid, N_MEMORY) { 6851 u64 size; 6852 struct lruvec *lruvec; 6853 6854 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6855 size = lruvec_page_state_output(lruvec, 6856 memory_stats[i].idx); 6857 seq_printf(m, " N%d=%llu", nid, size); 6858 } 6859 seq_putc(m, '\n'); 6860 } 6861 6862 return 0; 6863 } 6864 #endif 6865 6866 static int memory_oom_group_show(struct seq_file *m, void *v) 6867 { 6868 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6869 6870 seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group)); 6871 6872 return 0; 6873 } 6874 6875 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6876 char *buf, size_t nbytes, loff_t off) 6877 { 6878 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6879 int ret, oom_group; 6880 6881 buf = strstrip(buf); 6882 if (!buf) 6883 return -EINVAL; 6884 6885 ret = kstrtoint(buf, 0, &oom_group); 6886 if (ret) 6887 return ret; 6888 6889 if (oom_group != 0 && oom_group != 1) 6890 return -EINVAL; 6891 6892 WRITE_ONCE(memcg->oom_group, oom_group); 6893 6894 return nbytes; 6895 } 6896 6897 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 6898 size_t nbytes, loff_t off) 6899 { 6900 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6901 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6902 unsigned long nr_to_reclaim, nr_reclaimed = 0; 6903 unsigned int reclaim_options; 6904 int err; 6905 6906 buf = strstrip(buf); 6907 err = page_counter_memparse(buf, "", &nr_to_reclaim); 6908 if (err) 6909 return err; 6910 6911 reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; 6912 while (nr_reclaimed < nr_to_reclaim) { 6913 unsigned long reclaimed; 6914 6915 if (signal_pending(current)) 6916 return -EINTR; 6917 6918 /* 6919 * This is the final attempt, drain percpu lru caches in the 6920 * hope of introducing more evictable pages for 6921 * try_to_free_mem_cgroup_pages(). 
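 *
 * For illustration, userspace drives this through the cgroup2
 * memory.reclaim file, e.g. "echo 256M > memory.reclaim"; if no
 * progress is made once the retries below are exhausted, the write
 * fails with -EAGAIN.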
6922 */ 6923 if (!nr_retries) 6924 lru_add_drain_all(); 6925 6926 reclaimed = try_to_free_mem_cgroup_pages(memcg, 6927 min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), 6928 GFP_KERNEL, reclaim_options); 6929 6930 if (!reclaimed && !nr_retries--) 6931 return -EAGAIN; 6932 6933 nr_reclaimed += reclaimed; 6934 } 6935 6936 return nbytes; 6937 } 6938 6939 static struct cftype memory_files[] = { 6940 { 6941 .name = "current", 6942 .flags = CFTYPE_NOT_ON_ROOT, 6943 .read_u64 = memory_current_read, 6944 }, 6945 { 6946 .name = "peak", 6947 .flags = CFTYPE_NOT_ON_ROOT, 6948 .read_u64 = memory_peak_read, 6949 }, 6950 { 6951 .name = "min", 6952 .flags = CFTYPE_NOT_ON_ROOT, 6953 .seq_show = memory_min_show, 6954 .write = memory_min_write, 6955 }, 6956 { 6957 .name = "low", 6958 .flags = CFTYPE_NOT_ON_ROOT, 6959 .seq_show = memory_low_show, 6960 .write = memory_low_write, 6961 }, 6962 { 6963 .name = "high", 6964 .flags = CFTYPE_NOT_ON_ROOT, 6965 .seq_show = memory_high_show, 6966 .write = memory_high_write, 6967 }, 6968 { 6969 .name = "max", 6970 .flags = CFTYPE_NOT_ON_ROOT, 6971 .seq_show = memory_max_show, 6972 .write = memory_max_write, 6973 }, 6974 { 6975 .name = "events", 6976 .flags = CFTYPE_NOT_ON_ROOT, 6977 .file_offset = offsetof(struct mem_cgroup, events_file), 6978 .seq_show = memory_events_show, 6979 }, 6980 { 6981 .name = "events.local", 6982 .flags = CFTYPE_NOT_ON_ROOT, 6983 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6984 .seq_show = memory_events_local_show, 6985 }, 6986 { 6987 .name = "stat", 6988 .seq_show = memory_stat_show, 6989 }, 6990 #ifdef CONFIG_NUMA 6991 { 6992 .name = "numa_stat", 6993 .seq_show = memory_numa_stat_show, 6994 }, 6995 #endif 6996 { 6997 .name = "oom.group", 6998 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6999 .seq_show = memory_oom_group_show, 7000 .write = memory_oom_group_write, 7001 }, 7002 { 7003 .name = "reclaim", 7004 .flags = CFTYPE_NS_DELEGATABLE, 7005 .write = memory_reclaim, 7006 }, 7007 { } /* terminate */ 7008 }; 7009 7010 struct cgroup_subsys memory_cgrp_subsys = { 7011 .css_alloc = mem_cgroup_css_alloc, 7012 .css_online = mem_cgroup_css_online, 7013 .css_offline = mem_cgroup_css_offline, 7014 .css_released = mem_cgroup_css_released, 7015 .css_free = mem_cgroup_css_free, 7016 .css_reset = mem_cgroup_css_reset, 7017 .css_rstat_flush = mem_cgroup_css_rstat_flush, 7018 .can_attach = mem_cgroup_can_attach, 7019 #if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) 7020 .attach = mem_cgroup_attach, 7021 #endif 7022 .cancel_attach = mem_cgroup_cancel_attach, 7023 .post_attach = mem_cgroup_move_task, 7024 #ifdef CONFIG_MEMCG_KMEM 7025 .fork = mem_cgroup_fork, 7026 .exit = mem_cgroup_exit, 7027 #endif 7028 .dfl_cftypes = memory_files, 7029 .legacy_cftypes = mem_cgroup_legacy_files, 7030 .early_init = 0, 7031 }; 7032 7033 /* 7034 * This function calculates an individual cgroup's effective 7035 * protection which is derived from its own memory.min/low, its 7036 * parent's and siblings' settings, as well as the actual memory 7037 * distribution in the tree. 7038 * 7039 * The following rules apply to the effective protection values: 7040 * 7041 * 1. At the first level of reclaim, effective protection is equal to 7042 * the declared protection in memory.min and memory.low. 7043 * 7044 * 2. To enable safe delegation of the protection configuration, at 7045 * subsequent levels the effective protection is capped to the 7046 * parent's effective protection. 7047 * 7048 * 3. 
To make complex and dynamic subtrees easier to configure, the 7049 * user is allowed to overcommit the declared protection at a given 7050 * level. If that is the case, the parent's effective protection is 7051 * distributed to the children in proportion to how much protection 7052 * they have declared and how much of it they are utilizing. 7053 * 7054 * This makes distribution proportional, but also work-conserving: 7055 * if one cgroup claims much more protection than it uses memory, 7056 * the unused remainder is available to its siblings. 7057 * 7058 * 4. Conversely, when the declared protection is undercommitted at a 7059 * given level, the distribution of the larger parental protection 7060 * budget is NOT proportional. A cgroup's protection from a sibling 7061 * is capped to its own memory.min/low setting. 7062 * 7063 * 5. However, to allow protecting recursive subtrees from each other 7064 * without having to declare each individual cgroup's fixed share 7065 * of the ancestor's claim to protection, any unutilized - 7066 * "floating" - protection from up the tree is distributed in 7067 * proportion to each cgroup's *usage*. This makes the protection 7068 * neutral wrt sibling cgroups and lets them compete freely over 7069 * the shared parental protection budget, but it protects the 7070 * subtree as a whole from neighboring subtrees. 7071 * 7072 * Note that 4. and 5. are not in conflict: 4. is about protecting 7073 * against immediate siblings whereas 5. is about protecting against 7074 * neighboring subtrees. 7075 */ 7076 static unsigned long effective_protection(unsigned long usage, 7077 unsigned long parent_usage, 7078 unsigned long setting, 7079 unsigned long parent_effective, 7080 unsigned long siblings_protected) 7081 { 7082 unsigned long protected; 7083 unsigned long ep; 7084 7085 protected = min(usage, setting); 7086 /* 7087 * If all cgroups at this level combined claim and use more 7088 * protection than what the parent affords them, distribute 7089 * shares in proportion to utilization. 7090 * 7091 * We are using actual utilization rather than the statically 7092 * claimed protection in order to be work-conserving: claimed 7093 * but unused protection is available to siblings that would 7094 * otherwise get a smaller chunk than what they claimed. 7095 */ 7096 if (siblings_protected > parent_effective) 7097 return protected * parent_effective / siblings_protected; 7098 7099 /* 7100 * Ok, utilized protection of all children is within what the 7101 * parent affords them, so we know whatever this child claims 7102 * and utilizes is effectively protected. 7103 * 7104 * If there is unprotected usage beyond this value, reclaim 7105 * will apply pressure in proportion to that amount. 7106 * 7107 * If there is unutilized protection, the cgroup will be fully 7108 * shielded from reclaim, but we do return a smaller value for 7109 * protection than what the group could enjoy in theory. This 7110 * is okay. With the overcommit distribution above, effective 7111 * protection is always dependent on how memory is actually 7112 * consumed among the siblings anyway. 7113 */ 7114 ep = protected; 7115 7116 /* 7117 * If the children aren't claiming (all of) the protection 7118 * afforded to them by the parent, distribute the remainder in 7119 * proportion to the (unprotected) memory of each cgroup. That 7120 * way, cgroups that aren't explicitly prioritized wrt each 7121 * other compete freely over the allowance, but they are 7122 * collectively protected from neighboring trees. 
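 *
 * Worked example (numbers purely for illustration): with
 * parent_effective = 100M, siblings_protected = 60M and parent_usage =
 * 200M, a child using 50M of which 10M is declared protection receives
 * an extra 40M * (50M - 10M) / (200M - 60M), i.e. roughly 11M on top of
 * the 10M already computed above.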
7123 * 7124 * We're using unprotected memory for the weight so that if 7125 * some cgroups DO claim explicit protection, we don't protect 7126 * the same bytes twice. 7127 * 7128 * Check both usage and parent_usage against the respective 7129 * protected values. One should imply the other, but they 7130 * aren't read atomically - make sure the division is sane. 7131 */ 7132 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 7133 return ep; 7134 if (parent_effective > siblings_protected && 7135 parent_usage > siblings_protected && 7136 usage > protected) { 7137 unsigned long unclaimed; 7138 7139 unclaimed = parent_effective - siblings_protected; 7140 unclaimed *= usage - protected; 7141 unclaimed /= parent_usage - siblings_protected; 7142 7143 ep += unclaimed; 7144 } 7145 7146 return ep; 7147 } 7148 7149 /** 7150 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 7151 * @root: the top ancestor of the sub-tree being checked 7152 * @memcg: the memory cgroup to check 7153 * 7154 * WARNING: This function is not stateless! It can only be used as part 7155 * of a top-down tree iteration, not for isolated queries. 7156 */ 7157 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 7158 struct mem_cgroup *memcg) 7159 { 7160 unsigned long usage, parent_usage; 7161 struct mem_cgroup *parent; 7162 7163 if (mem_cgroup_disabled()) 7164 return; 7165 7166 if (!root) 7167 root = root_mem_cgroup; 7168 7169 /* 7170 * Effective values of the reclaim targets are ignored so they 7171 * can be stale. Have a look at mem_cgroup_protection for more 7172 * details. 7173 * TODO: calculation should be more robust so that we do not need 7174 * that special casing. 7175 */ 7176 if (memcg == root) 7177 return; 7178 7179 usage = page_counter_read(&memcg->memory); 7180 if (!usage) 7181 return; 7182 7183 parent = parent_mem_cgroup(memcg); 7184 7185 if (parent == root) { 7186 memcg->memory.emin = READ_ONCE(memcg->memory.min); 7187 memcg->memory.elow = READ_ONCE(memcg->memory.low); 7188 return; 7189 } 7190 7191 parent_usage = page_counter_read(&parent->memory); 7192 7193 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 7194 READ_ONCE(memcg->memory.min), 7195 READ_ONCE(parent->memory.emin), 7196 atomic_long_read(&parent->memory.children_min_usage))); 7197 7198 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 7199 READ_ONCE(memcg->memory.low), 7200 READ_ONCE(parent->memory.elow), 7201 atomic_long_read(&parent->memory.children_low_usage))); 7202 } 7203 7204 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 7205 gfp_t gfp) 7206 { 7207 int ret; 7208 7209 ret = try_charge(memcg, gfp, folio_nr_pages(folio)); 7210 if (ret) 7211 goto out; 7212 7213 mem_cgroup_commit_charge(folio, memcg); 7214 out: 7215 return ret; 7216 } 7217 7218 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 7219 { 7220 struct mem_cgroup *memcg; 7221 int ret; 7222 7223 memcg = get_mem_cgroup_from_mm(mm); 7224 ret = charge_memcg(folio, memcg, gfp); 7225 css_put(&memcg->css); 7226 7227 return ret; 7228 } 7229 7230 /** 7231 * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio 7232 * @memcg: memcg to charge. 7233 * @gfp: reclaim mode. 7234 * @nr_pages: number of pages to charge. 7235 * 7236 * This function is called when allocating a huge page folio to determine if 7237 * the memcg has the capacity for it. 
It does not commit the charge yet, 7238 * as the hugetlb folio itself has not been obtained from the hugetlb pool. 7239 * 7240 * Once we have obtained the hugetlb folio, we can call 7241 * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the 7242 * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect 7243 * of try_charge(). 7244 * 7245 * Returns 0 on success. Otherwise, an error code is returned. 7246 */ 7247 int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, 7248 long nr_pages) 7249 { 7250 /* 7251 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, 7252 * but do not attempt to commit charge later (or cancel on error) either. 7253 */ 7254 if (mem_cgroup_disabled() || !memcg || 7255 !cgroup_subsys_on_dfl(memory_cgrp_subsys) || 7256 !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) 7257 return -EOPNOTSUPP; 7258 7259 if (try_charge(memcg, gfp, nr_pages)) 7260 return -ENOMEM; 7261 7262 return 0; 7263 } 7264 7265 /** 7266 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 7267 * @folio: folio to charge. 7268 * @mm: mm context of the victim 7269 * @gfp: reclaim mode 7270 * @entry: swap entry for which the folio is allocated 7271 * 7272 * This function charges a folio allocated for swapin. Please call this before 7273 * adding the folio to the swapcache. 7274 * 7275 * Returns 0 on success. Otherwise, an error code is returned. 7276 */ 7277 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 7278 gfp_t gfp, swp_entry_t entry) 7279 { 7280 struct mem_cgroup *memcg; 7281 unsigned short id; 7282 int ret; 7283 7284 if (mem_cgroup_disabled()) 7285 return 0; 7286 7287 id = lookup_swap_cgroup_id(entry); 7288 rcu_read_lock(); 7289 memcg = mem_cgroup_from_id(id); 7290 if (!memcg || !css_tryget_online(&memcg->css)) 7291 memcg = get_mem_cgroup_from_mm(mm); 7292 rcu_read_unlock(); 7293 7294 ret = charge_memcg(folio, memcg, gfp); 7295 7296 css_put(&memcg->css); 7297 return ret; 7298 } 7299 7300 /* 7301 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 7302 * @entry: swap entry for which the page is charged 7303 * 7304 * Call this function after successfully adding the charged page to swapcache. 7305 * 7306 * Note: This function assumes the page for which swap slot is being uncharged 7307 * is order 0 page. 7308 */ 7309 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 7310 { 7311 /* 7312 * Cgroup1's unified memory+swap counter has been charged with the 7313 * new swapcache page, finish the transfer by uncharging the swap 7314 * slot. The swap slot would also get uncharged when it dies, but 7315 * it can stick around indefinitely and we'd count the page twice 7316 * the entire time. 7317 * 7318 * Cgroup2 has separate resource counters for memory and swap, 7319 * so this is a non-issue here. Memory and swap charge lifetimes 7320 * correspond 1:1 to page and swap slot lifetimes: we charge the 7321 * page to memory here, and uncharge swap when the slot is freed. 7322 */ 7323 if (!mem_cgroup_disabled() && do_memsw_account()) { 7324 /* 7325 * The swap entry might not get freed for a long time, 7326 * let's not wait for it. The page already received a 7327 * memory+swap charge, drop the swap entry duplicate. 
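 * The caller guarantees an order-0 page (see the note above), hence the
 * single-entry uncharge below.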
7328 */ 7329 mem_cgroup_uncharge_swap(entry, 1); 7330 } 7331 } 7332 7333 struct uncharge_gather { 7334 struct mem_cgroup *memcg; 7335 unsigned long nr_memory; 7336 unsigned long pgpgout; 7337 unsigned long nr_kmem; 7338 int nid; 7339 }; 7340 7341 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 7342 { 7343 memset(ug, 0, sizeof(*ug)); 7344 } 7345 7346 static void uncharge_batch(const struct uncharge_gather *ug) 7347 { 7348 unsigned long flags; 7349 7350 if (ug->nr_memory) { 7351 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 7352 if (do_memsw_account()) 7353 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 7354 if (ug->nr_kmem) 7355 memcg_account_kmem(ug->memcg, -ug->nr_kmem); 7356 memcg_oom_recover(ug->memcg); 7357 } 7358 7359 local_irq_save(flags); 7360 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 7361 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 7362 memcg_check_events(ug->memcg, ug->nid); 7363 local_irq_restore(flags); 7364 7365 /* drop reference from uncharge_folio */ 7366 css_put(&ug->memcg->css); 7367 } 7368 7369 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 7370 { 7371 long nr_pages; 7372 struct mem_cgroup *memcg; 7373 struct obj_cgroup *objcg; 7374 7375 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7376 7377 /* 7378 * Nobody should be changing or seriously looking at 7379 * folio memcg or objcg at this point, we have fully 7380 * exclusive access to the folio. 7381 */ 7382 if (folio_memcg_kmem(folio)) { 7383 objcg = __folio_objcg(folio); 7384 /* 7385 * This get matches the put at the end of the function and 7386 * kmem pages do not hold memcg references anymore. 7387 */ 7388 memcg = get_mem_cgroup_from_objcg(objcg); 7389 } else { 7390 memcg = __folio_memcg(folio); 7391 } 7392 7393 if (!memcg) 7394 return; 7395 7396 if (ug->memcg != memcg) { 7397 if (ug->memcg) { 7398 uncharge_batch(ug); 7399 uncharge_gather_clear(ug); 7400 } 7401 ug->memcg = memcg; 7402 ug->nid = folio_nid(folio); 7403 7404 /* pairs with css_put in uncharge_batch */ 7405 css_get(&memcg->css); 7406 } 7407 7408 nr_pages = folio_nr_pages(folio); 7409 7410 if (folio_memcg_kmem(folio)) { 7411 ug->nr_memory += nr_pages; 7412 ug->nr_kmem += nr_pages; 7413 7414 folio->memcg_data = 0; 7415 obj_cgroup_put(objcg); 7416 } else { 7417 /* LRU pages aren't accounted at the root level */ 7418 if (!mem_cgroup_is_root(memcg)) 7419 ug->nr_memory += nr_pages; 7420 ug->pgpgout++; 7421 7422 folio->memcg_data = 0; 7423 } 7424 7425 css_put(&memcg->css); 7426 } 7427 7428 void __mem_cgroup_uncharge(struct folio *folio) 7429 { 7430 struct uncharge_gather ug; 7431 7432 /* Don't touch folio->lru of any random page, pre-check: */ 7433 if (!folio_memcg(folio)) 7434 return; 7435 7436 uncharge_gather_clear(&ug); 7437 uncharge_folio(folio, &ug); 7438 uncharge_batch(&ug); 7439 } 7440 7441 /** 7442 * __mem_cgroup_uncharge_list - uncharge a list of page 7443 * @page_list: list of pages to uncharge 7444 * 7445 * Uncharge a list of pages previously charged with 7446 * __mem_cgroup_charge(). 7447 */ 7448 void __mem_cgroup_uncharge_list(struct list_head *page_list) 7449 { 7450 struct uncharge_gather ug; 7451 struct folio *folio; 7452 7453 uncharge_gather_clear(&ug); 7454 list_for_each_entry(folio, page_list, lru) 7455 uncharge_folio(folio, &ug); 7456 if (ug.memcg) 7457 uncharge_batch(&ug); 7458 } 7459 7460 /** 7461 * mem_cgroup_replace_folio - Charge a folio's replacement. 7462 * @old: Currently circulating folio. 7463 * @new: Replacement folio. 
7464 * 7465 * Charge @new as a replacement folio for @old. @old will 7466 * be uncharged upon free. This is only used by the page cache 7467 * (in replace_page_cache_folio()). 7468 * 7469 * Both folios must be locked, @new->mapping must be set up. 7470 */ 7471 void mem_cgroup_replace_folio(struct folio *old, struct folio *new) 7472 { 7473 struct mem_cgroup *memcg; 7474 long nr_pages = folio_nr_pages(new); 7475 unsigned long flags; 7476 7477 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 7478 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 7479 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 7480 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 7481 7482 if (mem_cgroup_disabled()) 7483 return; 7484 7485 /* Page cache replacement: new folio already charged? */ 7486 if (folio_memcg(new)) 7487 return; 7488 7489 memcg = folio_memcg(old); 7490 VM_WARN_ON_ONCE_FOLIO(!memcg, old); 7491 if (!memcg) 7492 return; 7493 7494 /* Force-charge the new page. The old one will be freed soon */ 7495 if (!mem_cgroup_is_root(memcg)) { 7496 page_counter_charge(&memcg->memory, nr_pages); 7497 if (do_memsw_account()) 7498 page_counter_charge(&memcg->memsw, nr_pages); 7499 } 7500 7501 css_get(&memcg->css); 7502 commit_charge(new, memcg); 7503 7504 local_irq_save(flags); 7505 mem_cgroup_charge_statistics(memcg, nr_pages); 7506 memcg_check_events(memcg, folio_nid(new)); 7507 local_irq_restore(flags); 7508 } 7509 7510 /** 7511 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. 7512 * @old: Currently circulating folio. 7513 * @new: Replacement folio. 7514 * 7515 * Transfer the memcg data from the old folio to the new folio for migration. 7516 * The old folio's data info will be cleared. Note that the memory counters 7517 * will remain unchanged throughout the process. 7518 * 7519 * Both folios must be locked, @new->mapping must be set up. 7520 */ 7521 void mem_cgroup_migrate(struct folio *old, struct folio *new) 7522 { 7523 struct mem_cgroup *memcg; 7524 7525 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 7526 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 7527 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 7528 VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new); 7529 7530 if (mem_cgroup_disabled()) 7531 return; 7532 7533 memcg = folio_memcg(old); 7534 /* 7535 * Note that it is normal to see !memcg for a hugetlb folio. 7536 * For e.g, itt could have been allocated when memory_hugetlb_accounting 7537 * was not selected. 7538 */ 7539 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); 7540 if (!memcg) 7541 return; 7542 7543 /* Transfer the charge and the css ref */ 7544 commit_charge(new, memcg); 7545 old->memcg_data = 0; 7546 } 7547 7548 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 7549 EXPORT_SYMBOL(memcg_sockets_enabled_key); 7550 7551 void mem_cgroup_sk_alloc(struct sock *sk) 7552 { 7553 struct mem_cgroup *memcg; 7554 7555 if (!mem_cgroup_sockets_enabled) 7556 return; 7557 7558 /* Do not associate the sock with unrelated interrupted task's memcg. 
*/ 7559 if (!in_task()) 7560 return; 7561 7562 rcu_read_lock(); 7563 memcg = mem_cgroup_from_task(current); 7564 if (mem_cgroup_is_root(memcg)) 7565 goto out; 7566 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 7567 goto out; 7568 if (css_tryget(&memcg->css)) 7569 sk->sk_memcg = memcg; 7570 out: 7571 rcu_read_unlock(); 7572 } 7573 7574 void mem_cgroup_sk_free(struct sock *sk) 7575 { 7576 if (sk->sk_memcg) 7577 css_put(&sk->sk_memcg->css); 7578 } 7579 7580 /** 7581 * mem_cgroup_charge_skmem - charge socket memory 7582 * @memcg: memcg to charge 7583 * @nr_pages: number of pages to charge 7584 * @gfp_mask: reclaim mode 7585 * 7586 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 7587 * @memcg's configured limit, %false if it doesn't. 7588 */ 7589 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 7590 gfp_t gfp_mask) 7591 { 7592 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7593 struct page_counter *fail; 7594 7595 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 7596 memcg->tcpmem_pressure = 0; 7597 return true; 7598 } 7599 memcg->tcpmem_pressure = 1; 7600 if (gfp_mask & __GFP_NOFAIL) { 7601 page_counter_charge(&memcg->tcpmem, nr_pages); 7602 return true; 7603 } 7604 return false; 7605 } 7606 7607 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 7608 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7609 return true; 7610 } 7611 7612 return false; 7613 } 7614 7615 /** 7616 * mem_cgroup_uncharge_skmem - uncharge socket memory 7617 * @memcg: memcg to uncharge 7618 * @nr_pages: number of pages to uncharge 7619 */ 7620 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7621 { 7622 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7623 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7624 return; 7625 } 7626 7627 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7628 7629 refill_stock(memcg, nr_pages); 7630 } 7631 7632 static int __init cgroup_memory(char *s) 7633 { 7634 char *token; 7635 7636 while ((token = strsep(&s, ",")) != NULL) { 7637 if (!*token) 7638 continue; 7639 if (!strcmp(token, "nosocket")) 7640 cgroup_memory_nosocket = true; 7641 if (!strcmp(token, "nokmem")) 7642 cgroup_memory_nokmem = true; 7643 if (!strcmp(token, "nobpf")) 7644 cgroup_memory_nobpf = true; 7645 } 7646 return 1; 7647 } 7648 __setup("cgroup.memory=", cgroup_memory); 7649 7650 /* 7651 * subsys_initcall() for memory controller. 7652 * 7653 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7654 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7655 * basically everything that doesn't depend on a specific mem_cgroup structure 7656 * should be initialized from here. 7657 */ 7658 static int __init mem_cgroup_init(void) 7659 { 7660 int cpu, node; 7661 7662 /* 7663 * Currently s32 type (can refer to struct batched_lruvec_stat) is 7664 * used for per-memcg-per-cpu caching of per-node statistics. In order 7665 * to work fine, we should make sure that the overfill threshold can't 7666 * exceed S32_MAX / PAGE_SIZE. 
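 * (With 4KiB pages that bound works out to roughly 524k pages, so the
 * compile-time check below has a comfortable margin.)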
7667 */ 7668 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); 7669 7670 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7671 memcg_hotplug_cpu_dead); 7672 7673 for_each_possible_cpu(cpu) 7674 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7675 drain_local_stock); 7676 7677 for_each_node(node) { 7678 struct mem_cgroup_tree_per_node *rtpn; 7679 7680 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 7681 7682 rtpn->rb_root = RB_ROOT; 7683 rtpn->rb_rightmost = NULL; 7684 spin_lock_init(&rtpn->lock); 7685 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7686 } 7687 7688 return 0; 7689 } 7690 subsys_initcall(mem_cgroup_init); 7691 7692 #ifdef CONFIG_SWAP 7693 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7694 { 7695 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7696 /* 7697 * The root cgroup cannot be destroyed, so it's refcount must 7698 * always be >= 1. 7699 */ 7700 if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { 7701 VM_BUG_ON(1); 7702 break; 7703 } 7704 memcg = parent_mem_cgroup(memcg); 7705 if (!memcg) 7706 memcg = root_mem_cgroup; 7707 } 7708 return memcg; 7709 } 7710 7711 /** 7712 * mem_cgroup_swapout - transfer a memsw charge to swap 7713 * @folio: folio whose memsw charge to transfer 7714 * @entry: swap entry to move the charge to 7715 * 7716 * Transfer the memsw charge of @folio to @entry. 7717 */ 7718 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 7719 { 7720 struct mem_cgroup *memcg, *swap_memcg; 7721 unsigned int nr_entries; 7722 unsigned short oldid; 7723 7724 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7725 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 7726 7727 if (mem_cgroup_disabled()) 7728 return; 7729 7730 if (!do_memsw_account()) 7731 return; 7732 7733 memcg = folio_memcg(folio); 7734 7735 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7736 if (!memcg) 7737 return; 7738 7739 /* 7740 * In case the memcg owning these pages has been offlined and doesn't 7741 * have an ID allocated to it anymore, charge the closest online 7742 * ancestor for the swap instead and transfer the memory+swap charge. 7743 */ 7744 swap_memcg = mem_cgroup_id_get_online(memcg); 7745 nr_entries = folio_nr_pages(folio); 7746 /* Get references for the tail pages, too */ 7747 if (nr_entries > 1) 7748 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7749 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7750 nr_entries); 7751 VM_BUG_ON_FOLIO(oldid, folio); 7752 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7753 7754 folio->memcg_data = 0; 7755 7756 if (!mem_cgroup_is_root(memcg)) 7757 page_counter_uncharge(&memcg->memory, nr_entries); 7758 7759 if (memcg != swap_memcg) { 7760 if (!mem_cgroup_is_root(swap_memcg)) 7761 page_counter_charge(&swap_memcg->memsw, nr_entries); 7762 page_counter_uncharge(&memcg->memsw, nr_entries); 7763 } 7764 7765 /* 7766 * Interrupts should be disabled here because the caller holds the 7767 * i_pages lock which is taken with interrupts-off. It is 7768 * important here to have the interrupts disabled because it is the 7769 * only synchronisation we have for updating the per-CPU variables. 
7770 */ 7771 memcg_stats_lock(); 7772 mem_cgroup_charge_statistics(memcg, -nr_entries); 7773 memcg_stats_unlock(); 7774 memcg_check_events(memcg, folio_nid(folio)); 7775 7776 css_put(&memcg->css); 7777 } 7778 7779 /** 7780 * __mem_cgroup_try_charge_swap - try charging swap space for a folio 7781 * @folio: folio being added to swap 7782 * @entry: swap entry to charge 7783 * 7784 * Try to charge @folio's memcg for the swap space at @entry. 7785 * 7786 * Returns 0 on success, -ENOMEM on failure. 7787 */ 7788 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) 7789 { 7790 unsigned int nr_pages = folio_nr_pages(folio); 7791 struct page_counter *counter; 7792 struct mem_cgroup *memcg; 7793 unsigned short oldid; 7794 7795 if (do_memsw_account()) 7796 return 0; 7797 7798 memcg = folio_memcg(folio); 7799 7800 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7801 if (!memcg) 7802 return 0; 7803 7804 if (!entry.val) { 7805 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7806 return 0; 7807 } 7808 7809 memcg = mem_cgroup_id_get_online(memcg); 7810 7811 if (!mem_cgroup_is_root(memcg) && 7812 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7813 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7814 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7815 mem_cgroup_id_put(memcg); 7816 return -ENOMEM; 7817 } 7818 7819 /* Get references for the tail pages, too */ 7820 if (nr_pages > 1) 7821 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7822 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7823 VM_BUG_ON_FOLIO(oldid, folio); 7824 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7825 7826 return 0; 7827 } 7828 7829 /** 7830 * __mem_cgroup_uncharge_swap - uncharge swap space 7831 * @entry: swap entry to uncharge 7832 * @nr_pages: the amount of swap space to uncharge 7833 */ 7834 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7835 { 7836 struct mem_cgroup *memcg; 7837 unsigned short id; 7838 7839 id = swap_cgroup_record(entry, 0, nr_pages); 7840 rcu_read_lock(); 7841 memcg = mem_cgroup_from_id(id); 7842 if (memcg) { 7843 if (!mem_cgroup_is_root(memcg)) { 7844 if (do_memsw_account()) 7845 page_counter_uncharge(&memcg->memsw, nr_pages); 7846 else 7847 page_counter_uncharge(&memcg->swap, nr_pages); 7848 } 7849 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7850 mem_cgroup_id_put_many(memcg, nr_pages); 7851 } 7852 rcu_read_unlock(); 7853 } 7854 7855 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7856 { 7857 long nr_swap_pages = get_nr_swap_pages(); 7858 7859 if (mem_cgroup_disabled() || do_memsw_account()) 7860 return nr_swap_pages; 7861 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) 7862 nr_swap_pages = min_t(long, nr_swap_pages, 7863 READ_ONCE(memcg->swap.max) - 7864 page_counter_read(&memcg->swap)); 7865 return nr_swap_pages; 7866 } 7867 7868 bool mem_cgroup_swap_full(struct folio *folio) 7869 { 7870 struct mem_cgroup *memcg; 7871 7872 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 7873 7874 if (vm_swap_full()) 7875 return true; 7876 if (do_memsw_account()) 7877 return false; 7878 7879 memcg = folio_memcg(folio); 7880 if (!memcg) 7881 return false; 7882 7883 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 7884 unsigned long usage = page_counter_read(&memcg->swap); 7885 7886 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7887 usage * 2 >= READ_ONCE(memcg->swap.max)) 7888 return true; 7889 } 7890 7891 return false; 7892 } 7893 7894 static int __init setup_swap_account(char *s) 7895 { 7896 
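/* The option is still accepted but no longer has any effect. */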
	pr_warn_once("The swapaccount= commandline option is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");
	return 1;
}
__setup("swapaccount=", setup_swap_account);

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}

static u64 swap_peak_read(struct cgroup_subsys_state *css,
			  struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)memcg->swap.watermark * PAGE_SIZE;
}

static int swap_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
}

static ssize_t swap_high_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	page_counter_set_high(&memcg->swap, high);

	return nbytes;
}

static int swap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}

static ssize_t swap_max_write(struct kernfs_open_file *of,
			      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->swap.max, max);

	return nbytes;
}

static int swap_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "high %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
	seq_printf(m, "max %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
	seq_printf(m, "fail %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));

	return 0;
}

static struct cftype swap_files[] = {
	{
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		.name = "swap.high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_high_show,
		.write = swap_high_write,
	},
	{
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{
		.name = "swap.peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_peak_read,
	},
	{
		.name = "swap.events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
		.seq_show = swap_events_show,
	},
	{ }	/* terminate */
};

static struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
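		/* writes set the cgroup v1 hard limit on memory+swap usage */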
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/**
 * obj_cgroup_may_zswap - check if this cgroup can zswap
 * @objcg: the object cgroup
 *
 * Check if the hierarchical zswap limit has been reached.
 *
 * This doesn't check for specific headroom, and it is not atomic
 * either. But with zswap, the size of the allocation is only known
 * once compression has occurred, and this optimistic pre-check avoids
 * spending cycles on compression when there is already no room left
 * or zswap is disabled altogether somewhere in the hierarchy.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		cgroup_rstat_flush(memcg->css.cgroup);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
 */
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));

	/* PF_MEMALLOC context, charging must succeed */
	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
		VM_WARN_ON_ONCE(1);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
	rcu_read_unlock();
}

/**
 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * Uncharges zswap memory on page in.
 */
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
{
	struct mem_cgroup *memcg;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return;

	obj_cgroup_uncharge(objcg, size);

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
	rcu_read_unlock();
}

static u64 zswap_current_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	cgroup_rstat_flush(css->cgroup);
	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
}

static int zswap_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
}

static ssize_t zswap_max_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->zswap_max, max);

	return nbytes;
}

static struct cftype zswap_files[] = {
	{
		.name = "zswap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = zswap_current_read,
	},
	{
		.name = "zswap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = zswap_max_show,
		.write = zswap_max_write,
	},
	{ }	/* terminate */
};
#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */

static int __init mem_cgroup_swap_init(void)
{
	if (mem_cgroup_disabled())
		return 0;

	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
#endif
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_SWAP */
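
/*
 * Illustrative example only (not part of the kernel sources): with cgroup v2
 * mounted at /sys/fs/cgroup, the cftypes registered above appear per cgroup
 * with the "memory." prefix. For a hypothetical cgroup "foo":
 *
 *	cat /sys/fs/cgroup/foo/memory.swap.current
 *	echo 512M > /sys/fs/cgroup/foo/memory.swap.max
 *	echo 256M > /sys/fs/cgroup/foo/memory.zswap.max
 *
 * "foo" and the sizes are made-up values; the memory.zswap.* files are only
 * present when CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are enabled.
 */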