// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * Per memcg lru locking
 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
 */

#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/*
 * Cgroup1: periods, in units of charged page events, between re-checks of
 * the event thresholds and the soft limit; see mem_cgroup_event_ratelimit().
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Is the current task on its way out? True for an OOM victim, a task with
 * a fatal signal pending, or a task that is already exiting.
 */
static inline bool task_is_dying(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure.
 */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	/* The root memcg is the fallback vmpressure domain. */
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}

/* NOTE(review): flag bit for pending current->objcg updates; users of these
 * macros are not visible in this chunk. */
#define CURRENT_OBJCG_UPDATE_BIT 0
#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)

/* Protects the objcg reparenting lists (obj_cgroup::list). */
static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
	return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages);

/*
 * percpu_ref release callback: uncharge any leftover whole pages, unlink the
 * objcg from the reparenting list and free it (RCU-delayed).
 */
static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *    PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *    the stock is flushed,
	 *    objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *    92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *    92 bytes are added to objcg->nr_charged_bytes
	 *
	 * In the result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	/* Per the scenario above, leftovers must be whole pages. */
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&objcg_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&objcg_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

/*
 * Re-point every objcg of @memcg (the active one and any previously
 * reparented ones) at @parent and move them onto @parent's list, then drop
 * the active objcg's base reference.
 */
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&objcg_lock);

	/* 1) Ready to reparent active objcg. */
	list_add(&objcg->list, &memcg->objcg_list);
	/* 2) Reparent active objcg and already reparented objcgs to parent. */
	list_for_each_entry(iter, &memcg->objcg_list, list)
		WRITE_ONCE(iter->memcg, parent);
	/* 3) Move already reparented objcgs to the parent's list */
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&objcg_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.
Since the calls to memcg_slab_post_alloc_hook() are 219 * conditional to this static branch, we'll have to allow modules that does 220 * kmem_cache_alloc and the such to see this symbol as well 221 */ 222 DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); 223 EXPORT_SYMBOL(memcg_kmem_online_key); 224 225 DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key); 226 EXPORT_SYMBOL(memcg_bpf_enabled_key); 227 228 /** 229 * mem_cgroup_css_from_folio - css of the memcg associated with a folio 230 * @folio: folio of interest 231 * 232 * If memcg is bound to the default hierarchy, css of the memcg associated 233 * with @folio is returned. The returned css remains associated with @folio 234 * until it is released. 235 * 236 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 237 * is returned. 238 */ 239 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) 240 { 241 struct mem_cgroup *memcg = folio_memcg(folio); 242 243 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 244 memcg = root_mem_cgroup; 245 246 return &memcg->css; 247 } 248 249 /** 250 * page_cgroup_ino - return inode number of the memcg a page is charged to 251 * @page: the page 252 * 253 * Look up the closest online ancestor of the memory cgroup @page is charged to 254 * and return its inode number or 0 if @page is not charged to any cgroup. It 255 * is safe to call this function without holding a reference to @page. 256 * 257 * Note, this function is inherently racy, because there is nothing to prevent 258 * the cgroup inode from getting torn down and potentially reallocated a moment 259 * after page_cgroup_ino() returns, so it only should be used by callers that 260 * do not care (such as procfs interfaces). 
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	/* page_folio() is racy here, but the entire function is racy anyway */
	memcg = folio_memcg_check(page_folio(page));

	/* Walk up until we find an online ancestor (or run out). */
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

/* Subset of node_stat_item for memcg stats */
static const unsigned int memcg_node_stat_items[] = {
	NR_INACTIVE_ANON,
	NR_ACTIVE_ANON,
	NR_INACTIVE_FILE,
	NR_ACTIVE_FILE,
	NR_UNEVICTABLE,
	NR_SLAB_RECLAIMABLE_B,
	NR_SLAB_UNRECLAIMABLE_B,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
	WORKINGSET_ACTIVATE_ANON,
	WORKINGSET_ACTIVATE_FILE,
	WORKINGSET_RESTORE_ANON,
	WORKINGSET_RESTORE_FILE,
	WORKINGSET_NODERECLAIM,
	NR_ANON_MAPPED,
	NR_FILE_MAPPED,
	NR_FILE_PAGES,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	NR_SHMEM,
	NR_SHMEM_THPS,
	NR_FILE_THPS,
	NR_ANON_THPS,
	NR_KERNEL_STACK_KB,
	NR_PAGETABLE,
	NR_SECONDARY_PAGETABLE,
#ifdef CONFIG_SWAP
	NR_SWAPCACHE,
#endif
};

/* Memcg-only stat items (enum memcg_stat_item) tracked on top of the above */
static const unsigned int memcg_stat_items[] = {
	MEMCG_SWAP,
	MEMCG_SOCK,
	MEMCG_PERCPU_B,
	MEMCG_VMALLOC,
	MEMCG_KMEM,
	MEMCG_ZSWAP_B,
	MEMCG_ZSWAPPED,
};

#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
			   ARRAY_SIZE(memcg_stat_items))
/* Untracked items keep the U8_MAX sentinel set by init_memcg_stats(). */
#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;

/*
 * Build the item -> dense index translation table. Node stat items come
 * first, memcg-only items follow, so both share one compact index space.
 */
static void init_memcg_stats(void)
{
	u8 i, j = 0;

	/* The table stores u8 indices; U8_MAX is reserved as "untracked". */
	BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);

	memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));

	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
		mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;

	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
		mem_cgroup_stats_index[memcg_stat_items[i]] = j;
}

/* Dense per-memcg index for @idx, or U8_MAX (BAD_STAT_IDX) if untracked. */
static inline int memcg_stats_index(int idx)
{
	return mem_cgroup_stats_index[idx];
}

struct lruvec_stats_percpu {
	/* Local (CPU and cgroup) state */
	long state[NR_MEMCG_NODE_STAT_ITEMS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
};

struct lruvec_stats {
	/* Aggregated (CPU and subtree) state */
	long state[NR_MEMCG_NODE_STAT_ITEMS];

	/* Non-hierarchical (CPU aggregated) state */
	long state_local[NR_MEMCG_NODE_STAT_ITEMS];

	/* Pending child counts during tree propagation */
	long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
};

/* Hierarchical (subtree-aggregated) lruvec state for @idx. */
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i;

	if (mem_cgroup_disabled())
		return node_page_state(lruvec_pgdat(lruvec), idx);

	i = memcg_stats_index(idx);
	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state[i]);
#ifdef CONFIG_SMP
	/* Per-CPU drift can make the aggregate transiently negative. */
	if (x < 0)
		x = 0;
#endif
	return x;
}

/* As lruvec_page_state(), but the non-hierarchical (local) counter. */
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
				      enum node_stat_item idx)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i;

	if (mem_cgroup_disabled())
		return node_page_state(lruvec_pgdat(lruvec), idx);

	i = memcg_stats_index(idx);
	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state_local[i]);
#ifdef CONFIG_SMP
	/* Per-CPU drift can make the aggregate transiently negative. */
	if (x < 0)
		x = 0;
#endif
	return x;
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
	PGPGIN,
	PGPGOUT,
	PGSCAN_KSWAPD,
	PGSCAN_DIRECT,
	PGSCAN_KHUGEPAGED,
	PGSTEAL_KSWAPD,
	PGSTEAL_DIRECT,
	PGSTEAL_KHUGEPAGED,
	PGFAULT,
	PGMAJFAULT,
	PGREFILL,
	PGACTIVATE,
	PGDEACTIVATE,
	PGLAZYFREE,
	PGLAZYFREED,
#ifdef CONFIG_ZSWAP
	ZSWPIN,
	ZSWPOUT,
	ZSWPWB,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	THP_FAULT_ALLOC,
	THP_COLLAPSE_ALLOC,
	THP_SWPOUT,
	THP_SWPOUT_FALLBACK,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

/*
 * Build the vm_event_item -> dense index table; events not in
 * memcg_vm_event_stat[] keep the U8_MAX "untracked" sentinel.
 */
static void init_memcg_events(void)
{
	u8 i;

	BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);

	memset(mem_cgroup_events_index, U8_MAX,
	       sizeof(mem_cgroup_events_index));

	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
}

/* Dense per-memcg event index for @idx, or U8_MAX if untracked. */
static inline int memcg_events_index(enum vm_event_item idx)
{
	return mem_cgroup_events_index[idx];
}

struct memcg_vmstats_percpu {
	/* Stats updates since the last flush */
	unsigned int stats_updates;

	/* Cached pointers for fast iteration in memcg_rstat_updated() */
	struct memcg_vmstats_percpu *parent;
	struct memcg_vmstats *vmstats;

	/* The above should fit a single cacheline for memcg_rstat_updated() */

	/* Local (CPU and cgroup) page state & events */
	long state[MEMCG_VMSTAT_SIZE];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Delta calculation for lockless upward propagation */
	long state_prev[MEMCG_VMSTAT_SIZE];
	unsigned long events_prev[NR_MEMCG_EVENTS];

	/* Cgroup1: threshold notifications & softlimit tree updates */
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
} ____cacheline_aligned;
struct memcg_vmstats {
	/* Aggregated (CPU and subtree) page state & events */
	long state[MEMCG_VMSTAT_SIZE];
	unsigned long events[NR_MEMCG_EVENTS];

	/* Non-hierarchical (CPU aggregated) page state & events */
	long state_local[MEMCG_VMSTAT_SIZE];
	unsigned long events_local[NR_MEMCG_EVENTS];

	/* Pending child counts during tree propagation */
	long state_pending[MEMCG_VMSTAT_SIZE];
	unsigned long events_pending[NR_MEMCG_EVENTS];

	/* Stats updates since the last flush */
	atomic64_t stats_updates;
};

/*
 * memcg and lruvec stats flushing
 *
 * Many codepaths leading to stats update or read are performance sensitive and
 * adding stats flushing in such codepaths is not desirable. So, to optimize the
 * flushing the kernel does:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
 *    rstat update tree grow unbounded.
 *
 * 2) Flush the stats synchronously on reader side only when there are more than
 *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
 *    will let stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus)
 *    but only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
/* Timestamp (jiffies_64) of the last root-level flush */
static u64 flush_last_time;

#define FLUSH_TIME (2UL*HZ)

/*
 * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
 * not rely on this as part of an acquired spinlock_t lock. These functions are
 * never used in hardirq context on PREEMPT_RT and therefore disabling
 * preemption is sufficient.
 */
static void memcg_stats_lock(void)
{
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
}

/* As memcg_stats_lock(), minus the IRQs-disabled assertion. */
static void __memcg_stats_lock(void)
{
	preempt_disable_nested();
}

static void memcg_stats_unlock(void)
{
	preempt_enable_nested();
}


/* True once enough updates have accumulated to make a flush worthwhile. */
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
	return atomic64_read(&vmstats->stats_updates) >
		MEMCG_CHARGE_BATCH * num_online_cpus();
}

/*
 * Record that @memcg's stats changed by @val on this CPU: mark the rstat
 * tree dirty and batch the magnitude of the change up the parent chain,
 * folding each per-CPU batch into the shared counter once it reaches
 * MEMCG_CHARGE_BATCH.
 */
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
	struct memcg_vmstats_percpu *statc;
	int cpu = smp_processor_id();
	unsigned int stats_updates;

	if (!val)
		return;

	cgroup_rstat_updated(memcg->css.cgroup, cpu);
	statc = this_cpu_ptr(memcg->vmstats_percpu);
	for (; statc; statc = statc->parent) {
		stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
		WRITE_ONCE(statc->stats_updates, stats_updates);
		if (stats_updates < MEMCG_CHARGE_BATCH)
			continue;

		/*
		 * If @memcg is already flush-able, increasing stats_updates is
		 * redundant. Avoid the overhead of the atomic update.
		 */
		if (!memcg_vmstats_needs_flush(statc->vmstats))
			atomic64_add(stats_updates,
				     &statc->vmstats->stats_updates);
		WRITE_ONCE(statc->stats_updates, 0);
	}
}

/* Unconditionally flush @memcg's subtree via rstat. */
static void do_flush_stats(struct mem_cgroup *memcg)
{
	/* Only root flushes reset the periodic flusher's deadline. */
	if (mem_cgroup_is_root(memcg))
		WRITE_ONCE(flush_last_time, jiffies_64);

	cgroup_rstat_flush(memcg->css.cgroup);
}

/*
 * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
 * @memcg: root of the subtree to flush
 *
 * Flushing is serialized by the underlying global rstat lock. There is also a
 * minimum amount of work to be done even if there are no stat updates to flush.
 * Hence, we only flush the stats if the updates delta exceeds a threshold. This
 * avoids unnecessary work and contention on the underlying lock.
 */
void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return;

	if (!memcg)
		memcg = root_mem_cgroup;

	if (memcg_vmstats_needs_flush(memcg->vmstats))
		do_flush_stats(memcg);
}

void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
	/* Only flush if the periodic flusher is one full cycle late */
	if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
		mem_cgroup_flush_stats(memcg);
}

/* Periodic worker: flush the whole tree, then re-arm itself. */
static void flush_memcg_stats_dwork(struct work_struct *w)
{
	/*
	 * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
	 * in latency-sensitive paths is as cheap as possible.
	 */
	do_flush_stats(root_mem_cgroup);
	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

/* Hierarchical page state of @memcg for @idx, clamped at 0 on SMP. */
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
	long x;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	x = READ_ONCE(memcg->vmstats->state[i]);
#ifdef CONFIG_SMP
	/* Per-CPU drift can make the aggregate transiently negative. */
	if (x < 0)
		x = 0;
#endif
	return x;
}

static int memcg_page_state_unit(int item);

/*
 * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
 * up non-zero sub-page updates to 1 page as zero page updates are ignored.
 */
static int memcg_state_val_in_pages(int idx, int val)
{
	int unit = memcg_page_state_unit(idx);

	if (!val || unit == PAGE_SIZE)
		return val;
	else
		return max(val * unit / PAGE_SIZE, 1UL);
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
		       int val)
{
	int i = memcg_stats_index(idx);

	if (mem_cgroup_disabled())
		return;

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
}

/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
	long x;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return 0;

	x = READ_ONCE(memcg->vmstats->state_local[i]);
#ifdef CONFIG_SMP
	/* Per-CPU drift can make the aggregate transiently negative. */
	if (x < 0)
		x = 0;
#endif
	return x;
}

/* Update the memcg and lruvec copies of node stat @idx by @val. */
static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
				     enum node_stat_item idx,
				     int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	int i = memcg_stats_index(idx);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/*
	 * The caller from rmap relies on disabled preemption because they never
	 * update their counter from in-interrupt context. For these two
	 * counters we check that the update is never performed from an
	 * interrupt context while other caller need to have disabled interrupt.
	 */
	__memcg_stats_lock();
	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		switch (idx) {
		/* Items updated by rmap: task context only, IRQs may be on. */
		case NR_ANON_MAPPED:
		case NR_FILE_MAPPED:
		case NR_ANON_THPS:
			WARN_ON_ONCE(!in_task());
			break;
		default:
			VM_WARN_ON_IRQS_ENABLED();
		}
	}

	/* Update memcg */
	__this_cpu_add(memcg->vmstats_percpu->state[i], val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);

	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
	memcg_stats_unlock();
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates the all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

/* Update node stat @idx for @folio, plus memcg/lruvec copies if charged. */
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
			     int val)
{
	struct mem_cgroup *memcg;
	pg_data_t *pgdat = folio_pgdat(folio);
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = folio_memcg(folio);
	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		rcu_read_unlock();
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
	rcu_read_unlock();
}
EXPORT_SYMBOL(__lruvec_stat_mod_folio);

/* As __lruvec_stat_mod_folio(), but for a slab object / kernel address @p. */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_slab_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	int i = memcg_events_index(idx);

	if (mem_cgroup_disabled())
		return;

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
		return;

	memcg_stats_lock();
	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
	memcg_rstat_updated(memcg, count);
	memcg_stats_unlock();
}

/* Hierarchical (subtree-aggregated) event count of @memcg for @event. */
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	int i = memcg_events_index(event);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
		return 0;

	return READ_ONCE(memcg->vmstats->events[i]);
}

/* Non-hierarchical (local) event count of @memcg for @event. */
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	int i = memcg_events_index(event);

	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
		return 0;

	return READ_ONCE(memcg->vmstats->events_local[i]);
}

/* Account a charge (@nr_pages > 0) or uncharge (< 0) as PGPGIN/PGPGOUT. */
void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

/*
 * Cgroup1: returns true (and advances the per-CPU target) when enough page
 * events have accumulated since the last check of @target.
 */
bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
		return true;
	}
	return false;
}

/* Memcg that @p belongs to; NULL if @p is NULL. Does not take a reference. */
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/* The explicitly-set "remote" memcg for this context, if any. */
static __always_inline struct mem_cgroup *active_memcg(void)
{
	if (!in_task())
		return this_cpu_read(int_active_memcg);
	else
		return current->active_memcg;
}

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and returns it if successful. If mm
 * is NULL, then the memcg is chosen as follows:
 * 1) The active memcg, if set.
 * 2) current->mm->memcg, if available
 * 3) root memcg
 * If mem_cgroup is disabled, NULL is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	/*
	 * Page cache insertions can happen without an
	 * actual mm context, e.g. during disk probing
	 * on boot, loopback IO, acct() writes etc.
	 *
	 * No need to css_get on root memcg as the reference
	 * counting is disabled on the root level in the
	 * cgroup core. See CSS_NO_REF.
	 */
	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref */
			css_get(&memcg->css);
			return memcg;
		}
		mm = current->mm;
		if (unlikely(!mm))
			return root_mem_cgroup;
	}

	rcu_read_lock();
	do {
		/* Retry until a reference is successfully acquired. */
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			memcg = root_mem_cgroup;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
962 */ 963 struct mem_cgroup *get_mem_cgroup_from_current(void) 964 { 965 struct mem_cgroup *memcg; 966 967 if (mem_cgroup_disabled()) 968 return NULL; 969 970 again: 971 rcu_read_lock(); 972 memcg = mem_cgroup_from_task(current); 973 if (!css_tryget(&memcg->css)) { 974 rcu_read_unlock(); 975 goto again; 976 } 977 rcu_read_unlock(); 978 return memcg; 979 } 980 981 /** 982 * mem_cgroup_iter - iterate over memory cgroup hierarchy 983 * @root: hierarchy root 984 * @prev: previously returned memcg, NULL on first invocation 985 * @reclaim: cookie for shared reclaim walks, NULL for full walks 986 * 987 * Returns references to children of the hierarchy below @root, or 988 * @root itself, or %NULL after a full round-trip. 989 * 990 * Caller must pass the return value in @prev on subsequent 991 * invocations for reference counting, or use mem_cgroup_iter_break() 992 * to cancel a hierarchy walk before the round-trip is complete. 993 * 994 * Reclaimers can specify a node in @reclaim to divide up the memcgs 995 * in the hierarchy among all concurrent reclaimers operating on the 996 * same node. 997 */ 998 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 999 struct mem_cgroup *prev, 1000 struct mem_cgroup_reclaim_cookie *reclaim) 1001 { 1002 struct mem_cgroup_reclaim_iter *iter; 1003 struct cgroup_subsys_state *css = NULL; 1004 struct mem_cgroup *memcg = NULL; 1005 struct mem_cgroup *pos = NULL; 1006 1007 if (mem_cgroup_disabled()) 1008 return NULL; 1009 1010 if (!root) 1011 root = root_mem_cgroup; 1012 1013 rcu_read_lock(); 1014 1015 if (reclaim) { 1016 struct mem_cgroup_per_node *mz; 1017 1018 mz = root->nodeinfo[reclaim->pgdat->node_id]; 1019 iter = &mz->iter; 1020 1021 /* 1022 * On start, join the current reclaim iteration cycle. 1023 * Exit when a concurrent walker completes it. 
1024 */ 1025 if (!prev) 1026 reclaim->generation = iter->generation; 1027 else if (reclaim->generation != iter->generation) 1028 goto out_unlock; 1029 1030 while (1) { 1031 pos = READ_ONCE(iter->position); 1032 if (!pos || css_tryget(&pos->css)) 1033 break; 1034 /* 1035 * css reference reached zero, so iter->position will 1036 * be cleared by ->css_released. However, we should not 1037 * rely on this happening soon, because ->css_released 1038 * is called from a work queue, and by busy-waiting we 1039 * might block it. So we clear iter->position right 1040 * away. 1041 */ 1042 (void)cmpxchg(&iter->position, pos, NULL); 1043 } 1044 } else if (prev) { 1045 pos = prev; 1046 } 1047 1048 if (pos) 1049 css = &pos->css; 1050 1051 for (;;) { 1052 css = css_next_descendant_pre(css, &root->css); 1053 if (!css) { 1054 /* 1055 * Reclaimers share the hierarchy walk, and a 1056 * new one might jump in right at the end of 1057 * the hierarchy - make sure they see at least 1058 * one group and restart from the beginning. 1059 */ 1060 if (!prev) 1061 continue; 1062 break; 1063 } 1064 1065 /* 1066 * Verify the css and acquire a reference. The root 1067 * is provided by the caller, so we know it's alive 1068 * and kicking, and don't take an extra reference. 1069 */ 1070 if (css == &root->css || css_tryget(css)) { 1071 memcg = mem_cgroup_from_css(css); 1072 break; 1073 } 1074 } 1075 1076 if (reclaim) { 1077 /* 1078 * The position could have already been updated by a competing 1079 * thread, so check that the value hasn't changed since we read 1080 * it to avoid reclaiming from the same cgroup twice. 
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		/*
		 * A NULL memcg means the walk ran off the end of the
		 * hierarchy; start a new generation.
		 */
		if (!memcg)
			iter->generation++;
	}

out_unlock:
	rcu_read_unlock();
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 *
 * Drops the css reference held on @prev by the iterator (unless @prev
 * is the root, which the iterator does not pin).
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/* Clear @dead_memcg from every per-node reclaim iterator of @from. */
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = from->nodeinfo[nid];
		iter = &mz->iter;
		/* Only clear if the iterator still points at the dead memcg. */
		cmpxchg(&iter->position, dead_memcg, NULL);
	}
}

/*
 * Scrub @dead_memcg from the shared reclaim iterators of itself and all
 * of its ancestors, so no walker resumes from a cgroup that is going away.
 */
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (!mem_cgroup_is_root(last))
		__invalidate_reclaim_iterators(root_mem_cgroup,
						dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop. Otherwise, it will iterate
 * over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			   int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(mem_cgroup_is_root(memcg));

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			/* Early exit: drop the walk's reference on @iter. */
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
}

#ifdef CONFIG_DEBUG_VM
/* Sanity-check that @folio's memcg agrees with the lruvec it is put on. */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);

	if (!memcg)
		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
	else
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif

/**
 * folio_lruvec_lock - Lock the lruvec for a folio.
 * @folio: Pointer to the folio.
1204 * 1205 * These functions are safe to use under any of the following conditions: 1206 * - folio locked 1207 * - folio_test_lru false 1208 * - folio_memcg_lock() 1209 * - folio frozen (refcount of 0) 1210 * 1211 * Return: The lruvec this folio is on with its lock held. 1212 */ 1213 struct lruvec *folio_lruvec_lock(struct folio *folio) 1214 { 1215 struct lruvec *lruvec = folio_lruvec(folio); 1216 1217 spin_lock(&lruvec->lru_lock); 1218 lruvec_memcg_debug(lruvec, folio); 1219 1220 return lruvec; 1221 } 1222 1223 /** 1224 * folio_lruvec_lock_irq - Lock the lruvec for a folio. 1225 * @folio: Pointer to the folio. 1226 * 1227 * These functions are safe to use under any of the following conditions: 1228 * - folio locked 1229 * - folio_test_lru false 1230 * - folio_memcg_lock() 1231 * - folio frozen (refcount of 0) 1232 * 1233 * Return: The lruvec this folio is on with its lock held and interrupts 1234 * disabled. 1235 */ 1236 struct lruvec *folio_lruvec_lock_irq(struct folio *folio) 1237 { 1238 struct lruvec *lruvec = folio_lruvec(folio); 1239 1240 spin_lock_irq(&lruvec->lru_lock); 1241 lruvec_memcg_debug(lruvec, folio); 1242 1243 return lruvec; 1244 } 1245 1246 /** 1247 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio. 1248 * @folio: Pointer to the folio. 1249 * @flags: Pointer to irqsave flags. 1250 * 1251 * These functions are safe to use under any of the following conditions: 1252 * - folio locked 1253 * - folio_test_lru false 1254 * - folio_memcg_lock() 1255 * - folio frozen (refcount of 0) 1256 * 1257 * Return: The lruvec this folio is on with its lock held and interrupts 1258 * disabled. 
1259 */ 1260 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 1261 unsigned long *flags) 1262 { 1263 struct lruvec *lruvec = folio_lruvec(folio); 1264 1265 spin_lock_irqsave(&lruvec->lru_lock, *flags); 1266 lruvec_memcg_debug(lruvec, folio); 1267 1268 return lruvec; 1269 } 1270 1271 /** 1272 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1273 * @lruvec: mem_cgroup per zone lru vector 1274 * @lru: index of lru list the page is sitting on 1275 * @zid: zone id of the accounted pages 1276 * @nr_pages: positive when adding or negative when removing 1277 * 1278 * This function must be called under lru_lock, just before a page is added 1279 * to or just after a page is removed from an lru list. 1280 */ 1281 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1282 int zid, int nr_pages) 1283 { 1284 struct mem_cgroup_per_node *mz; 1285 unsigned long *lru_size; 1286 long size; 1287 1288 if (mem_cgroup_disabled()) 1289 return; 1290 1291 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1292 lru_size = &mz->lru_zone_size[zid][lru]; 1293 1294 if (nr_pages < 0) 1295 *lru_size += nr_pages; 1296 1297 size = *lru_size; 1298 if (WARN_ONCE(size < 0, 1299 "%s(%p, %d, %d): lru_size %ld\n", 1300 __func__, lruvec, lru, nr_pages, size)) { 1301 VM_BUG_ON(1); 1302 *lru_size = 0; 1303 } 1304 1305 if (nr_pages > 0) 1306 *lru_size += nr_pages; 1307 } 1308 1309 /** 1310 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1311 * @memcg: the memory cgroup 1312 * 1313 * Returns the maximum amount of memory @mem can be charged with, in 1314 * pages. 
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	/* With memsw accounting the margin is bounded by both counters. */
	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count < limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/* A name/stat-index pair, as exported through memory.stat. */
struct memory_stat {
	const char *name;
	unsigned int idx;
};

/* Stat items exported in memory.stat, in output order. */
static const struct memory_stat memory_stats[] = {
	{ "anon",			NR_ANON_MAPPED },
	{ "file",			NR_FILE_PAGES },
	{ "kernel",			MEMCG_KMEM },
	{ "kernel_stack",		NR_KERNEL_STACK_KB },
	{ "pagetables",			NR_PAGETABLE },
	{ "sec_pagetables",		NR_SECONDARY_PAGETABLE },
	{ "percpu",			MEMCG_PERCPU_B },
	{ "sock",			MEMCG_SOCK },
	{ "vmalloc",			MEMCG_VMALLOC },
	{ "shmem",			NR_SHMEM },
#ifdef CONFIG_ZSWAP
	{ "zswap",			MEMCG_ZSWAP_B },
	{ "zswapped",			MEMCG_ZSWAPPED },
#endif
	{ "file_mapped",		NR_FILE_MAPPED },
	{ "file_dirty",			NR_FILE_DIRTY },
	{ "file_writeback",		NR_WRITEBACK },
#ifdef CONFIG_SWAP
	{ "swapcached",			NR_SWAPCACHE },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp",			NR_ANON_THPS },
	{ "file_thp",			NR_FILE_THPS },
	{ "shmem_thp",			NR_SHMEM_THPS },
#endif
	{ "inactive_anon",		NR_INACTIVE_ANON },
	{ "active_anon",		NR_ACTIVE_ANON },
	{ "inactive_file",		NR_INACTIVE_FILE },
	{ "active_file",		NR_ACTIVE_FILE },
	{ "unevictable",		NR_UNEVICTABLE },
	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B },
	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B },

	/* The memory events */
	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON },
	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE },
	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON },
	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE },
	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON },
	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE },
	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM },
};

/* The actual unit of the state item, not the same as the output unit */
static int memcg_page_state_unit(int item)
{
	switch (item) {
	case MEMCG_PERCPU_B:
	case MEMCG_ZSWAP_B:
	case NR_SLAB_RECLAIMABLE_B:
	case NR_SLAB_UNRECLAIMABLE_B:
		/* Tracked in bytes. */
		return 1;
	case NR_KERNEL_STACK_KB:
		/* Tracked in kilobytes. */
		return SZ_1K;
	default:
		/* Everything else is tracked in pages. */
		return PAGE_SIZE;
	}
}

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_output_unit(int item)
{
	/*
	 * Workingset state is actually in pages, but we export it to userspace
	 * as a scalar count of events, so special case it here.
	 */
	switch (item) {
	case WORKINGSET_REFAULT_ANON:
	case WORKINGSET_REFAULT_FILE:
	case WORKINGSET_ACTIVATE_ANON:
	case WORKINGSET_ACTIVATE_FILE:
	case WORKINGSET_RESTORE_ANON:
	case WORKINGSET_RESTORE_FILE:
	case WORKINGSET_NODERECLAIM:
		return 1;
	default:
		return memcg_page_state_unit(item);
	}
}

/* Read a hierarchical stat item, scaled to its memory.stat output unit. */
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
{
	return memcg_page_state(memcg, item) *
		memcg_page_state_output_unit(item);
}

/* Like memcg_page_state_output(), but for the local (non-hierarchical) value. */
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
	return memcg_page_state_local(memcg, item) *
		memcg_page_state_output_unit(item);
}

/* Format the cgroup2 memory.stat contents for @memcg into @s. */
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	int i;

	/*
	 * Provide statistics on the state of the memory subsystem as
	 * well as cumulative event counters that show past behavior.
	 *
	 * This list is ordered following a combination of these gradients:
	 * 1) generic big picture -> specifics and details
	 * 2) reflecting userspace activity -> reflecting kernel heuristics
	 *
	 * Current memory state:
	 */
	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		u64 size;

		size = memcg_page_state_output(memcg, memory_stats[i].idx);
		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);

		/* Emit a combined "slab" total right after the slab items. */
		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
			size += memcg_page_state_output(memcg,
							NR_SLAB_RECLAIMABLE_B);
			seq_buf_printf(s, "slab %llu\n", size);
		}
	}

	/* Accumulated memory events */
	seq_buf_printf(s, "pgscan %lu\n",
		       memcg_events(memcg, PGSCAN_KSWAPD) +
		       memcg_events(memcg, PGSCAN_DIRECT) +
		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
	seq_buf_printf(s, "pgsteal %lu\n",
		       memcg_events(memcg, PGSTEAL_KSWAPD) +
		       memcg_events(memcg, PGSTEAL_DIRECT) +
		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));

	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
		/* PGPGIN/PGPGOUT are not reported in this interface. */
		if (memcg_vm_event_stat[i] == PGPGIN ||
		    memcg_vm_event_stat[i] == PGPGOUT)
			continue;

		seq_buf_printf(s, "%s %lu\n",
			       vm_event_name(memcg_vm_event_stat[i]),
			       memcg_events(memcg, memcg_vm_event_stat[i]));
	}
}

/* Dispatch to the cgroup v2 or v1 stat formatter, warning on overflow. */
static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg_stat_format(memcg, s);
	else
		memcg1_stat_format(memcg, s);
	if (seq_buf_has_overflowed(s))
		pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
}

/**
 * mem_cgroup_print_oom_context: Print OOM information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
	rcu_read_lock();

	if (memcg) {
		pr_cont(",oom_memcg=");
		pr_cont_cgroup_path(memcg->css.cgroup);
	} else
		pr_cont(",global_oom");
	if (p) {
		pr_cont(",task_memcg=");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
	}
	rcu_read_unlock();
}

/**
 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
 * memory controller.
 * @memcg: The memory cgroup that went over limit
 */
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
	/* Use static buffer, for the caller is holding oom_lock.
	 */
	static char buf[PAGE_SIZE];
	struct seq_buf s;

	lockdep_assert_held(&oom_lock);

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->swap)),
			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
#ifdef CONFIG_MEMCG_V1
	else {
		/* cgroup1 reports the combined memsw and kmem counters. */
		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->memsw)),
			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
			K((u64)page_counter_read(&memcg->kmem)),
			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
	}
#endif

	pr_info("Memory cgroup stats for ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(":");
	seq_buf_init(&s, buf, sizeof(buf));
	memory_stat_format(memcg, &s);
	seq_buf_do_printk(&s, KERN_INFO);
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
	unsigned long max = READ_ONCE(memcg->memory.max);

	if (do_memsw_account()) {
		if (mem_cgroup_swappiness(memcg)) {
			/* Calculate swap excess capacity from memsw limit */
			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

			max += min(swap, (unsigned long)total_swap_pages);
		}
	} else {
		/* cgroup2: the swap counter is separate from memory. */
		if (mem_cgroup_swappiness(memcg))
			max += min(READ_ONCE(memcg->swap.max),
				   (unsigned long)total_swap_pages);
	}
	return max;
}

/* Current memory charge of @memcg, in pages. */
unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
	return page_counter_read(&memcg->memory);
}

/*
 * Invoke the OOM killer on behalf of @memcg for a failed charge of
 * 1 << @order pages. Returns true when forward progress was made (or
 * the charge can be retried), false when the caller should give up.
 */
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = memcg,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	bool ret = true;

	if (mutex_lock_killable(&oom_lock))
		return true;

	/* Somebody may have freed memory meanwhile; re-check the margin. */
	if (mem_cgroup_margin(memcg) >= (1 << order))
		goto unlock;

	/*
	 * A few threads which were not waiting at mutex_lock_killable() can
	 * fail to bail out. Therefore, check again after holding oom_lock.
	 */
	ret = task_is_dying() || out_of_memory(&oc);

unlock:
	mutex_unlock(&oom_lock);
	return ret;
}

/*
 * Returns true if successfully killed one or more processes. Though in some
 * corner cases it can return true even without killing any process.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	/* No OOM killing for costly-order allocations. */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return false;

	memcg_memory_event(memcg, MEMCG_OOM);

	if (!memcg1_oom_prepare(memcg, &locked))
		return false;

	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	memcg1_oom_finish(memcg, locked);

	return ret;
}

/**
 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
 * @victim: task to be killed by the OOM killer
 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
 *
 * Returns a pointer to a memory cgroup, which has to be cleaned up
 * by killing all belonging OOM-killable tasks.
 *
 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
 */
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
					    struct mem_cgroup *oom_domain)
{
	struct mem_cgroup *oom_group = NULL;
	struct mem_cgroup *memcg;

	/* memory.oom.group only exists on the cgroup2 hierarchy. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return NULL;

	if (!oom_domain)
		oom_domain = root_mem_cgroup;

	rcu_read_lock();

	memcg = mem_cgroup_from_task(victim);
	if (mem_cgroup_is_root(memcg))
		goto out;

	/*
	 * If the victim task has been asynchronously moved to a different
	 * memory cgroup, we might end up killing tasks outside oom_domain.
	 * In this case it's better to ignore memory.group.oom.
	 */
	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
		goto out;

	/*
	 * Traverse the memory cgroup hierarchy from the victim task's
	 * cgroup up to the OOMing cgroup (or root) to find the
	 * highest-level memory cgroup with oom.group set.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		if (READ_ONCE(memcg->oom_group))
			oom_group = memcg;

		if (memcg == oom_domain)
			break;
	}

	if (oom_group)
		css_get(&oom_group->css);
out:
	rcu_read_unlock();

	return oom_group;
}

/* Announce an impending memory.oom.group kill of @memcg's tasks. */
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}

/* Per-CPU cache of pre-charged pages (and object bytes) for one memcg. */
struct memcg_stock_pcp {
	local_lock_t stock_lock;
	struct mem_cgroup *cached; /* this never be root cgroup */
	unsigned int nr_pages;

	/* objcg byte-sized charge cache, drained via drain_obj_stock() */
	struct obj_cgroup *cached_objcg;
	struct pglist_data *cached_pgdat;
	unsigned int nr_bytes;
	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;

	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
/* Serializes drain_all_stock() so only one drainer runs at a time. */
static DEFINE_MUTEX(percpu_charge_mutex);

static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned int stock_pages;
	unsigned long flags;
	bool ret = false;

	/* Requests larger than the batch size always miss the stock. */
	if (nr_pages > MEMCG_CHARGE_BATCH)
		return ret;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	stock_pages = READ_ONCE(stock->nr_pages);
	if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
		WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}

/*
 * Returns stocks cached in percpu and reset cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	unsigned int stock_pages = READ_ONCE(stock->nr_pages);
	struct mem_cgroup *old = READ_ONCE(stock->cached);

	if (!old)
		return;

	/* Return the cached pre-charge to the page counters. */
	if (stock_pages) {
		page_counter_uncharge(&old->memory, stock_pages);
		if (do_memsw_account())
			page_counter_uncharge(&old->memsw, stock_pages);

		WRITE_ONCE(stock->nr_pages, 0);
	}

	/* Drop the reference pinning the cached memcg. */
	css_put(&old->css);
	WRITE_ONCE(stock->cached, NULL);
}

static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;

	/*
	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
	 * drain_stock races is that we always operate on local CPU stock
	 * here with IRQ disabled
	 */
	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	old = drain_obj_stock(stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* Drop the objcg reference outside the locked section. */
	obj_cgroup_put(old);
}

/*
 * Cache charges(val) to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned int stock_pages;

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
		drain_stock(stock);
		css_get(&memcg->css);
		WRITE_ONCE(stock->cached, memcg);
	}
	stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
	WRITE_ONCE(stock->nr_pages, stock_pages);

	/* Cap the cache: return everything once it exceeds one batch. */
	if (stock_pages > MEMCG_CHARGE_BATCH)
		drain_stock(stock);
}

/* Locked wrapper around __refill_stock(). */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	unsigned long flags;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	__refill_stock(memcg, nr_pages);
	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}

/*
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid adding running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/*
	 * Notify other cpus that system-wide "drain" is running
	 * We do not care about races with the cpu hotplug because cpu down
	 * as well as workers from this path always operate on the local
	 * per-cpu data.
	 * CPU up doesn't touch memcg_stock at all.
	 */
	migrate_disable();
	curcpu = smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;
		bool flush = false;

		rcu_read_lock();
		memcg = READ_ONCE(stock->cached);
		if (memcg && READ_ONCE(stock->nr_pages) &&
		    mem_cgroup_is_descendant(memcg, root_memcg))
			flush = true;
		else if (obj_stock_flush_required(stock, root_memcg))
			flush = true;
		rcu_read_unlock();

		if (flush &&
		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			/* Drain the local CPU directly; others via workqueue. */
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else if (!cpu_is_isolated(cpu))
				schedule_work_on(cpu, &stock->work);
		}
	}
	migrate_enable();
	mutex_unlock(&percpu_charge_mutex);
}

/* CPU hotplug callback: return the dead CPU's cached charge. */
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
	struct memcg_stock_pcp *stock;

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);

	return 0;
}

/*
 * Reclaim from @memcg and each of its ancestors that is above its
 * memory.high, and return the total number of pages reclaimed.
 */
static unsigned long reclaim_high(struct mem_cgroup *memcg,
				  unsigned int nr_pages,
				  gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;

	do {
		unsigned long pflags;

		if (page_counter_read(&memcg->memory) <=
		    READ_ONCE(memcg->memory.high))
			continue;

		memcg_memory_event(memcg, MEMCG_HIGH);

		psi_memstall_enter(&pflags);
		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
							gfp_mask,
							MEMCG_RECLAIM_MAY_SWAP,
							NULL);
		psi_memstall_leave(&pflags);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return nr_reclaimed;
}

/* Async worker for deferred memory.high reclaim. */
static void high_work_func(struct work_struct *work)
{
	struct mem_cgroup *memcg;

	memcg = container_of(work, struct mem_cgroup, high_work);
	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
 * When calculating the delay, we use these either side of the exponentiation to
 * maintain precision and scale to a reasonable number of jiffies (see the table
 * below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 *  | 100M  |                      0 |
 *  | 101M  |                      6 |
 *  | 102M  |                     25 |
 *  | 103M  |                     57 |
 *  | 104M  |                    102 |
 *  | 105M  |                    159 |
 *  | 106M  |                    230 |
 *  | 107M  |                    313 |
 *  | 108M  |                    409 |
 *  | 109M  |                    518 |
 *  | 110M  |                    639 |
 *  | 111M  |                    774 |
 *  | 112M  |                    921 |
 *  | 113M  |                   1081 |
 *  | 114M  |                   1254 |
 *  | 115M  |                   1439 |
 *  | 116M  |                   1638 |
 *  | 117M  |                   1849 |
 *  | 118M  |                   2000 |
 *  | 119M  |                   2000 |
 *  | 120M  |                   2000 |
 *  +-------+------------------------+
 */
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

/*
 * Fixed-point overage ratio (usage - high) / high, scaled up by
 * MEMCG_DELAY_PRECISION_SHIFT bits; 0 when usage is within the limit.
 */
static u64 calculate_overage(unsigned long usage, unsigned long high)
{
	u64 overage;

	if (usage <= high)
		return 0;

	/*
	 * Prevent division by 0 in overage calculation by acting as if
	 * it was a threshold of 1 page
	 */
	high = max(high, 1UL);

	overage = usage - high;
	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
	return div64_u64(overage, high);
}

/* Worst memory.high overage of @memcg and its non-root ancestors. */
static u64 mem_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->memory),
					    READ_ONCE(memcg->memory.high));
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

/* Worst swap.high overage of @memcg and its non-root ancestors. */
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
	u64 overage, max_overage = 0;

	do {
		overage = calculate_overage(page_counter_read(&memcg->swap),
					    READ_ONCE(memcg->swap.high));
		if (overage)
			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
		max_overage = max(overage, max_overage);
	} while ((memcg = parent_mem_cgroup(memcg)) &&
		 !mem_cgroup_is_root(memcg));

	return max_overage;
}

/*
 * Get the number of jiffies that we should penalise a mischievous cgroup which
 * is exceeding its memory.high by checking both it and its ancestors.
 */
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
					  unsigned int nr_pages,
					  u64 max_overage)
{
	unsigned long penalty_jiffies;

	if (!max_overage)
		return 0;

	/*
	 * We use overage compared to memory.high to calculate the number of
	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
	 * fairly lenient on small overages, and increasingly harsh when the
	 * memcg in question makes it clear that it has no intention of stopping
	 * its crazy behaviour, so we exponentially increase the delay based on
	 * overage amount.
	 */
	penalty_jiffies = max_overage * max_overage * HZ;
	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

	/*
	 * Factor in the task's own contribution to the overage, such that four
	 * N-sized allocations are throttled approximately the same as one
	 * 4N-sized allocation.
	 *
	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
	 * larger the current charge patch is than that.
	 */
	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}

/*
 * Reclaims memory over the high limit. Called directly from
 * try_charge() (context permitting), as well as from the userland
 * return path where reclaim is always able to block.
 */
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
	unsigned long penalty_jiffies;
	unsigned long pflags;
	unsigned long nr_reclaimed;
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *memcg;
	bool in_retry = false;

	if (likely(!nr_pages))
		return;

	memcg = get_mem_cgroup_from_mm(current->mm);
	current->memcg_nr_pages_over_high = 0;

retry_reclaim:
	/*
	 * Bail if the task is already exiting. Unlike memory.max,
	 * memory.high enforcement isn't as strict, and there is no
	 * OOM killer involved, which means the excess could already
	 * be much bigger (and still growing) than it could for
	 * memory.max; the dying task could get stuck in fruitless
	 * reclaim for a long time, which isn't desirable.
	 */
	if (task_is_dying())
		goto out;

	/*
	 * The allocating task should reclaim at least the batch size, but for
	 * subsequent retries we only want to do what's necessary to prevent oom
	 * or breaching resource isolation.
	 *
	 * This is distinct from memory.max or page allocator behaviour because
	 * memory.high is currently batched, whereas memory.max and the page
	 * allocator run every time an allocation is made.
	 */
	nr_reclaimed = reclaim_high(memcg,
				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
				    gfp_mask);

	/*
	 * memory.high is breached and reclaim is unable to keep up. Throttle
	 * allocators proactively to slow down excessive growth.
	 */
	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
					       mem_find_max_overage(memcg));

	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
						swap_find_max_overage(memcg));

	/*
	 * Clamp the max delay per usermode return so as to still keep the
	 * application moving forwards and also permit diagnostics, albeit
	 * extremely slowly.
	 */
	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

	/*
	 * Don't sleep if the amount of jiffies this memcg owes us is so low
	 * that it's not even worth doing, in an attempt to be nice to those who
	 * go only a small amount over their memory.high value and maybe haven't
	 * been aggressively reclaimed enough yet.
	 */
	if (penalty_jiffies <= HZ / 100)
		goto out;

	/*
	 * If reclaim is making forward progress but we're still over
	 * memory.high, we want to encourage that rather than doing allocator
	 * throttling.
	 */
	if (nr_reclaimed || nr_retries--) {
		in_retry = true;
		goto retry_reclaim;
	}

	/*
	 * Reclaim didn't manage to push usage below the limit, slow
	 * this allocating task down.
	 *
	 * If we exit early, we're guaranteed to die (since
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);

out:
	css_put(&memcg->css);
}

/*
 * Charge @nr_pages to @memcg, reclaiming and/or invoking the OOM killer
 * as needed.  Returns 0 on success (possibly force-charged), -ENOMEM on
 * failure.
 */
int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
		     unsigned int nr_pages)
{
	/* Over-charge by up to one batch and stash the excess per-CPU. */
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool passed_oom = false;
	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
	bool drained = false;
	bool raised_max_event = false;
	unsigned long pflags;

retry:
	if (consume_stock(memcg, nr_pages))
		return 0;

	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		/* memsw is maxed out; swapping would not help. */
		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
	}

	/* Retry with the exact size before falling into reclaim. */
	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
2204 */ 2205 if (unlikely(current->flags & PF_MEMALLOC)) 2206 goto force; 2207 2208 if (unlikely(task_in_memcg_oom(current))) 2209 goto nomem; 2210 2211 if (!gfpflags_allow_blocking(gfp_mask)) 2212 goto nomem; 2213 2214 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2215 raised_max_event = true; 2216 2217 psi_memstall_enter(&pflags); 2218 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2219 gfp_mask, reclaim_options, NULL); 2220 psi_memstall_leave(&pflags); 2221 2222 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2223 goto retry; 2224 2225 if (!drained) { 2226 drain_all_stock(mem_over_limit); 2227 drained = true; 2228 goto retry; 2229 } 2230 2231 if (gfp_mask & __GFP_NORETRY) 2232 goto nomem; 2233 /* 2234 * Even though the limit is exceeded at this point, reclaim 2235 * may have been able to free some pages. Retry the charge 2236 * before killing the task. 2237 * 2238 * Only for regular pages, though: huge pages are rather 2239 * unlikely to succeed so close to the limit, and we fall back 2240 * to regular pages anyway in case of failure. 2241 */ 2242 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2243 goto retry; 2244 /* 2245 * At task move, charge accounts can be doubly counted. So, it's 2246 * better to wait until the end of task_move if something is going on. 2247 */ 2248 if (memcg1_wait_acct_move(mem_over_limit)) 2249 goto retry; 2250 2251 if (nr_retries--) 2252 goto retry; 2253 2254 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2255 goto nomem; 2256 2257 /* Avoid endless loop for tasks bypassed by the oom killer */ 2258 if (passed_oom && task_is_dying()) 2259 goto nomem; 2260 2261 /* 2262 * keep retrying as long as the memcg oom killer is able to make 2263 * a forward progress or bypass the charge if the oom killer 2264 * couldn't make any progress. 
2265 */ 2266 if (mem_cgroup_oom(mem_over_limit, gfp_mask, 2267 get_order(nr_pages * PAGE_SIZE))) { 2268 passed_oom = true; 2269 nr_retries = MAX_RECLAIM_RETRIES; 2270 goto retry; 2271 } 2272 nomem: 2273 /* 2274 * Memcg doesn't have a dedicated reserve for atomic 2275 * allocations. But like the global atomic pool, we need to 2276 * put the burden of reclaim on regular allocation requests 2277 * and let these go through as privileged allocations. 2278 */ 2279 if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) 2280 return -ENOMEM; 2281 force: 2282 /* 2283 * If the allocation has to be enforced, don't forget to raise 2284 * a MEMCG_MAX event. 2285 */ 2286 if (!raised_max_event) 2287 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2288 2289 /* 2290 * The allocation either can't fail or will lead to more memory 2291 * being freed very soon. Allow memory usage go over the limit 2292 * temporarily by force charging it. 2293 */ 2294 page_counter_charge(&memcg->memory, nr_pages); 2295 if (do_memsw_account()) 2296 page_counter_charge(&memcg->memsw, nr_pages); 2297 2298 return 0; 2299 2300 done_restock: 2301 if (batch > nr_pages) 2302 refill_stock(memcg, batch - nr_pages); 2303 2304 /* 2305 * If the hierarchy is above the normal consumption range, schedule 2306 * reclaim on returning to userland. We can perform reclaim here 2307 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2308 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2309 * not recorded as it most likely matches current's and won't 2310 * change in the meantime. As high limit is checked again before 2311 * reclaim, the cost of mismatch is negligible. 
2312 */ 2313 do { 2314 bool mem_high, swap_high; 2315 2316 mem_high = page_counter_read(&memcg->memory) > 2317 READ_ONCE(memcg->memory.high); 2318 swap_high = page_counter_read(&memcg->swap) > 2319 READ_ONCE(memcg->swap.high); 2320 2321 /* Don't bother a random interrupted task */ 2322 if (!in_task()) { 2323 if (mem_high) { 2324 schedule_work(&memcg->high_work); 2325 break; 2326 } 2327 continue; 2328 } 2329 2330 if (mem_high || swap_high) { 2331 /* 2332 * The allocating tasks in this cgroup will need to do 2333 * reclaim or be throttled to prevent further growth 2334 * of the memory or swap footprints. 2335 * 2336 * Target some best-effort fairness between the tasks, 2337 * and distribute reclaim work and delay penalties 2338 * based on how much each task is actually allocating. 2339 */ 2340 current->memcg_nr_pages_over_high += batch; 2341 set_notify_resume(current); 2342 break; 2343 } 2344 } while ((memcg = parent_mem_cgroup(memcg))); 2345 2346 /* 2347 * Reclaim is set up above to be called from the userland 2348 * return path. But also attempt synchronous reclaim to avoid 2349 * excessive overrun while the task is still inside the 2350 * kernel. If this is successful, the return path will see it 2351 * when it rechecks the overage and simply bail out. 2352 */ 2353 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && 2354 !(current->flags & PF_MEMALLOC) && 2355 gfpflags_allow_blocking(gfp_mask)) 2356 mem_cgroup_handle_over_high(gfp_mask); 2357 return 0; 2358 } 2359 2360 /** 2361 * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. 2362 * @memcg: memcg previously charged. 2363 * @nr_pages: number of pages previously charged. 
 */
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	/* The root cgroup's counters are never charged, so nothing to undo. */
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_uncharge(&memcg->memsw, nr_pages);
}

/*
 * Bind @folio to @memcg by writing the memcg pointer into folio->memcg_data.
 * The folio must not already belong to a memcg.
 */
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
	/*
	 * Any of the following ensures page's memcg stability:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - folio_memcg_lock()
	 * - exclusive reference
	 * - mem_cgroup_trylock_pages()
	 */
	folio->memcg_data = (unsigned long)memcg;
}

/**
 * mem_cgroup_commit_charge - commit a previously successful try_charge().
 * @folio: folio to commit the charge to.
 * @memcg: memcg previously charged.
 *
 * Takes a css reference for the folio's ownership and updates the
 * charge statistics with interrupts disabled.
 */
void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	css_get(&memcg->css);
	commit_charge(folio, memcg);

	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_enable();
}

/*
 * Update a node-local memcg vmstat counter on behalf of @objcg. The
 * objcg->memcg binding is sampled under RCU, as reparenting can change it.
 */
static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
				       struct pglist_data *pgdat,
				       enum node_stat_item idx, int nr)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_memcg_lruvec_state(lruvec, idx, nr);
	rcu_read_unlock();
}

static __always_inline
struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
{
	/*
	 * Slab objects are accounted individually, not per-page.
	 * Memcg membership data for each individual object is saved in
	 * slab->obj_exts.
	 */
	if (folio_test_slab(folio)) {
		struct slabobj_ext *obj_exts;
		struct slab *slab;
		unsigned int off;

		slab = folio_slab(folio);
		obj_exts = slab_obj_exts(slab);
		/* No extension vector means the object was never accounted. */
		if (!obj_exts)
			return NULL;

		off = obj_to_index(slab->slab_cache, slab, p);
		if (obj_exts[off].objcg)
			return obj_cgroup_memcg(obj_exts[off].objcg);

		return NULL;
	}

	/*
	 * folio_memcg_check() is used here, because in theory we can encounter
	 * a folio where the slab flag has been cleared already, but
	 * slab->obj_exts has not been freed yet.
	 * folio_memcg_check() will guarantee that a proper memory
	 * cgroup pointer or NULL will be returned.
	 */
	return folio_memcg_check(folio);
}

/*
 * Returns a pointer to the memory cgroup to which the kernel object is charged.
 * It is not suitable for objects allocated using vmalloc().
 *
 * A passed kernel object must be a slab object or a generic kernel page.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
 */
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
	if (mem_cgroup_disabled())
		return NULL;

	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
}

/*
 * Walk up the hierarchy from @memcg and return the first objcg a reference
 * could be taken on, or NULL if none (e.g. all ancestors are offlined or
 * @memcg is the root). Must be called under rcu_read_lock().
 */
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
	struct obj_cgroup *objcg = NULL;

	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
		objcg = rcu_dereference(memcg->objcg);
		if (likely(objcg && obj_cgroup_tryget(objcg)))
			break;
		objcg = NULL;
	}
	return objcg;
}

/*
 * Refresh current->objcg after the update flag was set (presumably on a
 * cgroup migration — confirm against the flag's writers): drop the stale
 * reference and re-resolve the objcg from the task's current memcg.
 */
static struct obj_cgroup *current_objcg_update(void)
{
	struct mem_cgroup *memcg;
	struct obj_cgroup *old, *objcg = NULL;

	do {
		/* Atomically drop the update bit.
*/ 2492 old = xchg(¤t->objcg, NULL); 2493 if (old) { 2494 old = (struct obj_cgroup *) 2495 ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); 2496 obj_cgroup_put(old); 2497 2498 old = NULL; 2499 } 2500 2501 /* If new objcg is NULL, no reason for the second atomic update. */ 2502 if (!current->mm || (current->flags & PF_KTHREAD)) 2503 return NULL; 2504 2505 /* 2506 * Release the objcg pointer from the previous iteration, 2507 * if try_cmpxcg() below fails. 2508 */ 2509 if (unlikely(objcg)) { 2510 obj_cgroup_put(objcg); 2511 objcg = NULL; 2512 } 2513 2514 /* 2515 * Obtain the new objcg pointer. The current task can be 2516 * asynchronously moved to another memcg and the previous 2517 * memcg can be offlined. So let's get the memcg pointer 2518 * and try get a reference to objcg under a rcu read lock. 2519 */ 2520 2521 rcu_read_lock(); 2522 memcg = mem_cgroup_from_task(current); 2523 objcg = __get_obj_cgroup_from_memcg(memcg); 2524 rcu_read_unlock(); 2525 2526 /* 2527 * Try set up a new objcg pointer atomically. If it 2528 * fails, it means the update flag was set concurrently, so 2529 * the whole procedure should be repeated. 2530 */ 2531 } while (!try_cmpxchg(¤t->objcg, &old, objcg)); 2532 2533 return objcg; 2534 } 2535 2536 __always_inline struct obj_cgroup *current_obj_cgroup(void) 2537 { 2538 struct mem_cgroup *memcg; 2539 struct obj_cgroup *objcg; 2540 2541 if (in_task()) { 2542 memcg = current->active_memcg; 2543 if (unlikely(memcg)) 2544 goto from_memcg; 2545 2546 objcg = READ_ONCE(current->objcg); 2547 if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) 2548 objcg = current_objcg_update(); 2549 /* 2550 * Objcg reference is kept by the task, so it's safe 2551 * to use the objcg by the current task. 
		 */
		return objcg;
	}

	/* In interrupt context, only an interrupt-scoped active memcg counts. */
	memcg = this_cpu_read(int_active_memcg);
	if (unlikely(memcg))
		goto from_memcg;

	return NULL;

from_memcg:
	objcg = NULL;
	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
		/*
		 * Memcg pointer is protected by scope (see set_active_memcg())
		 * and is pinning the corresponding objcg, so objcg can't go
		 * away and can be used within the scope without any additional
		 * protection.
		 */
		objcg = rcu_dereference_check(memcg->objcg, 1);
		if (likely(objcg))
			break;
	}

	return objcg;
}

/*
 * Return the objcg a folio is charged to, with a reference taken, or NULL.
 * Kmem folios store the objcg directly; others resolve it via their memcg.
 */
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
	struct obj_cgroup *objcg;

	if (!memcg_kmem_online())
		return NULL;

	if (folio_memcg_kmem(folio)) {
		objcg = __folio_objcg(folio);
		obj_cgroup_get(objcg);
	} else {
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = __folio_memcg(folio);
		if (memcg)
			objcg = __get_obj_cgroup_from_memcg(memcg);
		else
			objcg = NULL;
		rcu_read_unlock();
	}
	return objcg;
}

/*
 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
 * @objcg: object cgroup to uncharge
 * @nr_pages: number of pages to uncharge
 */
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
				      unsigned int nr_pages)
{
	struct mem_cgroup *memcg;

	memcg = get_mem_cgroup_from_objcg(objcg);

	mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
	memcg1_account_kmem(memcg, -nr_pages);
	/* Return the pages to the per-cpu stock rather than the counter. */
	refill_stock(memcg, nr_pages);

	css_put(&memcg->css);
}

/*
 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
 * @objcg: object cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
				   unsigned int nr_pages)
{
	struct mem_cgroup *memcg;
	int ret;

	memcg = get_mem_cgroup_from_objcg(objcg);

	ret = try_charge_memcg(memcg, gfp, nr_pages);
	if (ret)
		goto out;

	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
	memcg1_account_kmem(memcg, nr_pages);
out:
	css_put(&memcg->css);

	return ret;
}

/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
	struct obj_cgroup *objcg;
	int ret = 0;

	objcg = current_obj_cgroup();
	if (objcg) {
		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
		if (!ret) {
			/* On success the page owns a reference to the objcg. */
			obj_cgroup_get(objcg);
			page->memcg_data = (unsigned long)objcg |
				MEMCG_DATA_KMEM;
			return 0;
		}
	}
	return ret;
}

/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
	struct folio *folio = page_folio(page);
	struct obj_cgroup *objcg;
	unsigned int nr_pages = 1 << order;

	if (!folio_memcg_kmem(folio))
		return;

	objcg = __folio_objcg(folio);
	obj_cgroup_uncharge_pages(objcg, nr_pages);
	/* Clear the binding before dropping the page's objcg reference. */
	folio->memcg_data = 0;
	obj_cgroup_put(objcg);
}

/*
 * Accumulate a slab vmstat delta for @objcg in the per-cpu stock, flushing
 * the cached value when the cached objcg, node, or counter index changes.
 */
static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
			    enum node_stat_item idx, int nr)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;
	int *bytes;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);
	stock = this_cpu_ptr(&memcg_stock);

	/*
	 * Save vmstat data in stock and skip vmstat array update unless
	 * accumulating over a page of vmstat data or when pgdat or idx
	 * changes.
	 */
	if (READ_ONCE(stock->cached_objcg) != objcg) {
		old = drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
			? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
		WRITE_ONCE(stock->cached_objcg, objcg);
		stock->cached_pgdat = pgdat;
	} else if (stock->cached_pgdat != pgdat) {
		/* Flush the existing cached vmstat data */
		struct pglist_data *oldpg = stock->cached_pgdat;

		if (stock->nr_slab_reclaimable_b) {
			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
					    stock->nr_slab_reclaimable_b);
			stock->nr_slab_reclaimable_b = 0;
		}
		if (stock->nr_slab_unreclaimable_b) {
			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
					    stock->nr_slab_unreclaimable_b);
			stock->nr_slab_unreclaimable_b = 0;
		}
		stock->cached_pgdat = pgdat;
	}

	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
					       : &stock->nr_slab_unreclaimable_b;
	/*
	 * Even for large object >= PAGE_SIZE, the vmstat data will still be
	 * cached locally at least once before pushing it out.
	 */
	if (!*bytes) {
		*bytes = nr;
		nr = 0;
	} else {
		*bytes += nr;
		if (abs(*bytes) > PAGE_SIZE) {
			/* Accumulated a page's worth of deltas; push them out. */
			nr = *bytes;
			*bytes = 0;
		} else {
			nr = 0;
		}
	}
	if (nr)
		__mod_objcg_mlstate(objcg, pgdat, idx, nr);

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	/* Drop the reference to the previously cached objcg outside the lock. */
	obj_cgroup_put(old);
}

/*
 * Try to satisfy a byte charge from this CPU's cached pre-charge for @objcg.
 * Returns true on success, false if the caller must charge pages instead.
 */
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
		stock->nr_bytes -= nr_bytes;
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);

	return ret;
}

/*
 * Flush the cached objcg byte pre-charge and vmstat deltas out of @stock.
 * Called with memcg_stock.stock_lock held.
 */
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);

	if (!old)
		return NULL;

	if (stock->nr_bytes) {
		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);

		if (nr_pages) {
			struct mem_cgroup *memcg;

			memcg = get_mem_cgroup_from_objcg(old);

			mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
			memcg1_account_kmem(memcg, -nr_pages);
			__refill_stock(memcg, nr_pages);

			css_put(&memcg->css);
		}

		/*
		 * The leftover is flushed to the centralized per-memcg value.
		 * On the next attempt to refill obj stock it will be moved
		 * to a per-cpu stock (probably, on another CPU), see
		 * refill_obj_stock().
		 *
		 * How often it's flushed is a trade-off between the memory
		 * limit enforcement accuracy and potential CPU contention,
		 * so it might be changed in the future.
		 */
		atomic_add(nr_bytes, &old->nr_charged_bytes);
		stock->nr_bytes = 0;
	}

	/*
	 * Flush the vmstat data in current stock
	 */
	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
		if (stock->nr_slab_reclaimable_b) {
			__mod_objcg_mlstate(old, stock->cached_pgdat,
					    NR_SLAB_RECLAIMABLE_B,
					    stock->nr_slab_reclaimable_b);
			stock->nr_slab_reclaimable_b = 0;
		}
		if (stock->nr_slab_unreclaimable_b) {
			__mod_objcg_mlstate(old, stock->cached_pgdat,
					    NR_SLAB_UNRECLAIMABLE_B,
					    stock->nr_slab_unreclaimable_b);
			stock->nr_slab_unreclaimable_b = 0;
		}
		stock->cached_pgdat = NULL;
	}

	WRITE_ONCE(stock->cached_objcg, NULL);
	/*
	 * The `old' objects need to be released by the caller via
	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
	 */
	return old;
}

/*
 * Check whether @stock caches an objcg belonging to @root_memcg's subtree,
 * i.e. whether draining this CPU's stock is required for that hierarchy.
 */
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
	struct mem_cgroup *memcg;

	if (objcg) {
		memcg = obj_cgroup_memcg(objcg);
		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
			return true;
	}

	return false;
}

/*
 * Add @nr_bytes of pre-charge for @objcg to this CPU's stock. When
 * @allow_uncharge is set, whole pages above PAGE_SIZE worth of pre-charge
 * are returned to the page counters.
 */
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
			     bool allow_uncharge)
{
	struct memcg_stock_pcp *stock;
	struct obj_cgroup *old = NULL;
	unsigned long flags;
	unsigned int nr_pages = 0;

	local_lock_irqsave(&memcg_stock.stock_lock, flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
		old = drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		WRITE_ONCE(stock->cached_objcg, objcg);
		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
			? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
		allow_uncharge = true;	/* Allow uncharge when objcg changes */
	}
	stock->nr_bytes += nr_bytes;

	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= (PAGE_SIZE - 1);
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	obj_cgroup_put(old);

	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);
}

/*
 * Charge @size bytes to @objcg, using this CPU's byte stock when possible.
 * Returns 0 on success, an error code on failure.
 */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
	unsigned int nr_pages, nr_bytes;
	int ret;

	if (consume_obj_stock(objcg, size))
		return 0;

	/*
	 * In theory, objcg->nr_charged_bytes can have enough
	 * pre-charged bytes to satisfy the allocation. However,
	 * flushing objcg->nr_charged_bytes requires two atomic
	 * operations, and objcg->nr_charged_bytes can't be big.
	 * The shared objcg->nr_charged_bytes can also become a
	 * performance bottleneck if all tasks of the same memcg are
	 * trying to update it. So it's better to ignore it and try
	 * grab some new pages. The stock's nr_bytes will be flushed to
	 * objcg->nr_charged_bytes later on when objcg changes.
	 *
	 * The stock's nr_bytes may contain enough pre-charged bytes
	 * to allow one less page from being charged, but we can't rely
	 * on the pre-charged bytes not being changed outside of
	 * consume_obj_stock() or refill_obj_stock(). So ignore those
	 * pre-charged bytes as well when charging pages. To avoid a
	 * page uncharge right after a page charge, we set the
	 * allow_uncharge flag to false when calling refill_obj_stock()
	 * to temporarily allow the pre-charged bytes to exceed the page
	 * size limit. The maximum reachable value of the pre-charged
	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
	 * race.
	 */
	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);

	/* Round the partial page up to a full page charge. */
	if (nr_bytes)
		nr_pages += 1;

	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
	if (!ret && nr_bytes)
		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);

	return ret;
}

void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
	refill_obj_stock(objcg, size, true);
}

static inline size_t obj_full_size(struct kmem_cache *s)
{
	/*
	 * For each accounted object there is an extra space which is used
	 * to store obj_cgroup membership. Charge it too.
	 */
	return s->size + sizeof(struct obj_cgroup *);
}

/*
 * Charge @size freshly allocated slab objects in @p to the current objcg
 * and record the objcg in each object's slab extension. Returns false if
 * the charge failed and the allocation should be rolled back.
 */
bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
				  gfp_t flags, size_t size, void **p)
{
	struct obj_cgroup *objcg;
	struct slab *slab;
	unsigned long off;
	size_t i;

	/*
	 * The obtained objcg pointer is safe to use within the current scope,
	 * defined by current task or set_active_memcg() pair.
	 * obj_cgroup_get() is used to get a permanent reference.
	 */
	objcg = current_obj_cgroup();
	if (!objcg)
		return true;

	/*
	 * slab_alloc_node() avoids the NULL check, so we might be called with a
	 * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
	 * the whole requested size.
	 * return success as there's nothing to free back.
	 */
	if (unlikely(*p == NULL))
		return true;

	flags &= gfp_allowed_mask;

	if (lru) {
		int ret;
		struct mem_cgroup *memcg;

		/* Make sure the memcg's list_lru structures exist first. */
		memcg = get_mem_cgroup_from_objcg(objcg);
		ret = memcg_list_lru_alloc(memcg, lru, flags);
		css_put(&memcg->css);

		if (ret)
			return false;
	}

	if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
		return false;

	for (i = 0; i < size; i++) {
		slab = virt_to_slab(p[i]);

		if (!slab_obj_exts(slab) &&
		    alloc_slab_obj_exts(slab, s, flags, false)) {
			/* Can't record ownership; give this object's charge back. */
			obj_cgroup_uncharge(objcg, obj_full_size(s));
			continue;
		}

		off = obj_to_index(s, slab, p[i]);
		obj_cgroup_get(objcg);
		slab_obj_exts(slab)[off].objcg = objcg;
		mod_objcg_state(objcg, slab_pgdat(slab),
				cache_vmstat_idx(s), obj_full_size(s));
	}

	return true;
}

/*
 * Uncharge @objects freed slab objects in @p and drop each object's
 * reference to its recorded objcg.
 */
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
			    void **p, int objects, struct slabobj_ext *obj_exts)
{
	for (int i = 0; i < objects; i++) {
		struct obj_cgroup *objcg;
		unsigned int off;

		off = obj_to_index(s, slab, p[i]);
		objcg = obj_exts[off].objcg;
		/* Unaccounted object (e.g. charge failed at alloc time). */
		if (!objcg)
			continue;

		obj_exts[off].objcg = NULL;
		obj_cgroup_uncharge(objcg, obj_full_size(s));
		mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
				-obj_full_size(s));
		obj_cgroup_put(objcg);
	}
}

/*
 * Because folio_memcg(head) is not set on tails, set it now.
 */
void split_page_memcg(struct page *head, int old_order, int new_order)
{
	struct folio *folio = page_folio(head);
	struct mem_cgroup *memcg = folio_memcg(folio);
	int i;
	unsigned int old_nr = 1 << old_order;
	unsigned int new_nr = 1 << new_order;

	if (mem_cgroup_disabled() || !memcg)
		return;

	/* Copy the head's memcg binding to the head page of each new folio. */
	for (i = new_nr; i < old_nr; i += new_nr)
		folio_page(folio, i)->memcg_data = folio->memcg_data;

	/* Each of the (old_nr / new_nr - 1) new folios needs its own reference. */
	if (folio_memcg_kmem(folio))
		obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
	else
		css_get_many(&memcg->css, old_nr / new_nr - 1);
}

/*
 * Return @memcg's memory (or memory+swap) usage in pages. The root cgroup
 * has no counters of its own and is approximated from global state.
 */
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val;

	if (mem_cgroup_is_root(memcg)) {
		/*
		 * Approximate root's usage from global state. This isn't
		 * perfect, but the root usage was always an approximation.
		 */
		val = global_node_page_state(NR_FILE_PAGES) +
			global_node_page_state(NR_ANON_MAPPED);
		if (swap)
			val += total_swap_pages - get_nr_swap_pages();
	} else {
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}

/*
 * Set up kernel memory accounting for a newly onlined @memcg: allocate its
 * obj_cgroup and enable the kmem static branch. Returns 0 or -ENOMEM.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
	struct obj_cgroup *objcg;

	if (mem_cgroup_kmem_disabled())
		return 0;

	if (unlikely(mem_cgroup_is_root(memcg)))
		return 0;

	objcg = obj_cgroup_alloc();
	if (!objcg)
		return -ENOMEM;

	objcg->memcg = memcg;
	rcu_assign_pointer(memcg->objcg, objcg);
	/* Keep an extra reference so the original objcg outlives reparenting. */
	obj_cgroup_get(objcg);
	memcg->orig_objcg = objcg;

	static_branch_enable(&memcg_kmem_online_key);

	memcg->kmemcg_id = memcg->id.id;

	return 0;
}

static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;

	if (mem_cgroup_kmem_disabled())
		return;

	if
(unlikely(mem_cgroup_is_root(memcg))) 3111 return; 3112 3113 parent = parent_mem_cgroup(memcg); 3114 if (!parent) 3115 parent = root_mem_cgroup; 3116 3117 memcg_reparent_objcgs(memcg, parent); 3118 3119 /* 3120 * After we have finished memcg_reparent_objcgs(), all list_lrus 3121 * corresponding to this cgroup are guaranteed to remain empty. 3122 * The ordering is imposed by list_lru_node->lock taken by 3123 * memcg_reparent_list_lrus(). 3124 */ 3125 memcg_reparent_list_lrus(memcg, parent); 3126 } 3127 3128 #ifdef CONFIG_CGROUP_WRITEBACK 3129 3130 #include <trace/events/writeback.h> 3131 3132 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 3133 { 3134 return wb_domain_init(&memcg->cgwb_domain, gfp); 3135 } 3136 3137 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 3138 { 3139 wb_domain_exit(&memcg->cgwb_domain); 3140 } 3141 3142 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 3143 { 3144 wb_domain_size_changed(&memcg->cgwb_domain); 3145 } 3146 3147 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 3148 { 3149 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3150 3151 if (!memcg->css.parent) 3152 return NULL; 3153 3154 return &memcg->cgwb_domain; 3155 } 3156 3157 /** 3158 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 3159 * @wb: bdi_writeback in question 3160 * @pfilepages: out parameter for number of file pages 3161 * @pheadroom: out parameter for number of allocatable pages according to memcg 3162 * @pdirty: out parameter for number of dirty pages 3163 * @pwriteback: out parameter for number of pages under writeback 3164 * 3165 * Determine the numbers of file, headroom, dirty, and writeback pages in 3166 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 3167 * is a bit more involved. 3168 * 3169 * A memcg's headroom is "min(max, high) - used". 
In the hierarchy, the 3170 * headroom is calculated as the lowest headroom of itself and the 3171 * ancestors. Note that this doesn't consider the actual amount of 3172 * available memory in the system. The caller should further cap 3173 * *@pheadroom accordingly. 3174 */ 3175 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 3176 unsigned long *pheadroom, unsigned long *pdirty, 3177 unsigned long *pwriteback) 3178 { 3179 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 3180 struct mem_cgroup *parent; 3181 3182 mem_cgroup_flush_stats_ratelimited(memcg); 3183 3184 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 3185 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 3186 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 3187 memcg_page_state(memcg, NR_ACTIVE_FILE); 3188 3189 *pheadroom = PAGE_COUNTER_MAX; 3190 while ((parent = parent_mem_cgroup(memcg))) { 3191 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 3192 READ_ONCE(memcg->memory.high)); 3193 unsigned long used = page_counter_read(&memcg->memory); 3194 3195 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 3196 memcg = parent; 3197 } 3198 } 3199 3200 /* 3201 * Foreign dirty flushing 3202 * 3203 * There's an inherent mismatch between memcg and writeback. The former 3204 * tracks ownership per-page while the latter per-inode. This was a 3205 * deliberate design decision because honoring per-page ownership in the 3206 * writeback path is complicated, may lead to higher CPU and IO overheads 3207 * and deemed unnecessary given that write-sharing an inode across 3208 * different cgroups isn't a common use-case. 3209 * 3210 * Combined with inode majority-writer ownership switching, this works well 3211 * enough in most cases but there are some pathological cases. For 3212 * example, let's say there are two cgroups A and B which keep writing to 3213 * different but confined parts of the same inode. 
B owns the inode and 3214 * A's memory is limited far below B's. A's dirty ratio can rise enough to 3215 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 3216 * triggering background writeback. A will be slowed down without a way to 3217 * make writeback of the dirty pages happen. 3218 * 3219 * Conditions like the above can lead to a cgroup getting repeatedly and 3220 * severely throttled after making some progress after each 3221 * dirty_expire_interval while the underlying IO device is almost 3222 * completely idle. 3223 * 3224 * Solving this problem completely requires matching the ownership tracking 3225 * granularities between memcg and writeback in either direction. However, 3226 * the more egregious behaviors can be avoided by simply remembering the 3227 * most recent foreign dirtying events and initiating remote flushes on 3228 * them when local writeback isn't enough to keep the memory clean enough. 3229 * 3230 * The following two functions implement such mechanism. When a foreign 3231 * page - a page whose memcg and writeback ownerships don't match - is 3232 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 3233 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 3234 * decides that the memcg needs to sleep due to high dirty ratio, it calls 3235 * mem_cgroup_flush_foreign() which queues writeback on the recorded 3236 * foreign bdi_writebacks which haven't expired. Both the numbers of 3237 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 3238 * limited to MEMCG_CGWB_FRN_CNT. 3239 * 3240 * The mechanism only remembers IDs and doesn't hold any object references. 3241 * As being wrong occasionally doesn't matter, updates and accesses to the 3242 * records are lockless and racy. 
 */
/*
 * Record that a page owned by @folio's memcg was dirtied through a foreign
 * (non-matching) bdi_writeback @wb.  Called from the dirtying path only
 * when the ownerships mismatch; keeps at most MEMCG_CGWB_FRN_CNT records.
 */
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
					     struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct memcg_cgwb_frn *frn;
	u64 now = get_jiffies_64();
	u64 oldest_at = now;
	int oldest = -1;
	int i;

	trace_track_foreign_dirty(folio, wb);

	/*
	 * Pick the slot to use. If there is already a slot for @wb, keep
	 * using it. If not replace the oldest one which isn't being
	 * written out.  done.cnt == 1 indicates no flush in flight for
	 * the record (see mem_cgroup_flush_foreign() below).
	 */
	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
		frn = &memcg->cgwb_frn[i];
		if (frn->bdi_id == wb->bdi->id &&
		    frn->memcg_id == wb->memcg_css->id)
			break;
		if (time_before64(frn->at, oldest_at) &&
		    atomic_read(&frn->done.cnt) == 1) {
			oldest = i;
			oldest_at = frn->at;
		}
	}

	if (i < MEMCG_CGWB_FRN_CNT) {
		/*
		 * Re-using an existing one. Update timestamp lazily to
		 * avoid making the cacheline hot. We want them to be
		 * reasonably up-to-date and significantly shorter than
		 * dirty_expire_interval as that's what expires the record.
		 * Use the shorter of 1s and dirty_expire_interval / 8.
		 * (dirty_expire_interval is in centisecs, hence the * 10
		 * before converting msecs to jiffies.)
		 */
		unsigned long update_intv =
			min_t(unsigned long, HZ,
			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);

		if (time_before64(frn->at, now - update_intv))
			frn->at = now;
	} else if (oldest >= 0) {
		/* replace the oldest free one */
		frn = &memcg->cgwb_frn[oldest];
		frn->bdi_id = wb->bdi->id;
		frn->memcg_id = wb->memcg_css->id;
		frn->at = now;
	}
	/* else: all slots busy flushing; drop this event (best-effort by design) */
}

/* issue foreign writeback flushes for recorded foreign dirtying events */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
	/* plain read of jiffies_64 - racy but precision doesn't matter here */
	u64 now = jiffies_64;
	int i;

	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];

		/*
		 * If the record is older than dirty_expire_interval,
		 * writeback on it has already started. No need to kick it
		 * off again. Also, don't start a new one if there's
		 * already one in flight.
		 */
		if (time_after64(frn->at, now - intv) &&
		    atomic_read(&frn->done.cnt) == 1) {
			/* zeroing @at consumes the record; slot becomes reuse candidate */
			frn->at = 0;
			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
					       WB_REASON_FOREIGN_FLUSH,
					       &frn->done);
		}
	}
}

#else	/* CONFIG_CGROUP_WRITEBACK */

/* No-op stubs when cgroup writeback is not configured. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
	return 0;
}

static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}

static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID. We want to keep
 * those dead CSS from occupying IDs, or we might quickly exhaust the
 * relatively small ID space and prevent the creation of new cgroups
 * even when there are much fewer than 64k cgroups - possibly none.
 *
 * Maintain a private 16-bit ID space for memcg, and allow the ID to
 * be freed and recycled when it's no longer needed, which is usually
 * when the CSS is offlined.
 *
 * The only exception to that are records of swapped out tmpfs/shmem
 * pages that need to be attributed to live ancestors on swapin. But
 * those references are manageable from userspace.
3363 */ 3364 3365 #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) 3366 static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); 3367 3368 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 3369 { 3370 if (memcg->id.id > 0) { 3371 xa_erase(&mem_cgroup_ids, memcg->id.id); 3372 memcg->id.id = 0; 3373 } 3374 } 3375 3376 void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 3377 unsigned int n) 3378 { 3379 refcount_add(n, &memcg->id.ref); 3380 } 3381 3382 void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 3383 { 3384 if (refcount_sub_and_test(n, &memcg->id.ref)) { 3385 mem_cgroup_id_remove(memcg); 3386 3387 /* Memcg ID pins CSS */ 3388 css_put(&memcg->css); 3389 } 3390 } 3391 3392 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 3393 { 3394 mem_cgroup_id_put_many(memcg, 1); 3395 } 3396 3397 /** 3398 * mem_cgroup_from_id - look up a memcg from a memcg id 3399 * @id: the memcg id to look up 3400 * 3401 * Caller must hold rcu_read_lock(). 
3402 */ 3403 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 3404 { 3405 WARN_ON_ONCE(!rcu_read_lock_held()); 3406 return xa_load(&mem_cgroup_ids, id); 3407 } 3408 3409 #ifdef CONFIG_SHRINKER_DEBUG 3410 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) 3411 { 3412 struct cgroup *cgrp; 3413 struct cgroup_subsys_state *css; 3414 struct mem_cgroup *memcg; 3415 3416 cgrp = cgroup_get_from_id(ino); 3417 if (IS_ERR(cgrp)) 3418 return ERR_CAST(cgrp); 3419 3420 css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); 3421 if (css) 3422 memcg = container_of(css, struct mem_cgroup, css); 3423 else 3424 memcg = ERR_PTR(-ENOENT); 3425 3426 cgroup_put(cgrp); 3427 3428 return memcg; 3429 } 3430 #endif 3431 3432 static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 3433 { 3434 struct mem_cgroup_per_node *pn; 3435 3436 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); 3437 if (!pn) 3438 return false; 3439 3440 pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), 3441 GFP_KERNEL_ACCOUNT, node); 3442 if (!pn->lruvec_stats) 3443 goto fail; 3444 3445 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 3446 GFP_KERNEL_ACCOUNT); 3447 if (!pn->lruvec_stats_percpu) 3448 goto fail; 3449 3450 lruvec_init(&pn->lruvec); 3451 pn->memcg = memcg; 3452 3453 memcg->nodeinfo[node] = pn; 3454 return true; 3455 fail: 3456 kfree(pn->lruvec_stats); 3457 kfree(pn); 3458 return false; 3459 } 3460 3461 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 3462 { 3463 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 3464 3465 if (!pn) 3466 return; 3467 3468 free_percpu(pn->lruvec_stats_percpu); 3469 kfree(pn->lruvec_stats); 3470 kfree(pn); 3471 } 3472 3473 static void __mem_cgroup_free(struct mem_cgroup *memcg) 3474 { 3475 int node; 3476 3477 obj_cgroup_put(memcg->orig_objcg); 3478 3479 for_each_node(node) 3480 free_mem_cgroup_per_node_info(memcg, node); 3481 kfree(memcg->vmstats); 3482 
free_percpu(memcg->vmstats_percpu); 3483 kfree(memcg); 3484 } 3485 3486 static void mem_cgroup_free(struct mem_cgroup *memcg) 3487 { 3488 lru_gen_exit_memcg(memcg); 3489 memcg_wb_domain_exit(memcg); 3490 __mem_cgroup_free(memcg); 3491 } 3492 3493 static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) 3494 { 3495 struct memcg_vmstats_percpu *statc, *pstatc; 3496 struct mem_cgroup *memcg; 3497 int node, cpu; 3498 int __maybe_unused i; 3499 long error; 3500 3501 memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); 3502 if (!memcg) 3503 return ERR_PTR(-ENOMEM); 3504 3505 error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, 3506 XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); 3507 if (error) 3508 goto fail; 3509 error = -ENOMEM; 3510 3511 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), 3512 GFP_KERNEL_ACCOUNT); 3513 if (!memcg->vmstats) 3514 goto fail; 3515 3516 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 3517 GFP_KERNEL_ACCOUNT); 3518 if (!memcg->vmstats_percpu) 3519 goto fail; 3520 3521 for_each_possible_cpu(cpu) { 3522 if (parent) 3523 pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); 3524 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 3525 statc->parent = parent ? 
pstatc : NULL; 3526 statc->vmstats = memcg->vmstats; 3527 } 3528 3529 for_each_node(node) 3530 if (!alloc_mem_cgroup_per_node_info(memcg, node)) 3531 goto fail; 3532 3533 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 3534 goto fail; 3535 3536 INIT_WORK(&memcg->high_work, high_work_func); 3537 vmpressure_init(&memcg->vmpressure); 3538 INIT_LIST_HEAD(&memcg->memory_peaks); 3539 INIT_LIST_HEAD(&memcg->swap_peaks); 3540 spin_lock_init(&memcg->peaks_lock); 3541 memcg->socket_pressure = jiffies; 3542 memcg1_memcg_init(memcg); 3543 memcg->kmemcg_id = -1; 3544 INIT_LIST_HEAD(&memcg->objcg_list); 3545 #ifdef CONFIG_CGROUP_WRITEBACK 3546 INIT_LIST_HEAD(&memcg->cgwb_list); 3547 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 3548 memcg->cgwb_frn[i].done = 3549 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 3550 #endif 3551 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3552 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 3553 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 3554 memcg->deferred_split_queue.split_queue_len = 0; 3555 #endif 3556 lru_gen_init_memcg(memcg); 3557 return memcg; 3558 fail: 3559 mem_cgroup_id_remove(memcg); 3560 __mem_cgroup_free(memcg); 3561 return ERR_PTR(error); 3562 } 3563 3564 static struct cgroup_subsys_state * __ref 3565 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 3566 { 3567 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 3568 struct mem_cgroup *memcg, *old_memcg; 3569 3570 old_memcg = set_active_memcg(parent); 3571 memcg = mem_cgroup_alloc(parent); 3572 set_active_memcg(old_memcg); 3573 if (IS_ERR(memcg)) 3574 return ERR_CAST(memcg); 3575 3576 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 3577 memcg1_soft_limit_reset(memcg); 3578 #ifdef CONFIG_ZSWAP 3579 memcg->zswap_max = PAGE_COUNTER_MAX; 3580 WRITE_ONCE(memcg->zswap_writeback, 3581 !parent || READ_ONCE(parent->zswap_writeback)); 3582 #endif 3583 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 3584 if (parent) { 3585 
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); 3586 3587 page_counter_init(&memcg->memory, &parent->memory, true); 3588 page_counter_init(&memcg->swap, &parent->swap, false); 3589 #ifdef CONFIG_MEMCG_V1 3590 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); 3591 page_counter_init(&memcg->kmem, &parent->kmem, false); 3592 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); 3593 #endif 3594 } else { 3595 init_memcg_stats(); 3596 init_memcg_events(); 3597 page_counter_init(&memcg->memory, NULL, true); 3598 page_counter_init(&memcg->swap, NULL, false); 3599 #ifdef CONFIG_MEMCG_V1 3600 page_counter_init(&memcg->kmem, NULL, false); 3601 page_counter_init(&memcg->tcpmem, NULL, false); 3602 #endif 3603 root_mem_cgroup = memcg; 3604 return &memcg->css; 3605 } 3606 3607 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 3608 static_branch_inc(&memcg_sockets_enabled_key); 3609 3610 if (!cgroup_memory_nobpf) 3611 static_branch_inc(&memcg_bpf_enabled_key); 3612 3613 return &memcg->css; 3614 } 3615 3616 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 3617 { 3618 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3619 3620 if (memcg_online_kmem(memcg)) 3621 goto remove_id; 3622 3623 /* 3624 * A memcg must be visible for expand_shrinker_info() 3625 * by the time the maps are allocated. So, we allocate maps 3626 * here, when for_each_mem_cgroup() can't skip it. 3627 */ 3628 if (alloc_shrinker_info(memcg)) 3629 goto offline_kmem; 3630 3631 if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) 3632 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 3633 FLUSH_TIME); 3634 lru_gen_online_memcg(memcg); 3635 3636 /* Online state pins memcg ID, memcg ID pins CSS */ 3637 refcount_set(&memcg->id.ref, 1); 3638 css_get(css); 3639 3640 /* 3641 * Ensure mem_cgroup_from_id() works once we're fully online. 
3642 * 3643 * We could do this earlier and require callers to filter with 3644 * css_tryget_online(). But right now there are no users that 3645 * need earlier access, and the workingset code relies on the 3646 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So 3647 * publish it here at the end of onlining. This matches the 3648 * regular ID destruction during offlining. 3649 */ 3650 xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); 3651 3652 return 0; 3653 offline_kmem: 3654 memcg_offline_kmem(memcg); 3655 remove_id: 3656 mem_cgroup_id_remove(memcg); 3657 return -ENOMEM; 3658 } 3659 3660 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 3661 { 3662 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3663 3664 memcg1_css_offline(memcg); 3665 3666 page_counter_set_min(&memcg->memory, 0); 3667 page_counter_set_low(&memcg->memory, 0); 3668 3669 zswap_memcg_offline_cleanup(memcg); 3670 3671 memcg_offline_kmem(memcg); 3672 reparent_shrinker_deferred(memcg); 3673 wb_memcg_offline(memcg); 3674 lru_gen_offline_memcg(memcg); 3675 3676 drain_all_stock(memcg); 3677 3678 mem_cgroup_id_put(memcg); 3679 } 3680 3681 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 3682 { 3683 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3684 3685 invalidate_reclaim_iterators(memcg); 3686 lru_gen_release_memcg(memcg); 3687 } 3688 3689 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 3690 { 3691 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3692 int __maybe_unused i; 3693 3694 #ifdef CONFIG_CGROUP_WRITEBACK 3695 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 3696 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 3697 #endif 3698 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 3699 static_branch_dec(&memcg_sockets_enabled_key); 3700 3701 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg)) 3702 static_branch_dec(&memcg_sockets_enabled_key); 3703 3704 if 
(!cgroup_memory_nobpf) 3705 static_branch_dec(&memcg_bpf_enabled_key); 3706 3707 vmpressure_cleanup(&memcg->vmpressure); 3708 cancel_work_sync(&memcg->high_work); 3709 memcg1_remove_from_trees(memcg); 3710 free_shrinker_info(memcg); 3711 mem_cgroup_free(memcg); 3712 } 3713 3714 /** 3715 * mem_cgroup_css_reset - reset the states of a mem_cgroup 3716 * @css: the target css 3717 * 3718 * Reset the states of the mem_cgroup associated with @css. This is 3719 * invoked when the userland requests disabling on the default hierarchy 3720 * but the memcg is pinned through dependency. The memcg should stop 3721 * applying policies and should revert to the vanilla state as it may be 3722 * made visible again. 3723 * 3724 * The current implementation only resets the essential configurations. 3725 * This needs to be expanded to cover all the visible parts. 3726 */ 3727 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 3728 { 3729 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3730 3731 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 3732 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 3733 #ifdef CONFIG_MEMCG_V1 3734 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 3735 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 3736 #endif 3737 page_counter_set_min(&memcg->memory, 0); 3738 page_counter_set_low(&memcg->memory, 0); 3739 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 3740 memcg1_soft_limit_reset(memcg); 3741 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 3742 memcg_wb_domain_size_changed(memcg); 3743 } 3744 3745 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 3746 { 3747 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3748 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 3749 struct memcg_vmstats_percpu *statc; 3750 long delta, delta_cpu, v; 3751 int i, nid; 3752 3753 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 3754 3755 for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) { 
3756 /* 3757 * Collect the aggregated propagation counts of groups 3758 * below us. We're in a per-cpu loop here and this is 3759 * a global counter, so the first cycle will get them. 3760 */ 3761 delta = memcg->vmstats->state_pending[i]; 3762 if (delta) 3763 memcg->vmstats->state_pending[i] = 0; 3764 3765 /* Add CPU changes on this level since the last flush */ 3766 delta_cpu = 0; 3767 v = READ_ONCE(statc->state[i]); 3768 if (v != statc->state_prev[i]) { 3769 delta_cpu = v - statc->state_prev[i]; 3770 delta += delta_cpu; 3771 statc->state_prev[i] = v; 3772 } 3773 3774 /* Aggregate counts on this level and propagate upwards */ 3775 if (delta_cpu) 3776 memcg->vmstats->state_local[i] += delta_cpu; 3777 3778 if (delta) { 3779 memcg->vmstats->state[i] += delta; 3780 if (parent) 3781 parent->vmstats->state_pending[i] += delta; 3782 } 3783 } 3784 3785 for (i = 0; i < NR_MEMCG_EVENTS; i++) { 3786 delta = memcg->vmstats->events_pending[i]; 3787 if (delta) 3788 memcg->vmstats->events_pending[i] = 0; 3789 3790 delta_cpu = 0; 3791 v = READ_ONCE(statc->events[i]); 3792 if (v != statc->events_prev[i]) { 3793 delta_cpu = v - statc->events_prev[i]; 3794 delta += delta_cpu; 3795 statc->events_prev[i] = v; 3796 } 3797 3798 if (delta_cpu) 3799 memcg->vmstats->events_local[i] += delta_cpu; 3800 3801 if (delta) { 3802 memcg->vmstats->events[i] += delta; 3803 if (parent) 3804 parent->vmstats->events_pending[i] += delta; 3805 } 3806 } 3807 3808 for_each_node_state(nid, N_MEMORY) { 3809 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 3810 struct lruvec_stats *lstats = pn->lruvec_stats; 3811 struct lruvec_stats *plstats = NULL; 3812 struct lruvec_stats_percpu *lstatc; 3813 3814 if (parent) 3815 plstats = parent->nodeinfo[nid]->lruvec_stats; 3816 3817 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 3818 3819 for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) { 3820 delta = lstats->state_pending[i]; 3821 if (delta) 3822 lstats->state_pending[i] = 0; 3823 3824 delta_cpu = 0; 3825 v = 
READ_ONCE(lstatc->state[i]); 3826 if (v != lstatc->state_prev[i]) { 3827 delta_cpu = v - lstatc->state_prev[i]; 3828 delta += delta_cpu; 3829 lstatc->state_prev[i] = v; 3830 } 3831 3832 if (delta_cpu) 3833 lstats->state_local[i] += delta_cpu; 3834 3835 if (delta) { 3836 lstats->state[i] += delta; 3837 if (plstats) 3838 plstats->state_pending[i] += delta; 3839 } 3840 } 3841 } 3842 WRITE_ONCE(statc->stats_updates, 0); 3843 /* We are in a per-cpu loop here, only do the atomic write once */ 3844 if (atomic64_read(&memcg->vmstats->stats_updates)) 3845 atomic64_set(&memcg->vmstats->stats_updates, 0); 3846 } 3847 3848 static void mem_cgroup_fork(struct task_struct *task) 3849 { 3850 /* 3851 * Set the update flag to cause task->objcg to be initialized lazily 3852 * on the first allocation. It can be done without any synchronization 3853 * because it's always performed on the current task, so does 3854 * current_objcg_update(). 3855 */ 3856 task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG; 3857 } 3858 3859 static void mem_cgroup_exit(struct task_struct *task) 3860 { 3861 struct obj_cgroup *objcg = task->objcg; 3862 3863 objcg = (struct obj_cgroup *) 3864 ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); 3865 obj_cgroup_put(objcg); 3866 3867 /* 3868 * Some kernel allocations can happen after this point, 3869 * but let's ignore them. It can be done without any synchronization 3870 * because it's always performed on the current task, so does 3871 * current_objcg_update(). 
3872 */ 3873 task->objcg = NULL; 3874 } 3875 3876 #ifdef CONFIG_LRU_GEN 3877 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) 3878 { 3879 struct task_struct *task; 3880 struct cgroup_subsys_state *css; 3881 3882 /* find the first leader if there is any */ 3883 cgroup_taskset_for_each_leader(task, css, tset) 3884 break; 3885 3886 if (!task) 3887 return; 3888 3889 task_lock(task); 3890 if (task->mm && READ_ONCE(task->mm->owner) == task) 3891 lru_gen_migrate_mm(task->mm); 3892 task_unlock(task); 3893 } 3894 #else 3895 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} 3896 #endif /* CONFIG_LRU_GEN */ 3897 3898 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) 3899 { 3900 struct task_struct *task; 3901 struct cgroup_subsys_state *css; 3902 3903 cgroup_taskset_for_each(task, css, tset) { 3904 /* atomically set the update bit */ 3905 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); 3906 } 3907 } 3908 3909 static void mem_cgroup_attach(struct cgroup_taskset *tset) 3910 { 3911 mem_cgroup_lru_gen_attach(tset); 3912 mem_cgroup_kmem_attach(tset); 3913 } 3914 3915 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 3916 { 3917 if (value == PAGE_COUNTER_MAX) 3918 seq_puts(m, "max\n"); 3919 else 3920 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 3921 3922 return 0; 3923 } 3924 3925 static u64 memory_current_read(struct cgroup_subsys_state *css, 3926 struct cftype *cft) 3927 { 3928 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3929 3930 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 3931 } 3932 3933 #define OFP_PEAK_UNSET (((-1UL))) 3934 3935 static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) 3936 { 3937 struct cgroup_of_peak *ofp = of_peak(sf->private); 3938 u64 fd_peak = READ_ONCE(ofp->value), peak; 3939 3940 /* User wants global or local peak? 
*/ 3941 if (fd_peak == OFP_PEAK_UNSET) 3942 peak = pc->watermark; 3943 else 3944 peak = max(fd_peak, READ_ONCE(pc->local_watermark)); 3945 3946 seq_printf(sf, "%llu\n", peak * PAGE_SIZE); 3947 return 0; 3948 } 3949 3950 static int memory_peak_show(struct seq_file *sf, void *v) 3951 { 3952 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3953 3954 return peak_show(sf, v, &memcg->memory); 3955 } 3956 3957 static int peak_open(struct kernfs_open_file *of) 3958 { 3959 struct cgroup_of_peak *ofp = of_peak(of); 3960 3961 ofp->value = OFP_PEAK_UNSET; 3962 return 0; 3963 } 3964 3965 static void peak_release(struct kernfs_open_file *of) 3966 { 3967 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3968 struct cgroup_of_peak *ofp = of_peak(of); 3969 3970 if (ofp->value == OFP_PEAK_UNSET) { 3971 /* fast path (no writes on this fd) */ 3972 return; 3973 } 3974 spin_lock(&memcg->peaks_lock); 3975 list_del(&ofp->list); 3976 spin_unlock(&memcg->peaks_lock); 3977 } 3978 3979 static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, 3980 loff_t off, struct page_counter *pc, 3981 struct list_head *watchers) 3982 { 3983 unsigned long usage; 3984 struct cgroup_of_peak *peer_ctx; 3985 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3986 struct cgroup_of_peak *ofp = of_peak(of); 3987 3988 spin_lock(&memcg->peaks_lock); 3989 3990 usage = page_counter_read(pc); 3991 WRITE_ONCE(pc->local_watermark, usage); 3992 3993 list_for_each_entry(peer_ctx, watchers, list) 3994 if (usage > peer_ctx->value) 3995 WRITE_ONCE(peer_ctx->value, usage); 3996 3997 /* initial write, register watcher */ 3998 if (ofp->value == -1) 3999 list_add(&ofp->list, watchers); 4000 4001 WRITE_ONCE(ofp->value, usage); 4002 spin_unlock(&memcg->peaks_lock); 4003 4004 return nbytes; 4005 } 4006 4007 static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf, 4008 size_t nbytes, loff_t off) 4009 { 4010 struct mem_cgroup *memcg = 
mem_cgroup_from_css(of_css(of)); 4011 4012 return peak_write(of, buf, nbytes, off, &memcg->memory, 4013 &memcg->memory_peaks); 4014 } 4015 4016 #undef OFP_PEAK_UNSET 4017 4018 static int memory_min_show(struct seq_file *m, void *v) 4019 { 4020 return seq_puts_memcg_tunable(m, 4021 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 4022 } 4023 4024 static ssize_t memory_min_write(struct kernfs_open_file *of, 4025 char *buf, size_t nbytes, loff_t off) 4026 { 4027 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4028 unsigned long min; 4029 int err; 4030 4031 buf = strstrip(buf); 4032 err = page_counter_memparse(buf, "max", &min); 4033 if (err) 4034 return err; 4035 4036 page_counter_set_min(&memcg->memory, min); 4037 4038 return nbytes; 4039 } 4040 4041 static int memory_low_show(struct seq_file *m, void *v) 4042 { 4043 return seq_puts_memcg_tunable(m, 4044 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 4045 } 4046 4047 static ssize_t memory_low_write(struct kernfs_open_file *of, 4048 char *buf, size_t nbytes, loff_t off) 4049 { 4050 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4051 unsigned long low; 4052 int err; 4053 4054 buf = strstrip(buf); 4055 err = page_counter_memparse(buf, "max", &low); 4056 if (err) 4057 return err; 4058 4059 page_counter_set_low(&memcg->memory, low); 4060 4061 return nbytes; 4062 } 4063 4064 static int memory_high_show(struct seq_file *m, void *v) 4065 { 4066 return seq_puts_memcg_tunable(m, 4067 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 4068 } 4069 4070 static ssize_t memory_high_write(struct kernfs_open_file *of, 4071 char *buf, size_t nbytes, loff_t off) 4072 { 4073 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4074 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 4075 bool drained = false; 4076 unsigned long high; 4077 int err; 4078 4079 buf = strstrip(buf); 4080 err = page_counter_memparse(buf, "max", &high); 4081 if (err) 4082 return err; 4083 4084 page_counter_set_high(&memcg->memory, 
high); 4085 4086 for (;;) { 4087 unsigned long nr_pages = page_counter_read(&memcg->memory); 4088 unsigned long reclaimed; 4089 4090 if (nr_pages <= high) 4091 break; 4092 4093 if (signal_pending(current)) 4094 break; 4095 4096 if (!drained) { 4097 drain_all_stock(memcg); 4098 drained = true; 4099 continue; 4100 } 4101 4102 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 4103 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL); 4104 4105 if (!reclaimed && !nr_retries--) 4106 break; 4107 } 4108 4109 memcg_wb_domain_size_changed(memcg); 4110 return nbytes; 4111 } 4112 4113 static int memory_max_show(struct seq_file *m, void *v) 4114 { 4115 return seq_puts_memcg_tunable(m, 4116 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 4117 } 4118 4119 static ssize_t memory_max_write(struct kernfs_open_file *of, 4120 char *buf, size_t nbytes, loff_t off) 4121 { 4122 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4123 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 4124 bool drained = false; 4125 unsigned long max; 4126 int err; 4127 4128 buf = strstrip(buf); 4129 err = page_counter_memparse(buf, "max", &max); 4130 if (err) 4131 return err; 4132 4133 xchg(&memcg->memory.max, max); 4134 4135 for (;;) { 4136 unsigned long nr_pages = page_counter_read(&memcg->memory); 4137 4138 if (nr_pages <= max) 4139 break; 4140 4141 if (signal_pending(current)) 4142 break; 4143 4144 if (!drained) { 4145 drain_all_stock(memcg); 4146 drained = true; 4147 continue; 4148 } 4149 4150 if (nr_reclaims) { 4151 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 4152 GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL)) 4153 nr_reclaims--; 4154 continue; 4155 } 4156 4157 memcg_memory_event(memcg, MEMCG_OOM); 4158 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 4159 break; 4160 } 4161 4162 memcg_wb_domain_size_changed(memcg); 4163 return nbytes; 4164 } 4165 4166 /* 4167 * Note: don't forget to update the 'samples/cgroup/memcg_event_listener' 4168 * if any new events become available. 
 */
/* Print one memory.events-style snapshot from the given event array. */
static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
{
	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
	seq_printf(m, "oom_kill %lu\n",
		   atomic_long_read(&events[MEMCG_OOM_KILL]));
	seq_printf(m, "oom_group_kill %lu\n",
		   atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
}

/* memory.events: hierarchical event counts */
static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events);
	return 0;
}

/* memory.events.local: events that happened in this cgroup only */
static int memory_events_local_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events_local);
	return 0;
}

/* memory.stat: format the stats into a one-page scratch buffer and emit it */
int memory_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	struct seq_buf s;

	if (!buf)
		return -ENOMEM;
	seq_buf_init(&s, buf, PAGE_SIZE);
	memory_stat_format(memcg, &s);
	seq_puts(m, buf);
	kfree(buf);
	return 0;
}

#ifdef CONFIG_NUMA
/* Scale a per-lruvec stat item to bytes (or its natural output unit). */
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
						     int item)
{
	return lruvec_page_state(lruvec, item) *
		memcg_page_state_output_unit(item);
}

/* memory.numa_stat: per-node breakdown of the node-level stat items */
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
	int i;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		int nid;

		/* only node-level stat items have a per-node breakdown */
		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
			continue;

		seq_printf(m, "%s", memory_stats[i].name);
		for_each_node_state(nid, N_MEMORY) {
			u64 size;
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
			size = lruvec_page_state_output(lruvec,
							memory_stats[i].idx);
			seq_printf(m, " N%d=%llu", nid, size);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif

/* memory.oom.group: whether the whole cgroup is killed as a unit on OOM */
static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));

	return 0;
}

static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, oom_group;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	ret = kstrtoint(buf, 0, &oom_group);
	if (ret)
		return ret;

	/* only boolean 0/1 accepted */
	if (oom_group != 0 && oom_group != 1)
		return -EINVAL;

	WRITE_ONCE(memcg->oom_group, oom_group);

	return nbytes;
}

/* Option tokens accepted after the byte count in memory.reclaim writes. */
enum {
	MEMORY_RECLAIM_SWAPPINESS = 0,
	MEMORY_RECLAIM_NULL,
};

static const match_table_t tokens = {
	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
	{ MEMORY_RECLAIM_NULL, NULL },
};

/*
 * memory.reclaim: proactively reclaim the requested number of bytes.
 * Input: "<size> [swappiness=<n>]".  Returns -EINTR on signal, -EAGAIN
 * when retries are exhausted without progress; partial progress made
 * before an error is not reported to the writer.
 */
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	unsigned long nr_to_reclaim, nr_reclaimed = 0;
	int swappiness = -1;	/* -1 = use the cgroup's default swappiness */
	unsigned int reclaim_options;
	char *old_buf, *start;
	substring_t args[MAX_OPT_ARGS];

	buf = strstrip(buf);

	old_buf = buf;
	nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
	if (buf == old_buf)
		return -EINVAL;

	buf = strstrip(buf);

	while ((start = strsep(&buf, " ")) != NULL) {
		if (!strlen(start))
			continue;
		switch (match_token(start, tokens, args)) {
		case MEMORY_RECLAIM_SWAPPINESS:
			if (match_int(&args[0], &swappiness))
				return -EINVAL;
			if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
	}

	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
	while (nr_reclaimed < nr_to_reclaim) {
		/* Will converge on zero, but reclaim enforces a minimum */
		unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
		unsigned long reclaimed;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * This is the final attempt, drain percpu lru caches in the
		 * hope of introducing more evictable pages for
		 * try_to_free_mem_cgroup_pages().
		 */
		if (!nr_retries)
			lru_add_drain_all();

		reclaimed = try_to_free_mem_cgroup_pages(memcg,
					batch_size, GFP_KERNEL,
					reclaim_options,
					swappiness == -1 ? NULL : &swappiness);

		if (!reclaimed && !nr_retries--)
			return -EAGAIN;

		nr_reclaimed += reclaimed;
	}

	return nbytes;
}

/* cgroup v2 interface files for the memory controller */
static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "peak",
		.flags = CFTYPE_NOT_ON_ROOT,
		.open = peak_open,
		.release = peak_release,
		.seq_show = memory_peak_show,
		.write = memory_peak_write,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "events.local",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_local_file),
		.seq_show = memory_events_local_show,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memory_numa_stat_show,
	},
#endif
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{
		.name = "reclaim",
		.flags = CFTYPE_NS_DELEGATABLE,
		.write = memory_reclaim,
	},
	{ }	/* terminate */
};

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.css_rstat_flush = mem_cgroup_css_rstat_flush,
	.attach = mem_cgroup_attach,
	.fork = mem_cgroup_fork,
	.exit = mem_cgroup_exit,
	.dfl_cftypes = memory_files,
#ifdef CONFIG_MEMCG_V1
	.can_attach = memcg1_can_attach,
	.cancel_attach = memcg1_cancel_attach,
	.post_attach = memcg1_move_task,
	.legacy_cftypes = mem_cgroup_legacy_files,
#endif
	.early_init = 0,
};

/**
 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @memcg: the memory cgroup to check
 *
 * WARNING: This function is not stateless! It can only be used as part
 * of a top-down tree iteration, not for isolated queries.
4460 */ 4461 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 4462 struct mem_cgroup *memcg) 4463 { 4464 bool recursive_protection = 4465 cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT; 4466 4467 if (mem_cgroup_disabled()) 4468 return; 4469 4470 if (!root) 4471 root = root_mem_cgroup; 4472 4473 page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection); 4474 } 4475 4476 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, 4477 gfp_t gfp) 4478 { 4479 int ret; 4480 4481 ret = try_charge(memcg, gfp, folio_nr_pages(folio)); 4482 if (ret) 4483 goto out; 4484 4485 mem_cgroup_commit_charge(folio, memcg); 4486 out: 4487 return ret; 4488 } 4489 4490 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) 4491 { 4492 struct mem_cgroup *memcg; 4493 int ret; 4494 4495 memcg = get_mem_cgroup_from_mm(mm); 4496 ret = charge_memcg(folio, memcg, gfp); 4497 css_put(&memcg->css); 4498 4499 return ret; 4500 } 4501 4502 /** 4503 * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio 4504 * @memcg: memcg to charge. 4505 * @gfp: reclaim mode. 4506 * @nr_pages: number of pages to charge. 4507 * 4508 * This function is called when allocating a huge page folio to determine if 4509 * the memcg has the capacity for it. It does not commit the charge yet, 4510 * as the hugetlb folio itself has not been obtained from the hugetlb pool. 4511 * 4512 * Once we have obtained the hugetlb folio, we can call 4513 * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the 4514 * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect 4515 * of try_charge(). 4516 * 4517 * Returns 0 on success. Otherwise, an error code is returned. 
4518 */ 4519 int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, 4520 long nr_pages) 4521 { 4522 /* 4523 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, 4524 * but do not attempt to commit charge later (or cancel on error) either. 4525 */ 4526 if (mem_cgroup_disabled() || !memcg || 4527 !cgroup_subsys_on_dfl(memory_cgrp_subsys) || 4528 !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) 4529 return -EOPNOTSUPP; 4530 4531 if (try_charge(memcg, gfp, nr_pages)) 4532 return -ENOMEM; 4533 4534 return 0; 4535 } 4536 4537 /** 4538 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. 4539 * @folio: folio to charge. 4540 * @mm: mm context of the victim 4541 * @gfp: reclaim mode 4542 * @entry: swap entry for which the folio is allocated 4543 * 4544 * This function charges a folio allocated for swapin. Please call this before 4545 * adding the folio to the swapcache. 4546 * 4547 * Returns 0 on success. Otherwise, an error code is returned. 4548 */ 4549 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, 4550 gfp_t gfp, swp_entry_t entry) 4551 { 4552 struct mem_cgroup *memcg; 4553 unsigned short id; 4554 int ret; 4555 4556 if (mem_cgroup_disabled()) 4557 return 0; 4558 4559 id = lookup_swap_cgroup_id(entry); 4560 rcu_read_lock(); 4561 memcg = mem_cgroup_from_id(id); 4562 if (!memcg || !css_tryget_online(&memcg->css)) 4563 memcg = get_mem_cgroup_from_mm(mm); 4564 rcu_read_unlock(); 4565 4566 ret = charge_memcg(folio, memcg, gfp); 4567 4568 css_put(&memcg->css); 4569 return ret; 4570 } 4571 4572 /* 4573 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 4574 * @entry: swap entry for which the page is charged 4575 * 4576 * Call this function after successfully adding the charged page to swapcache. 4577 * 4578 * Note: This function assumes the page for which swap slot is being uncharged 4579 * is order 0 page. 
 */
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
{
	/*
	 * Cgroup1's unified memory+swap counter has been charged with the
	 * new swapcache page, finish the transfer by uncharging the swap
	 * slot. The swap slot would also get uncharged when it dies, but
	 * it can stick around indefinitely and we'd count the page twice
	 * the entire time.
	 *
	 * Cgroup2 has separate resource counters for memory and swap,
	 * so this is a non-issue here. Memory and swap charge lifetimes
	 * correspond 1:1 to page and swap slot lifetimes: we charge the
	 * page to memory here, and uncharge swap when the slot is freed.
	 */
	if (!mem_cgroup_disabled() && do_memsw_account()) {
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it. The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, 1);
	}
}

/*
 * State for batching uncharges of multiple folios that belong to the
 * same memcg, so the counters and statistics are only touched once
 * per batch instead of once per folio.
 */
struct uncharge_gather {
	struct mem_cgroup *memcg;	/* memcg the gathered folios belong to */
	unsigned long nr_memory;	/* pages to uncharge from the memory counter */
	unsigned long pgpgout;		/* PGPGOUT events to account */
	unsigned long nr_kmem;		/* pages to subtract from kmem accounting */
	int nid;			/* node of the folios, for memcg1 event checks */
};

/* Reset the gather state before starting a new batch. */
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
	memset(ug, 0, sizeof(*ug));
}

/* Flush the gathered uncharges to the page counters and statistics. */
static void uncharge_batch(const struct uncharge_gather *ug)
{
	unsigned long flags;

	if (ug->nr_memory) {
		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
		if (do_memsw_account())
			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
		if (ug->nr_kmem) {
			mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
			memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
		}
		memcg1_oom_recover(ug->memcg);
	}

	local_irq_save(flags);
	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
	memcg1_check_events(ug->memcg, ug->nid);

local_irq_restore(flags); 4638 4639 /* drop reference from uncharge_folio */ 4640 css_put(&ug->memcg->css); 4641 } 4642 4643 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) 4644 { 4645 long nr_pages; 4646 struct mem_cgroup *memcg; 4647 struct obj_cgroup *objcg; 4648 4649 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 4650 VM_BUG_ON_FOLIO(folio_order(folio) > 1 && 4651 !folio_test_hugetlb(folio) && 4652 !list_empty(&folio->_deferred_list), folio); 4653 4654 /* 4655 * Nobody should be changing or seriously looking at 4656 * folio memcg or objcg at this point, we have fully 4657 * exclusive access to the folio. 4658 */ 4659 if (folio_memcg_kmem(folio)) { 4660 objcg = __folio_objcg(folio); 4661 /* 4662 * This get matches the put at the end of the function and 4663 * kmem pages do not hold memcg references anymore. 4664 */ 4665 memcg = get_mem_cgroup_from_objcg(objcg); 4666 } else { 4667 memcg = __folio_memcg(folio); 4668 } 4669 4670 if (!memcg) 4671 return; 4672 4673 if (ug->memcg != memcg) { 4674 if (ug->memcg) { 4675 uncharge_batch(ug); 4676 uncharge_gather_clear(ug); 4677 } 4678 ug->memcg = memcg; 4679 ug->nid = folio_nid(folio); 4680 4681 /* pairs with css_put in uncharge_batch */ 4682 css_get(&memcg->css); 4683 } 4684 4685 nr_pages = folio_nr_pages(folio); 4686 4687 if (folio_memcg_kmem(folio)) { 4688 ug->nr_memory += nr_pages; 4689 ug->nr_kmem += nr_pages; 4690 4691 folio->memcg_data = 0; 4692 obj_cgroup_put(objcg); 4693 } else { 4694 /* LRU pages aren't accounted at the root level */ 4695 if (!mem_cgroup_is_root(memcg)) 4696 ug->nr_memory += nr_pages; 4697 ug->pgpgout++; 4698 4699 folio->memcg_data = 0; 4700 } 4701 4702 css_put(&memcg->css); 4703 } 4704 4705 void __mem_cgroup_uncharge(struct folio *folio) 4706 { 4707 struct uncharge_gather ug; 4708 4709 /* Don't touch folio->lru of any random page, pre-check: */ 4710 if (!folio_memcg(folio)) 4711 return; 4712 4713 uncharge_gather_clear(&ug); 4714 uncharge_folio(folio, &ug); 4715 
uncharge_batch(&ug); 4716 } 4717 4718 void __mem_cgroup_uncharge_folios(struct folio_batch *folios) 4719 { 4720 struct uncharge_gather ug; 4721 unsigned int i; 4722 4723 uncharge_gather_clear(&ug); 4724 for (i = 0; i < folios->nr; i++) 4725 uncharge_folio(folios->folios[i], &ug); 4726 if (ug.memcg) 4727 uncharge_batch(&ug); 4728 } 4729 4730 /** 4731 * mem_cgroup_replace_folio - Charge a folio's replacement. 4732 * @old: Currently circulating folio. 4733 * @new: Replacement folio. 4734 * 4735 * Charge @new as a replacement folio for @old. @old will 4736 * be uncharged upon free. 4737 * 4738 * Both folios must be locked, @new->mapping must be set up. 4739 */ 4740 void mem_cgroup_replace_folio(struct folio *old, struct folio *new) 4741 { 4742 struct mem_cgroup *memcg; 4743 long nr_pages = folio_nr_pages(new); 4744 unsigned long flags; 4745 4746 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 4747 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 4748 VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); 4749 VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new); 4750 4751 if (mem_cgroup_disabled()) 4752 return; 4753 4754 /* Page cache replacement: new folio already charged? */ 4755 if (folio_memcg(new)) 4756 return; 4757 4758 memcg = folio_memcg(old); 4759 VM_WARN_ON_ONCE_FOLIO(!memcg, old); 4760 if (!memcg) 4761 return; 4762 4763 /* Force-charge the new page. The old one will be freed soon */ 4764 if (!mem_cgroup_is_root(memcg)) { 4765 page_counter_charge(&memcg->memory, nr_pages); 4766 if (do_memsw_account()) 4767 page_counter_charge(&memcg->memsw, nr_pages); 4768 } 4769 4770 css_get(&memcg->css); 4771 commit_charge(new, memcg); 4772 4773 local_irq_save(flags); 4774 mem_cgroup_charge_statistics(memcg, nr_pages); 4775 memcg1_check_events(memcg, folio_nid(new)); 4776 local_irq_restore(flags); 4777 } 4778 4779 /** 4780 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. 4781 * @old: Currently circulating folio. 
 * @new: Replacement folio.
 *
 * Transfer the memcg data from the old folio to the new folio for migration.
 * The old folio's data info will be cleared. Note that the memory counters
 * will remain unchanged throughout the process.
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
	struct mem_cgroup *memcg;

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
	VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
	VM_BUG_ON_FOLIO(folio_test_lru(old), old);

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(old);
	/*
	 * Note that it is normal to see !memcg for a hugetlb folio.
	 * E.g. it could have been allocated when memory_hugetlb_accounting
	 * was not selected.
	 */
	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
	if (!memcg)
		return;

	/* Transfer the charge and the css ref */
	commit_charge(new, memcg);
	old->memcg_data = 0;
}

DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);

void mem_cgroup_sk_alloc(struct sock *sk)
{
	struct mem_cgroup *memcg;

	if (!mem_cgroup_sockets_enabled)
		return;

	/* Do not associate the sock with unrelated interrupted task's memcg.
*/ 4829 if (!in_task()) 4830 return; 4831 4832 rcu_read_lock(); 4833 memcg = mem_cgroup_from_task(current); 4834 if (mem_cgroup_is_root(memcg)) 4835 goto out; 4836 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg)) 4837 goto out; 4838 if (css_tryget(&memcg->css)) 4839 sk->sk_memcg = memcg; 4840 out: 4841 rcu_read_unlock(); 4842 } 4843 4844 void mem_cgroup_sk_free(struct sock *sk) 4845 { 4846 if (sk->sk_memcg) 4847 css_put(&sk->sk_memcg->css); 4848 } 4849 4850 /** 4851 * mem_cgroup_charge_skmem - charge socket memory 4852 * @memcg: memcg to charge 4853 * @nr_pages: number of pages to charge 4854 * @gfp_mask: reclaim mode 4855 * 4856 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 4857 * @memcg's configured limit, %false if it doesn't. 4858 */ 4859 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 4860 gfp_t gfp_mask) 4861 { 4862 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 4863 return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); 4864 4865 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 4866 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 4867 return true; 4868 } 4869 4870 return false; 4871 } 4872 4873 /** 4874 * mem_cgroup_uncharge_skmem - uncharge socket memory 4875 * @memcg: memcg to uncharge 4876 * @nr_pages: number of pages to uncharge 4877 */ 4878 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 4879 { 4880 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 4881 memcg1_uncharge_skmem(memcg, nr_pages); 4882 return; 4883 } 4884 4885 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 4886 4887 refill_stock(memcg, nr_pages); 4888 } 4889 4890 static int __init cgroup_memory(char *s) 4891 { 4892 char *token; 4893 4894 while ((token = strsep(&s, ",")) != NULL) { 4895 if (!*token) 4896 continue; 4897 if (!strcmp(token, "nosocket")) 4898 cgroup_memory_nosocket = true; 4899 if (!strcmp(token, "nokmem")) 4900 cgroup_memory_nokmem = true; 4901 if (!strcmp(token, 
"nobpf"))
			cgroup_memory_nobpf = true;
	}
	return 1;
}
__setup("cgroup.memory=", cgroup_memory);

/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
 * basically everything that doesn't depend on a specific mem_cgroup structure
 * should be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu;

	/*
	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
	 * used for per-memcg-per-cpu caching of per-node statistics. In order
	 * to work fine, we should make sure that the overfill threshold can't
	 * exceed S32_MAX / PAGE_SIZE.
	 */
	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);

	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
				  memcg_hotplug_cpu_dead);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_SWAP
/*
 * Take an id reference on @memcg. If @memcg's id has already been
 * released (the cgroup was offlined), fall back to the nearest
 * ancestor whose id is still live, ultimately the root.
 */
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
	while (!refcount_inc_not_zero(&memcg->id.ref)) {
		/*
		 * The root cgroup cannot be destroyed, so its refcount must
		 * always be >= 1.
		 */
		if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
			VM_BUG_ON(1);
			break;
		}
		memcg = parent_mem_cgroup(memcg);
		if (!memcg)
			memcg = root_mem_cgroup;
	}
	return memcg;
}

/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
4964 */ 4965 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 4966 { 4967 struct mem_cgroup *memcg, *swap_memcg; 4968 unsigned int nr_entries; 4969 unsigned short oldid; 4970 4971 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 4972 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 4973 4974 if (mem_cgroup_disabled()) 4975 return; 4976 4977 if (!do_memsw_account()) 4978 return; 4979 4980 memcg = folio_memcg(folio); 4981 4982 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 4983 if (!memcg) 4984 return; 4985 4986 /* 4987 * In case the memcg owning these pages has been offlined and doesn't 4988 * have an ID allocated to it anymore, charge the closest online 4989 * ancestor for the swap instead and transfer the memory+swap charge. 4990 */ 4991 swap_memcg = mem_cgroup_id_get_online(memcg); 4992 nr_entries = folio_nr_pages(folio); 4993 /* Get references for the tail pages, too */ 4994 if (nr_entries > 1) 4995 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 4996 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 4997 nr_entries); 4998 VM_BUG_ON_FOLIO(oldid, folio); 4999 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 5000 5001 folio->memcg_data = 0; 5002 5003 if (!mem_cgroup_is_root(memcg)) 5004 page_counter_uncharge(&memcg->memory, nr_entries); 5005 5006 if (memcg != swap_memcg) { 5007 if (!mem_cgroup_is_root(swap_memcg)) 5008 page_counter_charge(&swap_memcg->memsw, nr_entries); 5009 page_counter_uncharge(&memcg->memsw, nr_entries); 5010 } 5011 5012 /* 5013 * Interrupts should be disabled here because the caller holds the 5014 * i_pages lock which is taken with interrupts-off. It is 5015 * important here to have the interrupts disabled because it is the 5016 * only synchronisation we have for updating the per-CPU variables. 
5017 */ 5018 memcg_stats_lock(); 5019 mem_cgroup_charge_statistics(memcg, -nr_entries); 5020 memcg_stats_unlock(); 5021 memcg1_check_events(memcg, folio_nid(folio)); 5022 5023 css_put(&memcg->css); 5024 } 5025 5026 /** 5027 * __mem_cgroup_try_charge_swap - try charging swap space for a folio 5028 * @folio: folio being added to swap 5029 * @entry: swap entry to charge 5030 * 5031 * Try to charge @folio's memcg for the swap space at @entry. 5032 * 5033 * Returns 0 on success, -ENOMEM on failure. 5034 */ 5035 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) 5036 { 5037 unsigned int nr_pages = folio_nr_pages(folio); 5038 struct page_counter *counter; 5039 struct mem_cgroup *memcg; 5040 unsigned short oldid; 5041 5042 if (do_memsw_account()) 5043 return 0; 5044 5045 memcg = folio_memcg(folio); 5046 5047 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 5048 if (!memcg) 5049 return 0; 5050 5051 if (!entry.val) { 5052 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 5053 return 0; 5054 } 5055 5056 memcg = mem_cgroup_id_get_online(memcg); 5057 5058 if (!mem_cgroup_is_root(memcg) && 5059 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 5060 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 5061 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 5062 mem_cgroup_id_put(memcg); 5063 return -ENOMEM; 5064 } 5065 5066 /* Get references for the tail pages, too */ 5067 if (nr_pages > 1) 5068 mem_cgroup_id_get_many(memcg, nr_pages - 1); 5069 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 5070 VM_BUG_ON_FOLIO(oldid, folio); 5071 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 5072 5073 return 0; 5074 } 5075 5076 /** 5077 * __mem_cgroup_uncharge_swap - uncharge swap space 5078 * @entry: swap entry to uncharge 5079 * @nr_pages: the amount of swap space to uncharge 5080 */ 5081 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 5082 { 5083 struct mem_cgroup *memcg; 5084 unsigned short id; 5085 5086 id = swap_cgroup_record(entry, 0, 
nr_pages); 5087 rcu_read_lock(); 5088 memcg = mem_cgroup_from_id(id); 5089 if (memcg) { 5090 if (!mem_cgroup_is_root(memcg)) { 5091 if (do_memsw_account()) 5092 page_counter_uncharge(&memcg->memsw, nr_pages); 5093 else 5094 page_counter_uncharge(&memcg->swap, nr_pages); 5095 } 5096 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 5097 mem_cgroup_id_put_many(memcg, nr_pages); 5098 } 5099 rcu_read_unlock(); 5100 } 5101 5102 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 5103 { 5104 long nr_swap_pages = get_nr_swap_pages(); 5105 5106 if (mem_cgroup_disabled() || do_memsw_account()) 5107 return nr_swap_pages; 5108 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) 5109 nr_swap_pages = min_t(long, nr_swap_pages, 5110 READ_ONCE(memcg->swap.max) - 5111 page_counter_read(&memcg->swap)); 5112 return nr_swap_pages; 5113 } 5114 5115 bool mem_cgroup_swap_full(struct folio *folio) 5116 { 5117 struct mem_cgroup *memcg; 5118 5119 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 5120 5121 if (vm_swap_full()) 5122 return true; 5123 if (do_memsw_account()) 5124 return false; 5125 5126 memcg = folio_memcg(folio); 5127 if (!memcg) 5128 return false; 5129 5130 for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { 5131 unsigned long usage = page_counter_read(&memcg->swap); 5132 5133 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 5134 usage * 2 >= READ_ONCE(memcg->swap.max)) 5135 return true; 5136 } 5137 5138 return false; 5139 } 5140 5141 static int __init setup_swap_account(char *s) 5142 { 5143 bool res; 5144 5145 if (!kstrtobool(s, &res) && !res) 5146 pr_warn_once("The swapaccount=0 commandline option is deprecated " 5147 "in favor of configuring swap control via cgroupfs. 
" 5148 "Please report your usecase to linux-mm@kvack.org if you " 5149 "depend on this functionality.\n"); 5150 return 1; 5151 } 5152 __setup("swapaccount=", setup_swap_account); 5153 5154 static u64 swap_current_read(struct cgroup_subsys_state *css, 5155 struct cftype *cft) 5156 { 5157 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5158 5159 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 5160 } 5161 5162 static int swap_peak_show(struct seq_file *sf, void *v) 5163 { 5164 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 5165 5166 return peak_show(sf, v, &memcg->swap); 5167 } 5168 5169 static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf, 5170 size_t nbytes, loff_t off) 5171 { 5172 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5173 5174 return peak_write(of, buf, nbytes, off, &memcg->swap, 5175 &memcg->swap_peaks); 5176 } 5177 5178 static int swap_high_show(struct seq_file *m, void *v) 5179 { 5180 return seq_puts_memcg_tunable(m, 5181 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 5182 } 5183 5184 static ssize_t swap_high_write(struct kernfs_open_file *of, 5185 char *buf, size_t nbytes, loff_t off) 5186 { 5187 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5188 unsigned long high; 5189 int err; 5190 5191 buf = strstrip(buf); 5192 err = page_counter_memparse(buf, "max", &high); 5193 if (err) 5194 return err; 5195 5196 page_counter_set_high(&memcg->swap, high); 5197 5198 return nbytes; 5199 } 5200 5201 static int swap_max_show(struct seq_file *m, void *v) 5202 { 5203 return seq_puts_memcg_tunable(m, 5204 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 5205 } 5206 5207 static ssize_t swap_max_write(struct kernfs_open_file *of, 5208 char *buf, size_t nbytes, loff_t off) 5209 { 5210 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5211 unsigned long max; 5212 int err; 5213 5214 buf = strstrip(buf); 5215 err = page_counter_memparse(buf, "max", &max); 5216 if (err) 5217 return err; 5218 5219 
xchg(&memcg->swap.max, max); 5220 5221 return nbytes; 5222 } 5223 5224 static int swap_events_show(struct seq_file *m, void *v) 5225 { 5226 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5227 5228 seq_printf(m, "high %lu\n", 5229 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 5230 seq_printf(m, "max %lu\n", 5231 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 5232 seq_printf(m, "fail %lu\n", 5233 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 5234 5235 return 0; 5236 } 5237 5238 static struct cftype swap_files[] = { 5239 { 5240 .name = "swap.current", 5241 .flags = CFTYPE_NOT_ON_ROOT, 5242 .read_u64 = swap_current_read, 5243 }, 5244 { 5245 .name = "swap.high", 5246 .flags = CFTYPE_NOT_ON_ROOT, 5247 .seq_show = swap_high_show, 5248 .write = swap_high_write, 5249 }, 5250 { 5251 .name = "swap.max", 5252 .flags = CFTYPE_NOT_ON_ROOT, 5253 .seq_show = swap_max_show, 5254 .write = swap_max_write, 5255 }, 5256 { 5257 .name = "swap.peak", 5258 .flags = CFTYPE_NOT_ON_ROOT, 5259 .open = peak_open, 5260 .release = peak_release, 5261 .seq_show = swap_peak_show, 5262 .write = swap_peak_write, 5263 }, 5264 { 5265 .name = "swap.events", 5266 .flags = CFTYPE_NOT_ON_ROOT, 5267 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 5268 .seq_show = swap_events_show, 5269 }, 5270 { } /* terminate */ 5271 }; 5272 5273 #ifdef CONFIG_ZSWAP 5274 /** 5275 * obj_cgroup_may_zswap - check if this cgroup can zswap 5276 * @objcg: the object cgroup 5277 * 5278 * Check if the hierarchical zswap limit has been reached. 5279 * 5280 * This doesn't check for specific headroom, and it is not atomic 5281 * either. But with zswap, the size of the allocation is only known 5282 * once compression has occurred, and this optimistic pre-check avoids 5283 * spending cycles on compression when there is already no room left 5284 * or zswap is disabled altogether somewhere in the hierarchy. 
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg, *original_memcg;
	bool ret = true;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;

	original_memcg = get_mem_cgroup_from_objcg(objcg);
	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
	     memcg = parent_mem_cgroup(memcg)) {
		unsigned long max = READ_ONCE(memcg->zswap_max);
		unsigned long pages;

		if (max == PAGE_COUNTER_MAX)
			continue;
		if (max == 0) {
			ret = false;
			break;
		}

		/*
		 * mem_cgroup_flush_stats() ignores small changes. Use
		 * do_flush_stats() directly to get accurate stats for charging.
		 */
		do_flush_stats(memcg);
		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
		if (pages < max)
			continue;
		ret = false;
		break;
	}
	mem_cgroup_put(original_memcg);
	return ret;
}

/**
 * obj_cgroup_charge_zswap - charge compression backend memory
 * @objcg: the object cgroup
 * @size: size of compressed object
 *
 * This forces the charge after obj_cgroup_may_zswap() allowed
 * compression and storage in zswap for this cgroup to go ahead.
5329 */ 5330 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) 5331 { 5332 struct mem_cgroup *memcg; 5333 5334 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5335 return; 5336 5337 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); 5338 5339 /* PF_MEMALLOC context, charging must succeed */ 5340 if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) 5341 VM_WARN_ON_ONCE(1); 5342 5343 rcu_read_lock(); 5344 memcg = obj_cgroup_memcg(objcg); 5345 mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); 5346 mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); 5347 rcu_read_unlock(); 5348 } 5349 5350 /** 5351 * obj_cgroup_uncharge_zswap - uncharge compression backend memory 5352 * @objcg: the object cgroup 5353 * @size: size of compressed object 5354 * 5355 * Uncharges zswap memory on page in. 5356 */ 5357 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) 5358 { 5359 struct mem_cgroup *memcg; 5360 5361 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5362 return; 5363 5364 obj_cgroup_uncharge(objcg, size); 5365 5366 rcu_read_lock(); 5367 memcg = obj_cgroup_memcg(objcg); 5368 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); 5369 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); 5370 rcu_read_unlock(); 5371 } 5372 5373 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) 5374 { 5375 /* if zswap is disabled, do not block pages going to the swapping device */ 5376 return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); 5377 } 5378 5379 static u64 zswap_current_read(struct cgroup_subsys_state *css, 5380 struct cftype *cft) 5381 { 5382 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5383 5384 mem_cgroup_flush_stats(memcg); 5385 return memcg_page_state(memcg, MEMCG_ZSWAP_B); 5386 } 5387 5388 static int zswap_max_show(struct seq_file *m, void *v) 5389 { 5390 return seq_puts_memcg_tunable(m, 5391 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); 5392 } 5393 5394 static ssize_t zswap_max_write(struct kernfs_open_file *of, 5395 char *buf, size_t 
nbytes, loff_t off) 5396 { 5397 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5398 unsigned long max; 5399 int err; 5400 5401 buf = strstrip(buf); 5402 err = page_counter_memparse(buf, "max", &max); 5403 if (err) 5404 return err; 5405 5406 xchg(&memcg->zswap_max, max); 5407 5408 return nbytes; 5409 } 5410 5411 static int zswap_writeback_show(struct seq_file *m, void *v) 5412 { 5413 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5414 5415 seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback)); 5416 return 0; 5417 } 5418 5419 static ssize_t zswap_writeback_write(struct kernfs_open_file *of, 5420 char *buf, size_t nbytes, loff_t off) 5421 { 5422 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5423 int zswap_writeback; 5424 ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback); 5425 5426 if (parse_ret) 5427 return parse_ret; 5428 5429 if (zswap_writeback != 0 && zswap_writeback != 1) 5430 return -EINVAL; 5431 5432 WRITE_ONCE(memcg->zswap_writeback, zswap_writeback); 5433 return nbytes; 5434 } 5435 5436 static struct cftype zswap_files[] = { 5437 { 5438 .name = "zswap.current", 5439 .flags = CFTYPE_NOT_ON_ROOT, 5440 .read_u64 = zswap_current_read, 5441 }, 5442 { 5443 .name = "zswap.max", 5444 .flags = CFTYPE_NOT_ON_ROOT, 5445 .seq_show = zswap_max_show, 5446 .write = zswap_max_write, 5447 }, 5448 { 5449 .name = "zswap.writeback", 5450 .seq_show = zswap_writeback_show, 5451 .write = zswap_writeback_write, 5452 }, 5453 { } /* terminate */ 5454 }; 5455 #endif /* CONFIG_ZSWAP */ 5456 5457 static int __init mem_cgroup_swap_init(void) 5458 { 5459 if (mem_cgroup_disabled()) 5460 return 0; 5461 5462 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 5463 #ifdef CONFIG_MEMCG_V1 5464 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 5465 #endif 5466 #ifdef CONFIG_ZSWAP 5467 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); 5468 #endif 5469 return 0; 5470 } 5471 
subsys_initcall(mem_cgroup_swap_init); 5472 5473 #endif /* CONFIG_SWAP */ 5474