xref: /linux/mm/memcontrol.c (revision e0b2fdb352b7991664b23ae5e15b537cd79a7820)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* memcontrol.c - Memory Controller
3  *
4  * Copyright IBM Corporation, 2007
5  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
6  *
7  * Copyright 2007 OpenVZ SWsoft Inc
8  * Author: Pavel Emelianov <xemul@openvz.org>
9  *
10  * Memory thresholds
11  * Copyright (C) 2009 Nokia Corporation
12  * Author: Kirill A. Shutemov
13  *
14  * Kernel Memory Controller
15  * Copyright (C) 2012 Parallels Inc. and Google Inc.
16  * Authors: Glauber Costa and Suleiman Souhlal
17  *
18  * Native page reclaim
19  * Charge lifetime sanitation
20  * Lockless page tracking & accounting
21  * Unified hierarchy configuration model
22  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
23  *
24  * Per memcg lru locking
25  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
26  */
27 
28 #include <linux/page_counter.h>
29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h>
31 #include <linux/sched/mm.h>
32 #include <linux/shmem_fs.h>
33 #include <linux/hugetlb.h>
34 #include <linux/pagemap.h>
35 #include <linux/pagevec.h>
36 #include <linux/vm_event_item.h>
37 #include <linux/smp.h>
38 #include <linux/page-flags.h>
39 #include <linux/backing-dev.h>
40 #include <linux/bit_spinlock.h>
41 #include <linux/rcupdate.h>
42 #include <linux/limits.h>
43 #include <linux/export.h>
44 #include <linux/mutex.h>
45 #include <linux/rbtree.h>
46 #include <linux/slab.h>
47 #include <linux/swapops.h>
48 #include <linux/spinlock.h>
49 #include <linux/fs.h>
50 #include <linux/seq_file.h>
51 #include <linux/parser.h>
52 #include <linux/vmpressure.h>
53 #include <linux/memremap.h>
54 #include <linux/mm_inline.h>
55 #include <linux/swap_cgroup.h>
56 #include <linux/cpu.h>
57 #include <linux/oom.h>
58 #include <linux/lockdep.h>
59 #include <linux/resume_user_mode.h>
60 #include <linux/psi.h>
61 #include <linux/seq_buf.h>
62 #include <linux/sched/isolation.h>
63 #include <linux/kmemleak.h>
64 #include "internal.h"
65 #include <net/sock.h>
66 #include <net/ip.h>
67 #include "slab.h"
68 #include "memcontrol-v1.h"
69 
70 #include <linux/uaccess.h>
71 
72 #include <trace/events/vmscan.h>
73 
74 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
75 EXPORT_SYMBOL(memory_cgrp_subsys);
76 
77 struct mem_cgroup *root_mem_cgroup __read_mostly;
78 
79 /* Active memory cgroup to use from an interrupt context */
80 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
81 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
82 
83 /* Socket memory accounting disabled? */
84 static bool cgroup_memory_nosocket __ro_after_init;
85 
86 /* Kernel memory accounting disabled? */
87 static bool cgroup_memory_nokmem __ro_after_init;
88 
89 /* BPF memory accounting disabled? */
90 static bool cgroup_memory_nobpf __ro_after_init;
91 
92 #ifdef CONFIG_CGROUP_WRITEBACK
93 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
94 #endif
95 
96 #define THRESHOLDS_EVENTS_TARGET 128
97 #define SOFTLIMIT_EVENTS_TARGET 1024
98 
99 static inline bool task_is_dying(void)
100 {
101 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
102 		(current->flags & PF_EXITING);
103 }
104 
105 /* Some nice accessors for the vmpressure. */
106 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
107 {
108 	if (!memcg)
109 		memcg = root_mem_cgroup;
110 	return &memcg->vmpressure;
111 }
112 
113 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
114 {
115 	return container_of(vmpr, struct mem_cgroup, vmpressure);
116 }
117 
118 #define CURRENT_OBJCG_UPDATE_BIT 0
119 #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)
120 
121 static DEFINE_SPINLOCK(objcg_lock);
122 
123 bool mem_cgroup_kmem_disabled(void)
124 {
125 	return cgroup_memory_nokmem;
126 }
127 
128 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
129 				      unsigned int nr_pages);
130 
131 static void obj_cgroup_release(struct percpu_ref *ref)
132 {
133 	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
134 	unsigned int nr_bytes;
135 	unsigned int nr_pages;
136 	unsigned long flags;
137 
138 	/*
139 	 * At this point all allocated objects are freed, and
140 	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
141 	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
142 	 *
143 	 * The following sequence can lead to it:
144 	 * 1) CPU0: objcg == stock->cached_objcg
145 	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
146 	 *          PAGE_SIZE bytes are charged
147 	 * 3) CPU1: a process from another memcg is allocating something,
148 	 *          the stock is flushed,
149 	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
150 	 * 4) CPU0: we release this object,
151 	 *          92 bytes are added to stock->nr_bytes
152 	 * 5) CPU0: stock is flushed,
153 	 *          92 bytes are added to objcg->nr_charged_bytes
154 	 *
155 	 * As a result, nr_charged_bytes == PAGE_SIZE.
156 	 * This page will be uncharged in obj_cgroup_release().
157 	 */
158 	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
159 	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
160 	nr_pages = nr_bytes >> PAGE_SHIFT;
161 
162 	if (nr_pages)
163 		obj_cgroup_uncharge_pages(objcg, nr_pages);
164 
165 	spin_lock_irqsave(&objcg_lock, flags);
166 	list_del(&objcg->list);
167 	spin_unlock_irqrestore(&objcg_lock, flags);
168 
169 	percpu_ref_exit(ref);
170 	kfree_rcu(objcg, rcu);
171 }
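
/*
 * A worked instance of the sequence described above (illustrative only, not
 * part of the kernel source), assuming PAGE_SIZE == 4096:
 *
 *	CPU1 charge:	92-byte object, PAGE_SIZE charged,
 *			stock keeps 4096 - 92 = 4004 spare bytes
 *	CPU1 flush:	objcg->nr_charged_bytes = 4004
 *	CPU0 release:	92 bytes go into stock->nr_bytes
 *	CPU0 flush:	objcg->nr_charged_bytes = 4004 + 92 = 4096
 *
 * obj_cgroup_release() then sees nr_bytes == PAGE_SIZE, i.e. exactly
 * nr_pages = 4096 >> PAGE_SHIFT = 1 page to uncharge.
 */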
172 
173 static struct obj_cgroup *obj_cgroup_alloc(void)
174 {
175 	struct obj_cgroup *objcg;
176 	int ret;
177 
178 	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
179 	if (!objcg)
180 		return NULL;
181 
182 	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
183 			      GFP_KERNEL);
184 	if (ret) {
185 		kfree(objcg);
186 		return NULL;
187 	}
188 	INIT_LIST_HEAD(&objcg->list);
189 	return objcg;
190 }
191 
192 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
193 				  struct mem_cgroup *parent)
194 {
195 	struct obj_cgroup *objcg, *iter;
196 
197 	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
198 
199 	spin_lock_irq(&objcg_lock);
200 
201 	/* 1) Ready to reparent active objcg. */
202 	list_add(&objcg->list, &memcg->objcg_list);
203 	/* 2) Reparent active objcg and already reparented objcgs to parent. */
204 	list_for_each_entry(iter, &memcg->objcg_list, list)
205 		WRITE_ONCE(iter->memcg, parent);
206 	/* 3) Move already reparented objcgs to the parent's list */
207 	list_splice(&memcg->objcg_list, &parent->objcg_list);
208 
209 	spin_unlock_irq(&objcg_lock);
210 
211 	percpu_ref_kill(&objcg->refcnt);
212 }
213 
214 /*
215  * A lot of the calls to the cache allocation functions are expected to be
216  * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
217  * conditional to this static branch, we'll have to allow modules that do
218  * kmem_cache_alloc and the like to see this symbol as well.
219  */
220 DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
221 EXPORT_SYMBOL(memcg_kmem_online_key);
222 
223 DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
224 EXPORT_SYMBOL(memcg_bpf_enabled_key);
225 
226 /**
227  * mem_cgroup_css_from_folio - css of the memcg associated with a folio
228  * @folio: folio of interest
229  *
230  * If memcg is bound to the default hierarchy, css of the memcg associated
231  * with @folio is returned.  The returned css remains associated with @folio
232  * until it is released.
233  *
234  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
235  * is returned.
236  */
237 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
238 {
239 	struct mem_cgroup *memcg = folio_memcg(folio);
240 
241 	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
242 		memcg = root_mem_cgroup;
243 
244 	return &memcg->css;
245 }
246 
247 /**
248  * page_cgroup_ino - return inode number of the memcg a page is charged to
249  * @page: the page
250  *
251  * Look up the closest online ancestor of the memory cgroup @page is charged to
252  * and return its inode number or 0 if @page is not charged to any cgroup. It
253  * is safe to call this function without holding a reference to @page.
254  *
255  * Note, this function is inherently racy, because there is nothing to prevent
256  * the cgroup inode from getting torn down and potentially reallocated a moment
257  * after page_cgroup_ino() returns, so it should only be used by callers that
258  * do not care (such as procfs interfaces).
259  */
260 ino_t page_cgroup_ino(struct page *page)
261 {
262 	struct mem_cgroup *memcg;
263 	unsigned long ino = 0;
264 
265 	rcu_read_lock();
266 	/* page_folio() is racy here, but the entire function is racy anyway */
267 	memcg = folio_memcg_check(page_folio(page));
268 
269 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
270 		memcg = parent_mem_cgroup(memcg);
271 	if (memcg)
272 		ino = cgroup_ino(memcg->css.cgroup);
273 	rcu_read_unlock();
274 	return ino;
275 }
276 
277 /* Subset of node_stat_item for memcg stats */
278 static const unsigned int memcg_node_stat_items[] = {
279 	NR_INACTIVE_ANON,
280 	NR_ACTIVE_ANON,
281 	NR_INACTIVE_FILE,
282 	NR_ACTIVE_FILE,
283 	NR_UNEVICTABLE,
284 	NR_SLAB_RECLAIMABLE_B,
285 	NR_SLAB_UNRECLAIMABLE_B,
286 	WORKINGSET_REFAULT_ANON,
287 	WORKINGSET_REFAULT_FILE,
288 	WORKINGSET_ACTIVATE_ANON,
289 	WORKINGSET_ACTIVATE_FILE,
290 	WORKINGSET_RESTORE_ANON,
291 	WORKINGSET_RESTORE_FILE,
292 	WORKINGSET_NODERECLAIM,
293 	NR_ANON_MAPPED,
294 	NR_FILE_MAPPED,
295 	NR_FILE_PAGES,
296 	NR_FILE_DIRTY,
297 	NR_WRITEBACK,
298 	NR_SHMEM,
299 	NR_SHMEM_THPS,
300 	NR_FILE_THPS,
301 	NR_ANON_THPS,
302 	NR_KERNEL_STACK_KB,
303 	NR_PAGETABLE,
304 	NR_SECONDARY_PAGETABLE,
305 #ifdef CONFIG_SWAP
306 	NR_SWAPCACHE,
307 #endif
308 };
309 
310 static const unsigned int memcg_stat_items[] = {
311 	MEMCG_SWAP,
312 	MEMCG_SOCK,
313 	MEMCG_PERCPU_B,
314 	MEMCG_VMALLOC,
315 	MEMCG_KMEM,
316 	MEMCG_ZSWAP_B,
317 	MEMCG_ZSWAPPED,
318 };
319 
320 #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
321 #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
322 			   ARRAY_SIZE(memcg_stat_items))
323 #define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
324 static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
325 
326 static void init_memcg_stats(void)
327 {
328 	u8 i, j = 0;
329 
330 	BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
331 
332 	memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
333 
334 	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
335 		mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
336 
337 	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
338 		mem_cgroup_stats_index[memcg_stat_items[i]] = j;
339 }
340 
341 static inline int memcg_stats_index(int idx)
342 {
343 	return mem_cgroup_stats_index[idx];
344 }
345 
346 struct lruvec_stats_percpu {
347 	/* Local (CPU and cgroup) state */
348 	long state[NR_MEMCG_NODE_STAT_ITEMS];
349 
350 	/* Delta calculation for lockless upward propagation */
351 	long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
352 };
353 
354 struct lruvec_stats {
355 	/* Aggregated (CPU and subtree) state */
356 	long state[NR_MEMCG_NODE_STAT_ITEMS];
357 
358 	/* Non-hierarchical (CPU aggregated) state */
359 	long state_local[NR_MEMCG_NODE_STAT_ITEMS];
360 
361 	/* Pending child counts during tree propagation */
362 	long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
363 };
364 
365 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
366 {
367 	struct mem_cgroup_per_node *pn;
368 	long x;
369 	int i;
370 
371 	if (mem_cgroup_disabled())
372 		return node_page_state(lruvec_pgdat(lruvec), idx);
373 
374 	i = memcg_stats_index(idx);
375 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
376 		return 0;
377 
378 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
379 	x = READ_ONCE(pn->lruvec_stats->state[i]);
380 #ifdef CONFIG_SMP
381 	if (x < 0)
382 		x = 0;
383 #endif
384 	return x;
385 }
386 
387 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
388 				      enum node_stat_item idx)
389 {
390 	struct mem_cgroup_per_node *pn;
391 	long x;
392 	int i;
393 
394 	if (mem_cgroup_disabled())
395 		return node_page_state(lruvec_pgdat(lruvec), idx);
396 
397 	i = memcg_stats_index(idx);
398 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
399 		return 0;
400 
401 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
402 	x = READ_ONCE(pn->lruvec_stats->state_local[i]);
403 #ifdef CONFIG_SMP
404 	if (x < 0)
405 		x = 0;
406 #endif
407 	return x;
408 }
409 
410 /* Subset of vm_event_item to report for memcg event stats */
411 static const unsigned int memcg_vm_event_stat[] = {
412 	PGPGIN,
413 	PGPGOUT,
414 	PGSCAN_KSWAPD,
415 	PGSCAN_DIRECT,
416 	PGSCAN_KHUGEPAGED,
417 	PGSTEAL_KSWAPD,
418 	PGSTEAL_DIRECT,
419 	PGSTEAL_KHUGEPAGED,
420 	PGFAULT,
421 	PGMAJFAULT,
422 	PGREFILL,
423 	PGACTIVATE,
424 	PGDEACTIVATE,
425 	PGLAZYFREE,
426 	PGLAZYFREED,
427 #ifdef CONFIG_ZSWAP
428 	ZSWPIN,
429 	ZSWPOUT,
430 	ZSWPWB,
431 #endif
432 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
433 	THP_FAULT_ALLOC,
434 	THP_COLLAPSE_ALLOC,
435 	THP_SWPOUT,
436 	THP_SWPOUT_FALLBACK,
437 #endif
438 };
439 
440 #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
441 static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
442 
443 static void init_memcg_events(void)
444 {
445 	u8 i;
446 
447 	BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
448 
449 	memset(mem_cgroup_events_index, U8_MAX,
450 	       sizeof(mem_cgroup_events_index));
451 
452 	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
453 		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
454 }
455 
456 static inline int memcg_events_index(enum vm_event_item idx)
457 {
458 	return mem_cgroup_events_index[idx];
459 }
460 
461 struct memcg_vmstats_percpu {
462 	/* Stats updates since the last flush */
463 	unsigned int			stats_updates;
464 
465 	/* Cached pointers for fast iteration in memcg_rstat_updated() */
466 	struct memcg_vmstats_percpu	*parent;
467 	struct memcg_vmstats		*vmstats;
468 
469 	/* The above should fit a single cacheline for memcg_rstat_updated() */
470 
471 	/* Local (CPU and cgroup) page state & events */
472 	long			state[MEMCG_VMSTAT_SIZE];
473 	unsigned long		events[NR_MEMCG_EVENTS];
474 
475 	/* Delta calculation for lockless upward propagation */
476 	long			state_prev[MEMCG_VMSTAT_SIZE];
477 	unsigned long		events_prev[NR_MEMCG_EVENTS];
478 
479 	/* Cgroup1: threshold notifications & softlimit tree updates */
480 	unsigned long		nr_page_events;
481 	unsigned long		targets[MEM_CGROUP_NTARGETS];
482 } ____cacheline_aligned;
483 
484 struct memcg_vmstats {
485 	/* Aggregated (CPU and subtree) page state & events */
486 	long			state[MEMCG_VMSTAT_SIZE];
487 	unsigned long		events[NR_MEMCG_EVENTS];
488 
489 	/* Non-hierarchical (CPU aggregated) page state & events */
490 	long			state_local[MEMCG_VMSTAT_SIZE];
491 	unsigned long		events_local[NR_MEMCG_EVENTS];
492 
493 	/* Pending child counts during tree propagation */
494 	long			state_pending[MEMCG_VMSTAT_SIZE];
495 	unsigned long		events_pending[NR_MEMCG_EVENTS];
496 
497 	/* Stats updates since the last flush */
498 	atomic64_t		stats_updates;
499 };
500 
501 /*
502  * memcg and lruvec stats flushing
503  *
504  * Many codepaths leading to a stats update or read are performance sensitive,
505  * and adding stats flushing in such codepaths is not desirable. So, to
506  * optimize flushing, the kernel does the following:
507  *
508  * 1) Periodically and asynchronously flush the stats every 2 seconds so that
509  *    the rstat update tree does not grow unbounded.
510  *
511  * 2) Flush the stats synchronously on the reader side only when there are more
512  *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can
513  *    let the stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus)
514  *    update events, but only for 2 seconds due to (1).
515  */
516 static void flush_memcg_stats_dwork(struct work_struct *w);
517 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
518 static u64 flush_last_time;
519 
520 #define FLUSH_TIME (2UL*HZ)
521 
522 /*
523  * Accessors to ensure that preemption is disabled on PREEMPT_RT, because an
524  * acquired spinlock_t does not imply disabled preemption there. These
525  * functions are never used in hardirq context on PREEMPT_RT, and therefore
526  * disabling preemption is sufficient.
527  */
528 static void memcg_stats_lock(void)
529 {
530 	preempt_disable_nested();
531 	VM_WARN_ON_IRQS_ENABLED();
532 }
533 
534 static void __memcg_stats_lock(void)
535 {
536 	preempt_disable_nested();
537 }
538 
539 static void memcg_stats_unlock(void)
540 {
541 	preempt_enable_nested();
542 }
543 
544 
545 static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
546 {
547 	return atomic64_read(&vmstats->stats_updates) >
548 		MEMCG_CHARGE_BATCH * num_online_cpus();
549 }
550 
551 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
552 {
553 	struct memcg_vmstats_percpu *statc;
554 	int cpu = smp_processor_id();
555 	unsigned int stats_updates;
556 
557 	if (!val)
558 		return;
559 
560 	cgroup_rstat_updated(memcg->css.cgroup, cpu);
561 	statc = this_cpu_ptr(memcg->vmstats_percpu);
562 	for (; statc; statc = statc->parent) {
563 		stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
564 		WRITE_ONCE(statc->stats_updates, stats_updates);
565 		if (stats_updates < MEMCG_CHARGE_BATCH)
566 			continue;
567 
568 		/*
569 		 * If @memcg is already flush-able, increasing stats_updates is
570 		 * redundant. Avoid the overhead of the atomic update.
571 		 */
572 		if (!memcg_vmstats_needs_flush(statc->vmstats))
573 			atomic64_add(stats_updates,
574 				     &statc->vmstats->stats_updates);
575 		WRITE_ONCE(statc->stats_updates, 0);
576 	}
577 }
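
/*
 * Illustrative arithmetic (not part of the kernel source), assuming
 * MEMCG_CHARGE_BATCH is 64:
 *
 *	- each CPU accumulates up to 63 pending updates per memcg level before
 *	  propagating them into the per-memcg atomic stats_updates counter;
 *	- memcg_vmstats_needs_flush() only reports true once that atomic
 *	  exceeds 64 * num_online_cpus(), e.g. 512 updates on an 8-CPU system;
 *	- so a reader calling mem_cgroup_flush_stats() may see stats that are
 *	  stale by at most that many updates, or by up to FLUSH_TIME (2s),
 *	  whichever comes first.
 */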
578 
579 static void do_flush_stats(struct mem_cgroup *memcg)
580 {
581 	if (mem_cgroup_is_root(memcg))
582 		WRITE_ONCE(flush_last_time, jiffies_64);
583 
584 	cgroup_rstat_flush(memcg->css.cgroup);
585 }
586 
587 /*
588  * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
589  * @memcg: root of the subtree to flush
590  *
591  * Flushing is serialized by the underlying global rstat lock. There is also a
592  * minimum amount of work to be done even if there are no stat updates to flush.
593  * Hence, we only flush the stats if the updates delta exceeds a threshold. This
594  * avoids unnecessary work and contention on the underlying lock.
595  */
596 void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
597 {
598 	if (mem_cgroup_disabled())
599 		return;
600 
601 	if (!memcg)
602 		memcg = root_mem_cgroup;
603 
604 	if (memcg_vmstats_needs_flush(memcg->vmstats))
605 		do_flush_stats(memcg);
606 }
607 
608 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
609 {
610 	/* Only flush if the periodic flusher is one full cycle late */
611 	if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
612 		mem_cgroup_flush_stats(memcg);
613 }
614 
615 static void flush_memcg_stats_dwork(struct work_struct *w)
616 {
617 	/*
618 	 * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
619 	 * in latency-sensitive paths is as cheap as possible.
620 	 */
621 	do_flush_stats(root_mem_cgroup);
622 	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
623 }
624 
625 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
626 {
627 	long x;
628 	int i = memcg_stats_index(idx);
629 
630 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
631 		return 0;
632 
633 	x = READ_ONCE(memcg->vmstats->state[i]);
634 #ifdef CONFIG_SMP
635 	if (x < 0)
636 		x = 0;
637 #endif
638 	return x;
639 }
640 
641 static int memcg_page_state_unit(int item);
642 
643 /*
644  * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
645  * up non-zero sub-page updates to 1 page as zero page updates are ignored.
646  */
647 static int memcg_state_val_in_pages(int idx, int val)
648 {
649 	int unit = memcg_page_state_unit(idx);
650 
651 	if (!val || unit == PAGE_SIZE)
652 		return val;
653 	else
654 		return max(val * unit / PAGE_SIZE, 1UL);
655 }
656 
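
/*
 * Illustrative examples (not part of the kernel source), assuming
 * PAGE_SIZE == 4096:
 *
 *	- a +300 byte update to NR_SLAB_RECLAIMABLE_B (unit 1) becomes
 *	  max(300 / 4096, 1) = 1 page;
 *	- a +16 update to NR_KERNEL_STACK_KB (unit SZ_1K) becomes
 *	  16 * 1024 / 4096 = 4 pages;
 *	- page-sized items (unit PAGE_SIZE) pass through unchanged.
 */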
657 /**
658  * __mod_memcg_state - update cgroup memory statistics
659  * @memcg: the memory cgroup
660  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
661  * @val: delta to add to the counter, can be negative
662  */
663 void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
664 		       int val)
665 {
666 	int i = memcg_stats_index(idx);
667 
668 	if (mem_cgroup_disabled())
669 		return;
670 
671 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
672 		return;
673 
674 	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
675 	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
676 }
677 
678 /* idx can be of type enum memcg_stat_item or node_stat_item. */
679 unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
680 {
681 	long x;
682 	int i = memcg_stats_index(idx);
683 
684 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
685 		return 0;
686 
687 	x = READ_ONCE(memcg->vmstats->state_local[i]);
688 #ifdef CONFIG_SMP
689 	if (x < 0)
690 		x = 0;
691 #endif
692 	return x;
693 }
694 
695 static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
696 				     enum node_stat_item idx,
697 				     int val)
698 {
699 	struct mem_cgroup_per_node *pn;
700 	struct mem_cgroup *memcg;
701 	int i = memcg_stats_index(idx);
702 
703 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
704 		return;
705 
706 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
707 	memcg = pn->memcg;
708 
709 	/*
710 	 * The callers from rmap rely on disabled preemption because they never
711 	 * update their counters from interrupt context. For those counters we
712 	 * check that the update is never performed from an interrupt context,
713 	 * while other callers need to have interrupts disabled.
714 	 */
715 	__memcg_stats_lock();
716 	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
717 		switch (idx) {
718 		case NR_ANON_MAPPED:
719 		case NR_FILE_MAPPED:
720 		case NR_ANON_THPS:
721 			WARN_ON_ONCE(!in_task());
722 			break;
723 		default:
724 			VM_WARN_ON_IRQS_ENABLED();
725 		}
726 	}
727 
728 	/* Update memcg */
729 	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
730 
731 	/* Update lruvec */
732 	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
733 
734 	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
735 	memcg_stats_unlock();
736 }
737 
738 /**
739  * __mod_lruvec_state - update lruvec memory statistics
740  * @lruvec: the lruvec
741  * @idx: the stat item
742  * @val: delta to add to the counter, can be negative
743  *
744  * The lruvec is the intersection of the NUMA node and a cgroup. This
745  * function updates all three counters that are affected by a
746  * change of state at this level: per-node, per-cgroup, per-lruvec.
747  */
748 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
749 			int val)
750 {
751 	/* Update node */
752 	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
753 
754 	/* Update memcg and lruvec */
755 	if (!mem_cgroup_disabled())
756 		__mod_memcg_lruvec_state(lruvec, idx, val);
757 }
758 
759 void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
760 			     int val)
761 {
762 	struct mem_cgroup *memcg;
763 	pg_data_t *pgdat = folio_pgdat(folio);
764 	struct lruvec *lruvec;
765 
766 	rcu_read_lock();
767 	memcg = folio_memcg(folio);
768 	/* Untracked pages have no memcg, no lruvec. Update only the node */
769 	if (!memcg) {
770 		rcu_read_unlock();
771 		__mod_node_page_state(pgdat, idx, val);
772 		return;
773 	}
774 
775 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
776 	__mod_lruvec_state(lruvec, idx, val);
777 	rcu_read_unlock();
778 }
779 EXPORT_SYMBOL(__lruvec_stat_mod_folio);
780 
781 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
782 {
783 	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
784 	struct mem_cgroup *memcg;
785 	struct lruvec *lruvec;
786 
787 	rcu_read_lock();
788 	memcg = mem_cgroup_from_slab_obj(p);
789 
790 	/*
791 	 * Untracked pages have no memcg, no lruvec. Update only the
792 	 * node. If we reparent the slab objects to the root memcg,
793 	 * when we free the slab object, we need to update the per-memcg
794 	 * vmstats to keep it correct for the root memcg.
795 	 */
796 	if (!memcg) {
797 		__mod_node_page_state(pgdat, idx, val);
798 	} else {
799 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
800 		__mod_lruvec_state(lruvec, idx, val);
801 	}
802 	rcu_read_unlock();
803 }
804 
805 /**
806  * __count_memcg_events - account VM events in a cgroup
807  * @memcg: the memory cgroup
808  * @idx: the event item
809  * @count: the number of events that occurred
810  */
811 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
812 			  unsigned long count)
813 {
814 	int i = memcg_events_index(idx);
815 
816 	if (mem_cgroup_disabled())
817 		return;
818 
819 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
820 		return;
821 
822 	memcg_stats_lock();
823 	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
824 	memcg_rstat_updated(memcg, count);
825 	memcg_stats_unlock();
826 }
827 
828 unsigned long memcg_events(struct mem_cgroup *memcg, int event)
829 {
830 	int i = memcg_events_index(event);
831 
832 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
833 		return 0;
834 
835 	return READ_ONCE(memcg->vmstats->events[i]);
836 }
837 
838 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
839 {
840 	int i = memcg_events_index(event);
841 
842 	if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
843 		return 0;
844 
845 	return READ_ONCE(memcg->vmstats->events_local[i]);
846 }
847 
848 void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
849 {
850 	/* A pagein of a large page is a single event, so ignore the page size */
851 	if (nr_pages > 0)
852 		__count_memcg_events(memcg, PGPGIN, 1);
853 	else {
854 		__count_memcg_events(memcg, PGPGOUT, 1);
855 		nr_pages = -nr_pages; /* for event */
856 	}
857 
858 	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
859 }
860 
861 bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
862 				enum mem_cgroup_events_target target)
863 {
864 	unsigned long val, next;
865 
866 	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
867 	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
868 	/* from time_after() in jiffies.h */
869 	if ((long)(next - val) < 0) {
870 		switch (target) {
871 		case MEM_CGROUP_TARGET_THRESH:
872 			next = val + THRESHOLDS_EVENTS_TARGET;
873 			break;
874 		case MEM_CGROUP_TARGET_SOFTLIMIT:
875 			next = val + SOFTLIMIT_EVENTS_TARGET;
876 			break;
877 		default:
878 			break;
879 		}
880 		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
881 		return true;
882 	}
883 	return false;
884 }
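
/*
 * Illustrative example (not part of the kernel source): with
 * THRESHOLDS_EVENTS_TARGET == 128, suppose the stored target is 900 and
 * nr_page_events has reached 1000. Then (long)(900 - 1000) < 0, so the
 * function fires once and re-arms the target at 1000 + 128 = 1128. The
 * signed subtraction keeps the comparison correct across counter
 * wrap-around, just like time_after() in jiffies.h.
 */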
885 
886 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
887 {
888 	/*
889 	 * mm_update_next_owner() may clear mm->owner to NULL
890 	 * if it races with swapoff, page migration, etc.
891 	 * So this can be called with p == NULL.
892 	 */
893 	if (unlikely(!p))
894 		return NULL;
895 
896 	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
897 }
898 EXPORT_SYMBOL(mem_cgroup_from_task);
899 
900 static __always_inline struct mem_cgroup *active_memcg(void)
901 {
902 	if (!in_task())
903 		return this_cpu_read(int_active_memcg);
904 	else
905 		return current->active_memcg;
906 }
907 
908 /**
909  * get_mem_cgroup_from_mm: Obtain a reference on the given mm_struct's memcg.
910  * @mm: mm from which the memcg should be extracted. It can be NULL.
911  *
912  * Obtain a reference on mm->memcg and return it if successful. If mm
913  * is NULL, then the memcg is chosen as follows:
914  * 1) The active memcg, if set.
915  * 2) current->mm->memcg, if available.
916  * 3) The root memcg.
917  * If mem_cgroup is disabled, NULL is returned.
918  */
919 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
920 {
921 	struct mem_cgroup *memcg;
922 
923 	if (mem_cgroup_disabled())
924 		return NULL;
925 
926 	/*
927 	 * Page cache insertions can happen without an
928 	 * actual mm context, e.g. during disk probing
929 	 * on boot, loopback IO, acct() writes etc.
930 	 *
931 	 * No need to css_get on root memcg as the reference
932 	 * counting is disabled on the root level in the
933 	 * cgroup core. See CSS_NO_REF.
934 	 */
935 	if (unlikely(!mm)) {
936 		memcg = active_memcg();
937 		if (unlikely(memcg)) {
938 			/* remote memcg must hold a ref */
939 			css_get(&memcg->css);
940 			return memcg;
941 		}
942 		mm = current->mm;
943 		if (unlikely(!mm))
944 			return root_mem_cgroup;
945 	}
946 
947 	rcu_read_lock();
948 	do {
949 		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
950 		if (unlikely(!memcg))
951 			memcg = root_mem_cgroup;
952 	} while (!css_tryget(&memcg->css));
953 	rcu_read_unlock();
954 	return memcg;
955 }
956 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
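
/*
 * Illustrative usage sketch (not part of this file): the returned memcg
 * carries a css reference that the caller must drop, e.g.:
 *
 *	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(current->mm);
 *
 *	if (memcg) {
 *		... use memcg ...
 *		css_put(&memcg->css);
 *	}
 *
 * A NULL return only happens when the memory controller is disabled; for the
 * root memcg the css_put() is a no-op (CSS_NO_REF).
 */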
957 
958 /**
959  * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
960  */
961 struct mem_cgroup *get_mem_cgroup_from_current(void)
962 {
963 	struct mem_cgroup *memcg;
964 
965 	if (mem_cgroup_disabled())
966 		return NULL;
967 
968 again:
969 	rcu_read_lock();
970 	memcg = mem_cgroup_from_task(current);
971 	if (!css_tryget(&memcg->css)) {
972 		rcu_read_unlock();
973 		goto again;
974 	}
975 	rcu_read_unlock();
976 	return memcg;
977 }
978 
979 /**
980  * mem_cgroup_iter - iterate over memory cgroup hierarchy
981  * @root: hierarchy root
982  * @prev: previously returned memcg, NULL on first invocation
983  * @reclaim: cookie for shared reclaim walks, NULL for full walks
984  *
985  * Returns references to children of the hierarchy below @root, or
986  * @root itself, or %NULL after a full round-trip.
987  *
988  * Caller must pass the return value in @prev on subsequent
989  * invocations for reference counting, or use mem_cgroup_iter_break()
990  * to cancel a hierarchy walk before the round-trip is complete.
991  *
992  * Reclaimers can specify a node in @reclaim to divide up the memcgs
993  * in the hierarchy among all concurrent reclaimers operating on the
994  * same node.
995  */
996 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
997 				   struct mem_cgroup *prev,
998 				   struct mem_cgroup_reclaim_cookie *reclaim)
999 {
1000 	struct mem_cgroup_reclaim_iter *iter;
1001 	struct cgroup_subsys_state *css = NULL;
1002 	struct mem_cgroup *memcg = NULL;
1003 	struct mem_cgroup *pos = NULL;
1004 
1005 	if (mem_cgroup_disabled())
1006 		return NULL;
1007 
1008 	if (!root)
1009 		root = root_mem_cgroup;
1010 
1011 	rcu_read_lock();
1012 
1013 	if (reclaim) {
1014 		struct mem_cgroup_per_node *mz;
1015 
1016 		mz = root->nodeinfo[reclaim->pgdat->node_id];
1017 		iter = &mz->iter;
1018 
1019 		/*
1020 		 * On start, join the current reclaim iteration cycle.
1021 		 * Exit when a concurrent walker completes it.
1022 		 */
1023 		if (!prev)
1024 			reclaim->generation = iter->generation;
1025 		else if (reclaim->generation != iter->generation)
1026 			goto out_unlock;
1027 
1028 		while (1) {
1029 			pos = READ_ONCE(iter->position);
1030 			if (!pos || css_tryget(&pos->css))
1031 				break;
1032 			/*
1033 			 * css reference reached zero, so iter->position will
1034 			 * be cleared by ->css_released. However, we should not
1035 			 * rely on this happening soon, because ->css_released
1036 			 * is called from a work queue, and by busy-waiting we
1037 			 * might block it. So we clear iter->position right
1038 			 * away.
1039 			 */
1040 			(void)cmpxchg(&iter->position, pos, NULL);
1041 		}
1042 	} else if (prev) {
1043 		pos = prev;
1044 	}
1045 
1046 	if (pos)
1047 		css = &pos->css;
1048 
1049 	for (;;) {
1050 		css = css_next_descendant_pre(css, &root->css);
1051 		if (!css) {
1052 			/*
1053 			 * Reclaimers share the hierarchy walk, and a
1054 			 * new one might jump in right at the end of
1055 			 * the hierarchy - make sure they see at least
1056 			 * one group and restart from the beginning.
1057 			 */
1058 			if (!prev)
1059 				continue;
1060 			break;
1061 		}
1062 
1063 		/*
1064 		 * Verify the css and acquire a reference.  The root
1065 		 * is provided by the caller, so we know it's alive
1066 		 * and kicking, and don't take an extra reference.
1067 		 */
1068 		if (css == &root->css || css_tryget(css)) {
1069 			memcg = mem_cgroup_from_css(css);
1070 			break;
1071 		}
1072 	}
1073 
1074 	if (reclaim) {
1075 		/*
1076 		 * The position could have already been updated by a competing
1077 		 * thread, so check that the value hasn't changed since we read
1078 		 * it to avoid reclaiming from the same cgroup twice.
1079 		 */
1080 		(void)cmpxchg(&iter->position, pos, memcg);
1081 
1082 		if (pos)
1083 			css_put(&pos->css);
1084 
1085 		if (!memcg)
1086 			iter->generation++;
1087 	}
1088 
1089 out_unlock:
1090 	rcu_read_unlock();
1091 	if (prev && prev != root)
1092 		css_put(&prev->css);
1093 
1094 	return memcg;
1095 }
1096 
1097 /**
1098  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1099  * @root: hierarchy root
1100  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1101  */
1102 void mem_cgroup_iter_break(struct mem_cgroup *root,
1103 			   struct mem_cgroup *prev)
1104 {
1105 	if (!root)
1106 		root = root_mem_cgroup;
1107 	if (prev && prev != root)
1108 		css_put(&prev->css);
1109 }
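
/*
 * Illustrative usage sketch (not part of this file): the usual walker pattern
 * feeds the previous return value back in, and uses mem_cgroup_iter_break()
 * to bail out early; visit_memcg() is a hypothetical per-memcg helper:
 *
 *	struct mem_cgroup *iter = mem_cgroup_iter(root, NULL, NULL);
 *
 *	while (iter) {
 *		if (visit_memcg(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *		iter = mem_cgroup_iter(root, iter, NULL);
 *	}
 */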
1110 
1111 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1112 					struct mem_cgroup *dead_memcg)
1113 {
1114 	struct mem_cgroup_reclaim_iter *iter;
1115 	struct mem_cgroup_per_node *mz;
1116 	int nid;
1117 
1118 	for_each_node(nid) {
1119 		mz = from->nodeinfo[nid];
1120 		iter = &mz->iter;
1121 		cmpxchg(&iter->position, dead_memcg, NULL);
1122 	}
1123 }
1124 
1125 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1126 {
1127 	struct mem_cgroup *memcg = dead_memcg;
1128 	struct mem_cgroup *last;
1129 
1130 	do {
1131 		__invalidate_reclaim_iterators(memcg, dead_memcg);
1132 		last = memcg;
1133 	} while ((memcg = parent_mem_cgroup(memcg)));
1134 
1135 	/*
1136 	 * When cgroup1 non-hierarchy mode is used,
1137 	 * parent_mem_cgroup() does not walk all the way up to the
1138 	 * cgroup root (root_mem_cgroup). So we have to handle
1139 	 * dead_memcg from cgroup root separately.
1140 	 */
1141 	if (!mem_cgroup_is_root(last))
1142 		__invalidate_reclaim_iterators(root_mem_cgroup,
1143 						dead_memcg);
1144 }
1145 
1146 /**
1147  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1148  * @memcg: hierarchy root
1149  * @fn: function to call for each task
1150  * @arg: argument passed to @fn
1151  *
1152  * This function iterates over tasks attached to @memcg or to any of its
1153  * descendants and calls @fn for each task. If @fn returns a non-zero
1154  * value, the function breaks the iteration loop. Otherwise, it will iterate
1155  * over all tasks and return 0.
1156  *
1157  * This function must not be called for the root memory cgroup.
1158  */
1159 void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1160 			   int (*fn)(struct task_struct *, void *), void *arg)
1161 {
1162 	struct mem_cgroup *iter;
1163 	int ret = 0;
1164 
1165 	BUG_ON(mem_cgroup_is_root(memcg));
1166 
1167 	for_each_mem_cgroup_tree(iter, memcg) {
1168 		struct css_task_iter it;
1169 		struct task_struct *task;
1170 
1171 		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1172 		while (!ret && (task = css_task_iter_next(&it)))
1173 			ret = fn(task, arg);
1174 		css_task_iter_end(&it);
1175 		if (ret) {
1176 			mem_cgroup_iter_break(memcg, iter);
1177 			break;
1178 		}
1179 	}
1180 }
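
/*
 * Illustrative usage sketch (not part of this file); dump_task() is a
 * hypothetical callback. Returning non-zero from the callback stops the walk:
 *
 *	static int dump_task(struct task_struct *task, void *arg)
 *	{
 *		pr_info("memcg task: %s (%d)\n", task->comm, task->pid);
 *		return 0;
 *	}
 *
 *	mem_cgroup_scan_tasks(memcg, dump_task, NULL);
 */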
1181 
1182 #ifdef CONFIG_DEBUG_VM
1183 void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
1184 {
1185 	struct mem_cgroup *memcg;
1186 
1187 	if (mem_cgroup_disabled())
1188 		return;
1189 
1190 	memcg = folio_memcg(folio);
1191 
1192 	if (!memcg)
1193 		VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
1194 	else
1195 		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
1196 }
1197 #endif
1198 
1199 /**
1200  * folio_lruvec_lock - Lock the lruvec for a folio.
1201  * @folio: Pointer to the folio.
1202  *
1203  * These functions are safe to use under any of the following conditions:
1204  * - folio locked
1205  * - folio_test_lru false
1206  * - folio_memcg_lock()
1207  * - folio frozen (refcount of 0)
1208  *
1209  * Return: The lruvec this folio is on with its lock held.
1210  */
1211 struct lruvec *folio_lruvec_lock(struct folio *folio)
1212 {
1213 	struct lruvec *lruvec = folio_lruvec(folio);
1214 
1215 	spin_lock(&lruvec->lru_lock);
1216 	lruvec_memcg_debug(lruvec, folio);
1217 
1218 	return lruvec;
1219 }
1220 
1221 /**
1222  * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1223  * @folio: Pointer to the folio.
1224  *
1225  * These functions are safe to use under any of the following conditions:
1226  * - folio locked
1227  * - folio_test_lru false
1228  * - folio_memcg_lock()
1229  * - folio frozen (refcount of 0)
1230  *
1231  * Return: The lruvec this folio is on with its lock held and interrupts
1232  * disabled.
1233  */
1234 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1235 {
1236 	struct lruvec *lruvec = folio_lruvec(folio);
1237 
1238 	spin_lock_irq(&lruvec->lru_lock);
1239 	lruvec_memcg_debug(lruvec, folio);
1240 
1241 	return lruvec;
1242 }
1243 
1244 /**
1245  * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1246  * @folio: Pointer to the folio.
1247  * @flags: Pointer to irqsave flags.
1248  *
1249  * These functions are safe to use under any of the following conditions:
1250  * - folio locked
1251  * - folio_test_lru false
1252  * - folio_memcg_lock()
1253  * - folio frozen (refcount of 0)
1254  *
1255  * Return: The lruvec this folio is on with its lock held and interrupts
1256  * disabled.
1257  */
1258 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1259 		unsigned long *flags)
1260 {
1261 	struct lruvec *lruvec = folio_lruvec(folio);
1262 
1263 	spin_lock_irqsave(&lruvec->lru_lock, *flags);
1264 	lruvec_memcg_debug(lruvec, folio);
1265 
1266 	return lruvec;
1267 }
1268 
1269 /**
1270  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1271  * @lruvec: mem_cgroup per zone lru vector
1272  * @lru: index of lru list the page is sitting on
1273  * @zid: zone id of the accounted pages
1274  * @nr_pages: positive when adding or negative when removing
1275  *
1276  * This function must be called under lru_lock, just before a page is added
1277  * to or just after a page is removed from an lru list.
1278  */
1279 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1280 				int zid, int nr_pages)
1281 {
1282 	struct mem_cgroup_per_node *mz;
1283 	unsigned long *lru_size;
1284 	long size;
1285 
1286 	if (mem_cgroup_disabled())
1287 		return;
1288 
1289 	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1290 	lru_size = &mz->lru_zone_size[zid][lru];
1291 
1292 	if (nr_pages < 0)
1293 		*lru_size += nr_pages;
1294 
1295 	size = *lru_size;
1296 	if (WARN_ONCE(size < 0,
1297 		"%s(%p, %d, %d): lru_size %ld\n",
1298 		__func__, lruvec, lru, nr_pages, size)) {
1299 		VM_BUG_ON(1);
1300 		*lru_size = 0;
1301 	}
1302 
1303 	if (nr_pages > 0)
1304 		*lru_size += nr_pages;
1305 }
1306 
1307 /**
1308  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1309  * @memcg: the memory cgroup
1310  *
1311  * Returns the maximum amount of memory @memcg can be charged with, in
1312  * pages.
1313  */
1314 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1315 {
1316 	unsigned long margin = 0;
1317 	unsigned long count;
1318 	unsigned long limit;
1319 
1320 	count = page_counter_read(&memcg->memory);
1321 	limit = READ_ONCE(memcg->memory.max);
1322 	if (count < limit)
1323 		margin = limit - count;
1324 
1325 	if (do_memsw_account()) {
1326 		count = page_counter_read(&memcg->memsw);
1327 		limit = READ_ONCE(memcg->memsw.max);
1328 		if (count < limit)
1329 			margin = min(margin, limit - count);
1330 		else
1331 			margin = 0;
1332 	}
1333 
1334 	return margin;
1335 }
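
/*
 * Illustrative arithmetic (not part of the kernel source): with
 * memory.max == 1000 pages and a usage of 900 pages, the margin is 100
 * pages. If memsw accounting is active with memsw.max == 1200 and a
 * memsw usage of 1150, the margin shrinks to min(100, 50) = 50 pages;
 * if the memsw usage has already hit its limit, the margin is 0.
 */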
1336 
1337 struct memory_stat {
1338 	const char *name;
1339 	unsigned int idx;
1340 };
1341 
1342 static const struct memory_stat memory_stats[] = {
1343 	{ "anon",			NR_ANON_MAPPED			},
1344 	{ "file",			NR_FILE_PAGES			},
1345 	{ "kernel",			MEMCG_KMEM			},
1346 	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
1347 	{ "pagetables",			NR_PAGETABLE			},
1348 	{ "sec_pagetables",		NR_SECONDARY_PAGETABLE		},
1349 	{ "percpu",			MEMCG_PERCPU_B			},
1350 	{ "sock",			MEMCG_SOCK			},
1351 	{ "vmalloc",			MEMCG_VMALLOC			},
1352 	{ "shmem",			NR_SHMEM			},
1353 #ifdef CONFIG_ZSWAP
1354 	{ "zswap",			MEMCG_ZSWAP_B			},
1355 	{ "zswapped",			MEMCG_ZSWAPPED			},
1356 #endif
1357 	{ "file_mapped",		NR_FILE_MAPPED			},
1358 	{ "file_dirty",			NR_FILE_DIRTY			},
1359 	{ "file_writeback",		NR_WRITEBACK			},
1360 #ifdef CONFIG_SWAP
1361 	{ "swapcached",			NR_SWAPCACHE			},
1362 #endif
1363 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1364 	{ "anon_thp",			NR_ANON_THPS			},
1365 	{ "file_thp",			NR_FILE_THPS			},
1366 	{ "shmem_thp",			NR_SHMEM_THPS			},
1367 #endif
1368 	{ "inactive_anon",		NR_INACTIVE_ANON		},
1369 	{ "active_anon",		NR_ACTIVE_ANON			},
1370 	{ "inactive_file",		NR_INACTIVE_FILE		},
1371 	{ "active_file",		NR_ACTIVE_FILE			},
1372 	{ "unevictable",		NR_UNEVICTABLE			},
1373 	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
1374 	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},
1375 
1376 	/* The memory events */
1377 	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
1378 	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
1379 	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
1380 	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
1381 	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
1382 	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
1383 	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
1384 };
1385 
1386 /* The actual unit of the state item, not the same as the output unit */
1387 static int memcg_page_state_unit(int item)
1388 {
1389 	switch (item) {
1390 	case MEMCG_PERCPU_B:
1391 	case MEMCG_ZSWAP_B:
1392 	case NR_SLAB_RECLAIMABLE_B:
1393 	case NR_SLAB_UNRECLAIMABLE_B:
1394 		return 1;
1395 	case NR_KERNEL_STACK_KB:
1396 		return SZ_1K;
1397 	default:
1398 		return PAGE_SIZE;
1399 	}
1400 }
1401 
1402 /* Translate stat items to the correct unit for memory.stat output */
1403 static int memcg_page_state_output_unit(int item)
1404 {
1405 	/*
1406 	 * Workingset state is actually in pages, but we export it to userspace
1407 	 * as a scalar count of events, so special case it here.
1408 	 */
1409 	switch (item) {
1410 	case WORKINGSET_REFAULT_ANON:
1411 	case WORKINGSET_REFAULT_FILE:
1412 	case WORKINGSET_ACTIVATE_ANON:
1413 	case WORKINGSET_ACTIVATE_FILE:
1414 	case WORKINGSET_RESTORE_ANON:
1415 	case WORKINGSET_RESTORE_FILE:
1416 	case WORKINGSET_NODERECLAIM:
1417 		return 1;
1418 	default:
1419 		return memcg_page_state_unit(item);
1420 	}
1421 }
1422 
1423 unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
1424 {
1425 	return memcg_page_state(memcg, item) *
1426 		memcg_page_state_output_unit(item);
1427 }
1428 
1429 unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
1430 {
1431 	return memcg_page_state_local(memcg, item) *
1432 		memcg_page_state_output_unit(item);
1433 }
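
/*
 * Illustrative examples (not part of the kernel source) of the unit
 * translation done for memory.stat output:
 *
 *	- NR_FILE_PAGES is tracked in pages and reported in bytes
 *	  (count * PAGE_SIZE);
 *	- NR_KERNEL_STACK_KB is tracked in KB and reported in bytes
 *	  (count * SZ_1K);
 *	- NR_SLAB_RECLAIMABLE_B and MEMCG_ZSWAP_B are already in bytes;
 *	- the WORKINGSET_* items are reported as raw event counts (unit 1).
 */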
1434 
1435 static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1436 {
1437 	int i;
1438 
1439 	/*
1440 	 * Provide statistics on the state of the memory subsystem as
1441 	 * well as cumulative event counters that show past behavior.
1442 	 *
1443 	 * This list is ordered following a combination of these gradients:
1444 	 * 1) generic big picture -> specifics and details
1445 	 * 2) reflecting userspace activity -> reflecting kernel heuristics
1446 	 *
1447 	 * Current memory state:
1448 	 */
1449 	mem_cgroup_flush_stats(memcg);
1450 
1451 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1452 		u64 size;
1453 
1454 		size = memcg_page_state_output(memcg, memory_stats[i].idx);
1455 		seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
1456 
1457 		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1458 			size += memcg_page_state_output(memcg,
1459 							NR_SLAB_RECLAIMABLE_B);
1460 			seq_buf_printf(s, "slab %llu\n", size);
1461 		}
1462 	}
1463 
1464 	/* Accumulated memory events */
1465 	seq_buf_printf(s, "pgscan %lu\n",
1466 		       memcg_events(memcg, PGSCAN_KSWAPD) +
1467 		       memcg_events(memcg, PGSCAN_DIRECT) +
1468 		       memcg_events(memcg, PGSCAN_KHUGEPAGED));
1469 	seq_buf_printf(s, "pgsteal %lu\n",
1470 		       memcg_events(memcg, PGSTEAL_KSWAPD) +
1471 		       memcg_events(memcg, PGSTEAL_DIRECT) +
1472 		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));
1473 
1474 	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
1475 		if (memcg_vm_event_stat[i] == PGPGIN ||
1476 		    memcg_vm_event_stat[i] == PGPGOUT)
1477 			continue;
1478 
1479 		seq_buf_printf(s, "%s %lu\n",
1480 			       vm_event_name(memcg_vm_event_stat[i]),
1481 			       memcg_events(memcg, memcg_vm_event_stat[i]));
1482 	}
1483 }
1484 
1485 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1486 {
1487 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1488 		memcg_stat_format(memcg, s);
1489 	else
1490 		memcg1_stat_format(memcg, s);
1491 	if (seq_buf_has_overflowed(s))
1492 		pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
1493 }
1494 
1495 /**
1496  * mem_cgroup_print_oom_context: Print OOM information relevant to
1497  * memory controller.
1498  * @memcg: The memory cgroup that went over limit
1499  * @p: Task that is going to be killed
1500  *
1501  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1502  * enabled
1503  */
1504 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1505 {
1506 	rcu_read_lock();
1507 
1508 	if (memcg) {
1509 		pr_cont(",oom_memcg=");
1510 		pr_cont_cgroup_path(memcg->css.cgroup);
1511 	} else
1512 		pr_cont(",global_oom");
1513 	if (p) {
1514 		pr_cont(",task_memcg=");
1515 		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1516 	}
1517 	rcu_read_unlock();
1518 }
1519 
1520 /**
1521  * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1522  * memory controller.
1523  * @memcg: The memory cgroup that went over limit
1524  */
1525 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1526 {
1527 	/* Use a static buffer, as the caller is holding oom_lock. */
1528 	static char buf[PAGE_SIZE];
1529 	struct seq_buf s;
1530 
1531 	lockdep_assert_held(&oom_lock);
1532 
1533 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1534 		K((u64)page_counter_read(&memcg->memory)),
1535 		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1536 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1537 		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1538 			K((u64)page_counter_read(&memcg->swap)),
1539 			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1540 #ifdef CONFIG_MEMCG_V1
1541 	else {
1542 		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1543 			K((u64)page_counter_read(&memcg->memsw)),
1544 			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1545 		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1546 			K((u64)page_counter_read(&memcg->kmem)),
1547 			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1548 	}
1549 #endif
1550 
1551 	pr_info("Memory cgroup stats for ");
1552 	pr_cont_cgroup_path(memcg->css.cgroup);
1553 	pr_cont(":");
1554 	seq_buf_init(&s, buf, sizeof(buf));
1555 	memory_stat_format(memcg, &s);
1556 	seq_buf_do_printk(&s, KERN_INFO);
1557 }
1558 
1559 /*
1560  * Return the memory (and swap, if configured) limit for a memcg.
1561  */
1562 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1563 {
1564 	unsigned long max = READ_ONCE(memcg->memory.max);
1565 
1566 	if (do_memsw_account()) {
1567 		if (mem_cgroup_swappiness(memcg)) {
1568 			/* Calculate swap excess capacity from memsw limit */
1569 			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1570 
1571 			max += min(swap, (unsigned long)total_swap_pages);
1572 		}
1573 	} else {
1574 		if (mem_cgroup_swappiness(memcg))
1575 			max += min(READ_ONCE(memcg->swap.max),
1576 				   (unsigned long)total_swap_pages);
1577 	}
1578 	return max;
1579 }
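
/*
 * Illustrative arithmetic (not part of the kernel source), assuming 4 KiB
 * pages: with memory.max == 262144 pages (1 GiB) and, under memsw
 * accounting, memsw.max == 393216 pages (1.5 GiB), the swap excess is
 * 131072 pages, so the returned limit is 262144 + min(131072,
 * total_swap_pages). With swappiness == 0 no swap headroom is added at
 * all. On the default hierarchy the swap headroom comes from swap.max
 * instead of the memsw excess.
 */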
1580 
1581 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1582 {
1583 	return page_counter_read(&memcg->memory);
1584 }
1585 
1586 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1587 				     int order)
1588 {
1589 	struct oom_control oc = {
1590 		.zonelist = NULL,
1591 		.nodemask = NULL,
1592 		.memcg = memcg,
1593 		.gfp_mask = gfp_mask,
1594 		.order = order,
1595 	};
1596 	bool ret = true;
1597 
1598 	if (mutex_lock_killable(&oom_lock))
1599 		return true;
1600 
1601 	if (mem_cgroup_margin(memcg) >= (1 << order))
1602 		goto unlock;
1603 
1604 	/*
1605 	 * A few threads which were not waiting at mutex_lock_killable() can
1606 	 * fail to bail out. Therefore, check again after holding oom_lock.
1607 	 */
1608 	ret = task_is_dying() || out_of_memory(&oc);
1609 
1610 unlock:
1611 	mutex_unlock(&oom_lock);
1612 	return ret;
1613 }
1614 
1615 /*
1616  * Returns true if it successfully killed one or more processes, though in
1617  * some corner cases it can return true even without killing any process.
1618  */
1619 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1620 {
1621 	bool locked, ret;
1622 
1623 	if (order > PAGE_ALLOC_COSTLY_ORDER)
1624 		return false;
1625 
1626 	memcg_memory_event(memcg, MEMCG_OOM);
1627 
1628 	if (!memcg1_oom_prepare(memcg, &locked))
1629 		return false;
1630 
1631 	ret = mem_cgroup_out_of_memory(memcg, mask, order);
1632 
1633 	memcg1_oom_finish(memcg, locked);
1634 
1635 	return ret;
1636 }
1637 
1638 /**
1639  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1640  * @victim: task to be killed by the OOM killer
1641  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1642  *
1643  * Returns a pointer to a memory cgroup, which has to be cleaned up
1644  * by killing all of the OOM-killable tasks belonging to it.
1645  *
1646  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1647  */
1648 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1649 					    struct mem_cgroup *oom_domain)
1650 {
1651 	struct mem_cgroup *oom_group = NULL;
1652 	struct mem_cgroup *memcg;
1653 
1654 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1655 		return NULL;
1656 
1657 	if (!oom_domain)
1658 		oom_domain = root_mem_cgroup;
1659 
1660 	rcu_read_lock();
1661 
1662 	memcg = mem_cgroup_from_task(victim);
1663 	if (mem_cgroup_is_root(memcg))
1664 		goto out;
1665 
1666 	/*
1667 	 * If the victim task has been asynchronously moved to a different
1668 	 * memory cgroup, we might end up killing tasks outside oom_domain.
1669 	 * In this case it's better to ignore memory.oom.group.
1670 	 */
1671 	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
1672 		goto out;
1673 
1674 	/*
1675 	 * Traverse the memory cgroup hierarchy from the victim task's
1676 	 * cgroup up to the OOMing cgroup (or root) to find the
1677 	 * highest-level memory cgroup with oom.group set.
1678 	 */
1679 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1680 		if (READ_ONCE(memcg->oom_group))
1681 			oom_group = memcg;
1682 
1683 		if (memcg == oom_domain)
1684 			break;
1685 	}
1686 
1687 	if (oom_group)
1688 		css_get(&oom_group->css);
1689 out:
1690 	rcu_read_unlock();
1691 
1692 	return oom_group;
1693 }
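
/*
 * Illustrative example (not part of the kernel source): for a victim task in
 * A/B/C with oom_domain == A and memory.oom.group set only on B, the walk
 * visits C (not set), B (oom_group = B) and A (== oom_domain, stop), so B is
 * returned. If A also has oom.group set, A wins as the highest-level match.
 */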
1694 
1695 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1696 {
1697 	pr_info("Tasks in ");
1698 	pr_cont_cgroup_path(memcg->css.cgroup);
1699 	pr_cont(" are going to be killed due to memory.oom.group set\n");
1700 }
1701 
1702 struct memcg_stock_pcp {
1703 	local_lock_t stock_lock;
1704 	struct mem_cgroup *cached; /* this is never the root cgroup */
1705 	unsigned int nr_pages;
1706 
1707 	struct obj_cgroup *cached_objcg;
1708 	struct pglist_data *cached_pgdat;
1709 	unsigned int nr_bytes;
1710 	int nr_slab_reclaimable_b;
1711 	int nr_slab_unreclaimable_b;
1712 
1713 	struct work_struct work;
1714 	unsigned long flags;
1715 #define FLUSHING_CACHED_CHARGE	0
1716 };
1717 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
1718 	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
1719 };
1720 static DEFINE_MUTEX(percpu_charge_mutex);
1721 
1722 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
1723 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
1724 				     struct mem_cgroup *root_memcg);
1725 
1726 /**
1727  * consume_stock: Try to consume stocked charge on this cpu.
1728  * @memcg: memcg to consume from.
1729  * @nr_pages: how many pages to charge.
1730  *
1731  * The charges will only happen if @memcg matches the current cpu's memcg
1732  * stock, and at least @nr_pages are available in that stock.  Failure to
1733  * service an allocation will refill the stock.
1734  *
1735  * Returns true if successful, false otherwise.
1736  */
1737 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1738 {
1739 	struct memcg_stock_pcp *stock;
1740 	unsigned int stock_pages;
1741 	unsigned long flags;
1742 	bool ret = false;
1743 
1744 	if (nr_pages > MEMCG_CHARGE_BATCH)
1745 		return ret;
1746 
1747 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
1748 
1749 	stock = this_cpu_ptr(&memcg_stock);
1750 	stock_pages = READ_ONCE(stock->nr_pages);
1751 	if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
1752 		WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
1753 		ret = true;
1754 	}
1755 
1756 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1757 
1758 	return ret;
1759 }
1760 
1761 /*
1762  * Uncharge the stock cached in the per-CPU area and reset the cached information.
1763  */
1764 static void drain_stock(struct memcg_stock_pcp *stock)
1765 {
1766 	unsigned int stock_pages = READ_ONCE(stock->nr_pages);
1767 	struct mem_cgroup *old = READ_ONCE(stock->cached);
1768 
1769 	if (!old)
1770 		return;
1771 
1772 	if (stock_pages) {
1773 		page_counter_uncharge(&old->memory, stock_pages);
1774 		if (do_memsw_account())
1775 			page_counter_uncharge(&old->memsw, stock_pages);
1776 
1777 		WRITE_ONCE(stock->nr_pages, 0);
1778 	}
1779 
1780 	css_put(&old->css);
1781 	WRITE_ONCE(stock->cached, NULL);
1782 }
1783 
1784 static void drain_local_stock(struct work_struct *dummy)
1785 {
1786 	struct memcg_stock_pcp *stock;
1787 	struct obj_cgroup *old = NULL;
1788 	unsigned long flags;
1789 
1790 	/*
1791 	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
1792 	 * drain_stock races is that we always operate on the local CPU stock
1793 	 * here with IRQs disabled.
1794 	 */
1795 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
1796 
1797 	stock = this_cpu_ptr(&memcg_stock);
1798 	old = drain_obj_stock(stock);
1799 	drain_stock(stock);
1800 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1801 
1802 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1803 	obj_cgroup_put(old);
1804 }
1805 
1806 /*
1807  * Cache @nr_pages worth of charges in the local per-CPU area.
1808  * They will be consumed by consume_stock() later.
1809  */
1810 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1811 {
1812 	struct memcg_stock_pcp *stock;
1813 	unsigned int stock_pages;
1814 
1815 	stock = this_cpu_ptr(&memcg_stock);
1816 	if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
1817 		drain_stock(stock);
1818 		css_get(&memcg->css);
1819 		WRITE_ONCE(stock->cached, memcg);
1820 	}
1821 	stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
1822 	WRITE_ONCE(stock->nr_pages, stock_pages);
1823 
1824 	if (stock_pages > MEMCG_CHARGE_BATCH)
1825 		drain_stock(stock);
1826 }
1827 
1828 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1829 {
1830 	unsigned long flags;
1831 
1832 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
1833 	__refill_stock(memcg, nr_pages);
1834 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
1835 }
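
/*
 * Illustrative sketch of how the stock batches charges (not part of the
 * kernel source), assuming MEMCG_CHARGE_BATCH is 64: the charge path (see
 * try_charge_memcg() later in this file) first tries consume_stock(); on a
 * miss it charges the page counters with a full batch of 64 pages and hands
 * the surplus back via refill_stock(). A task faulting in pages one at a
 * time therefore touches the global page counters roughly once per 64 pages
 * instead of once per page.
 */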
1836 
1837 /*
1838  * Drain all per-CPU charge caches for the given root_memcg and the whole
1839  * subtree of the hierarchy under it.
1840  */
1841 void drain_all_stock(struct mem_cgroup *root_memcg)
1842 {
1843 	int cpu, curcpu;
1844 
1845 	/* If someone's already draining, avoid adding more workers. */
1846 	if (!mutex_trylock(&percpu_charge_mutex))
1847 		return;
1848 	/*
1849 	 * Notify other cpus that a system-wide "drain" is running.
1850 	 * We do not care about races with cpu hotplug because both cpu-down
1851 	 * and the workers from this path always operate on the local
1852 	 * per-cpu data. CPU-up doesn't touch memcg_stock at all.
1853 	 */
1854 	migrate_disable();
1855 	curcpu = smp_processor_id();
1856 	for_each_online_cpu(cpu) {
1857 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1858 		struct mem_cgroup *memcg;
1859 		bool flush = false;
1860 
1861 		rcu_read_lock();
1862 		memcg = READ_ONCE(stock->cached);
1863 		if (memcg && READ_ONCE(stock->nr_pages) &&
1864 		    mem_cgroup_is_descendant(memcg, root_memcg))
1865 			flush = true;
1866 		else if (obj_stock_flush_required(stock, root_memcg))
1867 			flush = true;
1868 		rcu_read_unlock();
1869 
1870 		if (flush &&
1871 		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1872 			if (cpu == curcpu)
1873 				drain_local_stock(&stock->work);
1874 			else if (!cpu_is_isolated(cpu))
1875 				schedule_work_on(cpu, &stock->work);
1876 		}
1877 	}
1878 	migrate_enable();
1879 	mutex_unlock(&percpu_charge_mutex);
1880 }
1881 
1882 static int memcg_hotplug_cpu_dead(unsigned int cpu)
1883 {
1884 	struct memcg_stock_pcp *stock;
1885 
1886 	stock = &per_cpu(memcg_stock, cpu);
1887 	drain_stock(stock);
1888 
1889 	return 0;
1890 }
1891 
1892 static unsigned long reclaim_high(struct mem_cgroup *memcg,
1893 				  unsigned int nr_pages,
1894 				  gfp_t gfp_mask)
1895 {
1896 	unsigned long nr_reclaimed = 0;
1897 
1898 	do {
1899 		unsigned long pflags;
1900 
1901 		if (page_counter_read(&memcg->memory) <=
1902 		    READ_ONCE(memcg->memory.high))
1903 			continue;
1904 
1905 		memcg_memory_event(memcg, MEMCG_HIGH);
1906 
1907 		psi_memstall_enter(&pflags);
1908 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
1909 							gfp_mask,
1910 							MEMCG_RECLAIM_MAY_SWAP,
1911 							NULL);
1912 		psi_memstall_leave(&pflags);
1913 	} while ((memcg = parent_mem_cgroup(memcg)) &&
1914 		 !mem_cgroup_is_root(memcg));
1915 
1916 	return nr_reclaimed;
1917 }
1918 
1919 static void high_work_func(struct work_struct *work)
1920 {
1921 	struct mem_cgroup *memcg;
1922 
1923 	memcg = container_of(work, struct mem_cgroup, high_work);
1924 	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
1925 }
1926 
1927 /*
1928  * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
1929  * long enough to cause a significant slowdown in most cases, while still
1930  * allowing diagnostics and tracing to proceed without becoming stuck.
1931  */
1932 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
1933 
1934 /*
1935  * When calculating the delay, we use these on either side of the exponentiation
1936  * to maintain precision and scale to a reasonable number of jiffies (see the
1937  * table below).
1938  *
1939  * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
1940  *   overage ratio to a delay.
1941  * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
1942  *   proposed penalty in order to reduce to a reasonable number of jiffies, and
1943  *   to produce a reasonable delay curve.
1944  *
1945  * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
1946  * reasonable delay curve compared to precision-adjusted overage, not
1947  * penalising heavily at first, but still making sure that growth beyond the
1948  * limit penalises misbehaving cgroups by slowing them down exponentially. For
1949  * example, with a high of 100 megabytes:
1950  *
1951  *  +-------+------------------------+
1952  *  | usage | time to allocate in ms |
1953  *  +-------+------------------------+
1954  *  | 100M  |                      0 |
1955  *  | 101M  |                      6 |
1956  *  | 102M  |                     25 |
1957  *  | 103M  |                     57 |
1958  *  | 104M  |                    102 |
1959  *  | 105M  |                    159 |
1960  *  | 106M  |                    230 |
1961  *  | 107M  |                    313 |
1962  *  | 108M  |                    409 |
1963  *  | 109M  |                    518 |
1964  *  | 110M  |                    639 |
1965  *  | 111M  |                    774 |
1966  *  | 112M  |                    921 |
1967  *  | 113M  |                   1081 |
1968  *  | 114M  |                   1254 |
1969  *  | 115M  |                   1439 |
1970  *  | 116M  |                   1638 |
1971  *  | 117M  |                   1849 |
1972  *  | 118M  |                   2000 |
1973  *  | 119M  |                   2000 |
1974  *  | 120M  |                   2000 |
1975  *  +-------+------------------------+
1976  */
1977 #define MEMCG_DELAY_PRECISION_SHIFT 20
1978 #define MEMCG_DELAY_SCALING_SHIFT 14
1979 
1980 static u64 calculate_overage(unsigned long usage, unsigned long high)
1981 {
1982 	u64 overage;
1983 
1984 	if (usage <= high)
1985 		return 0;
1986 
1987 	/*
1988 	 * Prevent division by 0 in overage calculation by acting as if
1989 	 * it was a threshold of 1 page
1990 	 */
1991 	high = max(high, 1UL);
1992 
1993 	overage = usage - high;
1994 	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
1995 	return div64_u64(overage, high);
1996 }
1997 
1998 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
1999 {
2000 	u64 overage, max_overage = 0;
2001 
2002 	do {
2003 		overage = calculate_overage(page_counter_read(&memcg->memory),
2004 					    READ_ONCE(memcg->memory.high));
2005 		max_overage = max(overage, max_overage);
2006 	} while ((memcg = parent_mem_cgroup(memcg)) &&
2007 		 !mem_cgroup_is_root(memcg));
2008 
2009 	return max_overage;
2010 }
2011 
2012 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2013 {
2014 	u64 overage, max_overage = 0;
2015 
2016 	do {
2017 		overage = calculate_overage(page_counter_read(&memcg->swap),
2018 					    READ_ONCE(memcg->swap.high));
2019 		if (overage)
2020 			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2021 		max_overage = max(overage, max_overage);
2022 	} while ((memcg = parent_mem_cgroup(memcg)) &&
2023 		 !mem_cgroup_is_root(memcg));
2024 
2025 	return max_overage;
2026 }
2027 
2028 /*
2029  * Get the number of jiffies that we should penalise a mischievous cgroup which
2030  * is exceeding its memory.high by checking both it and its ancestors.
2031  */
2032 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2033 					  unsigned int nr_pages,
2034 					  u64 max_overage)
2035 {
2036 	unsigned long penalty_jiffies;
2037 
2038 	if (!max_overage)
2039 		return 0;
2040 
2041 	/*
2042 	 * We use overage compared to memory.high to calculate the number of
2043 	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
2044 	 * fairly lenient on small overages, and increasingly harsh when the
2045 	 * memcg in question makes it clear that it has no intention of stopping
2046 	 * its crazy behaviour, so we exponentially increase the delay based on
2047 	 * overage amount.
2048 	 */
2049 	penalty_jiffies = max_overage * max_overage * HZ;
2050 	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2051 	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2052 
2053 	/*
2054 	 * Factor in the task's own contribution to the overage, such that four
2055 	 * N-sized allocations are throttled approximately the same as one
2056 	 * 4N-sized allocation.
2057 	 *
2058 	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2059 	 * larger the current charge batch is than that.
2060 	 */
2061 	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2062 }
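
/*
 * Worked example (illustrative; assumes 4K pages, HZ == 1000 and
 * nr_pages == MEMCG_CHARGE_BATCH) reproducing the 110M row of the table
 * above with memory.high = 100M:
 *
 *	usage - high = 10M = 2560 pages
 *	overage      = (2560 << 20) / 25600             = 104857
 *	penalty      = 104857 * 104857 * HZ >> 20 >> 14 = 639 jiffies
 *
 * i.e. roughly 639ms of throttling per allocation batch, matching the
 * table entry for 110M.
 */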
2063 
2064 /*
2065  * Reclaims memory over the high limit. Called directly from
2066  * try_charge() (context permitting), as well as from the userland
2067  * return path where reclaim is always able to block.
2068  */
2069 void mem_cgroup_handle_over_high(gfp_t gfp_mask)
2070 {
2071 	unsigned long penalty_jiffies;
2072 	unsigned long pflags;
2073 	unsigned long nr_reclaimed;
2074 	unsigned int nr_pages = current->memcg_nr_pages_over_high;
2075 	int nr_retries = MAX_RECLAIM_RETRIES;
2076 	struct mem_cgroup *memcg;
2077 	bool in_retry = false;
2078 
2079 	if (likely(!nr_pages))
2080 		return;
2081 
2082 	memcg = get_mem_cgroup_from_mm(current->mm);
2083 	current->memcg_nr_pages_over_high = 0;
2084 
2085 retry_reclaim:
2086 	/*
2087 	 * Bail if the task is already exiting. Unlike memory.max,
2088 	 * memory.high enforcement isn't as strict, and there is no
2089 	 * OOM killer involved, which means the excess could already
2090 	 * be much bigger (and still growing) than it could for
2091 	 * memory.max; the dying task could get stuck in fruitless
2092 	 * reclaim for a long time, which isn't desirable.
2093 	 */
2094 	if (task_is_dying())
2095 		goto out;
2096 
2097 	/*
2098 	 * The allocating task should reclaim at least the batch size, but for
2099 	 * subsequent retries we only want to do what's necessary to prevent oom
2100 	 * or breaching resource isolation.
2101 	 *
2102 	 * This is distinct from memory.max or page allocator behaviour because
2103 	 * memory.high is currently batched, whereas memory.max and the page
2104 	 * allocator run every time an allocation is made.
2105 	 */
2106 	nr_reclaimed = reclaim_high(memcg,
2107 				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2108 				    gfp_mask);
2109 
2110 	/*
2111 	 * memory.high is breached and reclaim is unable to keep up. Throttle
2112 	 * allocators proactively to slow down excessive growth.
2113 	 */
2114 	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2115 					       mem_find_max_overage(memcg));
2116 
2117 	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2118 						swap_find_max_overage(memcg));
2119 
2120 	/*
2121 	 * Clamp the max delay per usermode return so as to still keep the
2122 	 * application moving forwards and also permit diagnostics, albeit
2123 	 * extremely slowly.
2124 	 */
2125 	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2126 
2127 	/*
2128 	 * Don't sleep if the amount of jiffies this memcg owes us is so low
2129 	 * that it's not even worth doing, in an attempt to be nice to those who
2130 	 * go only a small amount over their memory.high value and maybe haven't
2131 	 * been aggressively reclaimed enough yet.
2132 	 */
2133 	if (penalty_jiffies <= HZ / 100)
2134 		goto out;
2135 
2136 	/*
2137 	 * If reclaim is making forward progress but we're still over
2138 	 * memory.high, we want to encourage that rather than doing allocator
2139 	 * throttling.
2140 	 */
2141 	if (nr_reclaimed || nr_retries--) {
2142 		in_retry = true;
2143 		goto retry_reclaim;
2144 	}
2145 
2146 	/*
2147 	 * Reclaim didn't manage to push usage below the limit, slow
2148 	 * this allocating task down.
2149 	 *
2150 	 * If we exit early, we're guaranteed to die (since
2151 	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2152 	 * need to account for any ill-begotten jiffies to pay them off later.
2153 	 */
2154 	psi_memstall_enter(&pflags);
2155 	schedule_timeout_killable(penalty_jiffies);
2156 	psi_memstall_leave(&pflags);
2157 
2158 out:
2159 	css_put(&memcg->css);
2160 }
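
/*
 * Hedged sketch (not the actual kernel code) of the userland-return caller
 * mentioned above; the real hook lives in the resume-user-mode machinery
 * and the function below is purely illustrative.
 *
 *	static void example_resume_user_mode_work(void)
 *	{
 *		// Reclaim and/or throttle if a prior charge left an overage
 *		// behind; the function bails out cheaply when there is none.
 *		mem_cgroup_handle_over_high(GFP_KERNEL);
 *	}
 */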
2161 
2162 int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2163 		     unsigned int nr_pages)
2164 {
2165 	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2166 	int nr_retries = MAX_RECLAIM_RETRIES;
2167 	struct mem_cgroup *mem_over_limit;
2168 	struct page_counter *counter;
2169 	unsigned long nr_reclaimed;
2170 	bool passed_oom = false;
2171 	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
2172 	bool drained = false;
2173 	bool raised_max_event = false;
2174 	unsigned long pflags;
2175 
2176 retry:
2177 	if (consume_stock(memcg, nr_pages))
2178 		return 0;
2179 
2180 	if (!do_memsw_account() ||
2181 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2182 		if (page_counter_try_charge(&memcg->memory, batch, &counter))
2183 			goto done_restock;
2184 		if (do_memsw_account())
2185 			page_counter_uncharge(&memcg->memsw, batch);
2186 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
2187 	} else {
2188 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2189 		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
2190 	}
2191 
2192 	if (batch > nr_pages) {
2193 		batch = nr_pages;
2194 		goto retry;
2195 	}
2196 
2197 	/*
2198 	 * Prevent unbounded recursion when reclaim operations need to
2199 	 * allocate memory. This might exceed the limits temporarily,
2200 	 * but we prefer facilitating memory reclaim and getting back
2201 	 * under the limit over triggering OOM kills in these cases.
2202 	 */
2203 	if (unlikely(current->flags & PF_MEMALLOC))
2204 		goto force;
2205 
2206 	if (unlikely(task_in_memcg_oom(current)))
2207 		goto nomem;
2208 
2209 	if (!gfpflags_allow_blocking(gfp_mask))
2210 		goto nomem;
2211 
2212 	memcg_memory_event(mem_over_limit, MEMCG_MAX);
2213 	raised_max_event = true;
2214 
2215 	psi_memstall_enter(&pflags);
2216 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2217 						    gfp_mask, reclaim_options, NULL);
2218 	psi_memstall_leave(&pflags);
2219 
2220 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2221 		goto retry;
2222 
2223 	if (!drained) {
2224 		drain_all_stock(mem_over_limit);
2225 		drained = true;
2226 		goto retry;
2227 	}
2228 
2229 	if (gfp_mask & __GFP_NORETRY)
2230 		goto nomem;
2231 	/*
2232 	 * Even though the limit is exceeded at this point, reclaim
2233 	 * may have been able to free some pages.  Retry the charge
2234 	 * before killing the task.
2235 	 *
2236 	 * Only for regular pages, though: huge pages are rather
2237 	 * unlikely to succeed so close to the limit, and we fall back
2238 	 * to regular pages anyway in case of failure.
2239 	 */
2240 	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2241 		goto retry;
2242 	/*
2243 	 * At task move, charge accounts can be doubly counted. So, it's
2244 	 * better to wait until the end of task_move if something is going on.
2245 	 */
2246 	if (memcg1_wait_acct_move(mem_over_limit))
2247 		goto retry;
2248 
2249 	if (nr_retries--)
2250 		goto retry;
2251 
2252 	if (gfp_mask & __GFP_RETRY_MAYFAIL)
2253 		goto nomem;
2254 
2255 	/* Avoid endless loop for tasks bypassed by the oom killer */
2256 	if (passed_oom && task_is_dying())
2257 		goto nomem;
2258 
2259 	/*
2260 	 * Keep retrying as long as the memcg oom killer is able to make
2261 	 * forward progress, or bypass the charge if the oom killer
2262 	 * couldn't make any progress.
2263 	 */
2264 	if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2265 			   get_order(nr_pages * PAGE_SIZE))) {
2266 		passed_oom = true;
2267 		nr_retries = MAX_RECLAIM_RETRIES;
2268 		goto retry;
2269 	}
2270 nomem:
2271 	/*
2272 	 * Memcg doesn't have a dedicated reserve for atomic
2273 	 * allocations. But like the global atomic pool, we need to
2274 	 * put the burden of reclaim on regular allocation requests
2275 	 * and let these go through as privileged allocations.
2276 	 */
2277 	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2278 		return -ENOMEM;
2279 force:
2280 	/*
2281 	 * If the allocation has to be enforced, don't forget to raise
2282 	 * a MEMCG_MAX event.
2283 	 */
2284 	if (!raised_max_event)
2285 		memcg_memory_event(mem_over_limit, MEMCG_MAX);
2286 
2287 	/*
2288 	 * The allocation either can't fail or will lead to more memory
2289 	 * being freed very soon.  Allow memory usage to go over the limit
2290 	 * temporarily by force-charging it.
2291 	 */
2292 	page_counter_charge(&memcg->memory, nr_pages);
2293 	if (do_memsw_account())
2294 		page_counter_charge(&memcg->memsw, nr_pages);
2295 
2296 	return 0;
2297 
2298 done_restock:
2299 	if (batch > nr_pages)
2300 		refill_stock(memcg, batch - nr_pages);
2301 
2302 	/*
2303 	 * If the hierarchy is above the normal consumption range, schedule
2304 	 * reclaim on returning to userland.  We can perform reclaim here
2305 	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
2306 	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2307 	 * not recorded as it most likely matches current's and won't
2308 	 * change in the meantime.  As high limit is checked again before
2309 	 * reclaim, the cost of mismatch is negligible.
2310 	 */
2311 	do {
2312 		bool mem_high, swap_high;
2313 
2314 		mem_high = page_counter_read(&memcg->memory) >
2315 			READ_ONCE(memcg->memory.high);
2316 		swap_high = page_counter_read(&memcg->swap) >
2317 			READ_ONCE(memcg->swap.high);
2318 
2319 		/* Don't bother a random interrupted task */
2320 		if (!in_task()) {
2321 			if (mem_high) {
2322 				schedule_work(&memcg->high_work);
2323 				break;
2324 			}
2325 			continue;
2326 		}
2327 
2328 		if (mem_high || swap_high) {
2329 			/*
2330 			 * The allocating tasks in this cgroup will need to do
2331 			 * reclaim or be throttled to prevent further growth
2332 			 * of the memory or swap footprints.
2333 			 *
2334 			 * Target some best-effort fairness between the tasks,
2335 			 * and distribute reclaim work and delay penalties
2336 			 * based on how much each task is actually allocating.
2337 			 */
2338 			current->memcg_nr_pages_over_high += batch;
2339 			set_notify_resume(current);
2340 			break;
2341 		}
2342 	} while ((memcg = parent_mem_cgroup(memcg)));
2343 
2344 	/*
2345 	 * Reclaim is set up above to be called from the userland
2346 	 * return path. But also attempt synchronous reclaim to avoid
2347 	 * excessive overrun while the task is still inside the
2348 	 * kernel. If this is successful, the return path will see it
2349 	 * when it rechecks the overage and simply bail out.
2350 	 */
2351 	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2352 	    !(current->flags & PF_MEMALLOC) &&
2353 	    gfpflags_allow_blocking(gfp_mask))
2354 		mem_cgroup_handle_over_high(gfp_mask);
2355 	return 0;
2356 }
2357 
2358 /**
2359  * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
2360  * @memcg: memcg previously charged.
2361  * @nr_pages: number of pages previously charged.
2362  */
2363 void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2364 {
2365 	if (mem_cgroup_is_root(memcg))
2366 		return;
2367 
2368 	page_counter_uncharge(&memcg->memory, nr_pages);
2369 	if (do_memsw_account())
2370 		page_counter_uncharge(&memcg->memsw, nr_pages);
2371 }
2372 
2373 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2374 {
2375 	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
2376 	/*
2377 	 * Any of the following ensures page's memcg stability:
2378 	 *
2379 	 * - the page lock
2380 	 * - LRU isolation
2381 	 * - folio_memcg_lock()
2382 	 * - exclusive reference
2383 	 * - mem_cgroup_trylock_pages()
2384 	 */
2385 	folio->memcg_data = (unsigned long)memcg;
2386 }
2387 
2388 /**
2389  * mem_cgroup_commit_charge - commit a previously successful try_charge().
2390  * @folio: folio to commit the charge to.
2391  * @memcg: memcg previously charged.
2392  */
2393 void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2394 {
2395 	css_get(&memcg->css);
2396 	commit_charge(folio, memcg);
2397 
2398 	local_irq_disable();
2399 	mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
2400 	memcg1_check_events(memcg, folio_nid(folio));
2401 	local_irq_enable();
2402 }
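
/*
 * Illustrative sketch (hypothetical caller, not kernel code) of the
 * two-step charge protocol implemented by the helpers above: reserve pages
 * with try_charge_memcg(), then either commit them to a folio or cancel.
 * example_can_use_folio() is a made-up predicate.
 *
 *	static int example_charge_folio(struct folio *folio,
 *					struct mem_cgroup *memcg, gfp_t gfp)
 *	{
 *		long nr_pages = folio_nr_pages(folio);
 *
 *		if (try_charge_memcg(memcg, gfp, nr_pages))
 *			return -ENOMEM;
 *
 *		if (!example_can_use_folio(folio)) {
 *			mem_cgroup_cancel_charge(memcg, nr_pages);
 *			return -EBUSY;
 *		}
 *
 *		mem_cgroup_commit_charge(folio, memcg);
 *		return 0;
 *	}
 */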
2403 
2404 static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
2405 				       struct pglist_data *pgdat,
2406 				       enum node_stat_item idx, int nr)
2407 {
2408 	struct mem_cgroup *memcg;
2409 	struct lruvec *lruvec;
2410 
2411 	rcu_read_lock();
2412 	memcg = obj_cgroup_memcg(objcg);
2413 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
2414 	__mod_memcg_lruvec_state(lruvec, idx, nr);
2415 	rcu_read_unlock();
2416 }
2417 
2418 static __always_inline
2419 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
2420 {
2421 	/*
2422 	 * Slab objects are accounted individually, not per-page.
2423 	 * Memcg membership data for each individual object is saved in
2424 	 * slab->obj_exts.
2425 	 */
2426 	if (folio_test_slab(folio)) {
2427 		struct slabobj_ext *obj_exts;
2428 		struct slab *slab;
2429 		unsigned int off;
2430 
2431 		slab = folio_slab(folio);
2432 		obj_exts = slab_obj_exts(slab);
2433 		if (!obj_exts)
2434 			return NULL;
2435 
2436 		off = obj_to_index(slab->slab_cache, slab, p);
2437 		if (obj_exts[off].objcg)
2438 			return obj_cgroup_memcg(obj_exts[off].objcg);
2439 
2440 		return NULL;
2441 	}
2442 
2443 	/*
2444 	 * folio_memcg_check() is used here, because in theory we can encounter
2445 	 * a folio where the slab flag has been cleared already, but
2446 	 * slab->obj_exts has not been freed yet.
2447 	 * folio_memcg_check() guarantees that either a proper memory
2448 	 * cgroup pointer or NULL will be returned.
2449 	 */
2450 	return folio_memcg_check(folio);
2451 }
2452 
2453 /*
2454  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2455  * It is not suitable for objects allocated using vmalloc().
2456  *
2457  * A passed kernel object must be a slab object or a generic kernel page.
2458  *
2459  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2460  * cgroup_mutex, etc.
2461  */
2462 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
2463 {
2464 	if (mem_cgroup_disabled())
2465 		return NULL;
2466 
2467 	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
2468 }
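
/*
 * Illustrative usage sketch (hypothetical caller): the memcg returned here
 * is only stable while the caller holds rcu_read_lock() or one of the other
 * guarantees listed above.
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_slab_obj(object);
 *	if (memcg)
 *		pr_debug("%p charged to %s\n", object,
 *			 cgroup_name(memcg->css.cgroup));
 *	rcu_read_unlock();
 */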
2469 
2470 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
2471 {
2472 	struct obj_cgroup *objcg = NULL;
2473 
2474 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2475 		objcg = rcu_dereference(memcg->objcg);
2476 		if (likely(objcg && obj_cgroup_tryget(objcg)))
2477 			break;
2478 		objcg = NULL;
2479 	}
2480 	return objcg;
2481 }
2482 
2483 static struct obj_cgroup *current_objcg_update(void)
2484 {
2485 	struct mem_cgroup *memcg;
2486 	struct obj_cgroup *old, *objcg = NULL;
2487 
2488 	do {
2489 		/* Atomically drop the update bit. */
2490 		old = xchg(&current->objcg, NULL);
2491 		if (old) {
2492 			old = (struct obj_cgroup *)
2493 				((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
2494 			obj_cgroup_put(old);
2495 
2496 			old = NULL;
2497 		}
2498 
2499 		/* If new objcg is NULL, no reason for the second atomic update. */
2500 		if (!current->mm || (current->flags & PF_KTHREAD))
2501 			return NULL;
2502 
2503 		/*
2504 		 * Release the objcg pointer from the previous iteration,
2505 		 * if try_cmpxchg() below fails.
2506 		 */
2507 		if (unlikely(objcg)) {
2508 			obj_cgroup_put(objcg);
2509 			objcg = NULL;
2510 		}
2511 
2512 		/*
2513 		 * Obtain the new objcg pointer. The current task can be
2514 		 * asynchronously moved to another memcg and the previous
2515 		 * memcg can be offlined. So let's get the memcg pointer
2516 		 * and try to get a reference to the objcg under an rcu read lock.
2517 		 */
2518 
2519 		rcu_read_lock();
2520 		memcg = mem_cgroup_from_task(current);
2521 		objcg = __get_obj_cgroup_from_memcg(memcg);
2522 		rcu_read_unlock();
2523 
2524 		/*
2525 		 * Try to set up a new objcg pointer atomically. If it
2526 		 * fails, it means the update flag was set concurrently, so
2527 		 * the whole procedure should be repeated.
2528 		 */
2529 	} while (!try_cmpxchg(&current->objcg, &old, objcg));
2530 
2531 	return objcg;
2532 }
2533 
2534 __always_inline struct obj_cgroup *current_obj_cgroup(void)
2535 {
2536 	struct mem_cgroup *memcg;
2537 	struct obj_cgroup *objcg;
2538 
2539 	if (in_task()) {
2540 		memcg = current->active_memcg;
2541 		if (unlikely(memcg))
2542 			goto from_memcg;
2543 
2544 		objcg = READ_ONCE(current->objcg);
2545 		if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
2546 			objcg = current_objcg_update();
2547 		/*
2548 		 * The objcg reference is kept by the task, so it's safe
2549 		 * for the current task to use the objcg.
2550 		 */
2551 		return objcg;
2552 	}
2553 
2554 	memcg = this_cpu_read(int_active_memcg);
2555 	if (unlikely(memcg))
2556 		goto from_memcg;
2557 
2558 	return NULL;
2559 
2560 from_memcg:
2561 	objcg = NULL;
2562 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
2563 		/*
2564 		 * Memcg pointer is protected by scope (see set_active_memcg())
2565 		 * and is pinning the corresponding objcg, so objcg can't go
2566 		 * away and can be used within the scope without any additional
2567 		 * protection.
2568 		 */
2569 		objcg = rcu_dereference_check(memcg->objcg, 1);
2570 		if (likely(objcg))
2571 			break;
2572 	}
2573 
2574 	return objcg;
2575 }
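
/*
 * Illustrative sketch of the set_active_memcg() scope that the from_memcg
 * path above relies on (the caller and the allocation are hypothetical):
 *
 *	old = set_active_memcg(memcg);
 *	// current_obj_cgroup() now resolves to memcg's objcg, so this
 *	// allocation is charged to memcg rather than current's cgroup.
 *	buf = kmalloc(size, GFP_KERNEL_ACCOUNT);
 *	set_active_memcg(old);
 */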
2576 
2577 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
2578 {
2579 	struct obj_cgroup *objcg;
2580 
2581 	if (!memcg_kmem_online())
2582 		return NULL;
2583 
2584 	if (folio_memcg_kmem(folio)) {
2585 		objcg = __folio_objcg(folio);
2586 		obj_cgroup_get(objcg);
2587 	} else {
2588 		struct mem_cgroup *memcg;
2589 
2590 		rcu_read_lock();
2591 		memcg = __folio_memcg(folio);
2592 		if (memcg)
2593 			objcg = __get_obj_cgroup_from_memcg(memcg);
2594 		else
2595 			objcg = NULL;
2596 		rcu_read_unlock();
2597 	}
2598 	return objcg;
2599 }
2600 
2601 /*
2602  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
2603  * @objcg: object cgroup to uncharge
2604  * @nr_pages: number of pages to uncharge
2605  */
2606 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
2607 				      unsigned int nr_pages)
2608 {
2609 	struct mem_cgroup *memcg;
2610 
2611 	memcg = get_mem_cgroup_from_objcg(objcg);
2612 
2613 	mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2614 	memcg1_account_kmem(memcg, -nr_pages);
2615 	refill_stock(memcg, nr_pages);
2616 
2617 	css_put(&memcg->css);
2618 }
2619 
2620 /*
2621  * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
2622  * @objcg: object cgroup to charge
2623  * @gfp: reclaim mode
2624  * @nr_pages: number of pages to charge
2625  *
2626  * Returns 0 on success, an error code on failure.
2627  */
2628 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
2629 				   unsigned int nr_pages)
2630 {
2631 	struct mem_cgroup *memcg;
2632 	int ret;
2633 
2634 	memcg = get_mem_cgroup_from_objcg(objcg);
2635 
2636 	ret = try_charge_memcg(memcg, gfp, nr_pages);
2637 	if (ret)
2638 		goto out;
2639 
2640 	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2641 	memcg1_account_kmem(memcg, nr_pages);
2642 out:
2643 	css_put(&memcg->css);
2644 
2645 	return ret;
2646 }
2647 
2648 /**
2649  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2650  * @page: page to charge
2651  * @gfp: reclaim mode
2652  * @order: allocation order
2653  *
2654  * Returns 0 on success, an error code on failure.
2655  */
2656 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
2657 {
2658 	struct obj_cgroup *objcg;
2659 	int ret = 0;
2660 
2661 	objcg = current_obj_cgroup();
2662 	if (objcg) {
2663 		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
2664 		if (!ret) {
2665 			obj_cgroup_get(objcg);
2666 			page->memcg_data = (unsigned long)objcg |
2667 				MEMCG_DATA_KMEM;
2668 			return 0;
2669 		}
2670 	}
2671 	return ret;
2672 }
2673 
2674 /**
2675  * __memcg_kmem_uncharge_page: uncharge a kmem page
2676  * @page: page to uncharge
2677  * @order: allocation order
2678  */
2679 void __memcg_kmem_uncharge_page(struct page *page, int order)
2680 {
2681 	struct folio *folio = page_folio(page);
2682 	struct obj_cgroup *objcg;
2683 	unsigned int nr_pages = 1 << order;
2684 
2685 	if (!folio_memcg_kmem(folio))
2686 		return;
2687 
2688 	objcg = __folio_objcg(folio);
2689 	obj_cgroup_uncharge_pages(objcg, nr_pages);
2690 	folio->memcg_data = 0;
2691 	obj_cgroup_put(objcg);
2692 }
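
/*
 * Illustrative sketch (hypothetical driver code): the two helpers above
 * provide the accounting behind __GFP_ACCOUNT page allocations, so a
 * charged page is obtained and released through the usual allocator calls.
 *
 *	page = alloc_pages(GFP_KERNEL_ACCOUNT, order);	// charged on success
 *	...
 *	__free_pages(page, order);			// uncharged on free
 */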
2693 
2694 static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
2695 		     enum node_stat_item idx, int nr)
2696 {
2697 	struct memcg_stock_pcp *stock;
2698 	struct obj_cgroup *old = NULL;
2699 	unsigned long flags;
2700 	int *bytes;
2701 
2702 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
2703 	stock = this_cpu_ptr(&memcg_stock);
2704 
2705 	/*
2706 	 * Save vmstat data in stock and skip vmstat array update unless
2707 	 * accumulating over a page of vmstat data or when pgdat or idx
2708 	 * changes.
2709 	 */
2710 	if (READ_ONCE(stock->cached_objcg) != objcg) {
2711 		old = drain_obj_stock(stock);
2712 		obj_cgroup_get(objcg);
2713 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
2714 				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
2715 		WRITE_ONCE(stock->cached_objcg, objcg);
2716 		stock->cached_pgdat = pgdat;
2717 	} else if (stock->cached_pgdat != pgdat) {
2718 		/* Flush the existing cached vmstat data */
2719 		struct pglist_data *oldpg = stock->cached_pgdat;
2720 
2721 		if (stock->nr_slab_reclaimable_b) {
2722 			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
2723 					  stock->nr_slab_reclaimable_b);
2724 			stock->nr_slab_reclaimable_b = 0;
2725 		}
2726 		if (stock->nr_slab_unreclaimable_b) {
2727 			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
2728 					  stock->nr_slab_unreclaimable_b);
2729 			stock->nr_slab_unreclaimable_b = 0;
2730 		}
2731 		stock->cached_pgdat = pgdat;
2732 	}
2733 
2734 	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
2735 					       : &stock->nr_slab_unreclaimable_b;
2736 	/*
2737 	 * Even for a large object >= PAGE_SIZE, the vmstat data will still be
2738 	 * cached locally at least once before being pushed out.
2739 	 */
2740 	if (!*bytes) {
2741 		*bytes = nr;
2742 		nr = 0;
2743 	} else {
2744 		*bytes += nr;
2745 		if (abs(*bytes) > PAGE_SIZE) {
2746 			nr = *bytes;
2747 			*bytes = 0;
2748 		} else {
2749 			nr = 0;
2750 		}
2751 	}
2752 	if (nr)
2753 		__mod_objcg_mlstate(objcg, pgdat, idx, nr);
2754 
2755 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2756 	obj_cgroup_put(old);
2757 }
2758 
2759 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
2760 {
2761 	struct memcg_stock_pcp *stock;
2762 	unsigned long flags;
2763 	bool ret = false;
2764 
2765 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
2766 
2767 	stock = this_cpu_ptr(&memcg_stock);
2768 	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
2769 		stock->nr_bytes -= nr_bytes;
2770 		ret = true;
2771 	}
2772 
2773 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2774 
2775 	return ret;
2776 }
2777 
2778 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
2779 {
2780 	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
2781 
2782 	if (!old)
2783 		return NULL;
2784 
2785 	if (stock->nr_bytes) {
2786 		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
2787 		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
2788 
2789 		if (nr_pages) {
2790 			struct mem_cgroup *memcg;
2791 
2792 			memcg = get_mem_cgroup_from_objcg(old);
2793 
2794 			mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
2795 			memcg1_account_kmem(memcg, -nr_pages);
2796 			__refill_stock(memcg, nr_pages);
2797 
2798 			css_put(&memcg->css);
2799 		}
2800 
2801 		/*
2802 		 * The leftover is flushed to the centralized per-memcg value.
2803 		 * On the next attempt to refill obj stock it will be moved
2804 		 * to a per-cpu stock (probably on another CPU), see
2805 		 * refill_obj_stock().
2806 		 *
2807 		 * How often it's flushed is a trade-off between the memory
2808 		 * limit enforcement accuracy and potential CPU contention,
2809 		 * so it might be changed in the future.
2810 		 */
2811 		atomic_add(nr_bytes, &old->nr_charged_bytes);
2812 		stock->nr_bytes = 0;
2813 	}
2814 
2815 	/*
2816 	 * Flush the vmstat data in the current stock.
2817 	 */
2818 	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
2819 		if (stock->nr_slab_reclaimable_b) {
2820 			__mod_objcg_mlstate(old, stock->cached_pgdat,
2821 					  NR_SLAB_RECLAIMABLE_B,
2822 					  stock->nr_slab_reclaimable_b);
2823 			stock->nr_slab_reclaimable_b = 0;
2824 		}
2825 		if (stock->nr_slab_unreclaimable_b) {
2826 			__mod_objcg_mlstate(old, stock->cached_pgdat,
2827 					  NR_SLAB_UNRECLAIMABLE_B,
2828 					  stock->nr_slab_unreclaimable_b);
2829 			stock->nr_slab_unreclaimable_b = 0;
2830 		}
2831 		stock->cached_pgdat = NULL;
2832 	}
2833 
2834 	WRITE_ONCE(stock->cached_objcg, NULL);
2835 	/*
2836 	 * The `old' objcg needs to be released by the caller via
2837 	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
2838 	 */
2839 	return old;
2840 }
2841 
2842 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2843 				     struct mem_cgroup *root_memcg)
2844 {
2845 	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
2846 	struct mem_cgroup *memcg;
2847 
2848 	if (objcg) {
2849 		memcg = obj_cgroup_memcg(objcg);
2850 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
2851 			return true;
2852 	}
2853 
2854 	return false;
2855 }
2856 
2857 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
2858 			     bool allow_uncharge)
2859 {
2860 	struct memcg_stock_pcp *stock;
2861 	struct obj_cgroup *old = NULL;
2862 	unsigned long flags;
2863 	unsigned int nr_pages = 0;
2864 
2865 	local_lock_irqsave(&memcg_stock.stock_lock, flags);
2866 
2867 	stock = this_cpu_ptr(&memcg_stock);
2868 	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
2869 		old = drain_obj_stock(stock);
2870 		obj_cgroup_get(objcg);
2871 		WRITE_ONCE(stock->cached_objcg, objcg);
2872 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
2873 				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
2874 		allow_uncharge = true;	/* Allow uncharge when objcg changes */
2875 	}
2876 	stock->nr_bytes += nr_bytes;
2877 
2878 	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
2879 		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
2880 		stock->nr_bytes &= (PAGE_SIZE - 1);
2881 	}
2882 
2883 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2884 	obj_cgroup_put(old);
2885 
2886 	if (nr_pages)
2887 		obj_cgroup_uncharge_pages(objcg, nr_pages);
2888 }
2889 
2890 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
2891 {
2892 	unsigned int nr_pages, nr_bytes;
2893 	int ret;
2894 
2895 	if (consume_obj_stock(objcg, size))
2896 		return 0;
2897 
2898 	/*
2899 	 * In theory, objcg->nr_charged_bytes can have enough
2900 	 * pre-charged bytes to satisfy the allocation. However,
2901 	 * flushing objcg->nr_charged_bytes requires two atomic
2902 	 * operations, and objcg->nr_charged_bytes can't be big.
2903 	 * The shared objcg->nr_charged_bytes can also become a
2904 	 * performance bottleneck if all tasks of the same memcg are
2905 	 * trying to update it. So it's better to ignore it and try
2906 	 * trying to update it. So it's better to ignore it and try to
2907 	 * grab some new pages. The stock's nr_bytes will be flushed to
2908 	 *
2909 	 * The stock's nr_bytes may contain enough pre-charged bytes
2910 	 * to allow one less page from being charged, but we can't rely
2911 	 * on the pre-charged bytes not being changed outside of
2912 	 * consume_obj_stock() or refill_obj_stock(). So ignore those
2913 	 * pre-charged bytes as well when charging pages. To avoid a
2914 	 * page uncharge right after a page charge, we set the
2915 	 * allow_uncharge flag to false when calling refill_obj_stock()
2916 	 * to temporarily allow the pre-charged bytes to exceed the page
2917 	 * size limit. The maximum reachable value of the pre-charged
2918 	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
2919 	 * race.
2920 	 */
2921 	nr_pages = size >> PAGE_SHIFT;
2922 	nr_bytes = size & (PAGE_SIZE - 1);
2923 
2924 	if (nr_bytes)
2925 		nr_pages += 1;
2926 
2927 	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
2928 	if (!ret && nr_bytes)
2929 		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
2930 
2931 	return ret;
2932 }
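
/*
 * Worked example (illustrative, assuming 4K pages): a 5000-byte charge
 * splits into nr_pages = 1 and nr_bytes = 904, which is rounded up to a
 * two-page (8192-byte) charge; the 8192 - 5000 = 3192-byte surplus is
 * handed to refill_obj_stock() so that subsequent small charges on this
 * CPU can be served from the stock without touching the page counters.
 */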
2933 
2934 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
2935 {
2936 	refill_obj_stock(objcg, size, true);
2937 }
2938 
2939 static inline size_t obj_full_size(struct kmem_cache *s)
2940 {
2941 	/*
2942 	 * For each accounted object there is an extra space which is used
2943 	 * to store obj_cgroup membership. Charge it too.
2944 	 */
2945 	return s->size + sizeof(struct obj_cgroup *);
2946 }
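
/*
 * Example (illustrative, assuming a 64-bit build where an obj_cgroup
 * pointer is 8 bytes): a kmem_cache with s->size == 64 is charged
 * 64 + 8 = 72 bytes per accounted object.
 */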
2947 
2948 bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2949 				  gfp_t flags, size_t size, void **p)
2950 {
2951 	struct obj_cgroup *objcg;
2952 	struct slab *slab;
2953 	unsigned long off;
2954 	size_t i;
2955 
2956 	/*
2957 	 * The obtained objcg pointer is safe to use within the current scope,
2958 	 * defined by the current task or a set_active_memcg() pair.
2959 	 * obj_cgroup_get() is used to get a permanent reference.
2960 	 */
2961 	objcg = current_obj_cgroup();
2962 	if (!objcg)
2963 		return true;
2964 
2965 	/*
2966 	 * slab_alloc_node() avoids the NULL check, so we might be called with a
2967 	 * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
2968 	 * the whole requested size.
2969 	 * Return success as there's nothing to free back.
2970 	 */
2971 	if (unlikely(*p == NULL))
2972 		return true;
2973 
2974 	flags &= gfp_allowed_mask;
2975 
2976 	if (lru) {
2977 		int ret;
2978 		struct mem_cgroup *memcg;
2979 
2980 		memcg = get_mem_cgroup_from_objcg(objcg);
2981 		ret = memcg_list_lru_alloc(memcg, lru, flags);
2982 		css_put(&memcg->css);
2983 
2984 		if (ret)
2985 			return false;
2986 	}
2987 
2988 	if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
2989 		return false;
2990 
2991 	for (i = 0; i < size; i++) {
2992 		slab = virt_to_slab(p[i]);
2993 
2994 		if (!slab_obj_exts(slab) &&
2995 		    alloc_slab_obj_exts(slab, s, flags, false)) {
2996 			obj_cgroup_uncharge(objcg, obj_full_size(s));
2997 			continue;
2998 		}
2999 
3000 		off = obj_to_index(s, slab, p[i]);
3001 		obj_cgroup_get(objcg);
3002 		slab_obj_exts(slab)[off].objcg = objcg;
3003 		mod_objcg_state(objcg, slab_pgdat(slab),
3004 				cache_vmstat_idx(s), obj_full_size(s));
3005 	}
3006 
3007 	return true;
3008 }
3009 
3010 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
3011 			    void **p, int objects, struct slabobj_ext *obj_exts)
3012 {
3013 	for (int i = 0; i < objects; i++) {
3014 		struct obj_cgroup *objcg;
3015 		unsigned int off;
3016 
3017 		off = obj_to_index(s, slab, p[i]);
3018 		objcg = obj_exts[off].objcg;
3019 		if (!objcg)
3020 			continue;
3021 
3022 		obj_exts[off].objcg = NULL;
3023 		obj_cgroup_uncharge(objcg, obj_full_size(s));
3024 		mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
3025 				-obj_full_size(s));
3026 		obj_cgroup_put(objcg);
3027 	}
3028 }
3029 
3030 /*
3031  * Because folio_memcg(head) is not set on tails, set it now.
3032  */
3033 void split_page_memcg(struct page *head, int old_order, int new_order)
3034 {
3035 	struct folio *folio = page_folio(head);
3036 	struct mem_cgroup *memcg = folio_memcg(folio);
3037 	int i;
3038 	unsigned int old_nr = 1 << old_order;
3039 	unsigned int new_nr = 1 << new_order;
3040 
3041 	if (mem_cgroup_disabled() || !memcg)
3042 		return;
3043 
3044 	for (i = new_nr; i < old_nr; i += new_nr)
3045 		folio_page(folio, i)->memcg_data = folio->memcg_data;
3046 
3047 	if (folio_memcg_kmem(folio))
3048 		obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
3049 	else
3050 		css_get_many(&memcg->css, old_nr / new_nr - 1);
3051 }
3052 
3053 unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3054 {
3055 	unsigned long val;
3056 
3057 	if (mem_cgroup_is_root(memcg)) {
3058 		/*
3059 		 * Approximate root's usage from global state. This isn't
3060 		 * perfect, but the root usage was always an approximation.
3061 		 */
3062 		val = global_node_page_state(NR_FILE_PAGES) +
3063 			global_node_page_state(NR_ANON_MAPPED);
3064 		if (swap)
3065 			val += total_swap_pages - get_nr_swap_pages();
3066 	} else {
3067 		if (!swap)
3068 			val = page_counter_read(&memcg->memory);
3069 		else
3070 			val = page_counter_read(&memcg->memsw);
3071 	}
3072 	return val;
3073 }
3074 
3075 static int memcg_online_kmem(struct mem_cgroup *memcg)
3076 {
3077 	struct obj_cgroup *objcg;
3078 
3079 	if (mem_cgroup_kmem_disabled())
3080 		return 0;
3081 
3082 	if (unlikely(mem_cgroup_is_root(memcg)))
3083 		return 0;
3084 
3085 	objcg = obj_cgroup_alloc();
3086 	if (!objcg)
3087 		return -ENOMEM;
3088 
3089 	objcg->memcg = memcg;
3090 	rcu_assign_pointer(memcg->objcg, objcg);
3091 	obj_cgroup_get(objcg);
3092 	memcg->orig_objcg = objcg;
3093 
3094 	static_branch_enable(&memcg_kmem_online_key);
3095 
3096 	memcg->kmemcg_id = memcg->id.id;
3097 
3098 	return 0;
3099 }
3100 
3101 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3102 {
3103 	struct mem_cgroup *parent;
3104 
3105 	if (mem_cgroup_kmem_disabled())
3106 		return;
3107 
3108 	if (unlikely(mem_cgroup_is_root(memcg)))
3109 		return;
3110 
3111 	parent = parent_mem_cgroup(memcg);
3112 	if (!parent)
3113 		parent = root_mem_cgroup;
3114 
3115 	memcg_reparent_objcgs(memcg, parent);
3116 
3117 	/*
3118 	 * After we have finished memcg_reparent_objcgs(), all list_lrus
3119 	 * corresponding to this cgroup are guaranteed to remain empty.
3120 	 * The ordering is imposed by list_lru_node->lock taken by
3121 	 * memcg_reparent_list_lrus().
3122 	 */
3123 	memcg_reparent_list_lrus(memcg, parent);
3124 }
3125 
3126 #ifdef CONFIG_CGROUP_WRITEBACK
3127 
3128 #include <trace/events/writeback.h>
3129 
3130 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3131 {
3132 	return wb_domain_init(&memcg->cgwb_domain, gfp);
3133 }
3134 
3135 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3136 {
3137 	wb_domain_exit(&memcg->cgwb_domain);
3138 }
3139 
3140 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3141 {
3142 	wb_domain_size_changed(&memcg->cgwb_domain);
3143 }
3144 
3145 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3146 {
3147 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3148 
3149 	if (!memcg->css.parent)
3150 		return NULL;
3151 
3152 	return &memcg->cgwb_domain;
3153 }
3154 
3155 /**
3156  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3157  * @wb: bdi_writeback in question
3158  * @pfilepages: out parameter for number of file pages
3159  * @pheadroom: out parameter for number of allocatable pages according to memcg
3160  * @pdirty: out parameter for number of dirty pages
3161  * @pwriteback: out parameter for number of pages under writeback
3162  *
3163  * Determine the numbers of file, headroom, dirty, and writeback pages in
3164  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3165  * is a bit more involved.
3166  *
3167  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3168  * headroom is calculated as the lowest headroom of itself and the
3169  * ancestors.  Note that this doesn't consider the actual amount of
3170  * available memory in the system.  The caller should further cap
3171  * *@pheadroom accordingly.
3172  */
3173 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3174 			 unsigned long *pheadroom, unsigned long *pdirty,
3175 			 unsigned long *pwriteback)
3176 {
3177 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3178 	struct mem_cgroup *parent;
3179 
3180 	mem_cgroup_flush_stats_ratelimited(memcg);
3181 
3182 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3183 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3184 	*pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
3185 			memcg_page_state(memcg, NR_ACTIVE_FILE);
3186 
3187 	*pheadroom = PAGE_COUNTER_MAX;
3188 	while ((parent = parent_mem_cgroup(memcg))) {
3189 		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
3190 					    READ_ONCE(memcg->memory.high));
3191 		unsigned long used = page_counter_read(&memcg->memory);
3192 
3193 		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3194 		memcg = parent;
3195 	}
3196 }
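
/*
 * Worked example (illustrative): with memory.max = 1G, memory.high = 512M
 * and usage = 300M, a memcg contributes min(1G, 512M) - 300M = 212M of
 * headroom; the loop above reports the smallest such value along the
 * ancestry, so a tighter ancestor further caps *@pheadroom.
 */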
3197 
3198 /*
3199  * Foreign dirty flushing
3200  *
3201  * There's an inherent mismatch between memcg and writeback.  The former
3202  * tracks ownership per-page while the latter per-inode.  This was a
3203  * deliberate design decision because honoring per-page ownership in the
3204  * writeback path is complicated, may lead to higher CPU and IO overheads
3205  * and deemed unnecessary given that write-sharing an inode across
3206  * different cgroups isn't a common use-case.
3207  *
3208  * Combined with inode majority-writer ownership switching, this works well
3209  * enough in most cases but there are some pathological cases.  For
3210  * example, let's say there are two cgroups A and B which keep writing to
3211  * different but confined parts of the same inode.  B owns the inode and
3212  * A's memory is limited far below B's.  A's dirty ratio can rise enough to
3213  * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
3214  * triggering background writeback.  A will be slowed down without a way to
3215  * make writeback of the dirty pages happen.
3216  *
3217  * Conditions like the above can lead to a cgroup getting repeatedly and
3218  * severely throttled after making some progress after each
3219  * dirty_expire_interval while the underlying IO device is almost
3220  * completely idle.
3221  *
3222  * Solving this problem completely requires matching the ownership tracking
3223  * granularities between memcg and writeback in either direction.  However,
3224  * the more egregious behaviors can be avoided by simply remembering the
3225  * most recent foreign dirtying events and initiating remote flushes on
3226  * them when local writeback isn't enough to keep the memory clean enough.
3227  *
3228  * The following two functions implement such mechanism.  When a foreign
3229  * page - a page whose memcg and writeback ownerships don't match - is
3230  * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
3231  * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
3232  * decides that the memcg needs to sleep due to high dirty ratio, it calls
3233  * mem_cgroup_flush_foreign() which queues writeback on the recorded
3234  * foreign bdi_writebacks which haven't expired.  Both the numbers of
3235  * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3236  * limited to MEMCG_CGWB_FRN_CNT.
3237  *
3238  * The mechanism only remembers IDs and doesn't hold any object references.
3239  * As being wrong occasionally doesn't matter, updates and accesses to the
3240  * records are lockless and racy.
3241  */
3242 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
3243 					     struct bdi_writeback *wb)
3244 {
3245 	struct mem_cgroup *memcg = folio_memcg(folio);
3246 	struct memcg_cgwb_frn *frn;
3247 	u64 now = get_jiffies_64();
3248 	u64 oldest_at = now;
3249 	int oldest = -1;
3250 	int i;
3251 
3252 	trace_track_foreign_dirty(folio, wb);
3253 
3254 	/*
3255 	 * Pick the slot to use.  If there is already a slot for @wb, keep
3256 	 * using it.  If not, replace the oldest one which isn't being
3257 	 * written out.
3258 	 */
3259 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3260 		frn = &memcg->cgwb_frn[i];
3261 		if (frn->bdi_id == wb->bdi->id &&
3262 		    frn->memcg_id == wb->memcg_css->id)
3263 			break;
3264 		if (time_before64(frn->at, oldest_at) &&
3265 		    atomic_read(&frn->done.cnt) == 1) {
3266 			oldest = i;
3267 			oldest_at = frn->at;
3268 		}
3269 	}
3270 
3271 	if (i < MEMCG_CGWB_FRN_CNT) {
3272 		/*
3273 		 * Re-using an existing one.  Update timestamp lazily to
3274 		 * avoid making the cacheline hot.  We want them to be
3275 		 * reasonably up-to-date and significantly shorter than
3276 		 * dirty_expire_interval as that's what expires the record.
3277 		 * Use the shorter of 1s and dirty_expire_interval / 8.
3278 		 */
3279 		unsigned long update_intv =
3280 			min_t(unsigned long, HZ,
3281 			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);
3282 
3283 		if (time_before64(frn->at, now - update_intv))
3284 			frn->at = now;
3285 	} else if (oldest >= 0) {
3286 		/* replace the oldest free one */
3287 		frn = &memcg->cgwb_frn[oldest];
3288 		frn->bdi_id = wb->bdi->id;
3289 		frn->memcg_id = wb->memcg_css->id;
3290 		frn->at = now;
3291 	}
3292 }
3293 
3294 /* issue foreign writeback flushes for recorded foreign dirtying events */
3295 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
3296 {
3297 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3298 	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
3299 	u64 now = jiffies_64;
3300 	int i;
3301 
3302 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
3303 		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
3304 
3305 		/*
3306 		 * If the record is older than dirty_expire_interval,
3307 		 * writeback on it has already started.  No need to kick it
3308 		 * off again.  Also, don't start a new one if there's
3309 		 * already one in flight.
3310 		 */
3311 		if (time_after64(frn->at, now - intv) &&
3312 		    atomic_read(&frn->done.cnt) == 1) {
3313 			frn->at = 0;
3314 			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
3315 			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
3316 					       WB_REASON_FOREIGN_FLUSH,
3317 					       &frn->done);
3318 		}
3319 	}
3320 }
3321 
3322 #else	/* CONFIG_CGROUP_WRITEBACK */
3323 
3324 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3325 {
3326 	return 0;
3327 }
3328 
3329 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3330 {
3331 }
3332 
3333 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3334 {
3335 }
3336 
3337 #endif	/* CONFIG_CGROUP_WRITEBACK */
3338 
3339 /*
3340  * Private memory cgroup IDR
3341  *
3342  * Swap-out records and page cache shadow entries need to store memcg
3343  * references in constrained space, so we maintain an ID space that is
3344  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
3345  * memory-controlled cgroups to 64k.
3346  *
3347  * However, there usually are many references to the offline CSS after
3348  * the cgroup has been destroyed, such as page cache or reclaimable
3349  * slab objects, that don't need to hang on to the ID. We want to keep
3350  * those dead CSS from occupying IDs, or we might quickly exhaust the
3351  * relatively small ID space and prevent the creation of new cgroups
3352  * even when there are much fewer than 64k cgroups - possibly none.
3353  *
3354  * Maintain a private 16-bit ID space for memcg, and allow the ID to
3355  * be freed and recycled when it's no longer needed, which is usually
3356  * when the CSS is offlined.
3357  *
3358  * The only exception to that are records of swapped out tmpfs/shmem
3359  * pages that need to be attributed to live ancestors on swapin. But
3360  * those references are manageable from userspace.
3361  */
3362 
3363 #define MEM_CGROUP_ID_MAX	((1UL << MEM_CGROUP_ID_SHIFT) - 1)
3364 static DEFINE_IDR(mem_cgroup_idr);
3365 static DEFINE_SPINLOCK(memcg_idr_lock);
3366 
3367 static int mem_cgroup_alloc_id(void)
3368 {
3369 	int ret;
3370 
3371 	idr_preload(GFP_KERNEL);
3372 	spin_lock(&memcg_idr_lock);
3373 	ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
3374 			GFP_NOWAIT);
3375 	spin_unlock(&memcg_idr_lock);
3376 	idr_preload_end();
3377 	return ret;
3378 }
3379 
3380 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
3381 {
3382 	if (memcg->id.id > 0) {
3383 		spin_lock(&memcg_idr_lock);
3384 		idr_remove(&mem_cgroup_idr, memcg->id.id);
3385 		spin_unlock(&memcg_idr_lock);
3386 
3387 		memcg->id.id = 0;
3388 	}
3389 }
3390 
3391 void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
3392 					   unsigned int n)
3393 {
3394 	refcount_add(n, &memcg->id.ref);
3395 }
3396 
3397 void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
3398 {
3399 	if (refcount_sub_and_test(n, &memcg->id.ref)) {
3400 		mem_cgroup_id_remove(memcg);
3401 
3402 		/* Memcg ID pins CSS */
3403 		css_put(&memcg->css);
3404 	}
3405 }
3406 
3407 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
3408 {
3409 	mem_cgroup_id_put_many(memcg, 1);
3410 }
3411 
3412 /**
3413  * mem_cgroup_from_id - look up a memcg from a memcg id
3414  * @id: the memcg id to look up
3415  *
3416  * Caller must hold rcu_read_lock().
3417  */
3418 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
3419 {
3420 	WARN_ON_ONCE(!rcu_read_lock_held());
3421 	return idr_find(&mem_cgroup_idr, id);
3422 }
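
/*
 * Illustrative lookup sketch (hypothetical caller): resolve an ID recorded
 * in a swap or shadow entry and pin the memcg before leaving the RCU
 * section.
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_id(id);
 *	if (memcg && !css_tryget_online(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();
 */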
3423 
3424 #ifdef CONFIG_SHRINKER_DEBUG
3425 struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
3426 {
3427 	struct cgroup *cgrp;
3428 	struct cgroup_subsys_state *css;
3429 	struct mem_cgroup *memcg;
3430 
3431 	cgrp = cgroup_get_from_id(ino);
3432 	if (IS_ERR(cgrp))
3433 		return ERR_CAST(cgrp);
3434 
3435 	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
3436 	if (css)
3437 		memcg = container_of(css, struct mem_cgroup, css);
3438 	else
3439 		memcg = ERR_PTR(-ENOENT);
3440 
3441 	cgroup_put(cgrp);
3442 
3443 	return memcg;
3444 }
3445 #endif
3446 
3447 static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3448 {
3449 	struct mem_cgroup_per_node *pn;
3450 
3451 	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
3452 	if (!pn)
3453 		return false;
3454 
3455 	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
3456 					GFP_KERNEL_ACCOUNT, node);
3457 	if (!pn->lruvec_stats)
3458 		goto fail;
3459 
3460 	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
3461 						   GFP_KERNEL_ACCOUNT);
3462 	if (!pn->lruvec_stats_percpu)
3463 		goto fail;
3464 
3465 	lruvec_init(&pn->lruvec);
3466 	pn->memcg = memcg;
3467 
3468 	memcg->nodeinfo[node] = pn;
3469 	return true;
3470 fail:
3471 	kfree(pn->lruvec_stats);
3472 	kfree(pn);
3473 	return false;
3474 }
3475 
3476 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
3477 {
3478 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3479 
3480 	if (!pn)
3481 		return;
3482 
3483 	free_percpu(pn->lruvec_stats_percpu);
3484 	kfree(pn->lruvec_stats);
3485 	kfree(pn);
3486 }
3487 
3488 static void __mem_cgroup_free(struct mem_cgroup *memcg)
3489 {
3490 	int node;
3491 
3492 	obj_cgroup_put(memcg->orig_objcg);
3493 
3494 	for_each_node(node)
3495 		free_mem_cgroup_per_node_info(memcg, node);
3496 	kfree(memcg->vmstats);
3497 	free_percpu(memcg->vmstats_percpu);
3498 	kfree(memcg);
3499 }
3500 
3501 static void mem_cgroup_free(struct mem_cgroup *memcg)
3502 {
3503 	lru_gen_exit_memcg(memcg);
3504 	memcg_wb_domain_exit(memcg);
3505 	__mem_cgroup_free(memcg);
3506 }
3507 
3508 static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
3509 {
3510 	struct memcg_vmstats_percpu *statc, *pstatc;
3511 	struct mem_cgroup *memcg;
3512 	int node, cpu;
3513 	int __maybe_unused i;
3514 	long error = -ENOMEM;
3515 
3516 	memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
3517 	if (!memcg)
3518 		return ERR_PTR(error);
3519 
3520 	memcg->id.id = mem_cgroup_alloc_id();
3521 	if (memcg->id.id < 0) {
3522 		error = memcg->id.id;
3523 		goto fail;
3524 	}
3525 
3526 	memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
3527 				 GFP_KERNEL_ACCOUNT);
3528 	if (!memcg->vmstats)
3529 		goto fail;
3530 
3531 	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
3532 						 GFP_KERNEL_ACCOUNT);
3533 	if (!memcg->vmstats_percpu)
3534 		goto fail;
3535 
3536 	for_each_possible_cpu(cpu) {
3537 		if (parent)
3538 			pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
3539 		statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3540 		statc->parent = parent ? pstatc : NULL;
3541 		statc->vmstats = memcg->vmstats;
3542 	}
3543 
3544 	for_each_node(node)
3545 		if (!alloc_mem_cgroup_per_node_info(memcg, node))
3546 			goto fail;
3547 
3548 	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
3549 		goto fail;
3550 
3551 	INIT_WORK(&memcg->high_work, high_work_func);
3552 	vmpressure_init(&memcg->vmpressure);
3553 	memcg->socket_pressure = jiffies;
3554 	memcg1_memcg_init(memcg);
3555 	memcg->kmemcg_id = -1;
3556 	INIT_LIST_HEAD(&memcg->objcg_list);
3557 #ifdef CONFIG_CGROUP_WRITEBACK
3558 	INIT_LIST_HEAD(&memcg->cgwb_list);
3559 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3560 		memcg->cgwb_frn[i].done =
3561 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
3562 #endif
3563 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3564 	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
3565 	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
3566 	memcg->deferred_split_queue.split_queue_len = 0;
3567 #endif
3568 	lru_gen_init_memcg(memcg);
3569 	return memcg;
3570 fail:
3571 	mem_cgroup_id_remove(memcg);
3572 	__mem_cgroup_free(memcg);
3573 	return ERR_PTR(error);
3574 }
3575 
3576 static struct cgroup_subsys_state * __ref
3577 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
3578 {
3579 	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
3580 	struct mem_cgroup *memcg, *old_memcg;
3581 
3582 	old_memcg = set_active_memcg(parent);
3583 	memcg = mem_cgroup_alloc(parent);
3584 	set_active_memcg(old_memcg);
3585 	if (IS_ERR(memcg))
3586 		return ERR_CAST(memcg);
3587 
3588 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3589 	memcg1_soft_limit_reset(memcg);
3590 #ifdef CONFIG_ZSWAP
3591 	memcg->zswap_max = PAGE_COUNTER_MAX;
3592 	WRITE_ONCE(memcg->zswap_writeback,
3593 		!parent || READ_ONCE(parent->zswap_writeback));
3594 #endif
3595 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3596 	if (parent) {
3597 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
3598 
3599 		page_counter_init(&memcg->memory, &parent->memory);
3600 		page_counter_init(&memcg->swap, &parent->swap);
3601 #ifdef CONFIG_MEMCG_V1
3602 		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
3603 		page_counter_init(&memcg->kmem, &parent->kmem);
3604 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
3605 #endif
3606 	} else {
3607 		init_memcg_stats();
3608 		init_memcg_events();
3609 		page_counter_init(&memcg->memory, NULL);
3610 		page_counter_init(&memcg->swap, NULL);
3611 #ifdef CONFIG_MEMCG_V1
3612 		page_counter_init(&memcg->kmem, NULL);
3613 		page_counter_init(&memcg->tcpmem, NULL);
3614 #endif
3615 		root_mem_cgroup = memcg;
3616 		return &memcg->css;
3617 	}
3618 
3619 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3620 		static_branch_inc(&memcg_sockets_enabled_key);
3621 
3622 	if (!cgroup_memory_nobpf)
3623 		static_branch_inc(&memcg_bpf_enabled_key);
3624 
3625 	return &memcg->css;
3626 }
3627 
3628 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
3629 {
3630 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3631 
3632 	if (memcg_online_kmem(memcg))
3633 		goto remove_id;
3634 
3635 	/*
3636 	 * A memcg must be visible to expand_shrinker_info()
3637 	 * by the time the shrinker maps are allocated. So allocate the
3638 	 * maps here, where for_each_mem_cgroup() can no longer skip it.
3639 	 */
3640 	if (alloc_shrinker_info(memcg))
3641 		goto offline_kmem;
3642 
3643 	if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
3644 		queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
3645 				   FLUSH_TIME);
3646 	lru_gen_online_memcg(memcg);
3647 
3648 	/* Online state pins memcg ID, memcg ID pins CSS */
3649 	refcount_set(&memcg->id.ref, 1);
3650 	css_get(css);
3651 
3652 	/*
3653 	 * Ensure mem_cgroup_from_id() works once we're fully online.
3654 	 *
3655 	 * We could do this earlier and require callers to filter with
3656 	 * css_tryget_online(). But right now there are no users that
3657 	 * need earlier access, and the workingset code relies on the
3658 	 * cgroup tree linkage (mem_cgroup_get_nr_swap_pages()). So
3659 	 * publish it here at the end of onlining. This matches the
3660 	 * regular ID destruction during offlining.
3661 	 */
3662 	spin_lock(&memcg_idr_lock);
3663 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
3664 	spin_unlock(&memcg_idr_lock);
3665 
3666 	return 0;
3667 offline_kmem:
3668 	memcg_offline_kmem(memcg);
3669 remove_id:
3670 	mem_cgroup_id_remove(memcg);
3671 	return -ENOMEM;
3672 }
3673 
3674 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
3675 {
3676 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3677 
3678 	memcg1_css_offline(memcg);
3679 
3680 	page_counter_set_min(&memcg->memory, 0);
3681 	page_counter_set_low(&memcg->memory, 0);
3682 
3683 	zswap_memcg_offline_cleanup(memcg);
3684 
3685 	memcg_offline_kmem(memcg);
3686 	reparent_shrinker_deferred(memcg);
3687 	wb_memcg_offline(memcg);
3688 	lru_gen_offline_memcg(memcg);
3689 
3690 	drain_all_stock(memcg);
3691 
3692 	mem_cgroup_id_put(memcg);
3693 }
3694 
3695 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
3696 {
3697 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3698 
3699 	invalidate_reclaim_iterators(memcg);
3700 	lru_gen_release_memcg(memcg);
3701 }
3702 
3703 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
3704 {
3705 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3706 	int __maybe_unused i;
3707 
3708 #ifdef CONFIG_CGROUP_WRITEBACK
3709 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
3710 		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
3711 #endif
3712 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
3713 		static_branch_dec(&memcg_sockets_enabled_key);
3714 
3715 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg1_tcpmem_active(memcg))
3716 		static_branch_dec(&memcg_sockets_enabled_key);
3717 
3718 	if (!cgroup_memory_nobpf)
3719 		static_branch_dec(&memcg_bpf_enabled_key);
3720 
3721 	vmpressure_cleanup(&memcg->vmpressure);
3722 	cancel_work_sync(&memcg->high_work);
3723 	memcg1_remove_from_trees(memcg);
3724 	free_shrinker_info(memcg);
3725 	mem_cgroup_free(memcg);
3726 }
3727 
3728 /**
3729  * mem_cgroup_css_reset - reset the states of a mem_cgroup
3730  * @css: the target css
3731  *
3732  * Reset the states of the mem_cgroup associated with @css.  This is
3733  * invoked when the userland requests disabling on the default hierarchy
3734  * but the memcg is pinned through dependency.  The memcg should stop
3735  * applying policies and should revert to the vanilla state as it may be
3736  * made visible again.
3737  *
3738  * The current implementation only resets the essential configurations.
3739  * This needs to be expanded to cover all the visible parts.
3740  */
3741 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
3742 {
3743 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3744 
3745 	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
3746 	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
3747 #ifdef CONFIG_MEMCG_V1
3748 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
3749 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
3750 #endif
3751 	page_counter_set_min(&memcg->memory, 0);
3752 	page_counter_set_low(&memcg->memory, 0);
3753 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
3754 	memcg1_soft_limit_reset(memcg);
3755 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
3756 	memcg_wb_domain_size_changed(memcg);
3757 }
3758 
3759 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
3760 {
3761 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3762 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
3763 	struct memcg_vmstats_percpu *statc;
3764 	long delta, delta_cpu, v;
3765 	int i, nid;
3766 
3767 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
3768 
3769 	for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) {
3770 		/*
3771 		 * Collect the aggregated propagation counts of groups
3772 		 * below us. We're in a per-cpu loop here and this is
3773 		 * a global counter, so the first cycle will get them.
3774 		 */
3775 		delta = memcg->vmstats->state_pending[i];
3776 		if (delta)
3777 			memcg->vmstats->state_pending[i] = 0;
3778 
3779 		/* Add CPU changes on this level since the last flush */
3780 		delta_cpu = 0;
3781 		v = READ_ONCE(statc->state[i]);
3782 		if (v != statc->state_prev[i]) {
3783 			delta_cpu = v - statc->state_prev[i];
3784 			delta += delta_cpu;
3785 			statc->state_prev[i] = v;
3786 		}
3787 
3788 		/* Aggregate counts on this level and propagate upwards */
3789 		if (delta_cpu)
3790 			memcg->vmstats->state_local[i] += delta_cpu;
3791 
3792 		if (delta) {
3793 			memcg->vmstats->state[i] += delta;
3794 			if (parent)
3795 				parent->vmstats->state_pending[i] += delta;
3796 		}
3797 	}
3798 
3799 	for (i = 0; i < NR_MEMCG_EVENTS; i++) {
3800 		delta = memcg->vmstats->events_pending[i];
3801 		if (delta)
3802 			memcg->vmstats->events_pending[i] = 0;
3803 
3804 		delta_cpu = 0;
3805 		v = READ_ONCE(statc->events[i]);
3806 		if (v != statc->events_prev[i]) {
3807 			delta_cpu = v - statc->events_prev[i];
3808 			delta += delta_cpu;
3809 			statc->events_prev[i] = v;
3810 		}
3811 
3812 		if (delta_cpu)
3813 			memcg->vmstats->events_local[i] += delta_cpu;
3814 
3815 		if (delta) {
3816 			memcg->vmstats->events[i] += delta;
3817 			if (parent)
3818 				parent->vmstats->events_pending[i] += delta;
3819 		}
3820 	}
3821 
3822 	for_each_node_state(nid, N_MEMORY) {
3823 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
3824 		struct lruvec_stats *lstats = pn->lruvec_stats;
3825 		struct lruvec_stats *plstats = NULL;
3826 		struct lruvec_stats_percpu *lstatc;
3827 
3828 		if (parent)
3829 			plstats = parent->nodeinfo[nid]->lruvec_stats;
3830 
3831 		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
3832 
3833 		for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
3834 			delta = lstats->state_pending[i];
3835 			if (delta)
3836 				lstats->state_pending[i] = 0;
3837 
3838 			delta_cpu = 0;
3839 			v = READ_ONCE(lstatc->state[i]);
3840 			if (v != lstatc->state_prev[i]) {
3841 				delta_cpu = v - lstatc->state_prev[i];
3842 				delta += delta_cpu;
3843 				lstatc->state_prev[i] = v;
3844 			}
3845 
3846 			if (delta_cpu)
3847 				lstats->state_local[i] += delta_cpu;
3848 
3849 			if (delta) {
3850 				lstats->state[i] += delta;
3851 				if (plstats)
3852 					plstats->state_pending[i] += delta;
3853 			}
3854 		}
3855 	}
3856 	WRITE_ONCE(statc->stats_updates, 0);
3857 	/* We are in a per-cpu loop here, only do the atomic write once */
3858 	if (atomic64_read(&memcg->vmstats->stats_updates))
3859 		atomic64_set(&memcg->vmstats->stats_updates, 0);
3860 }
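
/*
 * Illustrative walk-through of the flush above (annotation only, not
 * kernel code): assume child memcg C under parent P, and that on the CPU
 * being flushed statc->state[i] == 7 while statc->state_prev[i] == 4,
 * with C->vmstats->state_pending[i] == 2 left over from C's own children.
 * The flush computes delta_cpu = 7 - 4 = 3 and delta = 2 + 3 = 5, adds 3
 * to C's local (non-hierarchical) counter, adds 5 to C's hierarchical
 * counter, and forwards 5 into P->vmstats->state_pending[i], which P's
 * own flush picks up on its first per-cpu iteration.
 */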
3861 
3862 static void mem_cgroup_fork(struct task_struct *task)
3863 {
3864 	/*
3865 	 * Set the update flag to cause task->objcg to be initialized lazily
3866 	 * on the first allocation. It can be done without any synchronization
3867 	 * because it's always performed on the current task, as is
3868 	 * current_objcg_update().
3869 	 */
3870 	task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG;
3871 }
3872 
3873 static void mem_cgroup_exit(struct task_struct *task)
3874 {
3875 	struct obj_cgroup *objcg = task->objcg;
3876 
3877 	objcg = (struct obj_cgroup *)
3878 		((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
3879 	obj_cgroup_put(objcg);
3880 
3881 	/*
3882 	 * Some kernel allocations can happen after this point,
3883 	 * but let's ignore them. It can be done without any synchronization
3884 	 * because it's always performed on the current task, as is
3885 	 * current_objcg_update().
3886 	 */
3887 	task->objcg = NULL;
3888 }
3889 
3890 #ifdef CONFIG_LRU_GEN
3891 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset)
3892 {
3893 	struct task_struct *task;
3894 	struct cgroup_subsys_state *css;
3895 
3896 	/* find the first leader if there is any */
3897 	cgroup_taskset_for_each_leader(task, css, tset)
3898 		break;
3899 
3900 	if (!task)
3901 		return;
3902 
3903 	task_lock(task);
3904 	if (task->mm && READ_ONCE(task->mm->owner) == task)
3905 		lru_gen_migrate_mm(task->mm);
3906 	task_unlock(task);
3907 }
3908 #else
3909 static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {}
3910 #endif /* CONFIG_LRU_GEN */
3911 
3912 static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset)
3913 {
3914 	struct task_struct *task;
3915 	struct cgroup_subsys_state *css;
3916 
3917 	cgroup_taskset_for_each(task, css, tset) {
3918 		/* atomically set the update bit */
3919 		set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg);
3920 	}
3921 }
3922 
3923 static void mem_cgroup_attach(struct cgroup_taskset *tset)
3924 {
3925 	mem_cgroup_lru_gen_attach(tset);
3926 	mem_cgroup_kmem_attach(tset);
3927 }
3928 
3929 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
3930 {
3931 	if (value == PAGE_COUNTER_MAX)
3932 		seq_puts(m, "max\n");
3933 	else
3934 		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
3935 
3936 	return 0;
3937 }
3938 
3939 static u64 memory_current_read(struct cgroup_subsys_state *css,
3940 			       struct cftype *cft)
3941 {
3942 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3943 
3944 	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
3945 }
3946 
3947 static u64 memory_peak_read(struct cgroup_subsys_state *css,
3948 			    struct cftype *cft)
3949 {
3950 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3951 
3952 	return (u64)memcg->memory.watermark * PAGE_SIZE;
3953 }
3954 
3955 static int memory_min_show(struct seq_file *m, void *v)
3956 {
3957 	return seq_puts_memcg_tunable(m,
3958 		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
3959 }
3960 
3961 static ssize_t memory_min_write(struct kernfs_open_file *of,
3962 				char *buf, size_t nbytes, loff_t off)
3963 {
3964 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3965 	unsigned long min;
3966 	int err;
3967 
3968 	buf = strstrip(buf);
3969 	err = page_counter_memparse(buf, "max", &min);
3970 	if (err)
3971 		return err;
3972 
3973 	page_counter_set_min(&memcg->memory, min);
3974 
3975 	return nbytes;
3976 }
3977 
3978 static int memory_low_show(struct seq_file *m, void *v)
3979 {
3980 	return seq_puts_memcg_tunable(m,
3981 		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
3982 }
3983 
3984 static ssize_t memory_low_write(struct kernfs_open_file *of,
3985 				char *buf, size_t nbytes, loff_t off)
3986 {
3987 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3988 	unsigned long low;
3989 	int err;
3990 
3991 	buf = strstrip(buf);
3992 	err = page_counter_memparse(buf, "max", &low);
3993 	if (err)
3994 		return err;
3995 
3996 	page_counter_set_low(&memcg->memory, low);
3997 
3998 	return nbytes;
3999 }
4000 
4001 static int memory_high_show(struct seq_file *m, void *v)
4002 {
4003 	return seq_puts_memcg_tunable(m,
4004 		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
4005 }
4006 
4007 static ssize_t memory_high_write(struct kernfs_open_file *of,
4008 				 char *buf, size_t nbytes, loff_t off)
4009 {
4010 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4011 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4012 	bool drained = false;
4013 	unsigned long high;
4014 	int err;
4015 
4016 	buf = strstrip(buf);
4017 	err = page_counter_memparse(buf, "max", &high);
4018 	if (err)
4019 		return err;
4020 
4021 	page_counter_set_high(&memcg->memory, high);
4022 
4023 	for (;;) {
4024 		unsigned long nr_pages = page_counter_read(&memcg->memory);
4025 		unsigned long reclaimed;
4026 
4027 		if (nr_pages <= high)
4028 			break;
4029 
4030 		if (signal_pending(current))
4031 			break;
4032 
4033 		if (!drained) {
4034 			drain_all_stock(memcg);
4035 			drained = true;
4036 			continue;
4037 		}
4038 
4039 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
4040 					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
4041 
4042 		if (!reclaimed && !nr_retries--)
4043 			break;
4044 	}
4045 
4046 	memcg_wb_domain_size_changed(memcg);
4047 	return nbytes;
4048 }
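
/*
 * Userspace usage sketch for the handler above (the cgroup path is
 * illustrative):
 *
 *	echo 512M > /sys/fs/cgroup/foo/memory.high
 *
 * The write publishes the new high boundary first and then reclaims
 * synchronously until usage drops below it, the writer is signalled, or
 * the retry budget is exhausted. Unlike memory.max, exceeding memory.high
 * never invokes the OOM killer; overage is handled by reclaim and
 * allocator throttling.
 */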
4049 
4050 static int memory_max_show(struct seq_file *m, void *v)
4051 {
4052 	return seq_puts_memcg_tunable(m,
4053 		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
4054 }
4055 
4056 static ssize_t memory_max_write(struct kernfs_open_file *of,
4057 				char *buf, size_t nbytes, loff_t off)
4058 {
4059 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4060 	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
4061 	bool drained = false;
4062 	unsigned long max;
4063 	int err;
4064 
4065 	buf = strstrip(buf);
4066 	err = page_counter_memparse(buf, "max", &max);
4067 	if (err)
4068 		return err;
4069 
4070 	xchg(&memcg->memory.max, max);
4071 
4072 	for (;;) {
4073 		unsigned long nr_pages = page_counter_read(&memcg->memory);
4074 
4075 		if (nr_pages <= max)
4076 			break;
4077 
4078 		if (signal_pending(current))
4079 			break;
4080 
4081 		if (!drained) {
4082 			drain_all_stock(memcg);
4083 			drained = true;
4084 			continue;
4085 		}
4086 
4087 		if (nr_reclaims) {
4088 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
4089 					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
4090 				nr_reclaims--;
4091 			continue;
4092 		}
4093 
4094 		memcg_memory_event(memcg, MEMCG_OOM);
4095 		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
4096 			break;
4097 	}
4098 
4099 	memcg_wb_domain_size_changed(memcg);
4100 	return nbytes;
4101 }
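
/*
 * Userspace usage sketch for the handler above (illustrative path):
 *
 *	echo 1G > /sys/fs/cgroup/foo/memory.max
 *
 * When the new limit is below current usage, the loop first drains the
 * percpu charge stocks, then keeps reclaiming while tolerating up to
 * MAX_RECLAIM_RETRIES unsuccessful passes; if usage still does not fit,
 * MEMCG_OOM is recorded and the memcg OOM killer is invoked until usage
 * fits or no eligible victim remains.
 */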
4102 
4103 /*
4104  * Note: don't forget to update the 'samples/cgroup/memcg_event_listener'
4105  * if any new events become available.
4106  */
4107 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
4108 {
4109 	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
4110 	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
4111 	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
4112 	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
4113 	seq_printf(m, "oom_kill %lu\n",
4114 		   atomic_long_read(&events[MEMCG_OOM_KILL]));
4115 	seq_printf(m, "oom_group_kill %lu\n",
4116 		   atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
4117 }
4118 
4119 static int memory_events_show(struct seq_file *m, void *v)
4120 {
4121 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4122 
4123 	__memory_events_show(m, memcg->memory_events);
4124 	return 0;
4125 }
4126 
4127 static int memory_events_local_show(struct seq_file *m, void *v)
4128 {
4129 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4130 
4131 	__memory_events_show(m, memcg->memory_events_local);
4132 	return 0;
4133 }
4134 
4135 int memory_stat_show(struct seq_file *m, void *v)
4136 {
4137 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4138 	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4139 	struct seq_buf s;
4140 
4141 	if (!buf)
4142 		return -ENOMEM;
4143 	seq_buf_init(&s, buf, PAGE_SIZE);
4144 	memory_stat_format(memcg, &s);
4145 	seq_puts(m, buf);
4146 	kfree(buf);
4147 	return 0;
4148 }
4149 
4150 #ifdef CONFIG_NUMA
4151 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
4152 						     int item)
4153 {
4154 	return lruvec_page_state(lruvec, item) *
4155 		memcg_page_state_output_unit(item);
4156 }
4157 
4158 static int memory_numa_stat_show(struct seq_file *m, void *v)
4159 {
4160 	int i;
4161 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4162 
4163 	mem_cgroup_flush_stats(memcg);
4164 
4165 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
4166 		int nid;
4167 
4168 		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
4169 			continue;
4170 
4171 		seq_printf(m, "%s", memory_stats[i].name);
4172 		for_each_node_state(nid, N_MEMORY) {
4173 			u64 size;
4174 			struct lruvec *lruvec;
4175 
4176 			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4177 			size = lruvec_page_state_output(lruvec,
4178 							memory_stats[i].idx);
4179 			seq_printf(m, " N%d=%llu", nid, size);
4180 		}
4181 		seq_putc(m, '\n');
4182 	}
4183 
4184 	return 0;
4185 }
4186 #endif
4187 
4188 static int memory_oom_group_show(struct seq_file *m, void *v)
4189 {
4190 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4191 
4192 	seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group));
4193 
4194 	return 0;
4195 }
4196 
4197 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
4198 				      char *buf, size_t nbytes, loff_t off)
4199 {
4200 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4201 	int ret, oom_group;
4202 
4203 	buf = strstrip(buf);
4204 	if (!buf)
4205 		return -EINVAL;
4206 
4207 	ret = kstrtoint(buf, 0, &oom_group);
4208 	if (ret)
4209 		return ret;
4210 
4211 	if (oom_group != 0 && oom_group != 1)
4212 		return -EINVAL;
4213 
4214 	WRITE_ONCE(memcg->oom_group, oom_group);
4215 
4216 	return nbytes;
4217 }
4218 
4219 enum {
4220 	MEMORY_RECLAIM_SWAPPINESS = 0,
4221 	MEMORY_RECLAIM_NULL,
4222 };
4223 
4224 static const match_table_t tokens = {
4225 	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
4226 	{ MEMORY_RECLAIM_NULL, NULL },
4227 };
4228 
4229 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
4230 			      size_t nbytes, loff_t off)
4231 {
4232 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4233 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
4234 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
4235 	int swappiness = -1;
4236 	unsigned int reclaim_options;
4237 	char *old_buf, *start;
4238 	substring_t args[MAX_OPT_ARGS];
4239 
4240 	buf = strstrip(buf);
4241 
4242 	old_buf = buf;
4243 	nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
4244 	if (buf == old_buf)
4245 		return -EINVAL;
4246 
4247 	buf = strstrip(buf);
4248 
4249 	while ((start = strsep(&buf, " ")) != NULL) {
4250 		if (!strlen(start))
4251 			continue;
4252 		switch (match_token(start, tokens, args)) {
4253 		case MEMORY_RECLAIM_SWAPPINESS:
4254 			if (match_int(&args[0], &swappiness))
4255 				return -EINVAL;
4256 			if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
4257 				return -EINVAL;
4258 			break;
4259 		default:
4260 			return -EINVAL;
4261 		}
4262 	}
4263 
4264 	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
4265 	while (nr_reclaimed < nr_to_reclaim) {
4266 		/* Will converge on zero, but reclaim enforces a minimum */
4267 		unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
4268 		unsigned long reclaimed;
4269 
4270 		if (signal_pending(current))
4271 			return -EINTR;
4272 
4273 		/*
4274 		 * This is the final attempt; drain the percpu lru caches in the
4275 		 * hope of introducing more evictable pages for
4276 		 * try_to_free_mem_cgroup_pages().
4277 		 */
4278 		if (!nr_retries)
4279 			lru_add_drain_all();
4280 
4281 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
4282 					batch_size, GFP_KERNEL,
4283 					reclaim_options,
4284 					swappiness == -1 ? NULL : &swappiness);
4285 
4286 		if (!reclaimed && !nr_retries--)
4287 			return -EAGAIN;
4288 
4289 		nr_reclaimed += reclaimed;
4290 	}
4291 
4292 	return nbytes;
4293 }
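
/*
 * Userspace usage sketch for the proactive reclaim interface above
 * (illustrative path and values):
 *
 *	echo "1G" > /sys/fs/cgroup/foo/memory.reclaim
 *	echo "512M swappiness=10" > /sys/fs/cgroup/foo/memory.reclaim
 *
 * The first form reclaims roughly 1G using the default swappiness; the
 * second overrides swappiness for this call only. The write fails with
 * -EAGAIN if the target cannot be met within the retry budget, and with
 * -EINTR if the writer receives a signal.
 */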
4294 
4295 static struct cftype memory_files[] = {
4296 	{
4297 		.name = "current",
4298 		.flags = CFTYPE_NOT_ON_ROOT,
4299 		.read_u64 = memory_current_read,
4300 	},
4301 	{
4302 		.name = "peak",
4303 		.flags = CFTYPE_NOT_ON_ROOT,
4304 		.read_u64 = memory_peak_read,
4305 	},
4306 	{
4307 		.name = "min",
4308 		.flags = CFTYPE_NOT_ON_ROOT,
4309 		.seq_show = memory_min_show,
4310 		.write = memory_min_write,
4311 	},
4312 	{
4313 		.name = "low",
4314 		.flags = CFTYPE_NOT_ON_ROOT,
4315 		.seq_show = memory_low_show,
4316 		.write = memory_low_write,
4317 	},
4318 	{
4319 		.name = "high",
4320 		.flags = CFTYPE_NOT_ON_ROOT,
4321 		.seq_show = memory_high_show,
4322 		.write = memory_high_write,
4323 	},
4324 	{
4325 		.name = "max",
4326 		.flags = CFTYPE_NOT_ON_ROOT,
4327 		.seq_show = memory_max_show,
4328 		.write = memory_max_write,
4329 	},
4330 	{
4331 		.name = "events",
4332 		.flags = CFTYPE_NOT_ON_ROOT,
4333 		.file_offset = offsetof(struct mem_cgroup, events_file),
4334 		.seq_show = memory_events_show,
4335 	},
4336 	{
4337 		.name = "events.local",
4338 		.flags = CFTYPE_NOT_ON_ROOT,
4339 		.file_offset = offsetof(struct mem_cgroup, events_local_file),
4340 		.seq_show = memory_events_local_show,
4341 	},
4342 	{
4343 		.name = "stat",
4344 		.seq_show = memory_stat_show,
4345 	},
4346 #ifdef CONFIG_NUMA
4347 	{
4348 		.name = "numa_stat",
4349 		.seq_show = memory_numa_stat_show,
4350 	},
4351 #endif
4352 	{
4353 		.name = "oom.group",
4354 		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
4355 		.seq_show = memory_oom_group_show,
4356 		.write = memory_oom_group_write,
4357 	},
4358 	{
4359 		.name = "reclaim",
4360 		.flags = CFTYPE_NS_DELEGATABLE,
4361 		.write = memory_reclaim,
4362 	},
4363 	{ }	/* terminate */
4364 };
4365 
4366 struct cgroup_subsys memory_cgrp_subsys = {
4367 	.css_alloc = mem_cgroup_css_alloc,
4368 	.css_online = mem_cgroup_css_online,
4369 	.css_offline = mem_cgroup_css_offline,
4370 	.css_released = mem_cgroup_css_released,
4371 	.css_free = mem_cgroup_css_free,
4372 	.css_reset = mem_cgroup_css_reset,
4373 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
4374 	.attach = mem_cgroup_attach,
4375 	.fork = mem_cgroup_fork,
4376 	.exit = mem_cgroup_exit,
4377 	.dfl_cftypes = memory_files,
4378 #ifdef CONFIG_MEMCG_V1
4379 	.can_attach = memcg1_can_attach,
4380 	.cancel_attach = memcg1_cancel_attach,
4381 	.post_attach = memcg1_move_task,
4382 	.legacy_cftypes = mem_cgroup_legacy_files,
4383 #endif
4384 	.early_init = 0,
4385 };
4386 
4387 /**
4388  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
4389  * @root: the top ancestor of the sub-tree being checked
4390  * @memcg: the memory cgroup to check
4391  *
4392  * WARNING: This function is not stateless! It can only be used as part
4393  *          of a top-down tree iteration, not for isolated queries.
4394  */
4395 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
4396 				     struct mem_cgroup *memcg)
4397 {
4398 	bool recursive_protection =
4399 		cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
4400 
4401 	if (mem_cgroup_disabled())
4402 		return;
4403 
4404 	if (!root)
4405 		root = root_mem_cgroup;
4406 
4407 	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
4408 }
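
/*
 * Sketch of the expected calling pattern (simplified from the reclaim
 * path, not the exact vmscan code): the protection of each memcg must be
 * (re)calculated while walking the tree top-down from @root, e.g.:
 *
 *	memcg = mem_cgroup_iter(root, NULL, NULL);
 *	do {
 *		mem_cgroup_calculate_protection(root, memcg);
 *		if (mem_cgroup_below_min(root, memcg))
 *			continue;	(hard protection, skip this memcg)
 *		...reclaim from memcg...
 *	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 *
 * Querying a memcg in isolation, without first visiting its ancestors,
 * yields stale effective protection values.
 */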
4409 
4410 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
4411 			gfp_t gfp)
4412 {
4413 	int ret;
4414 
4415 	ret = try_charge(memcg, gfp, folio_nr_pages(folio));
4416 	if (ret)
4417 		goto out;
4418 
4419 	mem_cgroup_commit_charge(folio, memcg);
4420 out:
4421 	return ret;
4422 }
4423 
4424 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
4425 {
4426 	struct mem_cgroup *memcg;
4427 	int ret;
4428 
4429 	memcg = get_mem_cgroup_from_mm(mm);
4430 	ret = charge_memcg(folio, memcg, gfp);
4431 	css_put(&memcg->css);
4432 
4433 	return ret;
4434 }
4435 
4436 /**
4437  * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
4438  * @memcg: memcg to charge.
4439  * @gfp: reclaim mode.
4440  * @nr_pages: number of pages to charge.
4441  *
4442  * This function is called when allocating a huge page folio to determine if
4443  * the memcg has the capacity for it. It does not commit the charge yet,
4444  * as the hugetlb folio itself has not been obtained from the hugetlb pool.
4445  *
4446  * Once we have obtained the hugetlb folio, we can call
4447  * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
4448  * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
4449  * of try_charge().
4450  *
4451  * Returns 0 on success. Otherwise, an error code is returned.
4452  */
4453 int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
4454 			long nr_pages)
4455 {
4456 	/*
4457 	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
4458 	 * but do not attempt to commit charge later (or cancel on error) either.
4459 	 */
4460 	if (mem_cgroup_disabled() || !memcg ||
4461 		!cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
4462 		!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
4463 		return -EOPNOTSUPP;
4464 
4465 	if (try_charge(memcg, gfp, nr_pages))
4466 		return -ENOMEM;
4467 
4468 	return 0;
4469 }
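
/*
 * Illustrative calling pattern for the helper above (a simplified sketch
 * of how a hugetlb allocation site is expected to pair the calls;
 * alloc_from_hugetlb_pool() is a hypothetical placeholder):
 *
 *	ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
 *	if (ret == -ENOMEM)
 *		return NULL;
 *	folio = alloc_from_hugetlb_pool();
 *	if (ret == 0) {
 *		if (folio)
 *			mem_cgroup_commit_charge(folio, memcg);
 *		else
 *			mem_cgroup_cancel_charge(memcg, nr_pages);
 *	}
 *	ret == -EOPNOTSUPP means accounting is disabled: proceed with the
 *	allocation but neither commit nor cancel anything.
 */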
4470 
4471 /**
4472  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
4473  * @folio: folio to charge.
4474  * @mm: mm context of the victim
4475  * @gfp: reclaim mode
4476  * @entry: swap entry for which the folio is allocated
4477  *
4478  * This function charges a folio allocated for swapin. Please call this before
4479  * adding the folio to the swapcache.
4480  *
4481  * Returns 0 on success. Otherwise, an error code is returned.
4482  */
4483 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
4484 				  gfp_t gfp, swp_entry_t entry)
4485 {
4486 	struct mem_cgroup *memcg;
4487 	unsigned short id;
4488 	int ret;
4489 
4490 	if (mem_cgroup_disabled())
4491 		return 0;
4492 
4493 	id = lookup_swap_cgroup_id(entry);
4494 	rcu_read_lock();
4495 	memcg = mem_cgroup_from_id(id);
4496 	if (!memcg || !css_tryget_online(&memcg->css))
4497 		memcg = get_mem_cgroup_from_mm(mm);
4498 	rcu_read_unlock();
4499 
4500 	ret = charge_memcg(folio, memcg, gfp);
4501 
4502 	css_put(&memcg->css);
4503 	return ret;
4504 }
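
/*
 * Ordering sketch for a swapin caller (simplified; the swap cache code is
 * the real user): charge the freshly allocated folio before inserting it
 * into the swapcache, then drop the duplicate swap charge:
 *
 *	folio = ...allocate folio for @entry...;
 *	if (mem_cgroup_swapin_charge_folio(folio, mm, gfp, entry))
 *		goto fail_put_folio;
 *	if (add_to_swap_cache(folio, entry, gfp, &shadow))
 *		goto fail_put_folio;
 *	mem_cgroup_swapin_uncharge_swap(entry);
 */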
4505 
4506 /*
4507  * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
4508  * @entry: swap entry for which the page is charged
4509  *
4510  * Call this function after successfully adding the charged page to swapcache.
4511  *
4512  * Note: This function assumes the page for which the swap slot is being
4513  * uncharged is an order-0 page.
4514  */
4515 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
4516 {
4517 	/*
4518 	 * Cgroup1's unified memory+swap counter has been charged with the
4519 	 * new swapcache page, finish the transfer by uncharging the swap
4520 	 * slot. The swap slot would also get uncharged when it dies, but
4521 	 * it can stick around indefinitely and we'd count the page twice
4522 	 * the entire time.
4523 	 *
4524 	 * Cgroup2 has separate resource counters for memory and swap,
4525 	 * so this is a non-issue here. Memory and swap charge lifetimes
4526 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
4527 	 * page to memory here, and uncharge swap when the slot is freed.
4528 	 */
4529 	if (!mem_cgroup_disabled() && do_memsw_account()) {
4530 		/*
4531 		 * The swap entry might not get freed for a long time,
4532 		 * let's not wait for it.  The page already received a
4533 		 * memory+swap charge, drop the swap entry duplicate.
4534 		 */
4535 		mem_cgroup_uncharge_swap(entry, 1);
4536 	}
4537 }
4538 
4539 struct uncharge_gather {
4540 	struct mem_cgroup *memcg;
4541 	unsigned long nr_memory;
4542 	unsigned long pgpgout;
4543 	unsigned long nr_kmem;
4544 	int nid;
4545 };
4546 
4547 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
4548 {
4549 	memset(ug, 0, sizeof(*ug));
4550 }
4551 
4552 static void uncharge_batch(const struct uncharge_gather *ug)
4553 {
4554 	unsigned long flags;
4555 
4556 	if (ug->nr_memory) {
4557 		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
4558 		if (do_memsw_account())
4559 			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
4560 		if (ug->nr_kmem) {
4561 			mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
4562 			memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
4563 		}
4564 		memcg1_oom_recover(ug->memcg);
4565 	}
4566 
4567 	local_irq_save(flags);
4568 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
4569 	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
4570 	memcg1_check_events(ug->memcg, ug->nid);
4571 	local_irq_restore(flags);
4572 
4573 	/* drop reference from uncharge_folio */
4574 	css_put(&ug->memcg->css);
4575 }
4576 
4577 static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
4578 {
4579 	long nr_pages;
4580 	struct mem_cgroup *memcg;
4581 	struct obj_cgroup *objcg;
4582 
4583 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
4584 	VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
4585 			!folio_test_hugetlb(folio) &&
4586 			!list_empty(&folio->_deferred_list), folio);
4587 
4588 	/*
4589 	 * Nobody should be changing or seriously looking at
4590 	 * folio memcg or objcg at this point; we have fully
4591 	 * exclusive access to the folio.
4592 	 */
4593 	if (folio_memcg_kmem(folio)) {
4594 		objcg = __folio_objcg(folio);
4595 		/*
4596 		 * This get matches the put at the end of the function and
4597 		 * kmem pages do not hold memcg references anymore.
4598 		 */
4599 		memcg = get_mem_cgroup_from_objcg(objcg);
4600 	} else {
4601 		memcg = __folio_memcg(folio);
4602 	}
4603 
4604 	if (!memcg)
4605 		return;
4606 
4607 	if (ug->memcg != memcg) {
4608 		if (ug->memcg) {
4609 			uncharge_batch(ug);
4610 			uncharge_gather_clear(ug);
4611 		}
4612 		ug->memcg = memcg;
4613 		ug->nid = folio_nid(folio);
4614 
4615 		/* pairs with css_put in uncharge_batch */
4616 		css_get(&memcg->css);
4617 	}
4618 
4619 	nr_pages = folio_nr_pages(folio);
4620 
4621 	if (folio_memcg_kmem(folio)) {
4622 		ug->nr_memory += nr_pages;
4623 		ug->nr_kmem += nr_pages;
4624 
4625 		folio->memcg_data = 0;
4626 		obj_cgroup_put(objcg);
4627 	} else {
4628 		/* LRU pages aren't accounted at the root level */
4629 		if (!mem_cgroup_is_root(memcg))
4630 			ug->nr_memory += nr_pages;
4631 		ug->pgpgout++;
4632 
4633 		folio->memcg_data = 0;
4634 	}
4635 
4636 	css_put(&memcg->css);
4637 }
4638 
4639 void __mem_cgroup_uncharge(struct folio *folio)
4640 {
4641 	struct uncharge_gather ug;
4642 
4643 	/* Don't touch folio->lru of any random page, pre-check: */
4644 	if (!folio_memcg(folio))
4645 		return;
4646 
4647 	uncharge_gather_clear(&ug);
4648 	uncharge_folio(folio, &ug);
4649 	uncharge_batch(&ug);
4650 }
4651 
4652 void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
4653 {
4654 	struct uncharge_gather ug;
4655 	unsigned int i;
4656 
4657 	uncharge_gather_clear(&ug);
4658 	for (i = 0; i < folios->nr; i++)
4659 		uncharge_folio(folios->folios[i], &ug);
4660 	if (ug.memcg)
4661 		uncharge_batch(&ug);
4662 }
4663 
4664 /**
4665  * mem_cgroup_replace_folio - Charge a folio's replacement.
4666  * @old: Currently circulating folio.
4667  * @new: Replacement folio.
4668  *
4669  * Charge @new as a replacement folio for @old. @old will
4670  * be uncharged upon free.
4671  *
4672  * Both folios must be locked, @new->mapping must be set up.
4673  */
4674 void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
4675 {
4676 	struct mem_cgroup *memcg;
4677 	long nr_pages = folio_nr_pages(new);
4678 	unsigned long flags;
4679 
4680 	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4681 	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4682 	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4683 	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
4684 
4685 	if (mem_cgroup_disabled())
4686 		return;
4687 
4688 	/* Page cache replacement: new folio already charged? */
4689 	if (folio_memcg(new))
4690 		return;
4691 
4692 	memcg = folio_memcg(old);
4693 	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
4694 	if (!memcg)
4695 		return;
4696 
4697 	/* Force-charge the new page. The old one will be freed soon */
4698 	if (!mem_cgroup_is_root(memcg)) {
4699 		page_counter_charge(&memcg->memory, nr_pages);
4700 		if (do_memsw_account())
4701 			page_counter_charge(&memcg->memsw, nr_pages);
4702 	}
4703 
4704 	css_get(&memcg->css);
4705 	commit_charge(new, memcg);
4706 
4707 	local_irq_save(flags);
4708 	mem_cgroup_charge_statistics(memcg, nr_pages);
4709 	memcg1_check_events(memcg, folio_nid(new));
4710 	local_irq_restore(flags);
4711 }
4712 
4713 /**
4714  * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
4715  * @old: Currently circulating folio.
4716  * @new: Replacement folio.
4717  *
4718  * Transfer the memcg data from the old folio to the new folio for migration.
4719  * The old folio's data info will be cleared. Note that the memory counters
4720  * will remain unchanged throughout the process.
4721  *
4722  * Both folios must be locked, @new->mapping must be set up.
4723  */
4724 void mem_cgroup_migrate(struct folio *old, struct folio *new)
4725 {
4726 	struct mem_cgroup *memcg;
4727 
4728 	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
4729 	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
4730 	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
4731 	VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
4732 	VM_BUG_ON_FOLIO(folio_test_lru(old), old);
4733 
4734 	if (mem_cgroup_disabled())
4735 		return;
4736 
4737 	memcg = folio_memcg(old);
4738 	/*
4739 	 * Note that it is normal to see !memcg for a hugetlb folio.
4740 	 * E.g. it could have been allocated when memory_hugetlb_accounting
4741 	 * was not selected.
4742 	 */
4743 	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
4744 	if (!memcg)
4745 		return;
4746 
4747 	/* Transfer the charge and the css ref */
4748 	commit_charge(new, memcg);
4749 	old->memcg_data = 0;
4750 }
4751 
4752 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
4753 EXPORT_SYMBOL(memcg_sockets_enabled_key);
4754 
4755 void mem_cgroup_sk_alloc(struct sock *sk)
4756 {
4757 	struct mem_cgroup *memcg;
4758 
4759 	if (!mem_cgroup_sockets_enabled)
4760 		return;
4761 
4762 	/* Do not associate the sock with an unrelated interrupted task's memcg. */
4763 	if (!in_task())
4764 		return;
4765 
4766 	rcu_read_lock();
4767 	memcg = mem_cgroup_from_task(current);
4768 	if (mem_cgroup_is_root(memcg))
4769 		goto out;
4770 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
4771 		goto out;
4772 	if (css_tryget(&memcg->css))
4773 		sk->sk_memcg = memcg;
4774 out:
4775 	rcu_read_unlock();
4776 }
4777 
4778 void mem_cgroup_sk_free(struct sock *sk)
4779 {
4780 	if (sk->sk_memcg)
4781 		css_put(&sk->sk_memcg->css);
4782 }
4783 
4784 /**
4785  * mem_cgroup_charge_skmem - charge socket memory
4786  * @memcg: memcg to charge
4787  * @nr_pages: number of pages to charge
4788  * @gfp_mask: reclaim mode
4789  *
4790  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
4791  * @memcg's configured limit, %false if it doesn't.
4792  */
4793 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
4794 			     gfp_t gfp_mask)
4795 {
4796 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
4797 		return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
4798 
4799 	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
4800 		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
4801 		return true;
4802 	}
4803 
4804 	return false;
4805 }
4806 
4807 /**
4808  * mem_cgroup_uncharge_skmem - uncharge socket memory
4809  * @memcg: memcg to uncharge
4810  * @nr_pages: number of pages to uncharge
4811  */
4812 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
4813 {
4814 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
4815 		memcg1_uncharge_skmem(memcg, nr_pages);
4816 		return;
4817 	}
4818 
4819 	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
4820 
4821 	refill_stock(memcg, nr_pages);
4822 }
4823 
4824 static int __init cgroup_memory(char *s)
4825 {
4826 	char *token;
4827 
4828 	while ((token = strsep(&s, ",")) != NULL) {
4829 		if (!*token)
4830 			continue;
4831 		if (!strcmp(token, "nosocket"))
4832 			cgroup_memory_nosocket = true;
4833 		if (!strcmp(token, "nokmem"))
4834 			cgroup_memory_nokmem = true;
4835 		if (!strcmp(token, "nobpf"))
4836 			cgroup_memory_nobpf = true;
4837 	}
4838 	return 1;
4839 }
4840 __setup("cgroup.memory=", cgroup_memory);
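
/*
 * Boot-time usage sketch: the parser above takes a comma-separated list
 * on the kernel command line, e.g.
 *
 *	cgroup.memory=nokmem,nosocket
 *
 * which disables kernel-memory and socket-memory accounting while leaving
 * BPF accounting enabled; unrecognized tokens are silently ignored.
 */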
4841 
4842 /*
4843  * subsys_initcall() for memory controller.
4844  *
4845  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
4846  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
4847  * basically everything that doesn't depend on a specific mem_cgroup structure
4848  * should be initialized from here.
4849  */
4850 static int __init mem_cgroup_init(void)
4851 {
4852 	int cpu;
4853 
4854 	/*
4855 	 * Currently the s32 type (see struct batched_lruvec_stat) is used
4856 	 * for per-memcg-per-cpu caching of per-node statistics. For this to
4857 	 * work, we must make sure that the overfill threshold can't
4858 	 * exceed S32_MAX / PAGE_SIZE.
4859 	 */
4860 	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
4861 
4862 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
4863 				  memcg_hotplug_cpu_dead);
4864 
4865 	for_each_possible_cpu(cpu)
4866 		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
4867 			  drain_local_stock);
4868 
4869 	return 0;
4870 }
4871 subsys_initcall(mem_cgroup_init);
4872 
4873 #ifdef CONFIG_SWAP
4874 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
4875 {
4876 	while (!refcount_inc_not_zero(&memcg->id.ref)) {
4877 		/*
4878 		 * The root cgroup cannot be destroyed, so its refcount must
4879 		 * always be >= 1.
4880 		 */
4881 		if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
4882 			VM_BUG_ON(1);
4883 			break;
4884 		}
4885 		memcg = parent_mem_cgroup(memcg);
4886 		if (!memcg)
4887 			memcg = root_mem_cgroup;
4888 	}
4889 	return memcg;
4890 }
4891 
4892 /**
4893  * mem_cgroup_swapout - transfer a memsw charge to swap
4894  * @folio: folio whose memsw charge to transfer
4895  * @entry: swap entry to move the charge to
4896  *
4897  * Transfer the memsw charge of @folio to @entry.
4898  */
4899 void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
4900 {
4901 	struct mem_cgroup *memcg, *swap_memcg;
4902 	unsigned int nr_entries;
4903 	unsigned short oldid;
4904 
4905 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
4906 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
4907 
4908 	if (mem_cgroup_disabled())
4909 		return;
4910 
4911 	if (!do_memsw_account())
4912 		return;
4913 
4914 	memcg = folio_memcg(folio);
4915 
4916 	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
4917 	if (!memcg)
4918 		return;
4919 
4920 	/*
4921 	 * In case the memcg owning these pages has been offlined and doesn't
4922 	 * have an ID allocated to it anymore, charge the closest online
4923 	 * ancestor for the swap instead and transfer the memory+swap charge.
4924 	 */
4925 	swap_memcg = mem_cgroup_id_get_online(memcg);
4926 	nr_entries = folio_nr_pages(folio);
4927 	/* Get references for the tail pages, too */
4928 	if (nr_entries > 1)
4929 		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
4930 	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
4931 				   nr_entries);
4932 	VM_BUG_ON_FOLIO(oldid, folio);
4933 	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
4934 
4935 	folio->memcg_data = 0;
4936 
4937 	if (!mem_cgroup_is_root(memcg))
4938 		page_counter_uncharge(&memcg->memory, nr_entries);
4939 
4940 	if (memcg != swap_memcg) {
4941 		if (!mem_cgroup_is_root(swap_memcg))
4942 			page_counter_charge(&swap_memcg->memsw, nr_entries);
4943 		page_counter_uncharge(&memcg->memsw, nr_entries);
4944 	}
4945 
4946 	/*
4947 	 * Interrupts should be disabled here because the caller holds the
4948 	 * i_pages lock, which is taken with interrupts off. Keeping
4949 	 * interrupts disabled matters because it is the only
4950 	 * synchronisation we have for updating the per-CPU variables.
4951 	 */
4952 	memcg_stats_lock();
4953 	mem_cgroup_charge_statistics(memcg, -nr_entries);
4954 	memcg_stats_unlock();
4955 	memcg1_check_events(memcg, folio_nid(folio));
4956 
4957 	css_put(&memcg->css);
4958 }
4959 
4960 /**
4961  * __mem_cgroup_try_charge_swap - try charging swap space for a folio
4962  * @folio: folio being added to swap
4963  * @entry: swap entry to charge
4964  *
4965  * Try to charge @folio's memcg for the swap space at @entry.
4966  *
4967  * Returns 0 on success, -ENOMEM on failure.
4968  */
4969 int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
4970 {
4971 	unsigned int nr_pages = folio_nr_pages(folio);
4972 	struct page_counter *counter;
4973 	struct mem_cgroup *memcg;
4974 	unsigned short oldid;
4975 
4976 	if (do_memsw_account())
4977 		return 0;
4978 
4979 	memcg = folio_memcg(folio);
4980 
4981 	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
4982 	if (!memcg)
4983 		return 0;
4984 
4985 	if (!entry.val) {
4986 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
4987 		return 0;
4988 	}
4989 
4990 	memcg = mem_cgroup_id_get_online(memcg);
4991 
4992 	if (!mem_cgroup_is_root(memcg) &&
4993 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
4994 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
4995 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
4996 		mem_cgroup_id_put(memcg);
4997 		return -ENOMEM;
4998 	}
4999 
5000 	/* Get references for the tail pages, too */
5001 	if (nr_pages > 1)
5002 		mem_cgroup_id_get_many(memcg, nr_pages - 1);
5003 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
5004 	VM_BUG_ON_FOLIO(oldid, folio);
5005 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
5006 
5007 	return 0;
5008 }
5009 
5010 /**
5011  * __mem_cgroup_uncharge_swap - uncharge swap space
5012  * @entry: swap entry to uncharge
5013  * @nr_pages: the amount of swap space to uncharge
5014  */
5015 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
5016 {
5017 	struct mem_cgroup *memcg;
5018 	unsigned short id;
5019 
5020 	id = swap_cgroup_record(entry, 0, nr_pages);
5021 	rcu_read_lock();
5022 	memcg = mem_cgroup_from_id(id);
5023 	if (memcg) {
5024 		if (!mem_cgroup_is_root(memcg)) {
5025 			if (do_memsw_account())
5026 				page_counter_uncharge(&memcg->memsw, nr_pages);
5027 			else
5028 				page_counter_uncharge(&memcg->swap, nr_pages);
5029 		}
5030 		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
5031 		mem_cgroup_id_put_many(memcg, nr_pages);
5032 	}
5033 	rcu_read_unlock();
5034 }
5035 
5036 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5037 {
5038 	long nr_swap_pages = get_nr_swap_pages();
5039 
5040 	if (mem_cgroup_disabled() || do_memsw_account())
5041 		return nr_swap_pages;
5042 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
5043 		nr_swap_pages = min_t(long, nr_swap_pages,
5044 				      READ_ONCE(memcg->swap.max) -
5045 				      page_counter_read(&memcg->swap));
5046 	return nr_swap_pages;
5047 }
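
/*
 * Worked example for the walk above (illustrative numbers): with 4G of
 * free swap globally, a memcg whose swap.max is 1G at 768M of swap usage,
 * under a parent with swap.max 512M and no usage, gets
 * min(4G, 1G - 768M, 512M - 0) = 256M of swap it may still consume.
 */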
5048 
5049 bool mem_cgroup_swap_full(struct folio *folio)
5050 {
5051 	struct mem_cgroup *memcg;
5052 
5053 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
5054 
5055 	if (vm_swap_full())
5056 		return true;
5057 	if (do_memsw_account())
5058 		return false;
5059 
5060 	memcg = folio_memcg(folio);
5061 	if (!memcg)
5062 		return false;
5063 
5064 	for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
5065 		unsigned long usage = page_counter_read(&memcg->swap);
5066 
5067 		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
5068 		    usage * 2 >= READ_ONCE(memcg->swap.max))
5069 			return true;
5070 	}
5071 
5072 	return false;
5073 }
5074 
5075 static int __init setup_swap_account(char *s)
5076 {
5077 	bool res;
5078 
5079 	if (!kstrtobool(s, &res) && !res)
5080 		pr_warn_once("The swapaccount=0 commandline option is deprecated "
5081 			     "in favor of configuring swap control via cgroupfs. "
5082 			     "Please report your usecase to linux-mm@kvack.org if you "
5083 			     "depend on this functionality.\n");
5084 	return 1;
5085 }
5086 __setup("swapaccount=", setup_swap_account);
5087 
5088 static u64 swap_current_read(struct cgroup_subsys_state *css,
5089 			     struct cftype *cft)
5090 {
5091 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5092 
5093 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
5094 }
5095 
5096 static u64 swap_peak_read(struct cgroup_subsys_state *css,
5097 			  struct cftype *cft)
5098 {
5099 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5100 
5101 	return (u64)memcg->swap.watermark * PAGE_SIZE;
5102 }
5103 
5104 static int swap_high_show(struct seq_file *m, void *v)
5105 {
5106 	return seq_puts_memcg_tunable(m,
5107 		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
5108 }
5109 
5110 static ssize_t swap_high_write(struct kernfs_open_file *of,
5111 			       char *buf, size_t nbytes, loff_t off)
5112 {
5113 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5114 	unsigned long high;
5115 	int err;
5116 
5117 	buf = strstrip(buf);
5118 	err = page_counter_memparse(buf, "max", &high);
5119 	if (err)
5120 		return err;
5121 
5122 	page_counter_set_high(&memcg->swap, high);
5123 
5124 	return nbytes;
5125 }
5126 
5127 static int swap_max_show(struct seq_file *m, void *v)
5128 {
5129 	return seq_puts_memcg_tunable(m,
5130 		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
5131 }
5132 
5133 static ssize_t swap_max_write(struct kernfs_open_file *of,
5134 			      char *buf, size_t nbytes, loff_t off)
5135 {
5136 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5137 	unsigned long max;
5138 	int err;
5139 
5140 	buf = strstrip(buf);
5141 	err = page_counter_memparse(buf, "max", &max);
5142 	if (err)
5143 		return err;
5144 
5145 	xchg(&memcg->swap.max, max);
5146 
5147 	return nbytes;
5148 }
5149 
5150 static int swap_events_show(struct seq_file *m, void *v)
5151 {
5152 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5153 
5154 	seq_printf(m, "high %lu\n",
5155 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
5156 	seq_printf(m, "max %lu\n",
5157 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
5158 	seq_printf(m, "fail %lu\n",
5159 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
5160 
5161 	return 0;
5162 }
5163 
5164 static struct cftype swap_files[] = {
5165 	{
5166 		.name = "swap.current",
5167 		.flags = CFTYPE_NOT_ON_ROOT,
5168 		.read_u64 = swap_current_read,
5169 	},
5170 	{
5171 		.name = "swap.high",
5172 		.flags = CFTYPE_NOT_ON_ROOT,
5173 		.seq_show = swap_high_show,
5174 		.write = swap_high_write,
5175 	},
5176 	{
5177 		.name = "swap.max",
5178 		.flags = CFTYPE_NOT_ON_ROOT,
5179 		.seq_show = swap_max_show,
5180 		.write = swap_max_write,
5181 	},
5182 	{
5183 		.name = "swap.peak",
5184 		.flags = CFTYPE_NOT_ON_ROOT,
5185 		.read_u64 = swap_peak_read,
5186 	},
5187 	{
5188 		.name = "swap.events",
5189 		.flags = CFTYPE_NOT_ON_ROOT,
5190 		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
5191 		.seq_show = swap_events_show,
5192 	},
5193 	{ }	/* terminate */
5194 };
5195 
5196 #ifdef CONFIG_ZSWAP
5197 /**
5198  * obj_cgroup_may_zswap - check if this cgroup can zswap
5199  * @objcg: the object cgroup
5200  *
5201  * Check if the hierarchical zswap limit has been reached.
5202  *
5203  * This doesn't check for specific headroom, and it is not atomic
5204  * either. But with zswap, the size of the allocation is only known
5205  * once compression has occurred, and this optimistic pre-check avoids
5206  * spending cycles on compression when there is already no room left
5207  * or zswap is disabled altogether somewhere in the hierarchy.
5208  */
5209 bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
5210 {
5211 	struct mem_cgroup *memcg, *original_memcg;
5212 	bool ret = true;
5213 
5214 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5215 		return true;
5216 
5217 	original_memcg = get_mem_cgroup_from_objcg(objcg);
5218 	for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
5219 	     memcg = parent_mem_cgroup(memcg)) {
5220 		unsigned long max = READ_ONCE(memcg->zswap_max);
5221 		unsigned long pages;
5222 
5223 		if (max == PAGE_COUNTER_MAX)
5224 			continue;
5225 		if (max == 0) {
5226 			ret = false;
5227 			break;
5228 		}
5229 
5230 		/*
5231 		 * mem_cgroup_flush_stats() ignores small changes. Use
5232 		 * do_flush_stats() directly to get accurate stats for charging.
5233 		 */
5234 		do_flush_stats(memcg);
5235 		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
5236 		if (pages < max)
5237 			continue;
5238 		ret = false;
5239 		break;
5240 	}
5241 	mem_cgroup_put(original_memcg);
5242 	return ret;
5243 }
5244 
5245 /**
5246  * obj_cgroup_charge_zswap - charge compression backend memory
5247  * @objcg: the object cgroup
5248  * @size: size of compressed object
5249  *
5250  * This forces the charge after obj_cgroup_may_zswap() allowed
5251  * compression and storage in zswap for this cgroup to go ahead.
5252  */
5253 void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
5254 {
5255 	struct mem_cgroup *memcg;
5256 
5257 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5258 		return;
5259 
5260 	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
5261 
5262 	/* PF_MEMALLOC context, charging must succeed */
5263 	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
5264 		VM_WARN_ON_ONCE(1);
5265 
5266 	rcu_read_lock();
5267 	memcg = obj_cgroup_memcg(objcg);
5268 	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
5269 	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
5270 	rcu_read_unlock();
5271 }
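
/*
 * Sketch of the intended pairing (simplified from the zswap store path,
 * not the exact mm/zswap.c code): the cheap pre-check runs before
 * compression, and the charge is issued only once the compressed size is
 * known:
 *
 *	if (objcg && !obj_cgroup_may_zswap(objcg))
 *		goto reject;
 *	...compress the page, producing compressed_len bytes...
 *	if (objcg)
 *		obj_cgroup_charge_zswap(objcg, compressed_len);
 */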
5272 
5273 /**
5274  * obj_cgroup_uncharge_zswap - uncharge compression backend memory
5275  * @objcg: the object cgroup
5276  * @size: size of compressed object
5277  *
5278  * Uncharges zswap memory on page in.
5279  */
5280 void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
5281 {
5282 	struct mem_cgroup *memcg;
5283 
5284 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
5285 		return;
5286 
5287 	obj_cgroup_uncharge(objcg, size);
5288 
5289 	rcu_read_lock();
5290 	memcg = obj_cgroup_memcg(objcg);
5291 	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
5292 	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
5293 	rcu_read_unlock();
5294 }
5295 
5296 bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
5297 {
5298 	/* if zswap is disabled, do not block pages going to the swapping device */
5299 	return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback);
5300 }
5301 
5302 static u64 zswap_current_read(struct cgroup_subsys_state *css,
5303 			      struct cftype *cft)
5304 {
5305 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5306 
5307 	mem_cgroup_flush_stats(memcg);
5308 	return memcg_page_state(memcg, MEMCG_ZSWAP_B);
5309 }
5310 
5311 static int zswap_max_show(struct seq_file *m, void *v)
5312 {
5313 	return seq_puts_memcg_tunable(m,
5314 		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
5315 }
5316 
5317 static ssize_t zswap_max_write(struct kernfs_open_file *of,
5318 			       char *buf, size_t nbytes, loff_t off)
5319 {
5320 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5321 	unsigned long max;
5322 	int err;
5323 
5324 	buf = strstrip(buf);
5325 	err = page_counter_memparse(buf, "max", &max);
5326 	if (err)
5327 		return err;
5328 
5329 	xchg(&memcg->zswap_max, max);
5330 
5331 	return nbytes;
5332 }
5333 
5334 static int zswap_writeback_show(struct seq_file *m, void *v)
5335 {
5336 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
5337 
5338 	seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback));
5339 	return 0;
5340 }
5341 
5342 static ssize_t zswap_writeback_write(struct kernfs_open_file *of,
5343 				char *buf, size_t nbytes, loff_t off)
5344 {
5345 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5346 	int zswap_writeback;
5347 	ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback);
5348 
5349 	if (parse_ret)
5350 		return parse_ret;
5351 
5352 	if (zswap_writeback != 0 && zswap_writeback != 1)
5353 		return -EINVAL;
5354 
5355 	WRITE_ONCE(memcg->zswap_writeback, zswap_writeback);
5356 	return nbytes;
5357 }
5358 
5359 static struct cftype zswap_files[] = {
5360 	{
5361 		.name = "zswap.current",
5362 		.flags = CFTYPE_NOT_ON_ROOT,
5363 		.read_u64 = zswap_current_read,
5364 	},
5365 	{
5366 		.name = "zswap.max",
5367 		.flags = CFTYPE_NOT_ON_ROOT,
5368 		.seq_show = zswap_max_show,
5369 		.write = zswap_max_write,
5370 	},
5371 	{
5372 		.name = "zswap.writeback",
5373 		.seq_show = zswap_writeback_show,
5374 		.write = zswap_writeback_write,
5375 	},
5376 	{ }	/* terminate */
5377 };
5378 #endif /* CONFIG_ZSWAP */
5379 
5380 static int __init mem_cgroup_swap_init(void)
5381 {
5382 	if (mem_cgroup_disabled())
5383 		return 0;
5384 
5385 	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
5386 #ifdef CONFIG_MEMCG_V1
5387 	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
5388 #endif
5389 #ifdef CONFIG_ZSWAP
5390 	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
5391 #endif
5392 	return 0;
5393 }
5394 subsys_initcall(mem_cgroup_swap_init);
5395 
5396 #endif /* CONFIG_SWAP */
5397