xref: /linux/mm/memcontrol-v1.c (revision 9d8a2b033db179bef9b6b5bad492f611a0fe89b7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/memcontrol.h>
4 #include <linux/swap.h>
5 #include <linux/mm_inline.h>
6 #include <linux/pagewalk.h>
7 #include <linux/backing-dev.h>
8 #include <linux/swap_cgroup.h>
9 #include <linux/eventfd.h>
10 #include <linux/poll.h>
11 #include <linux/sort.h>
12 #include <linux/file.h>
13 #include <linux/seq_buf.h>
14 
15 #include "internal.h"
16 #include "swap.h"
17 #include "memcontrol-v1.h"
18 
19 /*
20  * Cgroups above their limits are maintained in a RB-Tree, independent of
21  * their hierarchy representation
22  */
23 
24 struct mem_cgroup_tree_per_node {
25 	struct rb_root rb_root;
26 	struct rb_node *rb_rightmost;
27 	spinlock_t lock;
28 };
29 
30 struct mem_cgroup_tree {
31 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
32 };
33 
34 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
35 
/*
 * Loop bounds for soft limit reclaim.  They guarantee termination should
 * the reclaim loops ever fail to make progress.
 */

/* Bound on the hierarchy walks done by mem_cgroup_soft_reclaim(). */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
/* Bound on the fruitless rounds of memcg1_soft_limit_reclaim(). */
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
42 
/*
 * State and helpers for moving charges along with a migrating task.
 *
 * The kinds of charges that can be moved, one bit each.
 */
#define MOVE_ANON	(1ULL << 0)
#define MOVE_FILE	(1ULL << 1)
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
50 
51 /* "mc" and its members are protected by cgroup_mutex */
52 static struct move_charge_struct {
53 	spinlock_t	  lock; /* for from, to */
54 	struct mm_struct  *mm;
55 	struct mem_cgroup *from;
56 	struct mem_cgroup *to;
57 	unsigned long flags;
58 	unsigned long precharge;
59 	unsigned long moved_charge;
60 	unsigned long moved_swap;
61 	struct task_struct *moving_task;	/* a task moving charges */
62 	wait_queue_head_t waitq;		/* a waitq for other context */
63 } mc = {
64 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
65 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
66 };
67 
68 /* for OOM */
69 struct mem_cgroup_eventfd_list {
70 	struct list_head list;
71 	struct eventfd_ctx *eventfd;
72 };
73 
74 /*
75  * cgroup_event represents events which userspace want to receive.
76  */
77 struct mem_cgroup_event {
78 	/*
79 	 * memcg which the event belongs to.
80 	 */
81 	struct mem_cgroup *memcg;
82 	/*
83 	 * eventfd to signal userspace about the event.
84 	 */
85 	struct eventfd_ctx *eventfd;
86 	/*
87 	 * Each of these stored in a list by the cgroup.
88 	 */
89 	struct list_head list;
90 	/*
91 	 * register_event() callback will be used to add new userspace
92 	 * waiter for changes related to this event.  Use eventfd_signal()
93 	 * on eventfd to send notification to userspace.
94 	 */
95 	int (*register_event)(struct mem_cgroup *memcg,
96 			      struct eventfd_ctx *eventfd, const char *args);
97 	/*
98 	 * unregister_event() callback will be called when userspace closes
99 	 * the eventfd or on cgroup removing.  This callback must be set,
100 	 * if you want provide notification functionality.
101 	 */
102 	void (*unregister_event)(struct mem_cgroup *memcg,
103 				 struct eventfd_ctx *eventfd);
104 	/*
105 	 * All fields below needed to unregister event when
106 	 * userspace closes eventfd.
107 	 */
108 	poll_table pt;
109 	wait_queue_head_t *wqh;
110 	wait_queue_entry_t wait;
111 	struct work_struct remove;
112 };
113 
/*
 * Pack two values into one integer: @x goes above bit 16, @val occupies
 * the low 16 bits.  MEMFILE_TYPE() and MEMFILE_ATTR() recover the two halves.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
117 
/*
 * Identifiers for the per-counter values.
 * NOTE(review): presumably used as the attribute half of a
 * MEMFILE_PRIVATE() value -- confirm against the users of these constants.
 */
enum {
	RES_USAGE	= 0,
	RES_LIMIT	= 1,
	RES_MAX_USAGE	= 2,
	RES_FAILCNT	= 3,
	RES_SOFT_LIMIT	= 4,
};
125 
#ifdef CONFIG_LOCKDEP
/* Lockdep map named after memcg_oom_lock; only built with CONFIG_LOCKDEP. */
static struct lockdep_map memcg_oom_lock_dep_map = { .name = "memcg_oom_lock" };
#endif

/* Spinlock with external linkage, so it is usable outside this file. */
DEFINE_SPINLOCK(memcg_oom_lock);
133 
134 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
135 					 struct mem_cgroup_tree_per_node *mctz,
136 					 unsigned long new_usage_in_excess)
137 {
138 	struct rb_node **p = &mctz->rb_root.rb_node;
139 	struct rb_node *parent = NULL;
140 	struct mem_cgroup_per_node *mz_node;
141 	bool rightmost = true;
142 
143 	if (mz->on_tree)
144 		return;
145 
146 	mz->usage_in_excess = new_usage_in_excess;
147 	if (!mz->usage_in_excess)
148 		return;
149 	while (*p) {
150 		parent = *p;
151 		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
152 					tree_node);
153 		if (mz->usage_in_excess < mz_node->usage_in_excess) {
154 			p = &(*p)->rb_left;
155 			rightmost = false;
156 		} else {
157 			p = &(*p)->rb_right;
158 		}
159 	}
160 
161 	if (rightmost)
162 		mctz->rb_rightmost = &mz->tree_node;
163 
164 	rb_link_node(&mz->tree_node, parent, p);
165 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
166 	mz->on_tree = true;
167 }
168 
169 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
170 					 struct mem_cgroup_tree_per_node *mctz)
171 {
172 	if (!mz->on_tree)
173 		return;
174 
175 	if (&mz->tree_node == mctz->rb_rightmost)
176 		mctz->rb_rightmost = rb_prev(&mz->tree_node);
177 
178 	rb_erase(&mz->tree_node, &mctz->rb_root);
179 	mz->on_tree = false;
180 }
181 
182 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
183 				       struct mem_cgroup_tree_per_node *mctz)
184 {
185 	unsigned long flags;
186 
187 	spin_lock_irqsave(&mctz->lock, flags);
188 	__mem_cgroup_remove_exceeded(mz, mctz);
189 	spin_unlock_irqrestore(&mctz->lock, flags);
190 }
191 
192 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
193 {
194 	unsigned long nr_pages = page_counter_read(&memcg->memory);
195 	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
196 	unsigned long excess = 0;
197 
198 	if (nr_pages > soft_limit)
199 		excess = nr_pages - soft_limit;
200 
201 	return excess;
202 }
203 
204 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
205 {
206 	unsigned long excess;
207 	struct mem_cgroup_per_node *mz;
208 	struct mem_cgroup_tree_per_node *mctz;
209 
210 	if (lru_gen_enabled()) {
211 		if (soft_limit_excess(memcg))
212 			lru_gen_soft_reclaim(memcg, nid);
213 		return;
214 	}
215 
216 	mctz = soft_limit_tree.rb_tree_per_node[nid];
217 	if (!mctz)
218 		return;
219 	/*
220 	 * Necessary to update all ancestors when hierarchy is used.
221 	 * because their event counter is not touched.
222 	 */
223 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
224 		mz = memcg->nodeinfo[nid];
225 		excess = soft_limit_excess(memcg);
226 		/*
227 		 * We have to update the tree if mz is on RB-tree or
228 		 * mem is over its softlimit.
229 		 */
230 		if (excess || mz->on_tree) {
231 			unsigned long flags;
232 
233 			spin_lock_irqsave(&mctz->lock, flags);
234 			/* if on-tree, remove it */
235 			if (mz->on_tree)
236 				__mem_cgroup_remove_exceeded(mz, mctz);
237 			/*
238 			 * Insert again. mz->usage_in_excess will be updated.
239 			 * If excess is 0, no tree ops.
240 			 */
241 			__mem_cgroup_insert_exceeded(mz, mctz, excess);
242 			spin_unlock_irqrestore(&mctz->lock, flags);
243 		}
244 	}
245 }
246 
247 void memcg1_remove_from_trees(struct mem_cgroup *memcg)
248 {
249 	struct mem_cgroup_tree_per_node *mctz;
250 	struct mem_cgroup_per_node *mz;
251 	int nid;
252 
253 	for_each_node(nid) {
254 		mz = memcg->nodeinfo[nid];
255 		mctz = soft_limit_tree.rb_tree_per_node[nid];
256 		if (mctz)
257 			mem_cgroup_remove_exceeded(mz, mctz);
258 	}
259 }
260 
261 static struct mem_cgroup_per_node *
262 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
263 {
264 	struct mem_cgroup_per_node *mz;
265 
266 retry:
267 	mz = NULL;
268 	if (!mctz->rb_rightmost)
269 		goto done;		/* Nothing to reclaim from */
270 
271 	mz = rb_entry(mctz->rb_rightmost,
272 		      struct mem_cgroup_per_node, tree_node);
273 	/*
274 	 * Remove the node now but someone else can add it back,
275 	 * we will to add it back at the end of reclaim to its correct
276 	 * position in the tree.
277 	 */
278 	__mem_cgroup_remove_exceeded(mz, mctz);
279 	if (!soft_limit_excess(mz->memcg) ||
280 	    !css_tryget(&mz->memcg->css))
281 		goto retry;
282 done:
283 	return mz;
284 }
285 
286 static struct mem_cgroup_per_node *
287 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
288 {
289 	struct mem_cgroup_per_node *mz;
290 
291 	spin_lock_irq(&mctz->lock);
292 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
293 	spin_unlock_irq(&mctz->lock);
294 	return mz;
295 }
296 
297 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
298 				   pg_data_t *pgdat,
299 				   gfp_t gfp_mask,
300 				   unsigned long *total_scanned)
301 {
302 	struct mem_cgroup *victim = NULL;
303 	int total = 0;
304 	int loop = 0;
305 	unsigned long excess;
306 	unsigned long nr_scanned;
307 	struct mem_cgroup_reclaim_cookie reclaim = {
308 		.pgdat = pgdat,
309 	};
310 
311 	excess = soft_limit_excess(root_memcg);
312 
313 	while (1) {
314 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
315 		if (!victim) {
316 			loop++;
317 			if (loop >= 2) {
318 				/*
319 				 * If we have not been able to reclaim
320 				 * anything, it might because there are
321 				 * no reclaimable pages under this hierarchy
322 				 */
323 				if (!total)
324 					break;
325 				/*
326 				 * We want to do more targeted reclaim.
327 				 * excess >> 2 is not to excessive so as to
328 				 * reclaim too much, nor too less that we keep
329 				 * coming back to reclaim from this cgroup
330 				 */
331 				if (total >= (excess >> 2) ||
332 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
333 					break;
334 			}
335 			continue;
336 		}
337 		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
338 					pgdat, &nr_scanned);
339 		*total_scanned += nr_scanned;
340 		if (!soft_limit_excess(root_memcg))
341 			break;
342 	}
343 	mem_cgroup_iter_break(root_memcg, victim);
344 	return total;
345 }
346 
347 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
348 					    gfp_t gfp_mask,
349 					    unsigned long *total_scanned)
350 {
351 	unsigned long nr_reclaimed = 0;
352 	struct mem_cgroup_per_node *mz, *next_mz = NULL;
353 	unsigned long reclaimed;
354 	int loop = 0;
355 	struct mem_cgroup_tree_per_node *mctz;
356 	unsigned long excess;
357 
358 	if (lru_gen_enabled())
359 		return 0;
360 
361 	if (order > 0)
362 		return 0;
363 
364 	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
365 
366 	/*
367 	 * Do not even bother to check the largest node if the root
368 	 * is empty. Do it lockless to prevent lock bouncing. Races
369 	 * are acceptable as soft limit is best effort anyway.
370 	 */
371 	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
372 		return 0;
373 
374 	/*
375 	 * This loop can run a while, specially if mem_cgroup's continuously
376 	 * keep exceeding their soft limit and putting the system under
377 	 * pressure
378 	 */
379 	do {
380 		if (next_mz)
381 			mz = next_mz;
382 		else
383 			mz = mem_cgroup_largest_soft_limit_node(mctz);
384 		if (!mz)
385 			break;
386 
387 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
388 						    gfp_mask, total_scanned);
389 		nr_reclaimed += reclaimed;
390 		spin_lock_irq(&mctz->lock);
391 
392 		/*
393 		 * If we failed to reclaim anything from this memory cgroup
394 		 * it is time to move on to the next cgroup
395 		 */
396 		next_mz = NULL;
397 		if (!reclaimed)
398 			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
399 
400 		excess = soft_limit_excess(mz->memcg);
401 		/*
402 		 * One school of thought says that we should not add
403 		 * back the node to the tree if reclaim returns 0.
404 		 * But our reclaim could return 0, simply because due
405 		 * to priority we are exposing a smaller subset of
406 		 * memory to reclaim from. Consider this as a longer
407 		 * term TODO.
408 		 */
409 		/* If excess == 0, no tree ops */
410 		__mem_cgroup_insert_exceeded(mz, mctz, excess);
411 		spin_unlock_irq(&mctz->lock);
412 		css_put(&mz->memcg->css);
413 		loop++;
414 		/*
415 		 * Could not reclaim anything and there are no more
416 		 * mem cgroups to try or we seem to be looping without
417 		 * reclaiming anything.
418 		 */
419 		if (!nr_reclaimed &&
420 			(next_mz == NULL ||
421 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
422 			break;
423 	} while (!nr_reclaimed);
424 	if (next_mz)
425 		css_put(&next_mz->memcg->css);
426 	return nr_reclaimed;
427 }
428 
429 /*
430  * A routine for checking "mem" is under move_account() or not.
431  *
432  * Checking a cgroup is mc.from or mc.to or under hierarchy of
433  * moving cgroups. This is for waiting at high-memory pressure
434  * caused by "move".
435  */
436 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
437 {
438 	struct mem_cgroup *from;
439 	struct mem_cgroup *to;
440 	bool ret = false;
441 	/*
442 	 * Unlike task_move routines, we access mc.to, mc.from not under
443 	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
444 	 */
445 	spin_lock(&mc.lock);
446 	from = mc.from;
447 	to = mc.to;
448 	if (!from)
449 		goto unlock;
450 
451 	ret = mem_cgroup_is_descendant(from, memcg) ||
452 		mem_cgroup_is_descendant(to, memcg);
453 unlock:
454 	spin_unlock(&mc.lock);
455 	return ret;
456 }
457 
458 bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
459 {
460 	if (mc.moving_task && current != mc.moving_task) {
461 		if (mem_cgroup_under_move(memcg)) {
462 			DEFINE_WAIT(wait);
463 			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
464 			/* moving charge context might have finished. */
465 			if (mc.moving_task)
466 				schedule();
467 			finish_wait(&mc.waitq, &wait);
468 			return true;
469 		}
470 	}
471 	return false;
472 }
473 
474 /**
475  * folio_memcg_lock - Bind a folio to its memcg.
476  * @folio: The folio.
477  *
478  * This function prevents unlocked LRU folios from being moved to
479  * another cgroup.
480  *
481  * It ensures lifetime of the bound memcg.  The caller is responsible
482  * for the lifetime of the folio.
483  */
484 void folio_memcg_lock(struct folio *folio)
485 {
486 	struct mem_cgroup *memcg;
487 	unsigned long flags;
488 
489 	/*
490 	 * The RCU lock is held throughout the transaction.  The fast
491 	 * path can get away without acquiring the memcg->move_lock
492 	 * because page moving starts with an RCU grace period.
493          */
494 	rcu_read_lock();
495 
496 	if (mem_cgroup_disabled())
497 		return;
498 again:
499 	memcg = folio_memcg(folio);
500 	if (unlikely(!memcg))
501 		return;
502 
503 #ifdef CONFIG_PROVE_LOCKING
504 	local_irq_save(flags);
505 	might_lock(&memcg->move_lock);
506 	local_irq_restore(flags);
507 #endif
508 
509 	if (atomic_read(&memcg->moving_account) <= 0)
510 		return;
511 
512 	spin_lock_irqsave(&memcg->move_lock, flags);
513 	if (memcg != folio_memcg(folio)) {
514 		spin_unlock_irqrestore(&memcg->move_lock, flags);
515 		goto again;
516 	}
517 
518 	/*
519 	 * When charge migration first begins, we can have multiple
520 	 * critical sections holding the fast-path RCU lock and one
521 	 * holding the slowpath move_lock. Track the task who has the
522 	 * move_lock for folio_memcg_unlock().
523 	 */
524 	memcg->move_lock_task = current;
525 	memcg->move_lock_flags = flags;
526 }
527 
528 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
529 {
530 	if (memcg && memcg->move_lock_task == current) {
531 		unsigned long flags = memcg->move_lock_flags;
532 
533 		memcg->move_lock_task = NULL;
534 		memcg->move_lock_flags = 0;
535 
536 		spin_unlock_irqrestore(&memcg->move_lock, flags);
537 	}
538 
539 	rcu_read_unlock();
540 }
541 
/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock().  This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
void folio_memcg_unlock(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	__folio_memcg_unlock(memcg);
}
554 
555 #ifdef CONFIG_SWAP
556 /**
557  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
558  * @entry: swap entry to be moved
559  * @from:  mem_cgroup which the entry is moved from
560  * @to:  mem_cgroup which the entry is moved to
561  *
562  * It succeeds only when the swap_cgroup's record for this entry is the same
563  * as the mem_cgroup's id of @from.
564  *
565  * Returns 0 on success, -EINVAL on failure.
566  *
567  * The caller must have charged to @to, IOW, called page_counter_charge() about
568  * both res and memsw, and called css_get().
569  */
570 static int mem_cgroup_move_swap_account(swp_entry_t entry,
571 				struct mem_cgroup *from, struct mem_cgroup *to)
572 {
573 	unsigned short old_id, new_id;
574 
575 	old_id = mem_cgroup_id(from);
576 	new_id = mem_cgroup_id(to);
577 
578 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
579 		mod_memcg_state(from, MEMCG_SWAP, -1);
580 		mod_memcg_state(to, MEMCG_SWAP, 1);
581 		return 0;
582 	}
583 	return -EINVAL;
584 }
585 #else
586 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
587 				struct mem_cgroup *from, struct mem_cgroup *to)
588 {
589 	return -EINVAL;
590 }
591 #endif
592 
593 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
594 				struct cftype *cft)
595 {
596 	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
597 }
598 
599 #ifdef CONFIG_MMU
600 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
601 				 struct cftype *cft, u64 val)
602 {
603 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
604 
605 	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
606 		     "Please report your usecase to linux-mm@kvack.org if you "
607 		     "depend on this functionality.\n");
608 
609 	if (val & ~MOVE_MASK)
610 		return -EINVAL;
611 
612 	/*
613 	 * No kind of locking is needed in here, because ->can_attach() will
614 	 * check this value once in the beginning of the process, and then carry
615 	 * on with stale data. This means that changes to this value will only
616 	 * affect task migrations starting after the change.
617 	 */
618 	memcg->move_charge_at_immigrate = val;
619 	return 0;
620 }
621 #else
622 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
623 				 struct cftype *cft, u64 val)
624 {
625 	return -ENOSYS;
626 }
627 #endif
628 
629 #ifdef CONFIG_MMU
630 /* Handlers for move charge at task migration. */
631 static int mem_cgroup_do_precharge(unsigned long count)
632 {
633 	int ret;
634 
635 	/* Try a single bulk charge without reclaim first, kswapd may wake */
636 	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
637 	if (!ret) {
638 		mc.precharge += count;
639 		return ret;
640 	}
641 
642 	/* Try charges one by one with reclaim, but do not retry */
643 	while (count--) {
644 		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
645 		if (ret)
646 			return ret;
647 		mc.precharge++;
648 		cond_resched();
649 	}
650 	return 0;
651 }
652 
653 union mc_target {
654 	struct folio	*folio;
655 	swp_entry_t	ent;
656 };
657 
/*
 * Classification of a page table entry for charge moving.  MC_TARGET_NONE
 * has to stay 0: users test the result for truth.
 */
enum mc_target_type {
	MC_TARGET_NONE		= 0,
	MC_TARGET_PAGE		= 1,
	MC_TARGET_SWAP		= 2,
	MC_TARGET_DEVICE	= 3,
};
664 
665 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
666 						unsigned long addr, pte_t ptent)
667 {
668 	struct page *page = vm_normal_page(vma, addr, ptent);
669 
670 	if (!page)
671 		return NULL;
672 	if (PageAnon(page)) {
673 		if (!(mc.flags & MOVE_ANON))
674 			return NULL;
675 	} else {
676 		if (!(mc.flags & MOVE_FILE))
677 			return NULL;
678 	}
679 	get_page(page);
680 
681 	return page;
682 }
683 
684 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
685 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
686 			pte_t ptent, swp_entry_t *entry)
687 {
688 	struct page *page = NULL;
689 	swp_entry_t ent = pte_to_swp_entry(ptent);
690 
691 	if (!(mc.flags & MOVE_ANON))
692 		return NULL;
693 
694 	/*
695 	 * Handle device private pages that are not accessible by the CPU, but
696 	 * stored as special swap entries in the page table.
697 	 */
698 	if (is_device_private_entry(ent)) {
699 		page = pfn_swap_entry_to_page(ent);
700 		if (!get_page_unless_zero(page))
701 			return NULL;
702 		return page;
703 	}
704 
705 	if (non_swap_entry(ent))
706 		return NULL;
707 
708 	/*
709 	 * Because swap_cache_get_folio() updates some statistics counter,
710 	 * we call find_get_page() with swapper_space directly.
711 	 */
712 	page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
713 	entry->val = ent.val;
714 
715 	return page;
716 }
717 #else
718 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
719 			pte_t ptent, swp_entry_t *entry)
720 {
721 	return NULL;
722 }
723 #endif
724 
725 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
726 			unsigned long addr, pte_t ptent)
727 {
728 	unsigned long index;
729 	struct folio *folio;
730 
731 	if (!vma->vm_file) /* anonymous vma */
732 		return NULL;
733 	if (!(mc.flags & MOVE_FILE))
734 		return NULL;
735 
736 	/* folio is moved even if it's not RSS of this task(page-faulted). */
737 	/* shmem/tmpfs may report page out on swap: account for that too. */
738 	index = linear_page_index(vma, addr);
739 	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
740 	if (IS_ERR(folio))
741 		return NULL;
742 	return folio_file_page(folio, index);
743 }
744 
745 static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
746 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
747 
748 /**
749  * mem_cgroup_move_account - move account of the folio
750  * @folio: The folio.
751  * @compound: charge the page as compound or small page
752  * @from: mem_cgroup which the folio is moved from.
753  * @to:	mem_cgroup which the folio is moved to. @from != @to.
754  *
755  * The folio must be locked and not on the LRU.
756  *
757  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
758  * from old cgroup.
759  */
760 static int mem_cgroup_move_account(struct folio *folio,
761 				   bool compound,
762 				   struct mem_cgroup *from,
763 				   struct mem_cgroup *to)
764 {
765 	struct lruvec *from_vec, *to_vec;
766 	struct pglist_data *pgdat;
767 	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
768 	int nid, ret;
769 
770 	VM_BUG_ON(from == to);
771 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
772 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
773 	VM_BUG_ON(compound && !folio_test_large(folio));
774 
775 	ret = -EINVAL;
776 	if (folio_memcg(folio) != from)
777 		goto out;
778 
779 	pgdat = folio_pgdat(folio);
780 	from_vec = mem_cgroup_lruvec(from, pgdat);
781 	to_vec = mem_cgroup_lruvec(to, pgdat);
782 
783 	folio_memcg_lock(folio);
784 
785 	if (folio_test_anon(folio)) {
786 		if (folio_mapped(folio)) {
787 			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
788 			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
789 			if (folio_test_pmd_mappable(folio)) {
790 				__mod_lruvec_state(from_vec, NR_ANON_THPS,
791 						   -nr_pages);
792 				__mod_lruvec_state(to_vec, NR_ANON_THPS,
793 						   nr_pages);
794 			}
795 		}
796 	} else {
797 		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
798 		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
799 
800 		if (folio_test_swapbacked(folio)) {
801 			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
802 			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
803 		}
804 
805 		if (folio_mapped(folio)) {
806 			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
807 			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
808 		}
809 
810 		if (folio_test_dirty(folio)) {
811 			struct address_space *mapping = folio_mapping(folio);
812 
813 			if (mapping_can_writeback(mapping)) {
814 				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
815 						   -nr_pages);
816 				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
817 						   nr_pages);
818 			}
819 		}
820 	}
821 
822 #ifdef CONFIG_SWAP
823 	if (folio_test_swapcache(folio)) {
824 		__mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
825 		__mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
826 	}
827 #endif
828 	if (folio_test_writeback(folio)) {
829 		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
830 		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
831 	}
832 
833 	/*
834 	 * All state has been migrated, let's switch to the new memcg.
835 	 *
836 	 * It is safe to change page's memcg here because the page
837 	 * is referenced, charged, isolated, and locked: we can't race
838 	 * with (un)charging, migration, LRU putback, or anything else
839 	 * that would rely on a stable page's memory cgroup.
840 	 *
841 	 * Note that folio_memcg_lock is a memcg lock, not a page lock,
842 	 * to save space. As soon as we switch page's memory cgroup to a
843 	 * new memcg that isn't locked, the above state can change
844 	 * concurrently again. Make sure we're truly done with it.
845 	 */
846 	smp_mb();
847 
848 	css_get(&to->css);
849 	css_put(&from->css);
850 
851 	/* Warning should never happen, so don't worry about refcount non-0 */
852 	WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
853 	folio->memcg_data = (unsigned long)to;
854 
855 	__folio_memcg_unlock(from);
856 
857 	ret = 0;
858 	nid = folio_nid(folio);
859 
860 	local_irq_disable();
861 	memcg1_charge_statistics(to, nr_pages);
862 	memcg1_check_events(to, nid);
863 	memcg1_charge_statistics(from, -nr_pages);
864 	memcg1_check_events(from, nid);
865 	local_irq_enable();
866 out:
867 	return ret;
868 }
869 
870 /**
871  * get_mctgt_type - get target type of moving charge
872  * @vma: the vma the pte to be checked belongs
873  * @addr: the address corresponding to the pte to be checked
874  * @ptent: the pte to be checked
875  * @target: the pointer the target page or swap ent will be stored(can be NULL)
876  *
877  * Context: Called with pte lock held.
878  * Return:
879  * * MC_TARGET_NONE - If the pte is not a target for move charge.
880  * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
881  *   move charge. If @target is not NULL, the folio is stored in target->folio
882  *   with extra refcnt taken (Caller should release it).
883  * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
884  *   target for charge migration.  If @target is not NULL, the entry is
885  *   stored in target->ent.
886  * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
887  *   thus not on the lru.  For now such page is charged like a regular page
888  *   would be as it is just special memory taking the place of a regular page.
889  *   See Documentations/vm/hmm.txt and include/linux/hmm.h
890  */
891 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
892 		unsigned long addr, pte_t ptent, union mc_target *target)
893 {
894 	struct page *page = NULL;
895 	struct folio *folio;
896 	enum mc_target_type ret = MC_TARGET_NONE;
897 	swp_entry_t ent = { .val = 0 };
898 
899 	if (pte_present(ptent))
900 		page = mc_handle_present_pte(vma, addr, ptent);
901 	else if (pte_none_mostly(ptent))
902 		/*
903 		 * PTE markers should be treated as a none pte here, separated
904 		 * from other swap handling below.
905 		 */
906 		page = mc_handle_file_pte(vma, addr, ptent);
907 	else if (is_swap_pte(ptent))
908 		page = mc_handle_swap_pte(vma, ptent, &ent);
909 
910 	if (page)
911 		folio = page_folio(page);
912 	if (target && page) {
913 		if (!folio_trylock(folio)) {
914 			folio_put(folio);
915 			return ret;
916 		}
917 		/*
918 		 * page_mapped() must be stable during the move. This
919 		 * pte is locked, so if it's present, the page cannot
920 		 * become unmapped. If it isn't, we have only partial
921 		 * control over the mapped state: the page lock will
922 		 * prevent new faults against pagecache and swapcache,
923 		 * so an unmapped page cannot become mapped. However,
924 		 * if the page is already mapped elsewhere, it can
925 		 * unmap, and there is nothing we can do about it.
926 		 * Alas, skip moving the page in this case.
927 		 */
928 		if (!pte_present(ptent) && page_mapped(page)) {
929 			folio_unlock(folio);
930 			folio_put(folio);
931 			return ret;
932 		}
933 	}
934 
935 	if (!page && !ent.val)
936 		return ret;
937 	if (page) {
938 		/*
939 		 * Do only loose check w/o serialization.
940 		 * mem_cgroup_move_account() checks the page is valid or
941 		 * not under LRU exclusion.
942 		 */
943 		if (folio_memcg(folio) == mc.from) {
944 			ret = MC_TARGET_PAGE;
945 			if (folio_is_device_private(folio) ||
946 			    folio_is_device_coherent(folio))
947 				ret = MC_TARGET_DEVICE;
948 			if (target)
949 				target->folio = folio;
950 		}
951 		if (!ret || !target) {
952 			if (target)
953 				folio_unlock(folio);
954 			folio_put(folio);
955 		}
956 	}
957 	/*
958 	 * There is a swap entry and a page doesn't exist or isn't charged.
959 	 * But we cannot move a tail-page in a THP.
960 	 */
961 	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
962 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
963 		ret = MC_TARGET_SWAP;
964 		if (target)
965 			target->ent = ent;
966 	}
967 	return ret;
968 }
969 
970 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
971 /*
972  * We don't consider PMD mapped swapping or file mapped pages because THP does
973  * not support them for now.
974  * Caller should make sure that pmd_trans_huge(pmd) is true.
975  */
976 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
977 		unsigned long addr, pmd_t pmd, union mc_target *target)
978 {
979 	struct page *page = NULL;
980 	struct folio *folio;
981 	enum mc_target_type ret = MC_TARGET_NONE;
982 
983 	if (unlikely(is_swap_pmd(pmd))) {
984 		VM_BUG_ON(thp_migration_supported() &&
985 				  !is_pmd_migration_entry(pmd));
986 		return ret;
987 	}
988 	page = pmd_page(pmd);
989 	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
990 	folio = page_folio(page);
991 	if (!(mc.flags & MOVE_ANON))
992 		return ret;
993 	if (folio_memcg(folio) == mc.from) {
994 		ret = MC_TARGET_PAGE;
995 		if (target) {
996 			folio_get(folio);
997 			if (!folio_trylock(folio)) {
998 				folio_put(folio);
999 				return MC_TARGET_NONE;
1000 			}
1001 			target->folio = folio;
1002 		}
1003 	}
1004 	return ret;
1005 }
1006 #else
1007 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
1008 		unsigned long addr, pmd_t pmd, union mc_target *target)
1009 {
1010 	return MC_TARGET_NONE;
1011 }
1012 #endif
1013 
1014 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
1015 					unsigned long addr, unsigned long end,
1016 					struct mm_walk *walk)
1017 {
1018 	struct vm_area_struct *vma = walk->vma;
1019 	pte_t *pte;
1020 	spinlock_t *ptl;
1021 
1022 	ptl = pmd_trans_huge_lock(pmd, vma);
1023 	if (ptl) {
1024 		/*
1025 		 * Note their can not be MC_TARGET_DEVICE for now as we do not
1026 		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
1027 		 * this might change.
1028 		 */
1029 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
1030 			mc.precharge += HPAGE_PMD_NR;
1031 		spin_unlock(ptl);
1032 		return 0;
1033 	}
1034 
1035 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1036 	if (!pte)
1037 		return 0;
1038 	for (; addr != end; pte++, addr += PAGE_SIZE)
1039 		if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
1040 			mc.precharge++;	/* increment precharge temporarily */
1041 	pte_unmap_unlock(pte - 1, ptl);
1042 	cond_resched();
1043 
1044 	return 0;
1045 }
1046 
1047 static const struct mm_walk_ops precharge_walk_ops = {
1048 	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
1049 	.walk_lock	= PGWALK_RDLOCK,
1050 };
1051 
1052 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
1053 {
1054 	unsigned long precharge;
1055 
1056 	mmap_read_lock(mm);
1057 	walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
1058 	mmap_read_unlock(mm);
1059 
1060 	precharge = mc.precharge;
1061 	mc.precharge = 0;
1062 
1063 	return precharge;
1064 }
1065 
1066 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
1067 {
1068 	unsigned long precharge = mem_cgroup_count_precharge(mm);
1069 
1070 	VM_BUG_ON(mc.moving_task);
1071 	mc.moving_task = current;
1072 	return mem_cgroup_do_precharge(precharge);
1073 }
1074 
1075 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
1076 static void __mem_cgroup_clear_mc(void)
1077 {
1078 	struct mem_cgroup *from = mc.from;
1079 	struct mem_cgroup *to = mc.to;
1080 
1081 	/* we must uncharge all the leftover precharges from mc.to */
1082 	if (mc.precharge) {
1083 		mem_cgroup_cancel_charge(mc.to, mc.precharge);
1084 		mc.precharge = 0;
1085 	}
1086 	/*
1087 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
1088 	 * we must uncharge here.
1089 	 */
1090 	if (mc.moved_charge) {
1091 		mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
1092 		mc.moved_charge = 0;
1093 	}
1094 	/* we must fixup refcnts and charges */
1095 	if (mc.moved_swap) {
1096 		/* uncharge swap account from the old cgroup */
1097 		if (!mem_cgroup_is_root(mc.from))
1098 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
1099 
1100 		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
1101 
1102 		/*
1103 		 * we charged both to->memory and to->memsw, so we
1104 		 * should uncharge to->memory.
1105 		 */
1106 		if (!mem_cgroup_is_root(mc.to))
1107 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
1108 
1109 		mc.moved_swap = 0;
1110 	}
1111 	memcg1_oom_recover(from);
1112 	memcg1_oom_recover(to);
1113 	wake_up_all(&mc.waitq);
1114 }
1115 
1116 static void mem_cgroup_clear_mc(void)
1117 {
1118 	struct mm_struct *mm = mc.mm;
1119 
1120 	/*
1121 	 * we must clear moving_task before waking up waiters at the end of
1122 	 * task migration.
1123 	 */
1124 	mc.moving_task = NULL;
1125 	__mem_cgroup_clear_mc();
1126 	spin_lock(&mc.lock);
1127 	mc.from = NULL;
1128 	mc.to = NULL;
1129 	mc.mm = NULL;
1130 	spin_unlock(&mc.lock);
1131 
1132 	mmput(mm);
1133 }
1134 
/*
 * Prepare charge moving for the tasks in @tset.
 *
 * Returns 0 without setting up anything when the memory controller is on
 * the default hierarchy, when @tset holds no leader, when the destination
 * memcg has move_charge_at_immigrate unset, when the leader has no mm, or
 * when the leader is not the owner of its mm.  Otherwise the global "mc"
 * state is filled in and the result of mem_cgroup_precharge_mc() is
 * returned; if that fails, "mc" is cleared again before returning.
 */
int memcg1_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
	struct mem_cgroup *from;
	struct task_struct *leader, *p;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret = 0;

	/* charge immigration isn't supported on the default hierarchy */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
	p = NULL;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		WARN_ON_ONCE(p);
		p = leader;
		memcg = mem_cgroup_from_css(css);
	}
	if (!p)
		return 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	from = mem_cgroup_from_task(p);

	VM_BUG_ON(from == memcg);

	/* takes a reference on the mm; dropped below or in mem_cgroup_clear_mc() */
	mm = get_task_mm(p);
	if (!mm)
		return 0;
	/* We move charges only when we move a owner of the mm */
	if (mm->owner == p) {
		/* no other migration may have left state behind */
		VM_BUG_ON(mc.from);
		VM_BUG_ON(mc.to);
		VM_BUG_ON(mc.precharge);
		VM_BUG_ON(mc.moved_charge);
		VM_BUG_ON(mc.moved_swap);

		spin_lock(&mc.lock);
		mc.mm = mm;
		mc.from = from;
		mc.to = memcg;
		mc.flags = move_flags;
		spin_unlock(&mc.lock);
		/* We set mc.moving_task later */

		ret = mem_cgroup_precharge_mc(mm);
		if (ret)
			mem_cgroup_clear_mc();
	} else {
		mmput(mm);
	}
	return ret;
}
1204 
1205 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1206 {
1207 	if (mc.to)
1208 		mem_cgroup_clear_mc();
1209 }
1210 
/*
 * pmd_entry callback of charge_walk_ops: move the charges of the pages and
 * swap entries mapped in [@addr, @end) from mc.from to mc.to, consuming
 * mc.precharge as it goes.
 *
 * Returns 0, or the error from mem_cgroup_do_precharge() when the
 * precharges ran out in the middle of the pte range and one more page
 * could not be precharged.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct folio *folio;
	bool tried_split_before = false;

retry_pmd:
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* not enough precharge left for a whole huge page: skip it */
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			folio = target.folio;
			/*
			 * Deferred split queue locking depends on memcg,
			 * and unqueue is unsafe unless folio refcount is 0:
			 * split or skip if on the queue? first try to split.
			 */
			if (!list_empty(&folio->_deferred_list)) {
				spin_unlock(ptl);
				if (!tried_split_before)
					split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				/* second time around: give up on this pmd */
				if (tried_split_before)
					return 0;
				tried_split_before = true;
				goto retry_pmd;
			}
			/*
			 * So long as that pmd lock is held, the folio cannot
			 * be racily added to the _deferred_list, because
			 * __folio_remove_rmap() will find !partially_mapped.
			 */
			if (folio_isolate_lru(folio)) {
				if (!mem_cgroup_move_account(folio, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				folio_putback_lru(folio);
			}
			folio_unlock(folio);
			folio_put(folio);
		} else if (target_type == MC_TARGET_DEVICE) {
			/* same as above, minus the LRU isolation */
			folio = target.folio;
			if (!mem_cgroup_move_account(folio, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			folio_unlock(folio);
			folio_put(folio);
		}
		spin_unlock(ptl);
		return 0;
	}

retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte++);
		bool device = false;
		swp_entry_t ent;

		/* out of precharge: leave the loop with addr != end */
		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			fallthrough;
		case MC_TARGET_PAGE:
			folio = target.folio;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in original
			 * memcg. There should be somebody mapping the head.
			 */
			if (folio_test_large(folio))
				goto put;
			if (!device && !folio_isolate_lru(folio))
				goto put;
			if (!mem_cgroup_move_account(folio, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				folio_putback_lru(folio);
put:			/* get_mctgt_type() gets & locks the page */
			folio_unlock(folio);
			folio_put(folio);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fixup other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	/* pte was advanced past the last entry read, hence pte - 1 */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}
1349 
/*
 * Page walk operations used by mem_cgroup_move_charge(): every pmd is
 * handed to mem_cgroup_move_charge_pte_range().
 */
static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry	= mem_cgroup_move_charge_pte_range,
	.walk_lock	= PGWALK_RDLOCK,
};
1354 
1355 static void mem_cgroup_move_charge(void)
1356 {
1357 	lru_add_drain_all();
1358 	/*
1359 	 * Signal folio_memcg_lock() to take the memcg's move_lock
1360 	 * while we're moving its pages to another memcg. Then wait
1361 	 * for already started RCU-only updates to finish.
1362 	 */
1363 	atomic_inc(&mc.from->moving_account);
1364 	synchronize_rcu();
1365 retry:
1366 	if (unlikely(!mmap_read_trylock(mc.mm))) {
1367 		/*
1368 		 * Someone who are holding the mmap_lock might be waiting in
1369 		 * waitq. So we cancel all extra charges, wake up all waiters,
1370 		 * and retry. Because we cancel precharges, we might not be able
1371 		 * to move enough charges, but moving charge is a best-effort
1372 		 * feature anyway, so it wouldn't be a big problem.
1373 		 */
1374 		__mem_cgroup_clear_mc();
1375 		cond_resched();
1376 		goto retry;
1377 	}
1378 	/*
1379 	 * When we have consumed all precharges and failed in doing
1380 	 * additional charge, the page walk just aborts.
1381 	 */
1382 	walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
1383 	mmap_read_unlock(mc.mm);
1384 	atomic_dec(&mc.from->moving_account);
1385 }
1386 
1387 void memcg1_move_task(void)
1388 {
1389 	if (mc.to) {
1390 		mem_cgroup_move_charge();
1391 		mem_cgroup_clear_mc();
1392 	}
1393 }
1394 
1395 #else	/* !CONFIG_MMU */
/* !CONFIG_MMU: there is no charge moving, attaching always succeeds. */
int memcg1_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU: nothing was set up by memcg1_can_attach(), nothing to undo. */
void memcg1_cancel_attach(struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges are moved. */
void memcg1_move_task(void)
{
}
1406 #endif
1407 
1408 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
1409 {
1410 	struct mem_cgroup_threshold_ary *t;
1411 	unsigned long usage;
1412 	int i;
1413 
1414 	rcu_read_lock();
1415 	if (!swap)
1416 		t = rcu_dereference(memcg->thresholds.primary);
1417 	else
1418 		t = rcu_dereference(memcg->memsw_thresholds.primary);
1419 
1420 	if (!t)
1421 		goto unlock;
1422 
1423 	usage = mem_cgroup_usage(memcg, swap);
1424 
1425 	/*
1426 	 * current_threshold points to threshold just below or equal to usage.
1427 	 * If it's not true, a threshold was crossed after last
1428 	 * call of __mem_cgroup_threshold().
1429 	 */
1430 	i = t->current_threshold;
1431 
1432 	/*
1433 	 * Iterate backward over array of thresholds starting from
1434 	 * current_threshold and check if a threshold is crossed.
1435 	 * If none of thresholds below usage is crossed, we read
1436 	 * only one element of the array here.
1437 	 */
1438 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
1439 		eventfd_signal(t->entries[i].eventfd);
1440 
1441 	/* i = current_threshold + 1 */
1442 	i++;
1443 
1444 	/*
1445 	 * Iterate forward over array of thresholds starting from
1446 	 * current_threshold+1 and check if a threshold is crossed.
1447 	 * If none of thresholds above usage is crossed, we read
1448 	 * only one element of the array here.
1449 	 */
1450 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
1451 		eventfd_signal(t->entries[i].eventfd);
1452 
1453 	/* Update current_threshold */
1454 	t->current_threshold = i - 1;
1455 unlock:
1456 	rcu_read_unlock();
1457 }
1458 
1459 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
1460 {
1461 	while (memcg) {
1462 		__mem_cgroup_threshold(memcg, false);
1463 		if (do_memsw_account())
1464 			__mem_cgroup_threshold(memcg, true);
1465 
1466 		memcg = parent_mem_cgroup(memcg);
1467 	}
1468 }
1469 
/* Cgroup1: threshold notifications & softlimit tree updates */
struct memcg1_events_percpu {
	/* running count of page events, bumped on charge and uncharge */
	unsigned long nr_page_events;
	/*
	 * Per target: the nr_page_events value that must be exceeded before
	 * memcg1_event_ratelimit() fires for that target again.
	 */
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
1475 
1476 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
1477 {
1478 	/* pagein of a big page is an event. So, ignore page size */
1479 	if (nr_pages > 0)
1480 		__count_memcg_events(memcg, PGPGIN, 1);
1481 	else {
1482 		__count_memcg_events(memcg, PGPGOUT, 1);
1483 		nr_pages = -nr_pages; /* for event */
1484 	}
1485 
1486 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
1487 }
1488 
/*
 * Distance, in nr_page_events, by which memcg1_event_ratelimit() pushes the
 * threshold and soft limit targets ahead each time they fire.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
1491 
1492 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
1493 				enum mem_cgroup_events_target target)
1494 {
1495 	unsigned long val, next;
1496 
1497 	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
1498 	next = __this_cpu_read(memcg->events_percpu->targets[target]);
1499 	/* from time_after() in jiffies.h */
1500 	if ((long)(next - val) < 0) {
1501 		switch (target) {
1502 		case MEM_CGROUP_TARGET_THRESH:
1503 			next = val + THRESHOLDS_EVENTS_TARGET;
1504 			break;
1505 		case MEM_CGROUP_TARGET_SOFTLIMIT:
1506 			next = val + SOFTLIMIT_EVENTS_TARGET;
1507 			break;
1508 		default:
1509 			break;
1510 		}
1511 		__this_cpu_write(memcg->events_percpu->targets[target], next);
1512 		return true;
1513 	}
1514 	return false;
1515 }
1516 
1517 /*
1518  * Check events in order.
1519  *
1520  */
1521 static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
1522 {
1523 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
1524 		return;
1525 
1526 	/* threshold event is triggered in finer grain than soft limit */
1527 	if (unlikely(memcg1_event_ratelimit(memcg,
1528 						MEM_CGROUP_TARGET_THRESH))) {
1529 		bool do_softlimit;
1530 
1531 		do_softlimit = memcg1_event_ratelimit(memcg,
1532 						MEM_CGROUP_TARGET_SOFTLIMIT);
1533 		mem_cgroup_threshold(memcg);
1534 		if (unlikely(do_softlimit))
1535 			memcg1_update_tree(memcg, nid);
1536 	}
1537 }
1538 
/*
 * Account the charge of @folio to @memcg in the cgroup1 statistics and run
 * the event checks for the folio's node, all with local interrupts off.
 */
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(irq_flags);
}
1548 
/*
 * Account the swapout of @folio as an uncharge from @memcg in the cgroup1
 * statistics and run the event checks for the folio's node.
 */
void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
{
	/*
	 * The caller holds the i_pages lock, which is taken with
	 * interrupts off, so interrupts should be disabled here. That
	 * matters because it is the only synchronisation we have for
	 * updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();

	memcg1_check_events(memcg, folio_nid(folio));
}
1563 
1564 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
1565 			   unsigned long nr_memory, int nid)
1566 {
1567 	unsigned long flags;
1568 
1569 	local_irq_save(flags);
1570 	__count_memcg_events(memcg, PGPGOUT, pgpgout);
1571 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
1572 	memcg1_check_events(memcg, nid);
1573 	local_irq_restore(flags);
1574 }
1575 
1576 static int compare_thresholds(const void *a, const void *b)
1577 {
1578 	const struct mem_cgroup_threshold *_a = a;
1579 	const struct mem_cgroup_threshold *_b = b;
1580 
1581 	if (_a->threshold > _b->threshold)
1582 		return 1;
1583 
1584 	if (_a->threshold < _b->threshold)
1585 		return -1;
1586 
1587 	return 0;
1588 }
1589 
1590 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
1591 {
1592 	struct mem_cgroup_eventfd_list *ev;
1593 
1594 	spin_lock(&memcg_oom_lock);
1595 
1596 	list_for_each_entry(ev, &memcg->oom_notify, list)
1597 		eventfd_signal(ev->eventfd);
1598 
1599 	spin_unlock(&memcg_oom_lock);
1600 	return 0;
1601 }
1602 
1603 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
1604 {
1605 	struct mem_cgroup *iter;
1606 
1607 	for_each_mem_cgroup_tree(iter, memcg)
1608 		mem_cgroup_oom_notify_cb(iter);
1609 }
1610 
/*
 * Add a usage threshold that signals @eventfd to @memcg.
 *
 * @args is parsed by page_counter_memparse() into the threshold value and
 * @type selects the memory (_MEM) or memory+swap (_MEMSWAP) threshold set;
 * any other @type hits BUG().
 *
 * A new, sorted array holding the old thresholds plus the new one is
 * published through RCU; the previous primary array is kept as spare.
 *
 * Returns 0 on success, the error from page_counter_memparse(), or
 * -ENOMEM when the new array cannot be allocated.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
1690 
1691 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
1692 	struct eventfd_ctx *eventfd, const char *args)
1693 {
1694 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
1695 }
1696 
1697 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
1698 	struct eventfd_ctx *eventfd, const char *args)
1699 {
1700 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
1701 }
1702 
/*
 * Remove every usage threshold of @memcg that signals @eventfd.
 *
 * @type selects the memory (_MEM) or memory+swap (_MEMSWAP) threshold set;
 * any other @type hits BUG().  The surviving thresholds are copied into the
 * spare array, which is then published through RCU while the old primary
 * array becomes the new spare.  When no threshold is left, both arrays are
 * freed.  Nothing is changed if @eventfd has no threshold registered.
 */
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of threshold */
	/* size: entries that stay; entries: entries that belong to @eventfd */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}
1787 
1788 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1789 	struct eventfd_ctx *eventfd)
1790 {
1791 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
1792 }
1793 
1794 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1795 	struct eventfd_ctx *eventfd)
1796 {
1797 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
1798 }
1799 
1800 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
1801 	struct eventfd_ctx *eventfd, const char *args)
1802 {
1803 	struct mem_cgroup_eventfd_list *event;
1804 
1805 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
1806 	if (!event)
1807 		return -ENOMEM;
1808 
1809 	spin_lock(&memcg_oom_lock);
1810 
1811 	event->eventfd = eventfd;
1812 	list_add(&event->list, &memcg->oom_notify);
1813 
1814 	/* already in OOM ? */
1815 	if (memcg->under_oom)
1816 		eventfd_signal(eventfd);
1817 	spin_unlock(&memcg_oom_lock);
1818 
1819 	return 0;
1820 }
1821 
1822 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
1823 	struct eventfd_ctx *eventfd)
1824 {
1825 	struct mem_cgroup_eventfd_list *ev, *tmp;
1826 
1827 	spin_lock(&memcg_oom_lock);
1828 
1829 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
1830 		if (ev->eventfd == eventfd) {
1831 			list_del(&ev->list);
1832 			kfree(ev);
1833 		}
1834 	}
1835 
1836 	spin_unlock(&memcg_oom_lock);
1837 }
1838 
1839 /*
1840  * DO NOT USE IN NEW FILES.
1841  *
1842  * "cgroup.event_control" implementation.
1843  *
1844  * This is way over-engineered.  It tries to support fully configurable
1845  * events for each user.  Such level of flexibility is completely
1846  * unnecessary especially in the light of the planned unified hierarchy.
1847  *
1848  * Please deprecate this and replace with something simpler if at all
1849  * possible.
1850  */
1851 
1852 /*
1853  * Unregister event and free resources.
1854  *
1855  * Gets called from workqueue.
1856  */
1857 static void memcg_event_remove(struct work_struct *work)
1858 {
1859 	struct mem_cgroup_event *event =
1860 		container_of(work, struct mem_cgroup_event, remove);
1861 	struct mem_cgroup *memcg = event->memcg;
1862 
1863 	remove_wait_queue(event->wqh, &event->wait);
1864 
1865 	event->unregister_event(memcg, event->eventfd);
1866 
1867 	/* Notify userspace the event is going away. */
1868 	eventfd_signal(event->eventfd);
1869 
1870 	eventfd_ctx_put(event->eventfd);
1871 	kfree(event);
1872 	css_put(&memcg->css);
1873 }
1874 
1875 /*
1876  * Gets called on EPOLLHUP on eventfd when user closes it.
1877  *
1878  * Called with wqh->lock held and interrupts disabled.
1879  */
1880 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
1881 			    int sync, void *key)
1882 {
1883 	struct mem_cgroup_event *event =
1884 		container_of(wait, struct mem_cgroup_event, wait);
1885 	struct mem_cgroup *memcg = event->memcg;
1886 	__poll_t flags = key_to_poll(key);
1887 
1888 	if (flags & EPOLLHUP) {
1889 		/*
1890 		 * If the event has been detached at cgroup removal, we
1891 		 * can simply return knowing the other side will cleanup
1892 		 * for us.
1893 		 *
1894 		 * We can't race against event freeing since the other
1895 		 * side will require wqh->lock via remove_wait_queue(),
1896 		 * which we hold.
1897 		 */
1898 		spin_lock(&memcg->event_list_lock);
1899 		if (!list_empty(&event->list)) {
1900 			list_del_init(&event->list);
1901 			/*
1902 			 * We are in atomic context, but cgroup_event_remove()
1903 			 * may sleep, so we have to call it in workqueue.
1904 			 */
1905 			schedule_work(&event->remove);
1906 		}
1907 		spin_unlock(&memcg->event_list_lock);
1908 	}
1909 
1910 	return 0;
1911 }
1912 
1913 static void memcg_event_ptable_queue_proc(struct file *file,
1914 		wait_queue_head_t *wqh, poll_table *pt)
1915 {
1916 	struct mem_cgroup_event *event =
1917 		container_of(pt, struct mem_cgroup_event, pt);
1918 
1919 	event->wqh = wqh;
1920 	add_wait_queue(wqh, &event->wait);
1921 }
1922 
1923 /*
1924  * DO NOT USE IN NEW FILES.
1925  *
1926  * Parse input and register new cgroup event handler.
1927  *
1928  * Input must be in format '<event_fd> <control_fd> <args>'.
1929  * Interpretation of args is defined by control file implementation.
1930  */
1931 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
1932 					 char *buf, size_t nbytes, loff_t off)
1933 {
1934 	struct cgroup_subsys_state *css = of_css(of);
1935 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1936 	struct mem_cgroup_event *event;
1937 	struct cgroup_subsys_state *cfile_css;
1938 	unsigned int efd, cfd;
1939 	struct dentry *cdentry;
1940 	const char *name;
1941 	char *endp;
1942 	int ret;
1943 
1944 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
1945 		return -EOPNOTSUPP;
1946 
1947 	buf = strstrip(buf);
1948 
1949 	efd = simple_strtoul(buf, &endp, 10);
1950 	if (*endp != ' ')
1951 		return -EINVAL;
1952 	buf = endp + 1;
1953 
1954 	cfd = simple_strtoul(buf, &endp, 10);
1955 	if (*endp == '\0')
1956 		buf = endp;
1957 	else if (*endp == ' ')
1958 		buf = endp + 1;
1959 	else
1960 		return -EINVAL;
1961 
1962 	CLASS(fd, efile)(efd);
1963 	if (fd_empty(efile))
1964 		return -EBADF;
1965 
1966 	CLASS(fd, cfile)(cfd);
1967 
1968 	event = kzalloc(sizeof(*event), GFP_KERNEL);
1969 	if (!event)
1970 		return -ENOMEM;
1971 
1972 	event->memcg = memcg;
1973 	INIT_LIST_HEAD(&event->list);
1974 	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
1975 	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
1976 	INIT_WORK(&event->remove, memcg_event_remove);
1977 
1978 	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
1979 	if (IS_ERR(event->eventfd)) {
1980 		ret = PTR_ERR(event->eventfd);
1981 		goto out_kfree;
1982 	}
1983 
1984 	if (fd_empty(cfile)) {
1985 		ret = -EBADF;
1986 		goto out_put_eventfd;
1987 	}
1988 
1989 	/* the process need read permission on control file */
1990 	/* AV: shouldn't we check that it's been opened for read instead? */
1991 	ret = file_permission(fd_file(cfile), MAY_READ);
1992 	if (ret < 0)
1993 		goto out_put_eventfd;
1994 
1995 	/*
1996 	 * The control file must be a regular cgroup1 file. As a regular cgroup
1997 	 * file can't be renamed, it's safe to access its name afterwards.
1998 	 */
1999 	cdentry = fd_file(cfile)->f_path.dentry;
2000 	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
2001 		ret = -EINVAL;
2002 		goto out_put_eventfd;
2003 	}
2004 
2005 	/*
2006 	 * Determine the event callbacks and set them in @event.  This used
2007 	 * to be done via struct cftype but cgroup core no longer knows
2008 	 * about these events.  The following is crude but the whole thing
2009 	 * is for compatibility anyway.
2010 	 *
2011 	 * DO NOT ADD NEW FILES.
2012 	 */
2013 	name = cdentry->d_name.name;
2014 
2015 	if (!strcmp(name, "memory.usage_in_bytes")) {
2016 		event->register_event = mem_cgroup_usage_register_event;
2017 		event->unregister_event = mem_cgroup_usage_unregister_event;
2018 	} else if (!strcmp(name, "memory.oom_control")) {
2019 		pr_warn_once("oom_control is deprecated and will be removed. "
2020 			     "Please report your usecase to linux-mm-@kvack.org"
2021 			     " if you depend on this functionality. \n");
2022 		event->register_event = mem_cgroup_oom_register_event;
2023 		event->unregister_event = mem_cgroup_oom_unregister_event;
2024 	} else if (!strcmp(name, "memory.pressure_level")) {
2025 		pr_warn_once("pressure_level is deprecated and will be removed. "
2026 			     "Please report your usecase to linux-mm-@kvack.org "
2027 			     "if you depend on this functionality. \n");
2028 		event->register_event = vmpressure_register_event;
2029 		event->unregister_event = vmpressure_unregister_event;
2030 	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
2031 		event->register_event = memsw_cgroup_usage_register_event;
2032 		event->unregister_event = memsw_cgroup_usage_unregister_event;
2033 	} else {
2034 		ret = -EINVAL;
2035 		goto out_put_eventfd;
2036 	}
2037 
2038 	/*
2039 	 * Verify @cfile should belong to @css.  Also, remaining events are
2040 	 * automatically removed on cgroup destruction but the removal is
2041 	 * asynchronous, so take an extra ref on @css.
2042 	 */
2043 	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
2044 					       &memory_cgrp_subsys);
2045 	ret = -EINVAL;
2046 	if (IS_ERR(cfile_css))
2047 		goto out_put_eventfd;
2048 	if (cfile_css != css)
2049 		goto out_put_css;
2050 
2051 	ret = event->register_event(memcg, event->eventfd, buf);
2052 	if (ret)
2053 		goto out_put_css;
2054 
2055 	vfs_poll(fd_file(efile), &event->pt);
2056 
2057 	spin_lock_irq(&memcg->event_list_lock);
2058 	list_add(&event->list, &memcg->event_list);
2059 	spin_unlock_irq(&memcg->event_list_lock);
2060 	return nbytes;
2061 
2062 out_put_css:
2063 	css_put(cfile_css);
2064 out_put_eventfd:
2065 	eventfd_ctx_put(event->eventfd);
2066 out_kfree:
2067 	kfree(event);
2068 	return ret;
2069 }
2070 
2071 void memcg1_memcg_init(struct mem_cgroup *memcg)
2072 {
2073 	INIT_LIST_HEAD(&memcg->oom_notify);
2074 	mutex_init(&memcg->thresholds_lock);
2075 	spin_lock_init(&memcg->move_lock);
2076 	INIT_LIST_HEAD(&memcg->event_list);
2077 	spin_lock_init(&memcg->event_list_lock);
2078 }
2079 
2080 void memcg1_css_offline(struct mem_cgroup *memcg)
2081 {
2082 	struct mem_cgroup_event *event, *tmp;
2083 
2084 	/*
2085 	 * Unregister events and notify userspace.
2086 	 * Notify userspace about cgroup removing only after rmdir of cgroup
2087 	 * directory to avoid race between userspace and kernelspace.
2088 	 */
2089 	spin_lock_irq(&memcg->event_list_lock);
2090 	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
2091 		list_del_init(&event->list);
2092 		schedule_work(&event->remove);
2093 	}
2094 	spin_unlock_irq(&memcg->event_list_lock);
2095 }
2096 
/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 *
 * Otherwise oom_lock is set on every memcg in the subtree of @memcg and
 * true is returned.  The whole operation runs under memcg_oom_lock.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			/* stop at the memcg that was already locked by someone else */
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		/* lockdep annotation only; released in mem_cgroup_oom_unlock() */
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}
2139 
2140 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2141 {
2142 	struct mem_cgroup *iter;
2143 
2144 	spin_lock(&memcg_oom_lock);
2145 	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
2146 	for_each_mem_cgroup_tree(iter, memcg)
2147 		iter->oom_lock = false;
2148 	spin_unlock(&memcg_oom_lock);
2149 }
2150 
2151 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2152 {
2153 	struct mem_cgroup *iter;
2154 
2155 	spin_lock(&memcg_oom_lock);
2156 	for_each_mem_cgroup_tree(iter, memcg)
2157 		iter->under_oom++;
2158 	spin_unlock(&memcg_oom_lock);
2159 }
2160 
2161 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2162 {
2163 	struct mem_cgroup *iter;
2164 
2165 	/*
2166 	 * Be careful about under_oom underflows because a child memcg
2167 	 * could have been added after mem_cgroup_mark_under_oom.
2168 	 */
2169 	spin_lock(&memcg_oom_lock);
2170 	for_each_mem_cgroup_tree(iter, memcg)
2171 		if (iter->under_oom > 0)
2172 			iter->under_oom--;
2173 	spin_unlock(&memcg_oom_lock);
2174 }
2175 
2176 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2177 
2178 struct oom_wait_info {
2179 	struct mem_cgroup *memcg;
2180 	wait_queue_entry_t	wait;
2181 };
2182 
2183 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
2184 	unsigned mode, int sync, void *arg)
2185 {
2186 	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2187 	struct mem_cgroup *oom_wait_memcg;
2188 	struct oom_wait_info *oom_wait_info;
2189 
2190 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2191 	oom_wait_memcg = oom_wait_info->memcg;
2192 
2193 	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2194 	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2195 		return 0;
2196 	return autoremove_wake_function(wait, mode, sync, arg);
2197 }
2198 
2199 void memcg1_oom_recover(struct mem_cgroup *memcg)
2200 {
2201 	/*
2202 	 * For the following lockless ->under_oom test, the only required
2203 	 * guarantee is that it must see the state asserted by an OOM when
2204 	 * this function is called as a result of userland actions
2205 	 * triggered by the notification of the OOM.  This is trivially
2206 	 * achieved by invoking mem_cgroup_mark_under_oom() before
2207 	 * triggering notification.
2208 	 */
2209 	if (memcg && memcg->under_oom)
2210 		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2211 }
2212 
2213 /**
2214  * mem_cgroup_oom_synchronize - complete memcg OOM handling
2215  * @handle: actually kill/wait or just clean up the OOM state
2216  *
2217  * This has to be called at the end of a page fault if the memcg OOM
2218  * handler was enabled.
2219  *
2220  * Memcg supports userspace OOM handling where failed allocations must
2221  * sleep on a waitqueue until the userspace task resolves the
2222  * situation.  Sleeping directly in the charge context with all kinds
2223  * of locks held is not a good idea, instead we remember an OOM state
2224  * in the task and mem_cgroup_oom_synchronize() has to be called at
2225  * the end of the page fault to complete the OOM handling.
2226  *
2227  * Returns %true if an ongoing memcg OOM situation was detected and
2228  * completed, %false otherwise.
2229  */
2230 bool mem_cgroup_oom_synchronize(bool handle)
2231 {
2232 	struct mem_cgroup *memcg = current->memcg_in_oom;
2233 	struct oom_wait_info owait;
2234 	bool locked;
2235 
2236 	/* OOM is global, do not handle */
2237 	if (!memcg)
2238 		return false;
2239 
2240 	if (!handle)
2241 		goto cleanup;
2242 
2243 	owait.memcg = memcg;
2244 	owait.wait.flags = 0;
2245 	owait.wait.func = memcg_oom_wake_function;
2246 	owait.wait.private = current;
2247 	INIT_LIST_HEAD(&owait.wait.entry);
2248 
2249 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2250 	mem_cgroup_mark_under_oom(memcg);
2251 
2252 	locked = mem_cgroup_oom_trylock(memcg);
2253 
2254 	if (locked)
2255 		mem_cgroup_oom_notify(memcg);
2256 
2257 	schedule();
2258 	mem_cgroup_unmark_under_oom(memcg);
2259 	finish_wait(&memcg_oom_waitq, &owait.wait);
2260 
2261 	if (locked)
2262 		mem_cgroup_oom_unlock(memcg);
2263 cleanup:
2264 	current->memcg_in_oom = NULL;
2265 	css_put(&memcg->css);
2266 	return true;
2267 }
2268 
2269 
2270 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
2271 {
2272 	/*
2273 	 * We are in the middle of the charge context here, so we
2274 	 * don't want to block when potentially sitting on a callstack
2275 	 * that holds all kinds of filesystem and mm locks.
2276 	 *
2277 	 * cgroup1 allows disabling the OOM killer and waiting for outside
2278 	 * handling until the charge can succeed; remember the context and put
2279 	 * the task to sleep at the end of the page fault when all locks are
2280 	 * released.
2281 	 *
2282 	 * On the other hand, in-kernel OOM killer allows for an async victim
2283 	 * memory reclaim (oom_reaper) and that means that we are not solely
2284 	 * relying on the oom victim to make a forward progress and we can
2285 	 * invoke the oom killer here.
2286 	 *
2287 	 * Please note that mem_cgroup_out_of_memory might fail to find a
2288 	 * victim and then we have to bail out from the charge path.
2289 	 */
2290 	if (READ_ONCE(memcg->oom_kill_disable)) {
2291 		if (current->in_user_fault) {
2292 			css_get(&memcg->css);
2293 			current->memcg_in_oom = memcg;
2294 		}
2295 		return false;
2296 	}
2297 
2298 	mem_cgroup_mark_under_oom(memcg);
2299 
2300 	*locked = mem_cgroup_oom_trylock(memcg);
2301 
2302 	if (*locked)
2303 		mem_cgroup_oom_notify(memcg);
2304 
2305 	mem_cgroup_unmark_under_oom(memcg);
2306 
2307 	return true;
2308 }
2309 
2310 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
2311 {
2312 	if (locked)
2313 		mem_cgroup_oom_unlock(memcg);
2314 }
2315 
2316 static DEFINE_MUTEX(memcg_max_mutex);
2317 
2318 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2319 				 unsigned long max, bool memsw)
2320 {
2321 	bool enlarge = false;
2322 	bool drained = false;
2323 	int ret;
2324 	bool limits_invariant;
2325 	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2326 
2327 	do {
2328 		if (signal_pending(current)) {
2329 			ret = -EINTR;
2330 			break;
2331 		}
2332 
2333 		mutex_lock(&memcg_max_mutex);
2334 		/*
2335 		 * Make sure that the new limit (memsw or memory limit) doesn't
2336 		 * break our basic invariant rule memory.max <= memsw.max.
2337 		 */
2338 		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
2339 					   max <= memcg->memsw.max;
2340 		if (!limits_invariant) {
2341 			mutex_unlock(&memcg_max_mutex);
2342 			ret = -EINVAL;
2343 			break;
2344 		}
2345 		if (max > counter->max)
2346 			enlarge = true;
2347 		ret = page_counter_set_max(counter, max);
2348 		mutex_unlock(&memcg_max_mutex);
2349 
2350 		if (!ret)
2351 			break;
2352 
2353 		if (!drained) {
2354 			drain_all_stock(memcg);
2355 			drained = true;
2356 			continue;
2357 		}
2358 
2359 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2360 				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
2361 			ret = -EBUSY;
2362 			break;
2363 		}
2364 	} while (true);
2365 
2366 	if (!ret && enlarge)
2367 		memcg1_oom_recover(memcg);
2368 
2369 	return ret;
2370 }
2371 
2372 /*
2373  * Reclaims as many pages from the given memcg as possible.
2374  *
2375  * Caller is responsible for holding css reference for memcg.
2376  */
2377 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2378 {
2379 	int nr_retries = MAX_RECLAIM_RETRIES;
2380 
2381 	/* we call try-to-free pages for make this cgroup empty */
2382 	lru_add_drain_all();
2383 
2384 	drain_all_stock(memcg);
2385 
2386 	/* try to free all pages in this cgroup */
2387 	while (nr_retries && page_counter_read(&memcg->memory)) {
2388 		if (signal_pending(current))
2389 			return -EINTR;
2390 
2391 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2392 						  MEMCG_RECLAIM_MAY_SWAP, NULL))
2393 			nr_retries--;
2394 	}
2395 
2396 	return 0;
2397 }
2398 
2399 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2400 					    char *buf, size_t nbytes,
2401 					    loff_t off)
2402 {
2403 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2404 
2405 	if (mem_cgroup_is_root(memcg))
2406 		return -EINVAL;
2407 	return mem_cgroup_force_empty(memcg) ?: nbytes;
2408 }
2409 
2410 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2411 				     struct cftype *cft)
2412 {
2413 	return 1;
2414 }
2415 
2416 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2417 				      struct cftype *cft, u64 val)
2418 {
2419 	if (val == 1)
2420 		return 0;
2421 
2422 	pr_warn_once("Non-hierarchical mode is deprecated. "
2423 		     "Please report your usecase to linux-mm@kvack.org if you "
2424 		     "depend on this functionality.\n");
2425 
2426 	return -EINVAL;
2427 }
2428 
2429 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2430 			       struct cftype *cft)
2431 {
2432 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2433 	struct page_counter *counter;
2434 
2435 	switch (MEMFILE_TYPE(cft->private)) {
2436 	case _MEM:
2437 		counter = &memcg->memory;
2438 		break;
2439 	case _MEMSWAP:
2440 		counter = &memcg->memsw;
2441 		break;
2442 	case _KMEM:
2443 		counter = &memcg->kmem;
2444 		break;
2445 	case _TCP:
2446 		counter = &memcg->tcpmem;
2447 		break;
2448 	default:
2449 		BUG();
2450 	}
2451 
2452 	switch (MEMFILE_ATTR(cft->private)) {
2453 	case RES_USAGE:
2454 		if (counter == &memcg->memory)
2455 			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2456 		if (counter == &memcg->memsw)
2457 			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2458 		return (u64)page_counter_read(counter) * PAGE_SIZE;
2459 	case RES_LIMIT:
2460 		return (u64)counter->max * PAGE_SIZE;
2461 	case RES_MAX_USAGE:
2462 		return (u64)counter->watermark * PAGE_SIZE;
2463 	case RES_FAILCNT:
2464 		return counter->failcnt;
2465 	case RES_SOFT_LIMIT:
2466 		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
2467 	default:
2468 		BUG();
2469 	}
2470 }
2471 
2472 /*
2473  * This function doesn't do anything useful. Its only job is to provide a read
2474  * handler for a file so that cgroup_file_mode() will add read permissions.
2475  */
2476 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
2477 				     __always_unused void *v)
2478 {
2479 	return -EINVAL;
2480 }
2481 
2482 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
2483 {
2484 	int ret;
2485 
2486 	mutex_lock(&memcg_max_mutex);
2487 
2488 	ret = page_counter_set_max(&memcg->tcpmem, max);
2489 	if (ret)
2490 		goto out;
2491 
2492 	if (!memcg->tcpmem_active) {
2493 		/*
2494 		 * The active flag needs to be written after the static_key
2495 		 * update. This is what guarantees that the socket activation
2496 		 * function is the last one to run. See mem_cgroup_sk_alloc()
2497 		 * for details, and note that we don't mark any socket as
2498 		 * belonging to this memcg until that flag is up.
2499 		 *
2500 		 * We need to do this, because static_keys will span multiple
2501 		 * sites, but we can't control their order. If we mark a socket
2502 		 * as accounted, but the accounting functions are not patched in
2503 		 * yet, we'll lose accounting.
2504 		 *
2505 		 * We never race with the readers in mem_cgroup_sk_alloc(),
2506 		 * because when this value change, the code to process it is not
2507 		 * patched in yet.
2508 		 */
2509 		static_branch_inc(&memcg_sockets_enabled_key);
2510 		memcg->tcpmem_active = true;
2511 	}
2512 out:
2513 	mutex_unlock(&memcg_max_mutex);
2514 	return ret;
2515 }
2516 
2517 /*
2518  * The user of this function is...
2519  * RES_LIMIT.
2520  */
2521 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2522 				char *buf, size_t nbytes, loff_t off)
2523 {
2524 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2525 	unsigned long nr_pages;
2526 	int ret;
2527 
2528 	buf = strstrip(buf);
2529 	ret = page_counter_memparse(buf, "-1", &nr_pages);
2530 	if (ret)
2531 		return ret;
2532 
2533 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
2534 	case RES_LIMIT:
2535 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2536 			ret = -EINVAL;
2537 			break;
2538 		}
2539 		switch (MEMFILE_TYPE(of_cft(of)->private)) {
2540 		case _MEM:
2541 			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
2542 			break;
2543 		case _MEMSWAP:
2544 			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
2545 			break;
2546 		case _KMEM:
2547 			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
2548 				     "Writing any value to this file has no effect. "
2549 				     "Please report your usecase to linux-mm@kvack.org if you "
2550 				     "depend on this functionality.\n");
2551 			ret = 0;
2552 			break;
2553 		case _TCP:
2554 			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
2555 				     "Please report your usecase to linux-mm@kvack.org if you "
2556 				     "depend on this functionality.\n");
2557 			ret = memcg_update_tcp_max(memcg, nr_pages);
2558 			break;
2559 		}
2560 		break;
2561 	case RES_SOFT_LIMIT:
2562 		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
2563 			ret = -EOPNOTSUPP;
2564 		} else {
2565 			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
2566 				     "Please report your usecase to linux-mm@kvack.org if you "
2567 				     "depend on this functionality.\n");
2568 			WRITE_ONCE(memcg->soft_limit, nr_pages);
2569 			ret = 0;
2570 		}
2571 		break;
2572 	}
2573 	return ret ?: nbytes;
2574 }
2575 
2576 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
2577 				size_t nbytes, loff_t off)
2578 {
2579 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2580 	struct page_counter *counter;
2581 
2582 	switch (MEMFILE_TYPE(of_cft(of)->private)) {
2583 	case _MEM:
2584 		counter = &memcg->memory;
2585 		break;
2586 	case _MEMSWAP:
2587 		counter = &memcg->memsw;
2588 		break;
2589 	case _KMEM:
2590 		counter = &memcg->kmem;
2591 		break;
2592 	case _TCP:
2593 		counter = &memcg->tcpmem;
2594 		break;
2595 	default:
2596 		BUG();
2597 	}
2598 
2599 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
2600 	case RES_MAX_USAGE:
2601 		page_counter_reset_watermark(counter);
2602 		break;
2603 	case RES_FAILCNT:
2604 		counter->failcnt = 0;
2605 		break;
2606 	default:
2607 		BUG();
2608 	}
2609 
2610 	return nbytes;
2611 }
2612 
2613 #ifdef CONFIG_NUMA
2614 
2615 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
2616 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
2617 #define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
2618 
2619 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
2620 				int nid, unsigned int lru_mask, bool tree)
2621 {
2622 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
2623 	unsigned long nr = 0;
2624 	enum lru_list lru;
2625 
2626 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
2627 
2628 	for_each_lru(lru) {
2629 		if (!(BIT(lru) & lru_mask))
2630 			continue;
2631 		if (tree)
2632 			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
2633 		else
2634 			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
2635 	}
2636 	return nr;
2637 }
2638 
2639 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
2640 					     unsigned int lru_mask,
2641 					     bool tree)
2642 {
2643 	unsigned long nr = 0;
2644 	enum lru_list lru;
2645 
2646 	for_each_lru(lru) {
2647 		if (!(BIT(lru) & lru_mask))
2648 			continue;
2649 		if (tree)
2650 			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
2651 		else
2652 			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
2653 	}
2654 	return nr;
2655 }
2656 
2657 static int memcg_numa_stat_show(struct seq_file *m, void *v)
2658 {
2659 	struct numa_stat {
2660 		const char *name;
2661 		unsigned int lru_mask;
2662 	};
2663 
2664 	static const struct numa_stat stats[] = {
2665 		{ "total", LRU_ALL },
2666 		{ "file", LRU_ALL_FILE },
2667 		{ "anon", LRU_ALL_ANON },
2668 		{ "unevictable", BIT(LRU_UNEVICTABLE) },
2669 	};
2670 	const struct numa_stat *stat;
2671 	int nid;
2672 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
2673 
2674 	mem_cgroup_flush_stats(memcg);
2675 
2676 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2677 		seq_printf(m, "%s=%lu", stat->name,
2678 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2679 						   false));
2680 		for_each_node_state(nid, N_MEMORY)
2681 			seq_printf(m, " N%d=%lu", nid,
2682 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
2683 							stat->lru_mask, false));
2684 		seq_putc(m, '\n');
2685 	}
2686 
2687 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2688 
2689 		seq_printf(m, "hierarchical_%s=%lu", stat->name,
2690 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2691 						   true));
2692 		for_each_node_state(nid, N_MEMORY)
2693 			seq_printf(m, " N%d=%lu", nid,
2694 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
2695 							stat->lru_mask, true));
2696 		seq_putc(m, '\n');
2697 	}
2698 
2699 	return 0;
2700 }
2701 #endif /* CONFIG_NUMA */
2702 
2703 static const unsigned int memcg1_stats[] = {
2704 	NR_FILE_PAGES,
2705 	NR_ANON_MAPPED,
2706 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2707 	NR_ANON_THPS,
2708 #endif
2709 	NR_SHMEM,
2710 	NR_FILE_MAPPED,
2711 	NR_FILE_DIRTY,
2712 	NR_WRITEBACK,
2713 	WORKINGSET_REFAULT_ANON,
2714 	WORKINGSET_REFAULT_FILE,
2715 #ifdef CONFIG_SWAP
2716 	MEMCG_SWAP,
2717 	NR_SWAPCACHE,
2718 #endif
2719 };
2720 
2721 static const char *const memcg1_stat_names[] = {
2722 	"cache",
2723 	"rss",
2724 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2725 	"rss_huge",
2726 #endif
2727 	"shmem",
2728 	"mapped_file",
2729 	"dirty",
2730 	"writeback",
2731 	"workingset_refault_anon",
2732 	"workingset_refault_file",
2733 #ifdef CONFIG_SWAP
2734 	"swap",
2735 	"swapcached",
2736 #endif
2737 };
2738 
2739 /* Universal VM events cgroup1 shows, original sort order */
2740 static const unsigned int memcg1_events[] = {
2741 	PGPGIN,
2742 	PGPGOUT,
2743 	PGFAULT,
2744 	PGMAJFAULT,
2745 };
2746 
2747 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
2748 {
2749 	unsigned long memory, memsw;
2750 	struct mem_cgroup *mi;
2751 	unsigned int i;
2752 
2753 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
2754 
2755 	mem_cgroup_flush_stats(memcg);
2756 
2757 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2758 		unsigned long nr;
2759 
2760 		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
2761 		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
2762 	}
2763 
2764 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2765 		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
2766 			       memcg_events_local(memcg, memcg1_events[i]));
2767 
2768 	for (i = 0; i < NR_LRU_LISTS; i++)
2769 		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
2770 			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
2771 			       PAGE_SIZE);
2772 
2773 	/* Hierarchical information */
2774 	memory = memsw = PAGE_COUNTER_MAX;
2775 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
2776 		memory = min(memory, READ_ONCE(mi->memory.max));
2777 		memsw = min(memsw, READ_ONCE(mi->memsw.max));
2778 	}
2779 	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
2780 		       (u64)memory * PAGE_SIZE);
2781 	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
2782 		       (u64)memsw * PAGE_SIZE);
2783 
2784 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2785 		unsigned long nr;
2786 
2787 		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
2788 		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
2789 			       (u64)nr);
2790 	}
2791 
2792 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2793 		seq_buf_printf(s, "total_%s %llu\n",
2794 			       vm_event_name(memcg1_events[i]),
2795 			       (u64)memcg_events(memcg, memcg1_events[i]));
2796 
2797 	for (i = 0; i < NR_LRU_LISTS; i++)
2798 		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
2799 			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
2800 			       PAGE_SIZE);
2801 
2802 #ifdef CONFIG_DEBUG_VM
2803 	{
2804 		pg_data_t *pgdat;
2805 		struct mem_cgroup_per_node *mz;
2806 		unsigned long anon_cost = 0;
2807 		unsigned long file_cost = 0;
2808 
2809 		for_each_online_pgdat(pgdat) {
2810 			mz = memcg->nodeinfo[pgdat->node_id];
2811 
2812 			anon_cost += mz->lruvec.anon_cost;
2813 			file_cost += mz->lruvec.file_cost;
2814 		}
2815 		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
2816 		seq_buf_printf(s, "file_cost %lu\n", file_cost);
2817 	}
2818 #endif
2819 }
2820 
2821 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
2822 				      struct cftype *cft)
2823 {
2824 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2825 
2826 	return mem_cgroup_swappiness(memcg);
2827 }
2828 
2829 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
2830 				       struct cftype *cft, u64 val)
2831 {
2832 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2833 
2834 	if (val > MAX_SWAPPINESS)
2835 		return -EINVAL;
2836 
2837 	if (!mem_cgroup_is_root(memcg))
2838 		WRITE_ONCE(memcg->swappiness, val);
2839 	else
2840 		WRITE_ONCE(vm_swappiness, val);
2841 
2842 	return 0;
2843 }
2844 
2845 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
2846 {
2847 	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
2848 
2849 	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
2850 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
2851 	seq_printf(sf, "oom_kill %lu\n",
2852 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
2853 	return 0;
2854 }
2855 
2856 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
2857 	struct cftype *cft, u64 val)
2858 {
2859 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2860 
2861 	pr_warn_once("oom_control is deprecated and will be removed. "
2862 		     "Please report your usecase to linux-mm-@kvack.org if you "
2863 		     "depend on this functionality. \n");
2864 
2865 	/* cannot set to root cgroup and only 0 and 1 are allowed */
2866 	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
2867 		return -EINVAL;
2868 
2869 	WRITE_ONCE(memcg->oom_kill_disable, val);
2870 	if (!val)
2871 		memcg1_oom_recover(memcg);
2872 
2873 	return 0;
2874 }
2875 
#ifdef CONFIG_SLUB_DEBUG
/*
 * seq_file show handler of the "kmem.slabinfo" file.
 *
 * Deprecated: the file is kept but prints nothing.  Please take a look
 * at tools/cgroup/memcg_slabinfo.py instead.
 */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	return 0;
}
#endif
2886 
2887 struct cftype mem_cgroup_legacy_files[] = {
2888 	{
2889 		.name = "usage_in_bytes",
2890 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2891 		.read_u64 = mem_cgroup_read_u64,
2892 	},
2893 	{
2894 		.name = "max_usage_in_bytes",
2895 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2896 		.write = mem_cgroup_reset,
2897 		.read_u64 = mem_cgroup_read_u64,
2898 	},
2899 	{
2900 		.name = "limit_in_bytes",
2901 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2902 		.write = mem_cgroup_write,
2903 		.read_u64 = mem_cgroup_read_u64,
2904 	},
2905 	{
2906 		.name = "soft_limit_in_bytes",
2907 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2908 		.write = mem_cgroup_write,
2909 		.read_u64 = mem_cgroup_read_u64,
2910 	},
2911 	{
2912 		.name = "failcnt",
2913 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2914 		.write = mem_cgroup_reset,
2915 		.read_u64 = mem_cgroup_read_u64,
2916 	},
2917 	{
2918 		.name = "stat",
2919 		.seq_show = memory_stat_show,
2920 	},
2921 	{
2922 		.name = "force_empty",
2923 		.write = mem_cgroup_force_empty_write,
2924 	},
2925 	{
2926 		.name = "use_hierarchy",
2927 		.write_u64 = mem_cgroup_hierarchy_write,
2928 		.read_u64 = mem_cgroup_hierarchy_read,
2929 	},
2930 	{
2931 		.name = "cgroup.event_control",		/* XXX: for compat */
2932 		.write = memcg_write_event_control,
2933 		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
2934 	},
2935 	{
2936 		.name = "swappiness",
2937 		.read_u64 = mem_cgroup_swappiness_read,
2938 		.write_u64 = mem_cgroup_swappiness_write,
2939 	},
2940 	{
2941 		.name = "move_charge_at_immigrate",
2942 		.read_u64 = mem_cgroup_move_charge_read,
2943 		.write_u64 = mem_cgroup_move_charge_write,
2944 	},
2945 	{
2946 		.name = "oom_control",
2947 		.seq_show = mem_cgroup_oom_control_read,
2948 		.write_u64 = mem_cgroup_oom_control_write,
2949 	},
2950 	{
2951 		.name = "pressure_level",
2952 		.seq_show = mem_cgroup_dummy_seq_show,
2953 	},
2954 #ifdef CONFIG_NUMA
2955 	{
2956 		.name = "numa_stat",
2957 		.seq_show = memcg_numa_stat_show,
2958 	},
2959 #endif
2960 	{
2961 		.name = "kmem.limit_in_bytes",
2962 		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
2963 		.write = mem_cgroup_write,
2964 		.read_u64 = mem_cgroup_read_u64,
2965 	},
2966 	{
2967 		.name = "kmem.usage_in_bytes",
2968 		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
2969 		.read_u64 = mem_cgroup_read_u64,
2970 	},
2971 	{
2972 		.name = "kmem.failcnt",
2973 		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
2974 		.write = mem_cgroup_reset,
2975 		.read_u64 = mem_cgroup_read_u64,
2976 	},
2977 	{
2978 		.name = "kmem.max_usage_in_bytes",
2979 		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2980 		.write = mem_cgroup_reset,
2981 		.read_u64 = mem_cgroup_read_u64,
2982 	},
2983 #ifdef CONFIG_SLUB_DEBUG
2984 	{
2985 		.name = "kmem.slabinfo",
2986 		.seq_show = mem_cgroup_slab_show,
2987 	},
2988 #endif
2989 	{
2990 		.name = "kmem.tcp.limit_in_bytes",
2991 		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
2992 		.write = mem_cgroup_write,
2993 		.read_u64 = mem_cgroup_read_u64,
2994 	},
2995 	{
2996 		.name = "kmem.tcp.usage_in_bytes",
2997 		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
2998 		.read_u64 = mem_cgroup_read_u64,
2999 	},
3000 	{
3001 		.name = "kmem.tcp.failcnt",
3002 		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
3003 		.write = mem_cgroup_reset,
3004 		.read_u64 = mem_cgroup_read_u64,
3005 	},
3006 	{
3007 		.name = "kmem.tcp.max_usage_in_bytes",
3008 		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
3009 		.write = mem_cgroup_reset,
3010 		.read_u64 = mem_cgroup_read_u64,
3011 	},
3012 	{ },	/* terminate */
3013 };
3014 
3015 struct cftype memsw_files[] = {
3016 	{
3017 		.name = "memsw.usage_in_bytes",
3018 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3019 		.read_u64 = mem_cgroup_read_u64,
3020 	},
3021 	{
3022 		.name = "memsw.max_usage_in_bytes",
3023 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3024 		.write = mem_cgroup_reset,
3025 		.read_u64 = mem_cgroup_read_u64,
3026 	},
3027 	{
3028 		.name = "memsw.limit_in_bytes",
3029 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3030 		.write = mem_cgroup_write,
3031 		.read_u64 = mem_cgroup_read_u64,
3032 	},
3033 	{
3034 		.name = "memsw.failcnt",
3035 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3036 		.write = mem_cgroup_reset,
3037 		.read_u64 = mem_cgroup_read_u64,
3038 	},
3039 	{ },	/* terminate */
3040 };
3041 
3042 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3043 {
3044 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3045 		if (nr_pages > 0)
3046 			page_counter_charge(&memcg->kmem, nr_pages);
3047 		else
3048 			page_counter_uncharge(&memcg->kmem, -nr_pages);
3049 	}
3050 }
3051 
3052 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
3053 			 gfp_t gfp_mask)
3054 {
3055 	struct page_counter *fail;
3056 
3057 	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
3058 		memcg->tcpmem_pressure = 0;
3059 		return true;
3060 	}
3061 	memcg->tcpmem_pressure = 1;
3062 	if (gfp_mask & __GFP_NOFAIL) {
3063 		page_counter_charge(&memcg->tcpmem, nr_pages);
3064 		return true;
3065 	}
3066 	return false;
3067 }
3068 
3069 bool memcg1_alloc_events(struct mem_cgroup *memcg)
3070 {
3071 	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
3072 						GFP_KERNEL_ACCOUNT);
3073 	return !!memcg->events_percpu;
3074 }
3075 
3076 void memcg1_free_events(struct mem_cgroup *memcg)
3077 {
3078 	if (memcg->events_percpu)
3079 		free_percpu(memcg->events_percpu);
3080 }
3081 
3082 static int __init memcg1_init(void)
3083 {
3084 	int node;
3085 
3086 	for_each_node(node) {
3087 		struct mem_cgroup_tree_per_node *rtpn;
3088 
3089 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
3090 
3091 		rtpn->rb_root = RB_ROOT;
3092 		rtpn->rb_rightmost = NULL;
3093 		spin_lock_init(&rtpn->lock);
3094 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
3095 	}
3096 
3097 	return 0;
3098 }
3099 subsys_initcall(memcg1_init);
3100