xref: /linux/mm/memcontrol-v1.c (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/memcontrol.h>
4 #include <linux/swap.h>
5 #include <linux/mm_inline.h>
6 #include <linux/pagewalk.h>
7 #include <linux/backing-dev.h>
8 #include <linux/swap_cgroup.h>
9 #include <linux/eventfd.h>
10 #include <linux/poll.h>
11 #include <linux/sort.h>
12 #include <linux/file.h>
13 #include <linux/seq_buf.h>
14 
15 #include "internal.h"
16 #include "swap.h"
17 #include "memcontrol-v1.h"
18 
19 /*
20  * Cgroups above their limits are maintained in a RB-Tree, independent of
21  * their hierarchy representation
22  */
23 
24 struct mem_cgroup_tree_per_node {
25 	struct rb_root rb_root;
26 	struct rb_node *rb_rightmost;
27 	spinlock_t lock;
28 };
29 
30 struct mem_cgroup_tree {
31 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
32 };
33 
34 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
35 
36 /*
37  * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
38  * limit reclaim to prevent infinite loops, if they ever occur.
39  */
40 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
41 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
42 
43 /* Stuff for moving charges at task migration. */
44 /*
45  * Types of charges to be moved.
46  */
47 #define MOVE_ANON	0x1ULL
48 #define MOVE_FILE	0x2ULL
49 #define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
50 
51 /* "mc" and its members are protected by cgroup_mutex */
52 static struct move_charge_struct {
53 	spinlock_t	  lock; /* for from, to */
54 	struct mm_struct  *mm;
55 	struct mem_cgroup *from;
56 	struct mem_cgroup *to;
57 	unsigned long flags;
58 	unsigned long precharge;
59 	unsigned long moved_charge;
60 	unsigned long moved_swap;
61 	struct task_struct *moving_task;	/* a task moving charges */
62 	wait_queue_head_t waitq;		/* a waitq for other context */
63 } mc = {
64 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
65 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
66 };
67 
68 /* for OOM */
69 struct mem_cgroup_eventfd_list {
70 	struct list_head list;
71 	struct eventfd_ctx *eventfd;
72 };
73 
74 /*
75  * cgroup_event represents the events which userspace wants to receive.
76  */
77 struct mem_cgroup_event {
78 	/*
79 	 * memcg which the event belongs to.
80 	 */
81 	struct mem_cgroup *memcg;
82 	/*
83 	 * eventfd to signal userspace about the event.
84 	 */
85 	struct eventfd_ctx *eventfd;
86 	/*
87 	 * Each of these is stored in a list by the cgroup.
88 	 */
89 	struct list_head list;
90 	/*
91 	 * register_event() callback will be used to add a new userspace
92 	 * waiter for changes related to this event.  Use eventfd_signal()
93 	 * on eventfd to send a notification to userspace.
94 	 */
95 	int (*register_event)(struct mem_cgroup *memcg,
96 			      struct eventfd_ctx *eventfd, const char *args);
97 	/*
98 	 * unregister_event() callback will be called when userspace closes
99 	 * the eventfd or when the cgroup is removed.  This callback must be
100 	 * set if you want to provide notification functionality.
101 	 */
102 	void (*unregister_event)(struct mem_cgroup *memcg,
103 				 struct eventfd_ctx *eventfd);
104 	/*
105 	 * All fields below are needed to unregister the event when
106 	 * userspace closes the eventfd.
107 	 */
108 	poll_table pt;
109 	wait_queue_head_t *wqh;
110 	wait_queue_entry_t wait;
111 	struct work_struct remove;
112 };
113 
114 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
115 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
116 #define MEMFILE_ATTR(val)	((val) & 0xffff)
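/*
 * Pack a resource type (e.g. _MEM or _MEMSWAP) and one of the RES_*
 * attributes below into a single cftype ->private value: the type goes
 * into the upper 16 bits, the attribute into the lower 16 bits, e.g.
 * MEMFILE_PRIVATE(_MEM, RES_LIMIT) yields (_MEM << 16) | RES_LIMIT.
 * The file handlers recover them with MEMFILE_TYPE() and MEMFILE_ATTR().
 */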
117 
118 enum {
119 	RES_USAGE,
120 	RES_LIMIT,
121 	RES_MAX_USAGE,
122 	RES_FAILCNT,
123 	RES_SOFT_LIMIT,
124 };
125 
126 #ifdef CONFIG_LOCKDEP
127 static struct lockdep_map memcg_oom_lock_dep_map = {
128 	.name = "memcg_oom_lock",
129 };
130 #endif
131 
132 DEFINE_SPINLOCK(memcg_oom_lock);
133 
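/*
 * Insert @mz into the per-node soft limit tree, ordered by how far the
 * memcg's usage exceeds its soft limit.  The rightmost node (largest
 * excess) is cached in @mctz->rb_rightmost so that reclaim can pick the
 * worst offender without walking the tree.  Nothing is done if @mz is
 * already on the tree or if there is no excess.
 */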
134 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
135 					 struct mem_cgroup_tree_per_node *mctz,
136 					 unsigned long new_usage_in_excess)
137 {
138 	struct rb_node **p = &mctz->rb_root.rb_node;
139 	struct rb_node *parent = NULL;
140 	struct mem_cgroup_per_node *mz_node;
141 	bool rightmost = true;
142 
143 	if (mz->on_tree)
144 		return;
145 
146 	mz->usage_in_excess = new_usage_in_excess;
147 	if (!mz->usage_in_excess)
148 		return;
149 	while (*p) {
150 		parent = *p;
151 		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
152 					tree_node);
153 		if (mz->usage_in_excess < mz_node->usage_in_excess) {
154 			p = &(*p)->rb_left;
155 			rightmost = false;
156 		} else {
157 			p = &(*p)->rb_right;
158 		}
159 	}
160 
161 	if (rightmost)
162 		mctz->rb_rightmost = &mz->tree_node;
163 
164 	rb_link_node(&mz->tree_node, parent, p);
165 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
166 	mz->on_tree = true;
167 }
168 
169 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
170 					 struct mem_cgroup_tree_per_node *mctz)
171 {
172 	if (!mz->on_tree)
173 		return;
174 
175 	if (&mz->tree_node == mctz->rb_rightmost)
176 		mctz->rb_rightmost = rb_prev(&mz->tree_node);
177 
178 	rb_erase(&mz->tree_node, &mctz->rb_root);
179 	mz->on_tree = false;
180 }
181 
182 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
183 				       struct mem_cgroup_tree_per_node *mctz)
184 {
185 	unsigned long flags;
186 
187 	spin_lock_irqsave(&mctz->lock, flags);
188 	__mem_cgroup_remove_exceeded(mz, mctz);
189 	spin_unlock_irqrestore(&mctz->lock, flags);
190 }
191 
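/*
 * Return the number of pages by which @memcg's memory usage exceeds its
 * soft limit, or 0 if it is at or below the limit.
 */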
192 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
193 {
194 	unsigned long nr_pages = page_counter_read(&memcg->memory);
195 	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
196 	unsigned long excess = 0;
197 
198 	if (nr_pages > soft_limit)
199 		excess = nr_pages - soft_limit;
200 
201 	return excess;
202 }
203 
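/*
 * Reposition @memcg and each of its ancestors in the node's soft limit
 * tree according to their current excess.  With MGLRU enabled the tree
 * is not used; the memcg is only flagged for soft reclaim.
 */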
204 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
205 {
206 	unsigned long excess;
207 	struct mem_cgroup_per_node *mz;
208 	struct mem_cgroup_tree_per_node *mctz;
209 
210 	if (lru_gen_enabled()) {
211 		if (soft_limit_excess(memcg))
212 			lru_gen_soft_reclaim(memcg, nid);
213 		return;
214 	}
215 
216 	mctz = soft_limit_tree.rb_tree_per_node[nid];
217 	if (!mctz)
218 		return;
219 	/*
220 	 * It is necessary to update all ancestors when the hierarchy is used,
221 	 * because their event counters are not touched.
222 	 */
223 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
224 		mz = memcg->nodeinfo[nid];
225 		excess = soft_limit_excess(memcg);
226 		/*
227 		 * We have to update the tree if mz is on the RB-tree or
228 		 * the memcg is over its soft limit.
229 		 */
230 		if (excess || mz->on_tree) {
231 			unsigned long flags;
232 
233 			spin_lock_irqsave(&mctz->lock, flags);
234 			/* if on-tree, remove it */
235 			if (mz->on_tree)
236 				__mem_cgroup_remove_exceeded(mz, mctz);
237 			/*
238 			 * Insert again. mz->usage_in_excess will be updated.
239 			 * If excess is 0, no tree ops.
240 			 */
241 			__mem_cgroup_insert_exceeded(mz, mctz, excess);
242 			spin_unlock_irqrestore(&mctz->lock, flags);
243 		}
244 	}
245 }
246 
247 void memcg1_remove_from_trees(struct mem_cgroup *memcg)
248 {
249 	struct mem_cgroup_tree_per_node *mctz;
250 	struct mem_cgroup_per_node *mz;
251 	int nid;
252 
253 	for_each_node(nid) {
254 		mz = memcg->nodeinfo[nid];
255 		mctz = soft_limit_tree.rb_tree_per_node[nid];
256 		if (mctz)
257 			mem_cgroup_remove_exceeded(mz, mctz);
258 	}
259 }
260 
261 static struct mem_cgroup_per_node *
262 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
263 {
264 	struct mem_cgroup_per_node *mz;
265 
266 retry:
267 	mz = NULL;
268 	if (!mctz->rb_rightmost)
269 		goto done;		/* Nothing to reclaim from */
270 
271 	mz = rb_entry(mctz->rb_rightmost,
272 		      struct mem_cgroup_per_node, tree_node);
273 	/*
274 	 * Remove the node now but someone else can add it back;
275 	 * we will add it back at the end of reclaim to its correct
276 	 * position in the tree.
277 	 */
278 	__mem_cgroup_remove_exceeded(mz, mctz);
279 	if (!soft_limit_excess(mz->memcg) ||
280 	    !css_tryget(&mz->memcg->css))
281 		goto retry;
282 done:
283 	return mz;
284 }
285 
286 static struct mem_cgroup_per_node *
287 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
288 {
289 	struct mem_cgroup_per_node *mz;
290 
291 	spin_lock_irq(&mctz->lock);
292 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
293 	spin_unlock_irq(&mctz->lock);
294 	return mz;
295 }
296 
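/*
 * Walk the hierarchy under @root_memcg and shrink each victim cgroup
 * until @root_memcg is back under its soft limit, a full pass made no
 * progress, enough has been reclaimed (a quarter of the excess), or the
 * loop limit above is hit.  Returns the number of pages reclaimed and
 * adds the pages scanned to @total_scanned.
 */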
297 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
298 				   pg_data_t *pgdat,
299 				   gfp_t gfp_mask,
300 				   unsigned long *total_scanned)
301 {
302 	struct mem_cgroup *victim = NULL;
303 	int total = 0;
304 	int loop = 0;
305 	unsigned long excess;
306 	unsigned long nr_scanned;
307 	struct mem_cgroup_reclaim_cookie reclaim = {
308 		.pgdat = pgdat,
309 	};
310 
311 	excess = soft_limit_excess(root_memcg);
312 
313 	while (1) {
314 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
315 		if (!victim) {
316 			loop++;
317 			if (loop >= 2) {
318 				/*
319 				 * If we have not been able to reclaim
320 				 * anything, it might be because there are
321 				 * no reclaimable pages under this hierarchy.
322 				 */
323 				if (!total)
324 					break;
325 				/*
326 				 * We want to do more targeted reclaim.
327 				 * excess >> 2 is not too large, so we don't
328 				 * reclaim too much, nor too small, so we don't
329 				 * keep coming back to reclaim from this cgroup.
330 				 */
331 				if (total >= (excess >> 2) ||
332 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
333 					break;
334 			}
335 			continue;
336 		}
337 		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
338 					pgdat, &nr_scanned);
339 		*total_scanned += nr_scanned;
340 		if (!soft_limit_excess(root_memcg))
341 			break;
342 	}
343 	mem_cgroup_iter_break(root_memcg, victim);
344 	return total;
345 }
346 
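/*
 * Global soft limit reclaim for one node: repeatedly pick the cgroup
 * with the largest soft limit excess from the node's tree, reclaim from
 * its hierarchy, and put it back at its updated position.  Does nothing
 * for order > 0 allocations or when MGLRU is enabled.  Returns the
 * number of pages reclaimed.
 */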
347 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
348 					    gfp_t gfp_mask,
349 					    unsigned long *total_scanned)
350 {
351 	unsigned long nr_reclaimed = 0;
352 	struct mem_cgroup_per_node *mz, *next_mz = NULL;
353 	unsigned long reclaimed;
354 	int loop = 0;
355 	struct mem_cgroup_tree_per_node *mctz;
356 	unsigned long excess;
357 
358 	if (lru_gen_enabled())
359 		return 0;
360 
361 	if (order > 0)
362 		return 0;
363 
364 	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
365 
366 	/*
367 	 * Do not even bother to check the largest node if the root
368 	 * is empty. Do it lockless to prevent lock bouncing. Races
369 	 * are acceptable as soft limit is best effort anyway.
370 	 */
371 	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
372 		return 0;
373 
374 	/*
375 	 * This loop can run for a while, especially if mem_cgroups continuously
376 	 * keep exceeding their soft limit and putting the system under
377 	 * pressure.
378 	 */
379 	do {
380 		if (next_mz)
381 			mz = next_mz;
382 		else
383 			mz = mem_cgroup_largest_soft_limit_node(mctz);
384 		if (!mz)
385 			break;
386 
387 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
388 						    gfp_mask, total_scanned);
389 		nr_reclaimed += reclaimed;
390 		spin_lock_irq(&mctz->lock);
391 
392 		/*
393 	 * If we failed to reclaim anything from this memory cgroup,
394 	 * it is time to move on to the next cgroup.
395 		 */
396 		next_mz = NULL;
397 		if (!reclaimed)
398 			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
399 
400 		excess = soft_limit_excess(mz->memcg);
401 		/*
402 		 * One school of thought says that we should not add
403 		 * back the node to the tree if reclaim returns 0.
404 	 * But our reclaim could return 0 simply because, due
405 	 * to priority, we are exposing a smaller subset of
406 	 * memory to reclaim from. Consider this as a longer
407 	 * term TODO.
408 		 */
409 		/* If excess == 0, no tree ops */
410 		__mem_cgroup_insert_exceeded(mz, mctz, excess);
411 		spin_unlock_irq(&mctz->lock);
412 		css_put(&mz->memcg->css);
413 		loop++;
414 		/*
415 		 * Could not reclaim anything and there are no more
416 		 * mem cgroups to try or we seem to be looping without
417 		 * reclaiming anything.
418 		 */
419 		if (!nr_reclaimed &&
420 			(next_mz == NULL ||
421 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
422 			break;
423 	} while (!nr_reclaimed);
424 	if (next_mz)
425 		css_put(&next_mz->memcg->css);
426 	return nr_reclaimed;
427 }
428 
429 /*
430  * A routine for checking whether "mem" is under move_account() or not.
431  *
432  * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
433  * the moving cgroups. This is for waiting out the high memory pressure
434  * caused by "move".
435  */
436 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
437 {
438 	struct mem_cgroup *from;
439 	struct mem_cgroup *to;
440 	bool ret = false;
441 	/*
442 	 * Unlike the task_move routines, we access mc.to and mc.from without
443 	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
444 	 */
445 	spin_lock(&mc.lock);
446 	from = mc.from;
447 	to = mc.to;
448 	if (!from)
449 		goto unlock;
450 
451 	ret = mem_cgroup_is_descendant(from, memcg) ||
452 		mem_cgroup_is_descendant(to, memcg);
453 unlock:
454 	spin_unlock(&mc.lock);
455 	return ret;
456 }
457 
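/*
 * If another task is currently moving charges into or out of @memcg's
 * hierarchy, sleep until that move finishes (or we are woken up) and
 * return true so the caller can retry.  Returns false otherwise.
 */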
458 bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
459 {
460 	if (mc.moving_task && current != mc.moving_task) {
461 		if (mem_cgroup_under_move(memcg)) {
462 			DEFINE_WAIT(wait);
463 			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
464 			/* moving charge context might have finished. */
465 			if (mc.moving_task)
466 				schedule();
467 			finish_wait(&mc.waitq, &wait);
468 			return true;
469 		}
470 	}
471 	return false;
472 }
473 
474 /**
475  * folio_memcg_lock - Bind a folio to its memcg.
476  * @folio: The folio.
477  *
478  * This function prevents unlocked LRU folios from being moved to
479  * another cgroup.
480  *
481  * It ensures lifetime of the bound memcg.  The caller is responsible
482  * for the lifetime of the folio.
483  */
484 void folio_memcg_lock(struct folio *folio)
485 {
486 	struct mem_cgroup *memcg;
487 	unsigned long flags;
488 
489 	/*
490 	 * The RCU lock is held throughout the transaction.  The fast
491 	 * path can get away without acquiring the memcg->move_lock
492 	 * because page moving starts with an RCU grace period.
493 	 */
494 	rcu_read_lock();
495 
496 	if (mem_cgroup_disabled())
497 		return;
498 again:
499 	memcg = folio_memcg(folio);
500 	if (unlikely(!memcg))
501 		return;
502 
503 #ifdef CONFIG_PROVE_LOCKING
504 	local_irq_save(flags);
505 	might_lock(&memcg->move_lock);
506 	local_irq_restore(flags);
507 #endif
508 
509 	if (atomic_read(&memcg->moving_account) <= 0)
510 		return;
511 
512 	spin_lock_irqsave(&memcg->move_lock, flags);
513 	if (memcg != folio_memcg(folio)) {
514 		spin_unlock_irqrestore(&memcg->move_lock, flags);
515 		goto again;
516 	}
517 
518 	/*
519 	 * When charge migration first begins, we can have multiple
520 	 * critical sections holding the fast-path RCU lock and one
521 	 * holding the slowpath move_lock. Track the task that holds the
522 	 * move_lock for folio_memcg_unlock().
523 	 */
524 	memcg->move_lock_task = current;
525 	memcg->move_lock_flags = flags;
526 }
527 
528 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
529 {
530 	if (memcg && memcg->move_lock_task == current) {
531 		unsigned long flags = memcg->move_lock_flags;
532 
533 		memcg->move_lock_task = NULL;
534 		memcg->move_lock_flags = 0;
535 
536 		spin_unlock_irqrestore(&memcg->move_lock, flags);
537 	}
538 
539 	rcu_read_unlock();
540 }
541 
542 /**
543  * folio_memcg_unlock - Release the binding between a folio and its memcg.
544  * @folio: The folio.
545  *
546  * This releases the binding created by folio_memcg_lock().  This does
547  * not change the accounting of this folio to its memcg, but it does
548  * permit others to change it.
549  */
550 void folio_memcg_unlock(struct folio *folio)
551 {
552 	__folio_memcg_unlock(folio_memcg(folio));
553 }
554 
555 #ifdef CONFIG_SWAP
556 /**
557  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
558  * @entry: swap entry to be moved
559  * @from:  mem_cgroup which the entry is moved from
560  * @to:  mem_cgroup which the entry is moved to
561  *
562  * It succeeds only when the swap_cgroup's record for this entry is the same
563  * as the mem_cgroup's id of @from.
564  *
565  * Returns 0 on success, -EINVAL on failure.
566  *
567  * The caller must have charged to @to, IOW, called page_counter_charge() for
568  * both res and memsw, and called css_get().
569  */
570 static int mem_cgroup_move_swap_account(swp_entry_t entry,
571 				struct mem_cgroup *from, struct mem_cgroup *to)
572 {
573 	unsigned short old_id, new_id;
574 
575 	old_id = mem_cgroup_id(from);
576 	new_id = mem_cgroup_id(to);
577 
578 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
579 		mod_memcg_state(from, MEMCG_SWAP, -1);
580 		mod_memcg_state(to, MEMCG_SWAP, 1);
581 		return 0;
582 	}
583 	return -EINVAL;
584 }
585 #else
586 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
587 				struct mem_cgroup *from, struct mem_cgroup *to)
588 {
589 	return -EINVAL;
590 }
591 #endif
592 
593 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
594 				struct cftype *cft)
595 {
596 	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
597 }
598 
599 #ifdef CONFIG_MMU
600 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
601 				 struct cftype *cft, u64 val)
602 {
603 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
604 
605 	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
606 		     "Please report your usecase to linux-mm@kvack.org if you "
607 		     "depend on this functionality.\n");
608 
609 	if (val & ~MOVE_MASK)
610 		return -EINVAL;
611 
612 	/*
613 	 * No kind of locking is needed in here, because ->can_attach() will
614 	 * check this value once in the beginning of the process, and then carry
615 	 * on with stale data. This means that changes to this value will only
616 	 * affect task migrations starting after the change.
617 	 */
618 	memcg->move_charge_at_immigrate = val;
619 	return 0;
620 }
621 #else
622 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
623 				 struct cftype *cft, u64 val)
624 {
625 	return -ENOSYS;
626 }
627 #endif
628 
629 #ifdef CONFIG_MMU
630 /* Handlers for move charge at task migration. */
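/*
 * Charge @count pages to mc.to up front.  First try a single batch
 * charge without direct reclaim; if that fails, fall back to charging
 * one page at a time, allowing reclaim but no retries.  Successfully
 * charged pages are accounted in mc.precharge for later use.
 */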
631 static int mem_cgroup_do_precharge(unsigned long count)
632 {
633 	int ret;
634 
635 	/* Try a single bulk charge without reclaim first, kswapd may wake */
636 	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
637 	if (!ret) {
638 		mc.precharge += count;
639 		return ret;
640 	}
641 
642 	/* Try charges one by one with reclaim, but do not retry */
643 	while (count--) {
644 		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
645 		if (ret)
646 			return ret;
647 		mc.precharge++;
648 		cond_resched();
649 	}
650 	return 0;
651 }
652 
653 union mc_target {
654 	struct folio	*folio;
655 	swp_entry_t	ent;
656 };
657 
658 enum mc_target_type {
659 	MC_TARGET_NONE = 0,
660 	MC_TARGET_PAGE,
661 	MC_TARGET_SWAP,
662 	MC_TARGET_DEVICE,
663 };
664 
665 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
666 						unsigned long addr, pte_t ptent)
667 {
668 	struct page *page = vm_normal_page(vma, addr, ptent);
669 
670 	if (!page)
671 		return NULL;
672 	if (PageAnon(page)) {
673 		if (!(mc.flags & MOVE_ANON))
674 			return NULL;
675 	} else {
676 		if (!(mc.flags & MOVE_FILE))
677 			return NULL;
678 	}
679 	get_page(page);
680 
681 	return page;
682 }
683 
684 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
685 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
686 			pte_t ptent, swp_entry_t *entry)
687 {
688 	struct page *page = NULL;
689 	swp_entry_t ent = pte_to_swp_entry(ptent);
690 
691 	if (!(mc.flags & MOVE_ANON))
692 		return NULL;
693 
694 	/*
695 	 * Handle device private pages that are not accessible by the CPU, but
696 	 * stored as special swap entries in the page table.
697 	 */
698 	if (is_device_private_entry(ent)) {
699 		page = pfn_swap_entry_to_page(ent);
700 		if (!get_page_unless_zero(page))
701 			return NULL;
702 		return page;
703 	}
704 
705 	if (non_swap_entry(ent))
706 		return NULL;
707 
708 	/*
709 	 * Because swap_cache_get_folio() updates some statistics counters,
710 	 * we call find_get_page() with swapper_space directly.
711 	 */
712 	page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
713 	entry->val = ent.val;
714 
715 	return page;
716 }
717 #else
718 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
719 			pte_t ptent, swp_entry_t *entry)
720 {
721 	return NULL;
722 }
723 #endif
724 
725 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
726 			unsigned long addr, pte_t ptent)
727 {
728 	unsigned long index;
729 	struct folio *folio;
730 
731 	if (!vma->vm_file) /* anonymous vma */
732 		return NULL;
733 	if (!(mc.flags & MOVE_FILE))
734 		return NULL;
735 
736 	/* The folio is moved even if it's not RSS of this task (page-faulted). */
737 	/* shmem/tmpfs may report page out on swap: account for that too. */
738 	index = linear_page_index(vma, addr);
739 	folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
740 	if (IS_ERR(folio))
741 		return NULL;
742 	return folio_file_page(folio, index);
743 }
744 
745 static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
746 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
747 
748 /**
749  * mem_cgroup_move_account - move account of the folio
750  * @folio: The folio.
751  * @compound: charge the page as compound or small page
752  * @from: mem_cgroup which the folio is moved from.
753  * @to:	mem_cgroup which the folio is moved to. @from != @to.
754  *
755  * The folio must be locked and not on the LRU.
756  *
757  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
758  * from old cgroup.
759  */
760 static int mem_cgroup_move_account(struct folio *folio,
761 				   bool compound,
762 				   struct mem_cgroup *from,
763 				   struct mem_cgroup *to)
764 {
765 	struct lruvec *from_vec, *to_vec;
766 	struct pglist_data *pgdat;
767 	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
768 	int nid, ret;
769 
770 	VM_BUG_ON(from == to);
771 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
772 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
773 	VM_BUG_ON(compound && !folio_test_large(folio));
774 
775 	ret = -EINVAL;
776 	if (folio_memcg(folio) != from)
777 		goto out;
778 
779 	pgdat = folio_pgdat(folio);
780 	from_vec = mem_cgroup_lruvec(from, pgdat);
781 	to_vec = mem_cgroup_lruvec(to, pgdat);
782 
783 	folio_memcg_lock(folio);
784 
785 	if (folio_test_anon(folio)) {
786 		if (folio_mapped(folio)) {
787 			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
788 			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
789 			if (folio_test_pmd_mappable(folio)) {
790 				__mod_lruvec_state(from_vec, NR_ANON_THPS,
791 						   -nr_pages);
792 				__mod_lruvec_state(to_vec, NR_ANON_THPS,
793 						   nr_pages);
794 			}
795 		}
796 	} else {
797 		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
798 		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
799 
800 		if (folio_test_swapbacked(folio)) {
801 			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
802 			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
803 		}
804 
805 		if (folio_mapped(folio)) {
806 			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
807 			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
808 		}
809 
810 		if (folio_test_dirty(folio)) {
811 			struct address_space *mapping = folio_mapping(folio);
812 
813 			if (mapping_can_writeback(mapping)) {
814 				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
815 						   -nr_pages);
816 				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
817 						   nr_pages);
818 			}
819 		}
820 	}
821 
822 #ifdef CONFIG_SWAP
823 	if (folio_test_swapcache(folio)) {
824 		__mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
825 		__mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
826 	}
827 #endif
828 	if (folio_test_writeback(folio)) {
829 		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
830 		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
831 	}
832 
833 	/*
834 	 * All state has been migrated, let's switch to the new memcg.
835 	 *
836 	 * It is safe to change page's memcg here because the page
837 	 * is referenced, charged, isolated, and locked: we can't race
838 	 * with (un)charging, migration, LRU putback, or anything else
839 	 * that would rely on a stable page's memory cgroup.
840 	 *
841 	 * Note that folio_memcg_lock is a memcg lock, not a page lock,
842 	 * to save space. As soon as we switch page's memory cgroup to a
843 	 * new memcg that isn't locked, the above state can change
844 	 * concurrently again. Make sure we're truly done with it.
845 	 */
846 	smp_mb();
847 
848 	css_get(&to->css);
849 	css_put(&from->css);
850 
851 	/* Warning should never happen, so don't worry about refcount non-0 */
852 	WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
853 	folio->memcg_data = (unsigned long)to;
854 
855 	__folio_memcg_unlock(from);
856 
857 	ret = 0;
858 	nid = folio_nid(folio);
859 
860 	local_irq_disable();
861 	memcg1_charge_statistics(to, nr_pages);
862 	memcg1_check_events(to, nid);
863 	memcg1_charge_statistics(from, -nr_pages);
864 	memcg1_check_events(from, nid);
865 	local_irq_enable();
866 out:
867 	return ret;
868 }
869 
870 /**
871  * get_mctgt_type - get target type of moving charge
872  * @vma: the vma the pte to be checked belongs
873  * @addr: the address corresponding to the pte to be checked
874  * @ptent: the pte to be checked
875  * @target: the pointer where the target folio or swap entry will be stored (can be NULL)
876  *
877  * Context: Called with pte lock held.
878  * Return:
879  * * MC_TARGET_NONE - If the pte is not a target for move charge.
880  * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
881  *   move charge. If @target is not NULL, the folio is stored in target->folio
882  *   with extra refcnt taken (Caller should release it).
883  * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
884  *   target for charge migration.  If @target is not NULL, the entry is
885  *   stored in target->ent.
886  * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
887  *   thus not on the lru.  For now such a page is charged like a regular page
888  *   would be, as it is just special memory taking the place of a regular page.
889  *   See Documentation/mm/hmm.rst and include/linux/hmm.h.
890  */
891 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
892 		unsigned long addr, pte_t ptent, union mc_target *target)
893 {
894 	struct page *page = NULL;
895 	struct folio *folio;
896 	enum mc_target_type ret = MC_TARGET_NONE;
897 	swp_entry_t ent = { .val = 0 };
898 
899 	if (pte_present(ptent))
900 		page = mc_handle_present_pte(vma, addr, ptent);
901 	else if (pte_none_mostly(ptent))
902 		/*
903 		 * PTE markers should be treated as a none pte here, separated
904 		 * from other swap handling below.
905 		 */
906 		page = mc_handle_file_pte(vma, addr, ptent);
907 	else if (is_swap_pte(ptent))
908 		page = mc_handle_swap_pte(vma, ptent, &ent);
909 
910 	if (page)
911 		folio = page_folio(page);
912 	if (target && page) {
913 		if (!folio_trylock(folio)) {
914 			folio_put(folio);
915 			return ret;
916 		}
917 		/*
918 		 * page_mapped() must be stable during the move. This
919 		 * pte is locked, so if it's present, the page cannot
920 		 * become unmapped. If it isn't, we have only partial
921 		 * control over the mapped state: the page lock will
922 		 * prevent new faults against pagecache and swapcache,
923 		 * so an unmapped page cannot become mapped. However,
924 		 * if the page is already mapped elsewhere, it can
925 		 * unmap, and there is nothing we can do about it.
926 		 * Alas, skip moving the page in this case.
927 		 */
928 		if (!pte_present(ptent) && page_mapped(page)) {
929 			folio_unlock(folio);
930 			folio_put(folio);
931 			return ret;
932 		}
933 	}
934 
935 	if (!page && !ent.val)
936 		return ret;
937 	if (page) {
938 		/*
939 		 * Do only a loose check without serialization.
940 		 * mem_cgroup_move_account() checks whether the page is
941 		 * valid under LRU exclusion.
942 		 */
943 		if (folio_memcg(folio) == mc.from) {
944 			ret = MC_TARGET_PAGE;
945 			if (folio_is_device_private(folio) ||
946 			    folio_is_device_coherent(folio))
947 				ret = MC_TARGET_DEVICE;
948 			if (target)
949 				target->folio = folio;
950 		}
951 		if (!ret || !target) {
952 			if (target)
953 				folio_unlock(folio);
954 			folio_put(folio);
955 		}
956 	}
957 	/*
958 	 * There is a swap entry and a page doesn't exist or isn't charged.
959 	 * But we cannot move a tail page of a THP.
960 	 */
961 	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
962 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
963 		ret = MC_TARGET_SWAP;
964 		if (target)
965 			target->ent = ent;
966 	}
967 	return ret;
968 }
969 
970 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
971 /*
972  * We don't consider PMD mapped swapping or file mapped pages because THP does
973  * not support them for now.
974  * Caller should make sure that pmd_trans_huge(pmd) is true.
975  */
976 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
977 		unsigned long addr, pmd_t pmd, union mc_target *target)
978 {
979 	struct page *page = NULL;
980 	struct folio *folio;
981 	enum mc_target_type ret = MC_TARGET_NONE;
982 
983 	if (unlikely(is_swap_pmd(pmd))) {
984 		VM_BUG_ON(thp_migration_supported() &&
985 				  !is_pmd_migration_entry(pmd));
986 		return ret;
987 	}
988 	page = pmd_page(pmd);
989 	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
990 	folio = page_folio(page);
991 	if (!(mc.flags & MOVE_ANON))
992 		return ret;
993 	if (folio_memcg(folio) == mc.from) {
994 		ret = MC_TARGET_PAGE;
995 		if (target) {
996 			folio_get(folio);
997 			if (!folio_trylock(folio)) {
998 				folio_put(folio);
999 				return MC_TARGET_NONE;
1000 			}
1001 			target->folio = folio;
1002 		}
1003 	}
1004 	return ret;
1005 }
1006 #else
1007 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
1008 		unsigned long addr, pmd_t pmd, union mc_target *target)
1009 {
1010 	return MC_TARGET_NONE;
1011 }
1012 #endif
1013 
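/*
 * Page table walk callback: count how many charges would have to be
 * moved for this pmd range and accumulate the result in mc.precharge.
 * Nothing is moved here; the actual moving happens later in
 * mem_cgroup_move_charge_pte_range().
 */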
1014 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
1015 					unsigned long addr, unsigned long end,
1016 					struct mm_walk *walk)
1017 {
1018 	struct vm_area_struct *vma = walk->vma;
1019 	pte_t *pte;
1020 	spinlock_t *ptl;
1021 
1022 	ptl = pmd_trans_huge_lock(pmd, vma);
1023 	if (ptl) {
1024 		/*
1025 		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
1026 		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE, but
1027 		 * this might change.
1028 		 */
1029 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
1030 			mc.precharge += HPAGE_PMD_NR;
1031 		spin_unlock(ptl);
1032 		return 0;
1033 	}
1034 
1035 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1036 	if (!pte)
1037 		return 0;
1038 	for (; addr != end; pte++, addr += PAGE_SIZE)
1039 		if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
1040 			mc.precharge++;	/* increment precharge temporarily */
1041 	pte_unmap_unlock(pte - 1, ptl);
1042 	cond_resched();
1043 
1044 	return 0;
1045 }
1046 
1047 static const struct mm_walk_ops precharge_walk_ops = {
1048 	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
1049 	.walk_lock	= PGWALK_RDLOCK,
1050 };
1051 
1052 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
1053 {
1054 	unsigned long precharge;
1055 
1056 	mmap_read_lock(mm);
1057 	walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
1058 	mmap_read_unlock(mm);
1059 
1060 	precharge = mc.precharge;
1061 	mc.precharge = 0;
1062 
1063 	return precharge;
1064 }
1065 
1066 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
1067 {
1068 	unsigned long precharge = mem_cgroup_count_precharge(mm);
1069 
1070 	VM_BUG_ON(mc.moving_task);
1071 	mc.moving_task = current;
1072 	return mem_cgroup_do_precharge(precharge);
1073 }
1074 
1075 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
1076 static void __mem_cgroup_clear_mc(void)
1077 {
1078 	struct mem_cgroup *from = mc.from;
1079 	struct mem_cgroup *to = mc.to;
1080 
1081 	/* we must uncharge all the leftover precharges from mc.to */
1082 	if (mc.precharge) {
1083 		mem_cgroup_cancel_charge(mc.to, mc.precharge);
1084 		mc.precharge = 0;
1085 	}
1086 	/*
1087 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
1088 	 * we must uncharge here.
1089 	 */
1090 	if (mc.moved_charge) {
1091 		mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
1092 		mc.moved_charge = 0;
1093 	}
1094 	/* we must fixup refcnts and charges */
1095 	if (mc.moved_swap) {
1096 		/* uncharge swap account from the old cgroup */
1097 		if (!mem_cgroup_is_root(mc.from))
1098 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
1099 
1100 		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
1101 
1102 		/*
1103 		 * we charged both to->memory and to->memsw, so we
1104 		 * should uncharge to->memory.
1105 		 */
1106 		if (!mem_cgroup_is_root(mc.to))
1107 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
1108 
1109 		mc.moved_swap = 0;
1110 	}
1111 	memcg1_oom_recover(from);
1112 	memcg1_oom_recover(to);
1113 	wake_up_all(&mc.waitq);
1114 }
1115 
1116 static void mem_cgroup_clear_mc(void)
1117 {
1118 	struct mm_struct *mm = mc.mm;
1119 
1120 	/*
1121 	 * we must clear moving_task before waking up waiters at the end of
1122 	 * task migration.
1123 	 */
1124 	mc.moving_task = NULL;
1125 	__mem_cgroup_clear_mc();
1126 	spin_lock(&mc.lock);
1127 	mc.from = NULL;
1128 	mc.to = NULL;
1129 	mc.mm = NULL;
1130 	spin_unlock(&mc.lock);
1131 
1132 	mmput(mm);
1133 }
1134 
1135 int memcg1_can_attach(struct cgroup_taskset *tset)
1136 {
1137 	struct cgroup_subsys_state *css;
1138 	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
1139 	struct mem_cgroup *from;
1140 	struct task_struct *leader, *p;
1141 	struct mm_struct *mm;
1142 	unsigned long move_flags;
1143 	int ret = 0;
1144 
1145 	/* charge immigration isn't supported on the default hierarchy */
1146 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1147 		return 0;
1148 
1149 	/*
1150 	 * Multi-process migrations only happen on the default hierarchy
1151 	 * where charge immigration is not used.  Perform charge
1152 	 * immigration if @tset contains a leader and whine if there are
1153 	 * multiple.
1154 	 */
1155 	p = NULL;
1156 	cgroup_taskset_for_each_leader(leader, css, tset) {
1157 		WARN_ON_ONCE(p);
1158 		p = leader;
1159 		memcg = mem_cgroup_from_css(css);
1160 	}
1161 	if (!p)
1162 		return 0;
1163 
1164 	/*
1165 	 * We are now committed to this value whatever it is. Changes in this
1166 	 * tunable will only affect upcoming migrations, not the current one.
1167 	 * So we need to save it, and keep it going.
1168 	 */
1169 	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
1170 	if (!move_flags)
1171 		return 0;
1172 
1173 	from = mem_cgroup_from_task(p);
1174 
1175 	VM_BUG_ON(from == memcg);
1176 
1177 	mm = get_task_mm(p);
1178 	if (!mm)
1179 		return 0;
1180 	/* We move charges only when we move the owner of the mm */
1181 	if (mm->owner == p) {
1182 		VM_BUG_ON(mc.from);
1183 		VM_BUG_ON(mc.to);
1184 		VM_BUG_ON(mc.precharge);
1185 		VM_BUG_ON(mc.moved_charge);
1186 		VM_BUG_ON(mc.moved_swap);
1187 
1188 		spin_lock(&mc.lock);
1189 		mc.mm = mm;
1190 		mc.from = from;
1191 		mc.to = memcg;
1192 		mc.flags = move_flags;
1193 		spin_unlock(&mc.lock);
1194 		/* We set mc.moving_task later */
1195 
1196 		ret = mem_cgroup_precharge_mc(mm);
1197 		if (ret)
1198 			mem_cgroup_clear_mc();
1199 	} else {
1200 		mmput(mm);
1201 	}
1202 	return ret;
1203 }
1204 
1205 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1206 {
1207 	if (mc.to)
1208 		mem_cgroup_clear_mc();
1209 }
1210 
1211 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
1212 				unsigned long addr, unsigned long end,
1213 				struct mm_walk *walk)
1214 {
1215 	int ret = 0;
1216 	struct vm_area_struct *vma = walk->vma;
1217 	pte_t *pte;
1218 	spinlock_t *ptl;
1219 	enum mc_target_type target_type;
1220 	union mc_target target;
1221 	struct folio *folio;
1222 	bool tried_split_before = false;
1223 
1224 retry_pmd:
1225 	ptl = pmd_trans_huge_lock(pmd, vma);
1226 	if (ptl) {
1227 		if (mc.precharge < HPAGE_PMD_NR) {
1228 			spin_unlock(ptl);
1229 			return 0;
1230 		}
1231 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
1232 		if (target_type == MC_TARGET_PAGE) {
1233 			folio = target.folio;
1234 			/*
1235 			 * Deferred split queue locking depends on memcg,
1236 			 * and unqueue is unsafe unless folio refcount is 0:
1237 			 * split or skip if on the queue? first try to split.
1238 			 */
1239 			if (!list_empty(&folio->_deferred_list)) {
1240 				spin_unlock(ptl);
1241 				if (!tried_split_before)
1242 					split_folio(folio);
1243 				folio_unlock(folio);
1244 				folio_put(folio);
1245 				if (tried_split_before)
1246 					return 0;
1247 				tried_split_before = true;
1248 				goto retry_pmd;
1249 			}
1250 			/*
1251 			 * So long as that pmd lock is held, the folio cannot
1252 			 * be racily added to the _deferred_list, because
1253 			 * __folio_remove_rmap() will find !partially_mapped.
1254 			 */
1255 			if (folio_isolate_lru(folio)) {
1256 				if (!mem_cgroup_move_account(folio, true,
1257 							     mc.from, mc.to)) {
1258 					mc.precharge -= HPAGE_PMD_NR;
1259 					mc.moved_charge += HPAGE_PMD_NR;
1260 				}
1261 				folio_putback_lru(folio);
1262 			}
1263 			folio_unlock(folio);
1264 			folio_put(folio);
1265 		} else if (target_type == MC_TARGET_DEVICE) {
1266 			folio = target.folio;
1267 			if (!mem_cgroup_move_account(folio, true,
1268 						     mc.from, mc.to)) {
1269 				mc.precharge -= HPAGE_PMD_NR;
1270 				mc.moved_charge += HPAGE_PMD_NR;
1271 			}
1272 			folio_unlock(folio);
1273 			folio_put(folio);
1274 		}
1275 		spin_unlock(ptl);
1276 		return 0;
1277 	}
1278 
1279 retry:
1280 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1281 	if (!pte)
1282 		return 0;
1283 	for (; addr != end; addr += PAGE_SIZE) {
1284 		pte_t ptent = ptep_get(pte++);
1285 		bool device = false;
1286 		swp_entry_t ent;
1287 
1288 		if (!mc.precharge)
1289 			break;
1290 
1291 		switch (get_mctgt_type(vma, addr, ptent, &target)) {
1292 		case MC_TARGET_DEVICE:
1293 			device = true;
1294 			fallthrough;
1295 		case MC_TARGET_PAGE:
1296 			folio = target.folio;
1297 			/*
1298 			 * We can have a part of the split pmd here. Moving it
1299 			 * can be done but it would be too convoluted so simply
1300 			 * ignore such a partial THP and keep it in the original
1301 			 * memcg. There should be somebody mapping the head.
1302 			 */
1303 			if (folio_test_large(folio))
1304 				goto put;
1305 			if (!device && !folio_isolate_lru(folio))
1306 				goto put;
1307 			if (!mem_cgroup_move_account(folio, false,
1308 						mc.from, mc.to)) {
1309 				mc.precharge--;
1310 				/* we uncharge from mc.from later. */
1311 				mc.moved_charge++;
1312 			}
1313 			if (!device)
1314 				folio_putback_lru(folio);
1315 put:			/* get_mctgt_type() gets & locks the page */
1316 			folio_unlock(folio);
1317 			folio_put(folio);
1318 			break;
1319 		case MC_TARGET_SWAP:
1320 			ent = target.ent;
1321 			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
1322 				mc.precharge--;
1323 				mem_cgroup_id_get_many(mc.to, 1);
1324 				/* we fixup other refcnts and charges later. */
1325 				mc.moved_swap++;
1326 			}
1327 			break;
1328 		default:
1329 			break;
1330 		}
1331 	}
1332 	pte_unmap_unlock(pte - 1, ptl);
1333 	cond_resched();
1334 
1335 	if (addr != end) {
1336 		/*
1337 		 * We have consumed all precharges we got in can_attach().
1338 		 * We try to charge one by one, but don't do any additional
1339 		 * charges to mc.to if we have failed to charge once in the
1340 		 * attach() phase.
1341 		 */
1342 		ret = mem_cgroup_do_precharge(1);
1343 		if (!ret)
1344 			goto retry;
1345 	}
1346 
1347 	return ret;
1348 }
1349 
1350 static const struct mm_walk_ops charge_walk_ops = {
1351 	.pmd_entry	= mem_cgroup_move_charge_pte_range,
1352 	.walk_lock	= PGWALK_RDLOCK,
1353 };
1354 
1355 static void mem_cgroup_move_charge(void)
1356 {
1357 	lru_add_drain_all();
1358 	/*
1359 	 * Signal folio_memcg_lock() to take the memcg's move_lock
1360 	 * while we're moving its pages to another memcg. Then wait
1361 	 * for already started RCU-only updates to finish.
1362 	 */
1363 	atomic_inc(&mc.from->moving_account);
1364 	synchronize_rcu();
1365 retry:
1366 	if (unlikely(!mmap_read_trylock(mc.mm))) {
1367 		/*
1368 		 * Someone who is holding the mmap_lock might be waiting on the
1369 		 * waitq. So we cancel all extra charges, wake up all waiters,
1370 		 * and retry. Because we cancel precharges, we might not be able
1371 		 * to move enough charges, but moving charge is a best-effort
1372 		 * feature anyway, so it wouldn't be a big problem.
1373 		 */
1374 		__mem_cgroup_clear_mc();
1375 		cond_resched();
1376 		goto retry;
1377 	}
1378 	/*
1379 	 * When we have consumed all precharges and failed in doing
1380 	 * additional charge, the page walk just aborts.
1381 	 */
1382 	walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
1383 	mmap_read_unlock(mc.mm);
1384 	atomic_dec(&mc.from->moving_account);
1385 }
1386 
1387 void memcg1_move_task(void)
1388 {
1389 	if (mc.to) {
1390 		mem_cgroup_move_charge();
1391 		mem_cgroup_clear_mc();
1392 	}
1393 }
1394 
1395 #else	/* !CONFIG_MMU */
1396 int memcg1_can_attach(struct cgroup_taskset *tset)
1397 {
1398 	return 0;
1399 }
1400 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1401 {
1402 }
1403 void memcg1_move_task(void)
1404 {
1405 }
1406 #endif
1407 
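/*
 * Signal every eventfd whose threshold has been crossed since the last
 * check.  The thresholds are kept in a sorted array; scan downwards from
 * current_threshold for thresholds that usage has fallen below, upwards
 * for thresholds that usage has risen above, then record the new
 * current_threshold.
 */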
1408 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
1409 {
1410 	struct mem_cgroup_threshold_ary *t;
1411 	unsigned long usage;
1412 	int i;
1413 
1414 	rcu_read_lock();
1415 	if (!swap)
1416 		t = rcu_dereference(memcg->thresholds.primary);
1417 	else
1418 		t = rcu_dereference(memcg->memsw_thresholds.primary);
1419 
1420 	if (!t)
1421 		goto unlock;
1422 
1423 	usage = mem_cgroup_usage(memcg, swap);
1424 
1425 	/*
1426 	 * current_threshold points to the threshold just below or equal to usage.
1427 	 * If that's not true, a threshold was crossed after the last
1428 	 * call of __mem_cgroup_threshold().
1429 	 */
1430 	i = t->current_threshold;
1431 
1432 	/*
1433 	 * Iterate backward over array of thresholds starting from
1434 	 * current_threshold and check if a threshold is crossed.
1435 	 * If none of thresholds below usage is crossed, we read
1436 	 * only one element of the array here.
1437 	 */
1438 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
1439 		eventfd_signal(t->entries[i].eventfd);
1440 
1441 	/* i = current_threshold + 1 */
1442 	i++;
1443 
1444 	/*
1445 	 * Iterate forward over array of thresholds starting from
1446 	 * current_threshold+1 and check if a threshold is crossed.
1447 	 * If none of thresholds above usage is crossed, we read
1448 	 * only one element of the array here.
1449 	 */
1450 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
1451 		eventfd_signal(t->entries[i].eventfd);
1452 
1453 	/* Update current_threshold */
1454 	t->current_threshold = i - 1;
1455 unlock:
1456 	rcu_read_unlock();
1457 }
1458 
1459 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
1460 {
1461 	while (memcg) {
1462 		__mem_cgroup_threshold(memcg, false);
1463 		if (do_memsw_account())
1464 			__mem_cgroup_threshold(memcg, true);
1465 
1466 		memcg = parent_mem_cgroup(memcg);
1467 	}
1468 }
1469 
1470 /* Cgroup1: threshold notifications & softlimit tree updates */
1471 struct memcg1_events_percpu {
1472 	unsigned long nr_page_events;
1473 	unsigned long targets[MEM_CGROUP_NTARGETS];
1474 };
1475 
1476 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
1477 {
1478 	/* pagein of a big page is an event. So, ignore page size */
1479 	if (nr_pages > 0)
1480 		__count_memcg_events(memcg, PGPGIN, 1);
1481 	else {
1482 		__count_memcg_events(memcg, PGPGOUT, 1);
1483 		nr_pages = -nr_pages; /* for event */
1484 	}
1485 
1486 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
1487 }
1488 
1489 #define THRESHOLDS_EVENTS_TARGET 128
1490 #define SOFTLIMIT_EVENTS_TARGET 1024
1491 
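/*
 * Rate limit the expensive event checks: return true roughly once every
 * THRESHOLDS_EVENTS_TARGET (or SOFTLIMIT_EVENTS_TARGET) page events,
 * based on the per-cpu nr_page_events counter, and advance the per-cpu
 * target for the next check.
 */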
1492 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
1493 				enum mem_cgroup_events_target target)
1494 {
1495 	unsigned long val, next;
1496 
1497 	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
1498 	next = __this_cpu_read(memcg->events_percpu->targets[target]);
1499 	/* from time_after() in jiffies.h */
1500 	if ((long)(next - val) < 0) {
1501 		switch (target) {
1502 		case MEM_CGROUP_TARGET_THRESH:
1503 			next = val + THRESHOLDS_EVENTS_TARGET;
1504 			break;
1505 		case MEM_CGROUP_TARGET_SOFTLIMIT:
1506 			next = val + SOFTLIMIT_EVENTS_TARGET;
1507 			break;
1508 		default:
1509 			break;
1510 		}
1511 		__this_cpu_write(memcg->events_percpu->targets[target], next);
1512 		return true;
1513 	}
1514 	return false;
1515 }
1516 
1517 /*
1518  * Check events in order: thresholds are checked first, then, less
1519  * frequently, the soft limit tree is updated.
1520  */
1521 static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
1522 {
1523 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
1524 		return;
1525 
1526 	/* threshold event is triggered in finer grain than soft limit */
1527 	if (unlikely(memcg1_event_ratelimit(memcg,
1528 						MEM_CGROUP_TARGET_THRESH))) {
1529 		bool do_softlimit;
1530 
1531 		do_softlimit = memcg1_event_ratelimit(memcg,
1532 						MEM_CGROUP_TARGET_SOFTLIMIT);
1533 		mem_cgroup_threshold(memcg);
1534 		if (unlikely(do_softlimit))
1535 			memcg1_update_tree(memcg, nid);
1536 	}
1537 }
1538 
1539 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
1540 {
1541 	unsigned long flags;
1542 
1543 	local_irq_save(flags);
1544 	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
1545 	memcg1_check_events(memcg, folio_nid(folio));
1546 	local_irq_restore(flags);
1547 }
1548 
1549 void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
1550 {
1551 	/*
1552 	 * Interrupts should be disabled here because the caller holds the
1553 	 * i_pages lock which is taken with interrupts-off. It is
1554 	 * important here to have the interrupts disabled because it is the
1555 	 * only synchronisation we have for updating the per-CPU variables.
1556 	 */
1557 	preempt_disable_nested();
1558 	VM_WARN_ON_IRQS_ENABLED();
1559 	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
1560 	preempt_enable_nested();
1561 	memcg1_check_events(memcg, folio_nid(folio));
1562 }
1563 
1564 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
1565 			   unsigned long nr_memory, int nid)
1566 {
1567 	unsigned long flags;
1568 
1569 	local_irq_save(flags);
1570 	__count_memcg_events(memcg, PGPGOUT, pgpgout);
1571 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
1572 	memcg1_check_events(memcg, nid);
1573 	local_irq_restore(flags);
1574 }
1575 
1576 static int compare_thresholds(const void *a, const void *b)
1577 {
1578 	const struct mem_cgroup_threshold *_a = a;
1579 	const struct mem_cgroup_threshold *_b = b;
1580 
1581 	if (_a->threshold > _b->threshold)
1582 		return 1;
1583 
1584 	if (_a->threshold < _b->threshold)
1585 		return -1;
1586 
1587 	return 0;
1588 }
1589 
1590 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
1591 {
1592 	struct mem_cgroup_eventfd_list *ev;
1593 
1594 	spin_lock(&memcg_oom_lock);
1595 
1596 	list_for_each_entry(ev, &memcg->oom_notify, list)
1597 		eventfd_signal(ev->eventfd);
1598 
1599 	spin_unlock(&memcg_oom_lock);
1600 	return 0;
1601 }
1602 
1603 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
1604 {
1605 	struct mem_cgroup *iter;
1606 
1607 	for_each_mem_cgroup_tree(iter, memcg)
1608 		mem_cgroup_oom_notify_cb(iter);
1609 }
1610 
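/*
 * Register @eventfd to be signalled when memory (or memory+swap) usage
 * crosses the threshold given in @args.  A new, larger threshold array
 * is built, sorted, and published with rcu_assign_pointer(); the old
 * primary array is kept as a spare for the next update.
 */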
1611 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
1612 	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
1613 {
1614 	struct mem_cgroup_thresholds *thresholds;
1615 	struct mem_cgroup_threshold_ary *new;
1616 	unsigned long threshold;
1617 	unsigned long usage;
1618 	int i, size, ret;
1619 
1620 	ret = page_counter_memparse(args, "-1", &threshold);
1621 	if (ret)
1622 		return ret;
1623 
1624 	mutex_lock(&memcg->thresholds_lock);
1625 
1626 	if (type == _MEM) {
1627 		thresholds = &memcg->thresholds;
1628 		usage = mem_cgroup_usage(memcg, false);
1629 	} else if (type == _MEMSWAP) {
1630 		thresholds = &memcg->memsw_thresholds;
1631 		usage = mem_cgroup_usage(memcg, true);
1632 	} else
1633 		BUG();
1634 
1635 	/* Check if a threshold was crossed before adding a new one */
1636 	if (thresholds->primary)
1637 		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
1638 
1639 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
1640 
1641 	/* Allocate memory for new array of thresholds */
1642 	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
1643 	if (!new) {
1644 		ret = -ENOMEM;
1645 		goto unlock;
1646 	}
1647 	new->size = size;
1648 
1649 	/* Copy thresholds (if any) to new array */
1650 	if (thresholds->primary)
1651 		memcpy(new->entries, thresholds->primary->entries,
1652 		       flex_array_size(new, entries, size - 1));
1653 
1654 	/* Add new threshold */
1655 	new->entries[size - 1].eventfd = eventfd;
1656 	new->entries[size - 1].threshold = threshold;
1657 
1658 	/* Sort thresholds. Registering of new threshold isn't time-critical */
1659 	sort(new->entries, size, sizeof(*new->entries),
1660 			compare_thresholds, NULL);
1661 
1662 	/* Find current threshold */
1663 	new->current_threshold = -1;
1664 	for (i = 0; i < size; i++) {
1665 		if (new->entries[i].threshold <= usage) {
1666 			/*
1667 			 * new->current_threshold will not be used until
1668 			 * rcu_assign_pointer(), so it's safe to increment
1669 			 * it here.
1670 			 */
1671 			++new->current_threshold;
1672 		} else
1673 			break;
1674 	}
1675 
1676 	/* Free old spare buffer and save old primary buffer as spare */
1677 	kfree(thresholds->spare);
1678 	thresholds->spare = thresholds->primary;
1679 
1680 	rcu_assign_pointer(thresholds->primary, new);
1681 
1682 	/* To be sure that nobody uses thresholds */
1683 	synchronize_rcu();
1684 
1685 unlock:
1686 	mutex_unlock(&memcg->thresholds_lock);
1687 
1688 	return ret;
1689 }
1690 
1691 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
1692 	struct eventfd_ctx *eventfd, const char *args)
1693 {
1694 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
1695 }
1696 
1697 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
1698 	struct eventfd_ctx *eventfd, const char *args)
1699 {
1700 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
1701 }
1702 
1703 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1704 	struct eventfd_ctx *eventfd, enum res_type type)
1705 {
1706 	struct mem_cgroup_thresholds *thresholds;
1707 	struct mem_cgroup_threshold_ary *new;
1708 	unsigned long usage;
1709 	int i, j, size, entries;
1710 
1711 	mutex_lock(&memcg->thresholds_lock);
1712 
1713 	if (type == _MEM) {
1714 		thresholds = &memcg->thresholds;
1715 		usage = mem_cgroup_usage(memcg, false);
1716 	} else if (type == _MEMSWAP) {
1717 		thresholds = &memcg->memsw_thresholds;
1718 		usage = mem_cgroup_usage(memcg, true);
1719 	} else
1720 		BUG();
1721 
1722 	if (!thresholds->primary)
1723 		goto unlock;
1724 
1725 	/* Check if a threshold was crossed before removing */
1726 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
1727 
1728 	/* Calculate the new number of thresholds */
1729 	size = entries = 0;
1730 	for (i = 0; i < thresholds->primary->size; i++) {
1731 		if (thresholds->primary->entries[i].eventfd != eventfd)
1732 			size++;
1733 		else
1734 			entries++;
1735 	}
1736 
1737 	new = thresholds->spare;
1738 
1739 	/* If no items related to eventfd have been cleared, nothing to do */
1740 	if (!entries)
1741 		goto unlock;
1742 
1743 	/* Set thresholds array to NULL if we don't have thresholds */
1744 	if (!size) {
1745 		kfree(new);
1746 		new = NULL;
1747 		goto swap_buffers;
1748 	}
1749 
1750 	new->size = size;
1751 
1752 	/* Copy thresholds and find current threshold */
1753 	new->current_threshold = -1;
1754 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
1755 		if (thresholds->primary->entries[i].eventfd == eventfd)
1756 			continue;
1757 
1758 		new->entries[j] = thresholds->primary->entries[i];
1759 		if (new->entries[j].threshold <= usage) {
1760 			/*
1761 			 * new->current_threshold will not be used
1762 			 * until rcu_assign_pointer(), so it's safe to increment
1763 			 * it here.
1764 			 */
1765 			++new->current_threshold;
1766 		}
1767 		j++;
1768 	}
1769 
1770 swap_buffers:
1771 	/* Swap primary and spare array */
1772 	thresholds->spare = thresholds->primary;
1773 
1774 	rcu_assign_pointer(thresholds->primary, new);
1775 
1776 	/* To be sure that nobody uses thresholds */
1777 	synchronize_rcu();
1778 
1779 	/* If all events are unregistered, free the spare array */
1780 	if (!new) {
1781 		kfree(thresholds->spare);
1782 		thresholds->spare = NULL;
1783 	}
1784 unlock:
1785 	mutex_unlock(&memcg->thresholds_lock);
1786 }
1787 
1788 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1789 	struct eventfd_ctx *eventfd)
1790 {
1791 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
1792 }
1793 
1794 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1795 	struct eventfd_ctx *eventfd)
1796 {
1797 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
1798 }
1799 
1800 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
1801 	struct eventfd_ctx *eventfd, const char *args)
1802 {
1803 	struct mem_cgroup_eventfd_list *event;
1804 
1805 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
1806 	if (!event)
1807 		return -ENOMEM;
1808 
1809 	spin_lock(&memcg_oom_lock);
1810 
1811 	event->eventfd = eventfd;
1812 	list_add(&event->list, &memcg->oom_notify);
1813 
1814 	/* already in OOM ? */
1815 	if (memcg->under_oom)
1816 		eventfd_signal(eventfd);
1817 	spin_unlock(&memcg_oom_lock);
1818 
1819 	return 0;
1820 }
1821 
1822 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
1823 	struct eventfd_ctx *eventfd)
1824 {
1825 	struct mem_cgroup_eventfd_list *ev, *tmp;
1826 
1827 	spin_lock(&memcg_oom_lock);
1828 
1829 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
1830 		if (ev->eventfd == eventfd) {
1831 			list_del(&ev->list);
1832 			kfree(ev);
1833 		}
1834 	}
1835 
1836 	spin_unlock(&memcg_oom_lock);
1837 }
1838 
1839 /*
1840  * DO NOT USE IN NEW FILES.
1841  *
1842  * "cgroup.event_control" implementation.
1843  *
1844  * This is way over-engineered.  It tries to support fully configurable
1845  * events for each user.  Such a level of flexibility is completely
1846  * unnecessary, especially in light of the planned unified hierarchy.
1847  *
1848  * Please deprecate this and replace with something simpler if at all
1849  * possible.
1850  */
1851 
1852 /*
1853  * Unregister event and free resources.
1854  *
1855  * Gets called from workqueue.
1856  */
1857 static void memcg_event_remove(struct work_struct *work)
1858 {
1859 	struct mem_cgroup_event *event =
1860 		container_of(work, struct mem_cgroup_event, remove);
1861 	struct mem_cgroup *memcg = event->memcg;
1862 
1863 	remove_wait_queue(event->wqh, &event->wait);
1864 
1865 	event->unregister_event(memcg, event->eventfd);
1866 
1867 	/* Notify userspace the event is going away. */
1868 	eventfd_signal(event->eventfd);
1869 
1870 	eventfd_ctx_put(event->eventfd);
1871 	kfree(event);
1872 	css_put(&memcg->css);
1873 }
1874 
1875 /*
1876  * Gets called on EPOLLHUP on eventfd when user closes it.
1877  *
1878  * Called with wqh->lock held and interrupts disabled.
1879  */
1880 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
1881 			    int sync, void *key)
1882 {
1883 	struct mem_cgroup_event *event =
1884 		container_of(wait, struct mem_cgroup_event, wait);
1885 	struct mem_cgroup *memcg = event->memcg;
1886 	__poll_t flags = key_to_poll(key);
1887 
1888 	if (flags & EPOLLHUP) {
1889 		/*
1890 		 * If the event has been detached at cgroup removal, we
1891 		 * can simply return knowing the other side will cleanup
1892 		 * for us.
1893 		 *
1894 		 * We can't race against event freeing since the other
1895 		 * side will require wqh->lock via remove_wait_queue(),
1896 		 * which we hold.
1897 		 */
1898 		spin_lock(&memcg->event_list_lock);
1899 		if (!list_empty(&event->list)) {
1900 			list_del_init(&event->list);
1901 			/*
1902 			 * We are in atomic context, but memcg_event_remove()
1903 			 * may sleep, so we have to call it in workqueue.
1904 			 */
1905 			schedule_work(&event->remove);
1906 		}
1907 		spin_unlock(&memcg->event_list_lock);
1908 	}
1909 
1910 	return 0;
1911 }
1912 
1913 static void memcg_event_ptable_queue_proc(struct file *file,
1914 		wait_queue_head_t *wqh, poll_table *pt)
1915 {
1916 	struct mem_cgroup_event *event =
1917 		container_of(pt, struct mem_cgroup_event, pt);
1918 
1919 	event->wqh = wqh;
1920 	add_wait_queue(wqh, &event->wait);
1921 }
1922 
1923 /*
1924  * DO NOT USE IN NEW FILES.
1925  *
1926  * Parse input and register new cgroup event handler.
1927  *
1928  * Input must be in format '<event_fd> <control_fd> <args>'.
1929  * Interpretation of args is defined by control file implementation.
1930  */
1931 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
1932 					 char *buf, size_t nbytes, loff_t off)
1933 {
1934 	struct cgroup_subsys_state *css = of_css(of);
1935 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1936 	struct mem_cgroup_event *event;
1937 	struct cgroup_subsys_state *cfile_css;
1938 	unsigned int efd, cfd;
1939 	struct fd efile;
1940 	struct fd cfile;
1941 	struct dentry *cdentry;
1942 	const char *name;
1943 	char *endp;
1944 	int ret;
1945 
1946 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
1947 		return -EOPNOTSUPP;
1948 
1949 	buf = strstrip(buf);
1950 
1951 	efd = simple_strtoul(buf, &endp, 10);
1952 	if (*endp != ' ')
1953 		return -EINVAL;
1954 	buf = endp + 1;
1955 
1956 	cfd = simple_strtoul(buf, &endp, 10);
1957 	if (*endp == '\0')
1958 		buf = endp;
1959 	else if (*endp == ' ')
1960 		buf = endp + 1;
1961 	else
1962 		return -EINVAL;
1963 
1964 	event = kzalloc(sizeof(*event), GFP_KERNEL);
1965 	if (!event)
1966 		return -ENOMEM;
1967 
1968 	event->memcg = memcg;
1969 	INIT_LIST_HEAD(&event->list);
1970 	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
1971 	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
1972 	INIT_WORK(&event->remove, memcg_event_remove);
1973 
1974 	efile = fdget(efd);
1975 	if (!fd_file(efile)) {
1976 		ret = -EBADF;
1977 		goto out_kfree;
1978 	}
1979 
1980 	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
1981 	if (IS_ERR(event->eventfd)) {
1982 		ret = PTR_ERR(event->eventfd);
1983 		goto out_put_efile;
1984 	}
1985 
1986 	cfile = fdget(cfd);
1987 	if (!fd_file(cfile)) {
1988 		ret = -EBADF;
1989 		goto out_put_eventfd;
1990 	}
1991 
1992 	/* the process needs read permission on the control file */
1993 	/* AV: shouldn't we check that it's been opened for read instead? */
1994 	ret = file_permission(fd_file(cfile), MAY_READ);
1995 	if (ret < 0)
1996 		goto out_put_cfile;
1997 
1998 	/*
1999 	 * The control file must be a regular cgroup1 file. As a regular cgroup
2000 	 * file can't be renamed, it's safe to access its name afterwards.
2001 	 */
2002 	cdentry = fd_file(cfile)->f_path.dentry;
2003 	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
2004 		ret = -EINVAL;
2005 		goto out_put_cfile;
2006 	}
2007 
2008 	/*
2009 	 * Determine the event callbacks and set them in @event.  This used
2010 	 * to be done via struct cftype but cgroup core no longer knows
2011 	 * about these events.  The following is crude but the whole thing
2012 	 * is for compatibility anyway.
2013 	 *
2014 	 * DO NOT ADD NEW FILES.
2015 	 */
2016 	name = cdentry->d_name.name;
2017 
2018 	if (!strcmp(name, "memory.usage_in_bytes")) {
2019 		event->register_event = mem_cgroup_usage_register_event;
2020 		event->unregister_event = mem_cgroup_usage_unregister_event;
2021 	} else if (!strcmp(name, "memory.oom_control")) {
2022 		pr_warn_once("oom_control is deprecated and will be removed. "
2023 			     "Please report your usecase to linux-mm@kvack.org "
2024 			     "if you depend on this functionality.\n");
2025 		event->register_event = mem_cgroup_oom_register_event;
2026 		event->unregister_event = mem_cgroup_oom_unregister_event;
2027 	} else if (!strcmp(name, "memory.pressure_level")) {
2028 		pr_warn_once("pressure_level is deprecated and will be removed. "
2029 			     "Please report your usecase to linux-mm@kvack.org "
2030 			     "if you depend on this functionality.\n");
2031 		event->register_event = vmpressure_register_event;
2032 		event->unregister_event = vmpressure_unregister_event;
2033 	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
2034 		event->register_event = memsw_cgroup_usage_register_event;
2035 		event->unregister_event = memsw_cgroup_usage_unregister_event;
2036 	} else {
2037 		ret = -EINVAL;
2038 		goto out_put_cfile;
2039 	}
2040 
2041 	/*
2042 	 * Verify that @cfile belongs to @css.  Also, remaining events are
2043 	 * automatically removed on cgroup destruction but the removal is
2044 	 * asynchronous, so take an extra ref on @css.
2045 	 */
2046 	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
2047 					       &memory_cgrp_subsys);
2048 	ret = -EINVAL;
2049 	if (IS_ERR(cfile_css))
2050 		goto out_put_cfile;
2051 	if (cfile_css != css) {
2052 		css_put(cfile_css);
2053 		goto out_put_cfile;
2054 	}
2055 
2056 	ret = event->register_event(memcg, event->eventfd, buf);
2057 	if (ret)
2058 		goto out_put_css;
2059 
2060 	vfs_poll(fd_file(efile), &event->pt);
2061 
2062 	spin_lock_irq(&memcg->event_list_lock);
2063 	list_add(&event->list, &memcg->event_list);
2064 	spin_unlock_irq(&memcg->event_list_lock);
2065 
2066 	fdput(cfile);
2067 	fdput(efile);
2068 
2069 	return nbytes;
2070 
2071 out_put_css:
2072 	css_put(css);
2073 out_put_cfile:
2074 	fdput(cfile);
2075 out_put_eventfd:
2076 	eventfd_ctx_put(event->eventfd);
2077 out_put_efile:
2078 	fdput(efile);
2079 out_kfree:
2080 	kfree(event);
2081 
2082 	return ret;
2083 }
2084 
2085 void memcg1_memcg_init(struct mem_cgroup *memcg)
2086 {
2087 	INIT_LIST_HEAD(&memcg->oom_notify);
2088 	mutex_init(&memcg->thresholds_lock);
2089 	spin_lock_init(&memcg->move_lock);
2090 	INIT_LIST_HEAD(&memcg->event_list);
2091 	spin_lock_init(&memcg->event_list_lock);
2092 }
2093 
2094 void memcg1_css_offline(struct mem_cgroup *memcg)
2095 {
2096 	struct mem_cgroup_event *event, *tmp;
2097 
2098 	/*
2099 	 * Unregister events and notify userspace.
2100 	 * Notify userspace about cgroup removal only after rmdir of the cgroup
2101 	 * directory to avoid a race between userspace and kernel space.
2102 	 */
2103 	spin_lock_irq(&memcg->event_list_lock);
2104 	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
2105 		list_del_init(&event->list);
2106 		schedule_work(&event->remove);
2107 	}
2108 	spin_unlock_irq(&memcg->event_list_lock);
2109 }
2110 
2111 /*
2112  * Check whether the OOM killer is already running under our hierarchy.
2113  * If someone else already holds the OOM lock, return false.
2114  */
2115 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2116 {
2117 	struct mem_cgroup *iter, *failed = NULL;
2118 
2119 	spin_lock(&memcg_oom_lock);
2120 
2121 	for_each_mem_cgroup_tree(iter, memcg) {
2122 		if (iter->oom_lock) {
2123 			/*
2124 			 * This subtree of our hierarchy is already locked,
2125 			 * so we cannot take the lock for the whole tree.
2126 			 */
2127 			failed = iter;
2128 			mem_cgroup_iter_break(memcg, iter);
2129 			break;
2130 		} else
2131 			iter->oom_lock = true;
2132 	}
2133 
2134 	if (failed) {
2135 		/*
2136 		 * OK, we failed to lock the whole subtree, so we have to
2137 		 * clean up what we already set up, up to the failing memcg.
2138 		 */
2139 		for_each_mem_cgroup_tree(iter, memcg) {
2140 			if (iter == failed) {
2141 				mem_cgroup_iter_break(memcg, iter);
2142 				break;
2143 			}
2144 			iter->oom_lock = false;
2145 		}
2146 	} else
2147 		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2148 
2149 	spin_unlock(&memcg_oom_lock);
2150 
2151 	return !failed;
2152 }
2153 
2154 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2155 {
2156 	struct mem_cgroup *iter;
2157 
2158 	spin_lock(&memcg_oom_lock);
2159 	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
2160 	for_each_mem_cgroup_tree(iter, memcg)
2161 		iter->oom_lock = false;
2162 	spin_unlock(&memcg_oom_lock);
2163 }
2164 
2165 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2166 {
2167 	struct mem_cgroup *iter;
2168 
2169 	spin_lock(&memcg_oom_lock);
2170 	for_each_mem_cgroup_tree(iter, memcg)
2171 		iter->under_oom++;
2172 	spin_unlock(&memcg_oom_lock);
2173 }
2174 
2175 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2176 {
2177 	struct mem_cgroup *iter;
2178 
2179 	/*
2180 	 * Be careful about under_oom underflows because a child memcg
2181 	 * could have been added after mem_cgroup_mark_under_oom.
2182 	 */
2183 	spin_lock(&memcg_oom_lock);
2184 	for_each_mem_cgroup_tree(iter, memcg)
2185 		if (iter->under_oom > 0)
2186 			iter->under_oom--;
2187 	spin_unlock(&memcg_oom_lock);
2188 }
2189 
2190 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2191 
2192 struct oom_wait_info {
2193 	struct mem_cgroup *memcg;
2194 	wait_queue_entry_t	wait;
2195 };
2196 
2197 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
2198 	unsigned mode, int sync, void *arg)
2199 {
2200 	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2201 	struct mem_cgroup *oom_wait_memcg;
2202 	struct oom_wait_info *oom_wait_info;
2203 
2204 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2205 	oom_wait_memcg = oom_wait_info->memcg;
2206 
2207 	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2208 	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2209 		return 0;
2210 	return autoremove_wake_function(wait, mode, sync, arg);
2211 }
2212 
2213 void memcg1_oom_recover(struct mem_cgroup *memcg)
2214 {
2215 	/*
2216 	 * For the following lockless ->under_oom test, the only required
2217 	 * guarantee is that it must see the state asserted by an OOM when
2218 	 * this function is called as a result of userland actions
2219 	 * triggered by the notification of the OOM.  This is trivially
2220 	 * achieved by invoking mem_cgroup_mark_under_oom() before
2221 	 * triggering notification.
2222 	 */
2223 	if (memcg && memcg->under_oom)
2224 		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2225 }
2226 
2227 /**
2228  * mem_cgroup_oom_synchronize - complete memcg OOM handling
2229  * @handle: actually kill/wait or just clean up the OOM state
2230  *
2231  * This has to be called at the end of a page fault if the memcg OOM
2232  * handler was enabled.
2233  *
2234  * Memcg supports userspace OOM handling where failed allocations must
2235  * sleep on a waitqueue until the userspace task resolves the
2236  * situation.  Sleeping directly in the charge context with all kinds
2237  * of locks held is not a good idea, instead we remember an OOM state
2238  * in the task and mem_cgroup_oom_synchronize() has to be called at
2239  * the end of the page fault to complete the OOM handling.
2240  *
2241  * Returns %true if an ongoing memcg OOM situation was detected and
2242  * completed, %false otherwise.
2243  */
2244 bool mem_cgroup_oom_synchronize(bool handle)
2245 {
2246 	struct mem_cgroup *memcg = current->memcg_in_oom;
2247 	struct oom_wait_info owait;
2248 	bool locked;
2249 
2250 	/* OOM is global, do not handle */
2251 	if (!memcg)
2252 		return false;
2253 
2254 	if (!handle)
2255 		goto cleanup;
2256 
2257 	owait.memcg = memcg;
2258 	owait.wait.flags = 0;
2259 	owait.wait.func = memcg_oom_wake_function;
2260 	owait.wait.private = current;
2261 	INIT_LIST_HEAD(&owait.wait.entry);
2262 
2263 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2264 	mem_cgroup_mark_under_oom(memcg);
2265 
2266 	locked = mem_cgroup_oom_trylock(memcg);
2267 
2268 	if (locked)
2269 		mem_cgroup_oom_notify(memcg);
2270 
2271 	schedule();
2272 	mem_cgroup_unmark_under_oom(memcg);
2273 	finish_wait(&memcg_oom_waitq, &owait.wait);
2274 
2275 	if (locked)
2276 		mem_cgroup_oom_unlock(memcg);
2277 cleanup:
2278 	current->memcg_in_oom = NULL;
2279 	css_put(&memcg->css);
2280 	return true;
2281 }
2282 
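/*
 * Illustrative userspace counterpart (not part of the kernel): a cgroup1
 * OOM handler typically disables the in-kernel OOM killer, registers an
 * eventfd against memory.oom_control through cgroup.event_control, and on
 * each notification frees charged memory or raises the limit (see
 * mem_cgroup_resize_max() below), which wakes the waiters via
 * memcg1_oom_recover().  Shell lines and fd names are assumptions; a
 * minimal sketch of the handler loop:
 *
 *	// echo 1 > memory.oom_control               (disable the OOM killer)
 *	// echo "<efd> <oom_control_fd>" > cgroup.event_control
 *	uint64_t cnt;
 *	for (;;) {
 *		read(efd, &cnt, sizeof(cnt));	// wait for an OOM notification
 *		// free memory in the group or enlarge memory.limit_in_bytes;
 *		// the blocked charge can then succeed and the waiters wake up
 *	}
 */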
2283 
2284 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
2285 {
2286 	/*
2287 	 * We are in the middle of the charge context here, so we
2288 	 * don't want to block when potentially sitting on a callstack
2289 	 * that holds all kinds of filesystem and mm locks.
2290 	 *
2291 	 * cgroup1 allows disabling the OOM killer and waiting for outside
2292 	 * handling until the charge can succeed; remember the context and put
2293 	 * the task to sleep at the end of the page fault when all locks are
2294 	 * released.
2295 	 *
2296 	 * On the other hand, the in-kernel OOM killer allows for asynchronous
2297 	 * victim memory reclaim (oom_reaper), which means that we are not
2298 	 * solely relying on the OOM victim to make forward progress, so we
2299 	 * can invoke the OOM killer here.
2300 	 *
2301 	 * Please note that mem_cgroup_out_of_memory might fail to find a
2302 	 * victim and then we have to bail out from the charge path.
2303 	 */
2304 	if (READ_ONCE(memcg->oom_kill_disable)) {
2305 		if (current->in_user_fault) {
2306 			css_get(&memcg->css);
2307 			current->memcg_in_oom = memcg;
2308 		}
2309 		return false;
2310 	}
2311 
2312 	mem_cgroup_mark_under_oom(memcg);
2313 
2314 	*locked = mem_cgroup_oom_trylock(memcg);
2315 
2316 	if (*locked)
2317 		mem_cgroup_oom_notify(memcg);
2318 
2319 	mem_cgroup_unmark_under_oom(memcg);
2320 
2321 	return true;
2322 }
2323 
2324 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
2325 {
2326 	if (locked)
2327 		mem_cgroup_oom_unlock(memcg);
2328 }
2329 
2330 static DEFINE_MUTEX(memcg_max_mutex);
2331 
2332 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2333 				 unsigned long max, bool memsw)
2334 {
2335 	bool enlarge = false;
2336 	bool drained = false;
2337 	int ret;
2338 	bool limits_invariant;
2339 	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2340 
2341 	do {
2342 		if (signal_pending(current)) {
2343 			ret = -EINTR;
2344 			break;
2345 		}
2346 
2347 		mutex_lock(&memcg_max_mutex);
2348 		/*
2349 		 * Make sure that the new limit (memsw or memory limit) doesn't
2350 		 * break the basic invariant memory.max <= memsw.max.
2351 		 */
2352 		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
2353 					   max <= memcg->memsw.max;
2354 		if (!limits_invariant) {
2355 			mutex_unlock(&memcg_max_mutex);
2356 			ret = -EINVAL;
2357 			break;
2358 		}
2359 		if (max > counter->max)
2360 			enlarge = true;
2361 		ret = page_counter_set_max(counter, max);
2362 		mutex_unlock(&memcg_max_mutex);
2363 
2364 		if (!ret)
2365 			break;
2366 
2367 		if (!drained) {
2368 			drain_all_stock(memcg);
2369 			drained = true;
2370 			continue;
2371 		}
2372 
2373 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2374 				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
2375 			ret = -EBUSY;
2376 			break;
2377 		}
2378 	} while (true);
2379 
2380 	if (!ret && enlarge)
2381 		memcg1_oom_recover(memcg);
2382 
2383 	return ret;
2384 }
2385 
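/*
 * Worked example for the invariant enforced above, with illustrative
 * numbers: if memory.limit_in_bytes is 512M and memsw.limit_in_bytes is
 * 1G, lowering memsw to 256M fails with -EINVAL because memsw accounts
 * memory+swap and must stay >= memory.max, while lowering the memory
 * limit to 256M is accepted.  If current usage is above the new limit,
 * page_counter_set_max() keeps failing and the loop drains the per-cpu
 * stock and reclaims until the limit can be set, or gives up with -EBUSY.
 */
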
2386 /*
2387  * Reclaims as many pages from the given memcg as possible.
2388  *
2389  * Caller is responsible for holding css reference for memcg.
2390  */
2391 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2392 {
2393 	int nr_retries = MAX_RECLAIM_RETRIES;
2394 
2395 	/* We call try-to-free pages to make this cgroup empty */
2396 	lru_add_drain_all();
2397 
2398 	drain_all_stock(memcg);
2399 
2400 	/* try to free all pages in this cgroup */
2401 	while (nr_retries && page_counter_read(&memcg->memory)) {
2402 		if (signal_pending(current))
2403 			return -EINTR;
2404 
2405 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2406 						  MEMCG_RECLAIM_MAY_SWAP, NULL))
2407 			nr_retries--;
2408 	}
2409 
2410 	return 0;
2411 }
2412 
2413 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2414 					    char *buf, size_t nbytes,
2415 					    loff_t off)
2416 {
2417 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2418 
2419 	if (mem_cgroup_is_root(memcg))
2420 		return -EINVAL;
2421 	return mem_cgroup_force_empty(memcg) ?: nbytes;
2422 }
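
/*
 * Usage note (illustrative): "echo 0 > memory.force_empty" (the written
 * value is ignored) triggers the reclaim loop above for a non-root group,
 * typically before rmdir.  The write returns -EINVAL for the root cgroup
 * and -EINTR if a pending signal interrupts the reclaim.
 */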
2423 
2424 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2425 				     struct cftype *cft)
2426 {
2427 	return 1;
2428 }
2429 
2430 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2431 				      struct cftype *cft, u64 val)
2432 {
2433 	if (val == 1)
2434 		return 0;
2435 
2436 	pr_warn_once("Non-hierarchical mode is deprecated. "
2437 		     "Please report your usecase to linux-mm@kvack.org if you "
2438 		     "depend on this functionality.\n");
2439 
2440 	return -EINVAL;
2441 }
2442 
2443 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2444 			       struct cftype *cft)
2445 {
2446 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2447 	struct page_counter *counter;
2448 
2449 	switch (MEMFILE_TYPE(cft->private)) {
2450 	case _MEM:
2451 		counter = &memcg->memory;
2452 		break;
2453 	case _MEMSWAP:
2454 		counter = &memcg->memsw;
2455 		break;
2456 	case _KMEM:
2457 		counter = &memcg->kmem;
2458 		break;
2459 	case _TCP:
2460 		counter = &memcg->tcpmem;
2461 		break;
2462 	default:
2463 		BUG();
2464 	}
2465 
2466 	switch (MEMFILE_ATTR(cft->private)) {
2467 	case RES_USAGE:
2468 		if (counter == &memcg->memory)
2469 			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2470 		if (counter == &memcg->memsw)
2471 			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2472 		return (u64)page_counter_read(counter) * PAGE_SIZE;
2473 	case RES_LIMIT:
2474 		return (u64)counter->max * PAGE_SIZE;
2475 	case RES_MAX_USAGE:
2476 		return (u64)counter->watermark * PAGE_SIZE;
2477 	case RES_FAILCNT:
2478 		return counter->failcnt;
2479 	case RES_SOFT_LIMIT:
2480 		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
2481 	default:
2482 		BUG();
2483 	}
2484 }
2485 
2486 /*
2487  * This function doesn't do anything useful. Its only job is to provide a read
2488  * handler for a file so that cgroup_file_mode() will add read permissions.
2489  */
2490 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
2491 				     __always_unused void *v)
2492 {
2493 	return -EINVAL;
2494 }
2495 
2496 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
2497 {
2498 	int ret;
2499 
2500 	mutex_lock(&memcg_max_mutex);
2501 
2502 	ret = page_counter_set_max(&memcg->tcpmem, max);
2503 	if (ret)
2504 		goto out;
2505 
2506 	if (!memcg->tcpmem_active) {
2507 		/*
2508 		 * The active flag needs to be written after the static_key
2509 		 * update. This is what guarantees that the socket activation
2510 		 * function is the last one to run. See mem_cgroup_sk_alloc()
2511 		 * for details, and note that we don't mark any socket as
2512 		 * belonging to this memcg until that flag is up.
2513 		 *
2514 		 * We need to do this, because static_keys will span multiple
2515 		 * sites, but we can't control their order. If we mark a socket
2516 		 * as accounted, but the accounting functions are not patched in
2517 		 * yet, we'll lose accounting.
2518 		 *
2519 		 * We never race with the readers in mem_cgroup_sk_alloc(),
2520 		 * because when this value changes, the code to process it is not
2521 		 * patched in yet.
2522 		 */
2523 		static_branch_inc(&memcg_sockets_enabled_key);
2524 		memcg->tcpmem_active = true;
2525 	}
2526 out:
2527 	mutex_unlock(&memcg_max_mutex);
2528 	return ret;
2529 }
2530 
2531 /*
2532  * Write handler for the limit files: RES_LIMIT (*.limit_in_bytes) and
2533  * RES_SOFT_LIMIT (memory.soft_limit_in_bytes).
2534  */
2535 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2536 				char *buf, size_t nbytes, loff_t off)
2537 {
2538 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2539 	unsigned long nr_pages;
2540 	int ret;
2541 
2542 	buf = strstrip(buf);
2543 	ret = page_counter_memparse(buf, "-1", &nr_pages);
2544 	if (ret)
2545 		return ret;
2546 
2547 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
2548 	case RES_LIMIT:
2549 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2550 			ret = -EINVAL;
2551 			break;
2552 		}
2553 		switch (MEMFILE_TYPE(of_cft(of)->private)) {
2554 		case _MEM:
2555 			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
2556 			break;
2557 		case _MEMSWAP:
2558 			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
2559 			break;
2560 		case _KMEM:
2561 			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
2562 				     "Writing any value to this file has no effect. "
2563 				     "Please report your usecase to linux-mm@kvack.org if you "
2564 				     "depend on this functionality.\n");
2565 			ret = 0;
2566 			break;
2567 		case _TCP:
2568 			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
2569 				     "Please report your usecase to linux-mm@kvack.org if you "
2570 				     "depend on this functionality.\n");
2571 			ret = memcg_update_tcp_max(memcg, nr_pages);
2572 			break;
2573 		}
2574 		break;
2575 	case RES_SOFT_LIMIT:
2576 		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
2577 			ret = -EOPNOTSUPP;
2578 		} else {
2579 			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
2580 				     "Please report your usecase to linux-mm@kvack.org if you "
2581 				     "depend on this functionality.\n");
2582 			WRITE_ONCE(memcg->soft_limit, nr_pages);
2583 			ret = 0;
2584 		}
2585 		break;
2586 	}
2587 	return ret ?: nbytes;
2588 }
2589 
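/*
 * Illustrative writes handled above (assuming 4 KiB pages; the values are
 * examples): "echo 256M > memory.limit_in_bytes" parses to nr_pages =
 * 65536 and is applied through mem_cgroup_resize_max(), while "echo -1"
 * matches the "-1" default string passed to page_counter_memparse() and
 * removes the limit (PAGE_COUNTER_MAX).  kmem.limit_in_bytes writes are
 * accepted but ignored, as warned above.
 */
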
2590 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
2591 				size_t nbytes, loff_t off)
2592 {
2593 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2594 	struct page_counter *counter;
2595 
2596 	switch (MEMFILE_TYPE(of_cft(of)->private)) {
2597 	case _MEM:
2598 		counter = &memcg->memory;
2599 		break;
2600 	case _MEMSWAP:
2601 		counter = &memcg->memsw;
2602 		break;
2603 	case _KMEM:
2604 		counter = &memcg->kmem;
2605 		break;
2606 	case _TCP:
2607 		counter = &memcg->tcpmem;
2608 		break;
2609 	default:
2610 		BUG();
2611 	}
2612 
2613 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
2614 	case RES_MAX_USAGE:
2615 		page_counter_reset_watermark(counter);
2616 		break;
2617 	case RES_FAILCNT:
2618 		counter->failcnt = 0;
2619 		break;
2620 	default:
2621 		BUG();
2622 	}
2623 
2624 	return nbytes;
2625 }
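
/*
 * Usage note (illustrative): writing any value, conventionally "echo 0",
 * to *.max_usage_in_bytes resets the high watermark to the current usage,
 * and writing to *.failcnt clears the failure counter; the written value
 * itself is ignored by the handler above.
 */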
2626 
2627 #ifdef CONFIG_NUMA
2628 
2629 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
2630 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
2631 #define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
2632 
2633 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
2634 				int nid, unsigned int lru_mask, bool tree)
2635 {
2636 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
2637 	unsigned long nr = 0;
2638 	enum lru_list lru;
2639 
2640 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
2641 
2642 	for_each_lru(lru) {
2643 		if (!(BIT(lru) & lru_mask))
2644 			continue;
2645 		if (tree)
2646 			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
2647 		else
2648 			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
2649 	}
2650 	return nr;
2651 }
2652 
2653 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
2654 					     unsigned int lru_mask,
2655 					     bool tree)
2656 {
2657 	unsigned long nr = 0;
2658 	enum lru_list lru;
2659 
2660 	for_each_lru(lru) {
2661 		if (!(BIT(lru) & lru_mask))
2662 			continue;
2663 		if (tree)
2664 			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
2665 		else
2666 			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
2667 	}
2668 	return nr;
2669 }
2670 
2671 static int memcg_numa_stat_show(struct seq_file *m, void *v)
2672 {
2673 	struct numa_stat {
2674 		const char *name;
2675 		unsigned int lru_mask;
2676 	};
2677 
2678 	static const struct numa_stat stats[] = {
2679 		{ "total", LRU_ALL },
2680 		{ "file", LRU_ALL_FILE },
2681 		{ "anon", LRU_ALL_ANON },
2682 		{ "unevictable", BIT(LRU_UNEVICTABLE) },
2683 	};
2684 	const struct numa_stat *stat;
2685 	int nid;
2686 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
2687 
2688 	mem_cgroup_flush_stats(memcg);
2689 
2690 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2691 		seq_printf(m, "%s=%lu", stat->name,
2692 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2693 						   false));
2694 		for_each_node_state(nid, N_MEMORY)
2695 			seq_printf(m, " N%d=%lu", nid,
2696 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
2697 							stat->lru_mask, false));
2698 		seq_putc(m, '\n');
2699 	}
2700 
2701 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2702 
2703 		seq_printf(m, "hierarchical_%s=%lu", stat->name,
2704 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2705 						   true));
2706 		for_each_node_state(nid, N_MEMORY)
2707 			seq_printf(m, " N%d=%lu", nid,
2708 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
2709 							stat->lru_mask, true));
2710 		seq_putc(m, '\n');
2711 	}
2712 
2713 	return 0;
2714 }
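
/*
 * Example memory.numa_stat output produced above (page counts; the numbers
 * are illustrative for a two-node machine, and the hierarchical_* lines
 * use the same layout with subtree-wide counts):
 *
 *	total=1234 N0=1000 N1=234
 *	file=800 N0=700 N1=100
 *	anon=430 N0=298 N1=132
 *	unevictable=4 N0=2 N1=2
 *	hierarchical_total=2468 N0=2000 N1=468
 */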
2715 #endif /* CONFIG_NUMA */
2716 
2717 static const unsigned int memcg1_stats[] = {
2718 	NR_FILE_PAGES,
2719 	NR_ANON_MAPPED,
2720 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2721 	NR_ANON_THPS,
2722 #endif
2723 	NR_SHMEM,
2724 	NR_FILE_MAPPED,
2725 	NR_FILE_DIRTY,
2726 	NR_WRITEBACK,
2727 	WORKINGSET_REFAULT_ANON,
2728 	WORKINGSET_REFAULT_FILE,
2729 #ifdef CONFIG_SWAP
2730 	MEMCG_SWAP,
2731 	NR_SWAPCACHE,
2732 #endif
2733 };
2734 
2735 static const char *const memcg1_stat_names[] = {
2736 	"cache",
2737 	"rss",
2738 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2739 	"rss_huge",
2740 #endif
2741 	"shmem",
2742 	"mapped_file",
2743 	"dirty",
2744 	"writeback",
2745 	"workingset_refault_anon",
2746 	"workingset_refault_file",
2747 #ifdef CONFIG_SWAP
2748 	"swap",
2749 	"swapcached",
2750 #endif
2751 };
2752 
2753 /* Universal VM events that cgroup1 shows, in the original sort order */
2754 static const unsigned int memcg1_events[] = {
2755 	PGPGIN,
2756 	PGPGOUT,
2757 	PGFAULT,
2758 	PGMAJFAULT,
2759 };
2760 
2761 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
2762 {
2763 	unsigned long memory, memsw;
2764 	struct mem_cgroup *mi;
2765 	unsigned int i;
2766 
2767 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
2768 
2769 	mem_cgroup_flush_stats(memcg);
2770 
2771 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2772 		unsigned long nr;
2773 
2774 		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
2775 		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
2776 	}
2777 
2778 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2779 		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
2780 			       memcg_events_local(memcg, memcg1_events[i]));
2781 
2782 	for (i = 0; i < NR_LRU_LISTS; i++)
2783 		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
2784 			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
2785 			       PAGE_SIZE);
2786 
2787 	/* Hierarchical information */
2788 	memory = memsw = PAGE_COUNTER_MAX;
2789 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
2790 		memory = min(memory, READ_ONCE(mi->memory.max));
2791 		memsw = min(memsw, READ_ONCE(mi->memsw.max));
2792 	}
2793 	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
2794 		       (u64)memory * PAGE_SIZE);
2795 	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
2796 		       (u64)memsw * PAGE_SIZE);
2797 
2798 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2799 		unsigned long nr;
2800 
2801 		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
2802 		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
2803 			       (u64)nr);
2804 	}
2805 
2806 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2807 		seq_buf_printf(s, "total_%s %llu\n",
2808 			       vm_event_name(memcg1_events[i]),
2809 			       (u64)memcg_events(memcg, memcg1_events[i]));
2810 
2811 	for (i = 0; i < NR_LRU_LISTS; i++)
2812 		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
2813 			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
2814 			       PAGE_SIZE);
2815 
2816 #ifdef CONFIG_DEBUG_VM
2817 	{
2818 		pg_data_t *pgdat;
2819 		struct mem_cgroup_per_node *mz;
2820 		unsigned long anon_cost = 0;
2821 		unsigned long file_cost = 0;
2822 
2823 		for_each_online_pgdat(pgdat) {
2824 			mz = memcg->nodeinfo[pgdat->node_id];
2825 
2826 			anon_cost += mz->lruvec.anon_cost;
2827 			file_cost += mz->lruvec.file_cost;
2828 		}
2829 		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
2830 		seq_buf_printf(s, "file_cost %lu\n", file_cost);
2831 	}
2832 #endif
2833 }
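
/*
 * Abridged example of the memory.stat output assembled above (illustrative
 * numbers; byte-sized statistics and limits, event counters as plain
 * counts, followed by the subtree-wide total_* lines):
 *
 *	cache 409600
 *	rss 1048576
 *	pgpgin 356
 *	pgfault 1024
 *	inactive_file 409600
 *	hierarchical_memory_limit 536870912
 *	total_cache 409600
 *	total_rss 1048576
 */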
2834 
2835 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
2836 				      struct cftype *cft)
2837 {
2838 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2839 
2840 	return mem_cgroup_swappiness(memcg);
2841 }
2842 
2843 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
2844 				       struct cftype *cft, u64 val)
2845 {
2846 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2847 
2848 	if (val > MAX_SWAPPINESS)
2849 		return -EINVAL;
2850 
2851 	if (!mem_cgroup_is_root(memcg))
2852 		WRITE_ONCE(memcg->swappiness, val);
2853 	else
2854 		WRITE_ONCE(vm_swappiness, val);
2855 
2856 	return 0;
2857 }
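
/*
 * Usage note (illustrative): "echo 10 > memory.swappiness" sets the
 * per-group swappiness used by reclaim for a non-root memcg, while a
 * write to the root group's file updates the global vm_swappiness;
 * values above MAX_SWAPPINESS are rejected with -EINVAL.
 */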
2858 
2859 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
2860 {
2861 	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
2862 
2863 	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
2864 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
2865 	seq_printf(sf, "oom_kill %lu\n",
2866 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
2867 	return 0;
2868 }
2869 
2870 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
2871 	struct cftype *cft, u64 val)
2872 {
2873 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2874 
2875 	pr_warn_once("oom_control is deprecated and will be removed. "
2876 		     "Please report your usecase to linux-mm@kvack.org if you "
2877 		     "depend on this functionality.\n");
2878 
2879 	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
2880 	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
2881 		return -EINVAL;
2882 
2883 	WRITE_ONCE(memcg->oom_kill_disable, val);
2884 	if (!val)
2885 		memcg1_oom_recover(memcg);
2886 
2887 	return 0;
2888 }
2889 
2890 #ifdef CONFIG_SLUB_DEBUG
2891 static int mem_cgroup_slab_show(struct seq_file *m, void *p)
2892 {
2893 	/*
2894 	 * Deprecated.
2895 	 * Please take a look at tools/cgroup/memcg_slabinfo.py instead.
2896 	 */
2897 	return 0;
2898 }
2899 #endif
2900 
2901 struct cftype mem_cgroup_legacy_files[] = {
2902 	{
2903 		.name = "usage_in_bytes",
2904 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2905 		.read_u64 = mem_cgroup_read_u64,
2906 	},
2907 	{
2908 		.name = "max_usage_in_bytes",
2909 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2910 		.write = mem_cgroup_reset,
2911 		.read_u64 = mem_cgroup_read_u64,
2912 	},
2913 	{
2914 		.name = "limit_in_bytes",
2915 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2916 		.write = mem_cgroup_write,
2917 		.read_u64 = mem_cgroup_read_u64,
2918 	},
2919 	{
2920 		.name = "soft_limit_in_bytes",
2921 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2922 		.write = mem_cgroup_write,
2923 		.read_u64 = mem_cgroup_read_u64,
2924 	},
2925 	{
2926 		.name = "failcnt",
2927 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2928 		.write = mem_cgroup_reset,
2929 		.read_u64 = mem_cgroup_read_u64,
2930 	},
2931 	{
2932 		.name = "stat",
2933 		.seq_show = memory_stat_show,
2934 	},
2935 	{
2936 		.name = "force_empty",
2937 		.write = mem_cgroup_force_empty_write,
2938 	},
2939 	{
2940 		.name = "use_hierarchy",
2941 		.write_u64 = mem_cgroup_hierarchy_write,
2942 		.read_u64 = mem_cgroup_hierarchy_read,
2943 	},
2944 	{
2945 		.name = "cgroup.event_control",		/* XXX: for compat */
2946 		.write = memcg_write_event_control,
2947 		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
2948 	},
2949 	{
2950 		.name = "swappiness",
2951 		.read_u64 = mem_cgroup_swappiness_read,
2952 		.write_u64 = mem_cgroup_swappiness_write,
2953 	},
2954 	{
2955 		.name = "move_charge_at_immigrate",
2956 		.read_u64 = mem_cgroup_move_charge_read,
2957 		.write_u64 = mem_cgroup_move_charge_write,
2958 	},
2959 	{
2960 		.name = "oom_control",
2961 		.seq_show = mem_cgroup_oom_control_read,
2962 		.write_u64 = mem_cgroup_oom_control_write,
2963 	},
2964 	{
2965 		.name = "pressure_level",
2966 		.seq_show = mem_cgroup_dummy_seq_show,
2967 	},
2968 #ifdef CONFIG_NUMA
2969 	{
2970 		.name = "numa_stat",
2971 		.seq_show = memcg_numa_stat_show,
2972 	},
2973 #endif
2974 	{
2975 		.name = "kmem.limit_in_bytes",
2976 		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
2977 		.write = mem_cgroup_write,
2978 		.read_u64 = mem_cgroup_read_u64,
2979 	},
2980 	{
2981 		.name = "kmem.usage_in_bytes",
2982 		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
2983 		.read_u64 = mem_cgroup_read_u64,
2984 	},
2985 	{
2986 		.name = "kmem.failcnt",
2987 		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
2988 		.write = mem_cgroup_reset,
2989 		.read_u64 = mem_cgroup_read_u64,
2990 	},
2991 	{
2992 		.name = "kmem.max_usage_in_bytes",
2993 		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2994 		.write = mem_cgroup_reset,
2995 		.read_u64 = mem_cgroup_read_u64,
2996 	},
2997 #ifdef CONFIG_SLUB_DEBUG
2998 	{
2999 		.name = "kmem.slabinfo",
3000 		.seq_show = mem_cgroup_slab_show,
3001 	},
3002 #endif
3003 	{
3004 		.name = "kmem.tcp.limit_in_bytes",
3005 		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
3006 		.write = mem_cgroup_write,
3007 		.read_u64 = mem_cgroup_read_u64,
3008 	},
3009 	{
3010 		.name = "kmem.tcp.usage_in_bytes",
3011 		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
3012 		.read_u64 = mem_cgroup_read_u64,
3013 	},
3014 	{
3015 		.name = "kmem.tcp.failcnt",
3016 		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
3017 		.write = mem_cgroup_reset,
3018 		.read_u64 = mem_cgroup_read_u64,
3019 	},
3020 	{
3021 		.name = "kmem.tcp.max_usage_in_bytes",
3022 		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
3023 		.write = mem_cgroup_reset,
3024 		.read_u64 = mem_cgroup_read_u64,
3025 	},
3026 	{ },	/* terminate */
3027 };
3028 
3029 struct cftype memsw_files[] = {
3030 	{
3031 		.name = "memsw.usage_in_bytes",
3032 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3033 		.read_u64 = mem_cgroup_read_u64,
3034 	},
3035 	{
3036 		.name = "memsw.max_usage_in_bytes",
3037 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3038 		.write = mem_cgroup_reset,
3039 		.read_u64 = mem_cgroup_read_u64,
3040 	},
3041 	{
3042 		.name = "memsw.limit_in_bytes",
3043 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3044 		.write = mem_cgroup_write,
3045 		.read_u64 = mem_cgroup_read_u64,
3046 	},
3047 	{
3048 		.name = "memsw.failcnt",
3049 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3050 		.write = mem_cgroup_reset,
3051 		.read_u64 = mem_cgroup_read_u64,
3052 	},
3053 	{ },	/* terminate */
3054 };
3055 
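/*
 * Maintain the cgroup1-only kmem page counter; on the default (v2)
 * hierarchy this is a no-op.  @nr_pages may be negative for uncharges.
 */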
3056 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3057 {
3058 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3059 		if (nr_pages > 0)
3060 			page_counter_charge(&memcg->kmem, nr_pages);
3061 		else
3062 			page_counter_uncharge(&memcg->kmem, -nr_pages);
3063 	}
3064 }
3065 
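/*
 * Try to charge @nr_pages to the cgroup1 tcpmem counter.  On success the
 * socket pressure flag is cleared; on failure it is set so that callers
 * can throttle, and __GFP_NOFAIL charges are forced through even above
 * the limit.  Returns %true if the charge was (force-)applied.
 */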
3066 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
3067 			 gfp_t gfp_mask)
3068 {
3069 	struct page_counter *fail;
3070 
3071 	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
3072 		memcg->tcpmem_pressure = 0;
3073 		return true;
3074 	}
3075 	memcg->tcpmem_pressure = 1;
3076 	if (gfp_mask & __GFP_NOFAIL) {
3077 		page_counter_charge(&memcg->tcpmem, nr_pages);
3078 		return true;
3079 	}
3080 	return false;
3081 }
3082 
3083 bool memcg1_alloc_events(struct mem_cgroup *memcg)
3084 {
3085 	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
3086 						GFP_KERNEL_ACCOUNT);
3087 	return !!memcg->events_percpu;
3088 }
3089 
3090 void memcg1_free_events(struct mem_cgroup *memcg)
3091 {
3092 	if (memcg->events_percpu)
3093 		free_percpu(memcg->events_percpu);
3094 }
3095 
3096 static int __init memcg1_init(void)
3097 {
3098 	int node;
3099 
3100 	for_each_node(node) {
3101 		struct mem_cgroup_tree_per_node *rtpn;
3102 
3103 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
3104 
3105 		rtpn->rb_root = RB_ROOT;
3106 		rtpn->rb_rightmost = NULL;
3107 		spin_lock_init(&rtpn->lock);
3108 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
3109 	}
3110 
3111 	return 0;
3112 }
3113 subsys_initcall(memcg1_init);
3114