xref: /linux/mm/memcontrol-v1.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/memcontrol.h>
4 #include <linux/swap.h>
5 #include <linux/mm_inline.h>
6 #include <linux/pagewalk.h>
7 #include <linux/backing-dev.h>
8 #include <linux/eventfd.h>
9 #include <linux/poll.h>
10 #include <linux/sort.h>
11 #include <linux/file.h>
12 #include <linux/seq_buf.h>
13 
14 #include "internal.h"
15 #include "swap.h"
16 #include "swap_table.h"
17 #include "memcontrol-v1.h"
18 
/*
 * Cgroups that exceed their soft limit are kept in an RB-tree that is
 * independent of their position in the cgroup hierarchy.
 */
23 
/*
 * One soft limit tree. Entries are struct mem_cgroup_per_node objects
 * linked through their tree_node and ordered by usage_in_excess.
 */
struct mem_cgroup_tree_per_node {
	/* Root of the RB-tree. */
	struct rb_root rb_root;
	/* Cached entry with the largest usage_in_excess; NULL means nothing to reclaim from. */
	struct rb_node *rb_rightmost;
	/* Protects the tree; every user in this file takes it with IRQs disabled. */
	spinlock_t lock;
};
29 
/*
 * Soft limit trees, one slot per node id. Users in this file check a slot
 * for NULL before dereferencing it.
 */
struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
33 
/* The single, file-local set of per-node soft limit trees. */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
35 
/*
 * Loop bounds that keep soft limit reclaim from spinning forever, should
 * that ever happen:
 *
 * MEM_CGROUP_MAX_RECLAIM_LOOPS bounds the 'loop' counter in
 * mem_cgroup_soft_reclaim(), which is bumped every time mem_cgroup_iter()
 * returns NULL.
 *
 * MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS bounds the number of passes
 * memcg1_soft_limit_reclaim() makes while nothing has been reclaimed.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
42 
/*
 * One registered OOM notification: an eventfd linked on a memcg's
 * oom_notify list.
 */
struct mem_cgroup_eventfd_list {
	/* Link on memcg->oom_notify. */
	struct list_head list;
	/* eventfd signalled when the memcg is (or already was) under OOM. */
	struct eventfd_ctx *eventfd;
};
48 
49 /*
50  * cgroup_event represents events which userspace want to receive.
51  */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd used to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * List linkage; each event is stored in a list kept by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback is used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send a notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback is called when userspace closes
	 * the eventfd or when the cgroup is removed.  This callback must
	 * be set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd: the poll table and wait queue entry
	 * hook us onto the eventfd's wait queue head (wqh), and the work
	 * item runs memcg_event_remove() from a workqueue.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
88 
/*
 * Pack two 16-bit quantities into one integer: @x goes into bits 16 and
 * up, @val into the low 16 bits. MEMFILE_TYPE() and MEMFILE_ATTR() pull
 * the two halves back out.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
92 
/*
 * Resource attribute selectors. They are not referenced in the visible
 * part of this file; presumably they are the attribute half of a
 * MEMFILE_PRIVATE() value -- TODO confirm against the cftype tables.
 */
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};
100 
#ifdef CONFIG_LOCKDEP
/* lockdep map named after memcg_oom_lock; its users are not in this part of the file. */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

/*
 * Held by the OOM notification code in this file while walking or
 * modifying a memcg's oom_notify list and while reading under_oom.
 */
DEFINE_SPINLOCK(memcg_oom_lock);
108 
109 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
110 					 struct mem_cgroup_tree_per_node *mctz,
111 					 unsigned long new_usage_in_excess)
112 {
113 	struct rb_node **p = &mctz->rb_root.rb_node;
114 	struct rb_node *parent = NULL;
115 	struct mem_cgroup_per_node *mz_node;
116 	bool rightmost = true;
117 
118 	if (mz->on_tree)
119 		return;
120 
121 	mz->usage_in_excess = new_usage_in_excess;
122 	if (!mz->usage_in_excess)
123 		return;
124 	while (*p) {
125 		parent = *p;
126 		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
127 					tree_node);
128 		if (mz->usage_in_excess < mz_node->usage_in_excess) {
129 			p = &(*p)->rb_left;
130 			rightmost = false;
131 		} else {
132 			p = &(*p)->rb_right;
133 		}
134 	}
135 
136 	if (rightmost)
137 		mctz->rb_rightmost = &mz->tree_node;
138 
139 	rb_link_node(&mz->tree_node, parent, p);
140 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
141 	mz->on_tree = true;
142 }
143 
144 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
145 					 struct mem_cgroup_tree_per_node *mctz)
146 {
147 	if (!mz->on_tree)
148 		return;
149 
150 	if (&mz->tree_node == mctz->rb_rightmost)
151 		mctz->rb_rightmost = rb_prev(&mz->tree_node);
152 
153 	rb_erase(&mz->tree_node, &mctz->rb_root);
154 	mz->on_tree = false;
155 }
156 
157 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
158 				       struct mem_cgroup_tree_per_node *mctz)
159 {
160 	unsigned long flags;
161 
162 	spin_lock_irqsave(&mctz->lock, flags);
163 	__mem_cgroup_remove_exceeded(mz, mctz);
164 	spin_unlock_irqrestore(&mctz->lock, flags);
165 }
166 
167 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
168 {
169 	unsigned long nr_pages = page_counter_read(&memcg->memory);
170 	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
171 	unsigned long excess = 0;
172 
173 	if (nr_pages > soft_limit)
174 		excess = nr_pages - soft_limit;
175 
176 	return excess;
177 }
178 
179 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
180 {
181 	unsigned long excess;
182 	struct mem_cgroup_per_node *mz;
183 	struct mem_cgroup_tree_per_node *mctz;
184 
185 	if (lru_gen_enabled()) {
186 		if (soft_limit_excess(memcg))
187 			lru_gen_soft_reclaim(memcg, nid);
188 		return;
189 	}
190 
191 	mctz = soft_limit_tree.rb_tree_per_node[nid];
192 	if (!mctz)
193 		return;
194 	/*
195 	 * Necessary to update all ancestors when hierarchy is used.
196 	 * because their event counter is not touched.
197 	 */
198 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
199 		mz = memcg->nodeinfo[nid];
200 		excess = soft_limit_excess(memcg);
201 		/*
202 		 * We have to update the tree if mz is on RB-tree or
203 		 * mem is over its softlimit.
204 		 */
205 		if (excess || mz->on_tree) {
206 			unsigned long flags;
207 
208 			spin_lock_irqsave(&mctz->lock, flags);
209 			/* if on-tree, remove it */
210 			if (mz->on_tree)
211 				__mem_cgroup_remove_exceeded(mz, mctz);
212 			/*
213 			 * Insert again. mz->usage_in_excess will be updated.
214 			 * If excess is 0, no tree ops.
215 			 */
216 			__mem_cgroup_insert_exceeded(mz, mctz, excess);
217 			spin_unlock_irqrestore(&mctz->lock, flags);
218 		}
219 	}
220 }
221 
222 void memcg1_remove_from_trees(struct mem_cgroup *memcg)
223 {
224 	struct mem_cgroup_tree_per_node *mctz;
225 	struct mem_cgroup_per_node *mz;
226 	int nid;
227 
228 	for_each_node(nid) {
229 		mz = memcg->nodeinfo[nid];
230 		mctz = soft_limit_tree.rb_tree_per_node[nid];
231 		if (mctz)
232 			mem_cgroup_remove_exceeded(mz, mctz);
233 	}
234 }
235 
236 static struct mem_cgroup_per_node *
237 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
238 {
239 	struct mem_cgroup_per_node *mz;
240 
241 retry:
242 	mz = NULL;
243 	if (!mctz->rb_rightmost)
244 		goto done;		/* Nothing to reclaim from */
245 
246 	mz = rb_entry(mctz->rb_rightmost,
247 		      struct mem_cgroup_per_node, tree_node);
248 	/*
249 	 * Remove the node now but someone else can add it back,
250 	 * we will to add it back at the end of reclaim to its correct
251 	 * position in the tree.
252 	 */
253 	__mem_cgroup_remove_exceeded(mz, mctz);
254 	if (!soft_limit_excess(mz->memcg) ||
255 	    !css_tryget(&mz->memcg->css))
256 		goto retry;
257 done:
258 	return mz;
259 }
260 
261 static struct mem_cgroup_per_node *
262 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
263 {
264 	struct mem_cgroup_per_node *mz;
265 
266 	spin_lock_irq(&mctz->lock);
267 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
268 	spin_unlock_irq(&mctz->lock);
269 	return mz;
270 }
271 
272 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
273 				   pg_data_t *pgdat,
274 				   gfp_t gfp_mask,
275 				   unsigned long *total_scanned)
276 {
277 	struct mem_cgroup *victim = NULL;
278 	int total = 0;
279 	int loop = 0;
280 	unsigned long excess;
281 	unsigned long nr_scanned;
282 	struct mem_cgroup_reclaim_cookie reclaim = {
283 		.pgdat = pgdat,
284 	};
285 
286 	excess = soft_limit_excess(root_memcg);
287 
288 	while (1) {
289 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
290 		if (!victim) {
291 			loop++;
292 			if (loop >= 2) {
293 				/*
294 				 * If we have not been able to reclaim
295 				 * anything, it might because there are
296 				 * no reclaimable pages under this hierarchy
297 				 */
298 				if (!total)
299 					break;
300 				/*
301 				 * We want to do more targeted reclaim.
302 				 * excess >> 2 is not to excessive so as to
303 				 * reclaim too much, nor too less that we keep
304 				 * coming back to reclaim from this cgroup
305 				 */
306 				if (total >= (excess >> 2) ||
307 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
308 					break;
309 			}
310 			continue;
311 		}
312 		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
313 					pgdat, &nr_scanned);
314 		*total_scanned += nr_scanned;
315 		if (!soft_limit_excess(root_memcg))
316 			break;
317 	}
318 	mem_cgroup_iter_break(root_memcg, victim);
319 	return total;
320 }
321 
/*
 * memcg1_soft_limit_reclaim - reclaim from cgroups over their soft limit
 * @pgdat: node whose soft limit tree is consulted and reclaimed from
 * @order: only order 0 is handled; anything larger returns 0
 * @gfp_mask: passed through to mem_cgroup_soft_reclaim()
 * @total_scanned: accumulator passed through to mem_cgroup_soft_reclaim()
 *
 * Repeatedly takes the entry with the largest soft limit excess off
 * @pgdat's tree, reclaims from its memcg and puts the entry back, until
 * something was reclaimed or there are no more candidates / the loop
 * budget is used up.
 *
 * Returns the accumulated return values of mem_cgroup_soft_reclaim();
 * 0 when lru_gen is enabled, @order is non-zero, or the node's tree is
 * missing or empty.
 */
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
	do {
		/*
		 * next_mz was already taken off the tree (with a css
		 * reference held) by the previous iteration.
		 */
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		/* Drop the reference taken when mz was picked from the tree. */
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A candidate that was picked but never used still holds a reference. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}
403 
/*
 * Read handler counterpart of mem_cgroup_move_charge_write(): always
 * reports 0, ignoring both arguments.
 */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return 0;
}
409 
410 #ifdef CONFIG_MMU
411 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
412 				 struct cftype *cft, u64 val)
413 {
414 	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
415 		     "Please report your usecase to linux-mm@kvack.org if you "
416 		     "depend on this functionality.\n");
417 
418 	if (val != 0)
419 		return -EINVAL;
420 	return 0;
421 }
422 #else
423 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
424 				 struct cftype *cft, u64 val)
425 {
426 	return -ENOSYS;
427 }
428 #endif
429 
430 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
431 {
432 	unsigned long val;
433 
434 	if (mem_cgroup_is_root(memcg)) {
435 		/*
436 		 * Approximate root's usage from global state. This isn't
437 		 * perfect, but the root usage was always an approximation.
438 		 */
439 		val = global_node_page_state(NR_FILE_PAGES) +
440 			global_node_page_state(NR_ANON_MAPPED);
441 		if (swap)
442 			val += total_swap_pages - get_nr_swap_pages();
443 	} else {
444 		if (!swap)
445 			val = page_counter_read(&memcg->memory);
446 		else
447 			val = page_counter_read(&memcg->memsw);
448 	}
449 	return val;
450 }
451 
452 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
453 {
454 	struct mem_cgroup_threshold_ary *t;
455 	unsigned long usage;
456 	int i;
457 
458 	rcu_read_lock();
459 	if (!swap)
460 		t = rcu_dereference(memcg->thresholds.primary);
461 	else
462 		t = rcu_dereference(memcg->memsw_thresholds.primary);
463 
464 	if (!t)
465 		goto unlock;
466 
467 	usage = mem_cgroup_usage(memcg, swap);
468 
469 	/*
470 	 * current_threshold points to threshold just below or equal to usage.
471 	 * If it's not true, a threshold was crossed after last
472 	 * call of __mem_cgroup_threshold().
473 	 */
474 	i = t->current_threshold;
475 
476 	/*
477 	 * Iterate backward over array of thresholds starting from
478 	 * current_threshold and check if a threshold is crossed.
479 	 * If none of thresholds below usage is crossed, we read
480 	 * only one element of the array here.
481 	 */
482 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
483 		eventfd_signal(t->entries[i].eventfd);
484 
485 	/* i = current_threshold + 1 */
486 	i++;
487 
488 	/*
489 	 * Iterate forward over array of thresholds starting from
490 	 * current_threshold+1 and check if a threshold is crossed.
491 	 * If none of thresholds above usage is crossed, we read
492 	 * only one element of the array here.
493 	 */
494 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
495 		eventfd_signal(t->entries[i].eventfd);
496 
497 	/* Update current_threshold */
498 	t->current_threshold = i - 1;
499 unlock:
500 	rcu_read_unlock();
501 }
502 
/*
 * Run the threshold check for @memcg and each of its ancestors: always for
 * the memory thresholds, and for the memsw thresholds too when
 * do_memsw_account() says so.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);
	}
}
513 
514 /* Cgroup1: threshold notifications & softlimit tree updates */
515 
516 /*
517  * Per memcg event counter is incremented at every pagein/pageout. With THP,
518  * it will be incremented by the number of pages. This counter is used
519  * to trigger some periodic events. This is straightforward and better
520  * than using jiffies etc. to handle periodic memcg event.
521  */
/*
 * Periodic event kinds driven by the per-CPU page event counter; the
 * values index memcg1_events_percpu::targets[].
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,	/* threshold notifications */
	MEM_CGROUP_TARGET_SOFTLIMIT,	/* soft limit tree updates */
	MEM_CGROUP_NTARGETS,		/* number of targets, sizes the array */
};
527 
/* Per-CPU event bookkeeping, reached through memcg->events_percpu. */
struct memcg1_events_percpu {
	/* Running count of page events, advanced on charge and uncharge. */
	unsigned long nr_page_events;
	/* Per target: the nr_page_events value past which the target fires next. */
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
532 
533 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
534 {
535 	/* pagein of a big page is an event. So, ignore page size */
536 	if (nr_pages > 0)
537 		count_memcg_events(memcg, PGPGIN, 1);
538 	else {
539 		count_memcg_events(memcg, PGPGOUT, 1);
540 		nr_pages = -nr_pages; /* for event */
541 	}
542 
543 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
544 }
545 
/*
 * Distance, in page events, that memcg1_event_ratelimit() puts between
 * the current event count and the next firing of each target.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
548 
549 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
550 				enum mem_cgroup_events_target target)
551 {
552 	unsigned long val, next;
553 
554 	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
555 	next = __this_cpu_read(memcg->events_percpu->targets[target]);
556 	/* from time_after() in jiffies.h */
557 	if ((long)(next - val) < 0) {
558 		switch (target) {
559 		case MEM_CGROUP_TARGET_THRESH:
560 			next = val + THRESHOLDS_EVENTS_TARGET;
561 			break;
562 		case MEM_CGROUP_TARGET_SOFTLIMIT:
563 			next = val + SOFTLIMIT_EVENTS_TARGET;
564 			break;
565 		default:
566 			break;
567 		}
568 		__this_cpu_write(memcg->events_percpu->targets[target], next);
569 		return true;
570 	}
571 	return false;
572 }
573 
574 /*
575  * Check events in order.
576  *
577  */
578 static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
579 {
580 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
581 		return;
582 
583 	/* threshold event is triggered in finer grain than soft limit */
584 	if (unlikely(memcg1_event_ratelimit(memcg,
585 						MEM_CGROUP_TARGET_THRESH))) {
586 		bool do_softlimit;
587 
588 		do_softlimit = memcg1_event_ratelimit(memcg,
589 						MEM_CGROUP_TARGET_SOFTLIMIT);
590 		mem_cgroup_threshold(memcg);
591 		if (unlikely(do_softlimit))
592 			memcg1_update_tree(memcg, nid);
593 	}
594 }
595 
/*
 * Account the charge of @folio to @memcg and run the periodic event
 * checks for the folio's node, all with local interrupts disabled.
 */
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(irq_flags);
}
605 
606 #ifdef CONFIG_SWAP
/**
 * __memcg1_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @ci: the locked swap cluster holding the swap entries
 *
 * Transfer the memsw charge of @folio to the swap entry stored in
 * folio->swap.
 *
 * Does nothing when the memory controller is disabled, when memsw
 * accounting is off, or when the folio has no objcg.
 *
 * Context: folio must be isolated, unmapped, locked and is just about to
 * be freed, and caller must disable IRQs and hold the swap cluster lock.
 */
void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
{
	struct mem_cgroup *memcg, *swap_memcg;
	struct obj_cgroup *objcg;
	unsigned int nr_entries;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

	if (mem_cgroup_disabled())
		return;

	if (!do_memsw_account())
		return;

	objcg = folio_objcg(folio);
	VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
	if (!objcg)
		return;

	rcu_read_lock();
	memcg = obj_cgroup_memcg(objcg);
	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	nr_entries = folio_nr_pages(folio);
	swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	/* Record the owning memcg's private id for the folio's swap entries. */
	__swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
			  mem_cgroup_private_id(swap_memcg));

	folio_unqueue_deferred_split(folio);
	/* The folio no longer carries a memcg association from here on. */
	folio->memcg_data = 0;

	if (!obj_cgroup_is_root(objcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	/* Move the memsw charge over when the swap is billed to an ancestor. */
	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * The caller must hold the swap cluster lock with IRQ off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();
	memcg1_check_events(memcg, folio_nid(folio));

	rcu_read_unlock();
	/* NOTE(review): presumably drops the reference the folio held on its objcg -- confirm. */
	obj_cgroup_put(objcg);
}
680 
681 /**
682  * memcg1_swapin - uncharge swap slot on swapin
683  * @folio: folio being swapped in
684  *
685  * Call this function after successfully adding the charged
686  * folio to swapcache.
687  *
688  * Context: The folio has to be in swap cache and locked.
689  */
690 void memcg1_swapin(struct folio *folio)
691 {
692 	struct swap_cluster_info *ci;
693 	unsigned long nr_pages;
694 	unsigned short id;
695 
696 	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
697 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
698 
699 	/*
700 	 * Cgroup1's unified memory+swap counter has been charged with the
701 	 * new swapcache page, finish the transfer by uncharging the swap
702 	 * slot. The swap slot would also get uncharged when it dies, but
703 	 * it can stick around indefinitely and we'd count the page twice
704 	 * the entire time.
705 	 *
706 	 * Cgroup2 has separate resource counters for memory and swap,
707 	 * so this is a non-issue here. Memory and swap charge lifetimes
708 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
709 	 * page to memory here, and uncharge swap when the slot is freed.
710 	 */
711 	if (!do_memsw_account())
712 		return;
713 
714 	/*
715 	 * The swap entry might not get freed for a long time,
716 	 * let's not wait for it.  The page already received a
717 	 * memory+swap charge, drop the swap entry duplicate.
718 	 */
719 	nr_pages = folio_nr_pages(folio);
720 	ci = swap_cluster_get_and_lock(folio);
721 	id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
722 				 nr_pages);
723 	swap_cluster_unlock(ci);
724 	mem_cgroup_uncharge_swap(id, nr_pages);
725 }
726 #endif
727 
728 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
729 			   unsigned long nr_memory, int nid)
730 {
731 	unsigned long flags;
732 
733 	local_irq_save(flags);
734 	count_memcg_events(memcg, PGPGOUT, pgpgout);
735 	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
736 	memcg1_check_events(memcg, nid);
737 	local_irq_restore(flags);
738 }
739 
740 static int compare_thresholds(const void *a, const void *b)
741 {
742 	const struct mem_cgroup_threshold *_a = a;
743 	const struct mem_cgroup_threshold *_b = b;
744 
745 	if (_a->threshold > _b->threshold)
746 		return 1;
747 
748 	if (_a->threshold < _b->threshold)
749 		return -1;
750 
751 	return 0;
752 }
753 
754 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
755 {
756 	struct mem_cgroup_eventfd_list *ev;
757 
758 	spin_lock(&memcg_oom_lock);
759 
760 	list_for_each_entry(ev, &memcg->oom_notify, list)
761 		eventfd_signal(ev->eventfd);
762 
763 	spin_unlock(&memcg_oom_lock);
764 	return 0;
765 }
766 
767 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
768 {
769 	struct mem_cgroup *iter;
770 
771 	for_each_mem_cgroup_tree(iter, memcg)
772 		mem_cgroup_oom_notify_cb(iter);
773 }
774 
/*
 * Register @eventfd for a usage threshold on @memcg.
 *
 * @args is parsed with page_counter_memparse() into the threshold value,
 * and @type selects the memory (_MEM) or memory+swap (_MEMSWAP) threshold
 * set; any other type is a BUG(). A new, sorted threshold array holding
 * the old entries plus the new one is allocated and published with RCU;
 * the previous primary array is kept as the spare.
 *
 * Returns 0 on success, the parse error from page_counter_memparse(), or
 * -ENOMEM when the new array cannot be allocated.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc_flex(*new, entries, size, GFP_KERNEL_ACCOUNT);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
854 
/* Register @eventfd for a memory usage threshold (type _MEM). */
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
860 
/* Register @eventfd for a memory+swap usage threshold (type _MEMSWAP). */
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
866 
/*
 * Remove every threshold registered with @eventfd from @memcg.
 *
 * @type selects the memory (_MEM) or memory+swap (_MEMSWAP) threshold set;
 * any other type is a BUG(). The surviving entries are copied into the
 * spare array, which is then swapped with the primary one under RCU. No
 * allocation happens here. When no entry survives, the primary pointer
 * becomes NULL and both arrays are freed.
 */
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/*
	 * Calculate new number of threshold: 'size' counts the entries that
	 * stay, 'entries' counts the ones that belong to @eventfd.
	 */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}
951 
952 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
953 	struct eventfd_ctx *eventfd)
954 {
955 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
956 }
957 
958 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
959 	struct eventfd_ctx *eventfd)
960 {
961 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
962 }
963 
964 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
965 	struct eventfd_ctx *eventfd, const char *args)
966 {
967 	struct mem_cgroup_eventfd_list *event;
968 
969 	event = kmalloc_obj(*event, GFP_KERNEL_ACCOUNT);
970 	if (!event)
971 		return -ENOMEM;
972 
973 	spin_lock(&memcg_oom_lock);
974 
975 	event->eventfd = eventfd;
976 	list_add(&event->list, &memcg->oom_notify);
977 
978 	/* already in OOM ? */
979 	if (memcg->under_oom)
980 		eventfd_signal(eventfd);
981 	spin_unlock(&memcg_oom_lock);
982 
983 	return 0;
984 }
985 
986 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
987 	struct eventfd_ctx *eventfd)
988 {
989 	struct mem_cgroup_eventfd_list *ev, *tmp;
990 
991 	spin_lock(&memcg_oom_lock);
992 
993 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
994 		if (ev->eventfd == eventfd) {
995 			list_del(&ev->list);
996 			kfree(ev);
997 		}
998 	}
999 
1000 	spin_unlock(&memcg_oom_lock);
1001 }
1002 
1003 /*
1004  * DO NOT USE IN NEW FILES.
1005  *
1006  * "cgroup.event_control" implementation.
1007  *
1008  * This is way over-engineered.  It tries to support fully configurable
1009  * events for each user.  Such level of flexibility is completely
1010  * unnecessary especially in the light of the planned unified hierarchy.
1011  *
1012  * Please deprecate this and replace with something simpler if at all
1013  * possible.
1014  */
1015 
1016 /*
1017  * Unregister event and free resources.
1018  *
1019  * Gets called from workqueue.
1020  */
1021 static void memcg_event_remove(struct work_struct *work)
1022 {
1023 	struct mem_cgroup_event *event =
1024 		container_of(work, struct mem_cgroup_event, remove);
1025 	struct mem_cgroup *memcg = event->memcg;
1026 
1027 	remove_wait_queue(event->wqh, &event->wait);
1028 
1029 	event->unregister_event(memcg, event->eventfd);
1030 
1031 	/* Notify userspace the event is going away. */
1032 	eventfd_signal(event->eventfd);
1033 
1034 	eventfd_ctx_put(event->eventfd);
1035 	kfree(event);
1036 	css_put(&memcg->css);
1037 }
1038 
1039 /*
1040  * Gets called on EPOLLHUP on eventfd when user closes it.
1041  *
1042  * Called with wqh->lock held and interrupts disabled.
1043  */
1044 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
1045 			    int sync, void *key)
1046 {
1047 	struct mem_cgroup_event *event =
1048 		container_of(wait, struct mem_cgroup_event, wait);
1049 	struct mem_cgroup *memcg = event->memcg;
1050 	__poll_t flags = key_to_poll(key);
1051 
1052 	if (flags & EPOLLHUP) {
1053 		/*
1054 		 * If the event has been detached at cgroup removal, we
1055 		 * can simply return knowing the other side will cleanup
1056 		 * for us.
1057 		 *
1058 		 * We can't race against event freeing since the other
1059 		 * side will require wqh->lock via remove_wait_queue(),
1060 		 * which we hold.
1061 		 */
1062 		spin_lock(&memcg->event_list_lock);
1063 		if (!list_empty(&event->list)) {
1064 			list_del_init(&event->list);
1065 			/*
1066 			 * We are in atomic context, but cgroup_event_remove()
1067 			 * may sleep, so we have to call it in workqueue.
1068 			 */
1069 			schedule_work(&event->remove);
1070 		}
1071 		spin_unlock(&memcg->event_list_lock);
1072 	}
1073 
1074 	return 0;
1075 }
1076 
1077 static void memcg_event_ptable_queue_proc(struct file *file,
1078 		wait_queue_head_t *wqh, poll_table *pt)
1079 {
1080 	struct mem_cgroup_event *event =
1081 		container_of(pt, struct mem_cgroup_event, pt);
1082 
1083 	event->wqh = wqh;
1084 	add_wait_queue(wqh, &event->wait);
1085 }
1086 
1087 /*
1088  * DO NOT USE IN NEW FILES.
1089  *
1090  * Parse input and register new cgroup event handler.
1091  *
1092  * Input must be in format '<event_fd> <control_fd> <args>'.
1093  * Interpretation of args is defined by control file implementation.
1094  */
1095 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
1096 					 char *buf, size_t nbytes, loff_t off)
1097 {
1098 	struct cgroup_subsys_state *css = of_css(of);
1099 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1100 	struct mem_cgroup_event *event;
1101 	struct cgroup_subsys_state *cfile_css;
1102 	unsigned int efd, cfd;
1103 	struct dentry *cdentry;
1104 	const char *name;
1105 	char *endp;
1106 	int ret;
1107 
1108 	if (IS_ENABLED(CONFIG_PREEMPT_RT))
1109 		return -EOPNOTSUPP;
1110 
1111 	buf = strstrip(buf);
1112 
1113 	efd = simple_strtoul(buf, &endp, 10);
1114 	if (*endp != ' ')
1115 		return -EINVAL;
1116 	buf = endp + 1;
1117 
1118 	cfd = simple_strtoul(buf, &endp, 10);
1119 	if (*endp == '\0')
1120 		buf = endp;
1121 	else if (*endp == ' ')
1122 		buf = endp + 1;
1123 	else
1124 		return -EINVAL;
1125 
1126 	CLASS(fd, efile)(efd);
1127 	if (fd_empty(efile))
1128 		return -EBADF;
1129 
1130 	CLASS(fd, cfile)(cfd);
1131 
1132 	event = kzalloc_obj(*event, GFP_KERNEL_ACCOUNT);
1133 	if (!event)
1134 		return -ENOMEM;
1135 
1136 	event->memcg = memcg;
1137 	INIT_LIST_HEAD(&event->list);
1138 	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
1139 	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
1140 	INIT_WORK(&event->remove, memcg_event_remove);
1141 
1142 	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
1143 	if (IS_ERR(event->eventfd)) {
1144 		ret = PTR_ERR(event->eventfd);
1145 		goto out_kfree;
1146 	}
1147 
1148 	if (fd_empty(cfile)) {
1149 		ret = -EBADF;
1150 		goto out_put_eventfd;
1151 	}
1152 
1153 	/* the process need read permission on control file */
1154 	/* AV: shouldn't we check that it's been opened for read instead? */
1155 	ret = file_permission(fd_file(cfile), MAY_READ);
1156 	if (ret < 0)
1157 		goto out_put_eventfd;
1158 
1159 	/*
1160 	 * The control file must be a regular cgroup1 file. As a regular cgroup
1161 	 * file can't be renamed, it's safe to access its name afterwards.
1162 	 */
1163 	cdentry = fd_file(cfile)->f_path.dentry;
1164 	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
1165 		ret = -EINVAL;
1166 		goto out_put_eventfd;
1167 	}
1168 
1169 	/*
1170 	 * Determine the event callbacks and set them in @event.  This used
1171 	 * to be done via struct cftype but cgroup core no longer knows
1172 	 * about these events.  The following is crude but the whole thing
1173 	 * is for compatibility anyway.
1174 	 *
1175 	 * DO NOT ADD NEW FILES.
1176 	 */
1177 	name = cdentry->d_name.name;
1178 
1179 	if (!strcmp(name, "memory.usage_in_bytes")) {
1180 		event->register_event = mem_cgroup_usage_register_event;
1181 		event->unregister_event = mem_cgroup_usage_unregister_event;
1182 	} else if (!strcmp(name, "memory.oom_control")) {
1183 		pr_warn_once("oom_control is deprecated and will be removed. "
1184 			     "Please report your usecase to linux-mm-@kvack.org"
1185 			     " if you depend on this functionality.\n");
1186 		event->register_event = mem_cgroup_oom_register_event;
1187 		event->unregister_event = mem_cgroup_oom_unregister_event;
1188 	} else if (!strcmp(name, "memory.pressure_level")) {
1189 		pr_warn_once("pressure_level is deprecated and will be removed. "
1190 			     "Please report your usecase to linux-mm-@kvack.org "
1191 			     "if you depend on this functionality.\n");
1192 		event->register_event = vmpressure_register_event;
1193 		event->unregister_event = vmpressure_unregister_event;
1194 	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
1195 		event->register_event = memsw_cgroup_usage_register_event;
1196 		event->unregister_event = memsw_cgroup_usage_unregister_event;
1197 	} else {
1198 		ret = -EINVAL;
1199 		goto out_put_eventfd;
1200 	}
1201 
1202 	/*
1203 	 * Verify @cfile should belong to @css.  Also, remaining events are
1204 	 * automatically removed on cgroup destruction but the removal is
1205 	 * asynchronous, so take an extra ref on @css.
1206 	 */
1207 	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
1208 					       &memory_cgrp_subsys);
1209 	ret = -EINVAL;
1210 	if (IS_ERR(cfile_css))
1211 		goto out_put_eventfd;
1212 	if (cfile_css != css)
1213 		goto out_put_css;
1214 
1215 	ret = event->register_event(memcg, event->eventfd, buf);
1216 	if (ret)
1217 		goto out_put_css;
1218 
1219 	vfs_poll(fd_file(efile), &event->pt);
1220 
1221 	spin_lock_irq(&memcg->event_list_lock);
1222 	list_add(&event->list, &memcg->event_list);
1223 	spin_unlock_irq(&memcg->event_list_lock);
1224 	return nbytes;
1225 
1226 out_put_css:
1227 	css_put(cfile_css);
1228 out_put_eventfd:
1229 	eventfd_ctx_put(event->eventfd);
1230 out_kfree:
1231 	kfree(event);
1232 	return ret;
1233 }
1234 
1235 void memcg1_memcg_init(struct mem_cgroup *memcg)
1236 {
1237 	INIT_LIST_HEAD(&memcg->oom_notify);
1238 	mutex_init(&memcg->thresholds_lock);
1239 	INIT_LIST_HEAD(&memcg->event_list);
1240 	spin_lock_init(&memcg->event_list_lock);
1241 }
1242 
1243 void memcg1_css_offline(struct mem_cgroup *memcg)
1244 {
1245 	struct mem_cgroup_event *event, *tmp;
1246 
1247 	/*
1248 	 * Unregister events and notify userspace.
1249 	 * Notify userspace about cgroup removing only after rmdir of cgroup
1250 	 * directory to avoid race between userspace and kernelspace.
1251 	 */
1252 	spin_lock_irq(&memcg->event_list_lock);
1253 	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
1254 		list_del_init(&event->list);
1255 		schedule_work(&event->remove);
1256 	}
1257 	spin_unlock_irq(&memcg->event_list_lock);
1258 }
1259 
1260 /*
1261  * Check OOM-Killer is already running under our hierarchy.
1262  * If someone is running, return false.
1263  */
1264 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1265 {
1266 	struct mem_cgroup *iter, *failed = NULL;
1267 
1268 	spin_lock(&memcg_oom_lock);
1269 
1270 	for_each_mem_cgroup_tree(iter, memcg) {
1271 		if (iter->oom_lock) {
1272 			/*
1273 			 * this subtree of our hierarchy is already locked
1274 			 * so we cannot give a lock.
1275 			 */
1276 			failed = iter;
1277 			mem_cgroup_iter_break(memcg, iter);
1278 			break;
1279 		}
1280 		iter->oom_lock = true;
1281 	}
1282 
1283 	if (failed) {
1284 		/*
1285 		 * OK, we failed to lock the whole subtree so we have
1286 		 * to clean up what we set up to the failing subtree
1287 		 */
1288 		for_each_mem_cgroup_tree(iter, memcg) {
1289 			if (iter == failed) {
1290 				mem_cgroup_iter_break(memcg, iter);
1291 				break;
1292 			}
1293 			iter->oom_lock = false;
1294 		}
1295 	} else
1296 		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1297 
1298 	spin_unlock(&memcg_oom_lock);
1299 
1300 	return !failed;
1301 }
1302 
1303 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1304 {
1305 	struct mem_cgroup *iter;
1306 
1307 	spin_lock(&memcg_oom_lock);
1308 	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1309 	for_each_mem_cgroup_tree(iter, memcg)
1310 		iter->oom_lock = false;
1311 	spin_unlock(&memcg_oom_lock);
1312 }
1313 
1314 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1315 {
1316 	struct mem_cgroup *iter;
1317 
1318 	spin_lock(&memcg_oom_lock);
1319 	for_each_mem_cgroup_tree(iter, memcg)
1320 		iter->under_oom++;
1321 	spin_unlock(&memcg_oom_lock);
1322 }
1323 
1324 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1325 {
1326 	struct mem_cgroup *iter;
1327 
1328 	/*
1329 	 * Be careful about under_oom underflows because a child memcg
1330 	 * could have been added after mem_cgroup_mark_under_oom.
1331 	 */
1332 	spin_lock(&memcg_oom_lock);
1333 	for_each_mem_cgroup_tree(iter, memcg)
1334 		if (iter->under_oom > 0)
1335 			iter->under_oom--;
1336 	spin_unlock(&memcg_oom_lock);
1337 }
1338 
/* Wait queue that tasks sleep on in mem_cgroup_oom_synchronize(). */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Per-sleeper record queued on memcg_oom_waitq; memcg_oom_wake_function()
 * recovers it from @wait via container_of().
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;	/* memcg the sleeper is waiting on */
	wait_queue_entry_t	wait;	/* entry queued on memcg_oom_waitq */
};
1345 
1346 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1347 	unsigned int mode, int sync, void *arg)
1348 {
1349 	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1350 	struct mem_cgroup *oom_wait_memcg;
1351 	struct oom_wait_info *oom_wait_info;
1352 
1353 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1354 	oom_wait_memcg = oom_wait_info->memcg;
1355 
1356 	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1357 	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1358 		return 0;
1359 	return autoremove_wake_function(wait, mode, sync, arg);
1360 }
1361 
1362 void memcg1_oom_recover(struct mem_cgroup *memcg)
1363 {
1364 	/*
1365 	 * For the following lockless ->under_oom test, the only required
1366 	 * guarantee is that it must see the state asserted by an OOM when
1367 	 * this function is called as a result of userland actions
1368 	 * triggered by the notification of the OOM.  This is trivially
1369 	 * achieved by invoking mem_cgroup_mark_under_oom() before
1370 	 * triggering notification.
1371 	 */
1372 	if (memcg && memcg->under_oom)
1373 		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1374 }
1375 
1376 /**
1377  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1378  * @handle: actually kill/wait or just clean up the OOM state
1379  *
1380  * This has to be called at the end of a page fault if the memcg OOM
1381  * handler was enabled.
1382  *
1383  * Memcg supports userspace OOM handling where failed allocations must
1384  * sleep on a waitqueue until the userspace task resolves the
1385  * situation.  Sleeping directly in the charge context with all kinds
1386  * of locks held is not a good idea, instead we remember an OOM state
1387  * in the task and mem_cgroup_oom_synchronize() has to be called at
1388  * the end of the page fault to complete the OOM handling.
1389  *
1390  * Returns %true if an ongoing memcg OOM situation was detected and
1391  * completed, %false otherwise.
1392  */
1393 bool mem_cgroup_oom_synchronize(bool handle)
1394 {
1395 	struct mem_cgroup *memcg = current->memcg_in_oom;
1396 	struct oom_wait_info owait;
1397 	bool locked;
1398 
1399 	/* OOM is global, do not handle */
1400 	if (!memcg)
1401 		return false;
1402 
1403 	if (!handle)
1404 		goto cleanup;
1405 
1406 	owait.memcg = memcg;
1407 	owait.wait.flags = 0;
1408 	owait.wait.func = memcg_oom_wake_function;
1409 	owait.wait.private = current;
1410 	INIT_LIST_HEAD(&owait.wait.entry);
1411 
1412 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1413 	mem_cgroup_mark_under_oom(memcg);
1414 
1415 	locked = mem_cgroup_oom_trylock(memcg);
1416 
1417 	if (locked)
1418 		mem_cgroup_oom_notify(memcg);
1419 
1420 	schedule();
1421 	mem_cgroup_unmark_under_oom(memcg);
1422 	finish_wait(&memcg_oom_waitq, &owait.wait);
1423 
1424 	if (locked)
1425 		mem_cgroup_oom_unlock(memcg);
1426 cleanup:
1427 	current->memcg_in_oom = NULL;
1428 	css_put(&memcg->css);
1429 	return true;
1430 }
1431 
1432 
1433 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
1434 {
1435 	/*
1436 	 * We are in the middle of the charge context here, so we
1437 	 * don't want to block when potentially sitting on a callstack
1438 	 * that holds all kinds of filesystem and mm locks.
1439 	 *
1440 	 * cgroup1 allows disabling the OOM killer and waiting for outside
1441 	 * handling until the charge can succeed; remember the context and put
1442 	 * the task to sleep at the end of the page fault when all locks are
1443 	 * released.
1444 	 *
1445 	 * On the other hand, in-kernel OOM killer allows for an async victim
1446 	 * memory reclaim (oom_reaper) and that means that we are not solely
1447 	 * relying on the oom victim to make a forward progress and we can
1448 	 * invoke the oom killer here.
1449 	 *
1450 	 * Please note that mem_cgroup_out_of_memory might fail to find a
1451 	 * victim and then we have to bail out from the charge path.
1452 	 */
1453 	if (READ_ONCE(memcg->oom_kill_disable)) {
1454 		if (current->in_user_fault) {
1455 			css_get(&memcg->css);
1456 			current->memcg_in_oom = memcg;
1457 		}
1458 		return false;
1459 	}
1460 
1461 	mem_cgroup_mark_under_oom(memcg);
1462 
1463 	*locked = mem_cgroup_oom_trylock(memcg);
1464 
1465 	if (*locked)
1466 		mem_cgroup_oom_notify(memcg);
1467 
1468 	mem_cgroup_unmark_under_oom(memcg);
1469 
1470 	return true;
1471 }
1472 
1473 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
1474 {
1475 	if (locked)
1476 		mem_cgroup_oom_unlock(memcg);
1477 }
1478 
1479 static DEFINE_MUTEX(memcg_max_mutex);
1480 
1481 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
1482 				 unsigned long max, bool memsw)
1483 {
1484 	bool enlarge = false;
1485 	bool drained = false;
1486 	int ret;
1487 	bool limits_invariant;
1488 	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
1489 
1490 	do {
1491 		if (signal_pending(current)) {
1492 			ret = -EINTR;
1493 			break;
1494 		}
1495 
1496 		mutex_lock(&memcg_max_mutex);
1497 		/*
1498 		 * Make sure that the new limit (memsw or memory limit) doesn't
1499 		 * break our basic invariant rule memory.max <= memsw.max.
1500 		 */
1501 		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
1502 					   max <= memcg->memsw.max;
1503 		if (!limits_invariant) {
1504 			mutex_unlock(&memcg_max_mutex);
1505 			ret = -EINVAL;
1506 			break;
1507 		}
1508 		if (max > counter->max)
1509 			enlarge = true;
1510 		ret = page_counter_set_max(counter, max);
1511 		mutex_unlock(&memcg_max_mutex);
1512 
1513 		if (!ret)
1514 			break;
1515 
1516 		if (!drained) {
1517 			drain_all_stock(memcg);
1518 			drained = true;
1519 			continue;
1520 		}
1521 
1522 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
1523 				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
1524 			ret = -EBUSY;
1525 			break;
1526 		}
1527 	} while (true);
1528 
1529 	if (!ret && enlarge)
1530 		memcg1_oom_recover(memcg);
1531 
1532 	return ret;
1533 }
1534 
1535 /*
1536  * Reclaims as many pages from the given memcg as possible.
1537  *
1538  * Caller is responsible for holding css reference for memcg.
1539  */
1540 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
1541 {
1542 	int nr_retries = MAX_RECLAIM_RETRIES;
1543 
1544 	/* we call try-to-free pages for make this cgroup empty */
1545 	lru_add_drain_all();
1546 
1547 	drain_all_stock(memcg);
1548 
1549 	/* try to free all pages in this cgroup */
1550 	while (nr_retries && page_counter_read(&memcg->memory)) {
1551 		if (signal_pending(current))
1552 			return -EINTR;
1553 
1554 		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
1555 						  MEMCG_RECLAIM_MAY_SWAP, NULL))
1556 			nr_retries--;
1557 	}
1558 
1559 	return 0;
1560 }
1561 
1562 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
1563 					    char *buf, size_t nbytes,
1564 					    loff_t off)
1565 {
1566 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1567 
1568 	if (mem_cgroup_is_root(memcg))
1569 		return -EINVAL;
1570 	return mem_cgroup_force_empty(memcg) ?: nbytes;
1571 }
1572 
/*
 * Read handler for "use_hierarchy": always reports 1, the only value
 * mem_cgroup_hierarchy_write() accepts.
 */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}
1578 
1579 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
1580 				      struct cftype *cft, u64 val)
1581 {
1582 	if (val == 1)
1583 		return 0;
1584 
1585 	pr_warn_once("Non-hierarchical mode is deprecated. "
1586 		     "Please report your usecase to linux-mm@kvack.org if you "
1587 		     "depend on this functionality.\n");
1588 
1589 	return -EINVAL;
1590 }
1591 
1592 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
1593 			       struct cftype *cft)
1594 {
1595 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1596 	struct page_counter *counter;
1597 
1598 	switch (MEMFILE_TYPE(cft->private)) {
1599 	case _MEM:
1600 		counter = &memcg->memory;
1601 		break;
1602 	case _MEMSWAP:
1603 		counter = &memcg->memsw;
1604 		break;
1605 	case _KMEM:
1606 		counter = &memcg->kmem;
1607 		break;
1608 	case _TCP:
1609 		counter = &memcg->tcpmem;
1610 		break;
1611 	default:
1612 		BUG();
1613 	}
1614 
1615 	switch (MEMFILE_ATTR(cft->private)) {
1616 	case RES_USAGE:
1617 		if (counter == &memcg->memory)
1618 			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
1619 		if (counter == &memcg->memsw)
1620 			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
1621 		return (u64)page_counter_read(counter) * PAGE_SIZE;
1622 	case RES_LIMIT:
1623 		return (u64)counter->max * PAGE_SIZE;
1624 	case RES_MAX_USAGE:
1625 		return (u64)counter->watermark * PAGE_SIZE;
1626 	case RES_FAILCNT:
1627 		return counter->failcnt;
1628 	case RES_SOFT_LIMIT:
1629 		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
1630 	default:
1631 		BUG();
1632 	}
1633 }
1634 
/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 *
 * Both arguments are ignored; the function always returns -EINVAL.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}
1644 
1645 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
1646 {
1647 	int ret;
1648 
1649 	mutex_lock(&memcg_max_mutex);
1650 
1651 	ret = page_counter_set_max(&memcg->tcpmem, max);
1652 	if (ret)
1653 		goto out;
1654 
1655 	if (!memcg->tcpmem_active) {
1656 		/*
1657 		 * The active flag needs to be written after the static_key
1658 		 * update. This is what guarantees that the socket activation
1659 		 * function is the last one to run. See mem_cgroup_sk_alloc()
1660 		 * for details, and note that we don't mark any socket as
1661 		 * belonging to this memcg until that flag is up.
1662 		 *
1663 		 * We need to do this, because static_keys will span multiple
1664 		 * sites, but we can't control their order. If we mark a socket
1665 		 * as accounted, but the accounting functions are not patched in
1666 		 * yet, we'll lose accounting.
1667 		 *
1668 		 * We never race with the readers in mem_cgroup_sk_alloc(),
1669 		 * because when this value change, the code to process it is not
1670 		 * patched in yet.
1671 		 */
1672 		static_branch_inc(&memcg_sockets_enabled_key);
1673 		memcg->tcpmem_active = true;
1674 	}
1675 out:
1676 	mutex_unlock(&memcg_max_mutex);
1677 	return ret;
1678 }
1679 
1680 /*
1681  * The user of this function is...
1682  * RES_LIMIT.
1683  */
1684 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
1685 				char *buf, size_t nbytes, loff_t off)
1686 {
1687 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1688 	unsigned long nr_pages;
1689 	int ret;
1690 
1691 	buf = strstrip(buf);
1692 	ret = page_counter_memparse(buf, "-1", &nr_pages);
1693 	if (ret)
1694 		return ret;
1695 
1696 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
1697 	case RES_LIMIT:
1698 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
1699 			ret = -EINVAL;
1700 			break;
1701 		}
1702 		switch (MEMFILE_TYPE(of_cft(of)->private)) {
1703 		case _MEM:
1704 			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
1705 			break;
1706 		case _MEMSWAP:
1707 			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
1708 			break;
1709 		case _KMEM:
1710 			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
1711 				     "Writing any value to this file has no effect. "
1712 				     "Please report your usecase to linux-mm@kvack.org if you "
1713 				     "depend on this functionality.\n");
1714 			ret = 0;
1715 			break;
1716 		case _TCP:
1717 			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
1718 				     "Please report your usecase to linux-mm@kvack.org if you "
1719 				     "depend on this functionality.\n");
1720 			ret = memcg_update_tcp_max(memcg, nr_pages);
1721 			break;
1722 		}
1723 		break;
1724 	case RES_SOFT_LIMIT:
1725 		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
1726 			ret = -EOPNOTSUPP;
1727 		} else {
1728 			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
1729 				     "Please report your usecase to linux-mm@kvack.org if you "
1730 				     "depend on this functionality.\n");
1731 			WRITE_ONCE(memcg->soft_limit, nr_pages);
1732 			ret = 0;
1733 		}
1734 		break;
1735 	}
1736 	return ret ?: nbytes;
1737 }
1738 
1739 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
1740 				size_t nbytes, loff_t off)
1741 {
1742 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1743 	struct page_counter *counter;
1744 
1745 	switch (MEMFILE_TYPE(of_cft(of)->private)) {
1746 	case _MEM:
1747 		counter = &memcg->memory;
1748 		break;
1749 	case _MEMSWAP:
1750 		counter = &memcg->memsw;
1751 		break;
1752 	case _KMEM:
1753 		counter = &memcg->kmem;
1754 		break;
1755 	case _TCP:
1756 		counter = &memcg->tcpmem;
1757 		break;
1758 	default:
1759 		BUG();
1760 	}
1761 
1762 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
1763 	case RES_MAX_USAGE:
1764 		page_counter_reset_watermark(counter);
1765 		break;
1766 	case RES_FAILCNT:
1767 		counter->failcnt = 0;
1768 		break;
1769 	default:
1770 		BUG();
1771 	}
1772 
1773 	return nbytes;
1774 }
1775 
1776 #ifdef CONFIG_NUMA
1777 
/* Bitmasks over enum lru_list, one bit per LRU list, used by numa_stat. */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
1781 
1782 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1783 				int nid, unsigned int lru_mask, bool tree)
1784 {
1785 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
1786 	unsigned long nr = 0;
1787 	enum lru_list lru;
1788 
1789 	VM_BUG_ON((unsigned int)nid >= nr_node_ids);
1790 
1791 	for_each_lru(lru) {
1792 		if (!(BIT(lru) & lru_mask))
1793 			continue;
1794 		if (tree)
1795 			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
1796 		else
1797 			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
1798 	}
1799 	return nr;
1800 }
1801 
1802 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
1803 					     unsigned int lru_mask,
1804 					     bool tree)
1805 {
1806 	unsigned long nr = 0;
1807 	enum lru_list lru;
1808 
1809 	for_each_lru(lru) {
1810 		if (!(BIT(lru) & lru_mask))
1811 			continue;
1812 		if (tree)
1813 			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
1814 		else
1815 			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
1816 	}
1817 	return nr;
1818 }
1819 
1820 static int memcg_numa_stat_show(struct seq_file *m, void *v)
1821 {
1822 	struct numa_stat {
1823 		const char *name;
1824 		unsigned int lru_mask;
1825 	};
1826 
1827 	static const struct numa_stat stats[] = {
1828 		{ "total", LRU_ALL },
1829 		{ "file", LRU_ALL_FILE },
1830 		{ "anon", LRU_ALL_ANON },
1831 		{ "unevictable", BIT(LRU_UNEVICTABLE) },
1832 	};
1833 	const struct numa_stat *stat;
1834 	int nid;
1835 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1836 
1837 	mem_cgroup_flush_stats(memcg);
1838 
1839 	for (stat = stats; stat < ARRAY_END(stats); stat++) {
1840 		seq_printf(m, "%s=%lu", stat->name,
1841 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1842 						   false));
1843 		for_each_node_state(nid, N_MEMORY)
1844 			seq_printf(m, " N%d=%lu", nid,
1845 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
1846 							stat->lru_mask, false));
1847 		seq_putc(m, '\n');
1848 	}
1849 
1850 	for (stat = stats; stat < ARRAY_END(stats); stat++) {
1851 
1852 		seq_printf(m, "hierarchical_%s=%lu", stat->name,
1853 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1854 						   true));
1855 		for_each_node_state(nid, N_MEMORY)
1856 			seq_printf(m, " N%d=%lu", nid,
1857 				   mem_cgroup_node_nr_lru_pages(memcg, nid,
1858 							stat->lru_mask, true));
1859 		seq_putc(m, '\n');
1860 	}
1861 
1862 	return 0;
1863 }
1864 #endif /* CONFIG_NUMA */
1865 
/*
 * Stat items reported by memcg1_stat_format().  Must stay index-aligned
 * with memcg1_stat_names[] below; memcg1_stat_format() checks that both
 * arrays have the same size at build time.
 */
static const unsigned int memcg1_stats[] = {
	NR_FILE_PAGES,
	NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	NR_ANON_THPS,
#endif
	NR_SHMEM,
	NR_FILE_MAPPED,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
#ifdef CONFIG_SWAP
	MEMCG_SWAP,
	NR_SWAPCACHE,
#endif
};

/* Names printed for the entries of memcg1_stats[], in the same order. */
static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
#ifdef CONFIG_SWAP
	"swap",
	"swapcached",
#endif
};

/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
	PGPGIN,
	PGPGOUT,
	PGFAULT,
	PGMAJFAULT,
};
1909 
1910 void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
1911 {
1912 	int i;
1913 
1914 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
1915 		reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
1916 }
1917 
1918 void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
1919 {
1920 	int i;
1921 
1922 	for (i = 0; i < NR_LRU_LISTS; i++)
1923 		reparent_memcg_lruvec_state_local(memcg, parent, i);
1924 }
1925 
1926 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1927 {
1928 	unsigned long memory, memsw;
1929 	struct mem_cgroup *mi;
1930 	unsigned int i;
1931 
1932 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
1933 
1934 	mem_cgroup_flush_stats(memcg);
1935 
1936 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1937 		unsigned long nr;
1938 
1939 		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
1940 		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
1941 	}
1942 
1943 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1944 		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
1945 			       memcg_events_local(memcg, memcg1_events[i]));
1946 
1947 	for (i = 0; i < NR_LRU_LISTS; i++)
1948 		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
1949 			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
1950 			       PAGE_SIZE);
1951 
1952 	/* Hierarchical information */
1953 	memory = memsw = PAGE_COUNTER_MAX;
1954 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
1955 		memory = min(memory, READ_ONCE(mi->memory.max));
1956 		memsw = min(memsw, READ_ONCE(mi->memsw.max));
1957 	}
1958 	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
1959 		       (u64)memory * PAGE_SIZE);
1960 	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
1961 		       (u64)memsw * PAGE_SIZE);
1962 
1963 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1964 		unsigned long nr;
1965 
1966 		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
1967 		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
1968 			       (u64)nr);
1969 	}
1970 
1971 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1972 		seq_buf_printf(s, "total_%s %llu\n",
1973 			       vm_event_name(memcg1_events[i]),
1974 			       (u64)memcg_events(memcg, memcg1_events[i]));
1975 
1976 	for (i = 0; i < NR_LRU_LISTS; i++)
1977 		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
1978 			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1979 			       PAGE_SIZE);
1980 
1981 #ifdef CONFIG_DEBUG_VM
1982 	{
1983 		pg_data_t *pgdat;
1984 		struct mem_cgroup_per_node *mz;
1985 		unsigned long anon_cost = 0;
1986 		unsigned long file_cost = 0;
1987 
1988 		for_each_online_pgdat(pgdat) {
1989 			mz = memcg->nodeinfo[pgdat->node_id];
1990 
1991 			anon_cost += mz->lruvec.anon_cost;
1992 			file_cost += mz->lruvec.file_cost;
1993 		}
1994 		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
1995 		seq_buf_printf(s, "file_cost %lu\n", file_cost);
1996 	}
1997 #endif
1998 }
1999 
2000 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
2001 				      struct cftype *cft)
2002 {
2003 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2004 
2005 	return mem_cgroup_swappiness(memcg);
2006 }
2007 
2008 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
2009 				       struct cftype *cft, u64 val)
2010 {
2011 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2012 
2013 	if (val > MAX_SWAPPINESS)
2014 		return -EINVAL;
2015 
2016 	if (!mem_cgroup_is_root(memcg)) {
2017 		pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
2018 			     "See memory.reclaim or memory.swap.max there\n ");
2019 		WRITE_ONCE(memcg->swappiness, val);
2020 	} else
2021 		WRITE_ONCE(vm_swappiness, val);
2022 
2023 	return 0;
2024 }
2025 
2026 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
2027 {
2028 	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
2029 
2030 	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
2031 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
2032 	seq_printf(sf, "oom_kill %lu\n",
2033 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
2034 	return 0;
2035 }
2036 
2037 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
2038 	struct cftype *cft, u64 val)
2039 {
2040 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2041 
2042 	pr_warn_once("oom_control is deprecated and will be removed. "
2043 		     "Please report your usecase to linux-mm-@kvack.org if you "
2044 		     "depend on this functionality.\n");
2045 
2046 	/* cannot set to root cgroup and only 0 and 1 are allowed */
2047 	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
2048 		return -EINVAL;
2049 
2050 	WRITE_ONCE(memcg->oom_kill_disable, val);
2051 	if (!val)
2052 		memcg1_oom_recover(memcg);
2053 
2054 	return 0;
2055 }
2056 
#ifdef CONFIG_SLUB_DEBUG
/*
 * seq_file show handler for "kmem.slabinfo": prints nothing and returns 0.
 */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
#endif
2067 
2068 struct cftype mem_cgroup_legacy_files[] = {
2069 	{
2070 		.name = "usage_in_bytes",
2071 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2072 		.read_u64 = mem_cgroup_read_u64,
2073 	},
2074 	{
2075 		.name = "max_usage_in_bytes",
2076 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2077 		.write = mem_cgroup_reset,
2078 		.read_u64 = mem_cgroup_read_u64,
2079 	},
2080 	{
2081 		.name = "limit_in_bytes",
2082 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2083 		.write = mem_cgroup_write,
2084 		.read_u64 = mem_cgroup_read_u64,
2085 	},
2086 	{
2087 		.name = "soft_limit_in_bytes",
2088 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2089 		.write = mem_cgroup_write,
2090 		.read_u64 = mem_cgroup_read_u64,
2091 	},
2092 	{
2093 		.name = "failcnt",
2094 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2095 		.write = mem_cgroup_reset,
2096 		.read_u64 = mem_cgroup_read_u64,
2097 	},
2098 	{
2099 		.name = "stat",
2100 		.seq_show = memory_stat_show,
2101 	},
2102 	{
2103 		.name = "force_empty",
2104 		.write = mem_cgroup_force_empty_write,
2105 	},
2106 	{
2107 		.name = "use_hierarchy",
2108 		.write_u64 = mem_cgroup_hierarchy_write,
2109 		.read_u64 = mem_cgroup_hierarchy_read,
2110 	},
2111 	{
2112 		.name = "cgroup.event_control",		/* XXX: for compat */
2113 		.write = memcg_write_event_control,
2114 		.flags = CFTYPE_NO_PREFIX,
2115 	},
2116 	{
2117 		.name = "swappiness",
2118 		.read_u64 = mem_cgroup_swappiness_read,
2119 		.write_u64 = mem_cgroup_swappiness_write,
2120 	},
2121 	{
2122 		.name = "move_charge_at_immigrate",
2123 		.read_u64 = mem_cgroup_move_charge_read,
2124 		.write_u64 = mem_cgroup_move_charge_write,
2125 	},
2126 	{
2127 		.name = "oom_control",
2128 		.seq_show = mem_cgroup_oom_control_read,
2129 		.write_u64 = mem_cgroup_oom_control_write,
2130 	},
2131 	{
2132 		.name = "pressure_level",
2133 		.seq_show = mem_cgroup_dummy_seq_show,
2134 	},
2135 #ifdef CONFIG_NUMA
2136 	{
2137 		.name = "numa_stat",
2138 		.seq_show = memcg_numa_stat_show,
2139 	},
2140 #endif
2141 	{
2142 		.name = "kmem.limit_in_bytes",
2143 		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
2144 		.write = mem_cgroup_write,
2145 		.read_u64 = mem_cgroup_read_u64,
2146 	},
2147 	{
2148 		.name = "kmem.usage_in_bytes",
2149 		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
2150 		.read_u64 = mem_cgroup_read_u64,
2151 	},
2152 	{
2153 		.name = "kmem.failcnt",
2154 		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
2155 		.write = mem_cgroup_reset,
2156 		.read_u64 = mem_cgroup_read_u64,
2157 	},
2158 	{
2159 		.name = "kmem.max_usage_in_bytes",
2160 		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2161 		.write = mem_cgroup_reset,
2162 		.read_u64 = mem_cgroup_read_u64,
2163 	},
2164 #ifdef CONFIG_SLUB_DEBUG
2165 	{
2166 		.name = "kmem.slabinfo",
2167 		.seq_show = mem_cgroup_slab_show,
2168 	},
2169 #endif
2170 	{
2171 		.name = "kmem.tcp.limit_in_bytes",
2172 		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
2173 		.write = mem_cgroup_write,
2174 		.read_u64 = mem_cgroup_read_u64,
2175 	},
2176 	{
2177 		.name = "kmem.tcp.usage_in_bytes",
2178 		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
2179 		.read_u64 = mem_cgroup_read_u64,
2180 	},
2181 	{
2182 		.name = "kmem.tcp.failcnt",
2183 		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
2184 		.write = mem_cgroup_reset,
2185 		.read_u64 = mem_cgroup_read_u64,
2186 	},
2187 	{
2188 		.name = "kmem.tcp.max_usage_in_bytes",
2189 		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
2190 		.write = mem_cgroup_reset,
2191 		.read_u64 = mem_cgroup_read_u64,
2192 	},
2193 	{ },	/* terminate */
2194 };
2195 
2196 struct cftype memsw_files[] = {
2197 	{
2198 		.name = "memsw.usage_in_bytes",
2199 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2200 		.read_u64 = mem_cgroup_read_u64,
2201 	},
2202 	{
2203 		.name = "memsw.max_usage_in_bytes",
2204 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2205 		.write = mem_cgroup_reset,
2206 		.read_u64 = mem_cgroup_read_u64,
2207 	},
2208 	{
2209 		.name = "memsw.limit_in_bytes",
2210 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2211 		.write = mem_cgroup_write,
2212 		.read_u64 = mem_cgroup_read_u64,
2213 	},
2214 	{
2215 		.name = "memsw.failcnt",
2216 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2217 		.write = mem_cgroup_reset,
2218 		.read_u64 = mem_cgroup_read_u64,
2219 	},
2220 	{ },	/* terminate */
2221 };
2222 
2223 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2224 {
2225 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
2226 		if (nr_pages > 0)
2227 			page_counter_charge(&memcg->kmem, nr_pages);
2228 		else
2229 			page_counter_uncharge(&memcg->kmem, -nr_pages);
2230 	}
2231 }
2232 
2233 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
2234 			 gfp_t gfp_mask)
2235 {
2236 	struct page_counter *fail;
2237 
2238 	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
2239 		memcg->tcpmem_pressure = 0;
2240 		return true;
2241 	}
2242 	memcg->tcpmem_pressure = 1;
2243 	if (gfp_mask & __GFP_NOFAIL) {
2244 		page_counter_charge(&memcg->tcpmem, nr_pages);
2245 		return true;
2246 	}
2247 	return false;
2248 }
2249 
2250 bool memcg1_alloc_events(struct mem_cgroup *memcg)
2251 {
2252 	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
2253 						GFP_KERNEL_ACCOUNT);
2254 	return !!memcg->events_percpu;
2255 }
2256 
2257 void memcg1_free_events(struct mem_cgroup *memcg)
2258 {
2259 	free_percpu(memcg->events_percpu);
2260 }
2261 
2262 static int __init memcg1_init(void)
2263 {
2264 	int node;
2265 
2266 	for_each_node(node) {
2267 		struct mem_cgroup_tree_per_node *rtpn;
2268 
2269 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
2270 
2271 		rtpn->rb_root = RB_ROOT;
2272 		rtpn->rb_rightmost = NULL;
2273 		spin_lock_init(&rtpn->lock);
2274 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
2275 	}
2276 
2277 	return 0;
2278 }
2279 subsys_initcall(memcg1_init);
2280