xref: /linux/mm/memcontrol.c (revision cc4589ebfae6f8dbb5cf880a0a67eedab3416492)
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * Memory thresholds
10  * Copyright (C) 2009 Nokia Corporation
11  * Author: Kirill A. Shutemov
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  */
23 
24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h>
27 #include <linux/mm.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h>
30 #include <linux/smp.h>
31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h>
35 #include <linux/limits.h>
36 #include <linux/mutex.h>
37 #include <linux/rbtree.h>
38 #include <linux/slab.h>
39 #include <linux/swap.h>
40 #include <linux/swapops.h>
41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h>
43 #include <linux/sort.h>
44 #include <linux/fs.h>
45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h>
50 #include "internal.h"
51 
52 #include <asm/uaccess.h>
53 
54 #include <trace/events/vmscan.h>
55 
56 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
57 #define MEM_CGROUP_RECLAIM_RETRIES	5
58 struct mem_cgroup *root_mem_cgroup __read_mostly;
59 
60 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
61 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
62 int do_swap_account __read_mostly;
63 static int really_do_swap_account __initdata = 1; /* to remember the boot option */
64 #else
65 #define do_swap_account		(0)
66 #endif
67 
68 /*
69  * The per-memcg event counter is incremented at every pagein/pageout and is
70  * used to trigger periodic events. This is simpler and cheaper than using
71  * jiffies etc. to drive periodic memcg events.
72  *
73  * These values are used as !((event) & ((1 << (thresh)) - 1))
74  */
75 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
76 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
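/*
 * For example, with THRESHOLDS_EVENTS_THRESH == 7, (1 << 7) - 1 == 127, so
 * !((event) & 127) is true only when the event counter is a multiple of 128;
 * the threshold check therefore fires once every 128 pagein/pageout events
 * (and the softlimit check once every 1024).
 */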
77 
78 /*
79  * Statistics for memory cgroup.
80  */
81 enum mem_cgroup_stat_index {
82 	/*
83 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
84 	 */
85 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
86 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
87 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
88 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
89 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
90 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
92 
93 	MEM_CGROUP_STAT_NSTATS,
94 };
95 
96 struct mem_cgroup_stat_cpu {
97 	s64 count[MEM_CGROUP_STAT_NSTATS];
98 };
99 
100 /*
101  * per-zone information in memory controller.
102  */
103 struct mem_cgroup_per_zone {
104 	/*
105 	 * spin_lock to protect the per cgroup LRU
106 	 */
107 	struct list_head	lists[NR_LRU_LISTS];
108 	unsigned long		count[NR_LRU_LISTS];
109 
110 	struct zone_reclaim_stat reclaim_stat;
111 	struct rb_node		tree_node;	/* RB tree node */
112 	unsigned long long	usage_in_excess;/* Set to the value by which */
113 						/* the soft limit is exceeded*/
114 	bool			on_tree;
115 	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
116 						/* use container_of	   */
117 };
118 /* Macro for accessing counter */
119 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
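/*
 * e.g. MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON) is the number of pages on that
 * per-zone, per-memcg LRU list; mem_cgroup_get_local_zonestat() sums it over
 * all nodes and zones.
 */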
120 
121 struct mem_cgroup_per_node {
122 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
123 };
124 
125 struct mem_cgroup_lru_info {
126 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
127 };
128 
129 /*
130  * Cgroups above their limits are maintained in an RB-tree, independent of
131  * their hierarchy representation
132  */
133 
134 struct mem_cgroup_tree_per_zone {
135 	struct rb_root rb_root;
136 	spinlock_t lock;
137 };
138 
139 struct mem_cgroup_tree_per_node {
140 	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
141 };
142 
143 struct mem_cgroup_tree {
144 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
145 };
146 
147 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
148 
149 struct mem_cgroup_threshold {
150 	struct eventfd_ctx *eventfd;
151 	u64 threshold;
152 };
153 
154 /* For threshold */
155 struct mem_cgroup_threshold_ary {
156 	/* Index into entries[] of the threshold just below the current usage. */
157 	int current_threshold;
158 	/* Size of entries[] */
159 	unsigned int size;
160 	/* Array of thresholds */
161 	struct mem_cgroup_threshold entries[0];
162 };
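/*
 * entries[0] is a flexible array member: the structure is allocated with room
 * for "size" thresholds appended at the end.
 */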
163 
164 struct mem_cgroup_thresholds {
165 	/* Primary thresholds array */
166 	struct mem_cgroup_threshold_ary *primary;
167 	/*
168 	 * Spare threshold array.
169 	 * This is needed to make mem_cgroup_unregister_event() "never fail".
170 	 * It must be able to store at least primary->size - 1 entries.
171 	 */
172 	struct mem_cgroup_threshold_ary *spare;
173 };
174 
175 /* for OOM */
176 struct mem_cgroup_eventfd_list {
177 	struct list_head list;
178 	struct eventfd_ctx *eventfd;
179 };
180 
181 static void mem_cgroup_threshold(struct mem_cgroup *mem);
182 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
183 
184 /*
185  * The memory controller data structure. The memory controller controls both
186  * page cache and RSS per cgroup. We would eventually like to provide
187  * statistics along the lines of those developed by Rik van Riel for clock-pro,
188  * to help the administrator determine what knobs to tune.
189  *
190  * TODO: Add a water mark for the memory controller. Reclaim will begin when
191  * we hit the water mark. May be even add a low water mark, such that
192  * no reclaim occurs from a cgroup at it's low water mark, this is
193  * a feature that will be implemented much later in the future.
194  */
195 struct mem_cgroup {
196 	struct cgroup_subsys_state css;
197 	/*
198 	 * the counter to account for memory usage
199 	 */
200 	struct res_counter res;
201 	/*
202 	 * the counter to account for mem+swap usage.
203 	 */
204 	struct res_counter memsw;
205 	/*
206 	 * Per cgroup active and inactive list, similar to the
207 	 * per zone LRU lists.
208 	 */
209 	struct mem_cgroup_lru_info info;
210 
211 	/*
212 	 * protects reclaim-related members
213 	 */
214 	spinlock_t reclaim_param_lock;
215 
216 	/*
217 	 * While reclaiming in a hierarchy, we cache the last child we
218 	 * reclaimed from.
219 	 */
220 	int last_scanned_child;
221 	/*
222 	 * Should the accounting and control be hierarchical, per subtree?
223 	 */
224 	bool use_hierarchy;
225 	atomic_t	oom_lock;
226 	atomic_t	refcnt;
227 
228 	unsigned int	swappiness;
229 	/* OOM-Killer disable */
230 	int		oom_kill_disable;
231 
232 	/* set when res.limit == memsw.limit */
233 	bool		memsw_is_minimum;
234 
235 	/* protect arrays of thresholds */
236 	struct mutex thresholds_lock;
237 
238 	/* thresholds for memory usage. RCU-protected */
239 	struct mem_cgroup_thresholds thresholds;
240 
241 	/* thresholds for mem+swap usage. RCU-protected */
242 	struct mem_cgroup_thresholds memsw_thresholds;
243 
244 	/* For oom notifier event fd */
245 	struct list_head oom_notify;
246 
247 	/*
248 	 * Should we move charges of a task when a task is moved into this
249 	 * mem_cgroup? And what type of charges should we move?
250 	 */
251 	unsigned long 	move_charge_at_immigrate;
252 	/*
253 	 * percpu counter.
254 	 */
255 	struct mem_cgroup_stat_cpu *stat;
256 };
257 
258 /* Stuff for moving charges at task migration. */
259 /*
260  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
261  * left-shifted bitmap of these types.
262  */
263 enum move_type {
264 	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
265 	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
266 	NR_MOVE_TYPE,
267 };
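/*
 * For example, writing 3 (bit 0 | bit 1) to memory.move_charge_at_immigrate
 * enables moving both anonymous and file charges; move_anon()/move_file()
 * below test the corresponding bits of mc.to->move_charge_at_immigrate.
 */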
268 
269 /* "mc" and its members are protected by cgroup_mutex */
270 static struct move_charge_struct {
271 	struct mem_cgroup *from;
272 	struct mem_cgroup *to;
273 	unsigned long precharge;
274 	unsigned long moved_charge;
275 	unsigned long moved_swap;
276 	struct task_struct *moving_task;	/* a task moving charges */
277 	wait_queue_head_t waitq;		/* a waitq for other context */
278 } mc = {
279 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
280 };
281 
282 static bool move_anon(void)
283 {
284 	return test_bit(MOVE_CHARGE_TYPE_ANON,
285 					&mc.to->move_charge_at_immigrate);
286 }
287 
288 static bool move_file(void)
289 {
290 	return test_bit(MOVE_CHARGE_TYPE_FILE,
291 					&mc.to->move_charge_at_immigrate);
292 }
293 
294 /*
295  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
296  * limit reclaim to prevent infinite loops, if they ever occur.
297  */
298 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
299 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
300 
301 enum charge_type {
302 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
303 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
304 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
305 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
306 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
307 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
308 	NR_CHARGE_TYPE,
309 };
310 
311 /* only for here (for easy reading.) */
312 #define PCGF_CACHE	(1UL << PCG_CACHE)
313 #define PCGF_USED	(1UL << PCG_USED)
314 #define PCGF_LOCK	(1UL << PCG_LOCK)
315 /* Not used, but added here for completeness */
316 #define PCGF_ACCT	(1UL << PCG_ACCT)
317 
318 /* for encoding cft->private value on file */
319 #define _MEM			(0)
320 #define _MEMSWAP		(1)
321 #define _OOM_TYPE		(2)
322 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
323 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
324 #define MEMFILE_ATTR(val)	((val) & 0xffff)
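/*
 * Example: MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the counter type into
 * the upper 16 bits and the res_counter attribute into the lower 16 bits;
 * MEMFILE_TYPE() and MEMFILE_ATTR() recover the two halves again.
 */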
325 /* Used for OOM notifier */
326 #define OOM_CONTROL		(0)
327 
328 /*
329  * Reclaim flags for mem_cgroup_hierarchical_reclaim
330  */
331 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
332 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
333 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
334 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
335 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
336 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
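/*
 * Callers combine these as a bitmask, e.g.
 *	reclaim_options = MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK;
 * and mem_cgroup_hierarchical_reclaim() tests each bit with '&'.
 */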
337 
338 static void mem_cgroup_get(struct mem_cgroup *mem);
339 static void mem_cgroup_put(struct mem_cgroup *mem);
340 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
341 static void drain_all_stock_async(void);
342 
343 static struct mem_cgroup_per_zone *
344 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
345 {
346 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
347 }
348 
349 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
350 {
351 	return &mem->css;
352 }
353 
354 static struct mem_cgroup_per_zone *
355 page_cgroup_zoneinfo(struct page_cgroup *pc)
356 {
357 	struct mem_cgroup *mem = pc->mem_cgroup;
358 	int nid = page_cgroup_nid(pc);
359 	int zid = page_cgroup_zid(pc);
360 
361 	if (!mem)
362 		return NULL;
363 
364 	return mem_cgroup_zoneinfo(mem, nid, zid);
365 }
366 
367 static struct mem_cgroup_tree_per_zone *
368 soft_limit_tree_node_zone(int nid, int zid)
369 {
370 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
371 }
372 
373 static struct mem_cgroup_tree_per_zone *
374 soft_limit_tree_from_page(struct page *page)
375 {
376 	int nid = page_to_nid(page);
377 	int zid = page_zonenum(page);
378 
379 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
380 }
381 
382 static void
383 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
384 				struct mem_cgroup_per_zone *mz,
385 				struct mem_cgroup_tree_per_zone *mctz,
386 				unsigned long long new_usage_in_excess)
387 {
388 	struct rb_node **p = &mctz->rb_root.rb_node;
389 	struct rb_node *parent = NULL;
390 	struct mem_cgroup_per_zone *mz_node;
391 
392 	if (mz->on_tree)
393 		return;
394 
395 	mz->usage_in_excess = new_usage_in_excess;
396 	if (!mz->usage_in_excess)
397 		return;
398 	while (*p) {
399 		parent = *p;
400 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
401 					tree_node);
402 		if (mz->usage_in_excess < mz_node->usage_in_excess)
403 			p = &(*p)->rb_left;
404 		/*
405 		 * We can't avoid mem cgroups that are over their soft
406 		 * limit by the same amount
407 		 */
408 		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
409 			p = &(*p)->rb_right;
410 	}
411 	rb_link_node(&mz->tree_node, parent, p);
412 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
413 	mz->on_tree = true;
414 }
415 
416 static void
417 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
418 				struct mem_cgroup_per_zone *mz,
419 				struct mem_cgroup_tree_per_zone *mctz)
420 {
421 	if (!mz->on_tree)
422 		return;
423 	rb_erase(&mz->tree_node, &mctz->rb_root);
424 	mz->on_tree = false;
425 }
426 
427 static void
428 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
429 				struct mem_cgroup_per_zone *mz,
430 				struct mem_cgroup_tree_per_zone *mctz)
431 {
432 	spin_lock(&mctz->lock);
433 	__mem_cgroup_remove_exceeded(mem, mz, mctz);
434 	spin_unlock(&mctz->lock);
435 }
436 
437 
438 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
439 {
440 	unsigned long long excess;
441 	struct mem_cgroup_per_zone *mz;
442 	struct mem_cgroup_tree_per_zone *mctz;
443 	int nid = page_to_nid(page);
444 	int zid = page_zonenum(page);
445 	mctz = soft_limit_tree_from_page(page);
446 
447 	/*
448 	 * Necessary to update all ancestors when hierarchy is used,
449 	 * because their event counters are not touched.
450 	 */
451 	for (; mem; mem = parent_mem_cgroup(mem)) {
452 		mz = mem_cgroup_zoneinfo(mem, nid, zid);
453 		excess = res_counter_soft_limit_excess(&mem->res);
454 		/*
455 		 * We have to update the tree if mz is on RB-tree or
456 		 * mem is over its softlimit.
457 		 */
458 		if (excess || mz->on_tree) {
459 			spin_lock(&mctz->lock);
460 			/* if on-tree, remove it */
461 			if (mz->on_tree)
462 				__mem_cgroup_remove_exceeded(mem, mz, mctz);
463 			/*
464 			 * Insert again. mz->usage_in_excess will be updated.
465 			 * If excess is 0, no tree ops.
466 			 */
467 			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
468 			spin_unlock(&mctz->lock);
469 		}
470 	}
471 }
472 
473 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
474 {
475 	int node, zone;
476 	struct mem_cgroup_per_zone *mz;
477 	struct mem_cgroup_tree_per_zone *mctz;
478 
479 	for_each_node_state(node, N_POSSIBLE) {
480 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
481 			mz = mem_cgroup_zoneinfo(mem, node, zone);
482 			mctz = soft_limit_tree_node_zone(node, zone);
483 			mem_cgroup_remove_exceeded(mem, mz, mctz);
484 		}
485 	}
486 }
487 
488 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
489 {
490 	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
491 }
492 
493 static struct mem_cgroup_per_zone *
494 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
495 {
496 	struct rb_node *rightmost = NULL;
497 	struct mem_cgroup_per_zone *mz;
498 
499 retry:
500 	mz = NULL;
501 	rightmost = rb_last(&mctz->rb_root);
502 	if (!rightmost)
503 		goto done;		/* Nothing to reclaim from */
504 
505 	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
506 	/*
507 	 * Remove the node now but someone else can add it back,
508 	 * we will add it back at the end of reclaim to its correct
509 	 * position in the tree.
510 	 */
511 	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
512 	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
513 		!css_tryget(&mz->mem->css))
514 		goto retry;
515 done:
516 	return mz;
517 }
518 
519 static struct mem_cgroup_per_zone *
520 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
521 {
522 	struct mem_cgroup_per_zone *mz;
523 
524 	spin_lock(&mctz->lock);
525 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
526 	spin_unlock(&mctz->lock);
527 	return mz;
528 }
529 
530 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
531 		enum mem_cgroup_stat_index idx)
532 {
533 	int cpu;
534 	s64 val = 0;
535 
536 	for_each_possible_cpu(cpu)
537 		val += per_cpu(mem->stat->count[idx], cpu);
538 	return val;
539 }
540 
541 static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
542 {
543 	s64 ret;
544 
545 	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
546 	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
547 	return ret;
548 }
549 
550 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
551 					 bool charge)
552 {
553 	int val = (charge) ? 1 : -1;
554 	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
555 }
556 
557 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
558 					 struct page_cgroup *pc,
559 					 bool charge)
560 {
561 	int val = (charge) ? 1 : -1;
562 
563 	preempt_disable();
564 
565 	if (PageCgroupCache(pc))
566 		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
567 	else
568 		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
569 
570 	if (charge)
571 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
572 	else
573 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
574 	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
575 
576 	preempt_enable();
577 }
578 
579 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
580 					enum lru_list idx)
581 {
582 	int nid, zid;
583 	struct mem_cgroup_per_zone *mz;
584 	u64 total = 0;
585 
586 	for_each_online_node(nid)
587 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
588 			mz = mem_cgroup_zoneinfo(mem, nid, zid);
589 			total += MEM_CGROUP_ZSTAT(mz, idx);
590 		}
591 	return total;
592 }
593 
594 static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
595 {
596 	s64 val;
597 
598 	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
599 
600 	return !(val & ((1 << event_mask_shift) - 1));
601 }
602 
603 /*
604  * Check events in order.
605  *
606  */
607 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
608 {
609 	/* threshold event is triggered in finer grain than soft limit */
610 	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
611 		mem_cgroup_threshold(mem);
612 		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
613 			mem_cgroup_update_tree(mem, page);
614 	}
615 }
616 
617 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
618 {
619 	return container_of(cgroup_subsys_state(cont,
620 				mem_cgroup_subsys_id), struct mem_cgroup,
621 				css);
622 }
623 
624 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
625 {
626 	/*
627 	 * mm_update_next_owner() may clear mm->owner to NULL
628 	 * if it races with swapoff, page migration, etc.
629 	 * So this can be called with p == NULL.
630 	 */
631 	if (unlikely(!p))
632 		return NULL;
633 
634 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
635 				struct mem_cgroup, css);
636 }
637 
638 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
639 {
640 	struct mem_cgroup *mem = NULL;
641 
642 	if (!mm)
643 		return NULL;
644 	/*
645 	 * Because we have no locks, mm->owner may be being moved to another
646 	 * cgroup. We use css_tryget() here even if this looks
647 	 * pessimistic (rather than adding locks here).
648 	 */
649 	rcu_read_lock();
650 	do {
651 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
652 		if (unlikely(!mem))
653 			break;
654 	} while (!css_tryget(&mem->css));
655 	rcu_read_unlock();
656 	return mem;
657 }
658 
659 /*
660  * Call the callback function against all cgroups under the hierarchy tree.
661  */
662 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
663 			  int (*func)(struct mem_cgroup *, void *))
664 {
665 	int found, ret, nextid;
666 	struct cgroup_subsys_state *css;
667 	struct mem_cgroup *mem;
668 
669 	if (!root->use_hierarchy)
670 		return (*func)(root, data);
671 
672 	nextid = 1;
673 	do {
674 		ret = 0;
675 		mem = NULL;
676 
677 		rcu_read_lock();
678 		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
679 				   &found);
680 		if (css && css_tryget(css))
681 			mem = container_of(css, struct mem_cgroup, css);
682 		rcu_read_unlock();
683 
684 		if (mem) {
685 			ret = (*func)(mem, data);
686 			css_put(&mem->css);
687 		}
688 		nextid = found + 1;
689 	} while (!ret && css);
690 
691 	return ret;
692 }
693 
694 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
695 {
696 	return (mem == root_mem_cgroup);
697 }
698 
699 /*
700  * The following LRU functions are allowed to be used without PCG_LOCK.
701  * They are called by the global LRU code, independently of memcg.
702  * What we have to take care of here is the validity of pc->mem_cgroup.
703  *
704  * Changes to pc->mem_cgroup happen when
705  * 1. charging
706  * 2. moving account
707  * In the typical case, "charge" is done before add-to-lru. The exception is
708  * SwapCache, which is added to the LRU before being charged.
709  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
710  * When moving account, the page is not on the LRU; it is isolated.
711  */
712 
713 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
714 {
715 	struct page_cgroup *pc;
716 	struct mem_cgroup_per_zone *mz;
717 
718 	if (mem_cgroup_disabled())
719 		return;
720 	pc = lookup_page_cgroup(page);
721 	/* can happen while we handle swapcache. */
722 	if (!TestClearPageCgroupAcctLRU(pc))
723 		return;
724 	VM_BUG_ON(!pc->mem_cgroup);
725 	/*
726 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
727 	 * removed from global LRU.
728 	 */
729 	mz = page_cgroup_zoneinfo(pc);
730 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
731 	if (mem_cgroup_is_root(pc->mem_cgroup))
732 		return;
733 	VM_BUG_ON(list_empty(&pc->lru));
734 	list_del_init(&pc->lru);
735 	return;
736 }
737 
738 void mem_cgroup_del_lru(struct page *page)
739 {
740 	mem_cgroup_del_lru_list(page, page_lru(page));
741 }
742 
743 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
744 {
745 	struct mem_cgroup_per_zone *mz;
746 	struct page_cgroup *pc;
747 
748 	if (mem_cgroup_disabled())
749 		return;
750 
751 	pc = lookup_page_cgroup(page);
752 	/*
753 	 * Used bit is set without atomic ops but after smp_wmb().
754 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
755 	 */
756 	smp_rmb();
757 	/* unused or root page is not rotated. */
758 	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
759 		return;
760 	mz = page_cgroup_zoneinfo(pc);
761 	list_move(&pc->lru, &mz->lists[lru]);
762 }
763 
764 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
765 {
766 	struct page_cgroup *pc;
767 	struct mem_cgroup_per_zone *mz;
768 
769 	if (mem_cgroup_disabled())
770 		return;
771 	pc = lookup_page_cgroup(page);
772 	VM_BUG_ON(PageCgroupAcctLRU(pc));
773 	/*
774 	 * Used bit is set without atomic ops but after smp_wmb().
775 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
776 	 */
777 	smp_rmb();
778 	if (!PageCgroupUsed(pc))
779 		return;
780 
781 	mz = page_cgroup_zoneinfo(pc);
782 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
783 	SetPageCgroupAcctLRU(pc);
784 	if (mem_cgroup_is_root(pc->mem_cgroup))
785 		return;
786 	list_add(&pc->lru, &mz->lists[lru]);
787 }
788 
789 /*
790  * When handling SwapCache, pc->mem_cgroup may be changed while the page is
791  * linked to the LRU, because the page may be reused after it is fully
792  * uncharged (because of SwapCache behavior). To handle that, unlink the
793  * page_cgroup from the LRU when charging it again. This function is only used
794  * to charge SwapCache. It is called under lock_page() and zone->lru_lock is expected never to be held.
795  */
796 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
797 {
798 	unsigned long flags;
799 	struct zone *zone = page_zone(page);
800 	struct page_cgroup *pc = lookup_page_cgroup(page);
801 
802 	spin_lock_irqsave(&zone->lru_lock, flags);
803 	/*
804 	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
805 	 * is guarded by lock_page() because the page is SwapCache.
806 	 */
807 	if (!PageCgroupUsed(pc))
808 		mem_cgroup_del_lru_list(page, page_lru(page));
809 	spin_unlock_irqrestore(&zone->lru_lock, flags);
810 }
811 
812 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
813 {
814 	unsigned long flags;
815 	struct zone *zone = page_zone(page);
816 	struct page_cgroup *pc = lookup_page_cgroup(page);
817 
818 	spin_lock_irqsave(&zone->lru_lock, flags);
819 	/* link when the page is linked to LRU but page_cgroup isn't */
820 	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
821 		mem_cgroup_add_lru_list(page, page_lru(page));
822 	spin_unlock_irqrestore(&zone->lru_lock, flags);
823 }
824 
825 
826 void mem_cgroup_move_lists(struct page *page,
827 			   enum lru_list from, enum lru_list to)
828 {
829 	if (mem_cgroup_disabled())
830 		return;
831 	mem_cgroup_del_lru_list(page, from);
832 	mem_cgroup_add_lru_list(page, to);
833 }
834 
835 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
836 {
837 	int ret;
838 	struct mem_cgroup *curr = NULL;
839 
840 	task_lock(task);
841 	rcu_read_lock();
842 	curr = try_get_mem_cgroup_from_mm(task->mm);
843 	rcu_read_unlock();
844 	task_unlock(task);
845 	if (!curr)
846 		return 0;
847 	/*
848 	 * We should check use_hierarchy of "mem", not "curr", because checking
849 	 * use_hierarchy of "curr" here would make this function return true if
850 	 * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
851 	 * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
852 	 */
853 	if (mem->use_hierarchy)
854 		ret = css_is_ancestor(&curr->css, &mem->css);
855 	else
856 		ret = (curr == mem);
857 	css_put(&curr->css);
858 	return ret;
859 }
860 
861 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
862 {
863 	unsigned long active;
864 	unsigned long inactive;
865 	unsigned long gb;
866 	unsigned long inactive_ratio;
867 
868 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
869 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
870 
871 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
872 	if (gb)
873 		inactive_ratio = int_sqrt(10 * gb);
874 	else
875 		inactive_ratio = 1;
876 
877 	if (present_pages) {
878 		present_pages[0] = inactive;
879 		present_pages[1] = active;
880 	}
881 
882 	return inactive_ratio;
883 }
884 
885 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
886 {
887 	unsigned long active;
888 	unsigned long inactive;
889 	unsigned long present_pages[2];
890 	unsigned long inactive_ratio;
891 
892 	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
893 
894 	inactive = present_pages[0];
895 	active = present_pages[1];
896 
897 	if (inactive * inactive_ratio < active)
898 		return 1;
899 
900 	return 0;
901 }
902 
903 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
904 {
905 	unsigned long active;
906 	unsigned long inactive;
907 
908 	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
909 	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
910 
911 	return (active > inactive);
912 }
913 
914 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
915 				       struct zone *zone,
916 				       enum lru_list lru)
917 {
918 	int nid = zone->zone_pgdat->node_id;
919 	int zid = zone_idx(zone);
920 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
921 
922 	return MEM_CGROUP_ZSTAT(mz, lru);
923 }
924 
925 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
926 						      struct zone *zone)
927 {
928 	int nid = zone->zone_pgdat->node_id;
929 	int zid = zone_idx(zone);
930 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
931 
932 	return &mz->reclaim_stat;
933 }
934 
935 struct zone_reclaim_stat *
936 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
937 {
938 	struct page_cgroup *pc;
939 	struct mem_cgroup_per_zone *mz;
940 
941 	if (mem_cgroup_disabled())
942 		return NULL;
943 
944 	pc = lookup_page_cgroup(page);
945 	/*
946 	 * Used bit is set without atomic ops but after smp_wmb().
947 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
948 	 */
949 	smp_rmb();
950 	if (!PageCgroupUsed(pc))
951 		return NULL;
952 
953 	mz = page_cgroup_zoneinfo(pc);
954 	if (!mz)
955 		return NULL;
956 
957 	return &mz->reclaim_stat;
958 }
959 
960 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
961 					struct list_head *dst,
962 					unsigned long *scanned, int order,
963 					int mode, struct zone *z,
964 					struct mem_cgroup *mem_cont,
965 					int active, int file)
966 {
967 	unsigned long nr_taken = 0;
968 	struct page *page;
969 	unsigned long scan;
970 	LIST_HEAD(pc_list);
971 	struct list_head *src;
972 	struct page_cgroup *pc, *tmp;
973 	int nid = z->zone_pgdat->node_id;
974 	int zid = zone_idx(z);
975 	struct mem_cgroup_per_zone *mz;
976 	int lru = LRU_FILE * file + active;
977 	int ret;
978 
979 	BUG_ON(!mem_cont);
980 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
981 	src = &mz->lists[lru];
982 
983 	scan = 0;
984 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
985 		if (scan >= nr_to_scan)
986 			break;
987 
988 		page = pc->page;
989 		if (unlikely(!PageCgroupUsed(pc)))
990 			continue;
991 		if (unlikely(!PageLRU(page)))
992 			continue;
993 
994 		scan++;
995 		ret = __isolate_lru_page(page, mode, file);
996 		switch (ret) {
997 		case 0:
998 			list_move(&page->lru, dst);
999 			mem_cgroup_del_lru(page);
1000 			nr_taken++;
1001 			break;
1002 		case -EBUSY:
1003 			/* we don't affect global LRU but rotate in our LRU */
1004 			mem_cgroup_rotate_lru_list(page, page_lru(page));
1005 			break;
1006 		default:
1007 			break;
1008 		}
1009 	}
1010 
1011 	*scanned = scan;
1012 
1013 	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1014 				      0, 0, 0, mode);
1015 
1016 	return nr_taken;
1017 }
1018 
1019 #define mem_cgroup_from_res_counter(counter, member)	\
1020 	container_of(counter, struct mem_cgroup, member)
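/*
 * e.g. mem_cgroup_from_res_counter(fail_res, memsw) maps a failing memsw
 * res_counter back to its owning mem_cgroup via container_of().
 */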
1021 
1022 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1023 {
1024 	if (do_swap_account) {
1025 		if (res_counter_check_under_limit(&mem->res) &&
1026 			res_counter_check_under_limit(&mem->memsw))
1027 			return true;
1028 	} else
1029 		if (res_counter_check_under_limit(&mem->res))
1030 			return true;
1031 	return false;
1032 }
1033 
1034 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1035 {
1036 	struct cgroup *cgrp = memcg->css.cgroup;
1037 	unsigned int swappiness;
1038 
1039 	/* root ? */
1040 	if (cgrp->parent == NULL)
1041 		return vm_swappiness;
1042 
1043 	spin_lock(&memcg->reclaim_param_lock);
1044 	swappiness = memcg->swappiness;
1045 	spin_unlock(&memcg->reclaim_param_lock);
1046 
1047 	return swappiness;
1048 }
1049 
1050 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1051 {
1052 	int *val = data;
1053 	(*val)++;
1054 	return 0;
1055 }
1056 
1057 /**
1058  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1059  * @memcg: The memory cgroup that went over limit
1060  * @p: Task that is going to be killed
1061  *
1062  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1063  * enabled
1064  */
1065 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1066 {
1067 	struct cgroup *task_cgrp;
1068 	struct cgroup *mem_cgrp;
1069 	/*
1070 	 * Need a buffer in BSS, can't rely on allocations. The code relies
1071 	 * on the assumption that OOM is serialized for memory controller.
1072 	 * If this assumption is broken, revisit this code.
1073 	 */
1074 	static char memcg_name[PATH_MAX];
1075 	int ret;
1076 
1077 	if (!memcg || !p)
1078 		return;
1079 
1080 
1081 	rcu_read_lock();
1082 
1083 	mem_cgrp = memcg->css.cgroup;
1084 	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1085 
1086 	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1087 	if (ret < 0) {
1088 		/*
1089 		 * Unfortunately, we are unable to convert to a useful name,
1090 		 * but we'll still print out the usage information.
1091 		 */
1092 		rcu_read_unlock();
1093 		goto done;
1094 	}
1095 	rcu_read_unlock();
1096 
1097 	printk(KERN_INFO "Task in %s killed", memcg_name);
1098 
1099 	rcu_read_lock();
1100 	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1101 	if (ret < 0) {
1102 		rcu_read_unlock();
1103 		goto done;
1104 	}
1105 	rcu_read_unlock();
1106 
1107 	/*
1108 	 * Continues from above, so we don't need a KERN_ level
1109 	 */
1110 	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1111 done:
1112 
1113 	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1114 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1115 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1116 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
1117 	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1118 		"failcnt %llu\n",
1119 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1120 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1121 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1122 }
1123 
1124 /*
1125  * This function returns the number of memcgs under the hierarchy tree.
1126  * Returns 1 (self count) if there are no children.
1127  */
1128 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1129 {
1130 	int num = 0;
1131  	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1132 	return num;
1133 }
1134 
1135 /*
1136  * Return the memory (and swap, if configured) limit for a memcg.
1137  */
1138 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1139 {
1140 	u64 limit;
1141 	u64 memsw;
1142 
1143 	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1144 			total_swap_pages;
1145 	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1146 	/*
1147 	 * If memsw is finite and limits the amount of swap space available
1148 	 * to this memcg, return that limit.
1149 	 */
1150 	return min(limit, memsw);
1151 }
1152 
1153 /*
1154  * Visit the first child (need not be the first child as per the ordering
1155  * of the cgroup list, since we track last_scanned_child) of @mem and use
1156  * that to reclaim free pages from.
1157  */
1158 static struct mem_cgroup *
1159 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1160 {
1161 	struct mem_cgroup *ret = NULL;
1162 	struct cgroup_subsys_state *css;
1163 	int nextid, found;
1164 
1165 	if (!root_mem->use_hierarchy) {
1166 		css_get(&root_mem->css);
1167 		ret = root_mem;
1168 	}
1169 
1170 	while (!ret) {
1171 		rcu_read_lock();
1172 		nextid = root_mem->last_scanned_child + 1;
1173 		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1174 				   &found);
1175 		if (css && css_tryget(css))
1176 			ret = container_of(css, struct mem_cgroup, css);
1177 
1178 		rcu_read_unlock();
1179 		/* Updates scanning parameter */
1180 		spin_lock(&root_mem->reclaim_param_lock);
1181 		if (!css) {
1182 			/* this means start scan from ID:1 */
1183 			root_mem->last_scanned_child = 0;
1184 		} else
1185 			root_mem->last_scanned_child = found;
1186 		spin_unlock(&root_mem->reclaim_param_lock);
1187 	}
1188 
1189 	return ret;
1190 }
1191 
1192 /*
1193  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1194  * we reclaimed from, so that we don't end up penalizing one child extensively
1195  * based on its position in the children list.
1196  *
1197  * root_mem is the original ancestor that we've been reclaiming from.
1198  *
1199  * We give up and return to the caller when we visit root_mem twice.
1200  * (other groups can be removed while we're walking....)
1201  *
1202  * If shrink==true, this returns immediately to avoid freeing too much.
1203  */
1204 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1205 						struct zone *zone,
1206 						gfp_t gfp_mask,
1207 						unsigned long reclaim_options)
1208 {
1209 	struct mem_cgroup *victim;
1210 	int ret, total = 0;
1211 	int loop = 0;
1212 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1213 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1214 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1215 	unsigned long excess = mem_cgroup_get_excess(root_mem);
1216 
1217 	/* If memsw_is_minimum==1, swap-out is of no use. */
1218 	if (root_mem->memsw_is_minimum)
1219 		noswap = true;
1220 
1221 	while (1) {
1222 		victim = mem_cgroup_select_victim(root_mem);
1223 		if (victim == root_mem) {
1224 			loop++;
1225 			if (loop >= 1)
1226 				drain_all_stock_async();
1227 			if (loop >= 2) {
1228 				/*
1229 				 * If we have not been able to reclaim
1230 				 * anything, it might be because there are
1231 				 * no reclaimable pages under this hierarchy
1232 				 */
1233 				if (!check_soft || !total) {
1234 					css_put(&victim->css);
1235 					break;
1236 				}
1237 				/*
1238 				 * We want to do more targeted reclaim.
1239 				 * excess >> 2 is not too large, so we don't
1240 				 * reclaim too much, nor too small, so we don't
1241 				 * keep coming back to reclaim from this cgroup.
1242 				 */
1243 				if (total >= (excess >> 2) ||
1244 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1245 					css_put(&victim->css);
1246 					break;
1247 				}
1248 			}
1249 		}
1250 		if (!mem_cgroup_local_usage(victim)) {
1251 			/* this cgroup's local usage == 0 */
1252 			css_put(&victim->css);
1253 			continue;
1254 		}
1255 		/* we use swappiness of local cgroup */
1256 		if (check_soft)
1257 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1258 				noswap, get_swappiness(victim), zone,
1259 				zone->zone_pgdat->node_id);
1260 		else
1261 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1262 						noswap, get_swappiness(victim));
1263 		css_put(&victim->css);
1264 		/*
1265 		 * When shrinking usage, we can't tell whether we should stop here
1266 		 * or reclaim more; that depends on the caller. last_scanned_child
1267 		 * is enough to keep fairness under the tree.
1268 		 */
1269 		if (shrink)
1270 			return ret;
1271 		total += ret;
1272 		if (check_soft) {
1273 			if (res_counter_check_under_soft_limit(&root_mem->res))
1274 				return total;
1275 		} else if (mem_cgroup_check_under_limit(root_mem))
1276 			return 1 + total;
1277 	}
1278 	return total;
1279 }
1280 
1281 static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1282 {
1283 	int *val = (int *)data;
1284 	int x;
1285 	/*
1286 	 * Logically, we can stop scanning immediately when we find
1287 	 * a memcg is already locked. But considering unlock ops and
1288 	 * creation/removal of memcgs, scanning all is the simpler operation.
1289 	 */
1290 	x = atomic_inc_return(&mem->oom_lock);
1291 	*val = max(x, *val);
1292 	return 0;
1293 }
1294 /*
1295  * Check whether the OOM killer is already running under our hierarchy.
1296  * If someone is already running it, return false.
1297  */
1298 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1299 {
1300 	int lock_count = 0;
1301 
1302 	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1303 
1304 	if (lock_count == 1)
1305 		return true;
1306 	return false;
1307 }
1308 
1309 static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1310 {
1311 	/*
1312 	 * When a new child is created while the hierarchy is under oom,
1313 	 * mem_cgroup_oom_lock() may not be called. We have to use
1314 	 * atomic_add_unless() here.
1315 	 */
1316 	atomic_add_unless(&mem->oom_lock, -1, 0);
1317 	return 0;
1318 }
1319 
1320 static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1321 {
1322 	mem_cgroup_walk_tree(mem, NULL,	mem_cgroup_oom_unlock_cb);
1323 }
1324 
1325 static DEFINE_MUTEX(memcg_oom_mutex);
1326 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1327 
1328 struct oom_wait_info {
1329 	struct mem_cgroup *mem;
1330 	wait_queue_t	wait;
1331 };
1332 
1333 static int memcg_oom_wake_function(wait_queue_t *wait,
1334 	unsigned mode, int sync, void *arg)
1335 {
1336 	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1337 	struct oom_wait_info *oom_wait_info;
1338 
1339 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1340 
1341 	if (oom_wait_info->mem == wake_mem)
1342 		goto wakeup;
1343 	/* if no hierarchy, no match */
1344 	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1345 		return 0;
1346 	/*
1347 	 * Both oom_wait_info->mem and wake_mem are stable under us,
1348 	 * so we can use css_is_ancestor() without worrying about RCU.
1349 	 */
1350 	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1351 	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1352 		return 0;
1353 
1354 wakeup:
1355 	return autoremove_wake_function(wait, mode, sync, arg);
1356 }
1357 
1358 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1359 {
1360 	/* for filtering, pass "mem" as argument. */
1361 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1362 }
1363 
1364 static void memcg_oom_recover(struct mem_cgroup *mem)
1365 {
1366 	if (atomic_read(&mem->oom_lock))
1367 		memcg_wakeup_oom(mem);
1368 }
1369 
1370 /*
1371  * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1372  */
1373 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1374 {
1375 	struct oom_wait_info owait;
1376 	bool locked, need_to_kill;
1377 
1378 	owait.mem = mem;
1379 	owait.wait.flags = 0;
1380 	owait.wait.func = memcg_oom_wake_function;
1381 	owait.wait.private = current;
1382 	INIT_LIST_HEAD(&owait.wait.task_list);
1383 	need_to_kill = true;
1384 	/* At first, try to OOM lock hierarchy under mem.*/
1385 	mutex_lock(&memcg_oom_mutex);
1386 	locked = mem_cgroup_oom_lock(mem);
1387 	/*
1388 	 * Even if signal_pending(), we can't quit charge() loop without
1389 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1390 	 * under OOM is always welcome, so use TASK_KILLABLE here.
1391 	 */
1392 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1393 	if (!locked || mem->oom_kill_disable)
1394 		need_to_kill = false;
1395 	if (locked)
1396 		mem_cgroup_oom_notify(mem);
1397 	mutex_unlock(&memcg_oom_mutex);
1398 
1399 	if (need_to_kill) {
1400 		finish_wait(&memcg_oom_waitq, &owait.wait);
1401 		mem_cgroup_out_of_memory(mem, mask);
1402 	} else {
1403 		schedule();
1404 		finish_wait(&memcg_oom_waitq, &owait.wait);
1405 	}
1406 	mutex_lock(&memcg_oom_mutex);
1407 	mem_cgroup_oom_unlock(mem);
1408 	memcg_wakeup_oom(mem);
1409 	mutex_unlock(&memcg_oom_mutex);
1410 
1411 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1412 		return false;
1413 	/* Give chance to dying process */
1414 	schedule_timeout(1);
1415 	return true;
1416 }
1417 
1418 /*
1419  * Currently used to update mapped file statistics, but the routine can be
1420  * generalized to update other statistics as well.
1421  */
1422 void mem_cgroup_update_file_mapped(struct page *page, int val)
1423 {
1424 	struct mem_cgroup *mem;
1425 	struct page_cgroup *pc;
1426 
1427 	pc = lookup_page_cgroup(page);
1428 	if (unlikely(!pc))
1429 		return;
1430 
1431 	lock_page_cgroup(pc);
1432 	mem = pc->mem_cgroup;
1433 	if (!mem || !PageCgroupUsed(pc))
1434 		goto done;
1435 
1436 	/*
1437 	 * Preemption is already disabled. We can use __this_cpu_xxx
1438 	 */
1439 	if (val > 0) {
1440 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1441 		SetPageCgroupFileMapped(pc);
1442 	} else {
1443 		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1444 		ClearPageCgroupFileMapped(pc);
1445 	}
1446 
1447 done:
1448 	unlock_page_cgroup(pc);
1449 }
1450 
1451 /*
1452  * size of first charge trial. "32" comes from vmscan.c's magic value.
1453  * TODO: bigger numbers may be necessary on big iron.
1454  */
1455 #define CHARGE_SIZE	(32 * PAGE_SIZE)
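/*
 * i.e. each res_counter charge grabs 32 pages at once; the surplus
 * (csize - PAGE_SIZE) is parked in the per-cpu stock by refill_stock() and
 * handed out one page at a time by consume_stock().
 */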
1456 struct memcg_stock_pcp {
1457 	struct mem_cgroup *cached; /* this is never the root cgroup */
1458 	int charge;
1459 	struct work_struct work;
1460 };
1461 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1462 static atomic_t memcg_drain_count;
1463 
1464 /*
1465  * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed
1466  * from the local stock and true is returned. If the stock is 0 or holds charges
1467  * from a cgroup which is not the current target, false is returned and the
1468  * stock will be refilled.
1469  */
1470 static bool consume_stock(struct mem_cgroup *mem)
1471 {
1472 	struct memcg_stock_pcp *stock;
1473 	bool ret = true;
1474 
1475 	stock = &get_cpu_var(memcg_stock);
1476 	if (mem == stock->cached && stock->charge)
1477 		stock->charge -= PAGE_SIZE;
1478 	else /* need to call res_counter_charge */
1479 		ret = false;
1480 	put_cpu_var(memcg_stock);
1481 	return ret;
1482 }
1483 
1484 /*
1485  * Return the charges cached in the percpu stock to the res_counter and reset the cached information.
1486  */
1487 static void drain_stock(struct memcg_stock_pcp *stock)
1488 {
1489 	struct mem_cgroup *old = stock->cached;
1490 
1491 	if (stock->charge) {
1492 		res_counter_uncharge(&old->res, stock->charge);
1493 		if (do_swap_account)
1494 			res_counter_uncharge(&old->memsw, stock->charge);
1495 	}
1496 	stock->cached = NULL;
1497 	stock->charge = 0;
1498 }
1499 
1500 /*
1501  * This must be called with preemption disabled, or by a thread
1502  * which is pinned to the local cpu.
1503  */
1504 static void drain_local_stock(struct work_struct *dummy)
1505 {
1506 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1507 	drain_stock(stock);
1508 }
1509 
1510 /*
1511  * Cache charges (val), taken from the res_counter, in the local per-cpu area.
1512  * They will be consumed later by consume_stock().
1513  */
1514 static void refill_stock(struct mem_cgroup *mem, int val)
1515 {
1516 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1517 
1518 	if (stock->cached != mem) { /* reset if necessary */
1519 		drain_stock(stock);
1520 		stock->cached = mem;
1521 	}
1522 	stock->charge += val;
1523 	put_cpu_var(memcg_stock);
1524 }
1525 
1526 /*
1527  * Tries to drain stocked charges on other cpus. This function is asynchronous
1528  * and just queues a work item per cpu to drain locally on each cpu. The caller
1529  * can expect some charges to come back to the res_counter later, but cannot
1530  * wait for that.
1531  */
1532 static void drain_all_stock_async(void)
1533 {
1534 	int cpu;
1535 	/* This function schedules "drain" asynchronously.
1536 	 * The result of "drain" is not directly handled by the callers, so if
1537 	 * someone is already draining, we don't have to drain again.
1538 	 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
1539 	 * a race; we just do a loose check here.
1540 	 */
1541 	if (atomic_read(&memcg_drain_count))
1542 		return;
1543 	/* Notify other cpus that system-wide "drain" is running */
1544 	atomic_inc(&memcg_drain_count);
1545 	get_online_cpus();
1546 	for_each_online_cpu(cpu) {
1547 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1548 		schedule_work_on(cpu, &stock->work);
1549 	}
1550  	put_online_cpus();
1551 	atomic_dec(&memcg_drain_count);
1552 	/* We don't wait for flush_work */
1553 }
1554 
1555 /* This is a synchronous drain interface. */
1556 static void drain_all_stock_sync(void)
1557 {
1558 	/* called when force_empty is called */
1559 	atomic_inc(&memcg_drain_count);
1560 	schedule_on_each_cpu(drain_local_stock);
1561 	atomic_dec(&memcg_drain_count);
1562 }
1563 
1564 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1565 					unsigned long action,
1566 					void *hcpu)
1567 {
1568 	int cpu = (unsigned long)hcpu;
1569 	struct memcg_stock_pcp *stock;
1570 
1571 	if (action != CPU_DEAD)
1572 		return NOTIFY_OK;
1573 	stock = &per_cpu(memcg_stock, cpu);
1574 	drain_stock(stock);
1575 	return NOTIFY_OK;
1576 }
1577 
1578 /*
1579  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1580  * the OOM killer can be invoked.
1581  */
1582 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1583 			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1584 {
1585 	struct mem_cgroup *mem, *mem_over_limit;
1586 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1587 	struct res_counter *fail_res;
1588 	int csize = CHARGE_SIZE;
1589 
1590 	/*
1591 	 * Unlike the global VM's OOM kill, we are not in a system-level
1592 	 * memory shortage. So, allow a dying process to go ahead, in addition
1593 	 * to a MEMDIE process.
1594 	 */
1595 	if (unlikely(test_thread_flag(TIF_MEMDIE)
1596 		     || fatal_signal_pending(current)))
1597 		goto bypass;
1598 
1599 	/*
1600 	 * We always charge the cgroup the mm_struct belongs to.
1601 	 * The mm_struct's mem_cgroup changes on task migration if the
1602 	 * thread group leader migrates. It's possible that mm is not
1603 	 * set, if so charge the init_mm (happens for pagecache usage).
1604 	 */
1605 	mem = *memcg;
1606 	if (likely(!mem)) {
1607 		mem = try_get_mem_cgroup_from_mm(mm);
1608 		*memcg = mem;
1609 	} else {
1610 		css_get(&mem->css);
1611 	}
1612 	if (unlikely(!mem))
1613 		return 0;
1614 
1615 	VM_BUG_ON(css_is_removed(&mem->css));
1616 	if (mem_cgroup_is_root(mem))
1617 		goto done;
1618 
1619 	while (1) {
1620 		int ret = 0;
1621 		unsigned long flags = 0;
1622 
1623 		if (consume_stock(mem))
1624 			goto done;
1625 
1626 		ret = res_counter_charge(&mem->res, csize, &fail_res);
1627 		if (likely(!ret)) {
1628 			if (!do_swap_account)
1629 				break;
1630 			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1631 			if (likely(!ret))
1632 				break;
1633 			/* mem+swap counter fails */
1634 			res_counter_uncharge(&mem->res, csize);
1635 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1636 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1637 									memsw);
1638 		} else
1639 			/* mem counter fails */
1640 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1641 									res);
1642 
1643 		/* reduce request size and retry */
1644 		if (csize > PAGE_SIZE) {
1645 			csize = PAGE_SIZE;
1646 			continue;
1647 		}
1648 		if (!(gfp_mask & __GFP_WAIT))
1649 			goto nomem;
1650 
1651 		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1652 						gfp_mask, flags);
1653 		if (ret)
1654 			continue;
1655 
1656 		/*
1657 		 * try_to_free_mem_cgroup_pages() might not give us a full
1658 		 * picture of reclaim. Some pages are reclaimed and might be
1659 		 * moved to swap cache or just unmapped from the cgroup.
1660 		 * Check the limit again to see if the reclaim reduced the
1661 		 * current usage of the cgroup before giving up
1662 		 *
1663 		 */
1664 		if (mem_cgroup_check_under_limit(mem_over_limit))
1665 			continue;
1666 
1667 		/* try to avoid oom while someone is moving charge */
1668 		if (mc.moving_task && current != mc.moving_task) {
1669 			struct mem_cgroup *from, *to;
1670 			bool do_continue = false;
1671 			/*
1672 			 * There is a small race that "from" or "to" can be
1673 			 * freed by rmdir, so we use css_tryget().
1674 			 */
1675 			from = mc.from;
1676 			to = mc.to;
1677 			if (from && css_tryget(&from->css)) {
1678 				if (mem_over_limit->use_hierarchy)
1679 					do_continue = css_is_ancestor(
1680 							&from->css,
1681 							&mem_over_limit->css);
1682 				else
1683 					do_continue = (from == mem_over_limit);
1684 				css_put(&from->css);
1685 			}
1686 			if (!do_continue && to && css_tryget(&to->css)) {
1687 				if (mem_over_limit->use_hierarchy)
1688 					do_continue = css_is_ancestor(
1689 							&to->css,
1690 							&mem_over_limit->css);
1691 				else
1692 					do_continue = (to == mem_over_limit);
1693 				css_put(&to->css);
1694 			}
1695 			if (do_continue) {
1696 				DEFINE_WAIT(wait);
1697 				prepare_to_wait(&mc.waitq, &wait,
1698 							TASK_INTERRUPTIBLE);
1699 				/* moving charge context might have finished. */
1700 				if (mc.moving_task)
1701 					schedule();
1702 				finish_wait(&mc.waitq, &wait);
1703 				continue;
1704 			}
1705 		}
1706 
1707 		if (!nr_retries--) {
1708 			if (!oom)
1709 				goto nomem;
1710 			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1711 				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1712 				continue;
1713 			}
1714 			/* When we reach here, the current task is dying. */
1715 			css_put(&mem->css);
1716 			goto bypass;
1717 		}
1718 	}
1719 	if (csize > PAGE_SIZE)
1720 		refill_stock(mem, csize - PAGE_SIZE);
1721 done:
1722 	return 0;
1723 nomem:
1724 	css_put(&mem->css);
1725 	return -ENOMEM;
1726 bypass:
1727 	*memcg = NULL;
1728 	return 0;
1729 }
1730 
1731 /*
1732  * Sometimes we have to undo a charge we got by try_charge().
1733  * This function does the uncharge and puts the css refcount
1734  * taken by try_charge().
1735  */
1736 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1737 							unsigned long count)
1738 {
1739 	if (!mem_cgroup_is_root(mem)) {
1740 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1741 		if (do_swap_account)
1742 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1743 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1744 		WARN_ON_ONCE(count > INT_MAX);
1745 		__css_put(&mem->css, (int)count);
1746 	}
1747 	/* we don't need css_put for root */
1748 }
1749 
1750 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1751 {
1752 	__mem_cgroup_cancel_charge(mem, 1);
1753 }
1754 
1755 /*
1756  * A helper function to get a mem_cgroup from an ID. Must be called under
1757  * rcu_read_lock(). The caller must check css_is_removed() or similar if
1758  * that is a concern. (Dropping a refcount from swap can be called against
1759  * a removed memcg.)
1760  */
1761 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1762 {
1763 	struct cgroup_subsys_state *css;
1764 
1765 	/* ID 0 is unused ID */
1766 	if (!id)
1767 		return NULL;
1768 	css = css_lookup(&mem_cgroup_subsys, id);
1769 	if (!css)
1770 		return NULL;
1771 	return container_of(css, struct mem_cgroup, css);
1772 }
1773 
1774 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1775 {
1776 	struct mem_cgroup *mem = NULL;
1777 	struct page_cgroup *pc;
1778 	unsigned short id;
1779 	swp_entry_t ent;
1780 
1781 	VM_BUG_ON(!PageLocked(page));
1782 
1783 	pc = lookup_page_cgroup(page);
1784 	lock_page_cgroup(pc);
1785 	if (PageCgroupUsed(pc)) {
1786 		mem = pc->mem_cgroup;
1787 		if (mem && !css_tryget(&mem->css))
1788 			mem = NULL;
1789 	} else if (PageSwapCache(page)) {
1790 		ent.val = page_private(page);
1791 		id = lookup_swap_cgroup(ent);
1792 		rcu_read_lock();
1793 		mem = mem_cgroup_lookup(id);
1794 		if (mem && !css_tryget(&mem->css))
1795 			mem = NULL;
1796 		rcu_read_unlock();
1797 	}
1798 	unlock_page_cgroup(pc);
1799 	return mem;
1800 }
1801 
1802 /*
1803  * Commit a charge got by __mem_cgroup_try_charge() and move the page_cgroup to
1804  * the USED state. If it is already USED, uncharge and return.
1805  */
1806 
1807 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1808 				     struct page_cgroup *pc,
1809 				     enum charge_type ctype)
1810 {
1811 	/* try_charge() can return NULL to *memcg, taking care of it. */
1812 	if (!mem)
1813 		return;
1814 
1815 	lock_page_cgroup(pc);
1816 	if (unlikely(PageCgroupUsed(pc))) {
1817 		unlock_page_cgroup(pc);
1818 		mem_cgroup_cancel_charge(mem);
1819 		return;
1820 	}
1821 
1822 	pc->mem_cgroup = mem;
1823 	/*
1824 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
1825 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1826 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1827 	 * before USED bit, we need memory barrier here.
1828 	 * See mem_cgroup_add_lru_list(), etc.
1829  	 */
1830 	smp_wmb();
1831 	switch (ctype) {
1832 	case MEM_CGROUP_CHARGE_TYPE_CACHE:
1833 	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1834 		SetPageCgroupCache(pc);
1835 		SetPageCgroupUsed(pc);
1836 		break;
1837 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1838 		ClearPageCgroupCache(pc);
1839 		SetPageCgroupUsed(pc);
1840 		break;
1841 	default:
1842 		break;
1843 	}
1844 
1845 	mem_cgroup_charge_statistics(mem, pc, true);
1846 
1847 	unlock_page_cgroup(pc);
1848 	/*
1849 	 * "charge_statistics" updated the event counter, so check it now:
1850 	 * insert the ancestors (and their ancestors) into the soft-limit
1851 	 * RB-tree if they exceed their soft limit.
1852 	 */
1853 	memcg_check_events(mem, pc->page);
1854 }
1855 
1856 /**
1857  * __mem_cgroup_move_account - move account of the page
1858  * @pc:	page_cgroup of the page.
1859  * @from: mem_cgroup which the page is moved from.
1860  * @to:	mem_cgroup which the page is moved to. @from != @to.
1861  * @uncharge: whether we should call uncharge and css_put against @from.
1862  *
1863  * The caller must confirm following.
1864  * - the page is not on the LRU (isolate_lru_page() is useful.)
1865  * - the pc is locked, used, and ->mem_cgroup points to @from.
1866  *
1867  * This function doesn't do "charge" or css_get to the new cgroup. That should
1868  * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge
1869  * is true, this function does "uncharge" from the old cgroup, but it doesn't
1870  * if @uncharge is false, so the caller should do the "uncharge" itself.
1871  */
1872 
1873 static void __mem_cgroup_move_account(struct page_cgroup *pc,
1874 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1875 {
1876 	VM_BUG_ON(from == to);
1877 	VM_BUG_ON(PageLRU(pc->page));
1878 	VM_BUG_ON(!PageCgroupLocked(pc));
1879 	VM_BUG_ON(!PageCgroupUsed(pc));
1880 	VM_BUG_ON(pc->mem_cgroup != from);
1881 
1882 	if (PageCgroupFileMapped(pc)) {
1883 		/* Update mapped_file data for mem_cgroup */
1884 		preempt_disable();
1885 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1886 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1887 		preempt_enable();
1888 	}
1889 	mem_cgroup_charge_statistics(from, pc, false);
1890 	if (uncharge)
1891 		/* This is not "cancel", but cancel_charge does all we need. */
1892 		mem_cgroup_cancel_charge(from);
1893 
1894 	/* caller should have done css_get */
1895 	pc->mem_cgroup = to;
1896 	mem_cgroup_charge_statistics(to, pc, true);
1897 	/*
1898 	 * We charge against "to", which may not have any tasks, so "to"
1899 	 * can be under rmdir(). But in the current implementation, the only
1900 	 * callers of this function are force_empty() and move charge, so it
1901 	 * is guaranteed that "to" is never removed. Hence, we don't check
1902 	 * the rmdir status here.
1903 	 */
1904 }
1905 
1906 /*
1907  * check whether the @pc is valid for moving account and call
1908  * __mem_cgroup_move_account()
1909  */
1910 static int mem_cgroup_move_account(struct page_cgroup *pc,
1911 		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1912 {
1913 	int ret = -EINVAL;
1914 	lock_page_cgroup(pc);
1915 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1916 		__mem_cgroup_move_account(pc, from, to, uncharge);
1917 		ret = 0;
1918 	}
1919 	unlock_page_cgroup(pc);
1920 	/*
1921 	 * check events
1922 	 */
1923 	memcg_check_events(to, pc->page);
1924 	memcg_check_events(from, pc->page);
1925 	return ret;
1926 }
1927 
1928 /*
1929  * Move charges to the parent cgroup.
1930  */
1931 
1932 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1933 				  struct mem_cgroup *child,
1934 				  gfp_t gfp_mask)
1935 {
1936 	struct page *page = pc->page;
1937 	struct cgroup *cg = child->css.cgroup;
1938 	struct cgroup *pcg = cg->parent;
1939 	struct mem_cgroup *parent;
1940 	int ret;
1941 
1942 	/* Is ROOT ? */
1943 	if (!pcg)
1944 		return -EINVAL;
1945 
1946 	ret = -EBUSY;
1947 	if (!get_page_unless_zero(page))
1948 		goto out;
1949 	if (isolate_lru_page(page))
1950 		goto put;
1951 
1952 	parent = mem_cgroup_from_cont(pcg);
1953 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1954 	if (ret || !parent)
1955 		goto put_back;
1956 
1957 	ret = mem_cgroup_move_account(pc, child, parent, true);
1958 	if (ret)
1959 		mem_cgroup_cancel_charge(parent);
1960 put_back:
1961 	putback_lru_page(page);
1962 put:
1963 	put_page(page);
1964 out:
1965 	return ret;
1966 }
1967 
1968 /*
1969  * Charge the memory controller for page usage.
1970  * Return
1971  * 0 if the charge was successful
1972  * < 0 if the cgroup is over its limit
1973  */
1974 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1975 				gfp_t gfp_mask, enum charge_type ctype,
1976 				struct mem_cgroup *memcg)
1977 {
1978 	struct mem_cgroup *mem;
1979 	struct page_cgroup *pc;
1980 	int ret;
1981 
1982 	pc = lookup_page_cgroup(page);
1983 	/* can happen at boot */
1984 	if (unlikely(!pc))
1985 		return 0;
1986 	prefetchw(pc);
1987 
1988 	mem = memcg;
1989 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1990 	if (ret || !mem)
1991 		return ret;
1992 
1993 	__mem_cgroup_commit_charge(mem, pc, ctype);
1994 	return 0;
1995 }
1996 
1997 int mem_cgroup_newpage_charge(struct page *page,
1998 			      struct mm_struct *mm, gfp_t gfp_mask)
1999 {
2000 	if (mem_cgroup_disabled())
2001 		return 0;
2002 	if (PageCompound(page))
2003 		return 0;
2004 	/*
2005 	 * If already mapped, we don't have to account.
2006 	 * If page cache, page->mapping has an address_space.
2007 	 * But page->mapping may hold a stale anon_vma pointer;
2008 	 * detect that with a PageAnon() check. A newly-mapped anon page's
2009 	 * page->mapping is NULL.
2010 	 */
2011 	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2012 		return 0;
2013 	if (unlikely(!mm))
2014 		mm = &init_mm;
2015 	return mem_cgroup_charge_common(page, mm, gfp_mask,
2016 				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
2017 }
2018 
2019 static void
2020 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2021 					enum charge_type ctype);
2022 
2023 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2024 				gfp_t gfp_mask)
2025 {
2026 	struct mem_cgroup *mem = NULL;
2027 	int ret;
2028 
2029 	if (mem_cgroup_disabled())
2030 		return 0;
2031 	if (PageCompound(page))
2032 		return 0;
2033 	/*
2034 	 * Corner case handling. This is usually called from
2035 	 * add_to_page_cache(), but some filesystems (shmem) precharge the
2036 	 * page before calling it with GFP_NOWAIT.
2037 	 *
2038 	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
2039 	 * add_to_page_cache() (see shmem.c). Check it here and avoid charging
2040 	 * twice. (This works, but at a slightly higher cost.)
2041 	 * And when the page is SwapCache, the swap information should be
2042 	 * taken into account. This is under lock_page() now.
2043 	 */
2044 	if (!(gfp_mask & __GFP_WAIT)) {
2045 		struct page_cgroup *pc;
2046 
2047 
2048 		pc = lookup_page_cgroup(page);
2049 		if (!pc)
2050 			return 0;
2051 		lock_page_cgroup(pc);
2052 		if (PageCgroupUsed(pc)) {
2053 			unlock_page_cgroup(pc);
2054 			return 0;
2055 		}
2056 		unlock_page_cgroup(pc);
2057 	}
2058 
2059 	if (unlikely(!mm && !mem))
2060 		mm = &init_mm;
2061 
2062 	if (page_is_file_cache(page))
2063 		return mem_cgroup_charge_common(page, mm, gfp_mask,
2064 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
2065 
2066 	/* shmem */
2067 	if (PageSwapCache(page)) {
2068 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2069 		if (!ret)
2070 			__mem_cgroup_commit_charge_swapin(page, mem,
2071 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
2072 	} else
2073 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2074 					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
2075 
2076 	return ret;
2077 }
2078 
2079 /*
2080  * During swap-in (try_charge -> commit or cancel), the page is locked.
2081  * When try_charge() returns successfully, one refcnt on the memcg is
2082  * acquired without a struct page_cgroup. This refcnt will be consumed by
2083  * "commit()" or released by "cancel()".
2084  */
2085 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2086 				 struct page *page,
2087 				 gfp_t mask, struct mem_cgroup **ptr)
2088 {
2089 	struct mem_cgroup *mem;
2090 	int ret;
2091 
2092 	if (mem_cgroup_disabled())
2093 		return 0;
2094 
2095 	if (!do_swap_account)
2096 		goto charge_cur_mm;
2097 	/*
2098 	 * A racing thread's fault, or swapoff, may have already updated
2099 	 * the pte, and even removed page from swap cache: in those cases
2100 	 * do_swap_page()'s pte_same() test will fail; but there's also a
2101 	 * KSM case which does need to charge the page.
2102 	 */
2103 	if (!PageSwapCache(page))
2104 		goto charge_cur_mm;
2105 	mem = try_get_mem_cgroup_from_page(page);
2106 	if (!mem)
2107 		goto charge_cur_mm;
2108 	*ptr = mem;
2109 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2110 	/* drop extra refcnt from tryget */
2111 	css_put(&mem->css);
2112 	return ret;
2113 charge_cur_mm:
2114 	if (unlikely(!mm))
2115 		mm = &init_mm;
2116 	return __mem_cgroup_try_charge(mm, mask, ptr, true);
2117 }
2118 
2119 static void
2120 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2121 					enum charge_type ctype)
2122 {
2123 	struct page_cgroup *pc;
2124 
2125 	if (mem_cgroup_disabled())
2126 		return;
2127 	if (!ptr)
2128 		return;
2129 	cgroup_exclude_rmdir(&ptr->css);
2130 	pc = lookup_page_cgroup(page);
2131 	mem_cgroup_lru_del_before_commit_swapcache(page);
2132 	__mem_cgroup_commit_charge(ptr, pc, ctype);
2133 	mem_cgroup_lru_add_after_commit_swapcache(page);
2134 	/*
2135 	 * Now the swap entry is in memory. This means this page may be
2136 	 * counted both as mem and as swap -- a double count.
2137 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2138 	 * under lock_page(), but reuse_swap_page() in do_swap_page() (memory.c)
2139 	 * may call delete_from_swap_cache() before we reach here.
2140 	 */
2141 	if (do_swap_account && PageSwapCache(page)) {
2142 		swp_entry_t ent = {.val = page_private(page)};
2143 		unsigned short id;
2144 		struct mem_cgroup *memcg;
2145 
2146 		id = swap_cgroup_record(ent, 0);
2147 		rcu_read_lock();
2148 		memcg = mem_cgroup_lookup(id);
2149 		if (memcg) {
2150 			/*
2151 			 * This recorded memcg may be an obsolete one, so
2152 			 * avoid calling css_tryget().
2153 			 */
2154 			if (!mem_cgroup_is_root(memcg))
2155 				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2156 			mem_cgroup_swap_statistics(memcg, false);
2157 			mem_cgroup_put(memcg);
2158 		}
2159 		rcu_read_unlock();
2160 	}
2161 	/*
2162 	 * At swap-in, we may charge against a cgroup which has no tasks, so
2163 	 * rmdir()->pre_destroy() can be called while we do this charge.
2164 	 * In that case, we need to call pre_destroy() again; check it here.
2165 	 */
2166 	cgroup_release_and_wakeup_rmdir(&ptr->css);
2167 }
2168 
2169 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2170 {
2171 	__mem_cgroup_commit_charge_swapin(page, ptr,
2172 					MEM_CGROUP_CHARGE_TYPE_MAPPED);
2173 }
2174 
2175 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2176 {
2177 	if (mem_cgroup_disabled())
2178 		return;
2179 	if (!mem)
2180 		return;
2181 	mem_cgroup_cancel_charge(mem);
2182 }
2183 
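/*
 * Illustrative sketch of the swap-in charge protocol above (e.g.
 * do_swap_page() in mm/memory.c); a caller is expected to do, roughly,
 *
 *	struct mem_cgroup *ptr = NULL;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		goto fail;
 *	...try to map the page...
 *	if (mapped successfully)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);
 *
 * so that every successful try_charge is balanced by exactly one commit or
 * cancel, consuming the memcg reference taken at try_charge time.
 */
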
2184 static void
2185 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2186 {
2187 	struct memcg_batch_info *batch = NULL;
2188 	bool uncharge_memsw = true;
2189 	/* If swapout, usage of swap doesn't decrease */
2190 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2191 		uncharge_memsw = false;
2192 
2193 	batch = &current->memcg_batch;
2194 	/*
2195 	 * Usually, we do css_get() when we remember a memcg pointer.
2196 	 * But in this case, we keep res->usage until the end of a series of
2197 	 * uncharges, so it is ok to ignore the memcg's refcnt.
2198 	 */
2199 	if (!batch->memcg)
2200 		batch->memcg = mem;
2201 	/*
2202 	 * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
2203 	 * In those cases, all pages freed continuously can be expected to be in
2204 	 * the same cgroup and we have a chance to coalesce uncharges.
2205 	 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE)
2206 	 * because we want to uncharge as soon as possible.
2207 	 */
2208 
2209 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2210 		goto direct_uncharge;
2211 
2212 	/*
2213 	 * In the typical case, batch->memcg == mem, which means we can
2214 	 * merge a series of uncharges into one res_counter uncharge.
2215 	 * If not, we uncharge the res_counter one by one.
2216 	 */
2217 	if (batch->memcg != mem)
2218 		goto direct_uncharge;
2219 	/* remember freed charge and uncharge it later */
2220 	batch->bytes += PAGE_SIZE;
2221 	if (uncharge_memsw)
2222 		batch->memsw_bytes += PAGE_SIZE;
2223 	return;
2224 direct_uncharge:
2225 	res_counter_uncharge(&mem->res, PAGE_SIZE);
2226 	if (uncharge_memsw)
2227 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2228 	if (unlikely(batch->memcg != mem))
2229 		memcg_oom_recover(mem);
2230 	return;
2231 }
2232 
2233 /*
2234  * uncharge if !page_mapped(page)
2235  */
2236 static struct mem_cgroup *
2237 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2238 {
2239 	struct page_cgroup *pc;
2240 	struct mem_cgroup *mem = NULL;
2241 	struct mem_cgroup_per_zone *mz;
2242 
2243 	if (mem_cgroup_disabled())
2244 		return NULL;
2245 
2246 	if (PageSwapCache(page))
2247 		return NULL;
2248 
2249 	/*
2250 	 * Check if our page_cgroup is valid
2251 	 */
2252 	pc = lookup_page_cgroup(page);
2253 	if (unlikely(!pc || !PageCgroupUsed(pc)))
2254 		return NULL;
2255 
2256 	lock_page_cgroup(pc);
2257 
2258 	mem = pc->mem_cgroup;
2259 
2260 	if (!PageCgroupUsed(pc))
2261 		goto unlock_out;
2262 
2263 	switch (ctype) {
2264 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2265 	case MEM_CGROUP_CHARGE_TYPE_DROP:
2266 		/* See mem_cgroup_prepare_migration() */
2267 		if (page_mapped(page) || PageCgroupMigration(pc))
2268 			goto unlock_out;
2269 		break;
2270 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2271 		if (!PageAnon(page)) {	/* Shared memory */
2272 			if (page->mapping && !page_is_file_cache(page))
2273 				goto unlock_out;
2274 		} else if (page_mapped(page)) /* Anon */
2275 				goto unlock_out;
2276 		break;
2277 	default:
2278 		break;
2279 	}
2280 
2281 	if (!mem_cgroup_is_root(mem))
2282 		__do_uncharge(mem, ctype);
2283 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2284 		mem_cgroup_swap_statistics(mem, true);
2285 	mem_cgroup_charge_statistics(mem, pc, false);
2286 
2287 	ClearPageCgroupUsed(pc);
2288 	/*
2289 	 * pc->mem_cgroup is not cleared here. It will be accessed when the
2290 	 * page is freed from the LRU. This is safe because an uncharged page
2291 	 * is expected not to be reused (it is freed soon). The exception is
2292 	 * SwapCache, which is handled by special functions.
2293 	 */
2294 
2295 	mz = page_cgroup_zoneinfo(pc);
2296 	unlock_page_cgroup(pc);
2297 
2298 	memcg_check_events(mem, page);
2299 	/* at swapout, this memcg will be accessed to record to swap */
2300 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2301 		css_put(&mem->css);
2302 
2303 	return mem;
2304 
2305 unlock_out:
2306 	unlock_page_cgroup(pc);
2307 	return NULL;
2308 }
2309 
2310 void mem_cgroup_uncharge_page(struct page *page)
2311 {
2312 	/* early check. */
2313 	if (page_mapped(page))
2314 		return;
2315 	if (page->mapping && !PageAnon(page))
2316 		return;
2317 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2318 }
2319 
2320 void mem_cgroup_uncharge_cache_page(struct page *page)
2321 {
2322 	VM_BUG_ON(page_mapped(page));
2323 	VM_BUG_ON(page->mapping);
2324 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2325 }
2326 
2327 /*
2328  * Batch start/end is called in unmap_page_range/invalidate/truncate.
2329  * In those cases, pages are freed continuously and we can expect the pages
2330  * to be in the same memcg. All of these callers themselves limit the number
2331  * of pages freed at once, so uncharge_start/end() is called properly.
2332  * This may be called multiple (nested) times in a context.
2333  */
2334 
2335 void mem_cgroup_uncharge_start(void)
2336 {
2337 	current->memcg_batch.do_batch++;
2338 	/* Nesting is allowed. */
2339 	if (current->memcg_batch.do_batch == 1) {
2340 		current->memcg_batch.memcg = NULL;
2341 		current->memcg_batch.bytes = 0;
2342 		current->memcg_batch.memsw_bytes = 0;
2343 	}
2344 }
2345 
2346 void mem_cgroup_uncharge_end(void)
2347 {
2348 	struct memcg_batch_info *batch = &current->memcg_batch;
2349 
2350 	if (!batch->do_batch)
2351 		return;
2352 
2353 	batch->do_batch--;
2354 	if (batch->do_batch) /* If stacked, do nothing. */
2355 		return;
2356 
2357 	if (!batch->memcg)
2358 		return;
2359 	/*
2360 	 * This "batch->memcg" is valid without any css_get/put, etc.,
2361 	 * because we hide charges behind us.
2362 	 */
2363 	if (batch->bytes)
2364 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
2365 	if (batch->memsw_bytes)
2366 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2367 	memcg_oom_recover(batch->memcg);
2368 	/* forget this pointer (for sanity check) */
2369 	batch->memcg = NULL;
2370 }
2371 
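/*
 * Illustrative sketch of the batching API above; a caller that frees many
 * pages in one operation is expected to bracket the frees roughly as
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed
 *		mem_cgroup_uncharge_page(page);		(or _cache_page())
 *	mem_cgroup_uncharge_end();
 *
 * so that, while the pages belong to the same memcg, the res_counter is
 * updated once with the accumulated bytes instead of once per page.
 */
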
2372 #ifdef CONFIG_SWAP
2373 /*
2374  * Called after __delete_from_swap_cache() to drop the "page" account.
2375  * memcg information is recorded in the swap_cgroup of "ent".
2376  */
2377 void
2378 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2379 {
2380 	struct mem_cgroup *memcg;
2381 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2382 
2383 	if (!swapout) /* this was a swap cache but the swap is unused ! */
2384 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2385 
2386 	memcg = __mem_cgroup_uncharge_common(page, ctype);
2387 
2388 	/* record memcg information */
2389 	if (do_swap_account && swapout && memcg) {
2390 		swap_cgroup_record(ent, css_id(&memcg->css));
2391 		mem_cgroup_get(memcg);
2392 	}
2393 	if (swapout && memcg)
2394 		css_put(&memcg->css);
2395 }
2396 #endif
2397 
2398 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2399 /*
2400  * Called from swap_entry_free(). Remove the record in swap_cgroup and
2401  * uncharge the "memsw" account.
2402  */
2403 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2404 {
2405 	struct mem_cgroup *memcg;
2406 	unsigned short id;
2407 
2408 	if (!do_swap_account)
2409 		return;
2410 
2411 	id = swap_cgroup_record(ent, 0);
2412 	rcu_read_lock();
2413 	memcg = mem_cgroup_lookup(id);
2414 	if (memcg) {
2415 		/*
2416 		 * We uncharge this because the swap entry is freed.
2417 		 * This memcg may be an obsolete one; we avoid calling css_tryget().
2418 		 */
2419 		if (!mem_cgroup_is_root(memcg))
2420 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2421 		mem_cgroup_swap_statistics(memcg, false);
2422 		mem_cgroup_put(memcg);
2423 	}
2424 	rcu_read_unlock();
2425 }
2426 
2427 /**
2428  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2429  * @entry: swap entry to be moved
2430  * @from:  mem_cgroup which the entry is moved from
2431  * @to:  mem_cgroup which the entry is moved to
2432  * @need_fixup: whether we should fixup res_counters and refcounts.
2433  *
2434  * It succeeds only when the swap_cgroup's record for this entry is the same
2435  * as the mem_cgroup's id of @from.
2436  *
2437  * Returns 0 on success, -EINVAL on failure.
2438  *
2439  * The caller must have charged to @to, IOW, called res_counter_charge() for
2440  * both res and memsw, and called css_get().
2441  */
2442 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2443 		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2444 {
2445 	unsigned short old_id, new_id;
2446 
2447 	old_id = css_id(&from->css);
2448 	new_id = css_id(&to->css);
2449 
2450 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2451 		mem_cgroup_swap_statistics(from, false);
2452 		mem_cgroup_swap_statistics(to, true);
2453 		/*
2454 		 * This function is only called from task migration context now.
2455 		 * It postpones res_counter and refcount handling till the end
2456 		 * of task migration (mem_cgroup_clear_mc()) for performance
2457 		 * improvement. But we cannot postpone mem_cgroup_get(to)
2458 		 * because if the process that has been moved to @to does
2459 		 * swap-in, the refcount of @to might be decreased to 0.
2460 		 */
2461 		mem_cgroup_get(to);
2462 		if (need_fixup) {
2463 			if (!mem_cgroup_is_root(from))
2464 				res_counter_uncharge(&from->memsw, PAGE_SIZE);
2465 			mem_cgroup_put(from);
2466 			/*
2467 			 * we charged both to->res and to->memsw, so we should
2468 			 * uncharge to->res.
2469 			 */
2470 			if (!mem_cgroup_is_root(to))
2471 				res_counter_uncharge(&to->res, PAGE_SIZE);
2472 			css_put(&to->css);
2473 		}
2474 		return 0;
2475 	}
2476 	return -EINVAL;
2477 }
2478 #else
2479 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2480 		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2481 {
2482 	return -EINVAL;
2483 }
2484 #endif
2485 
2486 /*
2487  * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
2488  * old page belongs to.
2489  */
2490 int mem_cgroup_prepare_migration(struct page *page,
2491 	struct page *newpage, struct mem_cgroup **ptr)
2492 {
2493 	struct page_cgroup *pc;
2494 	struct mem_cgroup *mem = NULL;
2495 	enum charge_type ctype;
2496 	int ret = 0;
2497 
2498 	if (mem_cgroup_disabled())
2499 		return 0;
2500 
2501 	pc = lookup_page_cgroup(page);
2502 	lock_page_cgroup(pc);
2503 	if (PageCgroupUsed(pc)) {
2504 		mem = pc->mem_cgroup;
2505 		css_get(&mem->css);
2506 		/*
2507 		 * When migrating an anonymous page, its mapcount goes down
2508 		 * to 0 and uncharge() will be called. But even if it's fully
2509 		 * unmapped, migration may fail and this page has to be
2510 		 * charged again. We set the MIGRATION flag here and delay
2511 		 * the uncharge until end_migration() is called.
2512 		 *
2513 		 * Corner Case Thinking
2514 		 * A)
2515 		 * The old page was mapped as Anon and is unmapped-and-freed
2516 		 * while migration is ongoing.
2517 		 * If unmap finds the old page, its uncharge() will be delayed
2518 		 * until end_migration(). If unmap finds the new page, it's
2519 		 * uncharged when its mapcount drops from 1 to 0. If the unmap
2520 		 * code finds a swap migration entry, the new page will not be
2521 		 * mapped and end_migration() will find it (mapcount == 0).
2522 		 *
2523 		 * B)
2524 		 * The old page was mapped but migration fails, so the kernel
2525 		 * remaps it. A charge for it is kept by the MIGRATION flag even
2526 		 * if its mapcount goes down to 0. We can remap it successfully
2527 		 * without charging it again.
2528 		 *
2529 		 * C)
2530 		 * The "old" page is under lock_page() until the end of
2531 		 * migration, so the old page itself will not be swapped out.
2532 		 * If the new page is swapped out before end_migration(), our
2533 		 * hook into the usual swap-out path will catch the event.
2534 		 */
2535 		if (PageAnon(page))
2536 			SetPageCgroupMigration(pc);
2537 	}
2538 	unlock_page_cgroup(pc);
2539 	/*
2540 	 * If the page is not charged at this point,
2541 	 * we return here.
2542 	 */
2543 	if (!mem)
2544 		return 0;
2545 
2546 	*ptr = mem;
2547 	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2548 	css_put(&mem->css);/* drop extra refcnt */
2549 	if (ret || *ptr == NULL) {
2550 		if (PageAnon(page)) {
2551 			lock_page_cgroup(pc);
2552 			ClearPageCgroupMigration(pc);
2553 			unlock_page_cgroup(pc);
2554 			/*
2555 			 * The old page may be fully unmapped while we kept it.
2556 			 */
2557 			mem_cgroup_uncharge_page(page);
2558 		}
2559 		return -ENOMEM;
2560 	}
2561 	/*
2562 	 * We charge the new page before it's used/mapped. So, even if
2563 	 * unlock_page() is called before end_migration(), we can catch all
2564 	 * events on this new page. If the new page is migrated but not
2565 	 * remapped, its mapcount will end up 0 and end_migration() uncharges it.
2566 	 */
2567 	pc = lookup_page_cgroup(newpage);
2568 	if (PageAnon(page))
2569 		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2570 	else if (page_is_file_cache(page))
2571 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2572 	else
2573 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2574 	__mem_cgroup_commit_charge(mem, pc, ctype);
2575 	return ret;
2576 }
2577 
2578 /* remove redundant charge if migration failed */
2579 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2580 	struct page *oldpage, struct page *newpage)
2581 {
2582 	struct page *used, *unused;
2583 	struct page_cgroup *pc;
2584 
2585 	if (!mem)
2586 		return;
2587 	/* blocks rmdir() */
2588 	cgroup_exclude_rmdir(&mem->css);
2589 	/* at migration success, oldpage->mapping is NULL. */
2590 	if (oldpage->mapping) {
2591 		used = oldpage;
2592 		unused = newpage;
2593 	} else {
2594 		used = newpage;
2595 		unused = oldpage;
2596 	}
2597 	/*
2598 	 * We disallowed uncharging pages under migration because the page's
2599 	 * mapcount temporarily goes down to zero.
2600 	 * Clear the flag and check whether the page should still be charged.
2601 	 */
2602 	pc = lookup_page_cgroup(oldpage);
2603 	lock_page_cgroup(pc);
2604 	ClearPageCgroupMigration(pc);
2605 	unlock_page_cgroup(pc);
2606 
2607 	if (unused != oldpage)
2608 		pc = lookup_page_cgroup(unused);
2609 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2610 
2611 	pc = lookup_page_cgroup(used);
2612 	/*
2613 	 * If the page is file cache, the radix-tree replacement is atomic
2614 	 * and we can skip this check. When it was an Anon page, its mapcount
2615 	 * went down to 0, but because we added the MIGRATION flag, it's not
2616 	 * uncharged yet. There are several cases, but the page->mapcount check
2617 	 * and the USED bit check in mem_cgroup_uncharge_page() do enough
2618 	 * checking. (see prepare_charge() also)
2619 	 */
2620 	if (PageAnon(used))
2621 		mem_cgroup_uncharge_page(used);
2622 	/*
2623 	 * At migration, we may charge against a cgroup which has no
2624 	 * tasks.
2625 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
2626 	 * In that case, we need to call pre_destroy() again; check it here.
2627 	 */
2628 	cgroup_release_and_wakeup_rmdir(&mem->css);
2629 }
2630 
2631 /*
2632  * A call to try to shrink memory usage on charge failure at shmem's swap-in.
2633  * Calling hierarchical_reclaim is not enough because we should update
2634  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
2635  * Moreover, considering the hierarchy, we should reclaim from the
2636  * mem_over_limit, not from the memcg which this page would be charged to.
2637  * try_charge_swapin does all of this properly.
2638  */
2639 int mem_cgroup_shmem_charge_fallback(struct page *page,
2640 			    struct mm_struct *mm,
2641 			    gfp_t gfp_mask)
2642 {
2643 	struct mem_cgroup *mem = NULL;
2644 	int ret;
2645 
2646 	if (mem_cgroup_disabled())
2647 		return 0;
2648 
2649 	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2650 	if (!ret)
2651 		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2652 
2653 	return ret;
2654 }
2655 
2656 static DEFINE_MUTEX(set_limit_mutex);
2657 
2658 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2659 				unsigned long long val)
2660 {
2661 	int retry_count;
2662 	u64 memswlimit, memlimit;
2663 	int ret = 0;
2664 	int children = mem_cgroup_count_children(memcg);
2665 	u64 curusage, oldusage;
2666 	int enlarge;
2667 
2668 	/*
2669 	 * To keep hierarchical_reclaim simple, how long we should retry
2670 	 * depends on the caller. We set our retry count to be a function
2671 	 * of the number of children we should visit in this loop.
2672 	 */
2673 	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2674 
2675 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2676 
2677 	enlarge = 0;
2678 	while (retry_count) {
2679 		if (signal_pending(current)) {
2680 			ret = -EINTR;
2681 			break;
2682 		}
2683 		/*
2684 		 * Rather than hiding all of this in some function, do it in
2685 		 * an open-coded manner so you can see what this really does.
2686 		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
2687 		 */
2688 		mutex_lock(&set_limit_mutex);
2689 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2690 		if (memswlimit < val) {
2691 			ret = -EINVAL;
2692 			mutex_unlock(&set_limit_mutex);
2693 			break;
2694 		}
2695 
2696 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2697 		if (memlimit < val)
2698 			enlarge = 1;
2699 
2700 		ret = res_counter_set_limit(&memcg->res, val);
2701 		if (!ret) {
2702 			if (memswlimit == val)
2703 				memcg->memsw_is_minimum = true;
2704 			else
2705 				memcg->memsw_is_minimum = false;
2706 		}
2707 		mutex_unlock(&set_limit_mutex);
2708 
2709 		if (!ret)
2710 			break;
2711 
2712 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2713 						MEM_CGROUP_RECLAIM_SHRINK);
2714 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2715 		/* Usage is reduced? */
2716 		if (curusage >= oldusage)
2717 			retry_count--;
2718 		else
2719 			oldusage = curusage;
2720 	}
2721 	if (!ret && enlarge)
2722 		memcg_oom_recover(memcg);
2723 
2724 	return ret;
2725 }
2726 
2727 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2728 					unsigned long long val)
2729 {
2730 	int retry_count;
2731 	u64 memlimit, memswlimit, oldusage, curusage;
2732 	int children = mem_cgroup_count_children(memcg);
2733 	int ret = -EBUSY;
2734 	int enlarge = 0;
2735 
2736 	/* see mem_cgroup_resize_limit */
2737 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2738 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2739 	while (retry_count) {
2740 		if (signal_pending(current)) {
2741 			ret = -EINTR;
2742 			break;
2743 		}
2744 		/*
2745 		 * Rather than hiding all of this in some function, do it in
2746 		 * an open-coded manner so you can see what this really does.
2747 		 * We have to guarantee mem->res.limit <= mem->memsw.limit.
2748 		 */
2749 		mutex_lock(&set_limit_mutex);
2750 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2751 		if (memlimit > val) {
2752 			ret = -EINVAL;
2753 			mutex_unlock(&set_limit_mutex);
2754 			break;
2755 		}
2756 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2757 		if (memswlimit < val)
2758 			enlarge = 1;
2759 		ret = res_counter_set_limit(&memcg->memsw, val);
2760 		if (!ret) {
2761 			if (memlimit == val)
2762 				memcg->memsw_is_minimum = true;
2763 			else
2764 				memcg->memsw_is_minimum = false;
2765 		}
2766 		mutex_unlock(&set_limit_mutex);
2767 
2768 		if (!ret)
2769 			break;
2770 
2771 		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2772 						MEM_CGROUP_RECLAIM_NOSWAP |
2773 						MEM_CGROUP_RECLAIM_SHRINK);
2774 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2775 		/* Usage is reduced? */
2776 		if (curusage >= oldusage)
2777 			retry_count--;
2778 		else
2779 			oldusage = curusage;
2780 	}
2781 	if (!ret && enlarge)
2782 		memcg_oom_recover(memcg);
2783 	return ret;
2784 }
2785 
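/*
 * Note on the two resize helpers above: together they maintain the invariant
 * res.limit <= memsw.limit. As an illustrative consequence for the control
 * files (memory.limit_in_bytes and memory.memsw.limit_in_bytes), raising both
 * limits requires raising the memsw limit first, while lowering both requires
 * lowering the memory limit first; the opposite order fails with -EINVAL.
 */
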
2786 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2787 						gfp_t gfp_mask, int nid,
2788 						int zid)
2789 {
2790 	unsigned long nr_reclaimed = 0;
2791 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2792 	unsigned long reclaimed;
2793 	int loop = 0;
2794 	struct mem_cgroup_tree_per_zone *mctz;
2795 	unsigned long long excess;
2796 
2797 	if (order > 0)
2798 		return 0;
2799 
2800 	mctz = soft_limit_tree_node_zone(nid, zid);
2801 	/*
2802 	 * This loop can run a while, especially if mem_cgroups continuously
2803 	 * keep exceeding their soft limit and putting the system under
2804 	 * pressure.
2805 	 */
2806 	do {
2807 		if (next_mz)
2808 			mz = next_mz;
2809 		else
2810 			mz = mem_cgroup_largest_soft_limit_node(mctz);
2811 		if (!mz)
2812 			break;
2813 
2814 		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2815 						gfp_mask,
2816 						MEM_CGROUP_RECLAIM_SOFT);
2817 		nr_reclaimed += reclaimed;
2818 		spin_lock(&mctz->lock);
2819 
2820 		/*
2821 		 * If we failed to reclaim anything from this memory cgroup
2822 		 * it is time to move on to the next cgroup
2823 		 */
2824 		next_mz = NULL;
2825 		if (!reclaimed) {
2826 			do {
2827 				/*
2828 				 * Loop until we find yet another one.
2829 				 *
2830 				 * By the time we get the soft_limit lock
2831 				 * again, someone might have added the
2832 				 * group back on the RB tree. Iterate to
2833 				 * make sure we get a different mem.
2834 				 * mem_cgroup_largest_soft_limit_node returns
2835 				 * NULL if no other cgroup is present on
2836 				 * the tree
2837 				 */
2838 				next_mz =
2839 				__mem_cgroup_largest_soft_limit_node(mctz);
2840 				if (next_mz == mz) {
2841 					css_put(&next_mz->mem->css);
2842 					next_mz = NULL;
2843 				} else /* next_mz == NULL or other memcg */
2844 					break;
2845 			} while (1);
2846 		}
2847 		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2848 		excess = res_counter_soft_limit_excess(&mz->mem->res);
2849 		/*
2850 		 * One school of thought says that we should not add
2851 		 * back the node to the tree if reclaim returns 0.
2852 		 * But our reclaim could return 0 simply because, due
2853 		 * to priority, we are exposing a smaller subset of
2854 		 * memory to reclaim from. Consider this as a longer
2855 		 * term TODO.
2856 		 */
2857 		/* If excess == 0, no tree ops */
2858 		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2859 		spin_unlock(&mctz->lock);
2860 		css_put(&mz->mem->css);
2861 		loop++;
2862 		/*
2863 		 * Could not reclaim anything and there are no more
2864 		 * mem cgroups to try or we seem to be looping without
2865 		 * reclaiming anything.
2866 		 */
2867 		if (!nr_reclaimed &&
2868 			(next_mz == NULL ||
2869 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2870 			break;
2871 	} while (!nr_reclaimed);
2872 	if (next_mz)
2873 		css_put(&next_mz->mem->css);
2874 	return nr_reclaimed;
2875 }
2876 
2877 /*
2878  * This routine traverses the page_cgroups in the given list and drops them all.
2879  * *And* it doesn't reclaim the pages themselves, just removes the page_cgroups.
2880  */
2881 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2882 				int node, int zid, enum lru_list lru)
2883 {
2884 	struct zone *zone;
2885 	struct mem_cgroup_per_zone *mz;
2886 	struct page_cgroup *pc, *busy;
2887 	unsigned long flags, loop;
2888 	struct list_head *list;
2889 	int ret = 0;
2890 
2891 	zone = &NODE_DATA(node)->node_zones[zid];
2892 	mz = mem_cgroup_zoneinfo(mem, node, zid);
2893 	list = &mz->lists[lru];
2894 
2895 	loop = MEM_CGROUP_ZSTAT(mz, lru);
2896 	/* give some margin against EBUSY etc...*/
2897 	loop += 256;
2898 	busy = NULL;
2899 	while (loop--) {
2900 		ret = 0;
2901 		spin_lock_irqsave(&zone->lru_lock, flags);
2902 		if (list_empty(list)) {
2903 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2904 			break;
2905 		}
2906 		pc = list_entry(list->prev, struct page_cgroup, lru);
2907 		if (busy == pc) {
2908 			list_move(&pc->lru, list);
2909 			busy = NULL;
2910 			spin_unlock_irqrestore(&zone->lru_lock, flags);
2911 			continue;
2912 		}
2913 		spin_unlock_irqrestore(&zone->lru_lock, flags);
2914 
2915 		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2916 		if (ret == -ENOMEM)
2917 			break;
2918 
2919 		if (ret == -EBUSY || ret == -EINVAL) {
2920 			/* found lock contention or "pc" is obsolete. */
2921 			busy = pc;
2922 			cond_resched();
2923 		} else
2924 			busy = NULL;
2925 	}
2926 
2927 	if (!ret && !list_empty(list))
2928 		return -EBUSY;
2929 	return ret;
2930 }
2931 
2932 /*
2933  * Make the mem_cgroup's charge 0 if there are no tasks.
2934  * This enables deleting this mem_cgroup.
2935  */
2936 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2937 {
2938 	int ret;
2939 	int node, zid, shrink;
2940 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2941 	struct cgroup *cgrp = mem->css.cgroup;
2942 
2943 	css_get(&mem->css);
2944 
2945 	shrink = 0;
2946 	/* should free all ? */
2947 	if (free_all)
2948 		goto try_to_free;
2949 move_account:
2950 	do {
2951 		ret = -EBUSY;
2952 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2953 			goto out;
2954 		ret = -EINTR;
2955 		if (signal_pending(current))
2956 			goto out;
2957 		/* This is for making all *used* pages to be on LRU. */
2958 		lru_add_drain_all();
2959 		drain_all_stock_sync();
2960 		ret = 0;
2961 		for_each_node_state(node, N_HIGH_MEMORY) {
2962 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2963 				enum lru_list l;
2964 				for_each_lru(l) {
2965 					ret = mem_cgroup_force_empty_list(mem,
2966 							node, zid, l);
2967 					if (ret)
2968 						break;
2969 				}
2970 			}
2971 			if (ret)
2972 				break;
2973 		}
2974 		memcg_oom_recover(mem);
2975 		/* it seems parent cgroup doesn't have enough mem */
2976 		if (ret == -ENOMEM)
2977 			goto try_to_free;
2978 		cond_resched();
2979 	/* "ret" should also be checked to ensure all lists are empty. */
2980 	} while (mem->res.usage > 0 || ret);
2981 out:
2982 	css_put(&mem->css);
2983 	return ret;
2984 
2985 try_to_free:
2986 	/* returns EBUSY if there is a task or if we come here twice. */
2987 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2988 		ret = -EBUSY;
2989 		goto out;
2990 	}
2991 	/* we call try-to-free pages to make this cgroup empty */
2992 	lru_add_drain_all();
2993 	/* try to free all pages in this cgroup */
2994 	shrink = 1;
2995 	while (nr_retries && mem->res.usage > 0) {
2996 		int progress;
2997 
2998 		if (signal_pending(current)) {
2999 			ret = -EINTR;
3000 			goto out;
3001 		}
3002 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3003 						false, get_swappiness(mem));
3004 		if (!progress) {
3005 			nr_retries--;
3006 			/* maybe some writeback is necessary */
3007 			congestion_wait(BLK_RW_ASYNC, HZ/10);
3008 		}
3009 
3010 	}
3011 	lru_add_drain();
3012 	/* try move_account...there may be some *locked* pages. */
3013 	goto move_account;
3014 }
3015 
3016 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3017 {
3018 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3019 }
3020 
3021 
3022 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3023 {
3024 	return mem_cgroup_from_cont(cont)->use_hierarchy;
3025 }
3026 
3027 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3028 					u64 val)
3029 {
3030 	int retval = 0;
3031 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3032 	struct cgroup *parent = cont->parent;
3033 	struct mem_cgroup *parent_mem = NULL;
3034 
3035 	if (parent)
3036 		parent_mem = mem_cgroup_from_cont(parent);
3037 
3038 	cgroup_lock();
3039 	/*
3040 	 * If parent's use_hierarchy is set, we can't make any modifications
3041 	 * in the child subtrees. If it is unset, then the change can
3042 	 * occur, provided the current cgroup has no children.
3043 	 *
3044 	 * For the root cgroup, parent_mem is NULL; we allow the value to be
3045 	 * set if there are no children.
3046 	 */
3047 	if ((!parent_mem || !parent_mem->use_hierarchy) &&
3048 				(val == 1 || val == 0)) {
3049 		if (list_empty(&cont->children))
3050 			mem->use_hierarchy = val;
3051 		else
3052 			retval = -EBUSY;
3053 	} else
3054 		retval = -EINVAL;
3055 	cgroup_unlock();
3056 
3057 	return retval;
3058 }
3059 
3060 struct mem_cgroup_idx_data {
3061 	s64 val;
3062 	enum mem_cgroup_stat_index idx;
3063 };
3064 
3065 static int
3066 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
3067 {
3068 	struct mem_cgroup_idx_data *d = data;
3069 	d->val += mem_cgroup_read_stat(mem, d->idx);
3070 	return 0;
3071 }
3072 
3073 static void
3074 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3075 				enum mem_cgroup_stat_index idx, s64 *val)
3076 {
3077 	struct mem_cgroup_idx_data d;
3078 	d.idx = idx;
3079 	d.val = 0;
3080 	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3081 	*val = d.val;
3082 }
3083 
3084 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3085 {
3086 	u64 idx_val, val;
3087 
3088 	if (!mem_cgroup_is_root(mem)) {
3089 		if (!swap)
3090 			return res_counter_read_u64(&mem->res, RES_USAGE);
3091 		else
3092 			return res_counter_read_u64(&mem->memsw, RES_USAGE);
3093 	}
3094 
3095 	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
3096 	val = idx_val;
3097 	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3098 	val += idx_val;
3099 
3100 	if (swap) {
3101 		mem_cgroup_get_recursive_idx_stat(mem,
3102 				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3103 		val += idx_val;
3104 	}
3105 
3106 	return val << PAGE_SHIFT;
3107 }
3108 
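/*
 * Note: for the root cgroup the res_counters are not charged (see the
 * mem_cgroup_is_root() checks in the charge/uncharge paths), so its usage
 * has to be reconstructed above from the per-cpu statistics, summed over
 * the whole hierarchy.
 */
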
3109 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3110 {
3111 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3112 	u64 val;
3113 	int type, name;
3114 
3115 	type = MEMFILE_TYPE(cft->private);
3116 	name = MEMFILE_ATTR(cft->private);
3117 	switch (type) {
3118 	case _MEM:
3119 		if (name == RES_USAGE)
3120 			val = mem_cgroup_usage(mem, false);
3121 		else
3122 			val = res_counter_read_u64(&mem->res, name);
3123 		break;
3124 	case _MEMSWAP:
3125 		if (name == RES_USAGE)
3126 			val = mem_cgroup_usage(mem, true);
3127 		else
3128 			val = res_counter_read_u64(&mem->memsw, name);
3129 		break;
3130 	default:
3131 		BUG();
3132 		break;
3133 	}
3134 	return val;
3135 }
3136 /*
3137  * The user of this function is...
3138  * RES_LIMIT.
3139  */
3140 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3141 			    const char *buffer)
3142 {
3143 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3144 	int type, name;
3145 	unsigned long long val;
3146 	int ret;
3147 
3148 	type = MEMFILE_TYPE(cft->private);
3149 	name = MEMFILE_ATTR(cft->private);
3150 	switch (name) {
3151 	case RES_LIMIT:
3152 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3153 			ret = -EINVAL;
3154 			break;
3155 		}
3156 		/* This function does all the necessary parsing; reuse it */
3157 		ret = res_counter_memparse_write_strategy(buffer, &val);
3158 		if (ret)
3159 			break;
3160 		if (type == _MEM)
3161 			ret = mem_cgroup_resize_limit(memcg, val);
3162 		else
3163 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
3164 		break;
3165 	case RES_SOFT_LIMIT:
3166 		ret = res_counter_memparse_write_strategy(buffer, &val);
3167 		if (ret)
3168 			break;
3169 		/*
3170 		 * For memsw, soft limits are hard to implement in terms
3171 		 * of semantics. For now, we only support soft limits for
3172 		 * memory control without swap.
3173 		 */
3174 		if (type == _MEM)
3175 			ret = res_counter_set_soft_limit(&memcg->res, val);
3176 		else
3177 			ret = -EINVAL;
3178 		break;
3179 	default:
3180 		ret = -EINVAL; /* should be BUG() ? */
3181 		break;
3182 	}
3183 	return ret;
3184 }
3185 
3186 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3187 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
3188 {
3189 	struct cgroup *cgroup;
3190 	unsigned long long min_limit, min_memsw_limit, tmp;
3191 
3192 	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3193 	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3194 	cgroup = memcg->css.cgroup;
3195 	if (!memcg->use_hierarchy)
3196 		goto out;
3197 
3198 	while (cgroup->parent) {
3199 		cgroup = cgroup->parent;
3200 		memcg = mem_cgroup_from_cont(cgroup);
3201 		if (!memcg->use_hierarchy)
3202 			break;
3203 		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3204 		min_limit = min(min_limit, tmp);
3205 		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3206 		min_memsw_limit = min(min_memsw_limit, tmp);
3207 	}
3208 out:
3209 	*mem_limit = min_limit;
3210 	*memsw_limit = min_memsw_limit;
3211 	return;
3212 }
3213 
3214 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3215 {
3216 	struct mem_cgroup *mem;
3217 	int type, name;
3218 
3219 	mem = mem_cgroup_from_cont(cont);
3220 	type = MEMFILE_TYPE(event);
3221 	name = MEMFILE_ATTR(event);
3222 	switch (name) {
3223 	case RES_MAX_USAGE:
3224 		if (type == _MEM)
3225 			res_counter_reset_max(&mem->res);
3226 		else
3227 			res_counter_reset_max(&mem->memsw);
3228 		break;
3229 	case RES_FAILCNT:
3230 		if (type == _MEM)
3231 			res_counter_reset_failcnt(&mem->res);
3232 		else
3233 			res_counter_reset_failcnt(&mem->memsw);
3234 		break;
3235 	}
3236 
3237 	return 0;
3238 }
3239 
3240 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3241 					struct cftype *cft)
3242 {
3243 	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3244 }
3245 
3246 #ifdef CONFIG_MMU
3247 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3248 					struct cftype *cft, u64 val)
3249 {
3250 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3251 
3252 	if (val >= (1 << NR_MOVE_TYPE))
3253 		return -EINVAL;
3254 	/*
3255 	 * We check this value several times, both in can_attach() and
3256 	 * attach(), so we need cgroup lock to prevent this value from being
3257 	 * inconsistent.
3258 	 */
3259 	cgroup_lock();
3260 	mem->move_charge_at_immigrate = val;
3261 	cgroup_unlock();
3262 
3263 	return 0;
3264 }
3265 #else
3266 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3267 					struct cftype *cft, u64 val)
3268 {
3269 	return -ENOSYS;
3270 }
3271 #endif
3272 
3273 
3274 /* For read statistics */
3275 enum {
3276 	MCS_CACHE,
3277 	MCS_RSS,
3278 	MCS_FILE_MAPPED,
3279 	MCS_PGPGIN,
3280 	MCS_PGPGOUT,
3281 	MCS_SWAP,
3282 	MCS_INACTIVE_ANON,
3283 	MCS_ACTIVE_ANON,
3284 	MCS_INACTIVE_FILE,
3285 	MCS_ACTIVE_FILE,
3286 	MCS_UNEVICTABLE,
3287 	NR_MCS_STAT,
3288 };
3289 
3290 struct mcs_total_stat {
3291 	s64 stat[NR_MCS_STAT];
3292 };
3293 
3294 struct {
3295 	char *local_name;
3296 	char *total_name;
3297 } memcg_stat_strings[NR_MCS_STAT] = {
3298 	{"cache", "total_cache"},
3299 	{"rss", "total_rss"},
3300 	{"mapped_file", "total_mapped_file"},
3301 	{"pgpgin", "total_pgpgin"},
3302 	{"pgpgout", "total_pgpgout"},
3303 	{"swap", "total_swap"},
3304 	{"inactive_anon", "total_inactive_anon"},
3305 	{"active_anon", "total_active_anon"},
3306 	{"inactive_file", "total_inactive_file"},
3307 	{"active_file", "total_active_file"},
3308 	{"unevictable", "total_unevictable"}
3309 };
3310 
3311 
3312 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3313 {
3314 	struct mcs_total_stat *s = data;
3315 	s64 val;
3316 
3317 	/* per cpu stat */
3318 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3319 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
3320 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3321 	s->stat[MCS_RSS] += val * PAGE_SIZE;
3322 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3323 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3324 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3325 	s->stat[MCS_PGPGIN] += val;
3326 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3327 	s->stat[MCS_PGPGOUT] += val;
3328 	if (do_swap_account) {
3329 		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3330 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
3331 	}
3332 
3333 	/* per zone stat */
3334 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3335 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3336 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3337 	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3338 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3339 	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3340 	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3341 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3342 	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3343 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3344 	return 0;
3345 }
3346 
3347 static void
3348 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3349 {
3350 	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3351 }
3352 
3353 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3354 				 struct cgroup_map_cb *cb)
3355 {
3356 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3357 	struct mcs_total_stat mystat;
3358 	int i;
3359 
3360 	memset(&mystat, 0, sizeof(mystat));
3361 	mem_cgroup_get_local_stat(mem_cont, &mystat);
3362 
3363 	for (i = 0; i < NR_MCS_STAT; i++) {
3364 		if (i == MCS_SWAP && !do_swap_account)
3365 			continue;
3366 		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3367 	}
3368 
3369 	/* Hierarchical information */
3370 	{
3371 		unsigned long long limit, memsw_limit;
3372 		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3373 		cb->fill(cb, "hierarchical_memory_limit", limit);
3374 		if (do_swap_account)
3375 			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3376 	}
3377 
3378 	memset(&mystat, 0, sizeof(mystat));
3379 	mem_cgroup_get_total_stat(mem_cont, &mystat);
3380 	for (i = 0; i < NR_MCS_STAT; i++) {
3381 		if (i == MCS_SWAP && !do_swap_account)
3382 			continue;
3383 		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3384 	}
3385 
3386 #ifdef CONFIG_DEBUG_VM
3387 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3388 
3389 	{
3390 		int nid, zid;
3391 		struct mem_cgroup_per_zone *mz;
3392 		unsigned long recent_rotated[2] = {0, 0};
3393 		unsigned long recent_scanned[2] = {0, 0};
3394 
3395 		for_each_online_node(nid)
3396 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3397 				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3398 
3399 				recent_rotated[0] +=
3400 					mz->reclaim_stat.recent_rotated[0];
3401 				recent_rotated[1] +=
3402 					mz->reclaim_stat.recent_rotated[1];
3403 				recent_scanned[0] +=
3404 					mz->reclaim_stat.recent_scanned[0];
3405 				recent_scanned[1] +=
3406 					mz->reclaim_stat.recent_scanned[1];
3407 			}
3408 		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3409 		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3410 		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3411 		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3412 	}
3413 #endif
3414 
3415 	return 0;
3416 }
3417 
3418 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3419 {
3420 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3421 
3422 	return get_swappiness(memcg);
3423 }
3424 
3425 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3426 				       u64 val)
3427 {
3428 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3429 	struct mem_cgroup *parent;
3430 
3431 	if (val > 100)
3432 		return -EINVAL;
3433 
3434 	if (cgrp->parent == NULL)
3435 		return -EINVAL;
3436 
3437 	parent = mem_cgroup_from_cont(cgrp->parent);
3438 
3439 	cgroup_lock();
3440 
3441 	/* If under hierarchy, only empty-root can set this value */
3442 	if ((parent->use_hierarchy) ||
3443 	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3444 		cgroup_unlock();
3445 		return -EINVAL;
3446 	}
3447 
3448 	spin_lock(&memcg->reclaim_param_lock);
3449 	memcg->swappiness = val;
3450 	spin_unlock(&memcg->reclaim_param_lock);
3451 
3452 	cgroup_unlock();
3453 
3454 	return 0;
3455 }
3456 
3457 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3458 {
3459 	struct mem_cgroup_threshold_ary *t;
3460 	u64 usage;
3461 	int i;
3462 
3463 	rcu_read_lock();
3464 	if (!swap)
3465 		t = rcu_dereference(memcg->thresholds.primary);
3466 	else
3467 		t = rcu_dereference(memcg->memsw_thresholds.primary);
3468 
3469 	if (!t)
3470 		goto unlock;
3471 
3472 	usage = mem_cgroup_usage(memcg, swap);
3473 
3474 	/*
3475 	 * current_threshold points to the threshold just below usage.
3476 	 * If that's not true, a threshold was crossed after the last
3477 	 * call of __mem_cgroup_threshold().
3478 	 */
3479 	i = t->current_threshold;
3480 
3481 	/*
3482 	 * Iterate backward over array of thresholds starting from
3483 	 * current_threshold and check if a threshold is crossed.
3484 	 * If none of the thresholds below usage is crossed, we read
3485 	 * only one element of the array here.
3486 	 */
3487 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3488 		eventfd_signal(t->entries[i].eventfd, 1);
3489 
3490 	/* i = current_threshold + 1 */
3491 	i++;
3492 
3493 	/*
3494 	 * Iterate forward over array of thresholds starting from
3495 	 * current_threshold+1 and check if a threshold is crossed.
3496 	 * If none of the thresholds above usage is crossed, we read
3497 	 * only one element of the array here.
3498 	 */
3499 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3500 		eventfd_signal(t->entries[i].eventfd, 1);
3501 
3502 	/* Update current_threshold */
3503 	t->current_threshold = i - 1;
3504 unlock:
3505 	rcu_read_unlock();
3506 }
3507 
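/*
 * Worked example for __mem_cgroup_threshold() (illustrative values): with
 * thresholds of 4M, 8M and 16M registered (kept sorted ascending) and usage
 * currently at 10M, current_threshold indexes the 8M entry. If usage then
 * grows past 16M, the forward scan signals the 16M eventfd and
 * current_threshold moves to the 16M entry; if usage instead drops to 6M,
 * the backward scan signals the 8M eventfd and current_threshold falls back
 * to the 4M entry.
 */
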
3508 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3509 {
3510 	__mem_cgroup_threshold(memcg, false);
3511 	if (do_swap_account)
3512 		__mem_cgroup_threshold(memcg, true);
3513 }
3514 
3515 static int compare_thresholds(const void *a, const void *b)
3516 {
3517 	const struct mem_cgroup_threshold *_a = a;
3518 	const struct mem_cgroup_threshold *_b = b;
3519 	/* compare without truncating the u64 difference to int */
3520 	return (_a->threshold < _b->threshold) ? -1 : (_a->threshold > _b->threshold);
3521 }
3522 
3523 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3524 {
3525 	struct mem_cgroup_eventfd_list *ev;
3526 
3527 	list_for_each_entry(ev, &mem->oom_notify, list)
3528 		eventfd_signal(ev->eventfd, 1);
3529 	return 0;
3530 }
3531 
3532 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3533 {
3534 	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3535 }
3536 
3537 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3538 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3539 {
3540 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3541 	struct mem_cgroup_thresholds *thresholds;
3542 	struct mem_cgroup_threshold_ary *new;
3543 	int type = MEMFILE_TYPE(cft->private);
3544 	u64 threshold, usage;
3545 	int i, size, ret;
3546 
3547 	ret = res_counter_memparse_write_strategy(args, &threshold);
3548 	if (ret)
3549 		return ret;
3550 
3551 	mutex_lock(&memcg->thresholds_lock);
3552 
3553 	if (type == _MEM)
3554 		thresholds = &memcg->thresholds;
3555 	else if (type == _MEMSWAP)
3556 		thresholds = &memcg->memsw_thresholds;
3557 	else
3558 		BUG();
3559 
3560 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3561 
3562 	/* Check if a threshold was crossed before adding a new one */
3563 	if (thresholds->primary)
3564 		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3565 
3566 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3567 
3568 	/* Allocate memory for new array of thresholds */
3569 	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3570 			GFP_KERNEL);
3571 	if (!new) {
3572 		ret = -ENOMEM;
3573 		goto unlock;
3574 	}
3575 	new->size = size;
3576 
3577 	/* Copy thresholds (if any) to new array */
3578 	if (thresholds->primary) {
3579 		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3580 				sizeof(struct mem_cgroup_threshold));
3581 	}
3582 
3583 	/* Add new threshold */
3584 	new->entries[size - 1].eventfd = eventfd;
3585 	new->entries[size - 1].threshold = threshold;
3586 
3587 	/* Sort thresholds. Registering a new threshold isn't time-critical */
3588 	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3589 			compare_thresholds, NULL);
3590 
3591 	/* Find current threshold */
3592 	new->current_threshold = -1;
3593 	for (i = 0; i < size; i++) {
3594 		if (new->entries[i].threshold < usage) {
3595 			/*
3596 			 * new->current_threshold will not be used until
3597 			 * rcu_assign_pointer(), so it's safe to increment
3598 			 * it here.
3599 			 */
3600 			++new->current_threshold;
3601 		}
3602 	}
3603 
3604 	/* Free old spare buffer and save old primary buffer as spare */
3605 	kfree(thresholds->spare);
3606 	thresholds->spare = thresholds->primary;
3607 
3608 	rcu_assign_pointer(thresholds->primary, new);
3609 
3610 	/* To be sure that nobody uses thresholds */
3611 	synchronize_rcu();
3612 
3613 unlock:
3614 	mutex_unlock(&memcg->thresholds_lock);
3615 
3616 	return ret;
3617 }
3618 
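/*
 * Illustrative note on how the registration above is reached from userspace
 * (per Documentation/cgroups/memory.txt): the application creates an
 * eventfd, opens memory.usage_in_bytes (or memory.memsw.usage_in_bytes) and
 * writes "<event_fd> <fd of usage_in_bytes> <threshold>" to
 * cgroup.event_control; the eventfd is then signalled whenever usage crosses
 * the given threshold in either direction.
 */
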
3619 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3620 	struct cftype *cft, struct eventfd_ctx *eventfd)
3621 {
3622 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3623 	struct mem_cgroup_thresholds *thresholds;
3624 	struct mem_cgroup_threshold_ary *new;
3625 	int type = MEMFILE_TYPE(cft->private);
3626 	u64 usage;
3627 	int i, j, size;
3628 
3629 	mutex_lock(&memcg->thresholds_lock);
3630 	if (type == _MEM)
3631 		thresholds = &memcg->thresholds;
3632 	else if (type == _MEMSWAP)
3633 		thresholds = &memcg->memsw_thresholds;
3634 	else
3635 		BUG();
3636 
3637 	/*
3638 	 * Something went wrong if we are trying to unregister a threshold
3639 	 * when we don't have any thresholds.
3640 	 */
3641 	BUG_ON(!thresholds);
3642 
3643 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3644 
3645 	/* Check if a threshold was crossed before removing */
3646 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
3647 
3648 	/* Calculate the new number of thresholds */
3649 	size = 0;
3650 	for (i = 0; i < thresholds->primary->size; i++) {
3651 		if (thresholds->primary->entries[i].eventfd != eventfd)
3652 			size++;
3653 	}
3654 
3655 	new = thresholds->spare;
3656 
3657 	/* Set thresholds array to NULL if we don't have thresholds */
3658 	if (!size) {
3659 		kfree(new);
3660 		new = NULL;
3661 		goto swap_buffers;
3662 	}
3663 
3664 	new->size = size;
3665 
3666 	/* Copy thresholds and find current threshold */
3667 	new->current_threshold = -1;
3668 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3669 		if (thresholds->primary->entries[i].eventfd == eventfd)
3670 			continue;
3671 
3672 		new->entries[j] = thresholds->primary->entries[i];
3673 		if (new->entries[j].threshold < usage) {
3674 			/*
3675 			 * new->current_threshold will not be used
3676 			 * until rcu_assign_pointer(), so it's safe to increment
3677 			 * it here.
3678 			 */
3679 			++new->current_threshold;
3680 		}
3681 		j++;
3682 	}
3683 
3684 swap_buffers:
3685 	/* Swap primary and spare array */
3686 	thresholds->spare = thresholds->primary;
3687 	rcu_assign_pointer(thresholds->primary, new);
3688 
3689 	/* To be sure that nobody uses thresholds */
3690 	synchronize_rcu();
3691 
3692 	mutex_unlock(&memcg->thresholds_lock);
3693 }
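
/*
 * Register/unregister above rely on a double-buffering idiom: readers walk
 * thresholds->primary under rcu_read_lock() (see __mem_cgroup_threshold()),
 * while writers, serialized by thresholds_lock, build the replacement array
 * (reusing ->spare when possible), publish it with rcu_assign_pointer() and
 * keep the retired primary as ->spare.  synchronize_rcu() guarantees that no
 * reader can still see the retired array before it is reused or freed.
 */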
3694 
3695 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3696 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3697 {
3698 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3699 	struct mem_cgroup_eventfd_list *event;
3700 	int type = MEMFILE_TYPE(cft->private);
3701 
3702 	BUG_ON(type != _OOM_TYPE);
3703 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
3704 	if (!event)
3705 		return -ENOMEM;
3706 
3707 	mutex_lock(&memcg_oom_mutex);
3708 
3709 	event->eventfd = eventfd;
3710 	list_add(&event->list, &memcg->oom_notify);
3711 
3712 	/* already in OOM ? */
3713 	if (atomic_read(&memcg->oom_lock))
3714 		eventfd_signal(eventfd, 1);
3715 	mutex_unlock(&memcg_oom_mutex);
3716 
3717 	return 0;
3718 }
3719 
3720 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3721 	struct cftype *cft, struct eventfd_ctx *eventfd)
3722 {
3723 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3724 	struct mem_cgroup_eventfd_list *ev, *tmp;
3725 	int type = MEMFILE_TYPE(cft->private);
3726 
3727 	BUG_ON(type != _OOM_TYPE);
3728 
3729 	mutex_lock(&memcg_oom_mutex);
3730 
3731 	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3732 		if (ev->eventfd == eventfd) {
3733 			list_del(&ev->list);
3734 			kfree(ev);
3735 		}
3736 	}
3737 
3738 	mutex_unlock(&memcg_oom_mutex);
3739 }
3740 
3741 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3742 	struct cftype *cft,  struct cgroup_map_cb *cb)
3743 {
3744 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3745 
3746 	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
3747 
3748 	if (atomic_read(&mem->oom_lock))
3749 		cb->fill(cb, "under_oom", 1);
3750 	else
3751 		cb->fill(cb, "under_oom", 0);
3752 	return 0;
3753 }
3754 
3755 /* memory.oom_control write handler: toggle oom_kill_disable for this memcg. */
3757 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3758 	struct cftype *cft, u64 val)
3759 {
3760 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3761 	struct mem_cgroup *parent;
3762 
3763 	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
3764 	if (!cgrp->parent || !((val == 0) || (val == 1)))
3765 		return -EINVAL;
3766 
3767 	parent = mem_cgroup_from_cont(cgrp->parent);
3768 
3769 	cgroup_lock();
3770 	/* oom_kill_disable is a flag for the whole sub-hierarchy. */
3771 	if ((parent->use_hierarchy) ||
3772 	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
3773 		cgroup_unlock();
3774 		return -EINVAL;
3775 	}
3776 	mem->oom_kill_disable = val;
3777 	if (!val)
3778 		memcg_oom_recover(mem);
3779 	cgroup_unlock();
3780 	return 0;
3781 }
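
/*
 * Illustrative userspace sketch (not part of the kernel, only a hedged
 * example): disabling the OOM killer for a group and waiting for an OOM
 * notification through the eventfd registered via cgroup.event_control,
 * which exercises mem_cgroup_oom_control_write() and
 * mem_cgroup_oom_register_event() above.  The /cgroup/memory mount point
 * and the "grp" group name are assumptions made up for the example.
 *
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char cmd[32];
 *		uint64_t cnt;
 *		int efd = eventfd(0, 0);
 *		int ofd = open("/cgroup/memory/grp/memory.oom_control", O_RDWR);
 *		int cfd = open("/cgroup/memory/grp/cgroup.event_control", O_WRONLY);
 *
 *		write(ofd, "1", 1);		// oom_kill_disable = 1
 *
 *		// wire format: "<event_fd> <fd of memory.oom_control>"
 *		snprintf(cmd, sizeof(cmd), "%d %d", efd, ofd);
 *		write(cfd, cmd, strlen(cmd));
 *
 *		read(efd, &cnt, sizeof(cnt));	// woken when the group hits its limit
 *		printf("cgroup is under OOM\n");
 *		return 0;
 *	}
 *
 * Error handling is omitted for brevity.
 */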
3782 
3783 static struct cftype mem_cgroup_files[] = {
3784 	{
3785 		.name = "usage_in_bytes",
3786 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3787 		.read_u64 = mem_cgroup_read,
3788 		.register_event = mem_cgroup_usage_register_event,
3789 		.unregister_event = mem_cgroup_usage_unregister_event,
3790 	},
3791 	{
3792 		.name = "max_usage_in_bytes",
3793 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3794 		.trigger = mem_cgroup_reset,
3795 		.read_u64 = mem_cgroup_read,
3796 	},
3797 	{
3798 		.name = "limit_in_bytes",
3799 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3800 		.write_string = mem_cgroup_write,
3801 		.read_u64 = mem_cgroup_read,
3802 	},
3803 	{
3804 		.name = "soft_limit_in_bytes",
3805 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3806 		.write_string = mem_cgroup_write,
3807 		.read_u64 = mem_cgroup_read,
3808 	},
3809 	{
3810 		.name = "failcnt",
3811 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3812 		.trigger = mem_cgroup_reset,
3813 		.read_u64 = mem_cgroup_read,
3814 	},
3815 	{
3816 		.name = "stat",
3817 		.read_map = mem_control_stat_show,
3818 	},
3819 	{
3820 		.name = "force_empty",
3821 		.trigger = mem_cgroup_force_empty_write,
3822 	},
3823 	{
3824 		.name = "use_hierarchy",
3825 		.write_u64 = mem_cgroup_hierarchy_write,
3826 		.read_u64 = mem_cgroup_hierarchy_read,
3827 	},
3828 	{
3829 		.name = "swappiness",
3830 		.read_u64 = mem_cgroup_swappiness_read,
3831 		.write_u64 = mem_cgroup_swappiness_write,
3832 	},
3833 	{
3834 		.name = "move_charge_at_immigrate",
3835 		.read_u64 = mem_cgroup_move_charge_read,
3836 		.write_u64 = mem_cgroup_move_charge_write,
3837 	},
3838 	{
3839 		.name = "oom_control",
3840 		.read_map = mem_cgroup_oom_control_read,
3841 		.write_u64 = mem_cgroup_oom_control_write,
3842 		.register_event = mem_cgroup_oom_register_event,
3843 		.unregister_event = mem_cgroup_oom_unregister_event,
3844 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3845 	},
3846 };
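
/*
 * Illustrative userspace sketch (not part of the kernel, only a hedged
 * example): each cftype above shows up as a control file in the memory
 * cgroup directory.  Setting a hard limit, for instance, is a plain write to
 * memory.limit_in_bytes, handled by mem_cgroup_write(), which accepts byte
 * counts as well as K/M/G suffixes.  The mount point and group name are
 * assumptions made up for the example.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int write_memcg_file(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, val, strlen(val)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 *
 * Usage (hypothetical paths):
 *	write_memcg_file("/cgroup/memory/grp/memory.limit_in_bytes", "64M");
 *	write_memcg_file("/cgroup/memory/grp/memory.swappiness", "10");
 */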
3847 
3848 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3849 static struct cftype memsw_cgroup_files[] = {
3850 	{
3851 		.name = "memsw.usage_in_bytes",
3852 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3853 		.read_u64 = mem_cgroup_read,
3854 		.register_event = mem_cgroup_usage_register_event,
3855 		.unregister_event = mem_cgroup_usage_unregister_event,
3856 	},
3857 	{
3858 		.name = "memsw.max_usage_in_bytes",
3859 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3860 		.trigger = mem_cgroup_reset,
3861 		.read_u64 = mem_cgroup_read,
3862 	},
3863 	{
3864 		.name = "memsw.limit_in_bytes",
3865 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3866 		.write_string = mem_cgroup_write,
3867 		.read_u64 = mem_cgroup_read,
3868 	},
3869 	{
3870 		.name = "memsw.failcnt",
3871 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3872 		.trigger = mem_cgroup_reset,
3873 		.read_u64 = mem_cgroup_read,
3874 	},
3875 };
3876 
3877 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3878 {
3879 	if (!do_swap_account)
3880 		return 0;
3881 	return cgroup_add_files(cont, ss, memsw_cgroup_files,
3882 				ARRAY_SIZE(memsw_cgroup_files));
3883 };
3884 #else
3885 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3886 {
3887 	return 0;
3888 }
3889 #endif
3890 
3891 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3892 {
3893 	struct mem_cgroup_per_node *pn;
3894 	struct mem_cgroup_per_zone *mz;
3895 	enum lru_list l;
3896 	int zone, tmp = node;
3897 	/*
3898 	 * This routine is called against all possible nodes,
3899 	 * but it's a BUG to call kmalloc() against an offline node.
3900 	 *
3901 	 * TODO: this routine can waste much memory for nodes which will
3902 	 *       never be onlined. It's better to use a memory hotplug
3903 	 *       callback function.
3904 	 */
3905 	if (!node_state(node, N_NORMAL_MEMORY))
3906 		tmp = -1;
3907 	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3908 	if (!pn)
3909 		return 1;
3910 
3911 	mem->info.nodeinfo[node] = pn;
3912 	memset(pn, 0, sizeof(*pn));
3913 
3914 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3915 		mz = &pn->zoneinfo[zone];
3916 		for_each_lru(l)
3917 			INIT_LIST_HEAD(&mz->lists[l]);
3918 		mz->usage_in_excess = 0;
3919 		mz->on_tree = false;
3920 		mz->mem = mem;
3921 	}
3922 	return 0;
3923 }
3924 
3925 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3926 {
3927 	kfree(mem->info.nodeinfo[node]);
3928 }
3929 
3930 static struct mem_cgroup *mem_cgroup_alloc(void)
3931 {
3932 	struct mem_cgroup *mem;
3933 	int size = sizeof(struct mem_cgroup);
3934 
3935 	/* Can be very big if MAX_NUMNODES is very big */
3936 	if (size < PAGE_SIZE)
3937 		mem = kmalloc(size, GFP_KERNEL);
3938 	else
3939 		mem = vmalloc(size);
3940 
3941 	if (!mem)
3942 		return NULL;
3943 
3944 	memset(mem, 0, size);
3945 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3946 	if (!mem->stat) {
3947 		if (size < PAGE_SIZE)
3948 			kfree(mem);
3949 		else
3950 			vfree(mem);
3951 		mem = NULL;
3952 	}
3953 	return mem;
3954 }
3955 
3956 /*
3957  * When a mem_cgroup is destroyed, references from swap_cgroup can remain
3958  * (scanning them all at force_empty would be too costly).
3959  *
3960  * Instead of clearing all references at force_empty, we remember
3961  * the number of references from swap_cgroup and free the mem_cgroup
3962  * when it drops to 0.
3963  *
3964  * Removal of the cgroup itself succeeds regardless of refs from swap.
3965  */
3966 
3967 static void __mem_cgroup_free(struct mem_cgroup *mem)
3968 {
3969 	int node;
3970 
3971 	mem_cgroup_remove_from_trees(mem);
3972 	free_css_id(&mem_cgroup_subsys, &mem->css);
3973 
3974 	for_each_node_state(node, N_POSSIBLE)
3975 		free_mem_cgroup_per_zone_info(mem, node);
3976 
3977 	free_percpu(mem->stat);
3978 	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3979 		kfree(mem);
3980 	else
3981 		vfree(mem);
3982 }
3983 
3984 static void mem_cgroup_get(struct mem_cgroup *mem)
3985 {
3986 	atomic_inc(&mem->refcnt);
3987 }
3988 
3989 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3990 {
3991 	if (atomic_sub_and_test(count, &mem->refcnt)) {
3992 		struct mem_cgroup *parent = parent_mem_cgroup(mem);
3993 		__mem_cgroup_free(mem);
3994 		if (parent)
3995 			mem_cgroup_put(parent);
3996 	}
3997 }
3998 
3999 static void mem_cgroup_put(struct mem_cgroup *mem)
4000 {
4001 	__mem_cgroup_put(mem, 1);
4002 }
4003 
4004 /*
4005  * Returns the parent mem_cgroup in the memcg hierarchy, or NULL when hierarchy is not used.
4006  */
4007 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4008 {
4009 	if (!mem->res.parent)
4010 		return NULL;
4011 	return mem_cgroup_from_res_counter(mem->res.parent, res);
4012 }
4013 
4014 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4015 static void __init enable_swap_cgroup(void)
4016 {
4017 	if (!mem_cgroup_disabled() && really_do_swap_account)
4018 		do_swap_account = 1;
4019 }
4020 #else
4021 static void __init enable_swap_cgroup(void)
4022 {
4023 }
4024 #endif
4025 
4026 static int mem_cgroup_soft_limit_tree_init(void)
4027 {
4028 	struct mem_cgroup_tree_per_node *rtpn;
4029 	struct mem_cgroup_tree_per_zone *rtpz;
4030 	int tmp, node, zone;
4031 
4032 	for_each_node_state(node, N_POSSIBLE) {
4033 		tmp = node;
4034 		if (!node_state(node, N_NORMAL_MEMORY))
4035 			tmp = -1;
4036 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4037 		if (!rtpn)
4038 			return 1;
4039 
4040 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
4041 
4042 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4043 			rtpz = &rtpn->rb_tree_per_zone[zone];
4044 			rtpz->rb_root = RB_ROOT;
4045 			spin_lock_init(&rtpz->lock);
4046 		}
4047 	}
4048 	return 0;
4049 }
4050 
4051 static struct cgroup_subsys_state * __ref
4052 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4053 {
4054 	struct mem_cgroup *mem, *parent;
4055 	long error = -ENOMEM;
4056 	int node;
4057 
4058 	mem = mem_cgroup_alloc();
4059 	if (!mem)
4060 		return ERR_PTR(error);
4061 
4062 	for_each_node_state(node, N_POSSIBLE)
4063 		if (alloc_mem_cgroup_per_zone_info(mem, node))
4064 			goto free_out;
4065 
4066 	/* root ? */
4067 	if (cont->parent == NULL) {
4068 		int cpu;
4069 		enable_swap_cgroup();
4070 		parent = NULL;
4071 		root_mem_cgroup = mem;
4072 		if (mem_cgroup_soft_limit_tree_init())
4073 			goto free_out;
4074 		for_each_possible_cpu(cpu) {
4075 			struct memcg_stock_pcp *stock =
4076 						&per_cpu(memcg_stock, cpu);
4077 			INIT_WORK(&stock->work, drain_local_stock);
4078 		}
4079 		hotcpu_notifier(memcg_stock_cpu_callback, 0);
4080 	} else {
4081 		parent = mem_cgroup_from_cont(cont->parent);
4082 		mem->use_hierarchy = parent->use_hierarchy;
4083 		mem->oom_kill_disable = parent->oom_kill_disable;
4084 	}
4085 
4086 	if (parent && parent->use_hierarchy) {
4087 		res_counter_init(&mem->res, &parent->res);
4088 		res_counter_init(&mem->memsw, &parent->memsw);
4089 		/*
4090 		 * We increment refcnt of the parent to ensure that we can
4091 		 * safely access it on res_counter_charge/uncharge.
4092 		 * This refcnt will be decremented when freeing this
4093 		 * mem_cgroup(see mem_cgroup_put).
4094 		 */
4095 		mem_cgroup_get(parent);
4096 	} else {
4097 		res_counter_init(&mem->res, NULL);
4098 		res_counter_init(&mem->memsw, NULL);
4099 	}
4100 	mem->last_scanned_child = 0;
4101 	spin_lock_init(&mem->reclaim_param_lock);
4102 	INIT_LIST_HEAD(&mem->oom_notify);
4103 
4104 	if (parent)
4105 		mem->swappiness = get_swappiness(parent);
4106 	atomic_set(&mem->refcnt, 1);
4107 	mem->move_charge_at_immigrate = 0;
4108 	mutex_init(&mem->thresholds_lock);
4109 	return &mem->css;
4110 free_out:
4111 	__mem_cgroup_free(mem);
4112 	root_mem_cgroup = NULL;
4113 	return ERR_PTR(error);
4114 }
4115 
4116 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4117 					struct cgroup *cont)
4118 {
4119 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4120 
4121 	return mem_cgroup_force_empty(mem, false);
4122 }
4123 
4124 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4125 				struct cgroup *cont)
4126 {
4127 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4128 
4129 	mem_cgroup_put(mem);
4130 }
4131 
4132 static int mem_cgroup_populate(struct cgroup_subsys *ss,
4133 				struct cgroup *cont)
4134 {
4135 	int ret;
4136 
4137 	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4138 				ARRAY_SIZE(mem_cgroup_files));
4139 
4140 	if (!ret)
4141 		ret = register_memsw_files(cont, ss);
4142 	return ret;
4143 }
4144 
4145 #ifdef CONFIG_MMU
4146 /* Handlers for move charge at task migration. */
4147 #define PRECHARGE_COUNT_AT_ONCE	256
4148 static int mem_cgroup_do_precharge(unsigned long count)
4149 {
4150 	int ret = 0;
4151 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
4152 	struct mem_cgroup *mem = mc.to;
4153 
4154 	if (mem_cgroup_is_root(mem)) {
4155 		mc.precharge += count;
4156 		/* we don't need css_get for root */
4157 		return ret;
4158 	}
4159 	/* try to charge at once */
4160 	if (count > 1) {
4161 		struct res_counter *dummy;
4162 		/*
4163 		 * "mem" cannot be under rmdir() because we've already checked
4164 		 * by cgroup_lock_live_cgroup() that it is not removed and we
4165 		 * are still under the same cgroup_mutex. So we can postpone
4166 		 * css_get().
4167 		 */
4168 		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4169 			goto one_by_one;
4170 		if (do_swap_account && res_counter_charge(&mem->memsw,
4171 						PAGE_SIZE * count, &dummy)) {
4172 			res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4173 			goto one_by_one;
4174 		}
4175 		mc.precharge += count;
4176 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4177 		WARN_ON_ONCE(count > INT_MAX);
4178 		__css_get(&mem->css, (int)count);
4179 		return ret;
4180 	}
4181 one_by_one:
4182 	/* fall back to one by one charge */
4183 	while (count--) {
4184 		if (signal_pending(current)) {
4185 			ret = -EINTR;
4186 			break;
4187 		}
4188 		if (!batch_count--) {
4189 			batch_count = PRECHARGE_COUNT_AT_ONCE;
4190 			cond_resched();
4191 		}
4192 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4193 		if (ret || !mem)
4194 			/* mem_cgroup_clear_mc() will do uncharge later */
4195 			return -ENOMEM;
4196 		mc.precharge++;
4197 	}
4198 	return ret;
4199 }
4200 
4201 /**
4202  * is_target_pte_for_mc - check whether a pte is a valid target for move charge
4203  * @vma: the vma to which the pte to be checked belongs
4204  * @addr: the address corresponding to the pte to be checked
4205  * @ptent: the pte to be checked
4206  * @target: pointer where the target page or swap entry will be stored (can be NULL)
4207  *
4208  * Returns
4209  *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
4210  *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4211  *     move charge. If @target is not NULL, the page is stored in target->page
4212  *     with an extra refcount taken (callers should handle it).
4213  *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4214  *     target for charge migration. If @target is not NULL, the entry is stored
4215  *     in target->ent.
4216  *
4217  * Called with pte lock held.
4218  */
4219 union mc_target {
4220 	struct page	*page;
4221 	swp_entry_t	ent;
4222 };
4223 
4224 enum mc_target_type {
4225 	MC_TARGET_NONE,	/* not used */
4226 	MC_TARGET_PAGE,
4227 	MC_TARGET_SWAP,
4228 };
4229 
4230 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4231 						unsigned long addr, pte_t ptent)
4232 {
4233 	struct page *page = vm_normal_page(vma, addr, ptent);
4234 
4235 	if (!page || !page_mapped(page))
4236 		return NULL;
4237 	if (PageAnon(page)) {
4238 		/* we don't move shared anon */
4239 		if (!move_anon() || page_mapcount(page) > 2)
4240 			return NULL;
4241 	} else if (!move_file())
4242 		/* we ignore mapcount for file pages */
4243 		return NULL;
4244 	if (!get_page_unless_zero(page))
4245 		return NULL;
4246 
4247 	return page;
4248 }
4249 
4250 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4251 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4252 {
4253 	int usage_count;
4254 	struct page *page = NULL;
4255 	swp_entry_t ent = pte_to_swp_entry(ptent);
4256 
4257 	if (!move_anon() || non_swap_entry(ent))
4258 		return NULL;
4259 	usage_count = mem_cgroup_count_swap_user(ent, &page);
4260 	if (usage_count > 1) { /* we don't move shared anon */
4261 		if (page)
4262 			put_page(page);
4263 		return NULL;
4264 	}
4265 	if (do_swap_account)
4266 		entry->val = ent.val;
4267 
4268 	return page;
4269 }
4270 
4271 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4272 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
4273 {
4274 	struct page *page = NULL;
4275 	struct inode *inode;
4276 	struct address_space *mapping;
4277 	pgoff_t pgoff;
4278 
4279 	if (!vma->vm_file) /* anonymous vma */
4280 		return NULL;
4281 	if (!move_file())
4282 		return NULL;
4283 
4284 	inode = vma->vm_file->f_path.dentry->d_inode;
4285 	mapping = vma->vm_file->f_mapping;
4286 	if (pte_none(ptent))
4287 		pgoff = linear_page_index(vma, addr);
4288 	else /* pte_file(ptent) is true */
4289 		pgoff = pte_to_pgoff(ptent);
4290 
4291 	/* The page is moved even if it's not RSS of this task (page-faulted). */
4292 	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
4293 		page = find_get_page(mapping, pgoff);
4294 	} else { /* shmem/tmpfs file. we should take account of swap too. */
4295 		swp_entry_t ent;
4296 		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
4297 		if (do_swap_account)
4298 			entry->val = ent.val;
4299 	}
4300 
4301 	return page;
4302 }
4303 
4304 static int is_target_pte_for_mc(struct vm_area_struct *vma,
4305 		unsigned long addr, pte_t ptent, union mc_target *target)
4306 {
4307 	struct page *page = NULL;
4308 	struct page_cgroup *pc;
4309 	int ret = 0;
4310 	swp_entry_t ent = { .val = 0 };
4311 
4312 	if (pte_present(ptent))
4313 		page = mc_handle_present_pte(vma, addr, ptent);
4314 	else if (is_swap_pte(ptent))
4315 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4316 	else if (pte_none(ptent) || pte_file(ptent))
4317 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
4318 
4319 	if (!page && !ent.val)
4320 		return 0;
4321 	if (page) {
4322 		pc = lookup_page_cgroup(page);
4323 		/*
4324 		 * Do only loose check w/o page_cgroup lock.
4325 		 * mem_cgroup_move_account() checks whether the pc is valid
4326 		 * under the lock.
4327 		 */
4328 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4329 			ret = MC_TARGET_PAGE;
4330 			if (target)
4331 				target->page = page;
4332 		}
4333 		if (!ret || !target)
4334 			put_page(page);
4335 	}
4336 	/* There is a swap entry and a page doesn't exist or isn't charged */
4337 	if (ent.val && !ret &&
4338 			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4339 		ret = MC_TARGET_SWAP;
4340 		if (target)
4341 			target->ent = ent;
4342 	}
4343 	return ret;
4344 }
4345 
4346 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4347 					unsigned long addr, unsigned long end,
4348 					struct mm_walk *walk)
4349 {
4350 	struct vm_area_struct *vma = walk->private;
4351 	pte_t *pte;
4352 	spinlock_t *ptl;
4353 
4354 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4355 	for (; addr != end; pte++, addr += PAGE_SIZE)
4356 		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4357 			mc.precharge++;	/* increment precharge temporarily */
4358 	pte_unmap_unlock(pte - 1, ptl);
4359 	cond_resched();
4360 
4361 	return 0;
4362 }
4363 
4364 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4365 {
4366 	unsigned long precharge;
4367 	struct vm_area_struct *vma;
4368 
4369 	down_read(&mm->mmap_sem);
4370 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
4371 		struct mm_walk mem_cgroup_count_precharge_walk = {
4372 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
4373 			.mm = mm,
4374 			.private = vma,
4375 		};
4376 		if (is_vm_hugetlb_page(vma))
4377 			continue;
4378 		walk_page_range(vma->vm_start, vma->vm_end,
4379 					&mem_cgroup_count_precharge_walk);
4380 	}
4381 	up_read(&mm->mmap_sem);
4382 
4383 	precharge = mc.precharge;
4384 	mc.precharge = 0;
4385 
4386 	return precharge;
4387 }
4388 
4389 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4390 {
4391 	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4392 }
4393 
4394 static void mem_cgroup_clear_mc(void)
4395 {
4396 	/* we must uncharge all the leftover precharges from mc.to */
4397 	if (mc.precharge) {
4398 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
4399 		mc.precharge = 0;
4400 		memcg_oom_recover(mc.to);
4401 	}
4402 	/*
4403 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4404 	 * we must uncharge here.
4405 	 */
4406 	if (mc.moved_charge) {
4407 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4408 		mc.moved_charge = 0;
4409 		memcg_oom_recover(mc.from);
4410 	}
4411 	/* we must fixup refcnts and charges */
4412 	if (mc.moved_swap) {
4413 		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4414 		/* uncharge swap account from the old cgroup */
4415 		if (!mem_cgroup_is_root(mc.from))
4416 			res_counter_uncharge(&mc.from->memsw,
4417 						PAGE_SIZE * mc.moved_swap);
4418 		__mem_cgroup_put(mc.from, mc.moved_swap);
4419 
4420 		if (!mem_cgroup_is_root(mc.to)) {
4421 			/*
4422 			 * we charged both to->res and to->memsw, so we should
4423 			 * uncharge to->res.
4424 			 */
4425 			res_counter_uncharge(&mc.to->res,
4426 						PAGE_SIZE * mc.moved_swap);
4427 			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4428 			__css_put(&mc.to->css, mc.moved_swap);
4429 		}
4430 		/* we've already done mem_cgroup_get(mc.to) */
4431 
4432 		mc.moved_swap = 0;
4433 	}
4434 	mc.from = NULL;
4435 	mc.to = NULL;
4436 	mc.moving_task = NULL;
4437 	wake_up_all(&mc.waitq);
4438 }
4439 
4440 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4441 				struct cgroup *cgroup,
4442 				struct task_struct *p,
4443 				bool threadgroup)
4444 {
4445 	int ret = 0;
4446 	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4447 
4448 	if (mem->move_charge_at_immigrate) {
4449 		struct mm_struct *mm;
4450 		struct mem_cgroup *from = mem_cgroup_from_task(p);
4451 
4452 		VM_BUG_ON(from == mem);
4453 
4454 		mm = get_task_mm(p);
4455 		if (!mm)
4456 			return 0;
4457 		/* We move charges only when we move an owner of the mm */
4458 		if (mm->owner == p) {
4459 			VM_BUG_ON(mc.from);
4460 			VM_BUG_ON(mc.to);
4461 			VM_BUG_ON(mc.precharge);
4462 			VM_BUG_ON(mc.moved_charge);
4463 			VM_BUG_ON(mc.moved_swap);
4464 			VM_BUG_ON(mc.moving_task);
4465 			mc.from = from;
4466 			mc.to = mem;
4467 			mc.precharge = 0;
4468 			mc.moved_charge = 0;
4469 			mc.moved_swap = 0;
4470 			mc.moving_task = current;
4471 
4472 			ret = mem_cgroup_precharge_mc(mm);
4473 			if (ret)
4474 				mem_cgroup_clear_mc();
4475 		}
4476 		mmput(mm);
4477 	}
4478 	return ret;
4479 }
4480 
4481 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4482 				struct cgroup *cgroup,
4483 				struct task_struct *p,
4484 				bool threadgroup)
4485 {
4486 	mem_cgroup_clear_mc();
4487 }
4488 
4489 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4490 				unsigned long addr, unsigned long end,
4491 				struct mm_walk *walk)
4492 {
4493 	int ret = 0;
4494 	struct vm_area_struct *vma = walk->private;
4495 	pte_t *pte;
4496 	spinlock_t *ptl;
4497 
4498 retry:
4499 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4500 	for (; addr != end; addr += PAGE_SIZE) {
4501 		pte_t ptent = *(pte++);
4502 		union mc_target target;
4503 		int type;
4504 		struct page *page;
4505 		struct page_cgroup *pc;
4506 		swp_entry_t ent;
4507 
4508 		if (!mc.precharge)
4509 			break;
4510 
4511 		type = is_target_pte_for_mc(vma, addr, ptent, &target);
4512 		switch (type) {
4513 		case MC_TARGET_PAGE:
4514 			page = target.page;
4515 			if (isolate_lru_page(page))
4516 				goto put;
4517 			pc = lookup_page_cgroup(page);
4518 			if (!mem_cgroup_move_account(pc,
4519 						mc.from, mc.to, false)) {
4520 				mc.precharge--;
4521 				/* we uncharge from mc.from later. */
4522 				mc.moved_charge++;
4523 			}
4524 			putback_lru_page(page);
4525 put:			/* is_target_pte_for_mc() gets the page */
4526 			put_page(page);
4527 			break;
4528 		case MC_TARGET_SWAP:
4529 			ent = target.ent;
4530 			if (!mem_cgroup_move_swap_account(ent,
4531 						mc.from, mc.to, false)) {
4532 				mc.precharge--;
4533 				/* we fixup refcnts and charges later. */
4534 				mc.moved_swap++;
4535 			}
4536 			break;
4537 		default:
4538 			break;
4539 		}
4540 	}
4541 	pte_unmap_unlock(pte - 1, ptl);
4542 	cond_resched();
4543 
4544 	if (addr != end) {
4545 		/*
4546 		 * We have consumed all precharges we got in can_attach().
4547 		 * We try to charge one by one, but don't do any additional
4548 		 * charges to mc.to once a charge has failed during the
4549 		 * attach() phase.
4550 		 */
4551 		ret = mem_cgroup_do_precharge(1);
4552 		if (!ret)
4553 			goto retry;
4554 	}
4555 
4556 	return ret;
4557 }
4558 
4559 static void mem_cgroup_move_charge(struct mm_struct *mm)
4560 {
4561 	struct vm_area_struct *vma;
4562 
4563 	lru_add_drain_all();
4564 	down_read(&mm->mmap_sem);
4565 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
4566 		int ret;
4567 		struct mm_walk mem_cgroup_move_charge_walk = {
4568 			.pmd_entry = mem_cgroup_move_charge_pte_range,
4569 			.mm = mm,
4570 			.private = vma,
4571 		};
4572 		if (is_vm_hugetlb_page(vma))
4573 			continue;
4574 		ret = walk_page_range(vma->vm_start, vma->vm_end,
4575 						&mem_cgroup_move_charge_walk);
4576 		if (ret)
4577 			/*
4578 			 * This means we have consumed all precharges and failed
4579 			 * to do an additional charge. Just abandon here.
4580 			 */
4581 			break;
4582 	}
4583 	up_read(&mm->mmap_sem);
4584 }
4585 
4586 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4587 				struct cgroup *cont,
4588 				struct cgroup *old_cont,
4589 				struct task_struct *p,
4590 				bool threadgroup)
4591 {
4592 	struct mm_struct *mm;
4593 
4594 	if (!mc.to)
4595 		/* no need to move charge */
4596 		return;
4597 
4598 	mm = get_task_mm(p);
4599 	if (mm) {
4600 		mem_cgroup_move_charge(mm);
4601 		mmput(mm);
4602 	}
4603 	mem_cgroup_clear_mc();
4604 }
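
/*
 * Illustrative userspace sketch (not part of the kernel, only a hedged
 * example): the charge-moving path above is driven from the cgroup
 * filesystem.  Writing 3 to memory.move_charge_at_immigrate requests that
 * both anon (bit 0) and file (bit 1) pages follow the task; moving the task
 * itself is an ordinary write of its pid to the destination group's tasks
 * file.  The paths and the pid are assumptions made up for the example.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int echo_to(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		write(fd, val, strlen(val));
 *		return close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		char pid[16];
 *
 *		echo_to("/cgroup/memory/dst/memory.move_charge_at_immigrate", "3");
 *		snprintf(pid, sizeof(pid), "%d", 1234);	// pid of the task to move
 *		echo_to("/cgroup/memory/dst/tasks", pid);
 *		return 0;
 *	}
 */
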
4605 #else	/* !CONFIG_MMU */
4606 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4607 				struct cgroup *cgroup,
4608 				struct task_struct *p,
4609 				bool threadgroup)
4610 {
4611 	return 0;
4612 }
4613 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4614 				struct cgroup *cgroup,
4615 				struct task_struct *p,
4616 				bool threadgroup)
4617 {
4618 }
4619 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4620 				struct cgroup *cont,
4621 				struct cgroup *old_cont,
4622 				struct task_struct *p,
4623 				bool threadgroup)
4624 {
4625 }
4626 #endif
4627 
4628 struct cgroup_subsys mem_cgroup_subsys = {
4629 	.name = "memory",
4630 	.subsys_id = mem_cgroup_subsys_id,
4631 	.create = mem_cgroup_create,
4632 	.pre_destroy = mem_cgroup_pre_destroy,
4633 	.destroy = mem_cgroup_destroy,
4634 	.populate = mem_cgroup_populate,
4635 	.can_attach = mem_cgroup_can_attach,
4636 	.cancel_attach = mem_cgroup_cancel_attach,
4637 	.attach = mem_cgroup_move_task,
4638 	.early_init = 0,
4639 	.use_id = 1,
4640 };
4641 
4642 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4643 
4644 static int __init disable_swap_account(char *s)
4645 {
4646 	really_do_swap_account = 0;
4647 	return 1;
4648 }
4649 __setup("noswapaccount", disable_swap_account);
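
/*
 * Boot-time usage: appending "noswapaccount" to the kernel command line runs
 * disable_swap_account() above via the __setup() hook, so memsw accounting
 * stays off even when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is built in.
 */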
4650 #endif
4651