xref: /linux/mm/memcontrol.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  */
19 
20 #include <linux/res_counter.h>
21 #include <linux/memcontrol.h>
22 #include <linux/cgroup.h>
23 #include <linux/mm.h>
24 #include <linux/pagemap.h>
25 #include <linux/smp.h>
26 #include <linux/page-flags.h>
27 #include <linux/backing-dev.h>
28 #include <linux/bit_spinlock.h>
29 #include <linux/rcupdate.h>
30 #include <linux/mutex.h>
31 #include <linux/slab.h>
32 #include <linux/swap.h>
33 #include <linux/spinlock.h>
34 #include <linux/fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/vmalloc.h>
37 #include <linux/mm_inline.h>
38 #include <linux/page_cgroup.h>
39 #include "internal.h"
40 
41 #include <asm/uaccess.h>
42 
43 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
44 #define MEM_CGROUP_RECLAIM_RETRIES	5
45 
46 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
48 int do_swap_account __read_mostly;
49 static int really_do_swap_account __initdata = 1; /* to remember the boot option */
50 #else
51 #define do_swap_account		(0)
52 #endif
53 
54 static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */
55 
56 /*
57  * Statistics for memory cgroup.
58  */
59 enum mem_cgroup_stat_index {
60 	/*
61 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
62 	 */
63 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
64 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
65 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
66 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
67 
68 	MEM_CGROUP_STAT_NSTATS,
69 };
70 
71 struct mem_cgroup_stat_cpu {
72 	s64 count[MEM_CGROUP_STAT_NSTATS];
73 } ____cacheline_aligned_in_smp;
74 
75 struct mem_cgroup_stat {
76 	struct mem_cgroup_stat_cpu cpustat[0];
77 };
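/*
 * Note: cpustat[] above is a zero-length array; the per-cpu slots are
 * expected to be allocated together with the owning struct mem_cgroup,
 * which is why the statistics member must sit at the end of struct
 * mem_cgroup (see the comment there).
 */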
78 
79 /*
80  * For accounting under irq disable, there is no need to increment the preempt count.
81  */
82 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
83 		enum mem_cgroup_stat_index idx, int val)
84 {
85 	stat->count[idx] += val;
86 }
87 
88 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
89 		enum mem_cgroup_stat_index idx)
90 {
91 	int cpu;
92 	s64 ret = 0;
93 	for_each_possible_cpu(cpu)
94 		ret += stat->cpustat[cpu].count[idx];
95 	return ret;
96 }
97 
98 /*
99  * per-zone information in memory controller.
100  */
101 struct mem_cgroup_per_zone {
102 	/*
103 	 * spin_lock to protect the per cgroup LRU
104 	 */
105 	struct list_head	lists[NR_LRU_LISTS];
106 	unsigned long		count[NR_LRU_LISTS];
107 
108 	struct zone_reclaim_stat reclaim_stat;
109 };
110 /* Macro for accessing counter */
111 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
112 
113 struct mem_cgroup_per_node {
114 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
115 };
116 
117 struct mem_cgroup_lru_info {
118 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
119 };
120 
121 /*
122  * The memory controller data structure. The memory controller controls both
123  * page cache and RSS per cgroup. We would eventually like to provide
124  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
125  * to help the administrator determine what knobs to tune.
126  *
127  * TODO: Add a water mark for the memory controller. Reclaim will begin when
128  * we hit the water mark. Maybe even add a low water mark, such that
129  * no reclaim occurs from a cgroup at its low water mark; this is
130  * a feature that will be implemented much later in the future.
131  */
132 struct mem_cgroup {
133 	struct cgroup_subsys_state css;
134 	/*
135 	 * the counter to account for memory usage
136 	 */
137 	struct res_counter res;
138 	/*
139 	 * the counter to account for mem+swap usage.
140 	 */
141 	struct res_counter memsw;
142 	/*
143 	 * Per cgroup active and inactive list, similar to the
144 	 * per zone LRU lists.
145 	 */
146 	struct mem_cgroup_lru_info info;
147 
148 	/*
149 	 * protects reclaim-related members.
150 	 */
151 	spinlock_t reclaim_param_lock;
152 
153 	int	prev_priority;	/* for recording reclaim priority */
154 
155 	/*
156 	 * While reclaiming in a hierarchy, we cache the last child we
157 	 * reclaimed from. Protected by hierarchy_mutex
158 	 */
159 	struct mem_cgroup *last_scanned_child;
160 	/*
161 	 * Should the accounting and control be hierarchical, per subtree?
162 	 */
163 	bool use_hierarchy;
164 	unsigned long	last_oom_jiffies;
165 	atomic_t	refcnt;
166 
167 	unsigned int	swappiness;
168 
169 	/*
170 	 * statistics. This must be placed at the end of memcg.
171 	 */
172 	struct mem_cgroup_stat stat;
173 };
174 
175 enum charge_type {
176 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
177 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
178 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
179 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
180 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
181 	NR_CHARGE_TYPE,
182 };
183 
184 /* only for here (for easy reading.) */
185 #define PCGF_CACHE	(1UL << PCG_CACHE)
186 #define PCGF_USED	(1UL << PCG_USED)
187 #define PCGF_LOCK	(1UL << PCG_LOCK)
188 static const unsigned long
189 pcg_default_flags[NR_CHARGE_TYPE] = {
190 	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
191 	PCGF_USED | PCGF_LOCK, /* Anon */
192 	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
193 	0, /* FORCE */
194 };
195 
196 /* for encoding cft->private value on file */
197 #define _MEM			(0)
198 #define _MEMSWAP		(1)
199 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
200 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
201 #define MEMFILE_ATTR(val)	((val) & 0xffff)
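/*
 * Example: MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the counter type into
 * the upper 16 bits and the res_counter member id into the lower 16 bits,
 * so that MEMFILE_TYPE() yields _MEMSWAP and MEMFILE_ATTR() yields RES_LIMIT.
 */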
202 
203 static void mem_cgroup_get(struct mem_cgroup *mem);
204 static void mem_cgroup_put(struct mem_cgroup *mem);
205 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
206 
207 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
208 					 struct page_cgroup *pc,
209 					 bool charge)
210 {
211 	int val = (charge)? 1 : -1;
212 	struct mem_cgroup_stat *stat = &mem->stat;
213 	struct mem_cgroup_stat_cpu *cpustat;
214 	int cpu = get_cpu();
215 
216 	cpustat = &stat->cpustat[cpu];
217 	if (PageCgroupCache(pc))
218 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
219 	else
220 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
221 
222 	if (charge)
223 		__mem_cgroup_stat_add_safe(cpustat,
224 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
225 	else
226 		__mem_cgroup_stat_add_safe(cpustat,
227 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
228 	put_cpu();
229 }
230 
231 static struct mem_cgroup_per_zone *
232 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
233 {
234 	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
235 }
236 
237 static struct mem_cgroup_per_zone *
238 page_cgroup_zoneinfo(struct page_cgroup *pc)
239 {
240 	struct mem_cgroup *mem = pc->mem_cgroup;
241 	int nid = page_cgroup_nid(pc);
242 	int zid = page_cgroup_zid(pc);
243 
244 	if (!mem)
245 		return NULL;
246 
247 	return mem_cgroup_zoneinfo(mem, nid, zid);
248 }
249 
250 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
251 					enum lru_list idx)
252 {
253 	int nid, zid;
254 	struct mem_cgroup_per_zone *mz;
255 	u64 total = 0;
256 
257 	for_each_online_node(nid)
258 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
259 			mz = mem_cgroup_zoneinfo(mem, nid, zid);
260 			total += MEM_CGROUP_ZSTAT(mz, idx);
261 		}
262 	return total;
263 }
264 
265 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
266 {
267 	return container_of(cgroup_subsys_state(cont,
268 				mem_cgroup_subsys_id), struct mem_cgroup,
269 				css);
270 }
271 
272 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
273 {
274 	/*
275 	 * mm_update_next_owner() may clear mm->owner to NULL
276 	 * if it races with swapoff, page migration, etc.
277 	 * So this can be called with p == NULL.
278 	 */
279 	if (unlikely(!p))
280 		return NULL;
281 
282 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
283 				struct mem_cgroup, css);
284 }
285 
286 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
287 {
288 	struct mem_cgroup *mem = NULL;
289 	/*
290 	 * Because we have no locks, the mm->owner task may be moving to another
291 	 * cgroup. We use css_tryget() here even if this looks
292 	 * pessimistic (rather than adding locks here).
293 	 */
294 	rcu_read_lock();
295 	do {
296 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
297 		if (unlikely(!mem))
298 			break;
299 	} while (!css_tryget(&mem->css));
300 	rcu_read_unlock();
301 	return mem;
302 }
303 
304 static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
305 {
306 	if (!mem)
307 		return true;
308 	return css_is_removed(&mem->css);
309 }
310 
311 /*
312  * The following LRU functions are allowed to be used without PCG_LOCK.
313  * Operations are called by the global LRU routines independently of memcg.
314  * What we have to take care of here is the validity of pc->mem_cgroup.
315  *
316  * Changes to pc->mem_cgroup happen when
317  * 1. charge
318  * 2. moving account
319  * In the typical case, "charge" is done before add-to-lru. The exception is
320  * SwapCache, which is added to the LRU before being charged.
321  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
322  * When moving account, the page is not on the LRU; it is isolated.
323  */
324 
325 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
326 {
327 	struct page_cgroup *pc;
328 	struct mem_cgroup *mem;
329 	struct mem_cgroup_per_zone *mz;
330 
331 	if (mem_cgroup_disabled())
332 		return;
333 	pc = lookup_page_cgroup(page);
334 	/* can happen while we handle swapcache. */
335 	if (list_empty(&pc->lru) || !pc->mem_cgroup)
336 		return;
337 	/*
338 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
339 	 * removed from global LRU.
340 	 */
341 	mz = page_cgroup_zoneinfo(pc);
342 	mem = pc->mem_cgroup;
343 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
344 	list_del_init(&pc->lru);
345 	return;
346 }
347 
348 void mem_cgroup_del_lru(struct page *page)
349 {
350 	mem_cgroup_del_lru_list(page, page_lru(page));
351 }
352 
353 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
354 {
355 	struct mem_cgroup_per_zone *mz;
356 	struct page_cgroup *pc;
357 
358 	if (mem_cgroup_disabled())
359 		return;
360 
361 	pc = lookup_page_cgroup(page);
362 	/*
363 	 * The Used bit is set without atomic ops, but after smp_wmb().
364 	 * To make pc->mem_cgroup visible, insert an smp_rmb() here.
365 	 */
366 	smp_rmb();
367 	/* unused page is not rotated. */
368 	if (!PageCgroupUsed(pc))
369 		return;
370 	mz = page_cgroup_zoneinfo(pc);
371 	list_move(&pc->lru, &mz->lists[lru]);
372 }
373 
374 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
375 {
376 	struct page_cgroup *pc;
377 	struct mem_cgroup_per_zone *mz;
378 
379 	if (mem_cgroup_disabled())
380 		return;
381 	pc = lookup_page_cgroup(page);
382 	/*
383 	 * The Used bit is set without atomic ops, but after smp_wmb().
384 	 * To make pc->mem_cgroup visible, insert an smp_rmb() here.
385 	 */
386 	smp_rmb();
387 	if (!PageCgroupUsed(pc))
388 		return;
389 
390 	mz = page_cgroup_zoneinfo(pc);
391 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
392 	list_add(&pc->lru, &mz->lists[lru]);
393 }
394 
395 /*
396  * When handling SwapCache, pc->mem_cgroup may be changed while it is linked to
397  * the LRU, because the page may be reused after it is fully uncharged (because of
398  * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
399  * charging it again. This function is only used to charge SwapCache. It is done
400  * under lock_page() and it is expected that zone->lru_lock is never held.
401  */
402 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
403 {
404 	unsigned long flags;
405 	struct zone *zone = page_zone(page);
406 	struct page_cgroup *pc = lookup_page_cgroup(page);
407 
408 	spin_lock_irqsave(&zone->lru_lock, flags);
409 	/*
410 	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
411 	 * is guarded by lock_page() because the page is SwapCache.
412 	 */
413 	if (!PageCgroupUsed(pc))
414 		mem_cgroup_del_lru_list(page, page_lru(page));
415 	spin_unlock_irqrestore(&zone->lru_lock, flags);
416 }
417 
418 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
419 {
420 	unsigned long flags;
421 	struct zone *zone = page_zone(page);
422 	struct page_cgroup *pc = lookup_page_cgroup(page);
423 
424 	spin_lock_irqsave(&zone->lru_lock, flags);
425 	/* link when the page is linked to LRU but page_cgroup isn't */
426 	if (PageLRU(page) && list_empty(&pc->lru))
427 		mem_cgroup_add_lru_list(page, page_lru(page));
428 	spin_unlock_irqrestore(&zone->lru_lock, flags);
429 }
430 
431 
432 void mem_cgroup_move_lists(struct page *page,
433 			   enum lru_list from, enum lru_list to)
434 {
435 	if (mem_cgroup_disabled())
436 		return;
437 	mem_cgroup_del_lru_list(page, from);
438 	mem_cgroup_add_lru_list(page, to);
439 }
440 
441 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
442 {
443 	int ret;
444 
445 	task_lock(task);
446 	ret = task->mm && mm_match_cgroup(task->mm, mem);
447 	task_unlock(task);
448 	return ret;
449 }
450 
451 /*
452  * Calculate mapped_ratio under the memory controller. This will be used in
453  * vmscan.c for determining whether we have to reclaim mapped pages.
454  */
455 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
456 {
457 	long total, rss;
458 
459 	/*
460 	 * usage is recorded in bytes. But, here, we assume the number of
461 	 * physical pages can be represented by "long" on any arch.
462 	 */
463 	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
464 	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
465 	return (int)((rss * 100L) / total);
466 }
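/*
 * Worked example (assuming 4KB pages): with res.usage at 400MB, "total"
 * above is 102401 pages, and with 100MB of RSS (25600 pages) the returned
 * mapped ratio is (25600 * 100) / 102401 = 24 (integer division).
 */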
467 
468 /*
469  * prev_priority control...this will be used in memory reclaim path.
470  */
471 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
472 {
473 	int prev_priority;
474 
475 	spin_lock(&mem->reclaim_param_lock);
476 	prev_priority = mem->prev_priority;
477 	spin_unlock(&mem->reclaim_param_lock);
478 
479 	return prev_priority;
480 }
481 
482 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
483 {
484 	spin_lock(&mem->reclaim_param_lock);
485 	if (priority < mem->prev_priority)
486 		mem->prev_priority = priority;
487 	spin_unlock(&mem->reclaim_param_lock);
488 }
489 
490 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
491 {
492 	spin_lock(&mem->reclaim_param_lock);
493 	mem->prev_priority = priority;
494 	spin_unlock(&mem->reclaim_param_lock);
495 }
496 
497 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
498 {
499 	unsigned long active;
500 	unsigned long inactive;
501 	unsigned long gb;
502 	unsigned long inactive_ratio;
503 
504 	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
505 	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
506 
507 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 	if (gb)
509 		inactive_ratio = int_sqrt(10 * gb);
510 	else
511 		inactive_ratio = 1;
512 
513 	if (present_pages) {
514 		present_pages[0] = inactive;
515 		present_pages[1] = active;
516 	}
517 
518 	return inactive_ratio;
519 }
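/*
 * Worked example (assuming 4KB pages): with 4GB of anon pages on the
 * cgroup's LRUs, gb = 4 above and inactive_ratio = int_sqrt(40) = 6, so
 * mem_cgroup_inactive_anon_is_low() below reports "low" whenever
 * inactive * 6 < active.
 */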
520 
521 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
522 {
523 	unsigned long active;
524 	unsigned long inactive;
525 	unsigned long present_pages[2];
526 	unsigned long inactive_ratio;
527 
528 	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
529 
530 	inactive = present_pages[0];
531 	active = present_pages[1];
532 
533 	if (inactive * inactive_ratio < active)
534 		return 1;
535 
536 	return 0;
537 }
538 
539 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
540 				       struct zone *zone,
541 				       enum lru_list lru)
542 {
543 	int nid = zone->zone_pgdat->node_id;
544 	int zid = zone_idx(zone);
545 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
546 
547 	return MEM_CGROUP_ZSTAT(mz, lru);
548 }
549 
550 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
551 						      struct zone *zone)
552 {
553 	int nid = zone->zone_pgdat->node_id;
554 	int zid = zone_idx(zone);
555 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
556 
557 	return &mz->reclaim_stat;
558 }
559 
560 struct zone_reclaim_stat *
561 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
562 {
563 	struct page_cgroup *pc;
564 	struct mem_cgroup_per_zone *mz;
565 
566 	if (mem_cgroup_disabled())
567 		return NULL;
568 
569 	pc = lookup_page_cgroup(page);
570 	/*
571 	 * The Used bit is set without atomic ops, but after smp_wmb().
572 	 * To make pc->mem_cgroup visible, insert an smp_rmb() here.
573 	 */
574 	smp_rmb();
575 	if (!PageCgroupUsed(pc))
576 		return NULL;
577 
578 	mz = page_cgroup_zoneinfo(pc);
579 	if (!mz)
580 		return NULL;
581 
582 	return &mz->reclaim_stat;
583 }
584 
585 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
586 					struct list_head *dst,
587 					unsigned long *scanned, int order,
588 					int mode, struct zone *z,
589 					struct mem_cgroup *mem_cont,
590 					int active, int file)
591 {
592 	unsigned long nr_taken = 0;
593 	struct page *page;
594 	unsigned long scan;
595 	LIST_HEAD(pc_list);
596 	struct list_head *src;
597 	struct page_cgroup *pc, *tmp;
598 	int nid = z->zone_pgdat->node_id;
599 	int zid = zone_idx(z);
600 	struct mem_cgroup_per_zone *mz;
601 	int lru = LRU_FILE * !!file + !!active;
602 
603 	BUG_ON(!mem_cont);
604 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
605 	src = &mz->lists[lru];
606 
607 	scan = 0;
608 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
609 		if (scan >= nr_to_scan)
610 			break;
611 
612 		page = pc->page;
613 		if (unlikely(!PageCgroupUsed(pc)))
614 			continue;
615 		if (unlikely(!PageLRU(page)))
616 			continue;
617 
618 		scan++;
619 		if (__isolate_lru_page(page, mode, file) == 0) {
620 			list_move(&page->lru, dst);
621 			nr_taken++;
622 		}
623 	}
624 
625 	*scanned = scan;
626 	return nr_taken;
627 }
628 
629 #define mem_cgroup_from_res_counter(counter, member)	\
630 	container_of(counter, struct mem_cgroup, member)
631 
632 /*
633  * This routine finds the DFS walk successor. This routine should be
634  * called with hierarchy_mutex held
635  */
636 static struct mem_cgroup *
637 __mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638 {
639 	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
640 
641 	curr_cgroup = curr->css.cgroup;
642 	root_cgroup = root_mem->css.cgroup;
643 
644 	if (!list_empty(&curr_cgroup->children)) {
645 		/*
646 		 * Walk down to children
647 		 */
648 		cgroup = list_entry(curr_cgroup->children.next,
649 						struct cgroup, sibling);
650 		curr = mem_cgroup_from_cont(cgroup);
651 		goto done;
652 	}
653 
654 visit_parent:
655 	if (curr_cgroup == root_cgroup) {
656 		/* caller handles NULL case */
657 		curr = NULL;
658 		goto done;
659 	}
660 
661 	/*
662 	 * Goto next sibling
663 	 */
664 	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 						sibling);
667 		curr = mem_cgroup_from_cont(cgroup);
668 		goto done;
669 	}
670 
671 	/*
672 	 * Go up to next parent and next parent's sibling if need be
673 	 */
674 	curr_cgroup = curr_cgroup->parent;
675 	goto visit_parent;
676 
677 done:
678 	return curr;
679 }
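/*
 * Illustrative walk order (hypothetical hierarchy): for root A with
 * children B and C, where B itself has a child D, successive calls
 * starting at B visit D, then C, and finally return NULL once the walk
 * climbs back to A (the root), which the caller maps back to root_mem.
 */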
680 
681 /*
682  * Visit the first child (need not be the first child as per the ordering
683  * of the cgroup list, since we track last_scanned_child) of @mem and use
684  * that to reclaim free pages from.
685  */
686 static struct mem_cgroup *
687 mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688 {
689 	struct cgroup *cgroup;
690 	struct mem_cgroup *orig, *next;
691 	bool obsolete;
692 
693 	/*
694 	 * Scan all children under the mem_cgroup mem
695 	 */
696 	mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697 
698 	orig = root_mem->last_scanned_child;
699 	obsolete = mem_cgroup_is_obsolete(orig);
700 
701 	if (list_empty(&root_mem->css.cgroup->children)) {
702 		/*
703 		 * root_mem might have had children before, and last_scanned_child
704 		 * may point to one of them. We put it later.
705 		 */
706 		if (orig)
707 			VM_BUG_ON(!obsolete);
708 		next = NULL;
709 		goto done;
710 	}
711 
712 	if (!orig || obsolete) {
713 		cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 				struct cgroup, sibling);
715 		next = mem_cgroup_from_cont(cgroup);
716 	} else
717 		next = __mem_cgroup_get_next_node(orig, root_mem);
718 
719 done:
720 	if (next)
721 		mem_cgroup_get(next);
722 	root_mem->last_scanned_child = next;
723 	if (orig)
724 		mem_cgroup_put(orig);
725 	mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
726 	return (next) ? next : root_mem;
727 }
728 
729 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
730 {
731 	if (do_swap_account) {
732 		if (res_counter_check_under_limit(&mem->res) &&
733 			res_counter_check_under_limit(&mem->memsw))
734 			return true;
735 	} else
736 		if (res_counter_check_under_limit(&mem->res))
737 			return true;
738 	return false;
739 }
740 
741 static unsigned int get_swappiness(struct mem_cgroup *memcg)
742 {
743 	struct cgroup *cgrp = memcg->css.cgroup;
744 	unsigned int swappiness;
745 
746 	/* root ? */
747 	if (cgrp->parent == NULL)
748 		return vm_swappiness;
749 
750 	spin_lock(&memcg->reclaim_param_lock);
751 	swappiness = memcg->swappiness;
752 	spin_unlock(&memcg->reclaim_param_lock);
753 
754 	return swappiness;
755 }
756 
757 /*
758  * Dance down the hierarchy if needed to reclaim memory. We remember the
759  * last child we reclaimed from, so that we don't end up penalizing
760  * one child extensively based on its position in the children list.
761  *
762  * root_mem is the original ancestor that we've been reclaiming from.
763  */
764 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 						gfp_t gfp_mask, bool noswap)
766 {
767 	struct mem_cgroup *next_mem;
768 	int ret = 0;
769 
770 	/*
771 	 * Reclaim unconditionally and don't check for return value.
772 	 * We need to reclaim in the current group and down the tree.
773 	 * One might think about checking for children before reclaiming,
774 	 * but there might be left over accounting, even after children
775 	 * have left.
776 	 */
777 	ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
778 					   get_swappiness(root_mem));
779 	if (mem_cgroup_check_under_limit(root_mem))
780 		return 1;	/* indicate reclaim has succeeded */
781 	if (!root_mem->use_hierarchy)
782 		return ret;
783 
784 	next_mem = mem_cgroup_get_next_node(root_mem);
785 
786 	while (next_mem != root_mem) {
787 		if (mem_cgroup_is_obsolete(next_mem)) {
788 			next_mem = mem_cgroup_get_next_node(root_mem);
789 			continue;
790 		}
791 		ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
792 						   get_swappiness(next_mem));
793 		if (mem_cgroup_check_under_limit(root_mem))
794 			return 1;	/* indicate reclaim has succeeded */
795 		next_mem = mem_cgroup_get_next_node(root_mem);
796 	}
797 	return ret;
798 }
799 
800 bool mem_cgroup_oom_called(struct task_struct *task)
801 {
802 	bool ret = false;
803 	struct mem_cgroup *mem;
804 	struct mm_struct *mm;
805 
806 	rcu_read_lock();
807 	mm = task->mm;
808 	if (!mm)
809 		mm = &init_mm;
810 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
811 	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
812 		ret = true;
813 	rcu_read_unlock();
814 	return ret;
815 }
816 /*
817  * Unlike the exported interface, an "oom" parameter is added. If oom == true,
818  * the oom-killer can be invoked.
819  */
820 static int __mem_cgroup_try_charge(struct mm_struct *mm,
821 			gfp_t gfp_mask, struct mem_cgroup **memcg,
822 			bool oom)
823 {
824 	struct mem_cgroup *mem, *mem_over_limit;
825 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
826 	struct res_counter *fail_res;
827 
828 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
829 		/* Don't account this! */
830 		*memcg = NULL;
831 		return 0;
832 	}
833 
834 	/*
835 	 * We always charge the cgroup the mm_struct belongs to.
836 	 * The mm_struct's mem_cgroup changes on task migration if the
837 	 * thread group leader migrates. It's possible that mm is not
838 	 * set, if so charge the init_mm (happens for pagecache usage).
839 	 */
840 	mem = *memcg;
841 	if (likely(!mem)) {
842 		mem = try_get_mem_cgroup_from_mm(mm);
843 		*memcg = mem;
844 	} else {
845 		css_get(&mem->css);
846 	}
847 	if (unlikely(!mem))
848 		return 0;
849 
850 	VM_BUG_ON(mem_cgroup_is_obsolete(mem));
851 
852 	while (1) {
853 		int ret;
854 		bool noswap = false;
855 
856 		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
857 		if (likely(!ret)) {
858 			if (!do_swap_account)
859 				break;
860 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
861 							&fail_res);
862 			if (likely(!ret))
863 				break;
864 			/* mem+swap counter fails */
865 			res_counter_uncharge(&mem->res, PAGE_SIZE);
866 			noswap = true;
867 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
868 									memsw);
869 		} else
870 			/* mem counter fails */
871 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
872 									res);
873 
874 		if (!(gfp_mask & __GFP_WAIT))
875 			goto nomem;
876 
877 		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 							noswap);
879 		if (ret)
880 			continue;
881 
882 		/*
883 		 * try_to_free_mem_cgroup_pages() might not give us a full
884 		 * picture of reclaim. Some pages are reclaimed and might be
885 		 * moved to swap cache or just unmapped from the cgroup.
886 		 * Check the limit again to see if the reclaim reduced the
887 		 * current usage of the cgroup before giving up
888 		 *
889 		 */
890 		if (mem_cgroup_check_under_limit(mem_over_limit))
891 			continue;
892 
893 		if (!nr_retries--) {
894 			if (oom) {
895 				mutex_lock(&memcg_tasklist);
896 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 				mutex_unlock(&memcg_tasklist);
898 				mem_over_limit->last_oom_jiffies = jiffies;
899 			}
900 			goto nomem;
901 		}
902 	}
903 	return 0;
904 nomem:
905 	css_put(&mem->css);
906 	return -ENOMEM;
907 }
908 
909 static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910 {
911 	struct mem_cgroup *mem;
912 	swp_entry_t ent;
913 
914 	if (!PageSwapCache(page))
915 		return NULL;
916 
917 	ent.val = page_private(page);
918 	mem = lookup_swap_cgroup(ent);
919 	if (!mem)
920 		return NULL;
921 	if (!css_tryget(&mem->css))
922 		return NULL;
923 	return mem;
924 }
925 
926 /*
927  * Commit a charge obtained by __mem_cgroup_try_charge() and make the page_cgroup
928  * enter the USED state. If it is already USED, uncharge and return.
929  */
930 
931 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
932 				     struct page_cgroup *pc,
933 				     enum charge_type ctype)
934 {
935 	/* try_charge() can return NULL in *memcg; handle that case here. */
936 	if (!mem)
937 		return;
938 
939 	lock_page_cgroup(pc);
940 	if (unlikely(PageCgroupUsed(pc))) {
941 		unlock_page_cgroup(pc);
942 		res_counter_uncharge(&mem->res, PAGE_SIZE);
943 		if (do_swap_account)
944 			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
945 		css_put(&mem->css);
946 		return;
947 	}
948 	pc->mem_cgroup = mem;
949 	smp_wmb();
950 	pc->flags = pcg_default_flags[ctype];
951 
952 	mem_cgroup_charge_statistics(mem, pc, true);
953 
954 	unlock_page_cgroup(pc);
955 }
956 
957 /**
958  * mem_cgroup_move_account - move account of the page
959  * @pc:	page_cgroup of the page.
960  * @from: mem_cgroup which the page is moved from.
961  * @to:	mem_cgroup which the page is moved to. @from != @to.
962  *
963  * The caller must confirm the following:
964  * - the page is not on the LRU (isolate_page() is useful.)
965  *
966  * returns 0 at success,
967  * returns -EBUSY when lock is busy or "pc" is unstable.
968  *
969  * This function does "uncharge" from the old cgroup but doesn't do "charge" to
970  * the new cgroup. That should be done by the caller.
971  */
972 
973 static int mem_cgroup_move_account(struct page_cgroup *pc,
974 	struct mem_cgroup *from, struct mem_cgroup *to)
975 {
976 	struct mem_cgroup_per_zone *from_mz, *to_mz;
977 	int nid, zid;
978 	int ret = -EBUSY;
979 
980 	VM_BUG_ON(from == to);
981 	VM_BUG_ON(PageLRU(pc->page));
982 
983 	nid = page_cgroup_nid(pc);
984 	zid = page_cgroup_zid(pc);
985 	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
986 	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
987 
988 	if (!trylock_page_cgroup(pc))
989 		return ret;
990 
991 	if (!PageCgroupUsed(pc))
992 		goto out;
993 
994 	if (pc->mem_cgroup != from)
995 		goto out;
996 
997 	res_counter_uncharge(&from->res, PAGE_SIZE);
998 	mem_cgroup_charge_statistics(from, pc, false);
999 	if (do_swap_account)
1000 		res_counter_uncharge(&from->memsw, PAGE_SIZE);
1001 	css_put(&from->css);
1002 
1003 	css_get(&to->css);
1004 	pc->mem_cgroup = to;
1005 	mem_cgroup_charge_statistics(to, pc, true);
1006 	ret = 0;
1007 out:
1008 	unlock_page_cgroup(pc);
1009 	return ret;
1010 }
1011 
1012 /*
1013  * move charges to its parent.
1014  */
1015 
1016 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1017 				  struct mem_cgroup *child,
1018 				  gfp_t gfp_mask)
1019 {
1020 	struct page *page = pc->page;
1021 	struct cgroup *cg = child->css.cgroup;
1022 	struct cgroup *pcg = cg->parent;
1023 	struct mem_cgroup *parent;
1024 	int ret;
1025 
1026 	/* Is ROOT ? */
1027 	if (!pcg)
1028 		return -EINVAL;
1029 
1030 
1031 	parent = mem_cgroup_from_cont(pcg);
1032 
1033 
1034 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1035 	if (ret || !parent)
1036 		return ret;
1037 
1038 	if (!get_page_unless_zero(page)) {
1039 		ret = -EBUSY;
1040 		goto uncharge;
1041 	}
1042 
1043 	ret = isolate_lru_page(page);
1044 
1045 	if (ret)
1046 		goto cancel;
1047 
1048 	ret = mem_cgroup_move_account(pc, child, parent);
1049 
1050 	putback_lru_page(page);
1051 	if (!ret) {
1052 		put_page(page);
1053 		/* drop extra refcnt by try_charge() */
1054 		css_put(&parent->css);
1055 		return 0;
1056 	}
1057 
1058 cancel:
1059 	put_page(page);
1060 uncharge:
1061 	/* drop extra refcnt by try_charge() */
1062 	css_put(&parent->css);
1063 	/* uncharge if move fails */
1064 	res_counter_uncharge(&parent->res, PAGE_SIZE);
1065 	if (do_swap_account)
1066 		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1067 	return ret;
1068 }
1069 
1070 /*
1071  * Charge the memory controller for page usage.
1072  * Return
1073  * 0 if the charge was successful
1074  * < 0 if the cgroup is over its limit
1075  */
1076 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1077 				gfp_t gfp_mask, enum charge_type ctype,
1078 				struct mem_cgroup *memcg)
1079 {
1080 	struct mem_cgroup *mem;
1081 	struct page_cgroup *pc;
1082 	int ret;
1083 
1084 	pc = lookup_page_cgroup(page);
1085 	/* can happen at boot */
1086 	if (unlikely(!pc))
1087 		return 0;
1088 	prefetchw(pc);
1089 
1090 	mem = memcg;
1091 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1092 	if (ret || !mem)
1093 		return ret;
1094 
1095 	__mem_cgroup_commit_charge(mem, pc, ctype);
1096 	return 0;
1097 }
1098 
1099 int mem_cgroup_newpage_charge(struct page *page,
1100 			      struct mm_struct *mm, gfp_t gfp_mask)
1101 {
1102 	if (mem_cgroup_disabled())
1103 		return 0;
1104 	if (PageCompound(page))
1105 		return 0;
1106 	/*
1107 	 * If already mapped, we don't have to account.
1108 	 * If it is page cache, page->mapping points to an address_space.
1109 	 * But page->mapping may hold a stale anon_vma pointer;
1110 	 * detect that with a PageAnon() check. A newly-mapped anon page's
1111 	 * page->mapping is NULL.
1112 	 */
1113 	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1114 		return 0;
1115 	if (unlikely(!mm))
1116 		mm = &init_mm;
1117 	return mem_cgroup_charge_common(page, mm, gfp_mask,
1118 				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1119 }
1120 
1121 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1122 				gfp_t gfp_mask)
1123 {
1124 	struct mem_cgroup *mem = NULL;
1125 	int ret;
1126 
1127 	if (mem_cgroup_disabled())
1128 		return 0;
1129 	if (PageCompound(page))
1130 		return 0;
1131 	/*
1132 	 * Corner case handling. This is usually called from add_to_page_cache(),
1133 	 * but some filesystems (shmem) precharge the page before calling it
1134 	 * and then call add_to_page_cache() with GFP_NOWAIT.
1135 	 *
1136 	 * For the GFP_NOWAIT case, the page may be pre-charged before calling
1137 	 * add_to_page_cache(). (See shmem.c.) Check it here and avoid charging
1138 	 * twice. (It works, but has to pay a somewhat larger cost.)
1139 	 * And when the page is SwapCache, it should take swap information
1140 	 * into account. This is under lock_page() now.
1141 	 */
1142 	if (!(gfp_mask & __GFP_WAIT)) {
1143 		struct page_cgroup *pc;
1144 
1145 
1146 		pc = lookup_page_cgroup(page);
1147 		if (!pc)
1148 			return 0;
1149 		lock_page_cgroup(pc);
1150 		if (PageCgroupUsed(pc)) {
1151 			unlock_page_cgroup(pc);
1152 			return 0;
1153 		}
1154 		unlock_page_cgroup(pc);
1155 	}
1156 
1157 	if (do_swap_account && PageSwapCache(page)) {
1158 		mem = try_get_mem_cgroup_from_swapcache(page);
1159 		if (mem)
1160 			mm = NULL;
1161 		else
1162 			mem = NULL;
1163 		/* SwapCache may be still linked to LRU now. */
1164 		mem_cgroup_lru_del_before_commit_swapcache(page);
1165 	}
1166 
1167 	if (unlikely(!mm && !mem))
1168 		mm = &init_mm;
1169 
1170 	if (page_is_file_cache(page))
1171 		return mem_cgroup_charge_common(page, mm, gfp_mask,
1172 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1173 
1174 	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1175 				MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1176 	if (mem)
1177 		css_put(&mem->css);
1178 	if (PageSwapCache(page))
1179 		mem_cgroup_lru_add_after_commit_swapcache(page);
1180 
1181 	if (do_swap_account && !ret && PageSwapCache(page)) {
1182 		swp_entry_t ent = {.val = page_private(page)};
1183 		/* avoid double counting */
1184 		mem = swap_cgroup_record(ent, NULL);
1185 		if (mem) {
1186 			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 			mem_cgroup_put(mem);
1188 		}
1189 	}
1190 	return ret;
1191 }
1192 
1193 /*
1194  * During swap-in (try_charge -> commit or cancel), the page is locked.
1195  * And when try_charge() successfully returns, one refcnt to the memcg, without
1196  * a struct page_cgroup, is acquired. This refcnt will be consumed by
1197  * "commit()" or removed by "cancel()" (see the call-sequence sketch below).
1198  */
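/*
 * Call-sequence sketch for the swap-in path (illustrative only; the real
 * caller is do_swap_page() in mm/memory.c, error handling omitted):
 *
 *	struct mem_cgroup *ptr = NULL;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		goto out;				(charge failed)
 *	... lock checks, map the pte ...
 *	mem_cgroup_commit_charge_swapin(page, ptr);	(on success)
 *	mem_cgroup_cancel_charge_swapin(ptr);		(if the pte changed)
 */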
1199 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1200 				 struct page *page,
1201 				 gfp_t mask, struct mem_cgroup **ptr)
1202 {
1203 	struct mem_cgroup *mem;
1204 	int ret;
1205 
1206 	if (mem_cgroup_disabled())
1207 		return 0;
1208 
1209 	if (!do_swap_account)
1210 		goto charge_cur_mm;
1211 	/*
1212 	 * A racing thread's fault, or swapoff, may have already updated
1213 	 * the pte, and even removed page from swap cache: return success
1214 	 * to go on to do_swap_page()'s pte_same() test, which should fail.
1215 	 */
1216 	if (!PageSwapCache(page))
1217 		return 0;
1218 	mem = try_get_mem_cgroup_from_swapcache(page);
1219 	if (!mem)
1220 		goto charge_cur_mm;
1221 	*ptr = mem;
1222 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1223 	/* drop extra refcnt from tryget */
1224 	css_put(&mem->css);
1225 	return ret;
1226 charge_cur_mm:
1227 	if (unlikely(!mm))
1228 		mm = &init_mm;
1229 	return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230 }
1231 
1232 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1233 {
1234 	struct page_cgroup *pc;
1235 
1236 	if (mem_cgroup_disabled())
1237 		return;
1238 	if (!ptr)
1239 		return;
1240 	pc = lookup_page_cgroup(page);
1241 	mem_cgroup_lru_del_before_commit_swapcache(page);
1242 	__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1243 	mem_cgroup_lru_add_after_commit_swapcache(page);
1244 	/*
1245 	 * Now the swap entry is back in memory. This means this page may be
1246 	 * counted both as mem and swap, i.e. double counted.
1247 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
1248 	 * under lock_page(), but reuse_swap_page() called from do_swap_page()
1249 	 * in memory.c may call delete_from_swap_cache() before we reach here.
1250 	 */
1251 	if (do_swap_account && PageSwapCache(page)) {
1252 		swp_entry_t ent = {.val = page_private(page)};
1253 		struct mem_cgroup *memcg;
1254 		memcg = swap_cgroup_record(ent, NULL);
1255 		if (memcg) {
1256 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 			mem_cgroup_put(memcg);
1258 		}
1259 
1260 	}
1261 	/* add this page(page_cgroup) to the LRU we want. */
1262 
1263 }
1264 
1265 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266 {
1267 	if (mem_cgroup_disabled())
1268 		return;
1269 	if (!mem)
1270 		return;
1271 	res_counter_uncharge(&mem->res, PAGE_SIZE);
1272 	if (do_swap_account)
1273 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1274 	css_put(&mem->css);
1275 }
1276 
1277 
1278 /*
1279  * uncharge if !page_mapped(page)
1280  */
1281 static struct mem_cgroup *
1282 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1283 {
1284 	struct page_cgroup *pc;
1285 	struct mem_cgroup *mem = NULL;
1286 	struct mem_cgroup_per_zone *mz;
1287 
1288 	if (mem_cgroup_disabled())
1289 		return NULL;
1290 
1291 	if (PageSwapCache(page))
1292 		return NULL;
1293 
1294 	/*
1295 	 * Check if our page_cgroup is valid
1296 	 */
1297 	pc = lookup_page_cgroup(page);
1298 	if (unlikely(!pc || !PageCgroupUsed(pc)))
1299 		return NULL;
1300 
1301 	lock_page_cgroup(pc);
1302 
1303 	mem = pc->mem_cgroup;
1304 
1305 	if (!PageCgroupUsed(pc))
1306 		goto unlock_out;
1307 
1308 	switch (ctype) {
1309 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1310 		if (page_mapped(page))
1311 			goto unlock_out;
1312 		break;
1313 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1314 		if (!PageAnon(page)) {	/* Shared memory */
1315 			if (page->mapping && !page_is_file_cache(page))
1316 				goto unlock_out;
1317 		} else if (page_mapped(page)) /* Anon */
1318 				goto unlock_out;
1319 		break;
1320 	default:
1321 		break;
1322 	}
1323 
1324 	res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327 
1328 	mem_cgroup_charge_statistics(mem, pc, false);
1329 	ClearPageCgroupUsed(pc);
1330 	/*
1331 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
1332 	 * freed from the LRU. This is safe because an uncharged page is expected not
1333 	 * to be reused (it is freed soon). The exception is SwapCache, handled by
1334 	 * special functions.
1335 	 */
1336 
1337 	mz = page_cgroup_zoneinfo(pc);
1338 	unlock_page_cgroup(pc);
1339 
1340 	/* at swapout, this memcg will be accessed to record to swap */
1341 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1342 		css_put(&mem->css);
1343 
1344 	return mem;
1345 
1346 unlock_out:
1347 	unlock_page_cgroup(pc);
1348 	return NULL;
1349 }
1350 
1351 void mem_cgroup_uncharge_page(struct page *page)
1352 {
1353 	/* early check. */
1354 	if (page_mapped(page))
1355 		return;
1356 	if (page->mapping && !PageAnon(page))
1357 		return;
1358 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1359 }
1360 
1361 void mem_cgroup_uncharge_cache_page(struct page *page)
1362 {
1363 	VM_BUG_ON(page_mapped(page));
1364 	VM_BUG_ON(page->mapping);
1365 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1366 }
1367 
1368 /*
1369  * Called from __delete_from_swap_cache() to drop the "page" account.
1370  * The memcg information is recorded in the swap_cgroup of "ent".
1371  */
1372 void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1373 {
1374 	struct mem_cgroup *memcg;
1375 
1376 	memcg = __mem_cgroup_uncharge_common(page,
1377 					MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 	/* record memcg information */
1379 	if (do_swap_account && memcg) {
1380 		swap_cgroup_record(ent, memcg);
1381 		mem_cgroup_get(memcg);
1382 	}
1383 	if (memcg)
1384 		css_put(&memcg->css);
1385 }
1386 
1387 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1388 /*
1389  * Called from swap_entry_free(). Removes the record in swap_cgroup and
1390  * uncharges the "memsw" account.
1391  */
1392 void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393 {
1394 	struct mem_cgroup *memcg;
1395 
1396 	if (!do_swap_account)
1397 		return;
1398 
1399 	memcg = swap_cgroup_record(ent, NULL);
1400 	if (memcg) {
1401 		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 		mem_cgroup_put(memcg);
1403 	}
1404 }
1405 #endif
1406 
1407 /*
1408  * Before starting migration, account PAGE_SIZE to the mem_cgroup that the old
1409  * page belongs to.
1410  */
1411 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1412 {
1413 	struct page_cgroup *pc;
1414 	struct mem_cgroup *mem = NULL;
1415 	int ret = 0;
1416 
1417 	if (mem_cgroup_disabled())
1418 		return 0;
1419 
1420 	pc = lookup_page_cgroup(page);
1421 	lock_page_cgroup(pc);
1422 	if (PageCgroupUsed(pc)) {
1423 		mem = pc->mem_cgroup;
1424 		css_get(&mem->css);
1425 	}
1426 	unlock_page_cgroup(pc);
1427 
1428 	if (mem) {
1429 		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
1430 		css_put(&mem->css);
1431 	}
1432 	*ptr = mem;
1433 	return ret;
1434 }
1435 
1436 /* remove redundant charge if migration failed */
1437 void mem_cgroup_end_migration(struct mem_cgroup *mem,
1438 		struct page *oldpage, struct page *newpage)
1439 {
1440 	struct page *target, *unused;
1441 	struct page_cgroup *pc;
1442 	enum charge_type ctype;
1443 
1444 	if (!mem)
1445 		return;
1446 
1447 	/* at migration success, oldpage->mapping is NULL. */
1448 	if (oldpage->mapping) {
1449 		target = oldpage;
1450 		unused = NULL;
1451 	} else {
1452 		target = newpage;
1453 		unused = oldpage;
1454 	}
1455 
1456 	if (PageAnon(target))
1457 		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1458 	else if (page_is_file_cache(target))
1459 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1460 	else
1461 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1462 
1463 	/* unused page is not on radix-tree now. */
1464 	if (unused)
1465 		__mem_cgroup_uncharge_common(unused, ctype);
1466 
1467 	pc = lookup_page_cgroup(target);
1468 	/*
1469 	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
1470 	 * So, double-counting is effectively avoided.
1471 	 */
1472 	__mem_cgroup_commit_charge(mem, pc, ctype);
1473 
1474 	/*
1475 	 * Both of oldpage and newpage are still under lock_page().
1476 	 * Then, we don't have to care about race in radix-tree.
1477 	 * But we have to be careful that this page is unmapped or not.
1478 	 *
1479 	 * There is a case for !page_mapped(). At the start of
1480 	 * migration, oldpage was mapped. But now, it's zapped.
1481 	 * But we know *target* page is not freed/reused under us.
1482 	 * mem_cgroup_uncharge_page() does all necessary checks.
1483 	 */
1484 	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1485 		mem_cgroup_uncharge_page(target);
1486 }
1487 
1488 /*
1489  * A call to try to shrink memory usage under the specified resource controller.
1490  * This is typically used for page reclaiming of shmem, to reduce the side
1491  * effect of page allocation from shmem, which is used by some mem_cgroup.
1492  */
1493 int mem_cgroup_shrink_usage(struct page *page,
1494 			    struct mm_struct *mm,
1495 			    gfp_t gfp_mask)
1496 {
1497 	struct mem_cgroup *mem = NULL;
1498 	int progress = 0;
1499 	int retry = MEM_CGROUP_RECLAIM_RETRIES;
1500 
1501 	if (mem_cgroup_disabled())
1502 		return 0;
1503 	if (page)
1504 		mem = try_get_mem_cgroup_from_swapcache(page);
1505 	if (!mem && mm)
1506 		mem = try_get_mem_cgroup_from_mm(mm);
1507 	if (unlikely(!mem))
1508 		return 0;
1509 
1510 	do {
1511 		progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
1512 		progress += mem_cgroup_check_under_limit(mem);
1513 	} while (!progress && --retry);
1514 
1515 	css_put(&mem->css);
1516 	if (!retry)
1517 		return -ENOMEM;
1518 	return 0;
1519 }
1520 
1521 static DEFINE_MUTEX(set_limit_mutex);
1522 
1523 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 				unsigned long long val)
1525 {
1526 
1527 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1528 	int progress;
1529 	u64 memswlimit;
1530 	int ret = 0;
1531 
1532 	while (retry_count) {
1533 		if (signal_pending(current)) {
1534 			ret = -EINTR;
1535 			break;
1536 		}
1537 		/*
1538 		 * Rather than hide all this in some function, do it here in an
1539 		 * open-coded manner so you can see what it really does.
1540 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1541 		 */
1542 		mutex_lock(&set_limit_mutex);
1543 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1544 		if (memswlimit < val) {
1545 			ret = -EINVAL;
1546 			mutex_unlock(&set_limit_mutex);
1547 			break;
1548 		}
1549 		ret = res_counter_set_limit(&memcg->res, val);
1550 		mutex_unlock(&set_limit_mutex);
1551 
1552 		if (!ret)
1553 			break;
1554 
1555 		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 							   false);
1557 		if (!progress)
			retry_count--;
1558 	}
1559 
1560 	return ret;
1561 }
1562 
1563 int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 				unsigned long long val)
1565 {
1566 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1567 	u64 memlimit, oldusage, curusage;
1568 	int ret;
1569 
1570 	if (!do_swap_account)
1571 		return -EINVAL;
1572 
1573 	while (retry_count) {
1574 		if (signal_pending(current)) {
1575 			ret = -EINTR;
1576 			break;
1577 		}
1578 		/*
1579 		 * Rather than hide all this in some function, do it here in an
1580 		 * open-coded manner so you can see what it really does.
1581 		 * We have to guarantee mem->res.limit < mem->memsw.limit.
1582 		 */
1583 		mutex_lock(&set_limit_mutex);
1584 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1585 		if (memlimit > val) {
1586 			ret = -EINVAL;
1587 			mutex_unlock(&set_limit_mutex);
1588 			break;
1589 		}
1590 		ret = res_counter_set_limit(&memcg->memsw, val);
1591 		mutex_unlock(&set_limit_mutex);
1592 
1593 		if (!ret)
1594 			break;
1595 
1596 		oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1597 		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1599 		if (curusage >= oldusage)
1600 			retry_count--;
1601 	}
1602 	return ret;
1603 }
1604 
1605 /*
1606  * This routine traverses the page_cgroups in the given list and drops them all.
1607  * *And* this routine doesn't reclaim the pages themselves; it just removes the page_cgroups.
1608  */
1609 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1610 				int node, int zid, enum lru_list lru)
1611 {
1612 	struct zone *zone;
1613 	struct mem_cgroup_per_zone *mz;
1614 	struct page_cgroup *pc, *busy;
1615 	unsigned long flags, loop;
1616 	struct list_head *list;
1617 	int ret = 0;
1618 
1619 	zone = &NODE_DATA(node)->node_zones[zid];
1620 	mz = mem_cgroup_zoneinfo(mem, node, zid);
1621 	list = &mz->lists[lru];
1622 
1623 	loop = MEM_CGROUP_ZSTAT(mz, lru);
1624 	/* give some margin against EBUSY etc...*/
1625 	loop += 256;
1626 	busy = NULL;
1627 	while (loop--) {
1628 		ret = 0;
1629 		spin_lock_irqsave(&zone->lru_lock, flags);
1630 		if (list_empty(list)) {
1631 			spin_unlock_irqrestore(&zone->lru_lock, flags);
1632 			break;
1633 		}
1634 		pc = list_entry(list->prev, struct page_cgroup, lru);
1635 		if (busy == pc) {
1636 			list_move(&pc->lru, list);
1637 			busy = NULL;
1638 			spin_unlock_irqrestore(&zone->lru_lock, flags);
1639 			continue;
1640 		}
1641 		spin_unlock_irqrestore(&zone->lru_lock, flags);
1642 
1643 		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1644 		if (ret == -ENOMEM)
1645 			break;
1646 
1647 		if (ret == -EBUSY || ret == -EINVAL) {
1648 			/* found lock contention or "pc" is obsolete. */
1649 			busy = pc;
1650 			cond_resched();
1651 		} else
1652 			busy = NULL;
1653 	}
1654 
1655 	if (!ret && !list_empty(list))
1656 		return -EBUSY;
1657 	return ret;
1658 }
1659 
1660 /*
1661  * Make the mem_cgroup's charge 0 if there is no task.
1662  * This enables deleting this mem_cgroup.
1663  */
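/*
 * The work is done in two phases: the move_account loop walks every
 * node/zone/LRU list and moves charges to the parent via
 * mem_cgroup_move_parent(); if that runs out of memory (or free_all was
 * requested), try_to_free reclaims pages directly with
 * try_to_free_mem_cgroup_pages() until the usage drops to zero.
 */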
1664 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1665 {
1666 	int ret;
1667 	int node, zid, shrink;
1668 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1669 	struct cgroup *cgrp = mem->css.cgroup;
1670 
1671 	css_get(&mem->css);
1672 
1673 	shrink = 0;
1674 	/* should free all ? */
1675 	if (free_all)
1676 		goto try_to_free;
1677 move_account:
1678 	while (mem->res.usage > 0) {
1679 		ret = -EBUSY;
1680 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1681 			goto out;
1682 		ret = -EINTR;
1683 		if (signal_pending(current))
1684 			goto out;
1685 		/* This is for making all *used* pages to be on LRU. */
1686 		lru_add_drain_all();
1687 		ret = 0;
1688 		for_each_node_state(node, N_HIGH_MEMORY) {
1689 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1690 				enum lru_list l;
1691 				for_each_lru(l) {
1692 					ret = mem_cgroup_force_empty_list(mem,
1693 							node, zid, l);
1694 					if (ret)
1695 						break;
1696 				}
1697 			}
1698 			if (ret)
1699 				break;
1700 		}
1701 		/* it seems parent cgroup doesn't have enough mem */
1702 		if (ret == -ENOMEM)
1703 			goto try_to_free;
1704 		cond_resched();
1705 	}
1706 	ret = 0;
1707 out:
1708 	css_put(&mem->css);
1709 	return ret;
1710 
1711 try_to_free:
1712 	/* returns EBUSY if there is a task or if we come here twice. */
1713 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1714 		ret = -EBUSY;
1715 		goto out;
1716 	}
1717 	/* we call try-to-free pages to make this cgroup empty */
1718 	lru_add_drain_all();
1719 	/* try to free all pages in this cgroup */
1720 	shrink = 1;
1721 	while (nr_retries && mem->res.usage > 0) {
1722 		int progress;
1723 
1724 		if (signal_pending(current)) {
1725 			ret = -EINTR;
1726 			goto out;
1727 		}
1728 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1729 						false, get_swappiness(mem));
1730 		if (!progress) {
1731 			nr_retries--;
1732 			/* maybe some writeback is necessary */
1733 			congestion_wait(WRITE, HZ/10);
1734 		}
1735 
1736 	}
1737 	lru_add_drain();
1738 	/* try move_account...there may be some *locked* pages. */
1739 	if (mem->res.usage)
1740 		goto move_account;
1741 	ret = 0;
1742 	goto out;
1743 }
1744 
1745 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1746 {
1747 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1748 }
1749 
1750 
1751 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1752 {
1753 	return mem_cgroup_from_cont(cont)->use_hierarchy;
1754 }
1755 
1756 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1757 					u64 val)
1758 {
1759 	int retval = 0;
1760 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1761 	struct cgroup *parent = cont->parent;
1762 	struct mem_cgroup *parent_mem = NULL;
1763 
1764 	if (parent)
1765 		parent_mem = mem_cgroup_from_cont(parent);
1766 
1767 	cgroup_lock();
1768 	/*
1769 	 * If the parent's use_hierarchy is set, we can't make any modifications
1770 	 * in the child subtrees. If it is unset, then the change can
1771 	 * occur, provided the current cgroup has no children.
1772 	 *
1773 	 * For the root cgroup, parent_mem is NULL; we allow the value to be
1774 	 * set if there are no children.
1775 	 */
1776 	if ((!parent_mem || !parent_mem->use_hierarchy) &&
1777 				(val == 1 || val == 0)) {
1778 		if (list_empty(&cont->children))
1779 			mem->use_hierarchy = val;
1780 		else
1781 			retval = -EBUSY;
1782 	} else
1783 		retval = -EINVAL;
1784 	cgroup_unlock();
1785 
1786 	return retval;
1787 }
1788 
1789 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1790 {
1791 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1792 	u64 val = 0;
1793 	int type, name;
1794 
1795 	type = MEMFILE_TYPE(cft->private);
1796 	name = MEMFILE_ATTR(cft->private);
1797 	switch (type) {
1798 	case _MEM:
1799 		val = res_counter_read_u64(&mem->res, name);
1800 		break;
1801 	case _MEMSWAP:
1802 		if (do_swap_account)
1803 			val = res_counter_read_u64(&mem->memsw, name);
1804 		break;
1805 	default:
1806 		BUG();
1807 		break;
1808 	}
1809 	return val;
1810 }
1811 /*
1812  * The user of this function is...
1813  * RES_LIMIT.
1814  */
1815 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1816 			    const char *buffer)
1817 {
1818 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1819 	int type, name;
1820 	unsigned long long val;
1821 	int ret;
1822 
1823 	type = MEMFILE_TYPE(cft->private);
1824 	name = MEMFILE_ATTR(cft->private);
1825 	switch (name) {
1826 	case RES_LIMIT:
1827 		/* This function does all the necessary parsing; reuse it */
1828 		ret = res_counter_memparse_write_strategy(buffer, &val);
1829 		if (ret)
1830 			break;
1831 		if (type == _MEM)
1832 			ret = mem_cgroup_resize_limit(memcg, val);
1833 		else
1834 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
1835 		break;
1836 	default:
1837 		ret = -EINVAL; /* should be BUG() ? */
1838 		break;
1839 	}
1840 	return ret;
1841 }
1842 
1843 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
1844 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
1845 {
1846 	struct cgroup *cgroup;
1847 	unsigned long long min_limit, min_memsw_limit, tmp;
1848 
1849 	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1850 	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1851 	cgroup = memcg->css.cgroup;
1852 	if (!memcg->use_hierarchy)
1853 		goto out;
1854 
1855 	while (cgroup->parent) {
1856 		cgroup = cgroup->parent;
1857 		memcg = mem_cgroup_from_cont(cgroup);
1858 		if (!memcg->use_hierarchy)
1859 			break;
1860 		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
1861 		min_limit = min(min_limit, tmp);
1862 		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1863 		min_memsw_limit = min(min_memsw_limit, tmp);
1864 	}
1865 out:
1866 	*mem_limit = min_limit;
1867 	*memsw_limit = min_memsw_limit;
1868 	return;
1869 }
1870 
1871 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1872 {
1873 	struct mem_cgroup *mem;
1874 	int type, name;
1875 
1876 	mem = mem_cgroup_from_cont(cont);
1877 	type = MEMFILE_TYPE(event);
1878 	name = MEMFILE_ATTR(event);
1879 	switch (name) {
1880 	case RES_MAX_USAGE:
1881 		if (type == _MEM)
1882 			res_counter_reset_max(&mem->res);
1883 		else
1884 			res_counter_reset_max(&mem->memsw);
1885 		break;
1886 	case RES_FAILCNT:
1887 		if (type == _MEM)
1888 			res_counter_reset_failcnt(&mem->res);
1889 		else
1890 			res_counter_reset_failcnt(&mem->memsw);
1891 		break;
1892 	}
1893 	return 0;
1894 }
1895 
1896 static const struct mem_cgroup_stat_desc {
1897 	const char *msg;
1898 	u64 unit;
1899 } mem_cgroup_stat_desc[] = {
1900 	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1901 	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1902 	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1903 	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1904 };
1905 
1906 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1907 				 struct cgroup_map_cb *cb)
1908 {
1909 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1910 	struct mem_cgroup_stat *stat = &mem_cont->stat;
1911 	int i;
1912 
1913 	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1914 		s64 val;
1915 
1916 		val = mem_cgroup_read_stat(stat, i);
1917 		val *= mem_cgroup_stat_desc[i].unit;
1918 		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1919 	}
1920 	/* showing # of active pages */
1921 	{
1922 		unsigned long active_anon, inactive_anon;
1923 		unsigned long active_file, inactive_file;
1924 		unsigned long unevictable;
1925 
1926 		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1927 						LRU_INACTIVE_ANON);
1928 		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1929 						LRU_ACTIVE_ANON);
1930 		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1931 						LRU_INACTIVE_FILE);
1932 		active_file = mem_cgroup_get_all_zonestat(mem_cont,
1933 						LRU_ACTIVE_FILE);
1934 		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1935 							LRU_UNEVICTABLE);
1936 
1937 		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1938 		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1939 		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1940 		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1941 		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1942 
1943 	}
1944 	{
1945 		unsigned long long limit, memsw_limit;
1946 		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
1947 		cb->fill(cb, "hierarchical_memory_limit", limit);
1948 		if (do_swap_account)
1949 			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 	}
1951 
1952 #ifdef CONFIG_DEBUG_VM
1953 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954 
1955 	{
1956 		int nid, zid;
1957 		struct mem_cgroup_per_zone *mz;
1958 		unsigned long recent_rotated[2] = {0, 0};
1959 		unsigned long recent_scanned[2] = {0, 0};
1960 
1961 		for_each_online_node(nid)
1962 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1963 				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1964 
1965 				recent_rotated[0] +=
1966 					mz->reclaim_stat.recent_rotated[0];
1967 				recent_rotated[1] +=
1968 					mz->reclaim_stat.recent_rotated[1];
1969 				recent_scanned[0] +=
1970 					mz->reclaim_stat.recent_scanned[0];
1971 				recent_scanned[1] +=
1972 					mz->reclaim_stat.recent_scanned[1];
1973 			}
1974 		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
1975 		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
1976 		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
1977 		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
1978 	}
1979 #endif
1980 
1981 	return 0;
1982 }
1983 
1984 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
1985 {
1986 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1987 
1988 	return get_swappiness(memcg);
1989 }
1990 
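/*
 * memory.swappiness can only be set on a non-root group that is not
 * part of an active hierarchy (see the check below); values above 100
 * are rejected.
 */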
1991 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1992 				       u64 val)
1993 {
1994 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1995 	struct mem_cgroup *parent;
1996 
1997 	if (val > 100)
1998 		return -EINVAL;
1999 
2000 	if (cgrp->parent == NULL)
2001 		return -EINVAL;
2002 
2003 	parent = mem_cgroup_from_cont(cgrp->parent);
2004 
2005 	cgroup_lock();
2006 
2007 	/* Under hierarchy, only a root with no children may set this value */
2008 	if ((parent->use_hierarchy) ||
2009 	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2010 		cgroup_unlock();
2011 		return -EINVAL;
2012 	}
2013 
2014 	spin_lock(&memcg->reclaim_param_lock);
2015 	memcg->swappiness = val;
2016 	spin_unlock(&memcg->reclaim_param_lock);
2017 
2018 	cgroup_unlock();
2019 
2020 	return 0;
2021 }
2022 
2023 
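/*
 * Control files created for every memory cgroup.  The memsw.* files in
 * memsw_cgroup_files[] below are added separately when swap accounting
 * is enabled.
 */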
2024 static struct cftype mem_cgroup_files[] = {
2025 	{
2026 		.name = "usage_in_bytes",
2027 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2028 		.read_u64 = mem_cgroup_read,
2029 	},
2030 	{
2031 		.name = "max_usage_in_bytes",
2032 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2033 		.trigger = mem_cgroup_reset,
2034 		.read_u64 = mem_cgroup_read,
2035 	},
2036 	{
2037 		.name = "limit_in_bytes",
2038 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2039 		.write_string = mem_cgroup_write,
2040 		.read_u64 = mem_cgroup_read,
2041 	},
2042 	{
2043 		.name = "failcnt",
2044 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2045 		.trigger = mem_cgroup_reset,
2046 		.read_u64 = mem_cgroup_read,
2047 	},
2048 	{
2049 		.name = "stat",
2050 		.read_map = mem_control_stat_show,
2051 	},
2052 	{
2053 		.name = "force_empty",
2054 		.trigger = mem_cgroup_force_empty_write,
2055 	},
2056 	{
2057 		.name = "use_hierarchy",
2058 		.write_u64 = mem_cgroup_hierarchy_write,
2059 		.read_u64 = mem_cgroup_hierarchy_read,
2060 	},
2061 	{
2062 		.name = "swappiness",
2063 		.read_u64 = mem_cgroup_swappiness_read,
2064 		.write_u64 = mem_cgroup_swappiness_write,
2065 	},
2066 };
2067 
2068 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2069 static struct cftype memsw_cgroup_files[] = {
2070 	{
2071 		.name = "memsw.usage_in_bytes",
2072 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2073 		.read_u64 = mem_cgroup_read,
2074 	},
2075 	{
2076 		.name = "memsw.max_usage_in_bytes",
2077 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2078 		.trigger = mem_cgroup_reset,
2079 		.read_u64 = mem_cgroup_read,
2080 	},
2081 	{
2082 		.name = "memsw.limit_in_bytes",
2083 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2084 		.write_string = mem_cgroup_write,
2085 		.read_u64 = mem_cgroup_read,
2086 	},
2087 	{
2088 		.name = "memsw.failcnt",
2089 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2090 		.trigger = mem_cgroup_reset,
2091 		.read_u64 = mem_cgroup_read,
2092 	},
2093 };
2094 
2095 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2096 {
2097 	if (!do_swap_account)
2098 		return 0;
2099 	return cgroup_add_files(cont, ss, memsw_cgroup_files,
2100 				ARRAY_SIZE(memsw_cgroup_files));
2101 }
2102 #else
2103 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2104 {
2105 	return 0;
2106 }
2107 #endif
2108 
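/*
 * Allocate the per-node structure holding the per-zone LRU lists and
 * reclaim statistics for @mem.  Returns 0 on success, 1 on allocation
 * failure.
 */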
2109 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2110 {
2111 	struct mem_cgroup_per_node *pn;
2112 	struct mem_cgroup_per_zone *mz;
2113 	enum lru_list l;
2114 	int zone, tmp = node;
2115 	/*
2116 	 * This routine is called for each possible node, but it is a bug
2117 	 * to call kmalloc() against an offline node, so use node -1 (no
2118 	 * node preference) in that case.
2119 	 *
2120 	 * TODO: this routine can waste memory for nodes which will never be
2121 	 *       onlined.  A memory hotplug callback would be better.
2122 	 */
2123 	if (!node_state(node, N_NORMAL_MEMORY))
2124 		tmp = -1;
2125 	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
2126 	if (!pn)
2127 		return 1;
2128 
2129 	mem->info.nodeinfo[node] = pn;
2130 	memset(pn, 0, sizeof(*pn));
2131 
2132 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
2133 		mz = &pn->zoneinfo[zone];
2134 		for_each_lru(l)
2135 			INIT_LIST_HEAD(&mz->lists[l]);
2136 	}
2137 	return 0;
2138 }
2139 
2140 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2141 {
2142 	kfree(mem->info.nodeinfo[node]);
2143 }
2144 
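/*
 * Allocation size of a mem_cgroup: the structure itself plus one
 * mem_cgroup_stat_cpu per possible CPU for the cpustat[] tail array.
 */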
2145 static int mem_cgroup_size(void)
2146 {
2147 	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2148 	return sizeof(struct mem_cgroup) + cpustat_size;
2149 }
2150 
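/*
 * Allocate and zero a mem_cgroup.  Small allocations use kmalloc();
 * larger ones (many possible CPUs) fall back to vmalloc().
 */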
2151 static struct mem_cgroup *mem_cgroup_alloc(void)
2152 {
2153 	struct mem_cgroup *mem;
2154 	int size = mem_cgroup_size();
2155 
2156 	if (size < PAGE_SIZE)
2157 		mem = kmalloc(size, GFP_KERNEL);
2158 	else
2159 		mem = vmalloc(size);
2160 
2161 	if (mem)
2162 		memset(mem, 0, size);
2163 	return mem;
2164 }
2165 
2166 /*
2167  * When a mem_cgroup is destroyed, references from swap_cgroup may still
2168  * remain (scanning them all at force_empty would be too costly...).
2169  *
2170  * Instead of clearing all references at force_empty, we remember the
2171  * number of references from swap_cgroup and free the mem_cgroup only
2172  * when that count drops to 0.
2173  *
2174  * Removal of the cgroup itself succeeds regardless of refs from swap.
2175  */
2176 
2177 static void __mem_cgroup_free(struct mem_cgroup *mem)
2178 {
2179 	int node;
2180 
2181 	for_each_node_state(node, N_POSSIBLE)
2182 		free_mem_cgroup_per_zone_info(mem, node);
2183 
2184 	if (mem_cgroup_size() < PAGE_SIZE)
2185 		kfree(mem);
2186 	else
2187 		vfree(mem);
2188 }
2189 
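/*
 * Reference counting: swap_cgroup entries and hierarchical children hold
 * references on a mem_cgroup; the final mem_cgroup_put() frees it and
 * releases the reference taken on its parent at creation time.
 */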
2190 static void mem_cgroup_get(struct mem_cgroup *mem)
2191 {
2192 	atomic_inc(&mem->refcnt);
2193 }
2194 
2195 static void mem_cgroup_put(struct mem_cgroup *mem)
2196 {
2197 	if (atomic_dec_and_test(&mem->refcnt)) {
2198 		struct mem_cgroup *parent = parent_mem_cgroup(mem);
2199 		__mem_cgroup_free(mem);
2200 		if (parent)
2201 			mem_cgroup_put(parent);
2202 	}
2203 }
2204 
2205 /*
2206  * Return the parent mem_cgroup when hierarchy is enabled, or NULL otherwise.
2207  */
2208 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2209 {
2210 	if (!mem->res.parent)
2211 		return NULL;
2212 	return mem_cgroup_from_res_counter(mem->res.parent, res);
2213 }
2214 
2215 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
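/*
 * Called when the root cgroup is created: enable swap accounting unless
 * the memory controller is disabled or "noswapaccount" was given on the
 * kernel command line.
 */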
2216 static void __init enable_swap_cgroup(void)
2217 {
2218 	if (!mem_cgroup_disabled() && really_do_swap_account)
2219 		do_swap_account = 1;
2220 }
2221 #else
2222 static void __init enable_swap_cgroup(void)
2223 {
2224 }
2225 #endif
2226 
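/*
 * The create() callback.  The root group also enables swap accounting;
 * children inherit use_hierarchy and swappiness from their parent, and
 * when the parent uses hierarchy the new res_counters are chained to the
 * parent's (taking a reference that mem_cgroup_put() drops later).
 */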
2227 static struct cgroup_subsys_state * __ref
2228 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229 {
2230 	struct mem_cgroup *mem, *parent;
2231 	int node;
2232 
2233 	mem = mem_cgroup_alloc();
2234 	if (!mem)
2235 		return ERR_PTR(-ENOMEM);
2236 
2237 	for_each_node_state(node, N_POSSIBLE)
2238 		if (alloc_mem_cgroup_per_zone_info(mem, node))
2239 			goto free_out;
2240 	/* root ? */
2241 	if (cont->parent == NULL) {
2242 		enable_swap_cgroup();
2243 		parent = NULL;
2244 	} else {
2245 		parent = mem_cgroup_from_cont(cont->parent);
2246 		mem->use_hierarchy = parent->use_hierarchy;
2247 	}
2248 
2249 	if (parent && parent->use_hierarchy) {
2250 		res_counter_init(&mem->res, &parent->res);
2251 		res_counter_init(&mem->memsw, &parent->memsw);
2252 		/*
2253 		 * We increment refcnt of the parent to ensure that we can
2254 		 * safely access it on res_counter_charge/uncharge.
2255 		 * This refcnt will be decremented when freeing this
2256 		 * mem_cgroup(see mem_cgroup_put).
2257 		 */
2258 		mem_cgroup_get(parent);
2259 	} else {
2260 		res_counter_init(&mem->res, NULL);
2261 		res_counter_init(&mem->memsw, NULL);
2262 	}
2263 	mem->last_scanned_child = NULL;
2264 	spin_lock_init(&mem->reclaim_param_lock);
2265 
2266 	if (parent)
2267 		mem->swappiness = get_swappiness(parent);
2268 	atomic_set(&mem->refcnt, 1);
2269 	return &mem->css;
2270 free_out:
2271 	__mem_cgroup_free(mem);
2272 	return ERR_PTR(-ENOMEM);
2273 }
2274 
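/*
 * pre_destroy() callback: empty the group of remaining charges before
 * the cgroup is removed.
 */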
2275 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 					struct cgroup *cont)
2277 {
2278 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 	mem_cgroup_force_empty(mem, false);
2280 }
2281 
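/*
 * destroy() callback: drop the reference cached in last_scanned_child
 * (which must already be obsolete) and the group's own reference; the
 * structure is freed once all remaining swap references are gone.
 */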
2282 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2283 				struct cgroup *cont)
2284 {
2285 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 	struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287 
2288 	if (last_scanned_child) {
2289 		VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 		mem_cgroup_put(last_scanned_child);
2291 	}
2292 	mem_cgroup_put(mem);
2293 }
2294 
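/*
 * Create the memory.* control files and, if swap accounting is enabled,
 * the memsw.* files as well.
 */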
2295 static int mem_cgroup_populate(struct cgroup_subsys *ss,
2296 				struct cgroup *cont)
2297 {
2298 	int ret;
2299 
2300 	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2301 				ARRAY_SIZE(mem_cgroup_files));
2302 
2303 	if (!ret)
2304 		ret = register_memsw_files(cont, ss);
2305 	return ret;
2306 }
2307 
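/*
 * attach() callback.  Charges are not moved to the new group yet (see
 * the FIXME below); memcg_tasklist is taken only to serialize the move
 * against other memcg code paths that hold the same mutex.
 */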
2308 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2309 				struct cgroup *cont,
2310 				struct cgroup *old_cont,
2311 				struct task_struct *p)
2312 {
2313 	mutex_lock(&memcg_tasklist);
2314 	/*
2315 	 * FIXME: it would be better to move this task's charges from the
2316 	 * old memcg to the new one, but that is still on the TODO list.
2317 	 */
2318 	mutex_unlock(&memcg_tasklist);
2319 }
2320 
2321 struct cgroup_subsys mem_cgroup_subsys = {
2322 	.name = "memory",
2323 	.subsys_id = mem_cgroup_subsys_id,
2324 	.create = mem_cgroup_create,
2325 	.pre_destroy = mem_cgroup_pre_destroy,
2326 	.destroy = mem_cgroup_destroy,
2327 	.populate = mem_cgroup_populate,
2328 	.attach = mem_cgroup_move_task,
2329 	.early_init = 0,
2330 };
2331 
2332 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2333 
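/*
 * "noswapaccount" on the kernel command line disables swap accounting
 * even when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is built in.
 */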
2334 static int __init disable_swap_account(char *s)
2335 {
2336 	really_do_swap_account = 0;
2337 	return 1;
2338 }
2339 __setup("noswapaccount", disable_swap_account);
2340 #endif
2341