xref: /linux/mm/mempolicy.c (revision ed3174d93c342b8b2eeba6bbd124707d55304a7b)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy an process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case node -1 here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
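
/*
 * Illustrative sketch (not part of this file): how the policies above are
 * typically requested from user space.  This assumes the libnuma
 * <numaif.h> prototypes for the set_mempolicy()/mbind() syscalls; the
 * function name, buffer and mask values are made up for the example.
 *
 *	#include <numaif.h>
 *	#include <stdlib.h>
 *
 *	static void numa_hints_example(void)
 *	{
 *		unsigned long interleave_mask = 0x3;	// nodes 0 and 1
 *		unsigned long bind_mask = 0x1;		// node 0 only
 *		void *buf = malloc(1 << 20);
 *
 *		// Process policy: interleave future allocations over nodes 0-1.
 *		// maxnode is 3 because get_nodes() below decrements it first.
 *		set_mempolicy(MPOL_INTERLEAVE, &interleave_mask, 3);
 *
 *		// VMA policy: restrict this mapping to node 0 (no fallback).
 *		mbind(buf, 1 << 20, MPOL_BIND, &bind_mask, 2, 0);
 *	}
 *
 * The VMA policy set by mbind() takes precedence over the process policy
 * for faults in that range, as described above.
 */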
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel does not always handle that gracefully.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/nodemask.h>
76 #include <linux/cpuset.h>
77 #include <linux/gfp.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/module.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 static void mpol_rebind_policy(struct mempolicy *pol,
114                                const nodemask_t *newmask);
115 
116 /* Do sanity checking on a policy */
117 static int mpol_check_policy(int mode, nodemask_t *nodes)
118 {
119 	int was_empty, is_empty;
120 
121 	if (!nodes)
122 		return 0;
123 
124 	/*
125 	 * "Contextualize" the in-coming nodemask for cpusets:
126 	 * Remember whether the in-coming nodemask was empty.  If not,
127 	 * restrict the nodes to the allowed nodes in the cpuset.
128 	 * This is guaranteed to be a subset of nodes with memory.
129 	 */
130 	cpuset_update_task_memory_state();
131 	is_empty = was_empty = nodes_empty(*nodes);
132 	if (!was_empty) {
133 		nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134 		is_empty = nodes_empty(*nodes);	/* after "contextualization" */
135 	}
136 
137 	switch (mode) {
138 	case MPOL_DEFAULT:
139 		/*
140 		 * require caller to specify an empty nodemask
141 		 * before "contextualization"
142 		 */
143 		if (!was_empty)
144 			return -EINVAL;
145 		break;
146 	case MPOL_BIND:
147 	case MPOL_INTERLEAVE:
148 		/*
149 		 * require at least 1 valid node after "contextualization"
150 		 */
151 		if (is_empty)
152 			return -EINVAL;
153 		break;
154 	case MPOL_PREFERRED:
155 		/*
156 		 * Did caller specify invalid nodes?
157 		 * Don't silently accept this as "local allocation".
158 		 */
159 		if (!was_empty && is_empty)
160 			return -EINVAL;
161 		break;
162 	}
163 	return 0;
164 }
165 
166 /* Generate a custom zonelist for the BIND policy. */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
168 {
169 	struct zonelist *zl;
170 	int num, max, nd;
171 	enum zone_type k;
172 
173 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
174 	max++;			/* space for zlcache_ptr (see mmzone.h) */
175 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
176 	if (!zl)
177 		return ERR_PTR(-ENOMEM);
178 	zl->zlcache_ptr = NULL;
179 	num = 0;
180 	/* First put in the highest zones from all nodes, then all the next
181 	   lower zones etc. Avoid empty zones because the memory allocator
182 	   doesn't like them. If you implement node hot removal you
183 	   have to fix that. */
184 	k = MAX_NR_ZONES - 1;
185 	while (1) {
186 		for_each_node_mask(nd, *nodes) {
187 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
188 			if (z->present_pages > 0)
189 				zl->zones[num++] = z;
190 		}
191 		if (k == 0)
192 			break;
193 		k--;
194 	}
195 	if (num == 0) {
196 		kfree(zl);
197 		return ERR_PTR(-EINVAL);
198 	}
199 	zl->zones[num] = NULL;
200 	return zl;
201 }
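
/*
 * Illustrative example of the ordering built above (not compiled here):
 * assuming a two-node machine where both nodes have DMA, Normal and
 * HighMem zones populated,
 *
 *	nodemask_t nodes;
 *
 *	nodes_clear(nodes);
 *	node_set(0, nodes);
 *	node_set(1, nodes);
 *	zl = bind_zonelist(&nodes);
 *
 * yields a NULL-terminated list ordered highest zone first:
 *
 *	zl->zones[] = { HighMem(0), HighMem(1),
 *			Normal(0),  Normal(1),
 *			DMA(0),     DMA(1), NULL };
 *
 * Empty zones are skipped, so a node without HighMem simply contributes
 * fewer entries.
 */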
202 
203 /* Create a new policy */
204 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
205 {
206 	struct mempolicy *policy;
207 
208 	pr_debug("setting mode %d nodes[0] %lx\n",
209 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
210 
211 	if (mode == MPOL_DEFAULT)
212 		return NULL;
213 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
214 	if (!policy)
215 		return ERR_PTR(-ENOMEM);
216 	atomic_set(&policy->refcnt, 1);
217 	switch (mode) {
218 	case MPOL_INTERLEAVE:
219 		policy->v.nodes = *nodes;
220 		if (nodes_weight(policy->v.nodes) == 0) {
221 			kmem_cache_free(policy_cache, policy);
222 			return ERR_PTR(-EINVAL);
223 		}
224 		break;
225 	case MPOL_PREFERRED:
226 		policy->v.preferred_node = first_node(*nodes);
227 		if (policy->v.preferred_node >= MAX_NUMNODES)
228 			policy->v.preferred_node = -1;
229 		break;
230 	case MPOL_BIND:
231 		policy->v.zonelist = bind_zonelist(nodes);
232 		if (IS_ERR(policy->v.zonelist)) {
233 			void *error_code = policy->v.zonelist;
234 			kmem_cache_free(policy_cache, policy);
235 			return error_code;
236 		}
237 		break;
238 	}
239 	policy->policy = mode;
240 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
241 	return policy;
242 }
243 
244 static void gather_stats(struct page *, void *, int pte_dirty);
245 static void migrate_page_add(struct page *page, struct list_head *pagelist,
246 				unsigned long flags);
247 
248 /* Scan through pages, checking whether they satisfy the given conditions. */
249 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
250 		unsigned long addr, unsigned long end,
251 		const nodemask_t *nodes, unsigned long flags,
252 		void *private)
253 {
254 	pte_t *orig_pte;
255 	pte_t *pte;
256 	spinlock_t *ptl;
257 
258 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
259 	do {
260 		struct page *page;
261 		int nid;
262 
263 		if (!pte_present(*pte))
264 			continue;
265 		page = vm_normal_page(vma, addr, *pte);
266 		if (!page)
267 			continue;
268 		/*
269 		 * The check for PageReserved here is important to avoid
270 		 * handling zero pages and other pages that may have been
271 		 * marked special by the system.
272 		 *
273 		 * If PageReserved were not checked here then e.g.
274 		 * the location of the zero page could have an influence
275 		 * on MPOL_MF_STRICT, zero pages would be counted for
276 		 * the per node stats, and there would be useless attempts
277 		 * to put zero pages on the migration list.
278 		 */
279 		if (PageReserved(page))
280 			continue;
281 		nid = page_to_nid(page);
282 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
283 			continue;
284 
285 		if (flags & MPOL_MF_STATS)
286 			gather_stats(page, private, pte_dirty(*pte));
287 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
288 			migrate_page_add(page, private, flags);
289 		else
290 			break;
291 	} while (pte++, addr += PAGE_SIZE, addr != end);
292 	pte_unmap_unlock(orig_pte, ptl);
293 	return addr != end;
294 }
295 
296 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
297 		unsigned long addr, unsigned long end,
298 		const nodemask_t *nodes, unsigned long flags,
299 		void *private)
300 {
301 	pmd_t *pmd;
302 	unsigned long next;
303 
304 	pmd = pmd_offset(pud, addr);
305 	do {
306 		next = pmd_addr_end(addr, end);
307 		if (pmd_none_or_clear_bad(pmd))
308 			continue;
309 		if (check_pte_range(vma, pmd, addr, next, nodes,
310 				    flags, private))
311 			return -EIO;
312 	} while (pmd++, addr = next, addr != end);
313 	return 0;
314 }
315 
316 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
317 		unsigned long addr, unsigned long end,
318 		const nodemask_t *nodes, unsigned long flags,
319 		void *private)
320 {
321 	pud_t *pud;
322 	unsigned long next;
323 
324 	pud = pud_offset(pgd, addr);
325 	do {
326 		next = pud_addr_end(addr, end);
327 		if (pud_none_or_clear_bad(pud))
328 			continue;
329 		if (check_pmd_range(vma, pud, addr, next, nodes,
330 				    flags, private))
331 			return -EIO;
332 	} while (pud++, addr = next, addr != end);
333 	return 0;
334 }
335 
336 static inline int check_pgd_range(struct vm_area_struct *vma,
337 		unsigned long addr, unsigned long end,
338 		const nodemask_t *nodes, unsigned long flags,
339 		void *private)
340 {
341 	pgd_t *pgd;
342 	unsigned long next;
343 
344 	pgd = pgd_offset(vma->vm_mm, addr);
345 	do {
346 		next = pgd_addr_end(addr, end);
347 		if (pgd_none_or_clear_bad(pgd))
348 			continue;
349 		if (check_pud_range(vma, pgd, addr, next, nodes,
350 				    flags, private))
351 			return -EIO;
352 	} while (pgd++, addr = next, addr != end);
353 	return 0;
354 }
355 
356 /*
357  * Check if all pages in a range are on a set of nodes.
358  * If pagelist != NULL then isolate pages from the LRU and
359  * put them on the pagelist.
360  */
361 static struct vm_area_struct *
362 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
363 		const nodemask_t *nodes, unsigned long flags, void *private)
364 {
365 	int err;
366 	struct vm_area_struct *first, *vma, *prev;
367 
368 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
369 
370 		err = migrate_prep();
371 		if (err)
372 			return ERR_PTR(err);
373 	}
374 
375 	first = find_vma(mm, start);
376 	if (!first)
377 		return ERR_PTR(-EFAULT);
378 	prev = NULL;
379 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
380 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
381 			if (!vma->vm_next && vma->vm_end < end)
382 				return ERR_PTR(-EFAULT);
383 			if (prev && prev->vm_end < vma->vm_start)
384 				return ERR_PTR(-EFAULT);
385 		}
386 		if (!is_vm_hugetlb_page(vma) &&
387 		    ((flags & MPOL_MF_STRICT) ||
388 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
389 				vma_migratable(vma)))) {
390 			unsigned long endvma = vma->vm_end;
391 
392 			if (endvma > end)
393 				endvma = end;
394 			if (vma->vm_start > start)
395 				start = vma->vm_start;
396 			err = check_pgd_range(vma, start, endvma, nodes,
397 						flags, private);
398 			if (err) {
399 				first = ERR_PTR(err);
400 				break;
401 			}
402 		}
403 		prev = vma;
404 	}
405 	return first;
406 }
407 
408 /* Apply policy to a single VMA */
409 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
410 {
411 	int err = 0;
412 	struct mempolicy *old = vma->vm_policy;
413 
414 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
415 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
416 		 vma->vm_ops, vma->vm_file,
417 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
418 
419 	if (vma->vm_ops && vma->vm_ops->set_policy)
420 		err = vma->vm_ops->set_policy(vma, new);
421 	if (!err) {
422 		mpol_get(new);
423 		vma->vm_policy = new;
424 		mpol_free(old);
425 	}
426 	return err;
427 }
428 
429 /* Step 2: apply policy to a range and do splits. */
430 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
431 		       unsigned long end, struct mempolicy *new)
432 {
433 	struct vm_area_struct *next;
434 	int err;
435 
436 	err = 0;
437 	for (; vma && vma->vm_start < end; vma = next) {
438 		next = vma->vm_next;
439 		if (vma->vm_start < start)
440 			err = split_vma(vma->vm_mm, vma, start, 1);
441 		if (!err && vma->vm_end > end)
442 			err = split_vma(vma->vm_mm, vma, end, 0);
443 		if (!err)
444 			err = policy_vma(vma, new);
445 		if (err)
446 			break;
447 	}
448 	return err;
449 }
450 
451 /*
452  * Update task->flags PF_MEMPOLICY bit: set iff non-default
453  * mempolicy.  Allows more rapid checking of this (combined perhaps
454  * with other PF_* flag bits) on memory allocation hot code paths.
455  *
456  * If called from outside this file, the task 'p' should -only- be
457  * a newly forked child not yet visible on the task list, because
458  * manipulating the task flags of a visible task is not safe.
459  *
460  * The above limitation is why this routine has the funny name
461  * mpol_fix_fork_child_flag().
462  *
463  * It is also safe to call this with a task pointer of current,
464  * which the static wrapper mpol_set_task_struct_flag() does,
465  * for use within this file.
466  */
467 
468 void mpol_fix_fork_child_flag(struct task_struct *p)
469 {
470 	if (p->mempolicy)
471 		p->flags |= PF_MEMPOLICY;
472 	else
473 		p->flags &= ~PF_MEMPOLICY;
474 }
475 
476 static void mpol_set_task_struct_flag(void)
477 {
478 	mpol_fix_fork_child_flag(current);
479 }
480 
481 /* Set the process memory policy */
482 static long do_set_mempolicy(int mode, nodemask_t *nodes)
483 {
484 	struct mempolicy *new;
485 
486 	if (mpol_check_policy(mode, nodes))
487 		return -EINVAL;
488 	new = mpol_new(mode, nodes);
489 	if (IS_ERR(new))
490 		return PTR_ERR(new);
491 	mpol_free(current->mempolicy);
492 	current->mempolicy = new;
493 	mpol_set_task_struct_flag();
494 	if (new && new->policy == MPOL_INTERLEAVE)
495 		current->il_next = first_node(new->v.nodes);
496 	return 0;
497 }
498 
499 /* Fill a zone bitmap for a policy */
500 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
501 {
502 	int i;
503 
504 	nodes_clear(*nodes);
505 	switch (p->policy) {
506 	case MPOL_BIND:
507 		for (i = 0; p->v.zonelist->zones[i]; i++)
508 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
509 				*nodes);
510 		break;
511 	case MPOL_DEFAULT:
512 		break;
513 	case MPOL_INTERLEAVE:
514 		*nodes = p->v.nodes;
515 		break;
516 	case MPOL_PREFERRED:
517 		/* or use current node instead of memory_map? */
518 		if (p->v.preferred_node < 0)
519 			*nodes = node_states[N_HIGH_MEMORY];
520 		else
521 			node_set(p->v.preferred_node, *nodes);
522 		break;
523 	default:
524 		BUG();
525 	}
526 }
527 
528 static int lookup_node(struct mm_struct *mm, unsigned long addr)
529 {
530 	struct page *p;
531 	int err;
532 
533 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
534 	if (err >= 0) {
535 		err = page_to_nid(p);
536 		put_page(p);
537 	}
538 	return err;
539 }
540 
541 /* Retrieve NUMA policy */
542 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
543 			     unsigned long addr, unsigned long flags)
544 {
545 	int err;
546 	struct mm_struct *mm = current->mm;
547 	struct vm_area_struct *vma = NULL;
548 	struct mempolicy *pol = current->mempolicy;
549 
550 	cpuset_update_task_memory_state();
551 	if (flags &
552 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
553 		return -EINVAL;
554 
555 	if (flags & MPOL_F_MEMS_ALLOWED) {
556 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
557 			return -EINVAL;
558 		*policy = 0;	/* just so it's initialized */
559 		*nmask  = cpuset_current_mems_allowed;
560 		return 0;
561 	}
562 
563 	if (flags & MPOL_F_ADDR) {
564 		down_read(&mm->mmap_sem);
565 		vma = find_vma_intersection(mm, addr, addr+1);
566 		if (!vma) {
567 			up_read(&mm->mmap_sem);
568 			return -EFAULT;
569 		}
570 		if (vma->vm_ops && vma->vm_ops->get_policy)
571 			pol = vma->vm_ops->get_policy(vma, addr);
572 		else
573 			pol = vma->vm_policy;
574 	} else if (addr)
575 		return -EINVAL;
576 
577 	if (!pol)
578 		pol = &default_policy;
579 
580 	if (flags & MPOL_F_NODE) {
581 		if (flags & MPOL_F_ADDR) {
582 			err = lookup_node(mm, addr);
583 			if (err < 0)
584 				goto out;
585 			*policy = err;
586 		} else if (pol == current->mempolicy &&
587 				pol->policy == MPOL_INTERLEAVE) {
588 			*policy = current->il_next;
589 		} else {
590 			err = -EINVAL;
591 			goto out;
592 		}
593 	} else
594 		*policy = pol->policy;
595 
596 	if (vma) {
597 		up_read(&current->mm->mmap_sem);
598 		vma = NULL;
599 	}
600 
601 	err = 0;
602 	if (nmask)
603 		get_zonemask(pol, nmask);
604 
605  out:
606 	if (vma)
607 		up_read(&current->mm->mmap_sem);
608 	return err;
609 }
610 
611 #ifdef CONFIG_MIGRATION
612 /*
613  * page migration
614  */
615 static void migrate_page_add(struct page *page, struct list_head *pagelist,
616 				unsigned long flags)
617 {
618 	/*
619 	 * Avoid migrating a page that is shared with others.
620 	 */
621 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
622 		isolate_lru_page(page, pagelist);
623 }
624 
625 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
626 {
627 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
628 }
629 
630 /*
631  * Migrate pages from one node to a target node.
632  * Returns error or the number of pages not migrated.
633  */
634 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
635 			   int flags)
636 {
637 	nodemask_t nmask;
638 	LIST_HEAD(pagelist);
639 	int err = 0;
640 
641 	nodes_clear(nmask);
642 	node_set(source, nmask);
643 
644 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
645 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
646 
647 	if (!list_empty(&pagelist))
648 		err = migrate_pages(&pagelist, new_node_page, dest);
649 
650 	return err;
651 }
652 
653 /*
654  * Move pages between the two nodesets so as to preserve the physical
655  * layout as much as possible.
656  *
657  * Returns the number of pages that could not be moved.
658  */
659 int do_migrate_pages(struct mm_struct *mm,
660 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
661 {
662 	LIST_HEAD(pagelist);
663 	int busy = 0;
664 	int err = 0;
665 	nodemask_t tmp;
666 
667 	down_read(&mm->mmap_sem);
668 
669 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
670 	if (err)
671 		goto out;
672 
673 /*
674  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
675  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
676  * bit in 'tmp', and return that <source, dest> pair for migration.
677  * The pair of nodemasks 'to' and 'from' define the map.
678  *
679  * If no pair of bits is found that way, fallback to picking some
680  * pair of 'source' and 'dest' bits that are not the same.  If the
681  * 'source' and 'dest' bits are the same, this represents a node
682  * that will be migrating to itself, so no pages need move.
683  *
684  * If no bits are left in 'tmp', or if all remaining bits left
685  * in 'tmp' correspond to the same bit in 'to', return false
686  * (nothing left to migrate).
687  *
688  * This lets us pick a pair of nodes to migrate between, such that
689  * if possible the dest node is not already occupied by some other
690  * source node, minimizing the risk of overloading the memory on a
691  * node that would happen if we migrated incoming memory to a node
692  * before migrating the outgoing memory sourced from that same node.
693  *
694  * A single scan of tmp is sufficient.  As we go, we remember the
695  * most recent <s, d> pair that moved (s != d).  If we find a pair
696  * that not only moved, but what's better, moved to an empty slot
697  * (d is not set in tmp), then we break out then, with that pair.
698  * Otherwise, when we finish scanning tmp, we at least have the
699  * most recent <s, d> pair that moved.  If we get all the way through
700  * the scan of tmp without finding any node that moved, much less
701  * moved to an empty node, then there is nothing left worth migrating.
702  */
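
/*
 * Worked example of the selection above (illustrative only).  Suppose:
 *
 *	nodemask_t from, to;
 *
 *	nodes_clear(from);  nodes_clear(to);
 *	node_set(0, from);  node_set(1, from);
 *	node_set(1, to);    node_set(2, to);
 *
 * node_remap() then maps 0 -> 1 and 1 -> 2.  The first scan of
 * tmp = {0,1} picks the pair <1,2> because dest 2 is not in the
 * remaining source set, so node 1 is drained into node 2 first; the
 * second pass then moves node 0's pages to the now less loaded node 1.
 */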
703 
704 	tmp = *from_nodes;
705 	while (!nodes_empty(tmp)) {
706 		int s,d;
707 		int source = -1;
708 		int dest = 0;
709 
710 		for_each_node_mask(s, tmp) {
711 			d = node_remap(s, *from_nodes, *to_nodes);
712 			if (s == d)
713 				continue;
714 
715 			source = s;	/* Node moved. Memorize */
716 			dest = d;
717 
718 			/* dest not in remaining from nodes? */
719 			if (!node_isset(dest, tmp))
720 				break;
721 		}
722 		if (source == -1)
723 			break;
724 
725 		node_clear(source, tmp);
726 		err = migrate_to_node(mm, source, dest, flags);
727 		if (err > 0)
728 			busy += err;
729 		if (err < 0)
730 			break;
731 	}
732 out:
733 	up_read(&mm->mmap_sem);
734 	if (err < 0)
735 		return err;
736 	return busy;
737 
738 }
739 
740 /*
741  * Allocate a new page for page migration based on vma policy.
742  * Start assuming that page is mapped by vma pointed to by @private.
743  * Search forward from there, if not.  N.B., this assumes that the
744  * list of pages handed to migrate_pages()--which is how we get here--
745  * is in virtual address order.
746  */
747 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
748 {
749 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
750 	unsigned long uninitialized_var(address);
751 
752 	while (vma) {
753 		address = page_address_in_vma(page, vma);
754 		if (address != -EFAULT)
755 			break;
756 		vma = vma->vm_next;
757 	}
758 
759 	/*
760 	 * if !vma, alloc_page_vma() will use task or system default policy
761 	 */
762 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
763 }
764 #else
765 
766 static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 				unsigned long flags)
768 {
769 }
770 
771 int do_migrate_pages(struct mm_struct *mm,
772 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
773 {
774 	return -ENOSYS;
775 }
776 
777 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
778 {
779 	return NULL;
780 }
781 #endif
782 
783 static long do_mbind(unsigned long start, unsigned long len,
784 		     unsigned long mode, nodemask_t *nmask,
785 		     unsigned long flags)
786 {
787 	struct vm_area_struct *vma;
788 	struct mm_struct *mm = current->mm;
789 	struct mempolicy *new;
790 	unsigned long end;
791 	int err;
792 	LIST_HEAD(pagelist);
793 
794 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
795 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
796 	    || mode > MPOL_MAX)
797 		return -EINVAL;
798 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
799 		return -EPERM;
800 
801 	if (start & ~PAGE_MASK)
802 		return -EINVAL;
803 
804 	if (mode == MPOL_DEFAULT)
805 		flags &= ~MPOL_MF_STRICT;
806 
807 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
808 	end = start + len;
809 
810 	if (end < start)
811 		return -EINVAL;
812 	if (end == start)
813 		return 0;
814 
815 	if (mpol_check_policy(mode, nmask))
816 		return -EINVAL;
817 
818 	new = mpol_new(mode, nmask);
819 	if (IS_ERR(new))
820 		return PTR_ERR(new);
821 
822 	/*
823 	 * If we are using the default policy then operation
824 	 * on discontinuous address spaces is okay after all
825 	 */
826 	if (!new)
827 		flags |= MPOL_MF_DISCONTIG_OK;
828 
829 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
830 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
831 
832 	down_write(&mm->mmap_sem);
833 	vma = check_range(mm, start, end, nmask,
834 			  flags | MPOL_MF_INVERT, &pagelist);
835 
836 	err = PTR_ERR(vma);
837 	if (!IS_ERR(vma)) {
838 		int nr_failed = 0;
839 
840 		err = mbind_range(vma, start, end, new);
841 
842 		if (!list_empty(&pagelist))
843 			nr_failed = migrate_pages(&pagelist, new_vma_page,
844 						(unsigned long)vma);
845 
846 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
847 			err = -EIO;
848 	}
849 
850 	up_write(&mm->mmap_sem);
851 	mpol_free(new);
852 	return err;
853 }
854 
855 /*
856  * User space interface with variable sized bitmaps for nodelists.
857  */
858 
859 /* Copy a node mask from user space. */
860 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
861 		     unsigned long maxnode)
862 {
863 	unsigned long k;
864 	unsigned long nlongs;
865 	unsigned long endmask;
866 
867 	--maxnode;
868 	nodes_clear(*nodes);
869 	if (maxnode == 0 || !nmask)
870 		return 0;
871 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
872 		return -EINVAL;
873 
874 	nlongs = BITS_TO_LONGS(maxnode);
875 	if ((maxnode % BITS_PER_LONG) == 0)
876 		endmask = ~0UL;
877 	else
878 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
879 
880 	/* When the user specified more nodes than supported, just check
881 	   that the unsupported part is all zero. */
882 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
883 		if (nlongs > PAGE_SIZE/sizeof(long))
884 			return -EINVAL;
885 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
886 			unsigned long t;
887 			if (get_user(t, nmask + k))
888 				return -EFAULT;
889 			if (k == nlongs - 1) {
890 				if (t & endmask)
891 					return -EINVAL;
892 			} else if (t)
893 				return -EINVAL;
894 		}
895 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
896 		endmask = ~0UL;
897 	}
898 
899 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
900 		return -EFAULT;
901 	nodes_addr(*nodes)[nlongs-1] &= endmask;
902 	return 0;
903 }
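
/*
 * Illustrative trace of get_nodes() (assuming a 64-bit kernel and
 * MAX_NUMNODES >= 4): a caller passing maxnode == 5 ends up with
 * maxnode == 4 after the decrement, nlongs == 1 and
 * endmask == (1UL << 4) - 1 == 0xf, so only the bits for nodes 0-3
 * survive the final "nodes_addr(*nodes)[0] &= endmask".
 */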
904 
905 /* Copy a kernel node mask to user space */
906 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
907 			      nodemask_t *nodes)
908 {
909 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
910 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
911 
912 	if (copy > nbytes) {
913 		if (copy > PAGE_SIZE)
914 			return -EINVAL;
915 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
916 			return -EFAULT;
917 		copy = nbytes;
918 	}
919 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
920 }
921 
922 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
923 			unsigned long mode,
924 			unsigned long __user *nmask, unsigned long maxnode,
925 			unsigned flags)
926 {
927 	nodemask_t nodes;
928 	int err;
929 
930 	err = get_nodes(&nodes, nmask, maxnode);
931 	if (err)
932 		return err;
933 	return do_mbind(start, len, mode, &nodes, flags);
934 }
935 
936 /* Set the process memory policy */
937 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
938 		unsigned long maxnode)
939 {
940 	int err;
941 	nodemask_t nodes;
942 
943 	if (mode < 0 || mode > MPOL_MAX)
944 		return -EINVAL;
945 	err = get_nodes(&nodes, nmask, maxnode);
946 	if (err)
947 		return err;
948 	return do_set_mempolicy(mode, &nodes);
949 }
950 
951 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
952 		const unsigned long __user *old_nodes,
953 		const unsigned long __user *new_nodes)
954 {
955 	struct mm_struct *mm;
956 	struct task_struct *task;
957 	nodemask_t old;
958 	nodemask_t new;
959 	nodemask_t task_nodes;
960 	int err;
961 
962 	err = get_nodes(&old, old_nodes, maxnode);
963 	if (err)
964 		return err;
965 
966 	err = get_nodes(&new, new_nodes, maxnode);
967 	if (err)
968 		return err;
969 
970 	/* Find the mm_struct */
971 	read_lock(&tasklist_lock);
972 	task = pid ? find_task_by_vpid(pid) : current;
973 	if (!task) {
974 		read_unlock(&tasklist_lock);
975 		return -ESRCH;
976 	}
977 	mm = get_task_mm(task);
978 	read_unlock(&tasklist_lock);
979 
980 	if (!mm)
981 		return -EINVAL;
982 
983 	/*
984 	 * Check if this process has the right to modify the specified
985 	 * process. The right exists if the process has administrative
986 	 * capabilities, superuser privileges or the same
987 	 * userid as the target process.
988 	 */
989 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
990 	    (current->uid != task->suid) && (current->uid != task->uid) &&
991 	    !capable(CAP_SYS_NICE)) {
992 		err = -EPERM;
993 		goto out;
994 	}
995 
996 	task_nodes = cpuset_mems_allowed(task);
997 	/* Is the user allowed to access the target nodes? */
998 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
999 		err = -EPERM;
1000 		goto out;
1001 	}
1002 
1003 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1004 		err = -EINVAL;
1005 		goto out;
1006 	}
1007 
1008 	err = security_task_movememory(task);
1009 	if (err)
1010 		goto out;
1011 
1012 	err = do_migrate_pages(mm, &old, &new,
1013 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1014 out:
1015 	mmput(mm);
1016 	return err;
1017 }
1018 
1019 
1020 /* Retrieve NUMA policy */
1021 asmlinkage long sys_get_mempolicy(int __user *policy,
1022 				unsigned long __user *nmask,
1023 				unsigned long maxnode,
1024 				unsigned long addr, unsigned long flags)
1025 {
1026 	int err;
1027 	int uninitialized_var(pval);
1028 	nodemask_t nodes;
1029 
1030 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1031 		return -EINVAL;
1032 
1033 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1034 
1035 	if (err)
1036 		return err;
1037 
1038 	if (policy && put_user(pval, policy))
1039 		return -EFAULT;
1040 
1041 	if (nmask)
1042 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1043 
1044 	return err;
1045 }
1046 
1047 #ifdef CONFIG_COMPAT
1048 
1049 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1050 				     compat_ulong_t __user *nmask,
1051 				     compat_ulong_t maxnode,
1052 				     compat_ulong_t addr, compat_ulong_t flags)
1053 {
1054 	long err;
1055 	unsigned long __user *nm = NULL;
1056 	unsigned long nr_bits, alloc_size;
1057 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1058 
1059 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1060 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1061 
1062 	if (nmask)
1063 		nm = compat_alloc_user_space(alloc_size);
1064 
1065 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1066 
1067 	if (!err && nmask) {
1068 		err = copy_from_user(bm, nm, alloc_size);
1069 		/* ensure entire bitmap is zeroed */
1070 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1071 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1072 	}
1073 
1074 	return err;
1075 }
1076 
1077 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1078 				     compat_ulong_t maxnode)
1079 {
1080 	long err = 0;
1081 	unsigned long __user *nm = NULL;
1082 	unsigned long nr_bits, alloc_size;
1083 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1084 
1085 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1086 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1087 
1088 	if (nmask) {
1089 		err = compat_get_bitmap(bm, nmask, nr_bits);
1090 		nm = compat_alloc_user_space(alloc_size);
1091 		err |= copy_to_user(nm, bm, alloc_size);
1092 	}
1093 
1094 	if (err)
1095 		return -EFAULT;
1096 
1097 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1098 }
1099 
1100 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1101 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1102 			     compat_ulong_t maxnode, compat_ulong_t flags)
1103 {
1104 	long err = 0;
1105 	unsigned long __user *nm = NULL;
1106 	unsigned long nr_bits, alloc_size;
1107 	nodemask_t bm;
1108 
1109 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1110 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1111 
1112 	if (nmask) {
1113 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1114 		nm = compat_alloc_user_space(alloc_size);
1115 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1116 	}
1117 
1118 	if (err)
1119 		return -EFAULT;
1120 
1121 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1122 }
1123 
1124 #endif
1125 
1126 /*
1127  * get_vma_policy(@task, @vma, @addr)
1128  * @task - task for fallback if vma policy == default
1129  * @vma   - virtual memory area whose policy is sought
1130  * @addr  - address in @vma for shared policy lookup
1131  *
1132  * Returns effective policy for a VMA at specified address.
1133  * Falls back to @task or system default policy, as necessary.
1134  * Returned policy has extra reference count if shared, vma,
1135  * or some other task's policy [show_numa_map() can pass
1136  * @task != current].  It is the caller's responsibility to
1137  * free the reference in these cases.
1138  */
1139 static struct mempolicy * get_vma_policy(struct task_struct *task,
1140 		struct vm_area_struct *vma, unsigned long addr)
1141 {
1142 	struct mempolicy *pol = task->mempolicy;
1143 	int shared_pol = 0;
1144 
1145 	if (vma) {
1146 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1147 			pol = vma->vm_ops->get_policy(vma, addr);
1148 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1149 		} else if (vma->vm_policy &&
1150 				vma->vm_policy->policy != MPOL_DEFAULT)
1151 			pol = vma->vm_policy;
1152 	}
1153 	if (!pol)
1154 		pol = &default_policy;
1155 	else if (!shared_pol && pol != current->mempolicy)
1156 		mpol_get(pol);	/* vma or other task's policy */
1157 	return pol;
1158 }
1159 
1160 /* Return a zonelist representing a mempolicy */
1161 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1162 {
1163 	int nd;
1164 
1165 	switch (policy->policy) {
1166 	case MPOL_PREFERRED:
1167 		nd = policy->v.preferred_node;
1168 		if (nd < 0)
1169 			nd = numa_node_id();
1170 		break;
1171 	case MPOL_BIND:
1172 		/* Lower zones don't get a policy applied */
1173 		/* Careful: current->mems_allowed might have moved */
1174 		if (gfp_zone(gfp) >= policy_zone)
1175 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1176 				return policy->v.zonelist;
1177 		/*FALL THROUGH*/
1178 	case MPOL_INTERLEAVE: /* should not happen */
1179 	case MPOL_DEFAULT:
1180 		nd = numa_node_id();
1181 		break;
1182 	default:
1183 		nd = 0;
1184 		BUG();
1185 	}
1186 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1187 }
1188 
1189 /* Do dynamic interleaving for a process */
1190 static unsigned interleave_nodes(struct mempolicy *policy)
1191 {
1192 	unsigned nid, next;
1193 	struct task_struct *me = current;
1194 
1195 	nid = me->il_next;
1196 	next = next_node(nid, policy->v.nodes);
1197 	if (next >= MAX_NUMNODES)
1198 		next = first_node(policy->v.nodes);
1199 	me->il_next = next;
1200 	return nid;
1201 }
1202 
1203 /*
1204  * Depending on the memory policy provide a node from which to allocate the
1205  * next slab entry.
1206  */
1207 unsigned slab_node(struct mempolicy *policy)
1208 {
1209 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1210 
1211 	switch (pol) {
1212 	case MPOL_INTERLEAVE:
1213 		return interleave_nodes(policy);
1214 
1215 	case MPOL_BIND:
1216 		/*
1217 		 * Follow bind policy behavior and start allocation at the
1218 		 * first node.
1219 		 */
1220 		return zone_to_nid(policy->v.zonelist->zones[0]);
1221 
1222 	case MPOL_PREFERRED:
1223 		if (policy->v.preferred_node >= 0)
1224 			return policy->v.preferred_node;
1225 		/* Fall through */
1226 
1227 	default:
1228 		return numa_node_id();
1229 	}
1230 }
1231 
1232 /* Do static interleaving for a VMA with known offset. */
1233 static unsigned offset_il_node(struct mempolicy *pol,
1234 		struct vm_area_struct *vma, unsigned long off)
1235 {
1236 	unsigned nnodes = nodes_weight(pol->v.nodes);
1237 	unsigned target = (unsigned)off % nnodes;
1238 	int c;
1239 	int nid = -1;
1240 
1241 	c = 0;
1242 	do {
1243 		nid = next_node(nid, pol->v.nodes);
1244 		c++;
1245 	} while (c <= target);
1246 	return nid;
1247 }
1248 
1249 /* Determine a node number for interleave */
1250 static inline unsigned interleave_nid(struct mempolicy *pol,
1251 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1252 {
1253 	if (vma) {
1254 		unsigned long off;
1255 
1256 		/*
1257 		 * for small pages, there is no difference between
1258 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1259 		 * for huge pages, since vm_pgoff is in units of small
1260 		 * pages, we need to shift off the always 0 bits to get
1261 		 * a useful offset.
1262 		 */
1263 		BUG_ON(shift < PAGE_SHIFT);
1264 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1265 		off += (addr - vma->vm_start) >> shift;
1266 		return offset_il_node(pol, vma, off);
1267 	} else
1268 		return interleave_nodes(pol);
1269 }
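
/*
 * Worked example of the offset computation above (illustrative;
 * HPAGE_SHIFT == 21, i.e. 2MB huge pages, and PAGE_SHIFT == 12 are
 * assumptions):
 *
 *	off  = vma->vm_pgoff >> (21 - 12);	// huge-page index of the mapping start
 *	off += (addr - vma->vm_start) >> 21;	// huge pages into the VMA
 *
 * offset_il_node() then reduces off modulo nodes_weight(pol->v.nodes)
 * and walks that many set bits into the interleave nodemask, so a given
 * offset in the backing object always maps to the same node.
 */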
1270 
1271 #ifdef CONFIG_HUGETLBFS
1272 /*
1273  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1274  * @vma = virtual memory area whose policy is sought
1275  * @addr = address in @vma for shared policy lookup and interleave policy
1276  * @gfp_flags = for requested zone
1277  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1278  *
1279  * Returns a zonelist suitable for a huge page allocation.
1280  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1281  * If it is also a policy for which get_vma_policy() returns an extra
1282  * reference, we must hold that reference until after allocation.
1283  * In that case, return policy via @mpol so hugetlb allocation can drop
1284  * the reference.  For non-'BIND referenced policies, we can/do drop the
1285  * reference here, so the caller doesn't need to know about the special case
1286  * for default and current task policy.
1287  */
1288 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1289 				gfp_t gfp_flags, struct mempolicy **mpol)
1290 {
1291 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1292 	struct zonelist *zl;
1293 
1294 	*mpol = NULL;		/* probably no unref needed */
1295 	if (pol->policy == MPOL_INTERLEAVE) {
1296 		unsigned nid;
1297 
1298 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1299 		__mpol_free(pol);		/* finished with pol */
1300 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1301 	}
1302 
1303 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1304 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1305 		if (pol->policy != MPOL_BIND)
1306 			__mpol_free(pol);	/* finished with pol */
1307 		else
1308 			*mpol = pol;	/* unref needed after allocation */
1309 	}
1310 	return zl;
1311 }
1312 #endif
1313 
1314 /* Allocate a page in interleaved policy.
1315    Own path because it needs to do special accounting. */
1316 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1317 					unsigned nid)
1318 {
1319 	struct zonelist *zl;
1320 	struct page *page;
1321 
1322 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1323 	page = __alloc_pages(gfp, order, zl);
1324 	if (page && page_zone(page) == zl->zones[0])
1325 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1326 	return page;
1327 }
1328 
1329 /**
1330  * 	alloc_page_vma	- Allocate a page for a VMA.
1331  *
1332  * 	@gfp:
1333  *      %GFP_USER    user allocation.
1334  *      %GFP_KERNEL  kernel allocations,
1335  *      %GFP_HIGHMEM highmem/user allocations,
1336  *      %GFP_FS      allocation should not call back into a file system.
1337  *      %GFP_ATOMIC  don't sleep.
1338  *
1339  * 	@vma:  Pointer to VMA or NULL if not available.
1340  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1341  *
1342  * 	This function allocates a page from the kernel page pool and applies
1343  *	a NUMA policy associated with the VMA or the current process.
1344  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1345  *	mm_struct of the VMA to prevent it from going away. Should be used for
1346  *	all allocations for pages that will be mapped into
1347  * 	user space. Returns NULL when no page can be allocated.
1348  *
1349  *	Should be called with the mm_sem of the vma hold.
1350  *	Should be called with the mmap_sem of the vma's mm held.
1351 struct page *
1352 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1353 {
1354 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1355 	struct zonelist *zl;
1356 
1357 	cpuset_update_task_memory_state();
1358 
1359 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1360 		unsigned nid;
1361 
1362 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1363 		return alloc_page_interleave(gfp, 0, nid);
1364 	}
1365 	zl = zonelist_policy(gfp, pol);
1366 	if (pol != &default_policy && pol != current->mempolicy) {
1367 		/*
1368 		 * slow path: ref counted policy -- shared or vma
1369 		 */
1370 		struct page *page =  __alloc_pages(gfp, 0, zl);
1371 		__mpol_free(pol);
1372 		return page;
1373 	}
1374 	/*
1375 	 * fast path:  default or task policy
1376 	 */
1377 	return __alloc_pages(gfp, 0, zl);
1378 }
1379 
1380 /**
1381  * 	alloc_pages_current - Allocate pages.
1382  *
1383  *	@gfp:
1384  *		%GFP_USER   user allocation,
1385  *      	%GFP_KERNEL kernel allocation,
1386  *      	%GFP_HIGHMEM highmem allocation,
1387  *      	%GFP_FS     don't call back into a file system.
1388  *      	%GFP_ATOMIC don't sleep.
1389  *	@order: Power of two of allocation size in pages. 0 is a single page.
1390  *
1391  *	Allocate a page from the kernel page pool.  When not in
1392  *	interrupt context, apply the current process' NUMA policy.
1393  *	Returns NULL when no page can be allocated.
1394  *
1395  *	Don't call cpuset_update_task_memory_state() unless
1396  *	1) it's ok to take cpuset_sem (can WAIT), and
1397  *	2) allocating for current task (not interrupt).
1398  */
1399 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1400 {
1401 	struct mempolicy *pol = current->mempolicy;
1402 
1403 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1404 		cpuset_update_task_memory_state();
1405 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1406 		pol = &default_policy;
1407 	if (pol->policy == MPOL_INTERLEAVE)
1408 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1409 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1410 }
1411 EXPORT_SYMBOL(alloc_pages_current);
1412 
1413 /*
1414  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1415  * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1416  * with the mems_allowed returned by cpuset_mems_allowed().  This
1417  * keeps mempolicies cpuset relative after its cpuset moves.  See
1418  * further kernel/cpuset.c update_nodemask().
1419  */
1420 
1421 /* Slow path of a mempolicy copy */
1422 struct mempolicy *__mpol_copy(struct mempolicy *old)
1423 {
1424 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1425 
1426 	if (!new)
1427 		return ERR_PTR(-ENOMEM);
1428 	if (current_cpuset_is_being_rebound()) {
1429 		nodemask_t mems = cpuset_mems_allowed(current);
1430 		mpol_rebind_policy(old, &mems);
1431 	}
1432 	*new = *old;
1433 	atomic_set(&new->refcnt, 1);
1434 	if (new->policy == MPOL_BIND) {
1435 		int sz = ksize(old->v.zonelist);
1436 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1437 		if (!new->v.zonelist) {
1438 			kmem_cache_free(policy_cache, new);
1439 			return ERR_PTR(-ENOMEM);
1440 		}
1441 	}
1442 	return new;
1443 }
1444 
1445 /* Slow path of a mempolicy comparison */
1446 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1447 {
1448 	if (!a || !b)
1449 		return 0;
1450 	if (a->policy != b->policy)
1451 		return 0;
1452 	switch (a->policy) {
1453 	case MPOL_DEFAULT:
1454 		return 1;
1455 	case MPOL_INTERLEAVE:
1456 		return nodes_equal(a->v.nodes, b->v.nodes);
1457 	case MPOL_PREFERRED:
1458 		return a->v.preferred_node == b->v.preferred_node;
1459 	case MPOL_BIND: {
1460 		int i;
1461 		for (i = 0; a->v.zonelist->zones[i]; i++)
1462 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1463 				return 0;
1464 		return b->v.zonelist->zones[i] == NULL;
1465 	}
1466 	default:
1467 		BUG();
1468 		return 0;
1469 	}
1470 }
1471 
1472 /* Slow path of a mpol destructor. */
1473 void __mpol_free(struct mempolicy *p)
1474 {
1475 	if (!atomic_dec_and_test(&p->refcnt))
1476 		return;
1477 	if (p->policy == MPOL_BIND)
1478 		kfree(p->v.zonelist);
1479 	p->policy = MPOL_DEFAULT;
1480 	kmem_cache_free(policy_cache, p);
1481 }
1482 
1483 /*
1484  * Shared memory backing store policy support.
1485  *
1486  * Remember policies even when nobody has shared memory mapped.
1487  * The policies are kept in Red-Black tree linked from the inode.
1488  * They are protected by the sp->lock spinlock, which should be held
1489  * for any accesses to the tree.
1490  */
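
/*
 * Illustrative example of how shared_policy_replace() below splits
 * ranges (offsets are page indices into the object; A and B are
 * placeholder policies):
 *
 *	existing tree:	[0, 100) -> A
 *	insert:		[40, 60) -> B
 *	result:		[0, 40) -> A,  [40, 60) -> B,  [60, 100) -> A
 *
 * The old node is truncated to [0, 40), a copy of A covering [60, 100)
 * is created with sp_alloc(), and the new node is inserted in between.
 */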
1491 
1492 /* lookup first element intersecting start-end */
1493 /* Caller holds sp->lock */
1494 static struct sp_node *
1495 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1496 {
1497 	struct rb_node *n = sp->root.rb_node;
1498 
1499 	while (n) {
1500 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1501 
1502 		if (start >= p->end)
1503 			n = n->rb_right;
1504 		else if (end <= p->start)
1505 			n = n->rb_left;
1506 		else
1507 			break;
1508 	}
1509 	if (!n)
1510 		return NULL;
1511 	for (;;) {
1512 		struct sp_node *w = NULL;
1513 		struct rb_node *prev = rb_prev(n);
1514 		if (!prev)
1515 			break;
1516 		w = rb_entry(prev, struct sp_node, nd);
1517 		if (w->end <= start)
1518 			break;
1519 		n = prev;
1520 	}
1521 	return rb_entry(n, struct sp_node, nd);
1522 }
1523 
1524 /* Insert a new shared policy into the list. */
1525 /* Caller holds sp->lock */
1526 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1527 {
1528 	struct rb_node **p = &sp->root.rb_node;
1529 	struct rb_node *parent = NULL;
1530 	struct sp_node *nd;
1531 
1532 	while (*p) {
1533 		parent = *p;
1534 		nd = rb_entry(parent, struct sp_node, nd);
1535 		if (new->start < nd->start)
1536 			p = &(*p)->rb_left;
1537 		else if (new->end > nd->end)
1538 			p = &(*p)->rb_right;
1539 		else
1540 			BUG();
1541 	}
1542 	rb_link_node(&new->nd, parent, p);
1543 	rb_insert_color(&new->nd, &sp->root);
1544 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1545 		 new->policy ? new->policy->policy : 0);
1546 }
1547 
1548 /* Find shared policy intersecting idx */
1549 struct mempolicy *
1550 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1551 {
1552 	struct mempolicy *pol = NULL;
1553 	struct sp_node *sn;
1554 
1555 	if (!sp->root.rb_node)
1556 		return NULL;
1557 	spin_lock(&sp->lock);
1558 	sn = sp_lookup(sp, idx, idx+1);
1559 	if (sn) {
1560 		mpol_get(sn->policy);
1561 		pol = sn->policy;
1562 	}
1563 	spin_unlock(&sp->lock);
1564 	return pol;
1565 }
1566 
1567 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1568 {
1569 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1570 	rb_erase(&n->nd, &sp->root);
1571 	mpol_free(n->policy);
1572 	kmem_cache_free(sn_cache, n);
1573 }
1574 
1575 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1576 				struct mempolicy *pol)
1577 {
1578 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1579 
1580 	if (!n)
1581 		return NULL;
1582 	n->start = start;
1583 	n->end = end;
1584 	mpol_get(pol);
1585 	n->policy = pol;
1586 	return n;
1587 }
1588 
1589 /* Replace a policy range. */
1590 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1591 				 unsigned long end, struct sp_node *new)
1592 {
1593 	struct sp_node *n, *new2 = NULL;
1594 
1595 restart:
1596 	spin_lock(&sp->lock);
1597 	n = sp_lookup(sp, start, end);
1598 	/* Take care of old policies in the same range. */
1599 	while (n && n->start < end) {
1600 		struct rb_node *next = rb_next(&n->nd);
1601 		if (n->start >= start) {
1602 			if (n->end <= end)
1603 				sp_delete(sp, n);
1604 			else
1605 				n->start = end;
1606 		} else {
1607 			/* Old policy spanning whole new range. */
1608 			if (n->end > end) {
1609 				if (!new2) {
1610 					spin_unlock(&sp->lock);
1611 					new2 = sp_alloc(end, n->end, n->policy);
1612 					if (!new2)
1613 						return -ENOMEM;
1614 					goto restart;
1615 				}
1616 				n->end = start;
1617 				sp_insert(sp, new2);
1618 				new2 = NULL;
1619 				break;
1620 			} else
1621 				n->end = start;
1622 		}
1623 		if (!next)
1624 			break;
1625 		n = rb_entry(next, struct sp_node, nd);
1626 	}
1627 	if (new)
1628 		sp_insert(sp, new);
1629 	spin_unlock(&sp->lock);
1630 	if (new2) {
1631 		mpol_free(new2->policy);
1632 		kmem_cache_free(sn_cache, new2);
1633 	}
1634 	return 0;
1635 }
1636 
1637 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1638 				nodemask_t *policy_nodes)
1639 {
1640 	info->root = RB_ROOT;
1641 	spin_lock_init(&info->lock);
1642 
1643 	if (policy != MPOL_DEFAULT) {
1644 		struct mempolicy *newpol;
1645 
1646 		/* Falls back to MPOL_DEFAULT on any error */
1647 		newpol = mpol_new(policy, policy_nodes);
1648 		if (!IS_ERR(newpol)) {
1649 			/* Create pseudo-vma that contains just the policy */
1650 			struct vm_area_struct pvma;
1651 
1652 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1653 			/* Policy covers entire file */
1654 			pvma.vm_end = TASK_SIZE;
1655 			mpol_set_shared_policy(info, &pvma, newpol);
1656 			mpol_free(newpol);
1657 		}
1658 	}
1659 }
1660 
1661 int mpol_set_shared_policy(struct shared_policy *info,
1662 			struct vm_area_struct *vma, struct mempolicy *npol)
1663 {
1664 	int err;
1665 	struct sp_node *new = NULL;
1666 	unsigned long sz = vma_pages(vma);
1667 
1668 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1669 		 vma->vm_pgoff,
1670 		 sz, npol? npol->policy : -1,
1671 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1672 
1673 	if (npol) {
1674 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1675 		if (!new)
1676 			return -ENOMEM;
1677 	}
1678 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1679 	if (err && new)
1680 		kmem_cache_free(sn_cache, new);
1681 	return err;
1682 }
1683 
1684 /* Free a backing policy store on inode delete. */
1685 void mpol_free_shared_policy(struct shared_policy *p)
1686 {
1687 	struct sp_node *n;
1688 	struct rb_node *next;
1689 
1690 	if (!p->root.rb_node)
1691 		return;
1692 	spin_lock(&p->lock);
1693 	next = rb_first(&p->root);
1694 	while (next) {
1695 		n = rb_entry(next, struct sp_node, nd);
1696 		next = rb_next(&n->nd);
1697 		rb_erase(&n->nd, &p->root);
1698 		mpol_free(n->policy);
1699 		kmem_cache_free(sn_cache, n);
1700 	}
1701 	spin_unlock(&p->lock);
1702 }
1703 
1704 /* assumes fs == KERNEL_DS */
1705 void __init numa_policy_init(void)
1706 {
1707 	nodemask_t interleave_nodes;
1708 	unsigned long largest = 0;
1709 	int nid, prefer = 0;
1710 
1711 	policy_cache = kmem_cache_create("numa_policy",
1712 					 sizeof(struct mempolicy),
1713 					 0, SLAB_PANIC, NULL);
1714 
1715 	sn_cache = kmem_cache_create("shared_policy_node",
1716 				     sizeof(struct sp_node),
1717 				     0, SLAB_PANIC, NULL);
1718 
1719 	/*
1720 	 * Set interleaving policy for system init. Interleaving is only
1721 	 * enabled across suitably sized nodes (default is >= 16MB), falling
1722 	 * back to the largest node if they're all smaller.
1723 	 */
1724 	nodes_clear(interleave_nodes);
1725 	for_each_node_state(nid, N_HIGH_MEMORY) {
1726 		unsigned long total_pages = node_present_pages(nid);
1727 
1728 		/* Preserve the largest node */
1729 		if (largest < total_pages) {
1730 			largest = total_pages;
1731 			prefer = nid;
1732 		}
1733 
1734 		/* Interleave this node? */
1735 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1736 			node_set(nid, interleave_nodes);
1737 	}
1738 
1739 	/* All too small, use the largest */
1740 	if (unlikely(nodes_empty(interleave_nodes)))
1741 		node_set(prefer, interleave_nodes);
1742 
1743 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1744 		printk("numa_policy_init: interleaving failed\n");
1745 }
1746 
1747 /* Reset policy of current process to default */
1748 void numa_default_policy(void)
1749 {
1750 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1751 }
1752 
1753 /* Migrate a policy to a different set of nodes */
1754 static void mpol_rebind_policy(struct mempolicy *pol,
1755 			       const nodemask_t *newmask)
1756 {
1757 	nodemask_t *mpolmask;
1758 	nodemask_t tmp;
1759 
1760 	if (!pol)
1761 		return;
1762 	mpolmask = &pol->cpuset_mems_allowed;
1763 	if (nodes_equal(*mpolmask, *newmask))
1764 		return;
1765 
1766 	switch (pol->policy) {
1767 	case MPOL_DEFAULT:
1768 		break;
1769 	case MPOL_INTERLEAVE:
1770 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1771 		pol->v.nodes = tmp;
1772 		*mpolmask = *newmask;
1773 		current->il_next = node_remap(current->il_next,
1774 						*mpolmask, *newmask);
1775 		break;
1776 	case MPOL_PREFERRED:
1777 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1778 						*mpolmask, *newmask);
1779 		*mpolmask = *newmask;
1780 		break;
1781 	case MPOL_BIND: {
1782 		nodemask_t nodes;
1783 		struct zone **z;
1784 		struct zonelist *zonelist;
1785 
1786 		nodes_clear(nodes);
1787 		for (z = pol->v.zonelist->zones; *z; z++)
1788 			node_set(zone_to_nid(*z), nodes);
1789 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1790 		nodes = tmp;
1791 
1792 		zonelist = bind_zonelist(&nodes);
1793 
1794 		/* If bind_zonelist() fails (no memory), we keep the old zonelist.
1795 		 * If that old zonelist has no remaining mems_allowed nodes,
1796 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1797 		 */
1798 
1799 		if (!IS_ERR(zonelist)) {
1800 			/* Good - got mem - substitute new zonelist */
1801 			kfree(pol->v.zonelist);
1802 			pol->v.zonelist = zonelist;
1803 		}
1804 		*mpolmask = *newmask;
1805 		break;
1806 	}
1807 	default:
1808 		BUG();
1809 		break;
1810 	}
1811 }
1812 
1813 /*
1814  * Wrapper for mpol_rebind_policy() that just requires task
1815  * pointer, and updates task mempolicy.
1816  */
1817 
1818 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1819 {
1820 	mpol_rebind_policy(tsk->mempolicy, new);
1821 }
1822 
1823 /*
1824  * Rebind each vma in mm to new nodemask.
1825  *
1826  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1827  */
1828 
1829 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1830 {
1831 	struct vm_area_struct *vma;
1832 
1833 	down_write(&mm->mmap_sem);
1834 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1835 		mpol_rebind_policy(vma->vm_policy, new);
1836 	up_write(&mm->mmap_sem);
1837 }
1838 
1839 /*
1840  * Display pages allocated per node and memory policy via /proc.
1841  */
1842 
1843 static const char * const policy_types[] =
1844 	{ "default", "prefer", "bind", "interleave" };
1845 
1846 /*
1847  * Convert a mempolicy into a string.
1848  * Returns the number of characters in buffer (if positive)
1849  * or an error (negative)
1850  */
1851 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1852 {
1853 	char *p = buffer;
1854 	int l;
1855 	nodemask_t nodes;
1856 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1857 
1858 	switch (mode) {
1859 	case MPOL_DEFAULT:
1860 		nodes_clear(nodes);
1861 		break;
1862 
1863 	case MPOL_PREFERRED:
1864 		nodes_clear(nodes);
1865 		node_set(pol->v.preferred_node, nodes);
1866 		break;
1867 
1868 	case MPOL_BIND:
1869 		get_zonemask(pol, &nodes);
1870 		break;
1871 
1872 	case MPOL_INTERLEAVE:
1873 		nodes = pol->v.nodes;
1874 		break;
1875 
1876 	default:
1877 		BUG();
1878 		return -EFAULT;
1879 	}
1880 
1881 	l = strlen(policy_types[mode]);
1882  	if (buffer + maxlen < p + l + 1)
1883  		return -ENOSPC;
1884 
1885 	strcpy(p, policy_types[mode]);
1886 	p += l;
1887 
1888 	if (!nodes_empty(nodes)) {
1889 		if (buffer + maxlen < p + 2)
1890 			return -ENOSPC;
1891 		*p++ = '=';
1892 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1893 	}
1894 	return p - buffer;
1895 }
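
/*
 * Example strings produced by mpol_to_str() above, as they appear in
 * /proc/<pid>/numa_maps (the nodelist part comes from
 * nodelist_scnprintf()):
 *
 *	"default"
 *	"prefer=1"
 *	"bind=0,2"
 *	"interleave=0-3"
 */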
1896 
1897 struct numa_maps {
1898 	unsigned long pages;
1899 	unsigned long anon;
1900 	unsigned long active;
1901 	unsigned long writeback;
1902 	unsigned long mapcount_max;
1903 	unsigned long dirty;
1904 	unsigned long swapcache;
1905 	unsigned long node[MAX_NUMNODES];
1906 };
1907 
1908 static void gather_stats(struct page *page, void *private, int pte_dirty)
1909 {
1910 	struct numa_maps *md = private;
1911 	int count = page_mapcount(page);
1912 
1913 	md->pages++;
1914 	if (pte_dirty || PageDirty(page))
1915 		md->dirty++;
1916 
1917 	if (PageSwapCache(page))
1918 		md->swapcache++;
1919 
1920 	if (PageActive(page))
1921 		md->active++;
1922 
1923 	if (PageWriteback(page))
1924 		md->writeback++;
1925 
1926 	if (PageAnon(page))
1927 		md->anon++;
1928 
1929 	if (count > md->mapcount_max)
1930 		md->mapcount_max = count;
1931 
1932 	md->node[page_to_nid(page)]++;
1933 }
1934 
1935 #ifdef CONFIG_HUGETLB_PAGE
1936 static void check_huge_range(struct vm_area_struct *vma,
1937 		unsigned long start, unsigned long end,
1938 		struct numa_maps *md)
1939 {
1940 	unsigned long addr;
1941 	struct page *page;
1942 
1943 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1944 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1945 		pte_t pte;
1946 
1947 		if (!ptep)
1948 			continue;
1949 
1950 		pte = *ptep;
1951 		if (pte_none(pte))
1952 			continue;
1953 
1954 		page = pte_page(pte);
1955 		if (!page)
1956 			continue;
1957 
1958 		gather_stats(page, md, pte_dirty(*ptep));
1959 	}
1960 }
1961 #else
1962 static inline void check_huge_range(struct vm_area_struct *vma,
1963 		unsigned long start, unsigned long end,
1964 		struct numa_maps *md)
1965 {
1966 }
1967 #endif
1968 
1969 int show_numa_map(struct seq_file *m, void *v)
1970 {
1971 	struct proc_maps_private *priv = m->private;
1972 	struct vm_area_struct *vma = v;
1973 	struct numa_maps *md;
1974 	struct file *file = vma->vm_file;
1975 	struct mm_struct *mm = vma->vm_mm;
1976 	struct mempolicy *pol;
1977 	int n;
1978 	char buffer[50];
1979 
1980 	if (!mm)
1981 		return 0;
1982 
1983 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1984 	if (!md)
1985 		return 0;
1986 
1987 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1988 	mpol_to_str(buffer, sizeof(buffer), pol);
1989 	/*
1990 	 * unref shared or other task's mempolicy
1991 	 */
1992 	if (pol != &default_policy && pol != current->mempolicy)
1993 		__mpol_free(pol);
1994 
1995 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1996 
1997 	if (file) {
1998 		seq_printf(m, " file=");
1999 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
2000 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2001 		seq_printf(m, " heap");
2002 	} else if (vma->vm_start <= mm->start_stack &&
2003 			vma->vm_end >= mm->start_stack) {
2004 		seq_printf(m, " stack");
2005 	}
2006 
2007 	if (is_vm_hugetlb_page(vma)) {
2008 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2009 		seq_printf(m, " huge");
2010 	} else {
2011 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2012 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2013 	}
2014 
2015 	if (!md->pages)
2016 		goto out;
2017 
2018 	if (md->anon)
2019 		seq_printf(m," anon=%lu",md->anon);
2020 
2021 	if (md->dirty)
2022 		seq_printf(m," dirty=%lu",md->dirty);
2023 
2024 	if (md->pages != md->anon && md->pages != md->dirty)
2025 		seq_printf(m, " mapped=%lu", md->pages);
2026 
2027 	if (md->mapcount_max > 1)
2028 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2029 
2030 	if (md->swapcache)
2031 		seq_printf(m," swapcache=%lu", md->swapcache);
2032 
2033 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2034 		seq_printf(m," active=%lu", md->active);
2035 
2036 	if (md->writeback)
2037 		seq_printf(m," writeback=%lu", md->writeback);
2038 
2039 	for_each_node_state(n, N_HIGH_MEMORY)
2040 		if (md->node[n])
2041 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2042 out:
2043 	seq_putc(m, '\n');
2044 	kfree(md);
2045 
2046 	if (m->count < m->size)
2047 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2048 	return 0;
2049 }
2050