xref: /linux/mm/mempolicy.c (revision efad798b9f01300565f65058b153250cc49d58f2)
1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints about which node(s) memory should
9  * be allocated on.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
19  *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                and proceeding to the last. It would be better if bind truly
26  *                restricted the allocation to the specified memory nodes instead.
27  *
28  * preferred      Try a specific node first before normal fallback.
29  *                As a special case, node -1 here means do the allocation
30  *                on the local CPU's node. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non-default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *                in a NUMA-aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non-interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use the default policy. This implies that
49  * on systems with highmem, kernel lowmem allocations don't get policied.
50  * The same goes for GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has the memory mapped.
54  */
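
/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * above are typically requested, assuming the raw syscall wrappers from
 * libnuma's <numaif.h>.  Headers and error handling are omitted; 0x3 is the
 * node mask naming nodes 0 and 1, 0x1 names node 0 only.
 *
 *	unsigned long nodes = 0x3, node0 = 0x1;
 *	void *p;
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *	p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, 1 << 20, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *
 * The first call sets the process policy, mbind() sets a VMA policy on the
 * new mapping, and the last call restores the default local-node policy.
 */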
55 
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger OOM much faster and the
65    kernel is not always graceful about that.
66    could replace all the switch()es with a mempolicy_ops structure.
67 */
68 
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/nodemask.h>
76 #include <linux/cpuset.h>
77 #include <linux/gfp.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/module.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/rmap.h>
90 #include <linux/security.h>
91 #include <linux/syscalls.h>
92 
93 #include <asm/tlbflush.h>
94 #include <asm/uaccess.h>
95 
96 /* Internal flags */
97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100 
101 static struct kmem_cache *policy_cache;
102 static struct kmem_cache *sn_cache;
103 
104 /* Highest zone. A specific allocation for a zone below that is not
105    policied. */
106 enum zone_type policy_zone = 0;
107 
108 struct mempolicy default_policy = {
109 	.refcnt = ATOMIC_INIT(1), /* never free it */
110 	.policy = MPOL_DEFAULT,
111 };
112 
113 static void mpol_rebind_policy(struct mempolicy *pol,
114                                const nodemask_t *newmask);
115 
116 /* Do sanity checking on a policy */
117 static int mpol_check_policy(int mode, nodemask_t *nodes)
118 {
119 	int empty = nodes_empty(*nodes);
120 
121 	switch (mode) {
122 	case MPOL_DEFAULT:
123 		if (!empty)
124 			return -EINVAL;
125 		break;
126 	case MPOL_BIND:
127 	case MPOL_INTERLEAVE:
128 		/* Preferred will only use the first bit, but allow
129 		   more for now. */
130 		if (empty)
131 			return -EINVAL;
132 		break;
133 	}
134  	return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
135 }
136 
137 /* Generate a custom zonelist for the BIND policy. */
138 static struct zonelist *bind_zonelist(nodemask_t *nodes)
139 {
140 	struct zonelist *zl;
141 	int num, max, nd;
142 	enum zone_type k;
143 
144 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
145 	max++;			/* space for zlcache_ptr (see mmzone.h) */
146 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
147 	if (!zl)
148 		return ERR_PTR(-ENOMEM);
149 	zl->zlcache_ptr = NULL;
150 	num = 0;
151 	/* First put in the highest zones from all nodes, then all the next
152 	   lower zones etc. Avoid empty zones because the memory allocator
153 	   doesn't like them. If you implement node hot removal you
154 	   have to fix that. */
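	/*
	 * Illustrative example (hypothetical two-node machine with only
	 * ZONE_NORMAL and ZONE_DMA populated): for nodes = {0,1} the loop
	 * below produces
	 *
	 *	zones[] = { node0/NORMAL, node1/NORMAL, node0/DMA, node1/DMA }
	 *
	 * followed by the NULL terminator, so the allocator tries the higher
	 * zones on every allowed node before falling back to lower zones.
	 */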
155 	k = MAX_NR_ZONES - 1;
156 	while (1) {
157 		for_each_node_mask(nd, *nodes) {
158 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
159 			if (z->present_pages > 0)
160 				zl->zones[num++] = z;
161 		}
162 		if (k == 0)
163 			break;
164 		k--;
165 	}
166 	if (num == 0) {
167 		kfree(zl);
168 		return ERR_PTR(-EINVAL);
169 	}
170 	zl->zones[num] = NULL;
171 	return zl;
172 }
173 
174 /* Create a new policy */
175 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 {
177 	struct mempolicy *policy;
178 
179 	pr_debug("setting mode %d nodes[0] %lx\n",
180 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
181 
182 	if (mode == MPOL_DEFAULT)
183 		return NULL;
184 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
185 	if (!policy)
186 		return ERR_PTR(-ENOMEM);
187 	atomic_set(&policy->refcnt, 1);
188 	switch (mode) {
189 	case MPOL_INTERLEAVE:
190 		policy->v.nodes = *nodes;
191 		nodes_and(policy->v.nodes, policy->v.nodes,
192 					node_states[N_HIGH_MEMORY]);
193 		if (nodes_weight(policy->v.nodes) == 0) {
194 			kmem_cache_free(policy_cache, policy);
195 			return ERR_PTR(-EINVAL);
196 		}
197 		break;
198 	case MPOL_PREFERRED:
199 		policy->v.preferred_node = first_node(*nodes);
200 		if (policy->v.preferred_node >= MAX_NUMNODES)
201 			policy->v.preferred_node = -1;
202 		break;
203 	case MPOL_BIND:
204 		policy->v.zonelist = bind_zonelist(nodes);
205 		if (IS_ERR(policy->v.zonelist)) {
206 			void *error_code = policy->v.zonelist;
207 			kmem_cache_free(policy_cache, policy);
208 			return error_code;
209 		}
210 		break;
211 	}
212 	policy->policy = mode;
213 	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
214 	return policy;
215 }
216 
217 static void gather_stats(struct page *, void *, int pte_dirty);
218 static void migrate_page_add(struct page *page, struct list_head *pagelist,
219 				unsigned long flags);
220 
221 /* Scan through pages, checking whether they satisfy the given conditions. */
222 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
223 		unsigned long addr, unsigned long end,
224 		const nodemask_t *nodes, unsigned long flags,
225 		void *private)
226 {
227 	pte_t *orig_pte;
228 	pte_t *pte;
229 	spinlock_t *ptl;
230 
231 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
232 	do {
233 		struct page *page;
234 		int nid;
235 
236 		if (!pte_present(*pte))
237 			continue;
238 		page = vm_normal_page(vma, addr, *pte);
239 		if (!page)
240 			continue;
241 		/*
242 		 * The check for PageReserved here is important to avoid
243 		 * handling zero pages and other pages that may have been
244 		 * marked special by the system.
245 		 *
246 		 * If PageReserved were not checked here then, e.g., the
247 		 * location of the zero page could have an influence
248 		 * on MPOL_MF_STRICT, zero pages would be counted in
249 		 * the per-node stats, and there would be useless attempts
250 		 * to put zero pages on the migration list.
251 		 */
252 		if (PageReserved(page))
253 			continue;
254 		nid = page_to_nid(page);
255 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
256 			continue;
257 
258 		if (flags & MPOL_MF_STATS)
259 			gather_stats(page, private, pte_dirty(*pte));
260 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
261 			migrate_page_add(page, private, flags);
262 		else
263 			break;
264 	} while (pte++, addr += PAGE_SIZE, addr != end);
265 	pte_unmap_unlock(orig_pte, ptl);
266 	return addr != end;
267 }
268 
269 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
270 		unsigned long addr, unsigned long end,
271 		const nodemask_t *nodes, unsigned long flags,
272 		void *private)
273 {
274 	pmd_t *pmd;
275 	unsigned long next;
276 
277 	pmd = pmd_offset(pud, addr);
278 	do {
279 		next = pmd_addr_end(addr, end);
280 		if (pmd_none_or_clear_bad(pmd))
281 			continue;
282 		if (check_pte_range(vma, pmd, addr, next, nodes,
283 				    flags, private))
284 			return -EIO;
285 	} while (pmd++, addr = next, addr != end);
286 	return 0;
287 }
288 
289 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
290 		unsigned long addr, unsigned long end,
291 		const nodemask_t *nodes, unsigned long flags,
292 		void *private)
293 {
294 	pud_t *pud;
295 	unsigned long next;
296 
297 	pud = pud_offset(pgd, addr);
298 	do {
299 		next = pud_addr_end(addr, end);
300 		if (pud_none_or_clear_bad(pud))
301 			continue;
302 		if (check_pmd_range(vma, pud, addr, next, nodes,
303 				    flags, private))
304 			return -EIO;
305 	} while (pud++, addr = next, addr != end);
306 	return 0;
307 }
308 
309 static inline int check_pgd_range(struct vm_area_struct *vma,
310 		unsigned long addr, unsigned long end,
311 		const nodemask_t *nodes, unsigned long flags,
312 		void *private)
313 {
314 	pgd_t *pgd;
315 	unsigned long next;
316 
317 	pgd = pgd_offset(vma->vm_mm, addr);
318 	do {
319 		next = pgd_addr_end(addr, end);
320 		if (pgd_none_or_clear_bad(pgd))
321 			continue;
322 		if (check_pud_range(vma, pgd, addr, next, nodes,
323 				    flags, private))
324 			return -EIO;
325 	} while (pgd++, addr = next, addr != end);
326 	return 0;
327 }
328 
329 /*
330  * Check if all pages in a range are on a set of nodes.
331  * If pagelist != NULL then isolate pages from the LRU and
332  * put them on the pagelist.
333  */
334 static struct vm_area_struct *
335 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
336 		const nodemask_t *nodes, unsigned long flags, void *private)
337 {
338 	int err;
339 	struct vm_area_struct *first, *vma, *prev;
340 
341 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
342 
343 		err = migrate_prep();
344 		if (err)
345 			return ERR_PTR(err);
346 	}
347 
348 	first = find_vma(mm, start);
349 	if (!first)
350 		return ERR_PTR(-EFAULT);
351 	prev = NULL;
352 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
353 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
354 			if (!vma->vm_next && vma->vm_end < end)
355 				return ERR_PTR(-EFAULT);
356 			if (prev && prev->vm_end < vma->vm_start)
357 				return ERR_PTR(-EFAULT);
358 		}
359 		if (!is_vm_hugetlb_page(vma) &&
360 		    ((flags & MPOL_MF_STRICT) ||
361 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
362 				vma_migratable(vma)))) {
363 			unsigned long endvma = vma->vm_end;
364 
365 			if (endvma > end)
366 				endvma = end;
367 			if (vma->vm_start > start)
368 				start = vma->vm_start;
369 			err = check_pgd_range(vma, start, endvma, nodes,
370 						flags, private);
371 			if (err) {
372 				first = ERR_PTR(err);
373 				break;
374 			}
375 		}
376 		prev = vma;
377 	}
378 	return first;
379 }
380 
381 /* Apply policy to a single VMA */
382 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
383 {
384 	int err = 0;
385 	struct mempolicy *old = vma->vm_policy;
386 
387 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
388 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
389 		 vma->vm_ops, vma->vm_file,
390 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
391 
392 	if (vma->vm_ops && vma->vm_ops->set_policy)
393 		err = vma->vm_ops->set_policy(vma, new);
394 	if (!err) {
395 		mpol_get(new);
396 		vma->vm_policy = new;
397 		mpol_free(old);
398 	}
399 	return err;
400 }
401 
402 /* Step 2: apply policy to a range and do splits. */
403 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
404 		       unsigned long end, struct mempolicy *new)
405 {
406 	struct vm_area_struct *next;
407 	int err;
408 
409 	err = 0;
410 	for (; vma && vma->vm_start < end; vma = next) {
411 		next = vma->vm_next;
412 		if (vma->vm_start < start)
413 			err = split_vma(vma->vm_mm, vma, start, 1);
414 		if (!err && vma->vm_end > end)
415 			err = split_vma(vma->vm_mm, vma, end, 0);
416 		if (!err)
417 			err = policy_vma(vma, new);
418 		if (err)
419 			break;
420 	}
421 	return err;
422 }
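
/*
 * Illustrative example of the splits above (hypothetical addresses): if an
 * existing VMA spans 0x1000-0x5000 and mbind() is asked for 0x2000-0x4000,
 * the first split_vma() detaches 0x1000-0x2000, the second detaches
 * 0x4000-0x5000, and policy_vma() then installs the new policy on the
 * remaining 0x2000-0x4000 VMA only; the outer pieces keep their old policy.
 */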
423 
424 static int contextualize_policy(int mode, nodemask_t *nodes)
425 {
426 	if (!nodes)
427 		return 0;
428 
429 	cpuset_update_task_memory_state();
430 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
431 		return -EINVAL;
432 	return mpol_check_policy(mode, nodes);
433 }
434 
435 
436 /*
437  * Update task->flags PF_MEMPOLICY bit: set iff non-default
438  * mempolicy.  Allows more rapid checking of this (combined perhaps
439  * with other PF_* flag bits) on memory allocation hot code paths.
440  *
441  * If called from outside this file, the task 'p' should -only- be
442  * a newly forked child not yet visible on the task list, because
443  * manipulating the task flags of a visible task is not safe.
444  *
445  * The above limitation is why this routine has the funny name
446  * mpol_fix_fork_child_flag().
447  *
448  * It is also safe to call this with a task pointer of current,
449  * which the static wrapper mpol_set_task_struct_flag() does,
450  * for use within this file.
451  */
452 
453 void mpol_fix_fork_child_flag(struct task_struct *p)
454 {
455 	if (p->mempolicy)
456 		p->flags |= PF_MEMPOLICY;
457 	else
458 		p->flags &= ~PF_MEMPOLICY;
459 }
460 
461 static void mpol_set_task_struct_flag(void)
462 {
463 	mpol_fix_fork_child_flag(current);
464 }
465 
466 /* Set the process memory policy */
467 static long do_set_mempolicy(int mode, nodemask_t *nodes)
468 {
469 	struct mempolicy *new;
470 
471 	if (contextualize_policy(mode, nodes))
472 		return -EINVAL;
473 	new = mpol_new(mode, nodes);
474 	if (IS_ERR(new))
475 		return PTR_ERR(new);
476 	mpol_free(current->mempolicy);
477 	current->mempolicy = new;
478 	mpol_set_task_struct_flag();
479 	if (new && new->policy == MPOL_INTERLEAVE)
480 		current->il_next = first_node(new->v.nodes);
481 	return 0;
482 }
483 
484 /* Fill a zone bitmap for a policy */
485 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
486 {
487 	int i;
488 
489 	nodes_clear(*nodes);
490 	switch (p->policy) {
491 	case MPOL_BIND:
492 		for (i = 0; p->v.zonelist->zones[i]; i++)
493 			node_set(zone_to_nid(p->v.zonelist->zones[i]),
494 				*nodes);
495 		break;
496 	case MPOL_DEFAULT:
497 		break;
498 	case MPOL_INTERLEAVE:
499 		*nodes = p->v.nodes;
500 		break;
501 	case MPOL_PREFERRED:
502 		/* or use current node instead of memory_map? */
503 		if (p->v.preferred_node < 0)
504 			*nodes = node_states[N_HIGH_MEMORY];
505 		else
506 			node_set(p->v.preferred_node, *nodes);
507 		break;
508 	default:
509 		BUG();
510 	}
511 }
512 
513 static int lookup_node(struct mm_struct *mm, unsigned long addr)
514 {
515 	struct page *p;
516 	int err;
517 
518 	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
519 	if (err >= 0) {
520 		err = page_to_nid(p);
521 		put_page(p);
522 	}
523 	return err;
524 }
525 
526 /* Retrieve NUMA policy */
527 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
528 			     unsigned long addr, unsigned long flags)
529 {
530 	int err;
531 	struct mm_struct *mm = current->mm;
532 	struct vm_area_struct *vma = NULL;
533 	struct mempolicy *pol = current->mempolicy;
534 
535 	cpuset_update_task_memory_state();
536 	if (flags &
537 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
538 		return -EINVAL;
539 
540 	if (flags & MPOL_F_MEMS_ALLOWED) {
541 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
542 			return -EINVAL;
543 		*policy = 0;	/* just so it's initialized */
544 		*nmask  = cpuset_current_mems_allowed;
545 		return 0;
546 	}
547 
548 	if (flags & MPOL_F_ADDR) {
549 		down_read(&mm->mmap_sem);
550 		vma = find_vma_intersection(mm, addr, addr+1);
551 		if (!vma) {
552 			up_read(&mm->mmap_sem);
553 			return -EFAULT;
554 		}
555 		if (vma->vm_ops && vma->vm_ops->get_policy)
556 			pol = vma->vm_ops->get_policy(vma, addr);
557 		else
558 			pol = vma->vm_policy;
559 	} else if (addr)
560 		return -EINVAL;
561 
562 	if (!pol)
563 		pol = &default_policy;
564 
565 	if (flags & MPOL_F_NODE) {
566 		if (flags & MPOL_F_ADDR) {
567 			err = lookup_node(mm, addr);
568 			if (err < 0)
569 				goto out;
570 			*policy = err;
571 		} else if (pol == current->mempolicy &&
572 				pol->policy == MPOL_INTERLEAVE) {
573 			*policy = current->il_next;
574 		} else {
575 			err = -EINVAL;
576 			goto out;
577 		}
578 	} else
579 		*policy = pol->policy;
580 
581 	if (vma) {
582 		up_read(&current->mm->mmap_sem);
583 		vma = NULL;
584 	}
585 
586 	err = 0;
587 	if (nmask)
588 		get_zonemask(pol, nmask);
589 
590  out:
591 	if (vma)
592 		up_read(&current->mm->mmap_sem);
593 	return err;
594 }
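
/*
 * Illustrative userspace sketch (not part of this file), assuming libnuma's
 * <numaif.h> wrapper: asking which node currently backs an address, which
 * exercises the MPOL_F_NODE|MPOL_F_ADDR path above via lookup_node().
 *
 *	int node;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */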
595 
596 #ifdef CONFIG_MIGRATION
597 /*
598  * page migration
599  */
600 static void migrate_page_add(struct page *page, struct list_head *pagelist,
601 				unsigned long flags)
602 {
603 	/*
604 	 * Avoid migrating a page that is shared with others.
605 	 */
606 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
607 		isolate_lru_page(page, pagelist);
608 }
609 
610 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
611 {
612 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
613 }
614 
615 /*
616  * Migrate pages from one node to a target node.
617  * Returns error or the number of pages not migrated.
618  */
619 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
620 			   int flags)
621 {
622 	nodemask_t nmask;
623 	LIST_HEAD(pagelist);
624 	int err = 0;
625 
626 	nodes_clear(nmask);
627 	node_set(source, nmask);
628 
629 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
630 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
631 
632 	if (!list_empty(&pagelist))
633 		err = migrate_pages(&pagelist, new_node_page, dest);
634 
635 	return err;
636 }
637 
638 /*
639  * Move pages between the two nodesets so as to preserve the physical
640  * layout as much as possible.
641  *
642  * Returns the number of pages that could not be moved.
643  */
644 int do_migrate_pages(struct mm_struct *mm,
645 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
646 {
647 	LIST_HEAD(pagelist);
648 	int busy = 0;
649 	int err = 0;
650 	nodemask_t tmp;
651 
652   	down_read(&mm->mmap_sem);
653 
654 	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
655 	if (err)
656 		goto out;
657 
658 /*
659  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
660  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
661  * bit in 'tmp', and return that <source, dest> pair for migration.
662  * The pair of nodemasks 'to' and 'from' define the map.
663  *
664  * If no pair of bits is found that way, fallback to picking some
665  * pair of 'source' and 'dest' bits that are not the same.  If the
666  * 'source' and 'dest' bits are the same, this represents a node
667  * that will be migrating to itself, so no pages need move.
668  *
669  * If no bits are left in 'tmp', or if all remaining bits left
670  * in 'tmp' correspond to the same bit in 'to', return false
671  * (nothing left to migrate).
672  *
673  * This lets us pick a pair of nodes to migrate between, such that
674  * if possible the dest node is not already occupied by some other
675  * source node, minimizing the risk of overloading the memory on a
676  * node that would happen if we migrated incoming memory to a node
677  * before migrating outgoing memory source that same node.
678  *
679  * A single scan of tmp is sufficient.  As we go, we remember the
680  * most recent <s, d> pair that moved (s != d).  If we find a pair
681  * that not only moved, but what's better, moved to an empty slot
682  * (d is not set in tmp), then we break out immediately with that pair.
683  * Otherwise, when we finish scanning tmp, we at least have the
684  * most recent <s, d> pair that moved.  If we get all the way through
685  * the scan of tmp without finding any node that moved, much less
686  * moved to an empty node, then there is nothing left worth migrating.
687  */
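
/*
 * Illustrative example (hypothetical masks): with from_nodes = {0,1} and
 * to_nodes = {1,2}, node_remap() gives 0->1 and 1->2.  The first scan sees
 * <0,1>, but dest 1 is still set in tmp, so it keeps looking and finds <1,2>,
 * whose dest is an empty slot.  Node 1 is therefore drained to node 2 first,
 * and only then is node 0 migrated to node 1, instead of piling node 0's
 * pages onto node 1 while node 1's own pages are still waiting to move out.
 */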
688 
689 	tmp = *from_nodes;
690 	while (!nodes_empty(tmp)) {
691 		int s,d;
692 		int source = -1;
693 		int dest = 0;
694 
695 		for_each_node_mask(s, tmp) {
696 			d = node_remap(s, *from_nodes, *to_nodes);
697 			if (s == d)
698 				continue;
699 
700 			source = s;	/* Node moved. Memorize */
701 			dest = d;
702 
703 			/* dest not in remaining from nodes? */
704 			if (!node_isset(dest, tmp))
705 				break;
706 		}
707 		if (source == -1)
708 			break;
709 
710 		node_clear(source, tmp);
711 		err = migrate_to_node(mm, source, dest, flags);
712 		if (err > 0)
713 			busy += err;
714 		if (err < 0)
715 			break;
716 	}
717 out:
718 	up_read(&mm->mmap_sem);
719 	if (err < 0)
720 		return err;
721 	return busy;
722 
723 }
724 
725 /*
726  * Allocate a new page for page migration based on vma policy.
727  * Start assuming that page is mapped by vma pointed to by @private.
728  * Search forward from there, if not.  N.B., this assumes that the
729  * list of pages handed to migrate_pages()--which is how we get here--
730  * is in virtual address order.
731  */
732 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
733 {
734 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
735 	unsigned long uninitialized_var(address);
736 
737 	while (vma) {
738 		address = page_address_in_vma(page, vma);
739 		if (address != -EFAULT)
740 			break;
741 		vma = vma->vm_next;
742 	}
743 
744 	/*
745 	 * if !vma, alloc_page_vma() will use task or system default policy
746 	 */
747 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
748 }
749 #else
750 
751 static void migrate_page_add(struct page *page, struct list_head *pagelist,
752 				unsigned long flags)
753 {
754 }
755 
756 int do_migrate_pages(struct mm_struct *mm,
757 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
758 {
759 	return -ENOSYS;
760 }
761 
762 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
763 {
764 	return NULL;
765 }
766 #endif
767 
768 static long do_mbind(unsigned long start, unsigned long len,
769 		     unsigned long mode, nodemask_t *nmask,
770 		     unsigned long flags)
771 {
772 	struct vm_area_struct *vma;
773 	struct mm_struct *mm = current->mm;
774 	struct mempolicy *new;
775 	unsigned long end;
776 	int err;
777 	LIST_HEAD(pagelist);
778 
779 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
780 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
781 	    || mode > MPOL_MAX)
782 		return -EINVAL;
783 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
784 		return -EPERM;
785 
786 	if (start & ~PAGE_MASK)
787 		return -EINVAL;
788 
789 	if (mode == MPOL_DEFAULT)
790 		flags &= ~MPOL_MF_STRICT;
791 
792 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
793 	end = start + len;
794 
795 	if (end < start)
796 		return -EINVAL;
797 	if (end == start)
798 		return 0;
799 
800 	if (mpol_check_policy(mode, nmask))
801 		return -EINVAL;
802 
803 	new = mpol_new(mode, nmask);
804 	if (IS_ERR(new))
805 		return PTR_ERR(new);
806 
807 	/*
808 	 * If we are using the default policy then operation
809 	 * on discontinuous address spaces is okay after all
810 	 */
811 	if (!new)
812 		flags |= MPOL_MF_DISCONTIG_OK;
813 
814 	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
815 		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
816 
817 	down_write(&mm->mmap_sem);
818 	vma = check_range(mm, start, end, nmask,
819 			  flags | MPOL_MF_INVERT, &pagelist);
820 
821 	err = PTR_ERR(vma);
822 	if (!IS_ERR(vma)) {
823 		int nr_failed = 0;
824 
825 		err = mbind_range(vma, start, end, new);
826 
827 		if (!list_empty(&pagelist))
828 			nr_failed = migrate_pages(&pagelist, new_vma_page,
829 						(unsigned long)vma);
830 
831 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
832 			err = -EIO;
833 	}
834 
835 	up_write(&mm->mmap_sem);
836 	mpol_free(new);
837 	return err;
838 }
839 
840 /*
841  * User space interface with variable sized bitmaps for nodelists.
842  */
843 
844 /* Copy a node mask from user space. */
845 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
846 		     unsigned long maxnode)
847 {
848 	unsigned long k;
849 	unsigned long nlongs;
850 	unsigned long endmask;
851 
852 	--maxnode;
853 	nodes_clear(*nodes);
854 	if (maxnode == 0 || !nmask)
855 		return 0;
856 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
857 		return -EINVAL;
858 
859 	nlongs = BITS_TO_LONGS(maxnode);
860 	if ((maxnode % BITS_PER_LONG) == 0)
861 		endmask = ~0UL;
862 	else
863 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
864 
865 	/* When the user specifies more nodes than supported, just check
866 	   that the unsupported part is all zero. */
867 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
868 		if (nlongs > PAGE_SIZE/sizeof(long))
869 			return -EINVAL;
870 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
871 			unsigned long t;
872 			if (get_user(t, nmask + k))
873 				return -EFAULT;
874 			if (k == nlongs - 1) {
875 				if (t & endmask)
876 					return -EINVAL;
877 			} else if (t)
878 				return -EINVAL;
879 		}
880 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
881 		endmask = ~0UL;
882 	}
883 
884 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
885 		return -EFAULT;
886 	nodes_addr(*nodes)[nlongs-1] &= endmask;
887 	return 0;
888 }
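
/*
 * Illustrative arithmetic for the above (assuming 64-bit longs): maxnode = 65
 * describes node bits 0..63, so after --maxnode we get nlongs =
 * BITS_TO_LONGS(64) = 1 and endmask = ~0UL, and one full long is copied.
 * With maxnode = 17, nlongs is still 1 but endmask = (1UL << 16) - 1, so only
 * node bits 0..15 survive the final "&= endmask".
 */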
889 
890 /* Copy a kernel node mask to user space */
891 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
892 			      nodemask_t *nodes)
893 {
894 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
895 	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
896 
897 	if (copy > nbytes) {
898 		if (copy > PAGE_SIZE)
899 			return -EINVAL;
900 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
901 			return -EFAULT;
902 		copy = nbytes;
903 	}
904 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
905 }
906 
907 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
908 			unsigned long mode,
909 			unsigned long __user *nmask, unsigned long maxnode,
910 			unsigned flags)
911 {
912 	nodemask_t nodes;
913 	int err;
914 
915 	err = get_nodes(&nodes, nmask, maxnode);
916 	if (err)
917 		return err;
918 #ifdef CONFIG_CPUSETS
919 	/* Restrict the nodes to the allowed nodes in the cpuset */
920 	nodes_and(nodes, nodes, current->mems_allowed);
921 #endif
922 	return do_mbind(start, len, mode, &nodes, flags);
923 }
924 
925 /* Set the process memory policy */
926 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
927 		unsigned long maxnode)
928 {
929 	int err;
930 	nodemask_t nodes;
931 
932 	if (mode < 0 || mode > MPOL_MAX)
933 		return -EINVAL;
934 	err = get_nodes(&nodes, nmask, maxnode);
935 	if (err)
936 		return err;
937 	return do_set_mempolicy(mode, &nodes);
938 }
939 
940 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
941 		const unsigned long __user *old_nodes,
942 		const unsigned long __user *new_nodes)
943 {
944 	struct mm_struct *mm;
945 	struct task_struct *task;
946 	nodemask_t old;
947 	nodemask_t new;
948 	nodemask_t task_nodes;
949 	int err;
950 
951 	err = get_nodes(&old, old_nodes, maxnode);
952 	if (err)
953 		return err;
954 
955 	err = get_nodes(&new, new_nodes, maxnode);
956 	if (err)
957 		return err;
958 
959 	/* Find the mm_struct */
960 	read_lock(&tasklist_lock);
961 	task = pid ? find_task_by_vpid(pid) : current;
962 	if (!task) {
963 		read_unlock(&tasklist_lock);
964 		return -ESRCH;
965 	}
966 	mm = get_task_mm(task);
967 	read_unlock(&tasklist_lock);
968 
969 	if (!mm)
970 		return -EINVAL;
971 
972 	/*
973 	 * Check if this process has the right to modify the specified
974 	 * process. The right exists if the process has administrative
975 	 * capabilities, superuser privileges or the same
976 	 * userid as the target process.
977 	 */
978 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
979 	    (current->uid != task->suid) && (current->uid != task->uid) &&
980 	    !capable(CAP_SYS_NICE)) {
981 		err = -EPERM;
982 		goto out;
983 	}
984 
985 	task_nodes = cpuset_mems_allowed(task);
986 	/* Is the user allowed to access the target nodes? */
987 	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
988 		err = -EPERM;
989 		goto out;
990 	}
991 
992 	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
993 		err = -EINVAL;
994 		goto out;
995 	}
996 
997 	err = security_task_movememory(task);
998 	if (err)
999 		goto out;
1000 
1001 	err = do_migrate_pages(mm, &old, &new,
1002 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1003 out:
1004 	mmput(mm);
1005 	return err;
1006 }
1007 
1008 
1009 /* Retrieve NUMA policy */
1010 asmlinkage long sys_get_mempolicy(int __user *policy,
1011 				unsigned long __user *nmask,
1012 				unsigned long maxnode,
1013 				unsigned long addr, unsigned long flags)
1014 {
1015 	int err;
1016 	int uninitialized_var(pval);
1017 	nodemask_t nodes;
1018 
1019 	if (nmask != NULL && maxnode < MAX_NUMNODES)
1020 		return -EINVAL;
1021 
1022 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1023 
1024 	if (err)
1025 		return err;
1026 
1027 	if (policy && put_user(pval, policy))
1028 		return -EFAULT;
1029 
1030 	if (nmask)
1031 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1032 
1033 	return err;
1034 }
1035 
1036 #ifdef CONFIG_COMPAT
1037 
1038 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1039 				     compat_ulong_t __user *nmask,
1040 				     compat_ulong_t maxnode,
1041 				     compat_ulong_t addr, compat_ulong_t flags)
1042 {
1043 	long err;
1044 	unsigned long __user *nm = NULL;
1045 	unsigned long nr_bits, alloc_size;
1046 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1047 
1048 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1049 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1050 
1051 	if (nmask)
1052 		nm = compat_alloc_user_space(alloc_size);
1053 
1054 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1055 
1056 	if (!err && nmask) {
1057 		err = copy_from_user(bm, nm, alloc_size);
1058 		/* ensure entire bitmap is zeroed */
1059 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1060 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1061 	}
1062 
1063 	return err;
1064 }
1065 
1066 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1067 				     compat_ulong_t maxnode)
1068 {
1069 	long err = 0;
1070 	unsigned long __user *nm = NULL;
1071 	unsigned long nr_bits, alloc_size;
1072 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1073 
1074 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1075 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1076 
1077 	if (nmask) {
1078 		err = compat_get_bitmap(bm, nmask, nr_bits);
1079 		nm = compat_alloc_user_space(alloc_size);
1080 		err |= copy_to_user(nm, bm, alloc_size);
1081 	}
1082 
1083 	if (err)
1084 		return -EFAULT;
1085 
1086 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1087 }
1088 
1089 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1090 			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1091 			     compat_ulong_t maxnode, compat_ulong_t flags)
1092 {
1093 	long err = 0;
1094 	unsigned long __user *nm = NULL;
1095 	unsigned long nr_bits, alloc_size;
1096 	nodemask_t bm;
1097 
1098 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1099 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1100 
1101 	if (nmask) {
1102 		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1103 		nm = compat_alloc_user_space(alloc_size);
1104 		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1105 	}
1106 
1107 	if (err)
1108 		return -EFAULT;
1109 
1110 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1111 }
1112 
1113 #endif
1114 
1115 /*
1116  * get_vma_policy(@task, @vma, @addr)
1117  * @task - task for fallback if vma policy == default
1118  * @vma   - virtual memory area whose policy is sought
1119  * @addr  - address in @vma for shared policy lookup
1120  *
1121  * Returns effective policy for a VMA at specified address.
1122  * Falls back to @task or system default policy, as necessary.
1123  * The returned policy has an extra reference count if it is a shared,
1124  * vma, or some other task's policy [show_numa_maps() can pass
1125  * @task != current].  It is the caller's responsibility to
1126  * free the reference in these cases.
1127  */
1128 static struct mempolicy * get_vma_policy(struct task_struct *task,
1129 		struct vm_area_struct *vma, unsigned long addr)
1130 {
1131 	struct mempolicy *pol = task->mempolicy;
1132 	int shared_pol = 0;
1133 
1134 	if (vma) {
1135 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1136 			pol = vma->vm_ops->get_policy(vma, addr);
1137 			shared_pol = 1;	/* if pol non-NULL, add ref below */
1138 		} else if (vma->vm_policy &&
1139 				vma->vm_policy->policy != MPOL_DEFAULT)
1140 			pol = vma->vm_policy;
1141 	}
1142 	if (!pol)
1143 		pol = &default_policy;
1144 	else if (!shared_pol && pol != current->mempolicy)
1145 		mpol_get(pol);	/* vma or other task's policy */
1146 	return pol;
1147 }
1148 
1149 /* Return a zonelist representing a mempolicy */
1150 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1151 {
1152 	int nd;
1153 
1154 	switch (policy->policy) {
1155 	case MPOL_PREFERRED:
1156 		nd = policy->v.preferred_node;
1157 		if (nd < 0)
1158 			nd = numa_node_id();
1159 		break;
1160 	case MPOL_BIND:
1161 		/* Lower zones don't get a policy applied */
1162 		/* Careful: current->mems_allowed might have moved */
1163 		if (gfp_zone(gfp) >= policy_zone)
1164 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1165 				return policy->v.zonelist;
1166 		/*FALL THROUGH*/
1167 	case MPOL_INTERLEAVE: /* should not happen */
1168 	case MPOL_DEFAULT:
1169 		nd = numa_node_id();
1170 		break;
1171 	default:
1172 		nd = 0;
1173 		BUG();
1174 	}
1175 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1176 }
1177 
1178 /* Do dynamic interleaving for a process */
1179 static unsigned interleave_nodes(struct mempolicy *policy)
1180 {
1181 	unsigned nid, next;
1182 	struct task_struct *me = current;
1183 
1184 	nid = me->il_next;
1185 	next = next_node(nid, policy->v.nodes);
1186 	if (next >= MAX_NUMNODES)
1187 		next = first_node(policy->v.nodes);
1188 	me->il_next = next;
1189 	return nid;
1190 }
1191 
1192 /*
1193  * Depending on the memory policy provide a node from which to allocate the
1194  * next slab entry.
1195  */
1196 unsigned slab_node(struct mempolicy *policy)
1197 {
1198 	int pol = policy ? policy->policy : MPOL_DEFAULT;
1199 
1200 	switch (pol) {
1201 	case MPOL_INTERLEAVE:
1202 		return interleave_nodes(policy);
1203 
1204 	case MPOL_BIND:
1205 		/*
1206 		 * Follow bind policy behavior and start allocation at the
1207 		 * first node.
1208 		 */
1209 		return zone_to_nid(policy->v.zonelist->zones[0]);
1210 
1211 	case MPOL_PREFERRED:
1212 		if (policy->v.preferred_node >= 0)
1213 			return policy->v.preferred_node;
1214 		/* Fall through */
1215 
1216 	default:
1217 		return numa_node_id();
1218 	}
1219 }
1220 
1221 /* Do static interleaving for a VMA with known offset. */
1222 static unsigned offset_il_node(struct mempolicy *pol,
1223 		struct vm_area_struct *vma, unsigned long off)
1224 {
1225 	unsigned nnodes = nodes_weight(pol->v.nodes);
1226 	unsigned target = (unsigned)off % nnodes;
1227 	int c;
1228 	int nid = -1;
1229 
1230 	c = 0;
1231 	do {
1232 		nid = next_node(nid, pol->v.nodes);
1233 		c++;
1234 	} while (c <= target);
1235 	return nid;
1236 }
1237 
1238 /* Determine a node number for interleave */
1239 static inline unsigned interleave_nid(struct mempolicy *pol,
1240 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1241 {
1242 	if (vma) {
1243 		unsigned long off;
1244 
1245 		/*
1246 		 * for small pages, there is no difference between
1247 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1248 		 * for huge pages, since vm_pgoff is in units of small
1249 		 * pages, we need to shift off the always 0 bits to get
1250 		 * a useful offset.
1251 		 */
1252 		BUG_ON(shift < PAGE_SHIFT);
1253 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1254 		off += (addr - vma->vm_start) >> shift;
1255 		return offset_il_node(pol, vma, off);
1256 	} else
1257 		return interleave_nodes(pol);
1258 }
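
/*
 * Illustrative example (hypothetical policy): with pol->v.nodes = {0,2,5},
 * nnodes = 3.  A small page at vma offset 7 (shift == PAGE_SHIFT) gives
 * target = 7 % 3 = 1, and offset_il_node() walks next_node() twice,
 * -1 -> 0 -> 2, so the page lands on node 2.  For hugetlb (shift ==
 * HPAGE_SHIFT) the offset is first scaled down, so consecutive huge pages,
 * not consecutive small pages, rotate through the interleave set.
 */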
1259 
1260 #ifdef CONFIG_HUGETLBFS
1261 /*
1262  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1263  * @vma = virtual memory area whose policy is sought
1264  * @addr = address in @vma for shared policy lookup and interleave policy
1265  * @gfp_flags = for requested zone
1266  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1267  *
1268  * Returns a zonelist suitable for a huge page allocation.
1269  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1270  * If it is also a policy for which get_vma_policy() returns an extra
1271  * reference, we must hold that reference until after allocation.
1272  * In that case, return policy via @mpol so hugetlb allocation can drop
1273  * the reference.  For non-'BIND referenced policies, we can/do drop the
1274  * reference here, so the caller doesn't need to know about the special case
1275  * for default and current task policy.
1276  */
1277 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1278 				gfp_t gfp_flags, struct mempolicy **mpol)
1279 {
1280 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1281 	struct zonelist *zl;
1282 
1283 	*mpol = NULL;		/* probably no unref needed */
1284 	if (pol->policy == MPOL_INTERLEAVE) {
1285 		unsigned nid;
1286 
1287 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1288 		__mpol_free(pol);		/* finished with pol */
1289 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1290 	}
1291 
1292 	zl = zonelist_policy(GFP_HIGHUSER, pol);
1293 	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1294 		if (pol->policy != MPOL_BIND)
1295 			__mpol_free(pol);	/* finished with pol */
1296 		else
1297 			*mpol = pol;	/* unref needed after allocation */
1298 	}
1299 	return zl;
1300 }
1301 #endif
1302 
1303 /* Allocate a page under an interleave policy.
1304    It has its own path because it needs to do special accounting. */
1305 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1306 					unsigned nid)
1307 {
1308 	struct zonelist *zl;
1309 	struct page *page;
1310 
1311 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1312 	page = __alloc_pages(gfp, order, zl);
1313 	if (page && page_zone(page) == zl->zones[0])
1314 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1315 	return page;
1316 }
1317 
1318 /**
1319  * 	alloc_page_vma	- Allocate a page for a VMA.
1320  *
1321  * 	@gfp:
1322  *      %GFP_USER    user allocation.
1323  *      %GFP_KERNEL  kernel allocations,
1324  *      %GFP_HIGHMEM highmem/user allocations,
1325  *      %GFP_FS      allocation should not call back into a file system.
1326  *      %GFP_ATOMIC  don't sleep.
1327  *
1328  * 	@vma:  Pointer to VMA or NULL if not available.
1329  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1330  *
1331  * 	This function allocates a page from the kernel page pool and applies
1332  *	a NUMA policy associated with the VMA or the current process.
1333  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1334  *	mm_struct of the VMA to prevent it from going away. Should be used for
1335  *	all allocations for pages that will be mapped into
1336  * 	user space. Returns NULL when no page can be allocated.
1337  *
1338  *	Should be called with the mmap_sem of the vma's mm_struct held.
1339  */
1340 struct page *
1341 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1342 {
1343 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1344 	struct zonelist *zl;
1345 
1346 	cpuset_update_task_memory_state();
1347 
1348 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1349 		unsigned nid;
1350 
1351 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1352 		return alloc_page_interleave(gfp, 0, nid);
1353 	}
1354 	zl = zonelist_policy(gfp, pol);
1355 	if (pol != &default_policy && pol != current->mempolicy) {
1356 		/*
1357 		 * slow path: ref counted policy -- shared or vma
1358 		 */
1359 		struct page *page =  __alloc_pages(gfp, 0, zl);
1360 		__mpol_free(pol);
1361 		return page;
1362 	}
1363 	/*
1364 	 * fast path:  default or task policy
1365 	 */
1366 	return __alloc_pages(gfp, 0, zl);
1367 }
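
/*
 * Illustrative kernel-side sketch (hypothetical caller, error handling
 * trimmed): a fault path that already holds the mmap_sem for read would
 * typically do
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * so that the new page honours the vma's mempolicy, falling back to the
 * task or system default policy as described above.
 */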
1368 
1369 /**
1370  * 	alloc_pages_current - Allocate pages.
1371  *
1372  *	@gfp:
1373  *		%GFP_USER   user allocation,
1374  *      	%GFP_KERNEL kernel allocation,
1375  *      	%GFP_HIGHMEM highmem allocation,
1376  *      	%GFP_FS     don't call back into a file system.
1377  *      	%GFP_ATOMIC don't sleep.
1378  *	@order: Power of two of allocation size in pages. 0 is a single page.
1379  *
1380  *	Allocate a page from the kernel page pool.  When not in
1381  *	interrupt context, apply the current process' NUMA policy.
1382  *	Returns NULL when no page can be allocated.
1383  *
1384  *	Don't call cpuset_update_task_memory_state() unless
1385  *	1) it's ok to take cpuset_sem (can WAIT), and
1386  *	2) allocating for current task (not interrupt).
1387  */
1388 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1389 {
1390 	struct mempolicy *pol = current->mempolicy;
1391 
1392 	if ((gfp & __GFP_WAIT) && !in_interrupt())
1393 		cpuset_update_task_memory_state();
1394 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1395 		pol = &default_policy;
1396 	if (pol->policy == MPOL_INTERLEAVE)
1397 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1398 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1399 }
1400 EXPORT_SYMBOL(alloc_pages_current);
1401 
1402 /*
1403  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1404  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1405  * with the mems_allowed returned by cpuset_mems_allowed().  This
1406  * keeps mempolicies cpuset-relative after their cpuset moves.  See
1407  * also kernel/cpuset.c update_nodemask().
1408  */
1409 
1410 /* Slow path of a mempolicy copy */
1411 struct mempolicy *__mpol_copy(struct mempolicy *old)
1412 {
1413 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1414 
1415 	if (!new)
1416 		return ERR_PTR(-ENOMEM);
1417 	if (current_cpuset_is_being_rebound()) {
1418 		nodemask_t mems = cpuset_mems_allowed(current);
1419 		mpol_rebind_policy(old, &mems);
1420 	}
1421 	*new = *old;
1422 	atomic_set(&new->refcnt, 1);
1423 	if (new->policy == MPOL_BIND) {
1424 		int sz = ksize(old->v.zonelist);
1425 		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1426 		if (!new->v.zonelist) {
1427 			kmem_cache_free(policy_cache, new);
1428 			return ERR_PTR(-ENOMEM);
1429 		}
1430 	}
1431 	return new;
1432 }
1433 
1434 /* Slow path of a mempolicy comparison */
1435 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1436 {
1437 	if (!a || !b)
1438 		return 0;
1439 	if (a->policy != b->policy)
1440 		return 0;
1441 	switch (a->policy) {
1442 	case MPOL_DEFAULT:
1443 		return 1;
1444 	case MPOL_INTERLEAVE:
1445 		return nodes_equal(a->v.nodes, b->v.nodes);
1446 	case MPOL_PREFERRED:
1447 		return a->v.preferred_node == b->v.preferred_node;
1448 	case MPOL_BIND: {
1449 		int i;
1450 		for (i = 0; a->v.zonelist->zones[i]; i++)
1451 			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1452 				return 0;
1453 		return b->v.zonelist->zones[i] == NULL;
1454 	}
1455 	default:
1456 		BUG();
1457 		return 0;
1458 	}
1459 }
1460 
1461 /* Slow path of a mpol destructor. */
1462 void __mpol_free(struct mempolicy *p)
1463 {
1464 	if (!atomic_dec_and_test(&p->refcnt))
1465 		return;
1466 	if (p->policy == MPOL_BIND)
1467 		kfree(p->v.zonelist);
1468 	p->policy = MPOL_DEFAULT;
1469 	kmem_cache_free(policy_cache, p);
1470 }
1471 
1472 /*
1473  * Shared memory backing store policy support.
1474  *
1475  * Remember policies even when nobody has shared memory mapped.
1476  * The policies are kept in a Red-Black tree linked from the inode.
1477  * They are protected by the sp->lock spinlock, which should be held
1478  * for any accesses to the tree.
1479  */
1480 
1481 /* lookup first element intersecting start-end */
1482 /* Caller holds sp->lock */
1483 static struct sp_node *
1484 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1485 {
1486 	struct rb_node *n = sp->root.rb_node;
1487 
1488 	while (n) {
1489 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1490 
1491 		if (start >= p->end)
1492 			n = n->rb_right;
1493 		else if (end <= p->start)
1494 			n = n->rb_left;
1495 		else
1496 			break;
1497 	}
1498 	if (!n)
1499 		return NULL;
1500 	for (;;) {
1501 		struct sp_node *w = NULL;
1502 		struct rb_node *prev = rb_prev(n);
1503 		if (!prev)
1504 			break;
1505 		w = rb_entry(prev, struct sp_node, nd);
1506 		if (w->end <= start)
1507 			break;
1508 		n = prev;
1509 	}
1510 	return rb_entry(n, struct sp_node, nd);
1511 }
1512 
1513 /* Insert a new shared policy into the list. */
1514 /* Caller holds sp->lock */
1515 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1516 {
1517 	struct rb_node **p = &sp->root.rb_node;
1518 	struct rb_node *parent = NULL;
1519 	struct sp_node *nd;
1520 
1521 	while (*p) {
1522 		parent = *p;
1523 		nd = rb_entry(parent, struct sp_node, nd);
1524 		if (new->start < nd->start)
1525 			p = &(*p)->rb_left;
1526 		else if (new->end > nd->end)
1527 			p = &(*p)->rb_right;
1528 		else
1529 			BUG();
1530 	}
1531 	rb_link_node(&new->nd, parent, p);
1532 	rb_insert_color(&new->nd, &sp->root);
1533 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1534 		 new->policy ? new->policy->policy : 0);
1535 }
1536 
1537 /* Find shared policy intersecting idx */
1538 struct mempolicy *
1539 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1540 {
1541 	struct mempolicy *pol = NULL;
1542 	struct sp_node *sn;
1543 
1544 	if (!sp->root.rb_node)
1545 		return NULL;
1546 	spin_lock(&sp->lock);
1547 	sn = sp_lookup(sp, idx, idx+1);
1548 	if (sn) {
1549 		mpol_get(sn->policy);
1550 		pol = sn->policy;
1551 	}
1552 	spin_unlock(&sp->lock);
1553 	return pol;
1554 }
1555 
1556 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1557 {
1558 	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1559 	rb_erase(&n->nd, &sp->root);
1560 	mpol_free(n->policy);
1561 	kmem_cache_free(sn_cache, n);
1562 }
1563 
1564 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1565 				struct mempolicy *pol)
1566 {
1567 	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1568 
1569 	if (!n)
1570 		return NULL;
1571 	n->start = start;
1572 	n->end = end;
1573 	mpol_get(pol);
1574 	n->policy = pol;
1575 	return n;
1576 }
1577 
1578 /* Replace a policy range. */
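/*
 * Illustrative example (hypothetical ranges): if an existing node covers
 * pages [0, 10) and the new policy covers [3, 7), the old node is trimmed to
 * [0, 3), a copy of it is inserted for [7, 10) via the new2 allocation, and
 * the new node is inserted for [3, 7).  Old nodes lying wholly inside the new
 * range are simply deleted; ones overlapping only an edge are trimmed.
 */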
1579 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1580 				 unsigned long end, struct sp_node *new)
1581 {
1582 	struct sp_node *n, *new2 = NULL;
1583 
1584 restart:
1585 	spin_lock(&sp->lock);
1586 	n = sp_lookup(sp, start, end);
1587 	/* Take care of old policies in the same range. */
1588 	while (n && n->start < end) {
1589 		struct rb_node *next = rb_next(&n->nd);
1590 		if (n->start >= start) {
1591 			if (n->end <= end)
1592 				sp_delete(sp, n);
1593 			else
1594 				n->start = end;
1595 		} else {
1596 			/* Old policy spanning whole new range. */
1597 			if (n->end > end) {
1598 				if (!new2) {
1599 					spin_unlock(&sp->lock);
1600 					new2 = sp_alloc(end, n->end, n->policy);
1601 					if (!new2)
1602 						return -ENOMEM;
1603 					goto restart;
1604 				}
1605 				n->end = start;
1606 				sp_insert(sp, new2);
1607 				new2 = NULL;
1608 				break;
1609 			} else
1610 				n->end = start;
1611 		}
1612 		if (!next)
1613 			break;
1614 		n = rb_entry(next, struct sp_node, nd);
1615 	}
1616 	if (new)
1617 		sp_insert(sp, new);
1618 	spin_unlock(&sp->lock);
1619 	if (new2) {
1620 		mpol_free(new2->policy);
1621 		kmem_cache_free(sn_cache, new2);
1622 	}
1623 	return 0;
1624 }
1625 
1626 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1627 				nodemask_t *policy_nodes)
1628 {
1629 	info->root = RB_ROOT;
1630 	spin_lock_init(&info->lock);
1631 
1632 	if (policy != MPOL_DEFAULT) {
1633 		struct mempolicy *newpol;
1634 
1635 		/* Falls back to MPOL_DEFAULT on any error */
1636 		newpol = mpol_new(policy, policy_nodes);
1637 		if (!IS_ERR(newpol)) {
1638 			/* Create pseudo-vma that contains just the policy */
1639 			struct vm_area_struct pvma;
1640 
1641 			memset(&pvma, 0, sizeof(struct vm_area_struct));
1642 			/* Policy covers entire file */
1643 			pvma.vm_end = TASK_SIZE;
1644 			mpol_set_shared_policy(info, &pvma, newpol);
1645 			mpol_free(newpol);
1646 		}
1647 	}
1648 }
1649 
1650 int mpol_set_shared_policy(struct shared_policy *info,
1651 			struct vm_area_struct *vma, struct mempolicy *npol)
1652 {
1653 	int err;
1654 	struct sp_node *new = NULL;
1655 	unsigned long sz = vma_pages(vma);
1656 
1657 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1658 		 vma->vm_pgoff,
1659 		 sz, npol? npol->policy : -1,
1660 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1661 
1662 	if (npol) {
1663 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1664 		if (!new)
1665 			return -ENOMEM;
1666 	}
1667 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1668 	if (err && new)
1669 		kmem_cache_free(sn_cache, new);
1670 	return err;
1671 }
1672 
1673 /* Free a backing policy store on inode delete. */
1674 void mpol_free_shared_policy(struct shared_policy *p)
1675 {
1676 	struct sp_node *n;
1677 	struct rb_node *next;
1678 
1679 	if (!p->root.rb_node)
1680 		return;
1681 	spin_lock(&p->lock);
1682 	next = rb_first(&p->root);
1683 	while (next) {
1684 		n = rb_entry(next, struct sp_node, nd);
1685 		next = rb_next(&n->nd);
1686 		rb_erase(&n->nd, &p->root);
1687 		mpol_free(n->policy);
1688 		kmem_cache_free(sn_cache, n);
1689 	}
1690 	spin_unlock(&p->lock);
1691 }
1692 
1693 /* assumes fs == KERNEL_DS */
1694 void __init numa_policy_init(void)
1695 {
1696 	nodemask_t interleave_nodes;
1697 	unsigned long largest = 0;
1698 	int nid, prefer = 0;
1699 
1700 	policy_cache = kmem_cache_create("numa_policy",
1701 					 sizeof(struct mempolicy),
1702 					 0, SLAB_PANIC, NULL);
1703 
1704 	sn_cache = kmem_cache_create("shared_policy_node",
1705 				     sizeof(struct sp_node),
1706 				     0, SLAB_PANIC, NULL);
1707 
1708 	/*
1709 	 * Set interleaving policy for system init. Interleaving is only
1710 	 * enabled across suitably sized nodes (default is >= 16MB), or we
1711 	 * fall back to the largest node if they're all smaller.
1712 	 */
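	/*
	 * Illustrative arithmetic (assuming 4K pages): 16MB corresponds to
	 * (16 << 20) >> PAGE_SHIFT = 4096 present pages, so a node needs at
	 * least that many pages to be added to interleave_nodes below.
	 */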
1713 	nodes_clear(interleave_nodes);
1714 	for_each_node_state(nid, N_HIGH_MEMORY) {
1715 		unsigned long total_pages = node_present_pages(nid);
1716 
1717 		/* Preserve the largest node */
1718 		if (largest < total_pages) {
1719 			largest = total_pages;
1720 			prefer = nid;
1721 		}
1722 
1723 		/* Interleave this node? */
1724 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1725 			node_set(nid, interleave_nodes);
1726 	}
1727 
1728 	/* All too small, use the largest */
1729 	if (unlikely(nodes_empty(interleave_nodes)))
1730 		node_set(prefer, interleave_nodes);
1731 
1732 	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1733 		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
1734 }
1735 
1736 /* Reset policy of current process to default */
1737 void numa_default_policy(void)
1738 {
1739 	do_set_mempolicy(MPOL_DEFAULT, NULL);
1740 }
1741 
1742 /* Migrate a policy to a different set of nodes */
1743 static void mpol_rebind_policy(struct mempolicy *pol,
1744 			       const nodemask_t *newmask)
1745 {
1746 	nodemask_t *mpolmask;
1747 	nodemask_t tmp;
1748 
1749 	if (!pol)
1750 		return;
1751 	mpolmask = &pol->cpuset_mems_allowed;
1752 	if (nodes_equal(*mpolmask, *newmask))
1753 		return;
1754 
1755 	switch (pol->policy) {
1756 	case MPOL_DEFAULT:
1757 		break;
1758 	case MPOL_INTERLEAVE:
1759 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1760 		pol->v.nodes = tmp;
1761 		*mpolmask = *newmask;
1762 		current->il_next = node_remap(current->il_next,
1763 						*mpolmask, *newmask);
1764 		break;
1765 	case MPOL_PREFERRED:
1766 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1767 						*mpolmask, *newmask);
1768 		*mpolmask = *newmask;
1769 		break;
1770 	case MPOL_BIND: {
1771 		nodemask_t nodes;
1772 		struct zone **z;
1773 		struct zonelist *zonelist;
1774 
1775 		nodes_clear(nodes);
1776 		for (z = pol->v.zonelist->zones; *z; z++)
1777 			node_set(zone_to_nid(*z), nodes);
1778 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1779 		nodes = tmp;
1780 
1781 		zonelist = bind_zonelist(&nodes);
1782 
1783 		/* If there is no memory, bind_zonelist() fails and we keep the old zonelist.
1784 		 * If that old zonelist has no remaining mems_allowed nodes,
1785 		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1786 		 */
1787 
1788 		if (!IS_ERR(zonelist)) {
1789 			/* Good - got mem - substitute new zonelist */
1790 			kfree(pol->v.zonelist);
1791 			pol->v.zonelist = zonelist;
1792 		}
1793 		*mpolmask = *newmask;
1794 		break;
1795 	}
1796 	default:
1797 		BUG();
1798 		break;
1799 	}
1800 }
1801 
1802 /*
1803  * Wrapper for mpol_rebind_policy() that just requires task
1804  * pointer, and updates task mempolicy.
1805  */
1806 
1807 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1808 {
1809 	mpol_rebind_policy(tsk->mempolicy, new);
1810 }
1811 
1812 /*
1813  * Rebind each vma in mm to new nodemask.
1814  *
1815  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1816  */
1817 
1818 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1819 {
1820 	struct vm_area_struct *vma;
1821 
1822 	down_write(&mm->mmap_sem);
1823 	for (vma = mm->mmap; vma; vma = vma->vm_next)
1824 		mpol_rebind_policy(vma->vm_policy, new);
1825 	up_write(&mm->mmap_sem);
1826 }
1827 
1828 /*
1829  * Display pages allocated per node and memory policy via /proc.
1830  */
1831 
1832 static const char * const policy_types[] =
1833 	{ "default", "prefer", "bind", "interleave" };
1834 
1835 /*
1836  * Convert a mempolicy into a string.
1837  * Returns the number of characters in buffer (if positive)
1838  * or an error (negative)
1839  */
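/*
 * Illustrative output (hypothetical policies): an interleave policy over
 * nodes 0-3 is rendered as "interleave=0-3", a preferred policy for node 2
 * as "prefer=2", and the default policy as just "default" with no nodelist.
 * show_numa_map() below prints this string after the vma start address.
 */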
1840 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1841 {
1842 	char *p = buffer;
1843 	int l;
1844 	nodemask_t nodes;
1845 	int mode = pol ? pol->policy : MPOL_DEFAULT;
1846 
1847 	switch (mode) {
1848 	case MPOL_DEFAULT:
1849 		nodes_clear(nodes);
1850 		break;
1851 
1852 	case MPOL_PREFERRED:
1853 		nodes_clear(nodes);
1854 		node_set(pol->v.preferred_node, nodes);
1855 		break;
1856 
1857 	case MPOL_BIND:
1858 		get_zonemask(pol, &nodes);
1859 		break;
1860 
1861 	case MPOL_INTERLEAVE:
1862 		nodes = pol->v.nodes;
1863 		break;
1864 
1865 	default:
1866 		BUG();
1867 		return -EFAULT;
1868 	}
1869 
1870 	l = strlen(policy_types[mode]);
1871  	if (buffer + maxlen < p + l + 1)
1872  		return -ENOSPC;
1873 
1874 	strcpy(p, policy_types[mode]);
1875 	p += l;
1876 
1877 	if (!nodes_empty(nodes)) {
1878 		if (buffer + maxlen < p + 2)
1879 			return -ENOSPC;
1880 		*p++ = '=';
1881 	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1882 	}
1883 	return p - buffer;
1884 }
1885 
1886 struct numa_maps {
1887 	unsigned long pages;
1888 	unsigned long anon;
1889 	unsigned long active;
1890 	unsigned long writeback;
1891 	unsigned long mapcount_max;
1892 	unsigned long dirty;
1893 	unsigned long swapcache;
1894 	unsigned long node[MAX_NUMNODES];
1895 };
1896 
1897 static void gather_stats(struct page *page, void *private, int pte_dirty)
1898 {
1899 	struct numa_maps *md = private;
1900 	int count = page_mapcount(page);
1901 
1902 	md->pages++;
1903 	if (pte_dirty || PageDirty(page))
1904 		md->dirty++;
1905 
1906 	if (PageSwapCache(page))
1907 		md->swapcache++;
1908 
1909 	if (PageActive(page))
1910 		md->active++;
1911 
1912 	if (PageWriteback(page))
1913 		md->writeback++;
1914 
1915 	if (PageAnon(page))
1916 		md->anon++;
1917 
1918 	if (count > md->mapcount_max)
1919 		md->mapcount_max = count;
1920 
1921 	md->node[page_to_nid(page)]++;
1922 }
1923 
1924 #ifdef CONFIG_HUGETLB_PAGE
1925 static void check_huge_range(struct vm_area_struct *vma,
1926 		unsigned long start, unsigned long end,
1927 		struct numa_maps *md)
1928 {
1929 	unsigned long addr;
1930 	struct page *page;
1931 
1932 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1933 		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1934 		pte_t pte;
1935 
1936 		if (!ptep)
1937 			continue;
1938 
1939 		pte = *ptep;
1940 		if (pte_none(pte))
1941 			continue;
1942 
1943 		page = pte_page(pte);
1944 		if (!page)
1945 			continue;
1946 
1947 		gather_stats(page, md, pte_dirty(*ptep));
1948 	}
1949 }
1950 #else
1951 static inline void check_huge_range(struct vm_area_struct *vma,
1952 		unsigned long start, unsigned long end,
1953 		struct numa_maps *md)
1954 {
1955 }
1956 #endif
1957 
1958 int show_numa_map(struct seq_file *m, void *v)
1959 {
1960 	struct proc_maps_private *priv = m->private;
1961 	struct vm_area_struct *vma = v;
1962 	struct numa_maps *md;
1963 	struct file *file = vma->vm_file;
1964 	struct mm_struct *mm = vma->vm_mm;
1965 	struct mempolicy *pol;
1966 	int n;
1967 	char buffer[50];
1968 
1969 	if (!mm)
1970 		return 0;
1971 
1972 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1973 	if (!md)
1974 		return 0;
1975 
1976 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1977 	mpol_to_str(buffer, sizeof(buffer), pol);
1978 	/*
1979 	 * unref shared or other task's mempolicy
1980 	 */
1981 	if (pol != &default_policy && pol != current->mempolicy)
1982 		__mpol_free(pol);
1983 
1984 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1985 
1986 	if (file) {
1987 		seq_printf(m, " file=");
1988 		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1989 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1990 		seq_printf(m, " heap");
1991 	} else if (vma->vm_start <= mm->start_stack &&
1992 			vma->vm_end >= mm->start_stack) {
1993 		seq_printf(m, " stack");
1994 	}
1995 
1996 	if (is_vm_hugetlb_page(vma)) {
1997 		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1998 		seq_printf(m, " huge");
1999 	} else {
2000 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2001 			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2002 	}
2003 
2004 	if (!md->pages)
2005 		goto out;
2006 
2007 	if (md->anon)
2008 		seq_printf(m," anon=%lu",md->anon);
2009 
2010 	if (md->dirty)
2011 		seq_printf(m," dirty=%lu",md->dirty);
2012 
2013 	if (md->pages != md->anon && md->pages != md->dirty)
2014 		seq_printf(m, " mapped=%lu", md->pages);
2015 
2016 	if (md->mapcount_max > 1)
2017 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2018 
2019 	if (md->swapcache)
2020 		seq_printf(m," swapcache=%lu", md->swapcache);
2021 
2022 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2023 		seq_printf(m," active=%lu", md->active);
2024 
2025 	if (md->writeback)
2026 		seq_printf(m," writeback=%lu", md->writeback);
2027 
2028 	for_each_node_state(n, N_HIGH_MEMORY)
2029 		if (md->node[n])
2030 			seq_printf(m, " N%d=%lu", n, md->node[n]);
2031 out:
2032 	seq_putc(m, '\n');
2033 	kfree(md);
2034 
2035 	if (m->count < m->size)
2036 		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2037 	return 0;
2038 }
2039