xref: /linux/mm/vmalloc.c (revision 930cc144a043ff95e56b6888fa51c618b33f89e7)
1 /*
2  *  linux/mm/vmalloc.c
3  *
4  *  Copyright (C) 1993  Linus Torvalds
5  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8  *  Numa awareness, Christoph Lameter, SGI, June 2005
9  */
10 
11 #include <linux/vmalloc.h>
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/highmem.h>
15 #include <linux/slab.h>
16 #include <linux/spinlock.h>
17 #include <linux/interrupt.h>
18 #include <linux/seq_file.h>
19 #include <linux/debugobjects.h>
20 #include <linux/kallsyms.h>
21 #include <linux/list.h>
22 #include <linux/rbtree.h>
23 #include <linux/radix-tree.h>
24 #include <linux/rcupdate.h>
25 
26 #include <asm/atomic.h>
27 #include <asm/uaccess.h>
28 #include <asm/tlbflush.h>
29 
30 
31 /*** Page table manipulation functions ***/
32 
33 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
34 {
35 	pte_t *pte;
36 
37 	pte = pte_offset_kernel(pmd, addr);
38 	do {
39 		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
40 		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
41 	} while (pte++, addr += PAGE_SIZE, addr != end);
42 }
43 
44 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
45 {
46 	pmd_t *pmd;
47 	unsigned long next;
48 
49 	pmd = pmd_offset(pud, addr);
50 	do {
51 		next = pmd_addr_end(addr, end);
52 		if (pmd_none_or_clear_bad(pmd))
53 			continue;
54 		vunmap_pte_range(pmd, addr, next);
55 	} while (pmd++, addr = next, addr != end);
56 }
57 
58 static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 {
60 	pud_t *pud;
61 	unsigned long next;
62 
63 	pud = pud_offset(pgd, addr);
64 	do {
65 		next = pud_addr_end(addr, end);
66 		if (pud_none_or_clear_bad(pud))
67 			continue;
68 		vunmap_pmd_range(pud, addr, next);
69 	} while (pud++, addr = next, addr != end);
70 }
71 
72 static void vunmap_page_range(unsigned long addr, unsigned long end)
73 {
74 	pgd_t *pgd;
75 	unsigned long next;
76 
77 	BUG_ON(addr >= end);
78 	pgd = pgd_offset_k(addr);
79 	flush_cache_vunmap(addr, end);
80 	do {
81 		next = pgd_addr_end(addr, end);
82 		if (pgd_none_or_clear_bad(pgd))
83 			continue;
84 		vunmap_pud_range(pgd, addr, next);
85 	} while (pgd++, addr = next, addr != end);
86 }
87 
88 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
89 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
90 {
91 	pte_t *pte;
92 
93 	/*
94 	 * nr is a running index into the array which helps higher level
95 	 * callers keep track of where we're up to.
96 	 */
97 
98 	pte = pte_alloc_kernel(pmd, addr);
99 	if (!pte)
100 		return -ENOMEM;
101 	do {
102 		struct page *page = pages[*nr];
103 
104 		if (WARN_ON(!pte_none(*pte)))
105 			return -EBUSY;
106 		if (WARN_ON(!page))
107 			return -ENOMEM;
108 		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
109 		(*nr)++;
110 	} while (pte++, addr += PAGE_SIZE, addr != end);
111 	return 0;
112 }
113 
114 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
115 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
116 {
117 	pmd_t *pmd;
118 	unsigned long next;
119 
120 	pmd = pmd_alloc(&init_mm, pud, addr);
121 	if (!pmd)
122 		return -ENOMEM;
123 	do {
124 		next = pmd_addr_end(addr, end);
125 		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
126 			return -ENOMEM;
127 	} while (pmd++, addr = next, addr != end);
128 	return 0;
129 }
130 
131 static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
132 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
133 {
134 	pud_t *pud;
135 	unsigned long next;
136 
137 	pud = pud_alloc(&init_mm, pgd, addr);
138 	if (!pud)
139 		return -ENOMEM;
140 	do {
141 		next = pud_addr_end(addr, end);
142 		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
143 			return -ENOMEM;
144 	} while (pud++, addr = next, addr != end);
145 	return 0;
146 }
147 
148 /*
149  * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
150  * will have pfns corresponding to the "pages" array.
151  *
152  * I.e. the pte at addr+N*PAGE_SIZE shall point to the pfn corresponding to pages[N]
153  */
154 static int vmap_page_range(unsigned long addr, unsigned long end,
155 				pgprot_t prot, struct page **pages)
156 {
157 	pgd_t *pgd;
158 	unsigned long next;
159 	int err = 0;
160 	int nr = 0;
161 
162 	BUG_ON(addr >= end);
163 	pgd = pgd_offset_k(addr);
164 	do {
165 		next = pgd_addr_end(addr, end);
166 		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
167 		if (err)
168 			break;
169 	} while (pgd++, addr = next, addr != end);
170 	flush_cache_vmap(addr, end);
171 
172 	if (unlikely(err))
173 		return err;
174 	return nr;
175 }
176 
177 static inline int is_vmalloc_or_module_addr(const void *x)
178 {
179 	/*
180 	 * x86-64 and sparc64 put modules in a special place,
181 	 * and fall back on vmalloc() if that fails. Others
182 	 * just put them in the vmalloc space.
183 	 */
184 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
185 	unsigned long addr = (unsigned long)x;
186 	if (addr >= MODULES_VADDR && addr < MODULES_END)
187 		return 1;
188 #endif
189 	return is_vmalloc_addr(x);
190 }
191 
192 /*
193  * Walk a vmap address to the struct page it maps.
194  */
195 struct page *vmalloc_to_page(const void *vmalloc_addr)
196 {
197 	unsigned long addr = (unsigned long) vmalloc_addr;
198 	struct page *page = NULL;
199 	pgd_t *pgd = pgd_offset_k(addr);
200 
201 	/*
202 	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
203 	 * architectures that do not vmalloc module space
204 	 */
205 	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
206 
207 	if (!pgd_none(*pgd)) {
208 		pud_t *pud = pud_offset(pgd, addr);
209 		if (!pud_none(*pud)) {
210 			pmd_t *pmd = pmd_offset(pud, addr);
211 			if (!pmd_none(*pmd)) {
212 				pte_t *ptep, pte;
213 
214 				ptep = pte_offset_map(pmd, addr);
215 				pte = *ptep;
216 				if (pte_present(pte))
217 					page = pte_page(pte);
218 				pte_unmap(ptep);
219 			}
220 		}
221 	}
222 	return page;
223 }
224 EXPORT_SYMBOL(vmalloc_to_page);
225 
226 /*
227  * Map a vmalloc()-space virtual address to the physical page frame number.
228  */
229 unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
230 {
231 	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
232 }
233 EXPORT_SYMBOL(vmalloc_to_pfn);
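
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * a caller might use vmalloc_to_page() to walk the struct pages backing a
 * vmalloc()ed buffer, e.g. to build a scatter/gather list. The function and
 * its arguments are hypothetical, not part of this file.
 */
#if 0
static void example_walk_vmalloc_pages(void *buf, unsigned long size)
{
	unsigned long off;

	for (off = 0; off < size; off += PAGE_SIZE) {
		struct page *page = vmalloc_to_page(buf + off);

		/* non-NULL as long as the vmalloc mapping is still live */
		printk(KERN_DEBUG "vaddr %p -> pfn %lu\n",
				buf + off, page_to_pfn(page));
	}
}
#endif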
234 
235 
236 /*** Global kva allocator ***/
237 
238 #define VM_LAZY_FREE	0x01
239 #define VM_LAZY_FREEING	0x02
240 #define VM_VM_AREA	0x04
241 
242 struct vmap_area {
243 	unsigned long va_start;
244 	unsigned long va_end;
245 	unsigned long flags;
246 	struct rb_node rb_node;		/* address sorted rbtree */
247 	struct list_head list;		/* address sorted list */
248 	struct list_head purge_list;	/* "lazy purge" list */
249 	void *private;
250 	struct rcu_head rcu_head;
251 };
252 
253 static DEFINE_SPINLOCK(vmap_area_lock);
254 static struct rb_root vmap_area_root = RB_ROOT;
255 static LIST_HEAD(vmap_area_list);
256 
257 static struct vmap_area *__find_vmap_area(unsigned long addr)
258 {
259 	struct rb_node *n = vmap_area_root.rb_node;
260 
261 	while (n) {
262 		struct vmap_area *va;
263 
264 		va = rb_entry(n, struct vmap_area, rb_node);
265 		if (addr < va->va_start)
266 			n = n->rb_left;
267 		else if (addr > va->va_start)
268 			n = n->rb_right;
269 		else
270 			return va;
271 	}
272 
273 	return NULL;
274 }
275 
276 static void __insert_vmap_area(struct vmap_area *va)
277 {
278 	struct rb_node **p = &vmap_area_root.rb_node;
279 	struct rb_node *parent = NULL;
280 	struct rb_node *tmp;
281 
282 	while (*p) {
283 		struct vmap_area *tmp;
284 
285 		parent = *p;
286 		tmp = rb_entry(parent, struct vmap_area, rb_node);
287 		if (va->va_start < tmp->va_end)
288 			p = &(*p)->rb_left;
289 		else if (va->va_end > tmp->va_start)
290 			p = &(*p)->rb_right;
291 		else
292 			BUG();
293 	}
294 
295 	rb_link_node(&va->rb_node, parent, p);
296 	rb_insert_color(&va->rb_node, &vmap_area_root);
297 
298 	/* address-sort this list so it is usable like the vmlist */
299 	tmp = rb_prev(&va->rb_node);
300 	if (tmp) {
301 		struct vmap_area *prev;
302 		prev = rb_entry(tmp, struct vmap_area, rb_node);
303 		list_add_rcu(&va->list, &prev->list);
304 	} else
305 		list_add_rcu(&va->list, &vmap_area_list);
306 }
307 
308 static void purge_vmap_area_lazy(void);
309 
310 /*
311  * Allocate a region of KVA of the specified size and alignment, within the
312  * vstart and vend.
313  */
314 static struct vmap_area *alloc_vmap_area(unsigned long size,
315 				unsigned long align,
316 				unsigned long vstart, unsigned long vend,
317 				int node, gfp_t gfp_mask)
318 {
319 	struct vmap_area *va;
320 	struct rb_node *n;
321 	unsigned long addr;
322 	int purged = 0;
323 
324 	BUG_ON(size & ~PAGE_MASK);
325 
326 	addr = ALIGN(vstart, align);
327 
328 	va = kmalloc_node(sizeof(struct vmap_area),
329 			gfp_mask & GFP_RECLAIM_MASK, node);
330 	if (unlikely(!va))
331 		return ERR_PTR(-ENOMEM);
332 
333 retry:
334 	spin_lock(&vmap_area_lock);
335 	/* XXX: could have a last_hole cache */
336 	n = vmap_area_root.rb_node;
337 	if (n) {
338 		struct vmap_area *first = NULL;
339 
340 		do {
341 			struct vmap_area *tmp;
342 			tmp = rb_entry(n, struct vmap_area, rb_node);
343 			if (tmp->va_end >= addr) {
344 				if (!first && tmp->va_start < addr + size)
345 					first = tmp;
346 				n = n->rb_left;
347 			} else {
348 				first = tmp;
349 				n = n->rb_right;
350 			}
351 		} while (n);
352 
353 		if (!first)
354 			goto found;
355 
356 		if (first->va_end < addr) {
357 			n = rb_next(&first->rb_node);
358 			if (n)
359 				first = rb_entry(n, struct vmap_area, rb_node);
360 			else
361 				goto found;
362 		}
363 
364 		while (addr + size >= first->va_start && addr + size <= vend) {
365 			addr = ALIGN(first->va_end + PAGE_SIZE, align);
366 
367 			n = rb_next(&first->rb_node);
368 			if (n)
369 				first = rb_entry(n, struct vmap_area, rb_node);
370 			else
371 				goto found;
372 		}
373 	}
374 found:
375 	if (addr + size > vend) {
376 		spin_unlock(&vmap_area_lock);
377 		if (!purged) {
378 			purge_vmap_area_lazy();
379 			purged = 1;
380 			goto retry;
381 		}
382 		if (printk_ratelimit())
383 			printk(KERN_WARNING "vmap allocation failed: "
384 				 "use vmalloc=<size> to increase size.\n");
385 		return ERR_PTR(-EBUSY);
386 	}
387 
388 	BUG_ON(addr & (align-1));
389 
390 	va->va_start = addr;
391 	va->va_end = addr + size;
392 	va->flags = 0;
393 	__insert_vmap_area(va);
394 	spin_unlock(&vmap_area_lock);
395 
396 	return va;
397 }
398 
399 static void rcu_free_va(struct rcu_head *head)
400 {
401 	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
402 
403 	kfree(va);
404 }
405 
406 static void __free_vmap_area(struct vmap_area *va)
407 {
408 	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
409 	rb_erase(&va->rb_node, &vmap_area_root);
410 	RB_CLEAR_NODE(&va->rb_node);
411 	list_del_rcu(&va->list);
412 
413 	call_rcu(&va->rcu_head, rcu_free_va);
414 }
415 
416 /*
417  * Free a region of KVA allocated by alloc_vmap_area
418  */
419 static void free_vmap_area(struct vmap_area *va)
420 {
421 	spin_lock(&vmap_area_lock);
422 	__free_vmap_area(va);
423 	spin_unlock(&vmap_area_lock);
424 }
425 
426 /*
427  * Clear the pagetable entries of a given vmap_area
428  */
429 static void unmap_vmap_area(struct vmap_area *va)
430 {
431 	vunmap_page_range(va->va_start, va->va_end);
432 }
433 
434 /*
435  * lazy_max_pages is the maximum amount of virtual address space we gather up
436  * before attempting to purge with a TLB flush.
437  *
438  * There is a tradeoff here: a larger number will cover more kernel page tables
439  * and take slightly longer to purge, but it will linearly reduce the number of
440  * global TLB flushes that must be performed. It would seem natural to scale
441  * this number up linearly with the number of CPUs (because vmapping activity
442  * could also scale linearly with the number of CPUs), however it is likely
443  * that in practice, workloads might be constrained in other ways that mean
444  * vmap activity will not scale linearly with CPUs. Also, I want to be
445  * conservative and not introduce a big latency on huge systems, so go with
446  * a less aggressive log scale. It will still be an improvement over the old
447  * code, and it will be simple to change the scale factor if we find that it
448  * becomes a problem on bigger systems.
449  */
450 static unsigned long lazy_max_pages(void)
451 {
452 	unsigned int log;
453 
454 	log = fls(num_online_cpus());
455 
456 	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
457 }
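
/*
 * Worked example (added for illustration): with 4 online CPUs, fls(4) == 3,
 * so up to 3 * 32MB = 96MB worth of lazily freed mappings (24576 pages with
 * 4K pages) may accumulate before a purge is forced; with 64 CPUs,
 * fls(64) == 7, giving 224MB. The log scale keeps the threshold from growing
 * linearly on very large machines.
 */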
458 
459 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
460 
461 /*
462  * Purges all lazily-freed vmap areas.
463  *
464  * If sync is 0 then don't purge if there is already a purge in progress.
465  * If force_flush is 1, then flush kernel TLBs between *start and *end even
466  * if we found no lazy vmap areas to unmap (callers can use this to optimise
467  * their own TLB flushing).
468  * Returns with *start = min(*start, lowest purged address)
469  *              *end = max(*end, highest purged address)
470  */
471 static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
472 					int sync, int force_flush)
473 {
474 	static DEFINE_SPINLOCK(purge_lock);
475 	LIST_HEAD(valist);
476 	struct vmap_area *va;
477 	int nr = 0;
478 
479 	/*
480 	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
481 	 * should not expect such behaviour. This just simplifies locking for
482 	 * the case that isn't actually used at the moment anyway.
483 	 */
484 	if (!sync && !force_flush) {
485 		if (!spin_trylock(&purge_lock))
486 			return;
487 	} else
488 		spin_lock(&purge_lock);
489 
490 	rcu_read_lock();
491 	list_for_each_entry_rcu(va, &vmap_area_list, list) {
492 		if (va->flags & VM_LAZY_FREE) {
493 			if (va->va_start < *start)
494 				*start = va->va_start;
495 			if (va->va_end > *end)
496 				*end = va->va_end;
497 			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
498 			unmap_vmap_area(va);
499 			list_add_tail(&va->purge_list, &valist);
500 			va->flags |= VM_LAZY_FREEING;
501 			va->flags &= ~VM_LAZY_FREE;
502 		}
503 	}
504 	rcu_read_unlock();
505 
506 	if (nr) {
507 		BUG_ON(nr > atomic_read(&vmap_lazy_nr));
508 		atomic_sub(nr, &vmap_lazy_nr);
509 	}
510 
511 	if (nr || force_flush)
512 		flush_tlb_kernel_range(*start, *end);
513 
514 	if (nr) {
515 		spin_lock(&vmap_area_lock);
516 		list_for_each_entry(va, &valist, purge_list)
517 			__free_vmap_area(va);
518 		spin_unlock(&vmap_area_lock);
519 	}
520 	spin_unlock(&purge_lock);
521 }
522 
523 /*
524  * Kick off a purge of the outstanding lazy areas.
525  */
526 static void purge_vmap_area_lazy(void)
527 {
528 	unsigned long start = ULONG_MAX, end = 0;
529 
530 	__purge_vmap_area_lazy(&start, &end, 0, 0);
531 }
532 
533 /*
534  * Free and unmap a vmap area
535  */
536 static void free_unmap_vmap_area(struct vmap_area *va)
537 {
538 	va->flags |= VM_LAZY_FREE;
539 	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
540 	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
541 		purge_vmap_area_lazy();
542 }
543 
544 static struct vmap_area *find_vmap_area(unsigned long addr)
545 {
546 	struct vmap_area *va;
547 
548 	spin_lock(&vmap_area_lock);
549 	va = __find_vmap_area(addr);
550 	spin_unlock(&vmap_area_lock);
551 
552 	return va;
553 }
554 
555 static void free_unmap_vmap_area_addr(unsigned long addr)
556 {
557 	struct vmap_area *va;
558 
559 	va = find_vmap_area(addr);
560 	BUG_ON(!va);
561 	free_unmap_vmap_area(va);
562 }
563 
564 
565 /*** Per cpu kva allocator ***/
566 
567 /*
568  * vmap space is limited especially on 32 bit architectures. Ensure there is
569  * room for at least 16 vmap blocks per CPU.
570  */
571 /*
572  * If we had constant VMALLOC_START and VMALLOC_END, we could simply
573  * #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Since we can't,
574  * guess instead (we just need a rough idea).
575  */
576 #if BITS_PER_LONG == 32
577 #define VMALLOC_SPACE		(128UL*1024*1024)
578 #else
579 #define VMALLOC_SPACE		(128UL*1024*1024*1024)
580 #endif
581 
582 #define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
583 #define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
584 #define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
585 #define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
586 #define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
587 #define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
588 #define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
589 					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
590 						VMALLOC_PAGES / NR_CPUS / 16))
591 
592 #define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
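
/*
 * Worked example (added for illustration, assuming a 32-bit kernel with 4K
 * pages and NR_CPUS == 4): VMALLOC_PAGES = 128MB / 4K = 32768, so
 * VMALLOC_PAGES / NR_CPUS / 16 = 512. That value lies between
 * VMAP_BBMAP_BITS_MIN (2 * 32 = 64) and VMAP_BBMAP_BITS_MAX (1024), so
 * VMAP_BBMAP_BITS = 512 and VMAP_BLOCK_SIZE = 512 * 4K = 2MB per block.
 */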
593 
594 struct vmap_block_queue {
595 	spinlock_t lock;
596 	struct list_head free;
597 	struct list_head dirty;
598 	unsigned int nr_dirty;
599 };
600 
601 struct vmap_block {
602 	spinlock_t lock;
603 	struct vmap_area *va;
604 	struct vmap_block_queue *vbq;
605 	unsigned long free, dirty;
606 	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
607 	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
608 	union {
609 		struct {
610 			struct list_head free_list;
611 			struct list_head dirty_list;
612 		};
613 		struct rcu_head rcu_head;
614 	};
615 };
616 
617 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
618 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
619 
620 /*
621  * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
622  * in the free path. Could get rid of this if we change the API to return a
623  * "cookie" from alloc, to be passed to free. But no big deal yet.
624  */
625 static DEFINE_SPINLOCK(vmap_block_tree_lock);
626 static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
627 
628 /*
629  * We should probably have a fallback mechanism to allocate virtual memory
630  * out of partially filled vmap blocks. However vmap block sizing should be
631  * fairly reasonable according to the vmalloc size, so it shouldn't be a
632  * big problem.
633  */
634 
635 static unsigned long addr_to_vb_idx(unsigned long addr)
636 {
637 	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
638 	addr /= VMAP_BLOCK_SIZE;
639 	return addr;
640 }
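
/*
 * Worked example (added for illustration, with hypothetical values): if
 * VMAP_BLOCK_SIZE were 2MB (0x200000) and VMALLOC_START were 0xf0000000,
 * then VMALLOC_START & ~(VMAP_BLOCK_SIZE-1) is 0xf0000000, and an address
 * of 0xf0400000 maps to block index (0xf0400000 - 0xf0000000) / 0x200000,
 * i.e. index 2.
 */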
641 
642 static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
643 {
644 	struct vmap_block_queue *vbq;
645 	struct vmap_block *vb;
646 	struct vmap_area *va;
647 	unsigned long vb_idx;
648 	int node, err;
649 
650 	node = numa_node_id();
651 
652 	vb = kmalloc_node(sizeof(struct vmap_block),
653 			gfp_mask & GFP_RECLAIM_MASK, node);
654 	if (unlikely(!vb))
655 		return ERR_PTR(-ENOMEM);
656 
657 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
658 					VMALLOC_START, VMALLOC_END,
659 					node, gfp_mask);
660 	if (unlikely(IS_ERR(va))) {
661 		kfree(vb);
662 		return ERR_PTR(PTR_ERR(va));
663 	}
664 
665 	err = radix_tree_preload(gfp_mask);
666 	if (unlikely(err)) {
667 		kfree(vb);
668 		free_vmap_area(va);
669 		return ERR_PTR(err);
670 	}
671 
672 	spin_lock_init(&vb->lock);
673 	vb->va = va;
674 	vb->free = VMAP_BBMAP_BITS;
675 	vb->dirty = 0;
676 	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
677 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
678 	INIT_LIST_HEAD(&vb->free_list);
679 	INIT_LIST_HEAD(&vb->dirty_list);
680 
681 	vb_idx = addr_to_vb_idx(va->va_start);
682 	spin_lock(&vmap_block_tree_lock);
683 	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
684 	spin_unlock(&vmap_block_tree_lock);
685 	BUG_ON(err);
686 	radix_tree_preload_end();
687 
688 	vbq = &get_cpu_var(vmap_block_queue);
689 	vb->vbq = vbq;
690 	spin_lock(&vbq->lock);
691 	list_add(&vb->free_list, &vbq->free);
692 	spin_unlock(&vbq->lock);
693 	put_cpu_var(vmap_block_queue);
694 
695 	return vb;
696 }
697 
698 static void rcu_free_vb(struct rcu_head *head)
699 {
700 	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
701 
702 	kfree(vb);
703 }
704 
705 static void free_vmap_block(struct vmap_block *vb)
706 {
707 	struct vmap_block *tmp;
708 	unsigned long vb_idx;
709 
710 	spin_lock(&vb->vbq->lock);
711 	if (!list_empty(&vb->free_list))
712 		list_del(&vb->free_list);
713 	if (!list_empty(&vb->dirty_list))
714 		list_del(&vb->dirty_list);
715 	spin_unlock(&vb->vbq->lock);
716 
717 	vb_idx = addr_to_vb_idx(vb->va->va_start);
718 	spin_lock(&vmap_block_tree_lock);
719 	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
720 	spin_unlock(&vmap_block_tree_lock);
721 	BUG_ON(tmp != vb);
722 
723 	free_unmap_vmap_area(vb->va);
724 	call_rcu(&vb->rcu_head, rcu_free_vb);
725 }
726 
727 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
728 {
729 	struct vmap_block_queue *vbq;
730 	struct vmap_block *vb;
731 	unsigned long addr = 0;
732 	unsigned int order;
733 
734 	BUG_ON(size & ~PAGE_MASK);
735 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
736 	order = get_order(size);
737 
738 again:
739 	rcu_read_lock();
740 	vbq = &get_cpu_var(vmap_block_queue);
741 	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
742 		int i;
743 
744 		spin_lock(&vb->lock);
745 		i = bitmap_find_free_region(vb->alloc_map,
746 						VMAP_BBMAP_BITS, order);
747 
748 		if (i >= 0) {
749 			addr = vb->va->va_start + (i << PAGE_SHIFT);
750 			BUG_ON(addr_to_vb_idx(addr) !=
751 					addr_to_vb_idx(vb->va->va_start));
752 			vb->free -= 1UL << order;
753 			if (vb->free == 0) {
754 				spin_lock(&vbq->lock);
755 				list_del_init(&vb->free_list);
756 				spin_unlock(&vbq->lock);
757 			}
758 			spin_unlock(&vb->lock);
759 			break;
760 		}
761 		spin_unlock(&vb->lock);
762 	}
763 	put_cpu_var(vmap_block_queue);
764 	rcu_read_unlock();
765 
766 	if (!addr) {
767 		vb = new_vmap_block(gfp_mask);
768 		if (IS_ERR(vb))
769 			return vb;
770 		goto again;
771 	}
772 
773 	return (void *)addr;
774 }
775 
776 static void vb_free(const void *addr, unsigned long size)
777 {
778 	unsigned long offset;
779 	unsigned long vb_idx;
780 	unsigned int order;
781 	struct vmap_block *vb;
782 
783 	BUG_ON(size & ~PAGE_MASK);
784 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
785 	order = get_order(size);
786 
787 	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
788 
789 	vb_idx = addr_to_vb_idx((unsigned long)addr);
790 	rcu_read_lock();
791 	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
792 	rcu_read_unlock();
793 	BUG_ON(!vb);
794 
795 	spin_lock(&vb->lock);
796 	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
797 	if (!vb->dirty) {
798 		spin_lock(&vb->vbq->lock);
799 		list_add(&vb->dirty_list, &vb->vbq->dirty);
800 		spin_unlock(&vb->vbq->lock);
801 	}
802 	vb->dirty += 1UL << order;
803 	if (vb->dirty == VMAP_BBMAP_BITS) {
804 		BUG_ON(vb->free || !list_empty(&vb->free_list));
805 		spin_unlock(&vb->lock);
806 		free_vmap_block(vb);
807 	} else
808 		spin_unlock(&vb->lock);
809 }
810 
811 /**
812  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
813  *
814  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
815  * to amortize TLB flushing overheads. What this means is that any page you
816  * have now may, in a former life, have been mapped into the kernel virtual
817  * address space by the vmap layer, and so there might be some CPUs with TLB
818  * entries still referencing that page (in addition to the regular 1:1 kernel mapping).
819  *
820  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
821  * be sure that none of the pages we have control over will have any aliases
822  * from the vmap layer.
823  */
824 void vm_unmap_aliases(void)
825 {
826 	unsigned long start = ULONG_MAX, end = 0;
827 	int cpu;
828 	int flush = 0;
829 
830 	for_each_possible_cpu(cpu) {
831 		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
832 		struct vmap_block *vb;
833 
834 		rcu_read_lock();
835 		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
836 			int i;
837 
838 			spin_lock(&vb->lock);
839 			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
840 			while (i < VMAP_BBMAP_BITS) {
841 				unsigned long s, e;
842 				int j;
843 				j = find_next_zero_bit(vb->dirty_map,
844 					VMAP_BBMAP_BITS, i);
845 
846 				s = vb->va->va_start + (i << PAGE_SHIFT);
847 				e = vb->va->va_start + (j << PAGE_SHIFT);
848 				vunmap_page_range(s, e);
849 				flush = 1;
850 
851 				if (s < start)
852 					start = s;
853 				if (e > end)
854 					end = e;
855 
856 				i = j;
857 				i = find_next_bit(vb->dirty_map,
858 							VMAP_BBMAP_BITS, i);
859 			}
860 			spin_unlock(&vb->lock);
861 		}
862 		rcu_read_unlock();
863 	}
864 
865 	__purge_vmap_area_lazy(&start, &end, 1, flush);
866 }
867 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
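
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * code that is about to hand a page to hardware or another domain, or to
 * change its cacheability, might flush lazy vmap aliases first. The function
 * name is hypothetical.
 */
#if 0
static void example_strip_kernel_aliases(struct page *page)
{
	/*
	 * Earlier vm_map_ram()/vmap() users of this page may still be mapped
	 * lazily; make sure no stale kernel aliases (or TLB entries) remain
	 * before the page is used in a way that cannot tolerate them.
	 */
	vm_unmap_aliases();

	/* ... 'page' can now be handed over safely ... */
}
#endif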
868 
869 /**
870  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
871  * @mem: the pointer returned by vm_map_ram
872  * @count: the count passed to that vm_map_ram call (cannot unmap partial)
873  */
874 void vm_unmap_ram(const void *mem, unsigned int count)
875 {
876 	unsigned long size = count << PAGE_SHIFT;
877 	unsigned long addr = (unsigned long)mem;
878 
879 	BUG_ON(!addr);
880 	BUG_ON(addr < VMALLOC_START);
881 	BUG_ON(addr > VMALLOC_END);
882 	BUG_ON(addr & (PAGE_SIZE-1));
883 
884 	debug_check_no_locks_freed(mem, size);
885 
886 	if (likely(count <= VMAP_MAX_ALLOC))
887 		vb_free(mem, size);
888 	else
889 		free_unmap_vmap_area_addr(addr);
890 }
891 EXPORT_SYMBOL(vm_unmap_ram);
892 
893 /**
894  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
895  * @pages: an array of pointers to the pages to be mapped
896  * @count: number of pages
897  * @node: prefer to allocate data structures on this node
898  * @prot: memory protection to use. PAGE_KERNEL for regular RAM
899  * @returns: a pointer to the address that has been mapped, or NULL on failure
900  */
901 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
902 {
903 	unsigned long size = count << PAGE_SHIFT;
904 	unsigned long addr;
905 	void *mem;
906 
907 	if (likely(count <= VMAP_MAX_ALLOC)) {
908 		mem = vb_alloc(size, GFP_KERNEL);
909 		if (IS_ERR(mem))
910 			return NULL;
911 		addr = (unsigned long)mem;
912 	} else {
913 		struct vmap_area *va;
914 		va = alloc_vmap_area(size, PAGE_SIZE,
915 				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
916 		if (IS_ERR(va))
917 			return NULL;
918 
919 		addr = va->va_start;
920 		mem = (void *)addr;
921 	}
922 	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
923 		vm_unmap_ram(mem, count);
924 		return NULL;
925 	}
926 	return mem;
927 }
928 EXPORT_SYMBOL(vm_map_ram);
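
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * the intended pairing of vm_map_ram() and vm_unmap_ram() for a short-lived
 * mapping of a small page array. The function name is hypothetical.
 */
#if 0
static int example_vm_map_ram_usage(struct page **pages, unsigned int count)
{
	void *addr;

	addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
	if (!addr)
		return -ENOMEM;

	memset(addr, 0, (unsigned long)count << PAGE_SHIFT);	/* use the mapping */

	vm_unmap_ram(addr, count);	/* must pass the same count as vm_map_ram() */
	return 0;
}
#endif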
929 
930 void __init vmalloc_init(void)
931 {
932 	int i;
933 
934 	for_each_possible_cpu(i) {
935 		struct vmap_block_queue *vbq;
936 
937 		vbq = &per_cpu(vmap_block_queue, i);
938 		spin_lock_init(&vbq->lock);
939 		INIT_LIST_HEAD(&vbq->free);
940 		INIT_LIST_HEAD(&vbq->dirty);
941 		vbq->nr_dirty = 0;
942 	}
943 }
944 
945 void unmap_kernel_range(unsigned long addr, unsigned long size)
946 {
947 	unsigned long end = addr + size;
948 	vunmap_page_range(addr, end);
949 	flush_tlb_kernel_range(addr, end);
950 }
951 
952 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
953 {
954 	unsigned long addr = (unsigned long)area->addr;
955 	unsigned long end = addr + area->size - PAGE_SIZE;
956 	int err;
957 
958 	err = vmap_page_range(addr, end, prot, *pages);
959 	if (err > 0) {
960 		*pages += err;
961 		err = 0;
962 	}
963 
964 	return err;
965 }
966 EXPORT_SYMBOL_GPL(map_vm_area);
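
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * reserving an area with get_vm_area() and populating it with map_vm_area(),
 * roughly the pattern vmap() itself follows. Note that map_vm_area()
 * advances the page array pointer it is given, so a copy is passed. The
 * function name is hypothetical.
 */
#if 0
static void *example_map_page_array(struct page **pages, unsigned int count)
{
	struct vm_struct *area;
	struct page **tmp = pages;

	area = get_vm_area((unsigned long)count << PAGE_SHIFT, VM_MAP);
	if (!area)
		return NULL;

	if (map_vm_area(area, PAGE_KERNEL, &tmp)) {
		vunmap(area->addr);	/* tears down the partial mapping */
		return NULL;
	}
	return area->addr;
}
#endif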
967 
968 /*** Old vmalloc interfaces ***/
969 DEFINE_RWLOCK(vmlist_lock);
970 struct vm_struct *vmlist;
971 
972 static struct vm_struct *__get_vm_area_node(unsigned long size,
973 		unsigned long flags, unsigned long start, unsigned long end,
974 		int node, gfp_t gfp_mask, void *caller)
975 {
976 	struct vmap_area *va;
977 	struct vm_struct *area;
978 	struct vm_struct *tmp, **p;
979 	unsigned long align = 1;
980 
981 	BUG_ON(in_interrupt());
982 	if (flags & VM_IOREMAP) {
983 		int bit = fls(size);
984 
985 		if (bit > IOREMAP_MAX_ORDER)
986 			bit = IOREMAP_MAX_ORDER;
987 		else if (bit < PAGE_SHIFT)
988 			bit = PAGE_SHIFT;
989 
990 		align = 1ul << bit;
991 	}
992 
993 	size = PAGE_ALIGN(size);
994 	if (unlikely(!size))
995 		return NULL;
996 
997 	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
998 	if (unlikely(!area))
999 		return NULL;
1000 
1001 	/*
1002 	 * We always allocate a guard page.
1003 	 */
1004 	size += PAGE_SIZE;
1005 
1006 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1007 	if (IS_ERR(va)) {
1008 		kfree(area);
1009 		return NULL;
1010 	}
1011 
1012 	area->flags = flags;
1013 	area->addr = (void *)va->va_start;
1014 	area->size = size;
1015 	area->pages = NULL;
1016 	area->nr_pages = 0;
1017 	area->phys_addr = 0;
1018 	area->caller = caller;
1019 	va->private = area;
1020 	va->flags |= VM_VM_AREA;
1021 
1022 	write_lock(&vmlist_lock);
1023 	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1024 		if (tmp->addr >= area->addr)
1025 			break;
1026 	}
1027 	area->next = *p;
1028 	*p = area;
1029 	write_unlock(&vmlist_lock);
1030 
1031 	return area;
1032 }
1033 
1034 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1035 				unsigned long start, unsigned long end)
1036 {
1037 	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
1038 						__builtin_return_address(0));
1039 }
1040 EXPORT_SYMBOL_GPL(__get_vm_area);
1041 
1042 /**
1043  *	get_vm_area  -  reserve a contiguous kernel virtual area
1044  *	@size:		size of the area
1045  *	@flags:		%VM_IOREMAP for I/O mappings or VM_ALLOC
1046  *
1047  *	Search an area of @size in the kernel virtual mapping area,
1048  *	and reserves it for our purposes.  Returns the area descriptor
1049  *	on success or %NULL on failure.
1050  */
1051 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1052 {
1053 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1054 				-1, GFP_KERNEL, __builtin_return_address(0));
1055 }
1056 
1057 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1058 				void *caller)
1059 {
1060 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
1061 						-1, GFP_KERNEL, caller);
1062 }
1063 
1064 struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1065 				   int node, gfp_t gfp_mask)
1066 {
1067 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
1068 				  gfp_mask, __builtin_return_address(0));
1069 }
1070 
1071 static struct vm_struct *find_vm_area(const void *addr)
1072 {
1073 	struct vmap_area *va;
1074 
1075 	va = find_vmap_area((unsigned long)addr);
1076 	if (va && va->flags & VM_VM_AREA)
1077 		return va->private;
1078 
1079 	return NULL;
1080 }
1081 
1082 /**
1083  *	remove_vm_area  -  find and remove a contiguous kernel virtual area
1084  *	@addr:		base address
1085  *
1086  *	Search for the kernel VM area starting at @addr, and remove it.
1087  *	This function returns the found VM area, but using it is NOT safe
1088  *	on SMP machines, except for its size or flags.
1089  */
1090 struct vm_struct *remove_vm_area(const void *addr)
1091 {
1092 	struct vmap_area *va;
1093 
1094 	va = find_vmap_area((unsigned long)addr);
1095 	if (va && va->flags & VM_VM_AREA) {
1096 		struct vm_struct *vm = va->private;
1097 		struct vm_struct *tmp, **p;
1098 		free_unmap_vmap_area(va);
1099 		vm->size -= PAGE_SIZE;
1100 
1101 		write_lock(&vmlist_lock);
1102 		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1103 			;
1104 		*p = tmp->next;
1105 		write_unlock(&vmlist_lock);
1106 
1107 		return vm;
1108 	}
1109 	return NULL;
1110 }
1111 
1112 static void __vunmap(const void *addr, int deallocate_pages)
1113 {
1114 	struct vm_struct *area;
1115 
1116 	if (!addr)
1117 		return;
1118 
1119 	if ((PAGE_SIZE-1) & (unsigned long)addr) {
1120 		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
1121 		return;
1122 	}
1123 
1124 	area = remove_vm_area(addr);
1125 	if (unlikely(!area)) {
1126 		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1127 				addr);
1128 		return;
1129 	}
1130 
1131 	debug_check_no_locks_freed(addr, area->size);
1132 	debug_check_no_obj_freed(addr, area->size);
1133 
1134 	if (deallocate_pages) {
1135 		int i;
1136 
1137 		for (i = 0; i < area->nr_pages; i++) {
1138 			struct page *page = area->pages[i];
1139 
1140 			BUG_ON(!page);
1141 			__free_page(page);
1142 		}
1143 
1144 		if (area->flags & VM_VPAGES)
1145 			vfree(area->pages);
1146 		else
1147 			kfree(area->pages);
1148 	}
1149 
1150 	kfree(area);
1151 	return;
1152 }
1153 
1154 /**
1155  *	vfree  -  release memory allocated by vmalloc()
1156  *	@addr:		memory base address
1157  *
1158  *	Free the virtually contiguous memory area starting at @addr, as
1159  *	obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1160  *	NULL, no operation is performed.
1161  *
1162  *	Must not be called in interrupt context.
1163  */
1164 void vfree(const void *addr)
1165 {
1166 	BUG_ON(in_interrupt());
1167 	__vunmap(addr, 1);
1168 }
1169 EXPORT_SYMBOL(vfree);
1170 
1171 /**
1172  *	vunmap  -  release virtual mapping obtained by vmap()
1173  *	@addr:		memory base address
1174  *
1175  *	Free the virtually contiguous memory area starting at @addr,
1176  *	which was created from the page array passed to vmap().
1177  *
1178  *	Must not be called in interrupt context.
1179  */
1180 void vunmap(const void *addr)
1181 {
1182 	BUG_ON(in_interrupt());
1183 	__vunmap(addr, 0);
1184 }
1185 EXPORT_SYMBOL(vunmap);
1186 
1187 /**
1188  *	vmap  -  map an array of pages into virtually contiguous space
1189  *	@pages:		array of page pointers
1190  *	@count:		number of pages to map
1191  *	@flags:		vm_area->flags
1192  *	@prot:		page protection for the mapping
1193  *
1194  *	Maps @count pages from @pages into contiguous kernel virtual
1195  *	space.
1196  */
1197 void *vmap(struct page **pages, unsigned int count,
1198 		unsigned long flags, pgprot_t prot)
1199 {
1200 	struct vm_struct *area;
1201 
1202 	if (count > num_physpages)
1203 		return NULL;
1204 
1205 	area = get_vm_area_caller((count << PAGE_SHIFT), flags,
1206 					__builtin_return_address(0));
1207 	if (!area)
1208 		return NULL;
1209 
1210 	if (map_vm_area(area, prot, &pages)) {
1211 		vunmap(area->addr);
1212 		return NULL;
1213 	}
1214 
1215 	return area->addr;
1216 }
1217 EXPORT_SYMBOL(vmap);
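
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * presenting an array of (possibly discontiguous) pages as one contiguous
 * kernel buffer with vmap(), then tearing the mapping down with vunmap().
 * The function name is hypothetical.
 */
#if 0
static void example_vmap_usage(struct page **pages, unsigned int count)
{
	void *addr = vmap(pages, count, VM_MAP, PAGE_KERNEL);

	if (!addr)
		return;

	/* ... access all pages through one virtually contiguous address ... */

	vunmap(addr);	/* releases the mapping, not the pages themselves */
}
#endif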
1218 
1219 static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1220 			    int node, void *caller);
1221 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1222 				 pgprot_t prot, int node, void *caller)
1223 {
1224 	struct page **pages;
1225 	unsigned int nr_pages, array_size, i;
1226 
1227 	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1228 	array_size = (nr_pages * sizeof(struct page *));
1229 
1230 	area->nr_pages = nr_pages;
1231 	/* Please note that the recursion is strictly bounded. */
1232 	if (array_size > PAGE_SIZE) {
1233 		pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
1234 				PAGE_KERNEL, node, caller);
1235 		area->flags |= VM_VPAGES;
1236 	} else {
1237 		pages = kmalloc_node(array_size,
1238 				(gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1239 				node);
1240 	}
1241 	area->pages = pages;
1242 	area->caller = caller;
1243 	if (!area->pages) {
1244 		remove_vm_area(area->addr);
1245 		kfree(area);
1246 		return NULL;
1247 	}
1248 
1249 	for (i = 0; i < area->nr_pages; i++) {
1250 		struct page *page;
1251 
1252 		if (node < 0)
1253 			page = alloc_page(gfp_mask);
1254 		else
1255 			page = alloc_pages_node(node, gfp_mask, 0);
1256 
1257 		if (unlikely(!page)) {
1258 			/* Successfully allocated i pages, free them in __vunmap() */
1259 			area->nr_pages = i;
1260 			goto fail;
1261 		}
1262 		area->pages[i] = page;
1263 	}
1264 
1265 	if (map_vm_area(area, prot, &pages))
1266 		goto fail;
1267 	return area->addr;
1268 
1269 fail:
1270 	vfree(area->addr);
1271 	return NULL;
1272 }
1273 
1274 void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1275 {
1276 	return __vmalloc_area_node(area, gfp_mask, prot, -1,
1277 					__builtin_return_address(0));
1278 }
1279 
1280 /**
1281  *	__vmalloc_node  -  allocate virtually contiguous memory
1282  *	@size:		allocation size
1283  *	@gfp_mask:	flags for the page level allocator
1284  *	@prot:		protection mask for the allocated pages
1285  *	@node:		node to use for allocation or -1
1286  *	@caller:	caller's return address
1287  *
1288  *	Allocate enough pages to cover @size from the page level
1289  *	allocator with @gfp_mask flags.  Map them into contiguous
1290  *	kernel virtual space, using a pagetable protection of @prot.
1291  */
1292 static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1293 						int node, void *caller)
1294 {
1295 	struct vm_struct *area;
1296 
1297 	size = PAGE_ALIGN(size);
1298 	if (!size || (size >> PAGE_SHIFT) > num_physpages)
1299 		return NULL;
1300 
1301 	area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
1302 						node, gfp_mask, caller);
1303 
1304 	if (!area)
1305 		return NULL;
1306 
1307 	return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1308 }
1309 
1310 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1311 {
1312 	return __vmalloc_node(size, gfp_mask, prot, -1,
1313 				__builtin_return_address(0));
1314 }
1315 EXPORT_SYMBOL(__vmalloc);
1316 
1317 /**
1318  *	vmalloc  -  allocate virtually contiguous memory
1319  *	@size:		allocation size
1320  *	Allocate enough pages to cover @size from the page level
1321  *	allocator and map them into contiguous kernel virtual space.
1322  *
1323  *	For tight control over page level allocator and protection flags
1324  *	use __vmalloc() instead.
1325  */
1326 void *vmalloc(unsigned long size)
1327 {
1328 	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1329 					-1, __builtin_return_address(0));
1330 }
1331 EXPORT_SYMBOL(vmalloc);
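
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * the usual vmalloc()/vfree() pairing for a large buffer that only needs to
 * be virtually contiguous. The function name is hypothetical.
 */
#if 0
static int example_vmalloc_usage(void)
{
	char *buf = vmalloc(1 << 20);	/* 1MB, backed by order-0 pages */

	if (!buf)
		return -ENOMEM;

	memset(buf, 0, 1 << 20);
	/* ... */
	vfree(buf);	/* must not be called from interrupt context */
	return 0;
}
#endif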
1332 
1333 /**
1334  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1335  * @size: allocation size
1336  *
1337  * The resulting memory area is zeroed so it can be mapped to userspace
1338  * without leaking data.
1339  */
1340 void *vmalloc_user(unsigned long size)
1341 {
1342 	struct vm_struct *area;
1343 	void *ret;
1344 
1345 	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
1346 	if (ret) {
1347 		area = find_vm_area(ret);
1348 		area->flags |= VM_USERMAP;
1349 	}
1350 	return ret;
1351 }
1352 EXPORT_SYMBOL(vmalloc_user);
1353 
1354 /**
1355  *	vmalloc_node  -  allocate memory on a specific node
1356  *	@size:		allocation size
1357  *	@node:		numa node
1358  *
1359  *	Allocate enough pages to cover @size from the page level
1360  *	allocator and map them into contiguous kernel virtual space.
1361  *
1362  *	For tight control over page level allocator and protection flags
1363  *	use __vmalloc() instead.
1364  */
1365 void *vmalloc_node(unsigned long size, int node)
1366 {
1367 	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1368 					node, __builtin_return_address(0));
1369 }
1370 EXPORT_SYMBOL(vmalloc_node);
1371 
1372 #ifndef PAGE_KERNEL_EXEC
1373 # define PAGE_KERNEL_EXEC PAGE_KERNEL
1374 #endif
1375 
1376 /**
1377  *	vmalloc_exec  -  allocate virtually contiguous, executable memory
1378  *	@size:		allocation size
1379  *
1380  *	Kernel-internal function to allocate enough pages to cover @size from
1381  *	the page level allocator and map them into contiguous and
1382  *	executable kernel virtual space.
1383  *
1384  *	For tight control over page level allocator and protection flags
1385  *	use __vmalloc() instead.
1386  */
1387 
1388 void *vmalloc_exec(unsigned long size)
1389 {
1390 	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
1391 }
1392 
1393 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
1394 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
1395 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
1396 #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
1397 #else
1398 #define GFP_VMALLOC32 GFP_KERNEL
1399 #endif
1400 
1401 /**
1402  *	vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
1403  *	@size:		allocation size
1404  *
1405  *	Allocate enough 32bit PA addressable pages to cover @size from the
1406  *	page level allocator and map them into contiguous kernel virtual space.
1407  */
1408 void *vmalloc_32(unsigned long size)
1409 {
1410 	return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL);
1411 }
1412 EXPORT_SYMBOL(vmalloc_32);
1413 
1414 /**
1415  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1416  *	@size:		allocation size
1417  *
1418  * The resulting memory area is 32bit addressable and zeroed so it can be
1419  * mapped to userspace without leaking data.
1420  */
1421 void *vmalloc_32_user(unsigned long size)
1422 {
1423 	struct vm_struct *area;
1424 	void *ret;
1425 
1426 	ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
1427 	if (ret) {
1428 		area = find_vm_area(ret);
1429 		area->flags |= VM_USERMAP;
1430 	}
1431 	return ret;
1432 }
1433 EXPORT_SYMBOL(vmalloc_32_user);
1434 
1435 long vread(char *buf, char *addr, unsigned long count)
1436 {
1437 	struct vm_struct *tmp;
1438 	char *vaddr, *buf_start = buf;
1439 	unsigned long n;
1440 
1441 	/* Don't allow overflow */
1442 	if ((unsigned long) addr + count < count)
1443 		count = -(unsigned long) addr;
1444 
1445 	read_lock(&vmlist_lock);
1446 	for (tmp = vmlist; tmp; tmp = tmp->next) {
1447 		vaddr = (char *) tmp->addr;
1448 		if (addr >= vaddr + tmp->size - PAGE_SIZE)
1449 			continue;
1450 		while (addr < vaddr) {
1451 			if (count == 0)
1452 				goto finished;
1453 			*buf = '\0';
1454 			buf++;
1455 			addr++;
1456 			count--;
1457 		}
1458 		n = vaddr + tmp->size - PAGE_SIZE - addr;
1459 		do {
1460 			if (count == 0)
1461 				goto finished;
1462 			*buf = *addr;
1463 			buf++;
1464 			addr++;
1465 			count--;
1466 		} while (--n > 0);
1467 	}
1468 finished:
1469 	read_unlock(&vmlist_lock);
1470 	return buf - buf_start;
1471 }
1472 
1473 long vwrite(char *buf, char *addr, unsigned long count)
1474 {
1475 	struct vm_struct *tmp;
1476 	char *vaddr, *buf_start = buf;
1477 	unsigned long n;
1478 
1479 	/* Don't allow overflow */
1480 	if ((unsigned long) addr + count < count)
1481 		count = -(unsigned long) addr;
1482 
1483 	read_lock(&vmlist_lock);
1484 	for (tmp = vmlist; tmp; tmp = tmp->next) {
1485 		vaddr = (char *) tmp->addr;
1486 		if (addr >= vaddr + tmp->size - PAGE_SIZE)
1487 			continue;
1488 		while (addr < vaddr) {
1489 			if (count == 0)
1490 				goto finished;
1491 			buf++;
1492 			addr++;
1493 			count--;
1494 		}
1495 		n = vaddr + tmp->size - PAGE_SIZE - addr;
1496 		do {
1497 			if (count == 0)
1498 				goto finished;
1499 			*addr = *buf;
1500 			buf++;
1501 			addr++;
1502 			count--;
1503 		} while (--n > 0);
1504 	}
1505 finished:
1506 	read_unlock(&vmlist_lock);
1507 	return buf - buf_start;
1508 }
1509 
1510 /**
1511  *	remap_vmalloc_range  -  map vmalloc pages to userspace
1512  *	@vma:		vma to cover (map full range of vma)
1513  *	@addr:		vmalloc memory
1514  *	@pgoff:		number of pages into addr before first page to map
1515  *
1516  *	Returns:	0 for success, -Exxx on failure
1517  *
1518  *	This function checks that addr is a valid vmalloc'ed area, and
1519  *	that it is big enough to cover the vma. Will return failure if
1520  *	that criterion isn't met.
1521  *
1522  *	Similar to remap_pfn_range() (see mm/memory.c)
1523  */
1524 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1525 						unsigned long pgoff)
1526 {
1527 	struct vm_struct *area;
1528 	unsigned long uaddr = vma->vm_start;
1529 	unsigned long usize = vma->vm_end - vma->vm_start;
1530 
1531 	if ((PAGE_SIZE-1) & (unsigned long)addr)
1532 		return -EINVAL;
1533 
1534 	area = find_vm_area(addr);
1535 	if (!area)
1536 		return -EINVAL;
1537 
1538 	if (!(area->flags & VM_USERMAP))
1539 		return -EINVAL;
1540 
1541 	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
1542 		return -EINVAL;
1543 
1544 	addr += pgoff << PAGE_SHIFT;
1545 	do {
1546 		struct page *page = vmalloc_to_page(addr);
1547 		int ret;
1548 
1549 		ret = vm_insert_page(vma, uaddr, page);
1550 		if (ret)
1551 			return ret;
1552 
1553 		uaddr += PAGE_SIZE;
1554 		addr += PAGE_SIZE;
1555 		usize -= PAGE_SIZE;
1556 	} while (usize > 0);
1557 
1558 	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
1559 	vma->vm_flags |= VM_RESERVED;
1560 
1561 	return 0;
1562 }
1563 EXPORT_SYMBOL(remap_vmalloc_range);
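
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * a driver ->mmap() handler exporting a buffer that was allocated with
 * vmalloc_user() (so VM_USERMAP is set) to userspace. "example_buf" and the
 * handler name are hypothetical.
 */
#if 0
static void *example_buf;	/* assumed to come from vmalloc_user() */

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* map the whole buffer, starting at its first page */
	return remap_vmalloc_range(vma, example_buf, 0);
}
#endif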
1564 
1565 /*
1566  * Implement a stub for vmalloc_sync_all() if the architecture chose not to
1567  * have one.
1568  */
1569 void  __attribute__((weak)) vmalloc_sync_all(void)
1570 {
1571 }
1572 
1573 
1574 static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
1575 {
1576 	/* apply_to_page_range() does all the hard work. */
1577 	return 0;
1578 }
1579 
1580 /**
1581  *	alloc_vm_area - allocate a range of kernel address space
1582  *	@size:		size of the area
1583  *
1584  *	Returns:	NULL on failure, vm_struct on success
1585  *
1586  *	This function reserves a range of kernel address space, and
1587  *	allocates pagetables to map that range.  No actual mappings
1588  *	are created.  If the kernel address space is not shared
1589  *	between processes, it syncs the pagetable across all
1590  *	processes.
1591  */
1592 struct vm_struct *alloc_vm_area(size_t size)
1593 {
1594 	struct vm_struct *area;
1595 
1596 	area = get_vm_area_caller(size, VM_IOREMAP,
1597 				__builtin_return_address(0));
1598 	if (area == NULL)
1599 		return NULL;
1600 
1601 	/*
1602 	 * This ensures that page tables are constructed for this region
1603 	 * of kernel virtual address space and mapped into init_mm.
1604 	 */
1605 	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
1606 				area->size, f, NULL)) {
1607 		free_vm_area(area);
1608 		return NULL;
1609 	}
1610 
1611 	/* Make sure the pagetables are constructed in process kernel
1612 	   mappings */
1613 	vmalloc_sync_all();
1614 
1615 	return area;
1616 }
1617 EXPORT_SYMBOL_GPL(alloc_vm_area);
1618 
1619 void free_vm_area(struct vm_struct *area)
1620 {
1621 	struct vm_struct *ret;
1622 	ret = remove_vm_area(area->addr);
1623 	BUG_ON(ret != area);
1624 	kfree(area);
1625 }
1626 EXPORT_SYMBOL_GPL(free_vm_area);
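
/*
 * Example (added for illustration, guarded by #if 0 so it is never built):
 * reserving a window of kernel address space with alloc_vm_area() so that an
 * external agent (e.g. a hypervisor) can later install mappings into it, and
 * releasing it again with free_vm_area(). The function names are
 * hypothetical.
 */
#if 0
static struct vm_struct *example_reserve_window(size_t size)
{
	struct vm_struct *area = alloc_vm_area(size);

	if (!area)
		return NULL;
	/*
	 * Page tables now exist for [area->addr, area->addr + size) but no
	 * pages are mapped; the caller would populate the ptes here.
	 */
	return area;
}

static void example_release_window(struct vm_struct *area)
{
	free_vm_area(area);
}
#endif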
1627 
1628 
1629 #ifdef CONFIG_PROC_FS
1630 static void *s_start(struct seq_file *m, loff_t *pos)
1631 {
1632 	loff_t n = *pos;
1633 	struct vm_struct *v;
1634 
1635 	read_lock(&vmlist_lock);
1636 	v = vmlist;
1637 	while (n > 0 && v) {
1638 		n--;
1639 		v = v->next;
1640 	}
1641 	if (!n)
1642 		return v;
1643 
1644 	return NULL;
1645 
1646 }
1647 
1648 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
1649 {
1650 	struct vm_struct *v = p;
1651 
1652 	++*pos;
1653 	return v->next;
1654 }
1655 
1656 static void s_stop(struct seq_file *m, void *p)
1657 {
1658 	read_unlock(&vmlist_lock);
1659 }
1660 
1661 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
1662 {
1663 	if (NUMA_BUILD) {
1664 		unsigned int nr, *counters = m->private;
1665 
1666 		if (!counters)
1667 			return;
1668 
1669 		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
1670 
1671 		for (nr = 0; nr < v->nr_pages; nr++)
1672 			counters[page_to_nid(v->pages[nr])]++;
1673 
1674 		for_each_node_state(nr, N_HIGH_MEMORY)
1675 			if (counters[nr])
1676 				seq_printf(m, " N%u=%u", nr, counters[nr]);
1677 	}
1678 }
1679 
1680 static int s_show(struct seq_file *m, void *p)
1681 {
1682 	struct vm_struct *v = p;
1683 
1684 	seq_printf(m, "0x%p-0x%p %7ld",
1685 		v->addr, v->addr + v->size, v->size);
1686 
1687 	if (v->caller) {
1688 		char buff[2 * KSYM_NAME_LEN];
1689 
1690 		seq_putc(m, ' ');
1691 		sprint_symbol(buff, (unsigned long)v->caller);
1692 		seq_puts(m, buff);
1693 	}
1694 
1695 	if (v->nr_pages)
1696 		seq_printf(m, " pages=%d", v->nr_pages);
1697 
1698 	if (v->phys_addr)
1699 		seq_printf(m, " phys=%lx", v->phys_addr);
1700 
1701 	if (v->flags & VM_IOREMAP)
1702 		seq_printf(m, " ioremap");
1703 
1704 	if (v->flags & VM_ALLOC)
1705 		seq_printf(m, " vmalloc");
1706 
1707 	if (v->flags & VM_MAP)
1708 		seq_printf(m, " vmap");
1709 
1710 	if (v->flags & VM_USERMAP)
1711 		seq_printf(m, " user");
1712 
1713 	if (v->flags & VM_VPAGES)
1714 		seq_printf(m, " vpages");
1715 
1716 	show_numa_info(m, v);
1717 	seq_putc(m, '\n');
1718 	return 0;
1719 }
1720 
1721 const struct seq_operations vmalloc_op = {
1722 	.start = s_start,
1723 	.next = s_next,
1724 	.stop = s_stop,
1725 	.show = s_show,
1726 };
1727 #endif
1728 
1729