/*
 * linux/mm/vmalloc.c
 *
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

bool vmap_lazy_unmap __read_mostly = true;

/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_pud_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

static int vmap_page_range(unsigned long start, unsigned long end,
			   pgprot_t prot, struct page **pages)
{
	int ret;

	ret = vmap_page_range_noflush(start, end, prot, pages);
	flush_cache_vmap(start, end);
	return ret;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (!pgd_none(*pgd)) {
		pud_t *pud = pud_offset(pgd, addr);
		if (!pud_none(*pud)) {
			pmd_t *pmd = pmd_offset(pud, addr);
			if (!pmd_none(*pmd)) {
				pte_t *ptep, pte;

				ptep = pte_offset_map(pmd, addr);
				pte = *ptep;
				if (pte_present(pte))
					page = pte_page(pte);
				pte_unmap(ptep);
			}
		}
	}
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
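
/*
 * Illustrative sketch (not part of the original file): walking the pages
 * behind a vmalloc'ed buffer, e.g. to inspect its backing frames.  The
 * buffer "buf" and the page count used here are hypothetical.
 *
 *	void *buf = vmalloc(4 * PAGE_SIZE);
 *	unsigned int i;
 *
 *	for (i = 0; buf && i < 4; i++) {
 *		struct page *pg = vmalloc_to_page(buf + i * PAGE_SIZE);
 *		unsigned long pfn = vmalloc_to_pfn(buf + i * PAGE_SIZE);
 *		pr_info("page %u -> pfn %lu (%p)\n", i, pfn, pg);
 *	}
 *	vfree(buf);
 */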

/*** Global kva allocator ***/

#define VM_LAZY_FREE	0x01
#define VM_LAZY_FREEING	0x02
#define VM_VM_AREA	0x04

struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;
	unsigned long flags;
	struct rb_node rb_node;		/* address sorted rbtree */
	struct list_head list;		/* address sorted list */
	struct list_head purge_list;	/* "lazy purge" list */
	void *private;
	struct rcu_head rcu_head;
};

static DEFINE_SPINLOCK(vmap_area_lock);
static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
static unsigned long vmap_area_pcpu_hole;

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr > va->va_start)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	while (*p) {
		struct vmap_area *tmp;

		parent = *p;
		tmp = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list so it is usable like the vmlist */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		list_add_rcu(&va->list, &vmap_area_list);
}

static void purge_vmap_area_lazy(void);

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;

	BUG_ON(!size);
	BUG_ON(size & ~PAGE_MASK);

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	addr = ALIGN(vstart, align);

	spin_lock(&vmap_area_lock);
	if (addr + size - 1 < addr)
		goto overflow;

	/* XXX: could have a last_hole cache */
	n = vmap_area_root.rb_node;
	if (n) {
		struct vmap_area *first = NULL;

		do {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				if (!first && tmp->va_start < addr + size)
					first = tmp;
				n = n->rb_left;
			} else {
				first = tmp;
				n = n->rb_right;
			}
		} while (n);

		if (!first)
			goto found;

		if (first->va_end < addr) {
			n = rb_next(&first->rb_node);
			if (n)
				first = rb_entry(n, struct vmap_area, rb_node);
			else
				goto found;
		}

		while (addr + size > first->va_start && addr + size <= vend) {
			addr = ALIGN(first->va_end + PAGE_SIZE, align);
			if (addr + size - 1 < addr)
				goto overflow;

			n = rb_next(&first->rb_node);
			if (n)
				first = rb_entry(n, struct vmap_area, rb_node);
			else
				goto found;
		}
	}
found:
	if (addr + size > vend) {
overflow:
		spin_unlock(&vmap_area_lock);
		if (!purged) {
			purge_vmap_area_lazy();
			purged = 1;
			goto retry;
		}
		if (printk_ratelimit())
			printk(KERN_WARNING
				"vmap allocation for size %lu failed: "
				"use vmalloc=<size> to increase size.\n", size);
		kfree(va);
		return ERR_PTR(-EBUSY);
	}

	BUG_ON(addr & (align-1));

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	spin_unlock(&vmap_area_lock);

	return va;
}

static void rcu_free_va(struct rcu_head *head)
{
	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);

	kfree(va);
}

static void __free_vmap_area(struct vmap_area *va)
{
	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
	rb_erase(&va->rb_node, &vmap_area_root);
	RB_CLEAR_NODE(&va->rb_node);
	list_del_rcu(&va->list);

	/*
	 * Track the highest possible candidate for pcpu area
	 * allocation.  Areas outside of vmalloc area can be returned
	 * here too, consider only end addresses which fall inside
	 * vmalloc area proper.
	 */
	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
		vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);

	call_rcu(&va->rcu_head, rcu_free_va);
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	spin_lock(&vmap_area_lock);
	__free_vmap_area(va);
	spin_unlock(&vmap_area_lock);
}

/*
 * Clear the pagetable entries of a given vmap_area
 */
static void unmap_vmap_area(struct vmap_area *va)
{
	vunmap_page_range(va->va_start, va->va_end);
}

static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
	/*
	 * Unmap page tables and force a TLB flush immediately if
	 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
	 * bugs similarly to those in linear kernel virtual address
	 * space after a page has been freed.
	 *
	 * All the lazy freeing logic is still retained, in order to
	 * minimise intrusiveness of this debugging feature.
	 *
	 * This is going to be *slow* (linear kernel virtual address
	 * debugging doesn't do a broadcast TLB flush so it is a lot
	 * faster).
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	vunmap_page_range(start, end);
	flush_tlb_kernel_range(start, end);
#endif
}

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	if (!vmap_lazy_unmap)
		return 0;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
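
/*
 * Worked example (illustrative, assuming 4K pages): each factor-of-two step
 * in online CPU count adds another 32MB (8192 pages) of lazily held address
 * space.  With 8 online CPUs, fls(8) == 4, so up to 4 * 8192 = 32768 pages
 * (128MB) of vmap space may stay unmapped-but-unflushed before a purge and
 * global TLB flush are forced.
 */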

static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * Purges all lazily-freed vmap areas.
 *
 * If sync is 0 then don't purge if there is already a purge in progress.
 * If force_flush is 1, then flush kernel TLBs between *start and *end even
 * if we found no lazy vmap areas to unmap (callers can use this to optimise
 * their own TLB flushing).
 * Returns with *start = min(*start, lowest purged address)
 *              *end = max(*end, highest purged address)
 */
static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
					int sync, int force_flush)
{
	static DEFINE_SPINLOCK(purge_lock);
	LIST_HEAD(valist);
	struct vmap_area *va;
	struct vmap_area *n_va;
	int nr = 0;

	/*
	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
	 * should not expect such behaviour. This just simplifies locking for
	 * the case that isn't actually used at the moment anyway.
	 */
	if (!sync && !force_flush) {
		if (!spin_trylock(&purge_lock))
			return;
	} else
		spin_lock(&purge_lock);

	if (sync)
		purge_fragmented_blocks_allcpus();

	rcu_read_lock();
	list_for_each_entry_rcu(va, &vmap_area_list, list) {
		if (va->flags & VM_LAZY_FREE) {
			if (va->va_start < *start)
				*start = va->va_start;
			if (va->va_end > *end)
				*end = va->va_end;
			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
			unmap_vmap_area(va);
			list_add_tail(&va->purge_list, &valist);
			va->flags |= VM_LAZY_FREEING;
			va->flags &= ~VM_LAZY_FREE;
		}
	}
	rcu_read_unlock();

	if (nr)
		atomic_sub(nr, &vmap_lazy_nr);

	if (nr || force_flush)
		flush_tlb_kernel_range(*start, *end);

	if (nr) {
		spin_lock(&vmap_area_lock);
		list_for_each_entry_safe(va, n_va, &valist, purge_list)
			__free_vmap_area(va);
		spin_unlock(&vmap_area_lock);
	}
	spin_unlock(&purge_lock);
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	unsigned long start = ULONG_MAX, end = 0;

	__purge_vmap_area_lazy(&start, &end, 0, 0);
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	unsigned long start = ULONG_MAX, end = 0;

	__purge_vmap_area_lazy(&start, &end, 1, 0);
}

/*
 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
 * called for the correct range previously.
 */
static void free_unmap_vmap_area_noflush(struct vmap_area *va)
{
	va->flags |= VM_LAZY_FREE;
	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	free_unmap_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

static void free_unmap_vmap_area_addr(unsigned long addr)
{
	struct vmap_area *va;

	va = find_vmap_area(addr);
	BUG_ON(!va);
	free_unmap_vmap_area(va);
}


/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE	(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
						VMALLOC_PAGES / NR_CPUS / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
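
/*
 * Worked example (illustrative): on a 32-bit box with 4K pages and
 * NR_CPUS == 4, VMALLOC_PAGES is 128MB/4K = 32768, so VMALLOC_PAGES/4/16
 * = 512 bits, which already lies between VMAP_BBMAP_BITS_MIN (64) and
 * VMAP_BBMAP_BITS_MAX (1024).  Each vmap block then covers 512 pages,
 * i.e. VMAP_BLOCK_SIZE = 2MB of kernel virtual address space.
 */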

static bool vmap_initialized __read_mostly = false;

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	struct vmap_block_queue *vbq;
	unsigned long free, dirty;
	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_SPINLOCK(vmap_block_tree_lock);
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (unlikely(IS_ERR(va))) {
		kfree(vb);
		return ERR_CAST(va);
	}

	err = radix_tree_preload(gfp_mask);
	if (unlikely(err)) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	spin_lock_init(&vb->lock);
	vb->va = va;
	vb->free = VMAP_BBMAP_BITS;
	vb->dirty = 0;
	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	spin_lock(&vmap_block_tree_lock);
	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(err);
	radix_tree_preload_end();

	vbq = &get_cpu_var(vmap_block_queue);
	vb->vbq = vbq;
	spin_lock(&vbq->lock);
	list_add_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vb;
}

static void rcu_free_vb(struct rcu_head *head)
{
	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);

	kfree(vb);
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;
	unsigned long vb_idx;

	vb_idx = addr_to_vb_idx(vb->va->va_start);
	spin_lock(&vmap_block_tree_lock);
	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(tmp != vb);

	free_unmap_vmap_area_noflush(vb->va);
	call_rcu(&vb->rcu_head, rcu_free_vb);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
			bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_thiscpu(void)
{
	purge_fragmented_blocks(smp_processor_id());
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	unsigned long addr = 0;
	unsigned int order;
	int purge = 0;

	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	order = get_order(size);

again:
	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		int i;

		spin_lock(&vb->lock);
		if (vb->free < 1UL << order)
			goto next;

		i = bitmap_find_free_region(vb->alloc_map,
						VMAP_BBMAP_BITS, order);

		if (i < 0) {
			if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
				/* fragmented and no outstanding allocations */
				BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
				purge = 1;
			}
			goto next;
		}
		addr = vb->va->va_start + (i << PAGE_SHIFT);
		BUG_ON(addr_to_vb_idx(addr) !=
				addr_to_vb_idx(vb->va->va_start));
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}
		spin_unlock(&vb->lock);
		break;
next:
		spin_unlock(&vb->lock);
	}

	if (purge)
		purge_fragmented_blocks_thiscpu();

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	if (!addr) {
		vb = new_vmap_block(gfp_mask);
		if (IS_ERR(vb))
			return vb;
		goto again;
	}

	return (void *)addr;
}

static void vb_free(const void *addr, unsigned long size)
{
	unsigned long offset;
	unsigned long vb_idx;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);

	order = get_order(size);

	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);

	vb_idx = addr_to_vb_idx((unsigned long)addr);
	rcu_read_lock();
	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
	rcu_read_unlock();
	BUG_ON(!vb);

	spin_lock(&vb->lock);
	BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address space by the vmap layer and so there might be some CPUs with TLB
 * entries still referencing that page (additional to the regular 1:1 kernel
 * mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
	unsigned long start = ULONG_MAX, end = 0;
	int cpu;
	int flush = 0;

	if (unlikely(!vmap_initialized))
		return;

	for_each_possible_cpu(cpu) {
		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
		struct vmap_block *vb;

		rcu_read_lock();
		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
			int i;

			spin_lock(&vb->lock);
			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
			while (i < VMAP_BBMAP_BITS) {
				unsigned long s, e;
				int j;
				j = find_next_zero_bit(vb->dirty_map,
					VMAP_BBMAP_BITS, i);

				s = vb->va->va_start + (i << PAGE_SHIFT);
				e = vb->va->va_start + (j << PAGE_SHIFT);
				vunmap_page_range(s, e);
				flush = 1;

				if (s < start)
					start = s;
				if (e > end)
					end = e;

				i = j;
				i = find_next_bit(vb->dirty_map,
					VMAP_BBMAP_BITS, i);
			}
			spin_unlock(&vb->lock);
		}
		rcu_read_unlock();
	}

	__purge_vmap_area_lazy(&start, &end, 1, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
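
/*
 * Illustrative sketch (hypothetical caller): code that is about to hand
 * pages to hardware or change their attributes, and that must not leave
 * stale kernel aliases behind, can force the lazy unmaps out first:
 *
 *	vm_unmap_ram(vaddr, nr);	(lazy: TLB entries may survive)
 *	vm_unmap_aliases();		(now no CPU holds a stale alias)
 *	... the underlying pages can then be repurposed safely ...
 */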

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	unsigned long size = count << PAGE_SHIFT;
	unsigned long addr = (unsigned long)mem;

	BUG_ON(!addr);
	BUG_ON(addr < VMALLOC_START);
	BUG_ON(addr > VMALLOC_END);
	BUG_ON(addr & (PAGE_SIZE-1));

	debug_check_no_locks_freed(mem, size);
	vmap_debug_free_range(addr, addr+size);

	if (likely(count <= VMAP_MAX_ALLOC))
		vb_free(mem, size);
	else
		free_unmap_vmap_area_addr(addr);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
	unsigned long size = count << PAGE_SHIFT;
	unsigned long addr;
	void *mem;

	if (likely(count <= VMAP_MAX_ALLOC)) {
		mem = vb_alloc(size, GFP_KERNEL);
		if (IS_ERR(mem))
			return NULL;
		addr = (unsigned long)mem;
	} else {
		struct vmap_area *va;
		va = alloc_vmap_area(size, PAGE_SIZE,
				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
		if (IS_ERR(va))
			return NULL;

		addr = va->va_start;
		mem = (void *)addr;
	}
	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
		vm_unmap_ram(mem, count);
		return NULL;
	}
	return mem;
}
EXPORT_SYMBOL(vm_map_ram);
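
/*
 * Illustrative sketch (not part of the original file): a transient mapping
 * of a small page array, sized so that the fast per-CPU block allocator is
 * used.  "pages", "nr" and "src" are hypothetical, with nr <= VMAP_MAX_ALLOC.
 *
 *	void *va = vm_map_ram(pages, nr, -1, PAGE_KERNEL);
 *	if (va) {
 *		memcpy(va, src, nr * PAGE_SIZE);
 *		vm_unmap_ram(va, nr);
 *	}
 */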

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
	static size_t vm_init_off __initdata;
	unsigned long addr;

	addr = ALIGN(VMALLOC_START + vm_init_off, align);
	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;

	vm->addr = (void *)addr;

	vm->next = vmlist;
	vmlist = vm;
}

void __init vmalloc_init(void)
{
	struct vmap_area *va;
	struct vm_struct *tmp;
	int i;

	for_each_possible_cpu(i) {
		struct vmap_block_queue *vbq;

		vbq = &per_cpu(vmap_block_queue, i);
		spin_lock_init(&vbq->lock);
		INIT_LIST_HEAD(&vbq->free);
	}

	/* Import existing vmlist entries. */
	for (tmp = vmlist; tmp; tmp = tmp->next) {
		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
		va->flags = tmp->flags | VM_VM_AREA;
		va->va_start = (unsigned long)tmp->addr;
		va->va_end = va->va_start + tmp->size;
		__insert_vmap_area(va);
	}

	vmap_area_pcpu_hole = VMALLOC_END;

	vmap_initialized = true;
}

/**
 * map_kernel_range_noflush - map kernel VM area with the specified pages
 * @addr: start of the VM area to map
 * @size: size of the VM area to map
 * @prot: page protection flags to use
 * @pages: pages to map
 *
 * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 * specify should have been allocated using get_vm_area() and its
 * friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is
 * responsible for calling flush_cache_vmap() on to-be-mapped areas
 * before calling this function.
 *
 * RETURNS:
 * The number of pages mapped on success, -errno on failure.
 */
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
			     pgprot_t prot, struct page **pages)
{
	return vmap_page_range_noflush(addr, addr + size, prot, pages);
}
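
/*
 * Illustrative sketch (hypothetical caller): reserving a VM area and then
 * populating it with map_kernel_range_noflush().  Per the NOTE above, cache
 * flushing is left to the caller; unmap_kernel_range() below shows the
 * flushing the unmap side needs.  "pages" and "nr" are hypothetical.
 *
 *	unsigned long size = nr * PAGE_SIZE;
 *	struct vm_struct *area = get_vm_area(size, VM_MAP);
 *
 *	if (area &&
 *	    map_kernel_range_noflush((unsigned long)area->addr,
 *				     size, PAGE_KERNEL, pages) < 0)
 *		... handle failure ...
 */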

/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 * specify should have been allocated using get_vm_area() and its
 * friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is
 * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
 * before calling this function and flush_tlb_kernel_range() after.
 */
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
	vunmap_page_range(addr, addr + size);
}

/**
 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Similar to unmap_kernel_range_noflush() but flushes vcache before
 * the unmapping and tlb after.
 */
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
	unsigned long end = addr + size;

	flush_cache_vunmap(addr, end);
	vunmap_page_range(addr, end);
	flush_tlb_kernel_range(addr, end);
}

int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
	unsigned long addr = (unsigned long)area->addr;
	unsigned long end = addr + area->size - PAGE_SIZE;
	int err;

	err = vmap_page_range(addr, end, prot, *pages);
	if (err > 0) {
		*pages += err;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(map_vm_area);

/*** Old vmalloc interfaces ***/
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;

static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
			      unsigned long flags, void *caller)
{
	struct vm_struct *tmp, **p;

	vm->flags = flags;
	vm->addr = (void *)va->va_start;
	vm->size = va->va_end - va->va_start;
	vm->caller = caller;
	va->private = vm;
	va->flags |= VM_VM_AREA;

	write_lock(&vmlist_lock);
	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
		if (tmp->addr >= vm->addr)
			break;
	}
	vm->next = *p;
	*p = vm;
	write_unlock(&vmlist_lock);
}

static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	BUG_ON(in_interrupt());
	if (flags & VM_IOREMAP) {
		int bit = fls(size);

		if (bit > IOREMAP_MAX_ORDER)
			bit = IOREMAP_MAX_ORDER;
		else if (bit < PAGE_SHIFT)
			bit = PAGE_SHIFT;

		align = 1ul << bit;
	}

	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/*
	 * We always allocate a guard page.
	 */
	size += PAGE_SIZE;

	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	insert_vmalloc_vm(area, va, flags, caller);
	return area;
}

struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
				unsigned long start, unsigned long end)
{
	return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
				  __builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);

struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
				       unsigned long start, unsigned long end,
				       void *caller)
{
	return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
				  caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size: size of the area
 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes.  Returns the area descriptor
 * on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				  -1, GFP_KERNEL, __builtin_return_address(0));
}

struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
				     void *caller)
{
	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				  -1, GFP_KERNEL, caller);
}

struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
				   int node, gfp_t gfp_mask)
{
	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				  node, gfp_mask, __builtin_return_address(0));
}

static struct vm_struct *find_vm_area(const void *addr)
{
	struct vmap_area *va;

	va = find_vmap_area((unsigned long)addr);
	if (va && va->flags & VM_VM_AREA)
		return va->private;

	return NULL;
}

/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
 * @addr: base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
	struct vmap_area *va;

	va = find_vmap_area((unsigned long)addr);
	if (va && va->flags & VM_VM_AREA) {
		struct vm_struct *vm = va->private;
		struct vm_struct *tmp, **p;
		/*
		 * remove from list and disallow access to this vm_struct
		 * before unmap. (address range conflicts are handled by
		 * the vmap allocator.)
		 */
		write_lock(&vmlist_lock);
		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
			;
		*p = tmp->next;
		write_unlock(&vmlist_lock);

		vmap_debug_free_range(va->va_start, va->va_end);
		free_unmap_vmap_area(va);
		vm->size -= PAGE_SIZE;

		return vm;
	}
	return NULL;
}

static void __vunmap(const void *addr, int deallocate_pages)
{
	struct vm_struct *area;

	if (!addr)
		return;

	if ((PAGE_SIZE-1) & (unsigned long)addr) {
		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
		return;
	}

	area = remove_vm_area(addr);
	if (unlikely(!area)) {
		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
				addr);
		return;
	}

	debug_check_no_locks_freed(addr, area->size);
	debug_check_no_obj_freed(addr, area->size);

	if (deallocate_pages) {
		int i;

		for (i = 0; i < area->nr_pages; i++) {
			struct page *page = area->pages[i];

			BUG_ON(!page);
			__free_page(page);
		}

		if (area->flags & VM_VPAGES)
			vfree(area->pages);
		else
			kfree(area->pages);
	}

	kfree(area);
	return;
}

/**
 * vfree - release memory allocated by vmalloc()
 * @addr: memory base address
 *
 * Free the virtually contiguous memory area starting at @addr, as
 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 * NULL, no operation is performed.
 *
 * Must not be called in interrupt context.
 */
void vfree(const void *addr)
{
	BUG_ON(in_interrupt());

	kmemleak_free(addr);

	__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr: memory base address
 *
 * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
	BUG_ON(in_interrupt());
	might_sleep();
	__vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual
 * space.
 */
void *vmap(struct page **pages, unsigned int count,
		unsigned long flags, pgprot_t prot)
{
	struct vm_struct *area;

	might_sleep();

	if (count > totalram_pages)
		return NULL;

	area = get_vm_area_caller((count << PAGE_SHIFT), flags,
					__builtin_return_address(0));
	if (!area)
		return NULL;

	if (map_vm_area(area, prot, &pages)) {
		vunmap(area->addr);
		return NULL;
	}

	return area->addr;
}
EXPORT_SYMBOL(vmap);
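
/*
 * Illustrative sketch (not part of the original file): mapping an existing
 * page array into one contiguous kernel range with vmap() and tearing it
 * down again.  "pages" and "nr" are hypothetical; note that vunmap() does
 * not free the pages themselves.
 *
 *	void *va = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
 *	if (va) {
 *		... use va[0 .. nr*PAGE_SIZE) ...
 *		vunmap(va);
 *	}
 */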

static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node, void *caller)
{
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;

	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, caller);
		area->flags |= VM_VPAGES;
	} else {
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	area->caller = caller;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		if (node < 0)
			page = alloc_page(gfp_mask);
		else
			page = alloc_pages_node(node, gfp_mask, 0);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
	}

	if (map_vm_area(area, prot, &pages))
		goto fail;
	return area->addr;

fail:
	vfree(area->addr);
	return NULL;
}

void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
{
	void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
					 __builtin_return_address(0));

	/*
	 * A ref_count = 3 is needed because the vm_struct and vmap_area
	 * structures allocated in the __get_vm_area_node() function contain
	 * references to the virtual address of the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);

	return addr;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size: allocation size
 * @align: desired alignment
 * @gfp_mask: flags for the page level allocator
 * @prot: protection mask for the allocated pages
 * @node: node to use for allocation or -1
 * @caller: caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags.  Map them into contiguous
 * kernel virtual space, using a pagetable protection of @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, void *caller)
{
	struct vm_struct *area;
	void *addr;
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		return NULL;

	area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
				  VMALLOC_END, node, gfp_mask, caller);

	if (!area)
		return NULL;

	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);

	/*
	 * A ref_count = 3 is needed because the vm_struct and vmap_area
	 * structures allocated in the __get_vm_area_node() function contain
	 * references to the virtual address of the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, real_size, 3, gfp_mask);

	return addr;
}

void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
	return __vmalloc_node(size, 1, gfp_mask, prot, -1,
				__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
					-1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
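
/*
 * Illustrative sketch (not part of the original file): the classic
 * vmalloc()/vfree() pairing for a large, virtually contiguous buffer that
 * does not need to be physically contiguous.  "nbytes" is hypothetical.
 *
 *	void *table = vmalloc(nbytes);
 *	if (!table)
 *		return -ENOMEM;
 *	... use table ...
 *	vfree(table);
 */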

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The resulting memory area is zeroed so it can be mapped to userspace
 * without leaking data.
 */
void *vmalloc_user(unsigned long size)
{
	struct vm_struct *area;
	void *ret;

	ret = __vmalloc_node(size, SHMLBA,
			     GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			     PAGE_KERNEL, -1, __builtin_return_address(0));
	if (ret) {
		area = find_vm_area(ret);
		area->flags |= VM_USERMAP;
	}
	return ret;
}
EXPORT_SYMBOL(vmalloc_user);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size: allocation size
 * @node: numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc_node(unsigned long size, int node)
{
	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
					node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/**
 * vmalloc_exec - allocate virtually contiguous, executable memory
 * @size: allocation size
 *
 * Kernel-internal function to allocate enough pages to cover @size
 * from the page level allocator and map them into contiguous and
 * executable kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc_exec(unsigned long size)
{
	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
			      -1, __builtin_return_address(0));
}

#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
#else
#define GFP_VMALLOC32 GFP_KERNEL
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size: allocation size
 *
 * Allocate enough 32bit PA addressable pages to cover @size from the
 * page level allocator and map them into contiguous kernel virtual space.
 */
void *vmalloc_32(unsigned long size)
{
	return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
			      -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size: allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 */
void *vmalloc_32_user(unsigned long size)
{
	struct vm_struct *area;
	void *ret;

	ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
			     -1, __builtin_return_address(0));
	if (ret) {
		area = find_vm_area(ret);
		area->flags |= VM_USERMAP;
	}
	return ret;
}
EXPORT_SYMBOL(vmalloc_32_user);

/*
 * Small helper routine: copy contents from addr to buf.
 * If the page is not present, fill with zeroes.
 */
static int aligned_vread(char *buf, char *addr, unsigned long count)
{
	struct page *p;
	int copied = 0;

	while (count) {
		unsigned long offset, length;

		offset = (unsigned long)addr & ~PAGE_MASK;
		length = PAGE_SIZE - offset;
		if (length > count)
			length = count;
		p = vmalloc_to_page(addr);
		/*
		 * To do safe access to this _mapped_ area, we need
		 * lock. But adding lock here means that we need to add
		 * overhead of vmalloc()/vfree() calls for this _debug_
		 * interface, rarely used. Instead of that, we'll use
		 * kmap() and get small overhead in this access function.
		 */
		if (p) {
			/*
			 * we can expect USER0 is not used (see vread/vwrite's
			 * function description)
			 */
			void *map = kmap_atomic(p, KM_USER0);
			memcpy(buf, map + offset, length);
			kunmap_atomic(map, KM_USER0);
		} else
			memset(buf, 0, length);

		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

static int aligned_vwrite(char *buf, char *addr, unsigned long count)
{
	struct page *p;
	int copied = 0;

	while (count) {
		unsigned long offset, length;

		offset = (unsigned long)addr & ~PAGE_MASK;
		length = PAGE_SIZE - offset;
		if (length > count)
			length = count;
		p = vmalloc_to_page(addr);
		/*
		 * To do safe access to this _mapped_ area, we need
		 * lock. But adding lock here means that we need to add
		 * overhead of vmalloc()/vfree() calls for this _debug_
		 * interface, rarely used. Instead of that, we'll use
		 * kmap() and get small overhead in this access function.
		 */
		if (p) {
			/*
			 * we can expect USER0 is not used (see vread/vwrite's
			 * function description)
			 */
			void *map = kmap_atomic(p, KM_USER0);
			memcpy(map + offset, buf, length);
			kunmap_atomic(map, KM_USER0);
		}
		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

/**
 * vread() - read vmalloc area in a safe way.
 * @buf: buffer for reading data
 * @addr: vm address.
 * @count: number of bytes to be read.
 *
 * Returns the number of bytes by which @addr and @buf should be increased
 * (the same number as @count).  Returns 0 if [addr...addr+count) does not
 * intersect any live vmalloc area.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copies data from that area to a given buffer. If the given memory range
 * of [addr...addr+count) includes some valid address, data is copied to
 * proper area of @buf. If there are memory holes, they'll be zero-filled.
 * IOREMAP area is treated as memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't intersect any live vm_struct area,
 * returns 0.
 * @buf should be kernel's buffer. Because this function uses KM_USER0,
 * the caller should guarantee KM_USER0 is not used.
 *
 * Note: In usual ops, vread() is never necessary because the caller
 * should know the vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, such as /dev/kmem.
 */
long vread(char *buf, char *addr, unsigned long count)
{
	struct vm_struct *tmp;
	char *vaddr, *buf_start = buf;
	unsigned long buflen = count;
	unsigned long n;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	read_lock(&vmlist_lock);
	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
		vaddr = (char *) tmp->addr;
		if (addr >= vaddr + tmp->size - PAGE_SIZE)
			continue;
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			*buf = '\0';
			buf++;
			addr++;
			count--;
		}
		n = vaddr + tmp->size - PAGE_SIZE - addr;
		if (n > count)
			n = count;
		if (!(tmp->flags & VM_IOREMAP))
			aligned_vread(buf, addr, n);
		else /* IOREMAP area is treated as memory hole */
			memset(buf, 0, n);
		buf += n;
		addr += n;
		count -= n;
	}
finished:
	read_unlock(&vmlist_lock);

	if (buf == buf_start)
		return 0;
	/* zero-fill memory holes */
	if (buf != buf_start + buflen)
		memset(buf, 0, buflen - (buf - buf_start));

	return buflen;
}
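
/*
 * Illustrative sketch (hypothetical caller, in the spirit of /dev/kmem):
 * copying from a kernel virtual address that may or may not be backed by a
 * live vmalloc area.  "kbuf", "kaddr" and "len" are hypothetical.
 *
 *	long n = vread(kbuf, kaddr, len);
 *	if (!n)
 *		... [kaddr, kaddr+len) hit no vmalloc area at all ...
 *	else
 *		... kbuf now holds len bytes, holes zero-filled ...
 */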

/**
 * vwrite() - write vmalloc area in a safe way.
 * @buf: buffer for source data
 * @addr: vm address.
 * @count: number of bytes to be written.
 *
 * Returns the number of bytes by which @addr and @buf should be increased
 * (the same number as @count).
 * If [addr...addr+count) doesn't intersect any valid vmalloc area,
 * returns 0.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copies data from a buffer to the given addr. If the specified range of
 * [addr...addr+count) includes some valid address, data is copied from
 * proper area of @buf. If there are memory holes, no copy to hole.
 * IOREMAP area is treated as memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't intersect any live vm_struct area,
 * returns 0.
 * @buf should be kernel's buffer. Because this function uses KM_USER0,
 * the caller should guarantee KM_USER0 is not used.
 *
 * Note: In usual ops, vwrite() is never necessary because the caller
 * should know the vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, such as /dev/kmem.
 *
 * The caller should guarantee KM_USER1 is not used.
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
	struct vm_struct *tmp;
	char *vaddr;
	unsigned long n, buflen;
	int copied = 0;

	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;
	buflen = count;

	read_lock(&vmlist_lock);
	for (tmp = vmlist; count && tmp; tmp = tmp->next) {
		vaddr = (char *) tmp->addr;
		if (addr >= vaddr + tmp->size - PAGE_SIZE)
			continue;
		while (addr < vaddr) {
			if (count == 0)
				goto finished;
			buf++;
			addr++;
			count--;
		}
		n = vaddr + tmp->size - PAGE_SIZE - addr;
		if (n > count)
			n = count;
		if (!(tmp->flags & VM_IOREMAP)) {
			aligned_vwrite(buf, addr, n);
			copied++;
		}
		buf += n;
		addr += n;
		count -= n;
	}
finished:
	read_unlock(&vmlist_lock);
	if (!copied)
		return 0;
	return buflen;
}

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma: vma to cover (map full range of vma)
 * @addr: vmalloc memory
 * @pgoff: number of pages into addr before first page to map
 *
 * Returns: 0 for success, -Exxx on failure
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * that it is big enough to cover the vma. Will return failure if
 * that criteria isn't met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
						unsigned long pgoff)
{
	struct vm_struct *area;
	unsigned long uaddr = vma->vm_start;
	unsigned long usize = vma->vm_end - vma->vm_start;

	if ((PAGE_SIZE-1) & (unsigned long)addr)
		return -EINVAL;

	area = find_vm_area(addr);
	if (!area)
		return -EINVAL;

	if (!(area->flags & VM_USERMAP))
		return -EINVAL;

	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
		return -EINVAL;

	addr += pgoff << PAGE_SHIFT;
	do {
		struct page *page = vmalloc_to_page(addr);
		int ret;

		ret = vm_insert_page(vma, uaddr, page);
		if (ret)
			return ret;

		uaddr += PAGE_SIZE;
		addr += PAGE_SIZE;
		usize -= PAGE_SIZE;
	} while (usize > 0);

	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
	vma->vm_flags |= VM_RESERVED;

	return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range);
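
/*
 * Illustrative sketch (hypothetical driver): exporting a vmalloc_user()
 * buffer to userspace from an mmap file operation.  "my_buf" is a
 * hypothetical buffer allocated earlier with vmalloc_user(), which sets
 * VM_USERMAP as required by remap_vmalloc_range().
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, my_buf, 0);
 *	}
 */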
/*
 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 * have one.
 */
void __attribute__((weak)) vmalloc_sync_all(void)
{
}


static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
	/* apply_to_page_range() does all the hard work. */
	return 0;
}

/**
 * alloc_vm_area - allocate a range of kernel address space
 * @size:	size of the area
 *
 * Returns:	NULL on failure, vm_struct on success
 *
 * This function reserves a range of kernel address space, and
 * allocates pagetables to map that range.  No actual mappings
 * are created.  If the kernel address space is not shared
 * between processes, it syncs the pagetable across all
 * processes.
 */
struct vm_struct *alloc_vm_area(size_t size)
{
	struct vm_struct *area;

	area = get_vm_area_caller(size, VM_IOREMAP,
				__builtin_return_address(0));
	if (area == NULL)
		return NULL;

	/*
	 * This ensures that page tables are constructed for this region
	 * of kernel virtual address space and mapped into init_mm.
	 */
	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
				area->size, f, NULL)) {
		free_vm_area(area);
		return NULL;
	}

	/* Make sure the pagetables are constructed in process kernel
	   mappings */
	vmalloc_sync_all();

	return area;
}
EXPORT_SYMBOL_GPL(alloc_vm_area);

void free_vm_area(struct vm_struct *area)
{
	struct vm_struct *ret;
	ret = remove_vm_area(area->addr);
	BUG_ON(ret != area);
	kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);
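/*
 * Illustrative sketch (not part of the original file): reserving a chunk of
 * kernel address space with alloc_vm_area(), using its ->addr, and releasing
 * it again.  Something else (for instance a hypervisor call, as in the Xen
 * grant table code) would normally populate the real mappings in between.
 * example_reserve_kva is a hypothetical name.
 */
#if 0
static int example_reserve_kva(void)
{
	struct vm_struct *area;

	area = alloc_vm_area(4 * PAGE_SIZE);	/* page tables get allocated */
	if (!area)
		return -ENOMEM;

	/* ... hand area->addr to whatever establishes the actual mappings ... */

	free_vm_area(area);			/* tear the reservation down */
	return 0;
}
#endif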
static struct vmap_area *node_to_va(struct rb_node *n)
{
	return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
}

/**
 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
 * @end: target address
 * @pnext: out arg for the next vmap_area
 * @pprev: out arg for the previous vmap_area
 *
 * Returns: %true if either or both of next and prev are found,
 *	    %false if no vmap_area exists
 *
 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
 */
static bool pvm_find_next_prev(unsigned long end,
			       struct vmap_area **pnext,
			       struct vmap_area **pprev)
{
	struct rb_node *n = vmap_area_root.rb_node;
	struct vmap_area *va = NULL;

	while (n) {
		va = rb_entry(n, struct vmap_area, rb_node);
		if (end < va->va_end)
			n = n->rb_left;
		else if (end > va->va_end)
			n = n->rb_right;
		else
			break;
	}

	if (!va)
		return false;

	if (va->va_end > end) {
		*pnext = va;
		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
	} else {
		*pprev = va;
		*pnext = node_to_va(rb_next(&(*pprev)->rb_node));
	}
	return true;
}

/**
 * pvm_determine_end - find the highest aligned address between two vmap_areas
 * @pnext: in/out arg for the next vmap_area
 * @pprev: in/out arg for the previous vmap_area
 * @align: alignment
 *
 * Returns: determined end address
 *
 * Find the highest aligned address between *@pnext and *@pprev below
 * VMALLOC_END.  *@pnext and *@pprev are adjusted so that the aligned
 * down address is between the end addresses of the two vmap_areas.
 *
 * Please note that the address returned by this function may fall
 * inside the *@pnext vmap_area.  The caller is responsible for checking
 * that.
 */
static unsigned long pvm_determine_end(struct vmap_area **pnext,
				       struct vmap_area **pprev,
				       unsigned long align)
{
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	unsigned long addr;

	if (*pnext)
		addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
	else
		addr = vmalloc_end;

	while (*pprev && (*pprev)->va_end > addr) {
		*pnext = *pprev;
		*pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
	}

	return addr;
}
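/*
 * Illustrative note (not part of the original file): pvm_determine_end()
 * above relies on @align being a power of two (pcpu_get_vm_areas() below
 * checks this with is_power_of_2()), so "x & ~(align - 1)" rounds x down to
 * an @align boundary.  Worked example with align = 0x10000:
 *
 *	0x12345678 & ~(0x10000 - 1) == 0x12345678 & ~0xffff == 0x12340000
 *
 * This is how VMALLOC_END and next->va_start are pulled down to the highest
 * usable aligned end address.
 */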
/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 * @gfp_mask: allocation mask
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *	    vm_structs on success, %NULL on failure
 *
 * The percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it.  These areas tend to be scattered
 * pretty far apart, with the distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these areas
 * are allocated from the top.
 *
 * Despite its complicated look, this allocator is rather simple.  It
 * does everything top-down and scans areas from the end looking for a
 * matching slot.  While scanning, if any of the areas overlaps with an
 * existing vmap_area, the base address is pulled down to fit the
 * area.  Scanning is repeated until all the areas fit and then all
 * necessary data structures are inserted and the result is returned.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
				     const size_t *sizes, int nr_vms,
				     size_t align, gfp_t gfp_mask)
{
	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	struct vmap_area **vas, *prev, *next;
	struct vm_struct **vms;
	int area, area2, last_area, term_area;
	unsigned long base, start, end, last_end;
	bool purged = false;

	gfp_mask &= GFP_RECLAIM_MASK;

	/* verify parameters and allocate data structures */
	BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
	for (last_area = 0, area = 0; area < nr_vms; area++) {
		start = offsets[area];
		end = start + sizes[area];

		/* is everything aligned properly? */
		BUG_ON(!IS_ALIGNED(offsets[area], align));
		BUG_ON(!IS_ALIGNED(sizes[area], align));

		/* detect the area with the highest address */
		if (start > offsets[last_area])
			last_area = area;

		for (area2 = 0; area2 < nr_vms; area2++) {
			unsigned long start2 = offsets[area2];
			unsigned long end2 = start2 + sizes[area2];

			if (area2 == area)
				continue;

			BUG_ON(start2 >= start && start2 < end);
			BUG_ON(end2 <= end && end2 > start);
		}
	}
	last_end = offsets[last_area] + sizes[last_area];

	if (vmalloc_end - vmalloc_start < last_end) {
		WARN_ON(true);
		return NULL;
	}

	vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
	vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
	if (!vas || !vms)
		goto err_free;

	for (area = 0; area < nr_vms; area++) {
		vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
		vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
		if (!vas[area] || !vms[area])
			goto err_free;
	}
retry:
	spin_lock(&vmap_area_lock);

	/* start scanning - we scan from the top, begin with the last area */
	area = term_area = last_area;
	start = offsets[area];
	end = start + sizes[area];

	if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
		base = vmalloc_end - last_end;
		goto found;
	}
	base = pvm_determine_end(&next, &prev, align) - end;

	while (true) {
		BUG_ON(next && next->va_end <= base + end);
		BUG_ON(prev && prev->va_end > base + end);

		/*
		 * base might have underflowed, add last_end before
		 * comparing.
		 */
		if (base + last_end < vmalloc_start + last_end) {
			spin_unlock(&vmap_area_lock);
			if (!purged) {
				purge_vmap_area_lazy();
				purged = true;
				goto retry;
			}
			goto err_free;
		}

		/*
		 * If next overlaps, move base downwards so that it's
		 * right below next and then recheck.
		 */
		if (next && next->va_start < base + end) {
			base = pvm_determine_end(&next, &prev, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * If prev overlaps, shift down next and prev and move
		 * base so that it's right below new next and then
		 * recheck.
		 */
		if (prev && prev->va_end > base + start) {
			next = prev;
			prev = node_to_va(rb_prev(&next->rb_node));
			base = pvm_determine_end(&next, &prev, align) - end;
			term_area = area;
			continue;
		}
2282 */ 2283 area = (area + nr_vms - 1) % nr_vms; 2284 if (area == term_area) 2285 break; 2286 start = offsets[area]; 2287 end = start + sizes[area]; 2288 pvm_find_next_prev(base + end, &next, &prev); 2289 } 2290 found: 2291 /* we've found a fitting base, insert all va's */ 2292 for (area = 0; area < nr_vms; area++) { 2293 struct vmap_area *va = vas[area]; 2294 2295 va->va_start = base + offsets[area]; 2296 va->va_end = va->va_start + sizes[area]; 2297 __insert_vmap_area(va); 2298 } 2299 2300 vmap_area_pcpu_hole = base + offsets[last_area]; 2301 2302 spin_unlock(&vmap_area_lock); 2303 2304 /* insert all vm's */ 2305 for (area = 0; area < nr_vms; area++) 2306 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2307 pcpu_get_vm_areas); 2308 2309 kfree(vas); 2310 return vms; 2311 2312 err_free: 2313 for (area = 0; area < nr_vms; area++) { 2314 if (vas) 2315 kfree(vas[area]); 2316 if (vms) 2317 kfree(vms[area]); 2318 } 2319 kfree(vas); 2320 kfree(vms); 2321 return NULL; 2322 } 2323 2324 /** 2325 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 2326 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 2327 * @nr_vms: the number of allocated areas 2328 * 2329 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 2330 */ 2331 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 2332 { 2333 int i; 2334 2335 for (i = 0; i < nr_vms; i++) 2336 free_vm_area(vms[i]); 2337 kfree(vms); 2338 } 2339 2340 #ifdef CONFIG_PROC_FS 2341 static void *s_start(struct seq_file *m, loff_t *pos) 2342 { 2343 loff_t n = *pos; 2344 struct vm_struct *v; 2345 2346 read_lock(&vmlist_lock); 2347 v = vmlist; 2348 while (n > 0 && v) { 2349 n--; 2350 v = v->next; 2351 } 2352 if (!n) 2353 return v; 2354 2355 return NULL; 2356 2357 } 2358 2359 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2360 { 2361 struct vm_struct *v = p; 2362 2363 ++*pos; 2364 return v->next; 2365 } 2366 2367 static void s_stop(struct seq_file *m, void *p) 2368 { 2369 read_unlock(&vmlist_lock); 2370 } 2371 2372 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2373 { 2374 if (NUMA_BUILD) { 2375 unsigned int nr, *counters = m->private; 2376 2377 if (!counters) 2378 return; 2379 2380 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2381 2382 for (nr = 0; nr < v->nr_pages; nr++) 2383 counters[page_to_nid(v->pages[nr])]++; 2384 2385 for_each_node_state(nr, N_HIGH_MEMORY) 2386 if (counters[nr]) 2387 seq_printf(m, " N%u=%u", nr, counters[nr]); 2388 } 2389 } 2390 2391 static int s_show(struct seq_file *m, void *p) 2392 { 2393 struct vm_struct *v = p; 2394 2395 seq_printf(m, "0x%p-0x%p %7ld", 2396 v->addr, v->addr + v->size, v->size); 2397 2398 if (v->caller) { 2399 char buff[KSYM_SYMBOL_LEN]; 2400 2401 seq_putc(m, ' '); 2402 sprint_symbol(buff, (unsigned long)v->caller); 2403 seq_puts(m, buff); 2404 } 2405 2406 if (v->nr_pages) 2407 seq_printf(m, " pages=%d", v->nr_pages); 2408 2409 if (v->phys_addr) 2410 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); 2411 2412 if (v->flags & VM_IOREMAP) 2413 seq_printf(m, " ioremap"); 2414 2415 if (v->flags & VM_ALLOC) 2416 seq_printf(m, " vmalloc"); 2417 2418 if (v->flags & VM_MAP) 2419 seq_printf(m, " vmap"); 2420 2421 if (v->flags & VM_USERMAP) 2422 seq_printf(m, " user"); 2423 2424 if (v->flags & VM_VPAGES) 2425 seq_printf(m, " vpages"); 2426 2427 show_numa_info(m, v); 2428 seq_putc(m, '\n'); 2429 return 0; 2430 } 2431 2432 static const struct seq_operations vmalloc_op = { 2433 .start = s_start, 2434 .next = s_next, 2435 .stop = 
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
{
	loff_t n = *pos;
	struct vm_struct *v;

	read_lock(&vmlist_lock);
	v = vmlist;
	while (n > 0 && v) {
		n--;
		v = v->next;
	}
	if (!n)
		return v;

	return NULL;
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	struct vm_struct *v = p;

	++*pos;
	return v->next;
}

static void s_stop(struct seq_file *m, void *p)
{
	read_unlock(&vmlist_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
	if (NUMA_BUILD) {
		unsigned int nr, *counters = m->private;

		if (!counters)
			return;

		memset(counters, 0, nr_node_ids * sizeof(unsigned int));

		for (nr = 0; nr < v->nr_pages; nr++)
			counters[page_to_nid(v->pages[nr])]++;

		for_each_node_state(nr, N_HIGH_MEMORY)
			if (counters[nr])
				seq_printf(m, " N%u=%u", nr, counters[nr]);
	}
}

static int s_show(struct seq_file *m, void *p)
{
	struct vm_struct *v = p;

	seq_printf(m, "0x%p-0x%p %7ld",
		v->addr, v->addr + v->size, v->size);

	if (v->caller) {
		char buff[KSYM_SYMBOL_LEN];

		seq_putc(m, ' ');
		sprint_symbol(buff, (unsigned long)v->caller);
		seq_puts(m, buff);
	}

	if (v->nr_pages)
		seq_printf(m, " pages=%d", v->nr_pages);

	if (v->phys_addr)
		seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);

	if (v->flags & VM_IOREMAP)
		seq_printf(m, " ioremap");

	if (v->flags & VM_ALLOC)
		seq_printf(m, " vmalloc");

	if (v->flags & VM_MAP)
		seq_printf(m, " vmap");

	if (v->flags & VM_USERMAP)
		seq_printf(m, " user");

	if (v->flags & VM_VPAGES)
		seq_printf(m, " vpages");

	show_numa_info(m, v);
	seq_putc(m, '\n');
	return 0;
}

static const struct seq_operations vmalloc_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

static int vmalloc_open(struct inode *inode, struct file *file)
{
	unsigned int *ptr = NULL;
	int ret;

	if (NUMA_BUILD) {
		ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
		if (ptr == NULL)
			return -ENOMEM;
	}
	ret = seq_open(file, &vmalloc_op);
	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = ptr;
	} else
		kfree(ptr);
	return ret;
}

static const struct file_operations proc_vmalloc_operations = {
	.open		= vmalloc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

static int __init proc_vmalloc_init(void)
{
	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
	return 0;
}
module_init(proc_vmalloc_init);
#endif