// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

static int __init set_nohugeiomap(char *str)
{
	ioremap_max_page_shift = PAGE_SHIFT;
	return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}
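
/*
 * Note on the deferred-free machinery above (summary added for clarity):
 * freeing a vmalloc area may need to sleep (it takes mutexes and may purge
 * lazily freed ranges), so callers that cannot sleep push the address onto
 * the per-CPU vfree_deferred llist instead and let free_work() run the
 * actual __vunmap() from workqueue context.  Illustrative sketch of the
 * deferral step (not a verbatim copy of the real caller):
 *
 *	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
 *
 *	if (llist_add((struct llist_node *)addr, &p->list))
 *		schedule_work(&p->wq);
 */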

/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	unsigned long size = PAGE_SIZE;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		BUG_ON(!pte_none(*pte));

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = pte_mkhuge(entry);
			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
			return -ENOMEM;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}
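
/*
 * Illustrative note (added): a huge leaf entry is only attempted when the
 * architecture supports it for @prot, the remaining extent exactly equals
 * the leaf size, and both the virtual and the physical address are aligned
 * to it.  For example, with 4K base pages on x86-64, mapping a 2MB-aligned
 * phys_addr at a 2MB-aligned addr for exactly PMD_SIZE (2MB) lets
 * vmap_try_huge_pmd() install a single PMD entry instead of 512 PTEs.
 */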

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int ioremap_page_range(unsigned long addr, unsigned long end,
		phys_addr_t phys_addr, pgprot_t prot)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
				 ioremap_max_page_shift);
	flush_cache_vmap(addr, end);
	return err;
}
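
/*
 * Usage sketch (added for illustration; a simplified version of how a
 * generic ioremap() caller drives this helper — error handling omitted):
 *
 *	struct vm_struct *area;
 *	unsigned long vaddr;
 *
 *	area = get_vm_area_caller(size, VM_IOREMAP,
 *				  __builtin_return_address(0));
 *	vaddr = (unsigned long)area->addr;
 *	if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
 *			       pgprot_noncached(PAGE_KERNEL)))
 *		free_vm_area(area);
 */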

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int cleared;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		cleared = p4d_clear_huge(p4d);
		if (cleared || p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (cleared)
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}
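
/*
 * Added note: the unmap path is always the three-step sequence shown in
 * vunmap_range() above — flush_cache_vunmap() while the mapping still
 * exists, then tear down the page tables, then flush_tlb_kernel_range().
 * Callers that batch or delay TLB flushes (see the lazy purge machinery
 * later in this file) must still preserve that ordering.
 */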

static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					__pa(page_address(pages[i])), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	flush_cache_vmap(addr, end);
	return err;
}
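
/*
 * Worked example (added): with 4K base pages and page_shift == PMD_SHIFT
 * (21 on x86-64, i.e. 2MB mappings), vmap_pages_range_noflush() above steps
 * through the pages array 512 entries at a time and maps each physically
 * contiguous 2MB group with a single call to vmap_range_noflush(), which can
 * then install one PMD-level leaf entry instead of 512 individual PTEs.
 */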

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
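
/*
 * Usage sketch (added; illustrative only): callers commonly use
 * vmalloc_to_page() to collect the pages backing a vmalloc'ed buffer,
 * for instance to hand them to the scatterlist/DMA layer:
 *
 *	void *buf = vmalloc(nr_pages * PAGE_SIZE);
 *
 *	for (i = 0; i < nr_pages; i++)
 *		pages[i] = vmalloc_to_page(buf + i * PAGE_SIZE);
 */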

/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

static struct rb_root purge_vmap_area_root = RB_ROOT;
static LIST_HEAD(purge_vmap_area_list);
static DEFINE_SPINLOCK(purge_vmap_area_lock);

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster, especially in the "no edge" splitting of
 * a free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used together with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and for merging when a vmap
 * object is released.
 *
 * Each vmap_area node stores the maximum free-block size available
 * in its sub-tree, left or right. Therefore it is possible to
 * find the lowest-address free area that satisfies a request.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from atomic context, thus to
 * use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
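
/*
 * Summary (added for clarity, derived from the declarations above): three
 * tree/list pairs track kernel virtual address space in this file.
 * vmap_area_root/vmap_area_list hold busy (allocated) areas under
 * vmap_area_lock; free_vmap_area_root/free_vmap_area_list hold free space,
 * augmented with subtree_max_size, under free_vmap_area_lock; and
 * purge_vmap_area_root/purge_vmap_area_list hold lazily freed areas that
 * are waiting for a deferred TLB flush, under purge_vmap_area_lock.
 */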

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Gets called when a node is removed from the tree or the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the addresses of the parent node and its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further steps
 * regarding insertion of a conflicting/overlapping range have to
 * be declined; it is actually considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * called "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger a warning if the new range overlaps an existing
		 * one on either side or entirely.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is not in the list yet, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform a simple insertion
		 * into the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented(),
		 * because we populate the tree from the bottom towards
		 * parent levels only once the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything into
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from the bottom towards
 * upper levels, starting from the VA point. The propagation must be
 * done when the VA size is modified by changing its va_start/va_end,
 * or when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - after a VA has been inserted into the tree (free path);
 * - after a VA has been shrunk (allocation path);
 * - after a VA has been increased (merging path).
 *
 * Please note that this does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the way up
 * to the root node every time.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify node 4, shrinking it to 2, then no
 * modification is required. If we shrink node 2 to 1, only its
 * subtree_max_size is updated and set to 1. If we shrink node 8
 * to 6, then its subtree_max_size is set to 6 and the parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from the bottom towards the root until
	 * the calculated maximum available size of the checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with previous
 * and next free blocks. If no coalescing is done, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, accompanied by a WARN() report. Despite this being
 * buggy behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				unlink_va(va, root);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		link_va(va, root, parent, link, head);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = merge_or_add_vmap_area(va, root, head);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that will satisfy the request given by the passed parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that will satisfy the search criteria. It can happen
			 * only once due to the "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}
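
/*
 * Worked example (added): assume a free area with va_start == 0x1000 and
 * va_end == 0x5000.  Then an allocation of [0x1000, 0x5000) is FL_FIT_TYPE
 * (the area is consumed whole), [0x1000, 0x3000) is LE_FIT_TYPE (the left
 * edge is cut off), [0x3000, 0x5000) is RE_FIT_TYPE (the right edge is cut
 * off) and [0x2000, 0x4000) is NE_FIT_TYPE, which splits the area and needs
 * one extra vmap_area object for the left-hand remainder.
 */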

static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any
			 * pre-allocation and leave it as it is. The reason is
			 * that it most likely never ends up with NE_FIT_TYPE
			 * splitting. In case of percpu allocations, offsets
			 * and sizes are aligned to a fixed align request, i.e.
			 * RE_FIT_TYPE and FL_FIT_TYPE are its main fitting
			 * cases.
			 *
			 * There are a few exceptions though; for example the
			 * first allocation (early boot) when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() for why. If so,
			 * GFP_NOWAIT is used instead to get an extra object
			 * for the split. That is rare and most of the time
			 * does not occur.
			 *
			 * What happens if the allocation fails? Basically, an
			 * "overflow" path is triggered to purge lazily freed
			 * areas in order to free some memory, then the "retry"
			 * path repeats the attempt one more time. See more
			 * details in the alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns the start address of the newly allocated area on success.
 * Otherwise "vend" is returned to indicate failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *va = NULL;

	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of a free area is NE_FIT_TYPE. It guarantees
	 * that a CPU that does an allocation is preloaded.
	 *
	 * We do it in non-atomic context, which allows us to use more
	 * permissive allocation masks and therefore to be more stable
	 * under low memory conditions and high memory pressure.
	 */
	if (!this_cpu_read(ne_fit_preload_node))
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
		kmem_cache_free(vmap_area_cachep, va);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
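
/*
 * Usage sketch (added; illustrative only — the callback and helper names
 * below are hypothetical): a subsystem that caches vmap space can register
 * for purge notifications and report how much it released, which lets
 * alloc_vmap_area() retry after the overflow path:
 *
 *	static int my_vmap_shrink(struct notifier_block *nb,
 *				  unsigned long event, void *ptr)
 *	{
 *		unsigned long *freed = ptr;
 *
 *		*freed += my_release_cached_vmap_space();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_vmap_nb = {
 *		.notifier_call = my_vmap_shrink,
 *	};
 *
 *	register_vmap_purge_notifier(&my_vmap_nb);
 */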

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
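
/*
 * Worked example (added): with 4K pages, 32UL * 1024 * 1024 / PAGE_SIZE is
 * 8192 pages (32MB) per log step.  On a 16-CPU machine fls(16) == 5, so up
 * to 5 * 32MB = 160MB of lazily freed virtual space may accumulate before a
 * purge and its global TLB flush are triggered (see vmap_lazy_nr below,
 * which is compared against this threshold in free_vmap_area_noflush()).
 */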

static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

#ifdef CONFIG_X86_64
/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
#endif /* CONFIG_X86_64 */

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct list_head local_purge_list;
	struct vmap_area *va, *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	spin_lock(&purge_vmap_area_lock);
	purge_vmap_area_root = RB_ROOT;
	list_replace_init(&purge_vmap_area_list, &local_purge_list);
	spin_unlock(&purge_vmap_area_lock);

	if (unlikely(list_empty(&local_purge_list)))
		return false;

	start = min(start,
		list_first_entry(&local_purge_list,
			struct vmap_area, list)->va_start);

	end = max(end,
		list_last_entry(&local_purge_list,
			struct vmap_area, list)->va_end);

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
				&free_vmap_area_list);

		if (!va)
			continue;

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/*
	 * Merge or place it to the purge tree/list.
	 */
	spin_lock(&purge_vmap_area_lock);
	merge_or_add_vmap_area(va,
		&purge_vmap_area_root, &purge_vmap_area_list);
	spin_unlock(&purge_vmap_area_lock);

	/* After this point, we may free va at any time */
	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	vunmap_range_noflush(va->va_start, va->va_end);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
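
/*
 * Worked example (added): on a 64-bit kernel with 4K pages and
 * NR_CPUS == 64, VMALLOC_PAGES is 128GB / 4KB = 32M pages, so
 * VMALLOC_PAGES / 64 / 16 = 32768 bits; that is clamped by
 * VMAP_BBMAP_BITS_MAX to 1024, giving VMAP_BLOCK_SIZE = 1024 * 4KB = 4MB
 * per vmap block (matching the "4MB with 4K pages" note above).
 */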

struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course the number of pages can't exceed
 *                  VMAP_BBMAP_BITS.
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vbq = &get_cpu_var(vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_var(vmap_block_queue);

	return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;

	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}
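
/*
 * Added note on the block lifecycle: vb->free counts pages still available
 * to vb_alloc() and vb->dirty counts pages that were handed out and later
 * returned via vb_free().  A block that becomes fully dirty is released
 * right away in vb_free(); a "fragmented" block, where free + dirty covers
 * the whole block but it is not fully dirty (nothing is currently mapped,
 * yet some space was never handed out), is reclaimed by
 * purge_fragmented_blocks() above before a lazy purge.
 */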

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what the caller wants, since
		 * get_order(0) returns a funny result. Just warn and
		 * terminate early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = &get_cpu_var(vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_var(vmap_block_queue);
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	vunmap_range_noflush(addr, addr + size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}
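
/*
 * Added note: vb_free() does not flush the TLB itself (unless
 * debug_pagealloc is on); it only records the freed region in
 * [dirty_min, dirty_max).  The dirty ranges recorded here are picked up
 * and flushed in one batch by _vm_unmap_aliases()/vm_unmap_aliases() below
 * (together with the lazy purge), which is what amortizes global kernel
 * TLB flushes.
 */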
After it returns, we can 2100 * be sure that none of the pages we have control over will have any aliases 2101 * from the vmap layer. 2102 */ 2103 void vm_unmap_aliases(void) 2104 { 2105 unsigned long start = ULONG_MAX, end = 0; 2106 int flush = 0; 2107 2108 _vm_unmap_aliases(start, end, flush); 2109 } 2110 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 2111 2112 /** 2113 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 2114 * @mem: the pointer returned by vm_map_ram 2115 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 2116 */ 2117 void vm_unmap_ram(const void *mem, unsigned int count) 2118 { 2119 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2120 unsigned long addr = (unsigned long)mem; 2121 struct vmap_area *va; 2122 2123 might_sleep(); 2124 BUG_ON(!addr); 2125 BUG_ON(addr < VMALLOC_START); 2126 BUG_ON(addr > VMALLOC_END); 2127 BUG_ON(!PAGE_ALIGNED(addr)); 2128 2129 kasan_poison_vmalloc(mem, size); 2130 2131 if (likely(count <= VMAP_MAX_ALLOC)) { 2132 debug_check_no_locks_freed(mem, size); 2133 vb_free(addr, size); 2134 return; 2135 } 2136 2137 va = find_vmap_area(addr); 2138 BUG_ON(!va); 2139 debug_check_no_locks_freed((void *)va->va_start, 2140 (va->va_end - va->va_start)); 2141 free_unmap_vmap_area(va); 2142 } 2143 EXPORT_SYMBOL(vm_unmap_ram); 2144 2145 /** 2146 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 2147 * @pages: an array of pointers to the pages to be mapped 2148 * @count: number of pages 2149 * @node: prefer to allocate data structures on this node 2150 * 2151 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be 2152 * faster than vmap so it's good. But if you mix long-life and short-life 2153 * objects with vm_map_ram(), it could consume lots of address space through 2154 * fragmentation (especially on a 32bit machine). You could see failures in 2155 * the end. Please use this function for short-lived objects. 
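 *
 * A minimal usage sketch (illustrative only, not a caller in this file;
 * "pages", "nr", "src" and "len" are hypothetical caller state). Note
 * that vm_unmap_ram() must be passed the same count as vm_map_ram():
 *
 *	void *va = vm_map_ram(pages, nr, NUMA_NO_NODE);
 *
 *	if (!va)
 *		return -ENOMEM;
 *	memcpy(va, src, len);
 *	vm_unmap_ram(va, nr);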
2156 * 2157 * Returns: a pointer to the address that has been mapped, or %NULL on failure 2158 */ 2159 void *vm_map_ram(struct page **pages, unsigned int count, int node) 2160 { 2161 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2162 unsigned long addr; 2163 void *mem; 2164 2165 if (likely(count <= VMAP_MAX_ALLOC)) { 2166 mem = vb_alloc(size, GFP_KERNEL); 2167 if (IS_ERR(mem)) 2168 return NULL; 2169 addr = (unsigned long)mem; 2170 } else { 2171 struct vmap_area *va; 2172 va = alloc_vmap_area(size, PAGE_SIZE, 2173 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 2174 if (IS_ERR(va)) 2175 return NULL; 2176 2177 addr = va->va_start; 2178 mem = (void *)addr; 2179 } 2180 2181 kasan_unpoison_vmalloc(mem, size); 2182 2183 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, 2184 pages, PAGE_SHIFT) < 0) { 2185 vm_unmap_ram(mem, count); 2186 return NULL; 2187 } 2188 2189 return mem; 2190 } 2191 EXPORT_SYMBOL(vm_map_ram); 2192 2193 static struct vm_struct *vmlist __initdata; 2194 2195 static inline unsigned int vm_area_page_order(struct vm_struct *vm) 2196 { 2197 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2198 return vm->page_order; 2199 #else 2200 return 0; 2201 #endif 2202 } 2203 2204 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) 2205 { 2206 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2207 vm->page_order = order; 2208 #else 2209 BUG_ON(order != 0); 2210 #endif 2211 } 2212 2213 /** 2214 * vm_area_add_early - add vmap area early during boot 2215 * @vm: vm_struct to add 2216 * 2217 * This function is used to add fixed kernel vm area to vmlist before 2218 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 2219 * should contain proper values and the other fields should be zero. 2220 * 2221 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 2222 */ 2223 void __init vm_area_add_early(struct vm_struct *vm) 2224 { 2225 struct vm_struct *tmp, **p; 2226 2227 BUG_ON(vmap_initialized); 2228 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 2229 if (tmp->addr >= vm->addr) { 2230 BUG_ON(tmp->addr < vm->addr + vm->size); 2231 break; 2232 } else 2233 BUG_ON(tmp->addr + tmp->size > vm->addr); 2234 } 2235 vm->next = *p; 2236 *p = vm; 2237 } 2238 2239 /** 2240 * vm_area_register_early - register vmap area early during boot 2241 * @vm: vm_struct to register 2242 * @align: requested alignment 2243 * 2244 * This function is used to register kernel vm area before 2245 * vmalloc_init() is called. @vm->size and @vm->flags should contain 2246 * proper values on entry and other fields should be zero. On return, 2247 * vm->addr contains the allocated address. 2248 * 2249 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
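 *
 * A hypothetical early-boot sketch (not taken from an in-tree caller);
 * on return, early_vm.addr holds the reserved virtual address:
 *
 *	static struct vm_struct early_vm __initdata;
 *
 *	early_vm.size = PAGE_SIZE * 4;
 *	early_vm.flags = VM_ALLOC;
 *	vm_area_register_early(&early_vm, PAGE_SIZE);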
2250 */ 2251 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 2252 { 2253 static size_t vm_init_off __initdata; 2254 unsigned long addr; 2255 2256 addr = ALIGN(VMALLOC_START + vm_init_off, align); 2257 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 2258 2259 vm->addr = (void *)addr; 2260 2261 vm_area_add_early(vm); 2262 } 2263 2264 static void vmap_init_free_space(void) 2265 { 2266 unsigned long vmap_start = 1; 2267 const unsigned long vmap_end = ULONG_MAX; 2268 struct vmap_area *busy, *free; 2269 2270 /* 2271 * B F B B B F 2272 * -|-----|.....|-----|-----|-----|.....|- 2273 * | The KVA space | 2274 * |<--------------------------------->| 2275 */ 2276 list_for_each_entry(busy, &vmap_area_list, list) { 2277 if (busy->va_start - vmap_start > 0) { 2278 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2279 if (!WARN_ON_ONCE(!free)) { 2280 free->va_start = vmap_start; 2281 free->va_end = busy->va_start; 2282 2283 insert_vmap_area_augment(free, NULL, 2284 &free_vmap_area_root, 2285 &free_vmap_area_list); 2286 } 2287 } 2288 2289 vmap_start = busy->va_end; 2290 } 2291 2292 if (vmap_end - vmap_start > 0) { 2293 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2294 if (!WARN_ON_ONCE(!free)) { 2295 free->va_start = vmap_start; 2296 free->va_end = vmap_end; 2297 2298 insert_vmap_area_augment(free, NULL, 2299 &free_vmap_area_root, 2300 &free_vmap_area_list); 2301 } 2302 } 2303 } 2304 2305 void __init vmalloc_init(void) 2306 { 2307 struct vmap_area *va; 2308 struct vm_struct *tmp; 2309 int i; 2310 2311 /* 2312 * Create the cache for vmap_area objects. 2313 */ 2314 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 2315 2316 for_each_possible_cpu(i) { 2317 struct vmap_block_queue *vbq; 2318 struct vfree_deferred *p; 2319 2320 vbq = &per_cpu(vmap_block_queue, i); 2321 spin_lock_init(&vbq->lock); 2322 INIT_LIST_HEAD(&vbq->free); 2323 p = &per_cpu(vfree_deferred, i); 2324 init_llist_head(&p->list); 2325 INIT_WORK(&p->wq, free_work); 2326 } 2327 2328 /* Import existing vmlist entries. */ 2329 for (tmp = vmlist; tmp; tmp = tmp->next) { 2330 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2331 if (WARN_ON_ONCE(!va)) 2332 continue; 2333 2334 va->va_start = (unsigned long)tmp->addr; 2335 va->va_end = va->va_start + tmp->size; 2336 va->vm = tmp; 2337 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2338 } 2339 2340 /* 2341 * Now we can initialize a free vmap space. 2342 */ 2343 vmap_init_free_space(); 2344 vmap_initialized = true; 2345 } 2346 2347 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2348 struct vmap_area *va, unsigned long flags, const void *caller) 2349 { 2350 vm->flags = flags; 2351 vm->addr = (void *)va->va_start; 2352 vm->size = va->va_end - va->va_start; 2353 vm->caller = caller; 2354 va->vm = vm; 2355 } 2356 2357 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2358 unsigned long flags, const void *caller) 2359 { 2360 spin_lock(&vmap_area_lock); 2361 setup_vmalloc_vm_locked(vm, va, flags, caller); 2362 spin_unlock(&vmap_area_lock); 2363 } 2364 2365 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2366 { 2367 /* 2368 * Before removing VM_UNINITIALIZED, 2369 * we should make sure that vm has proper values. 2370 * Pair with smp_rmb() in show_numa_info(). 
2371 */ 2372 smp_wmb(); 2373 vm->flags &= ~VM_UNINITIALIZED; 2374 } 2375 2376 static struct vm_struct *__get_vm_area_node(unsigned long size, 2377 unsigned long align, unsigned long shift, unsigned long flags, 2378 unsigned long start, unsigned long end, int node, 2379 gfp_t gfp_mask, const void *caller) 2380 { 2381 struct vmap_area *va; 2382 struct vm_struct *area; 2383 unsigned long requested_size = size; 2384 2385 BUG_ON(in_interrupt()); 2386 size = ALIGN(size, 1ul << shift); 2387 if (unlikely(!size)) 2388 return NULL; 2389 2390 if (flags & VM_IOREMAP) 2391 align = 1ul << clamp_t(int, get_count_order_long(size), 2392 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2393 2394 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2395 if (unlikely(!area)) 2396 return NULL; 2397 2398 if (!(flags & VM_NO_GUARD)) 2399 size += PAGE_SIZE; 2400 2401 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2402 if (IS_ERR(va)) { 2403 kfree(area); 2404 return NULL; 2405 } 2406 2407 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2408 2409 setup_vmalloc_vm(area, va, flags, caller); 2410 2411 return area; 2412 } 2413 2414 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2415 unsigned long start, unsigned long end, 2416 const void *caller) 2417 { 2418 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2419 NUMA_NO_NODE, GFP_KERNEL, caller); 2420 } 2421 2422 /** 2423 * get_vm_area - reserve a contiguous kernel virtual area 2424 * @size: size of the area 2425 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2426 * 2427 * Search an area of @size in the kernel virtual mapping area, 2428 * and reserved it for out purposes. Returns the area descriptor 2429 * on success or %NULL on failure. 2430 * 2431 * Return: the area descriptor on success or %NULL on failure. 2432 */ 2433 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2434 { 2435 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2436 VMALLOC_START, VMALLOC_END, 2437 NUMA_NO_NODE, GFP_KERNEL, 2438 __builtin_return_address(0)); 2439 } 2440 2441 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2442 const void *caller) 2443 { 2444 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2445 VMALLOC_START, VMALLOC_END, 2446 NUMA_NO_NODE, GFP_KERNEL, caller); 2447 } 2448 2449 /** 2450 * find_vm_area - find a continuous kernel virtual area 2451 * @addr: base address 2452 * 2453 * Search for the kernel VM area starting at @addr, and return it. 2454 * It is up to the caller to do all required locking to keep the returned 2455 * pointer valid. 2456 * 2457 * Return: the area descriptor on success or %NULL on failure. 2458 */ 2459 struct vm_struct *find_vm_area(const void *addr) 2460 { 2461 struct vmap_area *va; 2462 2463 va = find_vmap_area((unsigned long)addr); 2464 if (!va) 2465 return NULL; 2466 2467 return va->vm; 2468 } 2469 2470 /** 2471 * remove_vm_area - find and remove a continuous kernel virtual area 2472 * @addr: base address 2473 * 2474 * Search for the kernel VM area starting at @addr, and remove it. 2475 * This function returns the found VM area, but using it is NOT safe 2476 * on SMP machines, except for its size or flags. 2477 * 2478 * Return: the area descriptor on success or %NULL on failure. 
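 *
 * Illustrative sketch of the expected use, mirroring what free_vm_area()
 * below does ("vaddr" is a hypothetical mapping address):
 *
 *	struct vm_struct *vm = remove_vm_area(vaddr);
 *
 *	if (vm)
 *		kfree(vm);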
2479 */ 2480 struct vm_struct *remove_vm_area(const void *addr) 2481 { 2482 struct vmap_area *va; 2483 2484 might_sleep(); 2485 2486 spin_lock(&vmap_area_lock); 2487 va = __find_vmap_area((unsigned long)addr); 2488 if (va && va->vm) { 2489 struct vm_struct *vm = va->vm; 2490 2491 va->vm = NULL; 2492 spin_unlock(&vmap_area_lock); 2493 2494 kasan_free_shadow(vm); 2495 free_unmap_vmap_area(va); 2496 2497 return vm; 2498 } 2499 2500 spin_unlock(&vmap_area_lock); 2501 return NULL; 2502 } 2503 2504 static inline void set_area_direct_map(const struct vm_struct *area, 2505 int (*set_direct_map)(struct page *page)) 2506 { 2507 int i; 2508 2509 /* HUGE_VMALLOC passes small pages to set_direct_map */ 2510 for (i = 0; i < area->nr_pages; i++) 2511 if (page_address(area->pages[i])) 2512 set_direct_map(area->pages[i]); 2513 } 2514 2515 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2516 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2517 { 2518 unsigned long start = ULONG_MAX, end = 0; 2519 unsigned int page_order = vm_area_page_order(area); 2520 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2521 int flush_dmap = 0; 2522 int i; 2523 2524 remove_vm_area(area->addr); 2525 2526 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2527 if (!flush_reset) 2528 return; 2529 2530 /* 2531 * If not deallocating pages, just do the flush of the VM area and 2532 * return. 2533 */ 2534 if (!deallocate_pages) { 2535 vm_unmap_aliases(); 2536 return; 2537 } 2538 2539 /* 2540 * If execution gets here, flush the vm mapping and reset the direct 2541 * map. Find the start and end range of the direct mappings to make sure 2542 * the vm_unmap_aliases() flush includes the direct map. 2543 */ 2544 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2545 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2546 if (addr) { 2547 unsigned long page_size; 2548 2549 page_size = PAGE_SIZE << page_order; 2550 start = min(addr, start); 2551 end = max(addr + page_size, end); 2552 flush_dmap = 1; 2553 } 2554 } 2555 2556 /* 2557 * Set direct map to something invalid so that it won't be cached if 2558 * there are any accesses after the TLB flush, then flush the TLB and 2559 * reset the direct map permissions to the default. 
2560 */ 2561 set_area_direct_map(area, set_direct_map_invalid_noflush); 2562 _vm_unmap_aliases(start, end, flush_dmap); 2563 set_area_direct_map(area, set_direct_map_default_noflush); 2564 } 2565 2566 static void __vunmap(const void *addr, int deallocate_pages) 2567 { 2568 struct vm_struct *area; 2569 2570 if (!addr) 2571 return; 2572 2573 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2574 addr)) 2575 return; 2576 2577 area = find_vm_area(addr); 2578 if (unlikely(!area)) { 2579 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2580 addr); 2581 return; 2582 } 2583 2584 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2585 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2586 2587 kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); 2588 2589 vm_remove_mappings(area, deallocate_pages); 2590 2591 if (deallocate_pages) { 2592 unsigned int page_order = vm_area_page_order(area); 2593 int i; 2594 2595 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2596 struct page *page = area->pages[i]; 2597 2598 BUG_ON(!page); 2599 __free_pages(page, page_order); 2600 cond_resched(); 2601 } 2602 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2603 2604 kvfree(area->pages); 2605 } 2606 2607 kfree(area); 2608 } 2609 2610 static inline void __vfree_deferred(const void *addr) 2611 { 2612 /* 2613 * Use raw_cpu_ptr() because this can be called from preemptible 2614 * context. Preemption is absolutely fine here, because the llist_add() 2615 * implementation is lockless, so it works even if we are adding to 2616 * another cpu's list. schedule_work() should be fine with this too. 2617 */ 2618 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2619 2620 if (llist_add((struct llist_node *)addr, &p->list)) 2621 schedule_work(&p->wq); 2622 } 2623 2624 /** 2625 * vfree_atomic - release memory allocated by vmalloc() 2626 * @addr: memory base address 2627 * 2628 * This one is just like vfree() but can be called in any atomic context 2629 * except NMIs. 2630 */ 2631 void vfree_atomic(const void *addr) 2632 { 2633 BUG_ON(in_nmi()); 2634 2635 kmemleak_free(addr); 2636 2637 if (!addr) 2638 return; 2639 __vfree_deferred(addr); 2640 } 2641 2642 static void __vfree(const void *addr) 2643 { 2644 if (unlikely(in_interrupt())) 2645 __vfree_deferred(addr); 2646 else 2647 __vunmap(addr, 1); 2648 } 2649 2650 /** 2651 * vfree - Release memory allocated by vmalloc() 2652 * @addr: Memory base address 2653 * 2654 * Free the virtually continuous memory area starting at @addr, as obtained 2655 * from one of the vmalloc() family of APIs. This will usually also free the 2656 * physical memory underlying the virtual allocation, but that memory is 2657 * reference counted, so it will not be freed until the last user goes away. 2658 * 2659 * If @addr is NULL, no operation is performed. 2660 * 2661 * Context: 2662 * May sleep if called *not* from interrupt context. 2663 * Must not be called in NMI context (strictly speaking, it could be 2664 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2665 * conventions for vfree() arch-dependent would be a really bad idea). 
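 *
 * A minimal sketch of the usual pairing (illustrative; "len" is a
 * hypothetical size). Since vfree(NULL) performs no operation, no extra
 * NULL check is needed before the free:
 *
 *	void *buf = vmalloc(len);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	vfree(buf);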
2666 */ 2667 void vfree(const void *addr) 2668 { 2669 BUG_ON(in_nmi()); 2670 2671 kmemleak_free(addr); 2672 2673 might_sleep_if(!in_interrupt()); 2674 2675 if (!addr) 2676 return; 2677 2678 __vfree(addr); 2679 } 2680 EXPORT_SYMBOL(vfree); 2681 2682 /** 2683 * vunmap - release virtual mapping obtained by vmap() 2684 * @addr: memory base address 2685 * 2686 * Free the virtually contiguous memory area starting at @addr, 2687 * which was created from the page array passed to vmap(). 2688 * 2689 * Must not be called in interrupt context. 2690 */ 2691 void vunmap(const void *addr) 2692 { 2693 BUG_ON(in_interrupt()); 2694 might_sleep(); 2695 if (addr) 2696 __vunmap(addr, 0); 2697 } 2698 EXPORT_SYMBOL(vunmap); 2699 2700 /** 2701 * vmap - map an array of pages into virtually contiguous space 2702 * @pages: array of page pointers 2703 * @count: number of pages to map 2704 * @flags: vm_area->flags 2705 * @prot: page protection for the mapping 2706 * 2707 * Maps @count pages from @pages into contiguous kernel virtual space. 2708 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2709 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2710 * are transferred from the caller to vmap(), and will be freed / dropped when 2711 * vfree() is called on the return value. 2712 * 2713 * Return: the address of the area or %NULL on failure 2714 */ 2715 void *vmap(struct page **pages, unsigned int count, 2716 unsigned long flags, pgprot_t prot) 2717 { 2718 struct vm_struct *area; 2719 unsigned long addr; 2720 unsigned long size; /* In bytes */ 2721 2722 might_sleep(); 2723 2724 if (count > totalram_pages()) 2725 return NULL; 2726 2727 size = (unsigned long)count << PAGE_SHIFT; 2728 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2729 if (!area) 2730 return NULL; 2731 2732 addr = (unsigned long)area->addr; 2733 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), 2734 pages, PAGE_SHIFT) < 0) { 2735 vunmap(area->addr); 2736 return NULL; 2737 } 2738 2739 if (flags & VM_MAP_PUT_PAGES) { 2740 area->pages = pages; 2741 area->nr_pages = count; 2742 } 2743 return area->addr; 2744 } 2745 EXPORT_SYMBOL(vmap); 2746 2747 #ifdef CONFIG_VMAP_PFN 2748 struct vmap_pfn_data { 2749 unsigned long *pfns; 2750 pgprot_t prot; 2751 unsigned int idx; 2752 }; 2753 2754 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2755 { 2756 struct vmap_pfn_data *data = private; 2757 2758 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2759 return -EINVAL; 2760 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2761 return 0; 2762 } 2763 2764 /** 2765 * vmap_pfn - map an array of PFNs into virtually contiguous space 2766 * @pfns: array of PFNs 2767 * @count: number of pages to map 2768 * @prot: page protection for the mapping 2769 * 2770 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2771 * the start address of the mapping. 
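 *
 * Hypothetical caller sketch (assumption: such a mapping is released
 * with vunmap(), as for vmap(); "pfns" and "nr" are caller state):
 *
 *	void *va = vmap_pfn(pfns, nr, pgprot_writecombine(PAGE_KERNEL));
 *
 *	if (!va)
 *		return -ENOMEM;
 *	...
 *	vunmap(va);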
2772 */ 2773 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2774 { 2775 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2776 struct vm_struct *area; 2777 2778 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2779 __builtin_return_address(0)); 2780 if (!area) 2781 return NULL; 2782 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2783 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2784 free_vm_area(area); 2785 return NULL; 2786 } 2787 return area->addr; 2788 } 2789 EXPORT_SYMBOL_GPL(vmap_pfn); 2790 #endif /* CONFIG_VMAP_PFN */ 2791 2792 static inline unsigned int 2793 vm_area_alloc_pages(gfp_t gfp, int nid, 2794 unsigned int order, unsigned long nr_pages, struct page **pages) 2795 { 2796 unsigned int nr_allocated = 0; 2797 2798 /* 2799 * For order-0 pages we make use of bulk allocator, if 2800 * the page array is partly or not at all populated due 2801 * to fails, fallback to a single page allocator that is 2802 * more permissive. 2803 */ 2804 if (!order) 2805 nr_allocated = alloc_pages_bulk_array_node( 2806 gfp, nid, nr_pages, pages); 2807 else 2808 /* 2809 * Compound pages required for remap_vmalloc_page if 2810 * high-order pages. 2811 */ 2812 gfp |= __GFP_COMP; 2813 2814 /* High-order pages or fallback path if "bulk" fails. */ 2815 while (nr_allocated < nr_pages) { 2816 struct page *page; 2817 int i; 2818 2819 page = alloc_pages_node(nid, gfp, order); 2820 if (unlikely(!page)) 2821 break; 2822 2823 /* 2824 * Careful, we allocate and map page-order pages, but 2825 * tracking is done per PAGE_SIZE page so as to keep the 2826 * vm_struct APIs independent of the physical/mapped size. 2827 */ 2828 for (i = 0; i < (1U << order); i++) 2829 pages[nr_allocated + i] = page + i; 2830 2831 if (gfpflags_allow_blocking(gfp)) 2832 cond_resched(); 2833 2834 nr_allocated += 1U << order; 2835 } 2836 2837 return nr_allocated; 2838 } 2839 2840 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2841 pgprot_t prot, unsigned int page_shift, 2842 int node) 2843 { 2844 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2845 unsigned long addr = (unsigned long)area->addr; 2846 unsigned long size = get_vm_area_size(area); 2847 unsigned long array_size; 2848 unsigned int nr_small_pages = size >> PAGE_SHIFT; 2849 unsigned int page_order; 2850 2851 array_size = (unsigned long)nr_small_pages * sizeof(struct page *); 2852 gfp_mask |= __GFP_NOWARN; 2853 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2854 gfp_mask |= __GFP_HIGHMEM; 2855 2856 /* Please note that the recursion is strictly bounded. */ 2857 if (array_size > PAGE_SIZE) { 2858 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2859 area->caller); 2860 } else { 2861 area->pages = kmalloc_node(array_size, nested_gfp, node); 2862 } 2863 2864 if (!area->pages) { 2865 warn_alloc(gfp_mask, NULL, 2866 "vmalloc error: size %lu, failed to allocated page array size %lu", 2867 nr_small_pages * PAGE_SIZE, array_size); 2868 free_vm_area(area); 2869 return NULL; 2870 } 2871 2872 set_vm_area_page_order(area, page_shift - PAGE_SHIFT); 2873 page_order = vm_area_page_order(area); 2874 2875 area->nr_pages = vm_area_alloc_pages(gfp_mask, node, 2876 page_order, nr_small_pages, area->pages); 2877 2878 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2879 2880 /* 2881 * If not enough pages were obtained to accomplish an 2882 * allocation request, free them via __vfree() if any. 
2883 */ 2884 if (area->nr_pages != nr_small_pages) { 2885 warn_alloc(gfp_mask, NULL, 2886 "vmalloc error: size %lu, page order %u, failed to allocate pages", 2887 area->nr_pages * PAGE_SIZE, page_order); 2888 goto fail; 2889 } 2890 2891 if (vmap_pages_range(addr, addr + size, prot, area->pages, 2892 page_shift) < 0) { 2893 warn_alloc(gfp_mask, NULL, 2894 "vmalloc error: size %lu, failed to map pages", 2895 area->nr_pages * PAGE_SIZE); 2896 goto fail; 2897 } 2898 2899 return area->addr; 2900 2901 fail: 2902 __vfree(area->addr); 2903 return NULL; 2904 } 2905 2906 /** 2907 * __vmalloc_node_range - allocate virtually contiguous memory 2908 * @size: allocation size 2909 * @align: desired alignment 2910 * @start: vm area range start 2911 * @end: vm area range end 2912 * @gfp_mask: flags for the page level allocator 2913 * @prot: protection mask for the allocated pages 2914 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2915 * @node: node to use for allocation or NUMA_NO_NODE 2916 * @caller: caller's return address 2917 * 2918 * Allocate enough pages to cover @size from the page level 2919 * allocator with @gfp_mask flags. Map them into contiguous 2920 * kernel virtual space, using a pagetable protection of @prot. 2921 * 2922 * Return: the address of the area or %NULL on failure 2923 */ 2924 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2925 unsigned long start, unsigned long end, gfp_t gfp_mask, 2926 pgprot_t prot, unsigned long vm_flags, int node, 2927 const void *caller) 2928 { 2929 struct vm_struct *area; 2930 void *addr; 2931 unsigned long real_size = size; 2932 unsigned long real_align = align; 2933 unsigned int shift = PAGE_SHIFT; 2934 2935 if (WARN_ON_ONCE(!size)) 2936 return NULL; 2937 2938 if ((size >> PAGE_SHIFT) > totalram_pages()) { 2939 warn_alloc(gfp_mask, NULL, 2940 "vmalloc error: size %lu, exceeds total pages", 2941 real_size); 2942 return NULL; 2943 } 2944 2945 if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) { 2946 unsigned long size_per_node; 2947 2948 /* 2949 * Try huge pages. Only try for PAGE_KERNEL allocations, 2950 * others like modules don't yet expect huge pages in 2951 * their allocations due to apply_to_page_range not 2952 * supporting them. 2953 */ 2954 2955 size_per_node = size; 2956 if (node == NUMA_NO_NODE) 2957 size_per_node /= num_online_nodes(); 2958 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) 2959 shift = PMD_SHIFT; 2960 else 2961 shift = arch_vmap_pte_supported_shift(size_per_node); 2962 2963 align = max(real_align, 1UL << shift); 2964 size = ALIGN(real_size, 1UL << shift); 2965 } 2966 2967 again: 2968 area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 2969 VM_UNINITIALIZED | vm_flags, start, end, node, 2970 gfp_mask, caller); 2971 if (!area) { 2972 warn_alloc(gfp_mask, NULL, 2973 "vmalloc error: size %lu, vm_struct allocation failed", 2974 real_size); 2975 goto fail; 2976 } 2977 2978 addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); 2979 if (!addr) 2980 goto fail; 2981 2982 /* 2983 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 2984 * flag. It means that vm_struct is not fully initialized. 2985 * Now, it is fully initialized, so remove this flag here. 
2986 */ 2987 clear_vm_uninitialized_flag(area); 2988 2989 size = PAGE_ALIGN(size); 2990 kmemleak_vmalloc(area, size, gfp_mask); 2991 2992 return addr; 2993 2994 fail: 2995 if (shift > PAGE_SHIFT) { 2996 shift = PAGE_SHIFT; 2997 align = real_align; 2998 size = real_size; 2999 goto again; 3000 } 3001 3002 return NULL; 3003 } 3004 3005 /** 3006 * __vmalloc_node - allocate virtually contiguous memory 3007 * @size: allocation size 3008 * @align: desired alignment 3009 * @gfp_mask: flags for the page level allocator 3010 * @node: node to use for allocation or NUMA_NO_NODE 3011 * @caller: caller's return address 3012 * 3013 * Allocate enough pages to cover @size from the page level allocator with 3014 * @gfp_mask flags. Map them into contiguous kernel virtual space. 3015 * 3016 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 3017 * and __GFP_NOFAIL are not supported 3018 * 3019 * Any use of gfp flags outside of GFP_KERNEL should be consulted 3020 * with mm people. 3021 * 3022 * Return: pointer to the allocated memory or %NULL on error 3023 */ 3024 void *__vmalloc_node(unsigned long size, unsigned long align, 3025 gfp_t gfp_mask, int node, const void *caller) 3026 { 3027 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 3028 gfp_mask, PAGE_KERNEL, 0, node, caller); 3029 } 3030 /* 3031 * This is only for performance analysis of vmalloc and stress purpose. 3032 * It is required by vmalloc test module, therefore do not use it other 3033 * than that. 3034 */ 3035 #ifdef CONFIG_TEST_VMALLOC_MODULE 3036 EXPORT_SYMBOL_GPL(__vmalloc_node); 3037 #endif 3038 3039 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 3040 { 3041 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 3042 __builtin_return_address(0)); 3043 } 3044 EXPORT_SYMBOL(__vmalloc); 3045 3046 /** 3047 * vmalloc - allocate virtually contiguous memory 3048 * @size: allocation size 3049 * 3050 * Allocate enough pages to cover @size from the page level 3051 * allocator and map them into contiguous kernel virtual space. 3052 * 3053 * For tight control over page level allocator and protection flags 3054 * use __vmalloc() instead. 3055 * 3056 * Return: pointer to the allocated memory or %NULL on error 3057 */ 3058 void *vmalloc(unsigned long size) 3059 { 3060 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 3061 __builtin_return_address(0)); 3062 } 3063 EXPORT_SYMBOL(vmalloc); 3064 3065 /** 3066 * vmalloc_no_huge - allocate virtually contiguous memory using small pages 3067 * @size: allocation size 3068 * 3069 * Allocate enough non-huge pages to cover @size from the page level 3070 * allocator and map them into contiguous kernel virtual space. 3071 * 3072 * Return: pointer to the allocated memory or %NULL on error 3073 */ 3074 void *vmalloc_no_huge(unsigned long size) 3075 { 3076 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3077 GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, 3078 NUMA_NO_NODE, __builtin_return_address(0)); 3079 } 3080 EXPORT_SYMBOL(vmalloc_no_huge); 3081 3082 /** 3083 * vzalloc - allocate virtually contiguous memory with zero fill 3084 * @size: allocation size 3085 * 3086 * Allocate enough pages to cover @size from the page level 3087 * allocator and map them into contiguous kernel virtual space. 3088 * The memory allocated is set to zero. 3089 * 3090 * For tight control over page level allocator and protection flags 3091 * use __vmalloc() instead. 
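 *
 * vzalloc(size) is equivalent to __vmalloc(size, GFP_KERNEL | __GFP_ZERO),
 * so no separate memset() of the buffer is required. A short sketch
 * ("struct foo" and "nents" are hypothetical):
 *
 *	struct foo *tbl = vzalloc(array_size(nents, sizeof(*tbl)));
 *
 *	if (!tbl)
 *		return -ENOMEM;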
3092 * 3093 * Return: pointer to the allocated memory or %NULL on error 3094 */ 3095 void *vzalloc(unsigned long size) 3096 { 3097 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 3098 __builtin_return_address(0)); 3099 } 3100 EXPORT_SYMBOL(vzalloc); 3101 3102 /** 3103 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 3104 * @size: allocation size 3105 * 3106 * The resulting memory area is zeroed so it can be mapped to userspace 3107 * without leaking data. 3108 * 3109 * Return: pointer to the allocated memory or %NULL on error 3110 */ 3111 void *vmalloc_user(unsigned long size) 3112 { 3113 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3114 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 3115 VM_USERMAP, NUMA_NO_NODE, 3116 __builtin_return_address(0)); 3117 } 3118 EXPORT_SYMBOL(vmalloc_user); 3119 3120 /** 3121 * vmalloc_node - allocate memory on a specific node 3122 * @size: allocation size 3123 * @node: numa node 3124 * 3125 * Allocate enough pages to cover @size from the page level 3126 * allocator and map them into contiguous kernel virtual space. 3127 * 3128 * For tight control over page level allocator and protection flags 3129 * use __vmalloc() instead. 3130 * 3131 * Return: pointer to the allocated memory or %NULL on error 3132 */ 3133 void *vmalloc_node(unsigned long size, int node) 3134 { 3135 return __vmalloc_node(size, 1, GFP_KERNEL, node, 3136 __builtin_return_address(0)); 3137 } 3138 EXPORT_SYMBOL(vmalloc_node); 3139 3140 /** 3141 * vzalloc_node - allocate memory on a specific node with zero fill 3142 * @size: allocation size 3143 * @node: numa node 3144 * 3145 * Allocate enough pages to cover @size from the page level 3146 * allocator and map them into contiguous kernel virtual space. 3147 * The memory allocated is set to zero. 3148 * 3149 * Return: pointer to the allocated memory or %NULL on error 3150 */ 3151 void *vzalloc_node(unsigned long size, int node) 3152 { 3153 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 3154 __builtin_return_address(0)); 3155 } 3156 EXPORT_SYMBOL(vzalloc_node); 3157 3158 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 3159 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3160 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 3161 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 3162 #else 3163 /* 3164 * 64b systems should always have either DMA or DMA32 zones. For others 3165 * GFP_DMA32 should do the right thing and use the normal zone. 3166 */ 3167 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3168 #endif 3169 3170 /** 3171 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 3172 * @size: allocation size 3173 * 3174 * Allocate enough 32bit PA addressable pages to cover @size from the 3175 * page level allocator and map them into contiguous kernel virtual space. 3176 * 3177 * Return: pointer to the allocated memory or %NULL on error 3178 */ 3179 void *vmalloc_32(unsigned long size) 3180 { 3181 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 3182 __builtin_return_address(0)); 3183 } 3184 EXPORT_SYMBOL(vmalloc_32); 3185 3186 /** 3187 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 3188 * @size: allocation size 3189 * 3190 * The resulting memory area is 32bit addressable and zeroed so it can be 3191 * mapped to userspace without leaking data. 
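 *
 * A hypothetical mmap-handler sketch pairing this with
 * remap_vmalloc_range(); the VM_USERMAP flag set by this allocation is
 * what allows the remap to succeed:
 *
 *	buf = vmalloc_32_user(vma->vm_end - vma->vm_start);
 *	if (!buf)
 *		return -ENOMEM;
 *	return remap_vmalloc_range(vma, buf, 0);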
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user(unsigned long size)
{
	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
				    VM_USERMAP, NUMA_NO_NODE,
				    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);

/*
 * Small helper routine: copy contents from @addr into @buf.
 * If a page is not present, the corresponding bytes are zero-filled.
 */

static int aligned_vread(char *buf, char *addr, unsigned long count)
{
	struct page *p;
	int copied = 0;

	while (count) {
		unsigned long offset, length;

		offset = offset_in_page(addr);
		length = PAGE_SIZE - offset;
		if (length > count)
			length = count;
		p = vmalloc_to_page(addr);
		/*
		 * To do safe access to this _mapped_ area, we need a lock.
		 * But taking a lock here would add vmalloc()/vfree()
		 * overhead to this rarely used _debug_ interface. Instead,
		 * we use kmap_atomic() and accept a small overhead in this
		 * access function.
		 */
		if (p) {
			/* We can expect USER0 is not used -- see vread() */
			void *map = kmap_atomic(p);
			memcpy(buf, map + offset, length);
			kunmap_atomic(map);
		} else
			memset(buf, 0, length);

		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

/**
 * vread() - read vmalloc area in a safe way.
 * @buf: buffer for reading data
 * @addr: vm address.
 * @count: number of bytes to be read.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * copies data from that area to a given buffer. If the given memory range
 * of [addr...addr+count) includes some valid address, data is copied to
 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
 * An IOREMAP area is treated as a memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't include any intersection with a live
 * vm_struct area, 0 is returned. @buf should be a kernel buffer.
 *
 * Note: In usual operation, vread() is never necessary because the caller
 * should know the vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access the vmalloc area without
 * any information, such as /proc/kcore.
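 *
 * Sketch of a /proc/kcore-style reader (illustrative only; "kbuf",
 * "start" and "len" are hypothetical):
 *
 *	nr = vread(kbuf, (char *)start, len);
 *
 * A return value of 0 means the range intersected no vmalloc area;
 * otherwise @kbuf now holds the data with any holes zero-filled.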
3264 * 3265 * Return: number of bytes for which addr and buf should be increased 3266 * (same number as @count) or %0 if [addr...addr+count) doesn't 3267 * include any intersection with valid vmalloc area 3268 */ 3269 long vread(char *buf, char *addr, unsigned long count) 3270 { 3271 struct vmap_area *va; 3272 struct vm_struct *vm; 3273 char *vaddr, *buf_start = buf; 3274 unsigned long buflen = count; 3275 unsigned long n; 3276 3277 /* Don't allow overflow */ 3278 if ((unsigned long) addr + count < count) 3279 count = -(unsigned long) addr; 3280 3281 spin_lock(&vmap_area_lock); 3282 va = __find_vmap_area((unsigned long)addr); 3283 if (!va) 3284 goto finished; 3285 list_for_each_entry_from(va, &vmap_area_list, list) { 3286 if (!count) 3287 break; 3288 3289 if (!va->vm) 3290 continue; 3291 3292 vm = va->vm; 3293 vaddr = (char *) vm->addr; 3294 if (addr >= vaddr + get_vm_area_size(vm)) 3295 continue; 3296 while (addr < vaddr) { 3297 if (count == 0) 3298 goto finished; 3299 *buf = '\0'; 3300 buf++; 3301 addr++; 3302 count--; 3303 } 3304 n = vaddr + get_vm_area_size(vm) - addr; 3305 if (n > count) 3306 n = count; 3307 if (!(vm->flags & VM_IOREMAP)) 3308 aligned_vread(buf, addr, n); 3309 else /* IOREMAP area is treated as memory hole */ 3310 memset(buf, 0, n); 3311 buf += n; 3312 addr += n; 3313 count -= n; 3314 } 3315 finished: 3316 spin_unlock(&vmap_area_lock); 3317 3318 if (buf == buf_start) 3319 return 0; 3320 /* zero-fill memory holes */ 3321 if (buf != buf_start + buflen) 3322 memset(buf, 0, buflen - (buf - buf_start)); 3323 3324 return buflen; 3325 } 3326 3327 /** 3328 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3329 * @vma: vma to cover 3330 * @uaddr: target user address to start at 3331 * @kaddr: virtual address of vmalloc kernel memory 3332 * @pgoff: offset from @kaddr to start at 3333 * @size: size of map area 3334 * 3335 * Returns: 0 for success, -Exxx on failure 3336 * 3337 * This function checks that @kaddr is a valid vmalloc'ed area, 3338 * and that it is big enough to cover the range starting at 3339 * @uaddr in @vma. Will return failure if that criteria isn't 3340 * met. 
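 *
 * Hypothetical sketch: map a vmalloc_user() buffer into a vma, starting
 * one page into the kernel buffer (@pgoff = 1; "kbuf" is caller state):
 *
 *	err = remap_vmalloc_range_partial(vma, vma->vm_start, kbuf, 1,
 *					  vma->vm_end - vma->vm_start);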
3341 * 3342 * Similar to remap_pfn_range() (see mm/memory.c) 3343 */ 3344 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3345 void *kaddr, unsigned long pgoff, 3346 unsigned long size) 3347 { 3348 struct vm_struct *area; 3349 unsigned long off; 3350 unsigned long end_index; 3351 3352 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3353 return -EINVAL; 3354 3355 size = PAGE_ALIGN(size); 3356 3357 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3358 return -EINVAL; 3359 3360 area = find_vm_area(kaddr); 3361 if (!area) 3362 return -EINVAL; 3363 3364 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3365 return -EINVAL; 3366 3367 if (check_add_overflow(size, off, &end_index) || 3368 end_index > get_vm_area_size(area)) 3369 return -EINVAL; 3370 kaddr += off; 3371 3372 do { 3373 struct page *page = vmalloc_to_page(kaddr); 3374 int ret; 3375 3376 ret = vm_insert_page(vma, uaddr, page); 3377 if (ret) 3378 return ret; 3379 3380 uaddr += PAGE_SIZE; 3381 kaddr += PAGE_SIZE; 3382 size -= PAGE_SIZE; 3383 } while (size > 0); 3384 3385 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3386 3387 return 0; 3388 } 3389 3390 /** 3391 * remap_vmalloc_range - map vmalloc pages to userspace 3392 * @vma: vma to cover (map full range of vma) 3393 * @addr: vmalloc memory 3394 * @pgoff: number of pages into addr before first page to map 3395 * 3396 * Returns: 0 for success, -Exxx on failure 3397 * 3398 * This function checks that addr is a valid vmalloc'ed area, and 3399 * that it is big enough to cover the vma. Will return failure if 3400 * that criteria isn't met. 3401 * 3402 * Similar to remap_pfn_range() (see mm/memory.c) 3403 */ 3404 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3405 unsigned long pgoff) 3406 { 3407 return remap_vmalloc_range_partial(vma, vma->vm_start, 3408 addr, pgoff, 3409 vma->vm_end - vma->vm_start); 3410 } 3411 EXPORT_SYMBOL(remap_vmalloc_range); 3412 3413 void free_vm_area(struct vm_struct *area) 3414 { 3415 struct vm_struct *ret; 3416 ret = remove_vm_area(area->addr); 3417 BUG_ON(ret != area); 3418 kfree(area); 3419 } 3420 EXPORT_SYMBOL_GPL(free_vm_area); 3421 3422 #ifdef CONFIG_SMP 3423 static struct vmap_area *node_to_va(struct rb_node *n) 3424 { 3425 return rb_entry_safe(n, struct vmap_area, rb_node); 3426 } 3427 3428 /** 3429 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3430 * @addr: target address 3431 * 3432 * Returns: vmap_area if it is found. If there is no such area 3433 * the first highest(reverse order) vmap_area is returned 3434 * i.e. va->va_start < addr && va->va_end < addr or NULL 3435 * if there are no any areas before @addr. 3436 */ 3437 static struct vmap_area * 3438 pvm_find_va_enclose_addr(unsigned long addr) 3439 { 3440 struct vmap_area *va, *tmp; 3441 struct rb_node *n; 3442 3443 n = free_vmap_area_root.rb_node; 3444 va = NULL; 3445 3446 while (n) { 3447 tmp = rb_entry(n, struct vmap_area, rb_node); 3448 if (tmp->va_start <= addr) { 3449 va = tmp; 3450 if (tmp->va_end >= addr) 3451 break; 3452 3453 n = n->rb_right; 3454 } else { 3455 n = n->rb_left; 3456 } 3457 } 3458 3459 return va; 3460 } 3461 3462 /** 3463 * pvm_determine_end_from_reverse - find the highest aligned address 3464 * of free block below VMALLOC_END 3465 * @va: 3466 * in - the VA we start the search(reverse order); 3467 * out - the VA with the highest aligned end address. 
3468 * @align: alignment for required highest address 3469 * 3470 * Returns: determined end address within vmap_area 3471 */ 3472 static unsigned long 3473 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3474 { 3475 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3476 unsigned long addr; 3477 3478 if (likely(*va)) { 3479 list_for_each_entry_from_reverse((*va), 3480 &free_vmap_area_list, list) { 3481 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3482 if ((*va)->va_start < addr) 3483 return addr; 3484 } 3485 } 3486 3487 return 0; 3488 } 3489 3490 /** 3491 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3492 * @offsets: array containing offset of each area 3493 * @sizes: array containing size of each area 3494 * @nr_vms: the number of areas to allocate 3495 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3496 * 3497 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3498 * vm_structs on success, %NULL on failure 3499 * 3500 * Percpu allocator wants to use congruent vm areas so that it can 3501 * maintain the offsets among percpu areas. This function allocates 3502 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3503 * be scattered pretty far, distance between two areas easily going up 3504 * to gigabytes. To avoid interacting with regular vmallocs, these 3505 * areas are allocated from top. 3506 * 3507 * Despite its complicated look, this allocator is rather simple. It 3508 * does everything top-down and scans free blocks from the end looking 3509 * for matching base. While scanning, if any of the areas do not fit the 3510 * base address is pulled down to fit the area. Scanning is repeated till 3511 * all the areas fit and then all necessary data structures are inserted 3512 * and the result is returned. 3513 */ 3514 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3515 const size_t *sizes, int nr_vms, 3516 size_t align) 3517 { 3518 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3519 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3520 struct vmap_area **vas, *va; 3521 struct vm_struct **vms; 3522 int area, area2, last_area, term_area; 3523 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3524 bool purged = false; 3525 enum fit_type type; 3526 3527 /* verify parameters and allocate data structures */ 3528 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3529 for (last_area = 0, area = 0; area < nr_vms; area++) { 3530 start = offsets[area]; 3531 end = start + sizes[area]; 3532 3533 /* is everything aligned properly? 
*/ 3534 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3535 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3536 3537 /* detect the area with the highest address */ 3538 if (start > offsets[last_area]) 3539 last_area = area; 3540 3541 for (area2 = area + 1; area2 < nr_vms; area2++) { 3542 unsigned long start2 = offsets[area2]; 3543 unsigned long end2 = start2 + sizes[area2]; 3544 3545 BUG_ON(start2 < end && start < end2); 3546 } 3547 } 3548 last_end = offsets[last_area] + sizes[last_area]; 3549 3550 if (vmalloc_end - vmalloc_start < last_end) { 3551 WARN_ON(true); 3552 return NULL; 3553 } 3554 3555 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3556 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3557 if (!vas || !vms) 3558 goto err_free2; 3559 3560 for (area = 0; area < nr_vms; area++) { 3561 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3562 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3563 if (!vas[area] || !vms[area]) 3564 goto err_free; 3565 } 3566 retry: 3567 spin_lock(&free_vmap_area_lock); 3568 3569 /* start scanning - we scan from the top, begin with the last area */ 3570 area = term_area = last_area; 3571 start = offsets[area]; 3572 end = start + sizes[area]; 3573 3574 va = pvm_find_va_enclose_addr(vmalloc_end); 3575 base = pvm_determine_end_from_reverse(&va, align) - end; 3576 3577 while (true) { 3578 /* 3579 * base might have underflowed, add last_end before 3580 * comparing. 3581 */ 3582 if (base + last_end < vmalloc_start + last_end) 3583 goto overflow; 3584 3585 /* 3586 * Fitting base has not been found. 3587 */ 3588 if (va == NULL) 3589 goto overflow; 3590 3591 /* 3592 * If required width exceeds current VA block, move 3593 * base downwards and then recheck. 3594 */ 3595 if (base + end > va->va_end) { 3596 base = pvm_determine_end_from_reverse(&va, align) - end; 3597 term_area = area; 3598 continue; 3599 } 3600 3601 /* 3602 * If this VA does not fit, move base downwards and recheck. 3603 */ 3604 if (base + start < va->va_start) { 3605 va = node_to_va(rb_prev(&va->rb_node)); 3606 base = pvm_determine_end_from_reverse(&va, align) - end; 3607 term_area = area; 3608 continue; 3609 } 3610 3611 /* 3612 * This area fits, move on to the previous one. If 3613 * the previous one is the terminal one, we're done. 3614 */ 3615 area = (area + nr_vms - 1) % nr_vms; 3616 if (area == term_area) 3617 break; 3618 3619 start = offsets[area]; 3620 end = start + sizes[area]; 3621 va = pvm_find_va_enclose_addr(base + end); 3622 } 3623 3624 /* we've found a fitting base, insert all va's */ 3625 for (area = 0; area < nr_vms; area++) { 3626 int ret; 3627 3628 start = base + offsets[area]; 3629 size = sizes[area]; 3630 3631 va = pvm_find_va_enclose_addr(start); 3632 if (WARN_ON_ONCE(va == NULL)) 3633 /* It is a BUG(), but trigger recovery instead. */ 3634 goto recovery; 3635 3636 type = classify_va_fit_type(va, start, size); 3637 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3638 /* It is a BUG(), but trigger recovery instead. */ 3639 goto recovery; 3640 3641 ret = adjust_va_to_fit_type(va, start, size, type); 3642 if (unlikely(ret)) 3643 goto recovery; 3644 3645 /* Allocated area. 
*/ 3646 va = vas[area]; 3647 va->va_start = start; 3648 va->va_end = start + size; 3649 } 3650 3651 spin_unlock(&free_vmap_area_lock); 3652 3653 /* populate the kasan shadow space */ 3654 for (area = 0; area < nr_vms; area++) { 3655 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3656 goto err_free_shadow; 3657 3658 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3659 sizes[area]); 3660 } 3661 3662 /* insert all vm's */ 3663 spin_lock(&vmap_area_lock); 3664 for (area = 0; area < nr_vms; area++) { 3665 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3666 3667 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3668 pcpu_get_vm_areas); 3669 } 3670 spin_unlock(&vmap_area_lock); 3671 3672 kfree(vas); 3673 return vms; 3674 3675 recovery: 3676 /* 3677 * Remove previously allocated areas. There is no 3678 * need in removing these areas from the busy tree, 3679 * because they are inserted only on the final step 3680 * and when pcpu_get_vm_areas() is success. 3681 */ 3682 while (area--) { 3683 orig_start = vas[area]->va_start; 3684 orig_end = vas[area]->va_end; 3685 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3686 &free_vmap_area_list); 3687 if (va) 3688 kasan_release_vmalloc(orig_start, orig_end, 3689 va->va_start, va->va_end); 3690 vas[area] = NULL; 3691 } 3692 3693 overflow: 3694 spin_unlock(&free_vmap_area_lock); 3695 if (!purged) { 3696 purge_vmap_area_lazy(); 3697 purged = true; 3698 3699 /* Before "retry", check if we recover. */ 3700 for (area = 0; area < nr_vms; area++) { 3701 if (vas[area]) 3702 continue; 3703 3704 vas[area] = kmem_cache_zalloc( 3705 vmap_area_cachep, GFP_KERNEL); 3706 if (!vas[area]) 3707 goto err_free; 3708 } 3709 3710 goto retry; 3711 } 3712 3713 err_free: 3714 for (area = 0; area < nr_vms; area++) { 3715 if (vas[area]) 3716 kmem_cache_free(vmap_area_cachep, vas[area]); 3717 3718 kfree(vms[area]); 3719 } 3720 err_free2: 3721 kfree(vas); 3722 kfree(vms); 3723 return NULL; 3724 3725 err_free_shadow: 3726 spin_lock(&free_vmap_area_lock); 3727 /* 3728 * We release all the vmalloc shadows, even the ones for regions that 3729 * hadn't been successfully added. This relies on kasan_release_vmalloc 3730 * being able to tolerate this case. 3731 */ 3732 for (area = 0; area < nr_vms; area++) { 3733 orig_start = vas[area]->va_start; 3734 orig_end = vas[area]->va_end; 3735 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3736 &free_vmap_area_list); 3737 if (va) 3738 kasan_release_vmalloc(orig_start, orig_end, 3739 va->va_start, va->va_end); 3740 vas[area] = NULL; 3741 kfree(vms[area]); 3742 } 3743 spin_unlock(&free_vmap_area_lock); 3744 kfree(vas); 3745 kfree(vms); 3746 return NULL; 3747 } 3748 3749 /** 3750 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3751 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3752 * @nr_vms: the number of allocated areas 3753 * 3754 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
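 *
 * Sketch of the expected pairing with pcpu_get_vm_areas() (illustrative
 * only; the percpu allocator is the intended caller):
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, nr_vms, align);
 *	if (!vms)
 *		return -ENOMEM;
 *	...
 *	pcpu_free_vm_areas(vms, nr_vms);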
3755 */ 3756 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3757 { 3758 int i; 3759 3760 for (i = 0; i < nr_vms; i++) 3761 free_vm_area(vms[i]); 3762 kfree(vms); 3763 } 3764 #endif /* CONFIG_SMP */ 3765 3766 #ifdef CONFIG_PRINTK 3767 bool vmalloc_dump_obj(void *object) 3768 { 3769 struct vm_struct *vm; 3770 void *objp = (void *)PAGE_ALIGN((unsigned long)object); 3771 3772 vm = find_vm_area(objp); 3773 if (!vm) 3774 return false; 3775 pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", 3776 vm->nr_pages, (unsigned long)vm->addr, vm->caller); 3777 return true; 3778 } 3779 #endif 3780 3781 #ifdef CONFIG_PROC_FS 3782 static void *s_start(struct seq_file *m, loff_t *pos) 3783 __acquires(&vmap_purge_lock) 3784 __acquires(&vmap_area_lock) 3785 { 3786 mutex_lock(&vmap_purge_lock); 3787 spin_lock(&vmap_area_lock); 3788 3789 return seq_list_start(&vmap_area_list, *pos); 3790 } 3791 3792 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3793 { 3794 return seq_list_next(p, &vmap_area_list, pos); 3795 } 3796 3797 static void s_stop(struct seq_file *m, void *p) 3798 __releases(&vmap_area_lock) 3799 __releases(&vmap_purge_lock) 3800 { 3801 spin_unlock(&vmap_area_lock); 3802 mutex_unlock(&vmap_purge_lock); 3803 } 3804 3805 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3806 { 3807 if (IS_ENABLED(CONFIG_NUMA)) { 3808 unsigned int nr, *counters = m->private; 3809 3810 if (!counters) 3811 return; 3812 3813 if (v->flags & VM_UNINITIALIZED) 3814 return; 3815 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3816 smp_rmb(); 3817 3818 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3819 3820 for (nr = 0; nr < v->nr_pages; nr++) 3821 counters[page_to_nid(v->pages[nr])]++; 3822 3823 for_each_node_state(nr, N_HIGH_MEMORY) 3824 if (counters[nr]) 3825 seq_printf(m, " N%u=%u", nr, counters[nr]); 3826 } 3827 } 3828 3829 static void show_purge_info(struct seq_file *m) 3830 { 3831 struct vmap_area *va; 3832 3833 spin_lock(&purge_vmap_area_lock); 3834 list_for_each_entry(va, &purge_vmap_area_list, list) { 3835 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3836 (void *)va->va_start, (void *)va->va_end, 3837 va->va_end - va->va_start); 3838 } 3839 spin_unlock(&purge_vmap_area_lock); 3840 } 3841 3842 static int s_show(struct seq_file *m, void *p) 3843 { 3844 struct vmap_area *va; 3845 struct vm_struct *v; 3846 3847 va = list_entry(p, struct vmap_area, list); 3848 3849 /* 3850 * s_show can encounter race with remove_vm_area, !vm on behalf 3851 * of vmap area is being tear down or vm_map_ram allocation. 
3852 */ 3853 if (!va->vm) { 3854 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3855 (void *)va->va_start, (void *)va->va_end, 3856 va->va_end - va->va_start); 3857 3858 return 0; 3859 } 3860 3861 v = va->vm; 3862 3863 seq_printf(m, "0x%pK-0x%pK %7ld", 3864 v->addr, v->addr + v->size, v->size); 3865 3866 if (v->caller) 3867 seq_printf(m, " %pS", v->caller); 3868 3869 if (v->nr_pages) 3870 seq_printf(m, " pages=%d", v->nr_pages); 3871 3872 if (v->phys_addr) 3873 seq_printf(m, " phys=%pa", &v->phys_addr); 3874 3875 if (v->flags & VM_IOREMAP) 3876 seq_puts(m, " ioremap"); 3877 3878 if (v->flags & VM_ALLOC) 3879 seq_puts(m, " vmalloc"); 3880 3881 if (v->flags & VM_MAP) 3882 seq_puts(m, " vmap"); 3883 3884 if (v->flags & VM_USERMAP) 3885 seq_puts(m, " user"); 3886 3887 if (v->flags & VM_DMA_COHERENT) 3888 seq_puts(m, " dma-coherent"); 3889 3890 if (is_vmalloc_addr(v->pages)) 3891 seq_puts(m, " vpages"); 3892 3893 show_numa_info(m, v); 3894 seq_putc(m, '\n'); 3895 3896 /* 3897 * As a final step, dump "unpurged" areas. 3898 */ 3899 if (list_is_last(&va->list, &vmap_area_list)) 3900 show_purge_info(m); 3901 3902 return 0; 3903 } 3904 3905 static const struct seq_operations vmalloc_op = { 3906 .start = s_start, 3907 .next = s_next, 3908 .stop = s_stop, 3909 .show = s_show, 3910 }; 3911 3912 static int __init proc_vmalloc_init(void) 3913 { 3914 if (IS_ENABLED(CONFIG_NUMA)) 3915 proc_create_seq_private("vmallocinfo", 0400, NULL, 3916 &vmalloc_op, 3917 nr_node_ids * sizeof(unsigned int), NULL); 3918 else 3919 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3920 return 0; 3921 } 3922 module_init(proc_vmalloc_init); 3923 3924 #endif 3925